/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 */

/* Portions Copyright 2007 Jeremy Teo */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/mntent.h>
#include <sys/u8_textprep.h>
#include <sys/dsl_dataset.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/errno.h>
#include <sys/atomic.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_rlock.h>
#include <sys/zfs_fuid.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_ctldir.h>
#include <sys/dnode.h>
#include <sys/fs/zfs.h>
#include <sys/zpl.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/zfs_refcount.h>
#include <sys/stat.h>
#include <sys/zap.h>
#include <sys/zfs_znode.h>
#include <sys/sa.h>
#include <sys/zfs_sa.h>
#include <sys/zfs_stat.h>
#include <linux/mm_compat.h>

#include "zfs_prop.h"
#include "zfs_comutil.h"

static kmem_cache_t *znode_cache = NULL;
static kmem_cache_t *znode_hold_cache = NULL;
unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;

/*
 * This is used by the test suite so that it can delay znodes from being
 * freed in order to inspect the unlinked set.
 */
static int zfs_unlink_suspend_progress = 0;

/*
 * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
 * z_rangelock. It will modify the offset and length of the lock to reflect
 * znode-specific information, and convert RL_APPEND to RL_WRITER. This is
 * called with the rangelock_t's rl_lock held, which avoids races.
 */
static void
zfs_rangelock_cb(zfs_locked_range_t *new, void *arg)
{
	znode_t *zp = arg;

	/*
	 * If in append mode, convert to writer and lock starting at the
	 * current end of file.
	 */
	if (new->lr_type == RL_APPEND) {
		new->lr_offset = zp->z_size;
		new->lr_type = RL_WRITER;
	}

	/*
	 * If we need to grow the block size then lock the whole file range.
	 */
	uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
	if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
	    zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
		new->lr_offset = 0;
		new->lr_length = UINT64_MAX;
	}
}
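
/*
 * Illustrative sketch (not called anywhere): how a write path would rely
 * on the callback above. Acquiring the range lock with RL_APPEND causes
 * zfs_rangelock_cb() to pin the range at the current end of file and
 * convert the lock to RL_WRITER before it is granted. "nbytes" is a
 * hypothetical write length.
 */
static void __maybe_unused
zfs_append_lock_example(znode_t *zp, uint64_t nbytes)
{
	zfs_locked_range_t *lr;

	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, nbytes, RL_APPEND);
	/* lr->lr_offset now holds the EOF observed at lock acquisition */
	zfs_rangelock_exit(lr);
}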

static int
zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
{
	(void) arg, (void) kmflags;
	znode_t *zp = buf;

	inode_init_once(ZTOI(zp));
	list_link_init(&zp->z_link_node);

	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL);
	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);

	zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);

	zp->z_dirlocks = NULL;
	zp->z_acl_cached = NULL;
	zp->z_xattr_cached = NULL;
	zp->z_xattr_parent = 0;
	zp->z_sync_writes_cnt = 0;
	zp->z_async_writes_cnt = 0;

	return (0);
}

static void
zfs_znode_cache_destructor(void *buf, void *arg)
{
	(void) arg;
	znode_t *zp = buf;

	ASSERT(!list_link_active(&zp->z_link_node));
	mutex_destroy(&zp->z_lock);
	rw_destroy(&zp->z_parent_lock);
	rw_destroy(&zp->z_name_lock);
	mutex_destroy(&zp->z_acl_lock);
	rw_destroy(&zp->z_xattr_lock);
	zfs_rangelock_fini(&zp->z_rangelock);

	ASSERT3P(zp->z_dirlocks, ==, NULL);
	ASSERT3P(zp->z_acl_cached, ==, NULL);
	ASSERT3P(zp->z_xattr_cached, ==, NULL);

	ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
	ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
}

static int
zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags)
{
	(void) arg, (void) kmflags;
	znode_hold_t *zh = buf;

	mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL);
	zh->zh_refcount = 0;

	return (0);
}

static void
zfs_znode_hold_cache_destructor(void *buf, void *arg)
{
	(void) arg;
	znode_hold_t *zh = buf;

	mutex_destroy(&zh->zh_lock);
}

void
zfs_znode_init(void)
{
	/*
	 * Initialize zcache. The KMC_SLAB hint is used so that the znodes
	 * are backed by kmalloc() when on the Linux slab, which ensures any
	 * wait_on_bit() operations on the related inode operate properly.
	 */
	ASSERT(znode_cache == NULL);
	znode_cache = kmem_cache_create("zfs_znode_cache",
	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
	    zfs_znode_cache_destructor, NULL, NULL, NULL,
	    KMC_SLAB | KMC_RECLAIMABLE);

	ASSERT(znode_hold_cache == NULL);
	znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache",
	    sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor,
	    zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0);
}

void
zfs_znode_fini(void)
{
	/*
	 * Clean up zcache.
	 */
	if (znode_cache)
		kmem_cache_destroy(znode_cache);
	znode_cache = NULL;

	if (znode_hold_cache)
		kmem_cache_destroy(znode_hold_cache);
	znode_hold_cache = NULL;
}

/*
 * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to
 * serialize access to a znode and its SA buffer while the object is being
 * created or destroyed. This kind of locking would normally reside in the
 * znode itself but in this case that's impossible because the znode and SA
 * buffer may not yet exist. Therefore the locking is handled externally
 * with an array of mutexes and AVL trees which contain per-object locks.
 *
 * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted
 * into the correct AVL tree and finally the per-object lock is held. In
 * zfs_znode_hold_exit() the process is reversed. The per-object lock is
 * released, removed from the AVL tree and destroyed if there are no waiters.
 *
 * This scheme has two important properties:
 *
 * 1) No memory allocations are performed while holding one of the
 *    z_hold_locks. This ensures evict(), which can be called from direct
 *    memory reclaim, will never block waiting on a z_hold_locks mutex
 *    which just happens to have hashed to the same index.
 *
 * 2) All locks used to serialize access to an object are per-object and
 *    never shared. This minimizes lock contention without creating a large
 *    number of dedicated locks.
 *
 * On the downside it does require znode_hold_t structures to be frequently
 * allocated and freed. However, because these are backed by a kmem cache
 * and very short-lived this cost is minimal.
 */
int
zfs_znode_hold_compare(const void *a, const void *b)
{
	const znode_hold_t *zh_a = (const znode_hold_t *)a;
	const znode_hold_t *zh_b = (const znode_hold_t *)b;

	return (TREE_CMP(zh_a->zh_obj, zh_b->zh_obj));
}

static boolean_t __maybe_unused
zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj)
{
	znode_hold_t *zh, search;
	int i = ZFS_OBJ_HASH(zfsvfs, obj);
	boolean_t held;

	search.zh_obj = obj;

	mutex_enter(&zfsvfs->z_hold_locks[i]);
	zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
	held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE;
	mutex_exit(&zfsvfs->z_hold_locks[i]);

	return (held);
}

znode_hold_t *
zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj)
{
	znode_hold_t *zh, *zh_new, search;
	int i = ZFS_OBJ_HASH(zfsvfs, obj);
	boolean_t found = B_FALSE;

	zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP);
	search.zh_obj = obj;

	mutex_enter(&zfsvfs->z_hold_locks[i]);
	zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
	if (likely(zh == NULL)) {
		zh = zh_new;
		zh->zh_obj = obj;
		avl_add(&zfsvfs->z_hold_trees[i], zh);
	} else {
		ASSERT3U(zh->zh_obj, ==, obj);
		found = B_TRUE;
	}
	zh->zh_refcount++;
	ASSERT3S(zh->zh_refcount, >, 0);
	mutex_exit(&zfsvfs->z_hold_locks[i]);

	if (found == B_TRUE)
		kmem_cache_free(znode_hold_cache, zh_new);

	ASSERT(MUTEX_NOT_HELD(&zh->zh_lock));
	mutex_enter(&zh->zh_lock);

	return (zh);
}

void
zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh)
{
	int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj);
	boolean_t remove = B_FALSE;

	ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj));
	mutex_exit(&zh->zh_lock);

	mutex_enter(&zfsvfs->z_hold_locks[i]);
	ASSERT3S(zh->zh_refcount, >, 0);
	if (--zh->zh_refcount == 0) {
		avl_remove(&zfsvfs->z_hold_trees[i], zh);
		remove = B_TRUE;
	}
	mutex_exit(&zfsvfs->z_hold_locks[i]);

	if (remove == B_TRUE)
		kmem_cache_free(znode_hold_cache, zh);
}
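
/*
 * Illustrative sketch (not called anywhere): the canonical usage of the
 * per-object hold implemented above. Callers bracket znode/SA-buffer
 * creation or destruction with enter/exit; the object number is the only
 * key required. "example_obj" is a hypothetical object number.
 */
static void __maybe_unused
zfs_znode_hold_example(zfsvfs_t *zfsvfs, uint64_t example_obj)
{
	znode_hold_t *zh;

	zh = zfs_znode_hold_enter(zfsvfs, example_obj);
	/* ... create or destroy the znode and its SA buffer ... */
	zfs_znode_hold_exit(zfsvfs, zh);
}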

dev_t
zfs_cmpldev(uint64_t dev)
{
	return (dev);
}

static void
zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
    dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
{
	ASSERT(zfs_znode_held(zfsvfs, zp->z_id));

	mutex_enter(&zp->z_lock);

	ASSERT(zp->z_sa_hdl == NULL);
	ASSERT(zp->z_acl_cached == NULL);
	if (sa_hdl == NULL) {
		VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
		    SA_HDL_SHARED, &zp->z_sa_hdl));
	} else {
		zp->z_sa_hdl = sa_hdl;
		sa_set_userp(sa_hdl, zp);
	}

	zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;

	mutex_exit(&zp->z_lock);
}

void
zfs_znode_dmu_fini(znode_t *zp)
{
	ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) ||
	    RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock));

	sa_handle_destroy(zp->z_sa_hdl);
	zp->z_sa_hdl = NULL;
}

/*
 * Called by new_inode() to allocate a new inode.
 */
int
zfs_inode_alloc(struct super_block *sb, struct inode **ip)
{
	znode_t *zp;

	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	*ip = ZTOI(zp);

	return (0);
}

/*
 * Called in multiple places when an inode should be destroyed.
 */
void
zfs_inode_destroy(struct inode *ip)
{
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ZTOZSB(zp);

	mutex_enter(&zfsvfs->z_znodes_lock);
	if (list_link_active(&zp->z_link_node)) {
		list_remove(&zfsvfs->z_all_znodes, zp);
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

	if (zp->z_acl_cached) {
		zfs_acl_free(zp->z_acl_cached);
		zp->z_acl_cached = NULL;
	}

	if (zp->z_xattr_cached) {
		nvlist_free(zp->z_xattr_cached);
		zp->z_xattr_cached = NULL;
	}

	kmem_cache_free(znode_cache, zp);
}

static void
zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip)
{
	uint64_t rdev = 0;

	switch (ip->i_mode & S_IFMT) {
	case S_IFREG:
		ip->i_op = &zpl_inode_operations;
		ip->i_fop = &zpl_file_operations;
		ip->i_mapping->a_ops = &zpl_address_space_operations;
		break;

	case S_IFDIR:
		ip->i_op = &zpl_dir_inode_operations;
		ip->i_fop = &zpl_dir_file_operations;
		ITOZ(ip)->z_zn_prefetch = B_TRUE;
		break;

	case S_IFLNK:
		ip->i_op = &zpl_symlink_inode_operations;
		break;

	/*
	 * rdev is only stored in a SA for device files.
	 */
	case S_IFCHR:
	case S_IFBLK:
		(void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs),
		    &rdev, sizeof (rdev));
		zfs_fallthrough;
	case S_IFIFO:
	case S_IFSOCK:
		init_special_inode(ip, ip->i_mode, rdev);
		ip->i_op = &zpl_special_inode_operations;
		break;

	default:
		zfs_panic_recover("inode %llu has invalid mode: 0x%x\n",
		    (u_longlong_t)ip->i_ino, ip->i_mode);

		/* Assume the inode is a file and attempt to continue */
		ip->i_mode = S_IFREG | 0644;
		ip->i_op = &zpl_inode_operations;
		ip->i_fop = &zpl_file_operations;
		ip->i_mapping->a_ops = &zpl_address_space_operations;
		break;
	}
}

static void
zfs_set_inode_flags(znode_t *zp, struct inode *ip)
{
	/*
	 * Linux and Solaris have different sets of file attributes, so we
	 * restrict this conversion to the intersection of the two.
	 */
	unsigned int flags = 0;
	if (zp->z_pflags & ZFS_IMMUTABLE)
		flags |= S_IMMUTABLE;
	if (zp->z_pflags & ZFS_APPENDONLY)
		flags |= S_APPEND;

	inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND);
}

/*
 * Update the embedded inode given the znode.
 */
void
zfs_znode_update_vfs(znode_t *zp)
{
	struct inode *ip;
	uint32_t blksize;
	u_longlong_t i_blocks;

	ASSERT(zp != NULL);
	ip = ZTOI(zp);

	/* Skip .zfs control nodes which do not exist on disk. */
	if (zfsctl_is_node(ip))
		return;

	dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks);

	spin_lock(&ip->i_lock);
	ip->i_mode = zp->z_mode;
	ip->i_blocks = i_blocks;
	i_size_write(ip, zp->z_size);
	spin_unlock(&ip->i_lock);
}


/*
 * Construct a znode+inode and initialize.
 *
 * This does not do a dmu_set_user() call; that is left to the caller,
 * in case the caller does not want to return the znode.
 */
static znode_t *
zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
    dmu_object_type_t obj_type, sa_handle_t *hdl)
{
	znode_t *zp;
	struct inode *ip;
	uint64_t mode;
	uint64_t parent;
	uint64_t tmp_gen;
	uint64_t links;
	uint64_t z_uid, z_gid;
	uint64_t atime[2], mtime[2], ctime[2], btime[2];
	inode_timespec_t tmp_ts;
	uint64_t projid = ZFS_DEFAULT_PROJID;
	sa_bulk_attr_t bulk[12];
	int count = 0;

	ASSERT(zfsvfs != NULL);

	ip = new_inode(zfsvfs->z_sb);
	if (ip == NULL)
		return (NULL);

	zp = ITOZ(ip);
	ASSERT(zp->z_dirlocks == NULL);
	ASSERT3P(zp->z_acl_cached, ==, NULL);
	ASSERT3P(zp->z_xattr_cached, ==, NULL);
	zp->z_unlinked = B_FALSE;
	zp->z_atime_dirty = B_FALSE;
	zp->z_is_ctldir = B_FALSE;
	zp->z_suspended = B_FALSE;
	zp->z_sa_hdl = NULL;
	zp->z_mapcnt = 0;
	zp->z_id = db->db_object;
	zp->z_blksz = blksz;
	zp->z_seq = 0x7A4653;
	zp->z_sync_cnt = 0;
	zp->z_sync_writes_cnt = 0;
	zp->z_async_writes_cnt = 0;

	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
	    &parent, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16);

	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 ||
	    (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
	    (zp->z_pflags & ZFS_PROJID) &&
	    sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
		if (hdl == NULL)
			sa_handle_destroy(zp->z_sa_hdl);
		zp->z_sa_hdl = NULL;
		goto error;
	}

	zp->z_projid = projid;
	zp->z_mode = ip->i_mode = mode;
	ip->i_generation = (uint32_t)tmp_gen;
	ip->i_blkbits = SPA_MINBLOCKSHIFT;
	set_nlink(ip, (uint32_t)links);
	zfs_uid_write(ip, z_uid);
	zfs_gid_write(ip, z_gid);
	zfs_set_inode_flags(zp, ip);

	/* Cache the xattr parent id */
	if (zp->z_pflags & ZFS_XATTR)
		zp->z_xattr_parent = parent;

	ZFS_TIME_DECODE(&tmp_ts, atime);
	zpl_inode_set_atime_to_ts(ip, tmp_ts);
	ZFS_TIME_DECODE(&tmp_ts, mtime);
	zpl_inode_set_mtime_to_ts(ip, tmp_ts);
	ZFS_TIME_DECODE(&tmp_ts, ctime);
	zpl_inode_set_ctime_to_ts(ip, tmp_ts);
	ZFS_TIME_DECODE(&zp->z_btime, btime);

	ip->i_ino = zp->z_id;
	zfs_znode_update_vfs(zp);
	zfs_inode_set_ops(zfsvfs, ip);

	/*
	 * The only way insert_inode_locked() can fail is if the ip->i_ino
	 * number is already hashed for this super block. This cannot
	 * normally happen because the inode numbers map 1:1 with the
	 * object numbers.
	 *
	 * Exceptions include rolling back a mounted file system, either
	 * from the zfs rollback or zfs recv command.
	 *
	 * Active inodes are unhashed during the rollback, but since zrele
	 * can happen asynchronously, we can't guarantee they've been
	 * unhashed. This can cause hash collisions in unlinked drain
	 * processing so do not hash unlinked znodes.
	 */
	if (links > 0)
		VERIFY3S(insert_inode_locked(ip), ==, 0);

	mutex_enter(&zfsvfs->z_znodes_lock);
	list_insert_tail(&zfsvfs->z_all_znodes, zp);
	mutex_exit(&zfsvfs->z_znodes_lock);

	if (links > 0)
		unlock_new_inode(ip);
	return (zp);

error:
	iput(ip);
	return (NULL);
}
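
/*
 * Illustrative sketch (not called anywhere): reading one SA attribute
 * outside the bulk path. sa_lookup() is the one-off equivalent of the
 * SA_ADD_BULK_ATTR()/sa_bulk_lookup() pattern used by zfs_znode_alloc()
 * above, which batches all attribute reads into a single SA operation.
 */
static int __maybe_unused
zfs_read_size_example(znode_t *zp, uint64_t *sizep)
{
	return (sa_lookup(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
	    sizep, sizeof (*sizep)));
}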

/*
 * Safely mark an inode dirty. Inodes which are part of a read-only
 * file system or snapshot may not be dirtied.
 */
void
zfs_mark_inode_dirty(struct inode *ip)
{
	zfsvfs_t *zfsvfs = ITOZSB(ip);

	if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
		return;

	mark_inode_dirty(ip);
}

static uint64_t empty_xattr;
static uint64_t pad[4];
static zfs_acl_phys_t acl_phys;
/*
 * Create a new DMU object to hold a zfs znode.
 *
 *	IN:	dzp	- parent directory for new znode
 *		vap	- file attributes for new znode
 *		tx	- dmu transaction id for zap operations
 *		cr	- credentials of caller
 *		flag	- flags:
 *			  IS_ROOT_NODE	- new object will be root
 *			  IS_TMPFILE	- new object is an O_TMPFILE file
 *			  IS_XATTR	- new object is an attribute
 *		acl_ids	- ACL related attributes
 *
 *	OUT:	zpp	- allocated znode (set to dzp if IS_ROOT_NODE)
 *
 */
void
zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
    uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
{
	uint64_t	crtime[2], atime[2], mtime[2], ctime[2];
	uint64_t	mode, size, links, parent, pflags;
	uint64_t	projid = ZFS_DEFAULT_PROJID;
	uint64_t	rdev = 0;
	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
	dmu_buf_t	*db;
	inode_timespec_t now;
	uint64_t	gen, obj;
	int		bonuslen;
	int		dnodesize;
	sa_handle_t	*sa_hdl;
	dmu_object_type_t obj_type;
	sa_bulk_attr_t	*sa_attrs;
	int		cnt = 0;
	zfs_acl_locator_cb_t locate = { 0 };
	znode_hold_t	*zh;

	if (zfsvfs->z_replay) {
		obj = vap->va_nodeid;
		now = vap->va_ctime;		/* see zfs_replay_create() */
		gen = vap->va_nblocks;		/* ditto */
		dnodesize = vap->va_fsid;	/* ditto */
	} else {
		obj = 0;
		gethrestime(&now);
		gen = dmu_tx_get_txg(tx);
		dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
	}

	if (dnodesize == 0)
		dnodesize = DNODE_MIN_SIZE;

	obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;

	bonuslen = (obj_type == DMU_OT_SA) ?
	    DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;

	/*
	 * Create a new DMU object.
	 */
	/*
	 * There's currently no mechanism for pre-reading the blocks that will
	 * be needed to allocate a new object, so we accept the small chance
	 * that there will be an i/o error and we will fail one of the
	 * assertions below.
	 */
	if (S_ISDIR(vap->va_mode)) {
		if (zfsvfs->z_replay) {
			VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    obj_type, bonuslen, dnodesize, tx));
		} else {
			obj = zap_create_norm_dnsize(zfsvfs->z_os,
			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    obj_type, bonuslen, dnodesize, tx);
		}
	} else {
		if (zfsvfs->z_replay) {
			VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    obj_type, bonuslen, dnodesize, tx));
		} else {
			obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    obj_type, bonuslen, dnodesize, tx);
		}
	}

	zh = zfs_znode_hold_enter(zfsvfs, obj);
	VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));

	/*
	 * If this is the root, fix up the half-initialized parent pointer
	 * to reference the just-allocated physical data area.
	 */
	if (flag & IS_ROOT_NODE) {
		dzp->z_id = obj;
	}

	/*
	 * If parent is an xattr, so am I.
	 */
	if (dzp->z_pflags & ZFS_XATTR) {
		flag |= IS_XATTR;
	}

	if (zfsvfs->z_use_fuids)
		pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
	else
		pflags = 0;

	if (S_ISDIR(vap->va_mode)) {
		size = 2;		/* contents ("." and "..") */
		links = 2;
	} else {
		size = 0;
		links = (flag & IS_TMPFILE) ? 0 : 1;
	}

	if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))
		rdev = vap->va_rdev;

	parent = dzp->z_id;
	mode = acl_ids->z_mode;
	if (flag & IS_XATTR)
		pflags |= ZFS_XATTR;

	if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) {
		/*
		 * With the ZFS_PROJID flag, we can easily know whether
		 * there is a project ID stored on disk or not. See
		 * zfs_space_delta_cb().
		 */
		if (obj_type != DMU_OT_ZNODE &&
		    dmu_objset_projectquota_enabled(zfsvfs->z_os))
			pflags |= ZFS_PROJID;

		/*
		 * Inherit the project ID from the parent if required.
		 */
		projid = zfs_inherit_projid(dzp);
		if (dzp->z_pflags & ZFS_PROJINHERIT)
			pflags |= ZFS_PROJINHERIT;
	}

	/*
	 * Whether "no execs denied" applies will be determined later,
	 * when zfs_mode_compute() is called.
	 */
	pflags |= acl_ids->z_aclp->z_hints &
	    (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
	    ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);

	ZFS_TIME_ENCODE(&now, crtime);
	ZFS_TIME_ENCODE(&now, ctime);

	if (vap->va_mask & ATTR_ATIME) {
		ZFS_TIME_ENCODE(&vap->va_atime, atime);
	} else {
		ZFS_TIME_ENCODE(&now, atime);
	}

	if (vap->va_mask & ATTR_MTIME) {
		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
	} else {
		ZFS_TIME_ENCODE(&now, mtime);
	}
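
	/*
	 * Worked example (illustrative): ZFS_TIME_ENCODE() stores a
	 * timespec as two uint64_t words, stmp[0] = tv_sec and
	 * stmp[1] = tv_nsec; this is the on-disk layout of every
	 * timestamp attribute installed below, and ZFS_TIME_DECODE()
	 * reverses it when the znode is read back in.
	 */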

	/* Now add in all of the "SA" attributes */
	VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL,
	    SA_HDL_SHARED, &sa_hdl));

	/*
	 * Set up the array of attributes to be replaced/set on the new file.
	 *
	 * The order for DMU_OT_ZNODE is critical since it needs to be
	 * constructed in the old znode_phys_t format. Don't change this
	 * ordering.
	 */
	sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);

	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
		    NULL, &atime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
		    NULL, &mtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
		    NULL, &ctime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
		    NULL, &crtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
		    NULL, &gen, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
		    NULL, &mode, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
		    NULL, &size, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
		    NULL, &parent, 8);
	} else {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
		    NULL, &mode, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
		    NULL, &size, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
		    NULL, &gen, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
		    NULL, &acl_ids->z_fuid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
		    NULL, &acl_ids->z_fgid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
		    NULL, &parent, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
		    NULL, &pflags, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
		    NULL, &atime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
		    NULL, &mtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
		    NULL, &ctime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
		    NULL, &crtime, 16);
	}

	SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);

	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
		    &empty_xattr, 8);
	} else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
	    pflags & ZFS_PROJID) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs),
		    NULL, &projid, 8);
	}
	if (obj_type == DMU_OT_ZNODE ||
	    (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
		    NULL, &rdev, 8);
	}
	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
		    NULL, &pflags, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
		    &acl_ids->z_fuid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
		    &acl_ids->z_fgid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
		    sizeof (uint64_t) * 4);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
		    &acl_phys, sizeof (zfs_acl_phys_t));
	} else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
		    &acl_ids->z_aclp->z_acl_count, 8);
		locate.cb_aclp = acl_ids->z_aclp;
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
		    zfs_acl_data_locator, &locate,
		    acl_ids->z_aclp->z_acl_bytes);
		mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
		    acl_ids->z_fuid, acl_ids->z_fgid);
	}

	VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);

	if (!(flag & IS_ROOT_NODE)) {
		/*
		 * The call to zfs_znode_alloc() may fail if memory is low
		 * via the call path: alloc_inode() -> inode_init_always() ->
		 * security_inode_alloc() -> inode_alloc_security().
		 * Since the existing code is written such that zfs_mknode()
		 * cannot fail, retry until sufficient memory has been
		 * reclaimed.
		 */
		do {
			*zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
		} while (*zpp == NULL);

		VERIFY(*zpp != NULL);
		VERIFY(dzp != NULL);
	} else {
		/*
		 * If we are creating the root node, the "parent" we
		 * passed in is the znode for the root.
		 */
		*zpp = dzp;

		(*zpp)->z_sa_hdl = sa_hdl;
	}

	(*zpp)->z_pflags = pflags;
	(*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode;
	(*zpp)->z_dnodesize = dnodesize;
	(*zpp)->z_projid = projid;

	if (obj_type == DMU_OT_ZNODE ||
	    acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
		VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
	}
	kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
	zfs_znode_hold_exit(zfsvfs, zh);
}

/*
 * Update in-core attributes. It is assumed the caller will be doing an
 * sa_bulk_update to push the changes out.
 */
void
zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
{
	xoptattr_t *xoap;
	boolean_t update_inode = B_FALSE;

	xoap = xva_getxoptattr(xvap);
	ASSERT(xoap);

	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
		uint64_t times[2];
		ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
		(void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)),
		    &times, sizeof (times), tx);
		XVA_SET_RTN(xvap, XAT_CREATETIME);
	}
	if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
		ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_READONLY);
	}
	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
		ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_HIDDEN);
	}
	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
		ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_SYSTEM);
	}
	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
		ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_ARCHIVE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_IMMUTABLE);

		update_inode = B_TRUE;
	}
	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_NOUNLINK);
	}
	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_APPENDONLY);

		update_inode = B_TRUE;
	}
	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_NODUMP);
	}
	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_OPAQUE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
		    xoap->xoa_av_quarantined, zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
		zfs_sa_set_scanstamp(zp, xvap, tx);
		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
	}
	if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
		ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_REPARSE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
		ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_OFFLINE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
		ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_SPARSE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
		ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_PROJINHERIT);
	}

	if (update_inode)
		zfs_set_inode_flags(zp, ZTOI(zp));
}

int
zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
{
	dmu_object_info_t doi;
	dmu_buf_t *db;
	znode_t *zp;
	znode_hold_t *zh;
	int err;
	sa_handle_t *hdl;

	*zpp = NULL;

again:
	zh = zfs_znode_hold_enter(zfsvfs, obj_num);

	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
	if (err) {
		zfs_znode_hold_exit(zfsvfs, zh);
		return (err);
	}

	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_SA &&
	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
		sa_buf_rele(db, NULL);
		zfs_znode_hold_exit(zfsvfs, zh);
		return (SET_ERROR(EINVAL));
	}

	hdl = dmu_buf_get_user(db);
	if (hdl != NULL) {
		zp = sa_get_userdata(hdl);

		/*
		 * Since "SA" does immediate eviction we
		 * should never find a sa handle that doesn't
		 * know about the znode.
		 */
		ASSERT3P(zp, !=, NULL);

		mutex_enter(&zp->z_lock);
		ASSERT3U(zp->z_id, ==, obj_num);
		/*
		 * If zp->z_unlinked is set, the znode is already marked
		 * for deletion and should not be discovered. Check this
		 * after checking igrab() due to fsetxattr() & O_TMPFILE.
		 *
		 * If igrab() returns NULL the VFS has independently
		 * determined the inode should be evicted and has
		 * called iput_final() to start the eviction process.
		 * The SA handle is still valid but because the VFS
		 * requires that the eviction succeed we must drop
		 * our locks and references to allow the eviction to
		 * complete. The zfs_zget() may then be retried.
		 *
		 * This unlikely case could be optimized by registering
		 * a sops->drop_inode() callback. The callback would
		 * need to detect the active SA hold thereby informing
		 * the VFS that this inode should not be evicted.
		 */
		if (igrab(ZTOI(zp)) == NULL) {
			if (zp->z_unlinked)
				err = SET_ERROR(ENOENT);
			else
				err = SET_ERROR(EAGAIN);
		} else {
			*zpp = zp;
			err = 0;
		}

		mutex_exit(&zp->z_lock);
		sa_buf_rele(db, NULL);
		zfs_znode_hold_exit(zfsvfs, zh);

		if (err == EAGAIN) {
			/* inode might need this to finish evict */
			cond_resched();
			goto again;
		}
		return (err);
	}

	/*
	 * Not found: create a new znode/vnode, but only if the file exists.
	 *
	 * There is a small window where zfs_vget() could
	 * find this object while a file create is still in
	 * progress. This is checked for in zfs_znode_alloc().
	 *
	 * If zfs_znode_alloc() fails it will drop the hold on the
	 * bonus buffer.
	 */
	zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
	    doi.doi_bonus_type, NULL);
	if (zp == NULL) {
		err = SET_ERROR(ENOENT);
	} else {
		*zpp = zp;
	}
	zfs_znode_hold_exit(zfsvfs, zh);
	return (err);
}
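
/*
 * Illustrative sketch (not called anywhere): the usual zfs_zget()
 * calling convention. A successful lookup returns a referenced znode
 * which must be released with zrele() once the caller is done with it.
 */
static int __maybe_unused
zfs_zget_example(zfsvfs_t *zfsvfs, uint64_t obj)
{
	znode_t *zp;
	int error;

	error = zfs_zget(zfsvfs, obj, &zp);
	if (error == 0) {
		/* ... operate on zp ... */
		zrele(zp);
	}
	return (error);
}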

int
zfs_rezget(znode_t *zp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	dmu_object_info_t doi;
	dmu_buf_t *db;
	uint64_t obj_num = zp->z_id;
	uint64_t mode;
	uint64_t links;
	sa_bulk_attr_t bulk[11];
	int err;
	int count = 0;
	uint64_t gen;
	uint64_t z_uid, z_gid;
	uint64_t atime[2], mtime[2], ctime[2], btime[2];
	inode_timespec_t tmp_ts;
	uint64_t projid = ZFS_DEFAULT_PROJID;
	znode_hold_t *zh;

	/*
	 * Skip ctldir znodes, otherwise they will always get invalidated.
	 * This will cause funny behaviour for the mounted snapdirs.
	 * Especially for Linux >= 3.18, d_invalidate will detach the
	 * mountpoint and prevent anyone from automounting it again as
	 * long as someone is still using the detached mount.
	 */
	if (zp->z_is_ctldir)
		return (0);

	zh = zfs_znode_hold_enter(zfsvfs, obj_num);

	mutex_enter(&zp->z_acl_lock);
	if (zp->z_acl_cached) {
		zfs_acl_free(zp->z_acl_cached);
		zp->z_acl_cached = NULL;
	}
	mutex_exit(&zp->z_acl_lock);

	rw_enter(&zp->z_xattr_lock, RW_WRITER);
	if (zp->z_xattr_cached) {
		nvlist_free(zp->z_xattr_cached);
		zp->z_xattr_cached = NULL;
	}
	rw_exit(&zp->z_xattr_lock);

	ASSERT(zp->z_sa_hdl == NULL);
	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
	if (err) {
		zfs_znode_hold_exit(zfsvfs, zh);
		return (err);
	}

	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_SA &&
	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
		sa_buf_rele(db, NULL);
		zfs_znode_hold_exit(zfsvfs, zh);
		return (SET_ERROR(EINVAL));
	}

	zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);

	/* reload cached values */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
	    &gen, sizeof (gen));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, sizeof (zp->z_size));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
	    &links, sizeof (links));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, sizeof (zp->z_pflags));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
	    &z_uid, sizeof (z_uid));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
	    &z_gid, sizeof (z_gid));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
	    &mode, sizeof (mode));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
	    &atime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
	    &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
	    &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16);

	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
		zfs_znode_dmu_fini(zp);
		zfs_znode_hold_exit(zfsvfs, zh);
		return (SET_ERROR(EIO));
	}

	if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) {
		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs),
		    &projid, 8);
		if (err != 0 && err != ENOENT) {
			zfs_znode_dmu_fini(zp);
			zfs_znode_hold_exit(zfsvfs, zh);
			return (SET_ERROR(err));
		}
	}

	zp->z_projid = projid;
	zp->z_mode = ZTOI(zp)->i_mode = mode;
	zfs_uid_write(ZTOI(zp), z_uid);
	zfs_gid_write(ZTOI(zp), z_gid);

	ZFS_TIME_DECODE(&tmp_ts, atime);
	zpl_inode_set_atime_to_ts(ZTOI(zp), tmp_ts);
	ZFS_TIME_DECODE(&tmp_ts, mtime);
	zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts);
	ZFS_TIME_DECODE(&tmp_ts, ctime);
	zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts);
	ZFS_TIME_DECODE(&zp->z_btime, btime);

	if ((uint32_t)gen != ZTOI(zp)->i_generation) {
		zfs_znode_dmu_fini(zp);
		zfs_znode_hold_exit(zfsvfs, zh);
		return (SET_ERROR(EIO));
	}

	set_nlink(ZTOI(zp), (uint32_t)links);
	zfs_set_inode_flags(zp, ZTOI(zp));

	zp->z_blksz = doi.doi_data_block_size;
	zp->z_atime_dirty = B_FALSE;
	zfs_znode_update_vfs(zp);

	/*
	 * If the file has zero links, then it has been unlinked on the send
	 * side and it must be in the received unlinked set.
	 * We call zfs_znode_dmu_fini() now to prevent any accesses to the
	 * stale data and to prevent automatic removal of the file in
	 * zfs_zinactive(). The file will be removed either when it is removed
	 * on the send side and the next incremental stream is received or
	 * when the unlinked set gets processed.
	 */
	zp->z_unlinked = (ZTOI(zp)->i_nlink == 0);
	if (zp->z_unlinked)
		zfs_znode_dmu_fini(zp);

	zfs_znode_hold_exit(zfsvfs, zh);

	return (0);
}

void
zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	objset_t *os = zfsvfs->z_os;
	uint64_t obj = zp->z_id;
	uint64_t acl_obj = zfs_external_acl(zp);
	znode_hold_t *zh;

	zh = zfs_znode_hold_enter(zfsvfs, obj);
	if (acl_obj) {
		VERIFY(!zp->z_is_sa);
		VERIFY(0 == dmu_object_free(os, acl_obj, tx));
	}
	VERIFY(0 == dmu_object_free(os, obj, tx));
	zfs_znode_dmu_fini(zp);
	zfs_znode_hold_exit(zfsvfs, zh);
}

void
zfs_zinactive(znode_t *zp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	uint64_t z_id = zp->z_id;
	znode_hold_t *zh;

	ASSERT(zp->z_sa_hdl);

	/*
	 * Don't allow a zfs_zget() while we're trying to release this znode.
	 */
	zh = zfs_znode_hold_enter(zfsvfs, z_id);

	mutex_enter(&zp->z_lock);

	/*
	 * If this was the last reference to a file with no links, remove
	 * the file from the file system unless the file system is mounted
	 * read-only. That can happen, for example, if the file system was
	 * originally read-write, the file was opened, then unlinked and
	 * the file system was made read-only before the file was finally
	 * closed. The file will remain in the unlinked set.
	 */
	if (zp->z_unlinked) {
		ASSERT(!zfsvfs->z_issnap);
		if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress) {
			mutex_exit(&zp->z_lock);
			zfs_znode_hold_exit(zfsvfs, zh);
			zfs_rmnode(zp);
			return;
		}
	}

	mutex_exit(&zp->z_lock);
	zfs_znode_dmu_fini(zp);

	zfs_znode_hold_exit(zfsvfs, zh);
}

/*
 * Determine whether the znode's atime must be updated. The logic mostly
 * duplicates the Linux kernel's relatime_need_update() functionality.
 * This function is only called if the underlying filesystem actually has
 * atime updates enabled.
 */
boolean_t
zfs_relatime_need_update(const struct inode *ip)
{
	inode_timespec_t now, tmp_atime, tmp_ts;

	gethrestime(&now);
	tmp_atime = zpl_inode_get_atime(ip);
	/*
	 * In relatime mode, only update the atime if the previous atime
	 * is earlier than either the ctime or mtime or if at least a day
	 * has passed since the last update of atime.
	 */
	tmp_ts = zpl_inode_get_mtime(ip);
	if (timespec64_compare(&tmp_ts, &tmp_atime) >= 0)
		return (B_TRUE);

	tmp_ts = zpl_inode_get_ctime(ip);
	if (timespec64_compare(&tmp_ts, &tmp_atime) >= 0)
		return (B_TRUE);

	if ((hrtime_t)now.tv_sec - (hrtime_t)tmp_atime.tv_sec >= 24*60*60)
		return (B_TRUE);

	return (B_FALSE);
}

/*
 * Prepare to update znode time stamps.
 *
 *	IN:	zp	- znode requiring timestamp update
 *		flag	- ATTR_MTIME, ATTR_CTIME flags
 *
 *	OUT:	zp	- z_seq
 *		mtime	- new mtime
 *		ctime	- new ctime
 *
 * Note: We don't update atime here, because we rely on the Linux VFS
 * to do atime updating.
 */
void
zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
    uint64_t ctime[2])
{
	inode_timespec_t now, tmp_ts;

	gethrestime(&now);

	zp->z_seq++;

	if (flag & ATTR_MTIME) {
		ZFS_TIME_ENCODE(&now, mtime);
		ZFS_TIME_DECODE(&tmp_ts, mtime);
		zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts);
		if (ZTOZSB(zp)->z_use_fuids) {
			zp->z_pflags |= (ZFS_ARCHIVE |
			    ZFS_AV_MODIFIED);
		}
	}

	if (flag & ATTR_CTIME) {
		ZFS_TIME_ENCODE(&now, ctime);
		ZFS_TIME_DECODE(&tmp_ts, ctime);
		zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts);
		if (ZTOZSB(zp)->z_use_fuids)
			zp->z_pflags |= ZFS_ARCHIVE;
	}
}

/*
 * Grow the block size for a file.
 *
 *	IN:	zp	- znode of the file
 *		size	- requested block size
 *		tx	- open transaction.
 *
 * NOTE: this function assumes that the znode is write locked.
 */
void
zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
{
	int		error;
	u_longlong_t	dummy;

	if (size <= zp->z_blksz)
		return;
	/*
	 * If the file size is already greater than the current blocksize,
	 * we will not grow. If there is more than one block in a file,
	 * the blocksize cannot change.
	 */
	if (zp->z_blksz && zp->z_size > zp->z_blksz)
		return;

	error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id,
	    size, 0, tx);

	if (error == ENOTSUP)
		return;
	ASSERT0(error);

	/* What blocksize did we actually get? */
	dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
}
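
/*
 * Worked example (illustrative) for the blocksize growth in zfs_extend()
 * below, assuming highbit64() returns the 1-based index of the highest
 * set bit: if z_blksz was previously grown to 192K and "recordsize" was
 * then lowered to 128K, z_blksz (not a power of 2) exceeds z_max_blksz,
 * so the new blocksize is capped at 1 << highbit64(196608) == 256K,
 * i.e. it may only round up to the next power of 2. Otherwise growth is
 * simply capped at z_max_blksz.
 */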

/*
 * Increase the file length
 *
 *	IN:	zp	- znode of file to free data in.
 *		end	- new end-of-file
 *
 * RETURN:	0 on success, error code on failure
 */
static int
zfs_extend(znode_t *zp, uint64_t end)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	dmu_tx_t *tx;
	zfs_locked_range_t *lr;
	uint64_t newblksz;
	int error;

	/*
	 * We will change zp_size, lock the whole file.
	 */
	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (end <= zp->z_size) {
		zfs_rangelock_exit(lr);
		return (0);
	}
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	if (end > zp->z_blksz &&
	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
		/*
		 * We are growing the file past the current block size.
		 */
		if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) {
			/*
			 * File's blocksize is already larger than the
			 * "recordsize" property. Only let it grow to
			 * the next power of 2.
			 */
			ASSERT(!ISP2(zp->z_blksz));
			newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
		} else {
			newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz);
		}
		dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
	} else {
		newblksz = 0;
	}

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		zfs_rangelock_exit(lr);
		return (error);
	}

	if (newblksz)
		zfs_grow_blocksize(zp, newblksz, tx);

	zp->z_size = end;

	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
	    &zp->z_size, sizeof (zp->z_size), tx));

	zfs_rangelock_exit(lr);

	dmu_tx_commit(tx);

	return (0);
}

/*
 * zfs_zero_partial_page - Modeled after update_pages() but
 * with different arguments and semantics for use by zfs_freesp().
 *
 * Zeroes a piece of a single page cache entry for zp at offset
 * start and length len.
 *
 * Caller must acquire a range lock on the file for the region
 * being zeroed in order that the ARC and page cache stay in sync.
 */
static void
zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len)
{
	struct address_space *mp = ZTOI(zp)->i_mapping;
	struct page *pp;
	int64_t	off;
	void *pb;

	ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK));

	off = start & (PAGE_SIZE - 1);
	start &= PAGE_MASK;

	pp = find_lock_page(mp, start >> PAGE_SHIFT);
	if (pp) {
		if (mapping_writably_mapped(mp))
			flush_dcache_page(pp);

		pb = kmap(pp);
		memset(pb + off, 0, len);
		kunmap(pp);

		if (mapping_writably_mapped(mp))
			flush_dcache_page(pp);

		mark_page_accessed(pp);
		SetPageUptodate(pp);
		ClearPageError(pp);
		unlock_page(pp);
		put_page(pp);
	}
}

/*
 * Free space in a file.
 *
 *	IN:	zp	- znode of file to free data in.
 *		off	- start of section to free.
 *		len	- length of section to free.
 *
 * RETURN:	0 on success, error code on failure
 */
static int
zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	zfs_locked_range_t *lr;
	int error;

	/*
	 * Lock the range being freed.
	 */
	lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (off >= zp->z_size) {
		zfs_rangelock_exit(lr);
		return (0);
	}

	if (off + len > zp->z_size)
		len = zp->z_size - off;

	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);

	/*
	 * Zero partial page cache entries. This must be done under a
	 * range lock in order to keep the ARC and page cache in sync.
	 */
	if (zn_has_cached_data(zp, off, off + len - 1)) {
		loff_t first_page, last_page, page_len;
		loff_t first_page_offset, last_page_offset;

		/* first possible full page in hole */
		first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT;
		/* last page of hole */
		last_page = (off + len) >> PAGE_SHIFT;

		/* offset of first_page */
		first_page_offset = first_page << PAGE_SHIFT;
		/* offset of last_page */
		last_page_offset = last_page << PAGE_SHIFT;

		/* truncate whole pages */
		if (last_page_offset > first_page_offset) {
			truncate_inode_pages_range(ZTOI(zp)->i_mapping,
			    first_page_offset, last_page_offset - 1);
		}

		/* truncate sub-page ranges */
		if (first_page > last_page) {
			/* entire punched area within a single page */
			zfs_zero_partial_page(zp, off, len);
		} else {
			/* beginning of punched area at the end of a page */
			page_len = first_page_offset - off;
			if (page_len > 0)
				zfs_zero_partial_page(zp, off, page_len);

			/* end of punched area at the beginning of a page */
			page_len = off + len - last_page_offset;
			if (page_len > 0)
				zfs_zero_partial_page(zp, last_page_offset,
				    page_len);
		}
	}
	zfs_rangelock_exit(lr);

	return (error);
}
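
/*
 * Worked example (illustrative) for the partial-page handling above,
 * assuming 4K pages: punching off = 3000, len = 10000 gives
 * first_page = 1, last_page = 3, first_page_offset = 4096 and
 * last_page_offset = 12288. Whole pages [4096, 12287] are truncated
 * from the page cache, then the head [3000, 4095] (1096 bytes) and
 * the tail [12288, 12999] (712 bytes) are zeroed in place.
 */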

/*
 * Truncate a file.
 *
 *	IN:	zp	- znode of file to free data in.
 *		end	- new end-of-file.
 *
 * RETURN:	0 on success, error code on failure
 */
static int
zfs_trunc(znode_t *zp, uint64_t end)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	dmu_tx_t *tx;
	zfs_locked_range_t *lr;
	int error;
	sa_bulk_attr_t bulk[2];
	int count = 0;

	/*
	 * We will change zp_size, lock the whole file.
	 */
	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (end >= zp->z_size) {
		zfs_rangelock_exit(lr);
		return (0);
	}

	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
	    DMU_OBJECT_END);
	if (error) {
		zfs_rangelock_exit(lr);
		return (error);
	}
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		zfs_rangelock_exit(lr);
		return (error);
	}

	zp->z_size = end;
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
	    NULL, &zp->z_size, sizeof (zp->z_size));

	if (end == 0) {
		zp->z_pflags &= ~ZFS_SPARSE;
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
		    NULL, &zp->z_pflags, 8);
	}
	VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);

	dmu_tx_commit(tx);
	zfs_rangelock_exit(lr);

	return (0);
}

/*
 * Free space in a file
 *
 *	IN:	zp	- znode of file to free data in.
 *		off	- start of range
 *		len	- length of range (0 => truncate the file to off)
 *		flag	- current file open mode flags.
 *		log	- TRUE if this action should be logged
 *
 * RETURN:	0 on success, error code on failure
 */
int
zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
{
	dmu_tx_t *tx;
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	zilog_t *zilog = zfsvfs->z_log;
	uint64_t mode;
	uint64_t mtime[2], ctime[2];
	sa_bulk_attr_t bulk[3];
	int count = 0;
	int error;

	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
	    sizeof (mode))) != 0)
		return (error);

	if (off > zp->z_size) {
		error = zfs_extend(zp, off+len);
		if (error == 0 && log)
			goto log;
		goto out;
	}

	if (len == 0) {
		error = zfs_trunc(zp, off);
	} else {
		if ((error = zfs_free_range(zp, off, len)) == 0 &&
		    off + len > zp->z_size)
			error = zfs_extend(zp, off+len);
	}
	if (error || !log)
		goto out;
log:
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		goto out;
	}

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
	    NULL, &zp->z_pflags, 8);
	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
	ASSERT(error == 0);

	zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);

	dmu_tx_commit(tx);

	zfs_znode_update_vfs(zp);
	error = 0;

out:
	/*
	 * Truncate the page cache - for file truncate operations, use
	 * the purpose-built API for truncations. For punching operations,
	 * the truncation is handled under a range lock in zfs_free_range.
	 */
	if (len == 0)
		truncate_setsize(ZTOI(zp), off);
	return (error);
}
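
/*
 * Illustrative sketch (not called anywhere): the two call shapes of
 * zfs_freesp(). The open-mode "flag" argument is not referenced in the
 * body above, so this sketch simply passes 0.
 */
static int __maybe_unused
zfs_freesp_example(znode_t *zp)
{
	int error;

	/* Truncate the file to 4K: len == 0 sets the EOF to off. */
	error = zfs_freesp(zp, 4096, 0, 0, B_TRUE);
	if (error != 0)
		return (error);

	/*
	 * Punch a 64K hole at offset 128K, assuming the file is larger
	 * than that; for a smaller file this extends it instead.
	 */
	return (zfs_freesp(zp, 131072, 65536, 0, B_TRUE));
}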

void
zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
{
	struct super_block *sb;
	zfsvfs_t	*zfsvfs;
	uint64_t	moid, obj, sa_obj, version;
	uint64_t	sense = ZFS_CASE_SENSITIVE;
	uint64_t	norm = 0;
	nvpair_t	*elem;
	int		size;
	int		error;
	int		i;
	znode_t		*rootzp = NULL;
	vattr_t		vattr;
	znode_t		*zp;
	zfs_acl_ids_t	acl_ids;

	/*
	 * First attempt to create the master node.
	 */
	/*
	 * In an empty objset, there are no blocks to read and thus
	 * there can be no i/o errors (which we assert below).
	 */
	moid = MASTER_NODE_OBJ;
	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	/*
	 * Set starting attributes.
	 */
	version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
	elem = NULL;
	while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
		/* For the moment we expect all zpl props to be uint64_ts */
		uint64_t val;
		const char *name;

		ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
		VERIFY(nvpair_value_uint64(elem, &val) == 0);
		name = nvpair_name(elem);
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
			if (val < version)
				version = val;
		} else {
			error = zap_update(os, moid, name, 8, 1, &val, tx);
		}
		ASSERT(error == 0);
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
			norm = val;
		else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
			sense = val;
	}
	ASSERT(version != 0);
	error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
	ASSERT(error == 0);

	/*
	 * Create a zap object used for SA attribute registration.
	 */
	if (version >= ZPL_VERSION_SA) {
		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
		    DMU_OT_NONE, 0, tx);
		error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
		ASSERT(error == 0);
	} else {
		sa_obj = 0;
	}
	/*
	 * Create a delete queue.
	 */
	obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);

	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
	ASSERT(error == 0);

	/*
	 * Create the root znode. Create a minimal znode/inode/zfsvfs/sb
	 * to allow zfs_mknode to work.
	 */
	vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID;
	vattr.va_mode = S_IFDIR|0755;
	vattr.va_uid = crgetuid(cr);
	vattr.va_gid = crgetgid(cr);

	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	rootzp->z_unlinked = B_FALSE;
	rootzp->z_atime_dirty = B_FALSE;
	rootzp->z_is_sa = USE_SA(version, os);
	rootzp->z_pflags = 0;

	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
	zfsvfs->z_os = os;
	zfsvfs->z_parent = zfsvfs;
	zfsvfs->z_version = version;
	zfsvfs->z_use_fuids = USE_FUIDS(version, os);
	zfsvfs->z_use_sa = USE_SA(version, os);
	zfsvfs->z_norm = norm;

	sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP);
	sb->s_fs_info = zfsvfs;

	ZTOI(rootzp)->i_sb = sb;

	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
	    &zfsvfs->z_attr_table);

	ASSERT(error == 0);

	/*
	 * Fold case on file systems that are always or sometimes case
	 * insensitive.
	 */
	if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;

	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
	    offsetof(znode_t, z_link_node));

	size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX);
	zfsvfs->z_hold_size = size;
	zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size,
	    KM_SLEEP);
	zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
	for (i = 0; i != size; i++) {
		avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare,
		    sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node));
		mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
	}

	VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
	    cr, NULL, &acl_ids, zfs_init_idmap));
	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
	ASSERT3P(zp, ==, rootzp);
	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
	ASSERT(error == 0);
	zfs_acl_ids_free(&acl_ids);

	atomic_set(&ZTOI(rootzp)->i_count, 0);
	sa_handle_destroy(rootzp->z_sa_hdl);
	kmem_cache_free(znode_cache, rootzp);

	for (i = 0; i != size; i++) {
		avl_destroy(&zfsvfs->z_hold_trees[i]);
		mutex_destroy(&zfsvfs->z_hold_locks[i]);
	}

	mutex_destroy(&zfsvfs->z_znodes_lock);

	vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
	vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
	kmem_free(sb, sizeof (struct super_block));
	kmem_free(zfsvfs, sizeof (zfsvfs_t));
}

EXPORT_SYMBOL(zfs_create_fs);
EXPORT_SYMBOL(zfs_obj_to_path);

module_param(zfs_object_mutex_size, uint, 0644);
MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array");
module_param(zfs_unlink_suspend_progress, int, 0644);
MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks "
	"(debug - leaks space into the unlinked set)");