1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Portions Copyright 2007 Jeremy Teo */ 27 28 #ifdef _KERNEL 29 #include <sys/types.h> 30 #include <sys/param.h> 31 #include <sys/time.h> 32 #include <sys/systm.h> 33 #include <sys/sysmacros.h> 34 #include <sys/resource.h> 35 #include <sys/mntent.h> 36 #include <sys/u8_textprep.h> 37 #include <sys/dsl_dataset.h> 38 #include <sys/vfs.h> 39 #include <sys/vnode.h> 40 #include <sys/file.h> 41 #include <sys/kmem.h> 42 #include <sys/errno.h> 43 #include <sys/unistd.h> 44 #include <sys/atomic.h> 45 #include <sys/zfs_dir.h> 46 #include <sys/zfs_acl.h> 47 #include <sys/zfs_ioctl.h> 48 #include <sys/zfs_rlock.h> 49 #include <sys/zfs_fuid.h> 50 #include <sys/fs/zfs.h> 51 #include <sys/kidmap.h> 52 #endif /* _KERNEL */ 53 54 #include <sys/dmu.h> 55 #include <sys/refcount.h> 56 #include <sys/stat.h> 57 #include <sys/zap.h> 58 #include <sys/zfs_znode.h> 59 60 #include "zfs_prop.h" 61 62 #if defined(_KERNEL) && defined(__NetBSD__) 63 #include <miscfs/specfs/specdev.h> 64 static const struct genfs_ops zfs_genfsops = { 65 
.gop_write = genfs_compat_gop_write, 66 }; 67 68 #endif 69 70 extern int (**zfs_vnodeop_p)(void *); 71 extern int (**zfs_fifoop_p)(void *); 72 extern int (**zfs_specop_p)(void *); 73 74 /* 75 * Define ZNODE_STATS to turn on statistic gathering. By default, it is only 76 * turned on when DEBUG is also defined. 77 */ 78 #ifdef DEBUG 79 #define ZNODE_STATS 80 #endif /* DEBUG */ 81 82 #ifdef ZNODE_STATS 83 #define ZNODE_STAT_ADD(stat) ((stat)++) 84 #else 85 #define ZNODE_STAT_ADD(stat) /* nothing */ 86 #endif /* ZNODE_STATS */ 87 88 #define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3)) 89 #define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1)) 90 91 /* 92 * Functions needed for userland (ie: libzpool) are not put under 93 * #ifdef_KERNEL; the rest of the functions have dependencies 94 * (such as VFS logic) that will not compile easily in userland. 95 */ 96 #ifdef _KERNEL 97 static kmem_cache_t *znode_cache = NULL; 98 99 /*ARGSUSED*/ 100 static void 101 znode_evict_error(dmu_buf_t *dbuf, void *user_ptr) 102 { 103 /* 104 * We should never drop all dbuf refs without first clearing 105 * the eviction callback. 
106 */ 107 panic("evicting znode %p\n", user_ptr); 108 } 109 110 /*ARGSUSED*/ 111 static int 112 zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) 113 { 114 znode_t *zp = arg; 115 116 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); 117 118 list_link_init(&zp->z_link_node); 119 120 mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); 121 rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL); 122 rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); 123 rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL); 124 mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); 125 126 mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL); 127 avl_create(&zp->z_range_avl, zfs_range_compare, 128 sizeof (rl_t), offsetof(rl_t, r_node)); 129 130 zp->z_dbuf = NULL; 131 zp->z_dirlocks = NULL; 132 return (0); 133 } 134 135 /*ARGSUSED*/ 136 static void 137 zfs_znode_cache_destructor(void *buf, void *arg) 138 { 139 znode_t *zp = arg; 140 141 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); 142 ASSERT(ZTOV(zp) == NULL); 143 144 ASSERT(!list_link_active(&zp->z_link_node)); 145 mutex_destroy(&zp->z_lock); 146 rw_destroy(&zp->z_map_lock); 147 rw_destroy(&zp->z_parent_lock); 148 rw_destroy(&zp->z_name_lock); 149 mutex_destroy(&zp->z_acl_lock); 150 avl_destroy(&zp->z_range_avl); 151 mutex_destroy(&zp->z_range_lock); 152 153 ASSERT(zp->z_dbuf == NULL); 154 ASSERT(zp->z_dirlocks == NULL); 155 } 156 157 #ifdef ZNODE_STATS 158 static struct { 159 uint64_t zms_zfsvfs_invalid; 160 uint64_t zms_zfsvfs_unmounted; 161 uint64_t zms_zfsvfs_recheck_invalid; 162 uint64_t zms_obj_held; 163 uint64_t zms_vnode_locked; 164 uint64_t zms_not_only_dnlc; 165 } znode_move_stats; 166 #endif /* ZNODE_STATS */ 167 168 static void 169 zfs_znode_move_impl(znode_t *ozp, znode_t *nzp) 170 { 171 vnode_t *vp; 172 173 /* Copy fields. */ 174 nzp->z_zfsvfs = ozp->z_zfsvfs; 175 176 /* Swap vnodes. 
*/ 177 vp = nzp->z_vnode; 178 nzp->z_vnode = ozp->z_vnode; 179 ozp->z_vnode = vp; /* let destructor free the overwritten vnode */ 180 ZTOV(ozp)->v_data = ozp; 181 ZTOV(nzp)->v_data = nzp; 182 183 nzp->z_id = ozp->z_id; 184 ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */ 185 ASSERT(avl_numnodes(&ozp->z_range_avl) == 0); 186 nzp->z_unlinked = ozp->z_unlinked; 187 nzp->z_atime_dirty = ozp->z_atime_dirty; 188 nzp->z_zn_prefetch = ozp->z_zn_prefetch; 189 nzp->z_blksz = ozp->z_blksz; 190 nzp->z_seq = ozp->z_seq; 191 nzp->z_mapcnt = ozp->z_mapcnt; 192 nzp->z_last_itx = ozp->z_last_itx; 193 nzp->z_gen = ozp->z_gen; 194 nzp->z_sync_cnt = ozp->z_sync_cnt; 195 nzp->z_phys = ozp->z_phys; 196 nzp->z_dbuf = ozp->z_dbuf; 197 198 /* Update back pointers. */ 199 (void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys, 200 znode_evict_error); 201 202 /* 203 * Invalidate the original znode by clearing fields that provide a 204 * pointer back to the znode. Set the low bit of the vfs pointer to 205 * ensure that zfs_znode_move() recognizes the znode as invalid in any 206 * subsequent callback. 207 */ 208 ozp->z_dbuf = NULL; 209 POINTER_INVALIDATE(&ozp->z_zfsvfs); 210 } 211 212 /* 213 * Wrapper function for ZFS_ENTER that returns 0 if successful and otherwise 214 * returns a non-zero error code. 215 */ 216 static int 217 zfs_enter(zfsvfs_t *zfsvfs) 218 { 219 ZFS_ENTER(zfsvfs); 220 return (0); 221 } 222 223 #ifndef __NetBSD__ 224 /*ARGSUSED*/ 225 static kmem_cbrc_t 226 zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg) 227 { 228 znode_t *ozp = buf, *nzp = newbuf; 229 zfsvfs_t *zfsvfs; 230 vnode_t *vp; 231 232 /* 233 * The znode is on the file system's list of known znodes if the vfs 234 * pointer is valid. We set the low bit of the vfs pointer when freeing 235 * the znode to invalidate it, and the memory patterns written by kmem 236 * (baddcafe and deadbeef) set at least one of the two low bits. 
A newly 237 * created znode sets the vfs pointer last of all to indicate that the 238 * znode is known and in a valid state to be moved by this function. 239 */ 240 zfsvfs = ozp->z_zfsvfs; 241 if (!POINTER_IS_VALID(zfsvfs)) { 242 ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid); 243 return (KMEM_CBRC_DONT_KNOW); 244 } 245 246 /* 247 * Ensure that the filesystem is not unmounted during the move. 248 */ 249 if (zfs_enter(zfsvfs) != 0) { /* ZFS_ENTER */ 250 ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted); 251 return (KMEM_CBRC_DONT_KNOW); 252 } 253 254 mutex_enter(&zfsvfs->z_znodes_lock); 255 /* 256 * Recheck the vfs pointer in case the znode was removed just before 257 * acquiring the lock. 258 */ 259 if (zfsvfs != ozp->z_zfsvfs) { 260 mutex_exit(&zfsvfs->z_znodes_lock); 261 ZFS_EXIT(zfsvfs); 262 ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck_invalid); 263 return (KMEM_CBRC_DONT_KNOW); 264 } 265 266 /* 267 * At this point we know that as long as we hold z_znodes_lock, the 268 * znode cannot be freed and fields within the znode can be safely 269 * accessed. Now, prevent a race with zfs_zget(). 270 */ 271 if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) { 272 mutex_exit(&zfsvfs->z_znodes_lock); 273 ZFS_EXIT(zfsvfs); 274 ZNODE_STAT_ADD(znode_move_stats.zms_obj_held); 275 return (KMEM_CBRC_LATER); 276 } 277 278 vp = ZTOV(ozp); 279 if (mutex_tryenter(&vp->v_lock) == 0) { 280 ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); 281 mutex_exit(&zfsvfs->z_znodes_lock); 282 ZFS_EXIT(zfsvfs); 283 ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked); 284 return (KMEM_CBRC_LATER); 285 } 286 287 /* Only move znodes that are referenced _only_ by the DNLC. 
*/ 288 if (vp->v_count != 1 || !vn_in_dnlc(vp)) { 289 mutex_exit(&vp->v_lock); 290 ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); 291 mutex_exit(&zfsvfs->z_znodes_lock); 292 ZFS_EXIT(zfsvfs); 293 ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc); 294 return (KMEM_CBRC_LATER); 295 } 296 297 /* 298 * The znode is known and in a valid state to move. We're holding the 299 * locks needed to execute the critical section. 300 */ 301 zfs_znode_move_impl(ozp, nzp); 302 mutex_exit(&vp->v_lock); 303 ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); 304 305 list_link_replace(&ozp->z_link_node, &nzp->z_link_node); 306 mutex_exit(&zfsvfs->z_znodes_lock); 307 ZFS_EXIT(zfsvfs); 308 309 return (KMEM_CBRC_YES); 310 } 311 #endif /* !__NetBSD__ */ 312 313 void 314 zfs_znode_init(void) 315 { 316 /* 317 * Initialize zcache 318 */ 319 ASSERT(znode_cache == NULL); 320 znode_cache = kmem_cache_create("zfs_znode_cache", 321 sizeof (znode_t), 0, zfs_znode_cache_constructor, 322 zfs_znode_cache_destructor, NULL, NULL, NULL, 0); 323 } 324 325 void 326 zfs_znode_fini(void) 327 { 328 329 /* 330 * Cleanup zcache 331 */ 332 if (znode_cache) 333 kmem_cache_destroy(znode_cache); 334 znode_cache = NULL; 335 } 336 337 #ifndef __NetBSD__ 338 struct vnodeops *zfs_dvnodeops; 339 struct vnodeops *zfs_fvnodeops; 340 struct vnodeops *zfs_symvnodeops; 341 struct vnodeops *zfs_xdvnodeops; 342 struct vnodeops *zfs_evnodeops; 343 #endif 344 void 345 zfs_remove_op_tables() 346 { 347 #ifndef __NetBSD__ 348 /* 349 * Remove vfs ops 350 */ 351 ASSERT(zfsfstype); 352 (void) vfs_freevfsops_by_type(zfsfstype); 353 zfsfstype = 0; 354 355 /* 356 * Remove vnode ops 357 */ 358 if (zfs_dvnodeops) 359 vn_freevnodeops(zfs_dvnodeops); 360 if (zfs_fvnodeops) 361 vn_freevnodeops(zfs_fvnodeops); 362 if (zfs_symvnodeops) 363 vn_freevnodeops(zfs_symvnodeops); 364 if (zfs_xdvnodeops) 365 vn_freevnodeops(zfs_xdvnodeops); 366 if (zfs_evnodeops) 367 vn_freevnodeops(zfs_evnodeops); 368 369 zfs_dvnodeops = NULL; 370 zfs_fvnodeops = NULL; 371 zfs_symvnodeops 
= NULL; 372 zfs_xdvnodeops = NULL; 373 zfs_evnodeops = NULL; 374 #endif 375 } 376 #ifndef __NetBSD__ 377 extern const fs_operation_def_t zfs_dvnodeops_template[]; 378 extern const fs_operation_def_t zfs_fvnodeops_template[]; 379 extern const fs_operation_def_t zfs_xdvnodeops_template[]; 380 extern const fs_operation_def_t zfs_symvnodeops_template[]; 381 extern const fs_operation_def_t zfs_evnodeops_template[]; 382 #endif 383 int 384 zfs_create_op_tables() 385 { 386 #ifndef __NetBSD__ 387 int error; 388 389 /* 390 * zfs_dvnodeops can be set if mod_remove() calls mod_installfs() 391 * due to a failure to remove the the 2nd modlinkage (zfs_modldrv). 392 * In this case we just return as the ops vectors are already set up. 393 */ 394 if (zfs_dvnodeops) 395 return (0); 396 397 error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template, 398 &zfs_dvnodeops); 399 if (error) 400 return (error); 401 402 error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template, 403 &zfs_fvnodeops); 404 if (error) 405 return (error); 406 407 error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template, 408 &zfs_symvnodeops); 409 if (error) 410 return (error); 411 412 error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template, 413 &zfs_xdvnodeops); 414 if (error) 415 return (error); 416 417 error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template, 418 &zfs_evnodeops); 419 420 return (error); 421 #endif 422 return 0; 423 } 424 425 /* 426 * zfs_init_fs - Initialize the zfsvfs struct and the file system 427 * incore "master" object. Verify version compatibility. 
428 */ 429 int 430 zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp) 431 { 432 extern int zfsfstype; 433 434 objset_t *os = zfsvfs->z_os; 435 int i, error; 436 uint64_t fsid_guid; 437 uint64_t zval; 438 439 *zpp = NULL; 440 441 error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); 442 if (error) { 443 return (error); 444 } else if (zfsvfs->z_version > ZPL_VERSION) { 445 (void) printf("Mismatched versions: File system " 446 "is version %llu on-disk format, which is " 447 "incompatible with this software version %lld!", 448 (u_longlong_t)zfsvfs->z_version, ZPL_VERSION); 449 return (ENOTSUP); 450 } 451 452 if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0) 453 return (error); 454 zfsvfs->z_norm = (int)zval; 455 if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0) 456 return (error); 457 zfsvfs->z_utf8 = (zval != 0); 458 if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0) 459 return (error); 460 zfsvfs->z_case = (uint_t)zval; 461 /* 462 * Fold case on file systems that are always or sometimes case 463 * insensitive. 464 */ 465 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 466 zfsvfs->z_case == ZFS_CASE_MIXED) 467 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; 468 469 /* 470 * The fsid is 64 bits, composed of an 8-bit fs type, which 471 * separates our fsid from any other filesystem types, and a 472 * 56-bit objset unique ID. The objset unique ID is unique to 473 * all objsets open on this system, provided by unique_create(). 474 * The 8-bit fs type must be put in the low bits of fsid[1] 475 * because that's where other Solaris filesystems put it. 
476 */ 477 fsid_guid = dmu_objset_fsid_guid(os); 478 ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); 479 zfsvfs->z_vfs->mnt_stat.f_fsidx.__fsid_val[0] = fsid_guid; 480 zfsvfs->z_vfs->mnt_stat.f_fsidx.__fsid_val[1] = ((fsid_guid>>32) << 8) | 481 zfsfstype & 0xFF; 482 zfsvfs->z_vfs->mnt_stat.f_fsid = fsid_guid; 483 484 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, 485 &zfsvfs->z_root); 486 if (error) 487 return (error); 488 ASSERT(zfsvfs->z_root != 0); 489 490 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, 491 &zfsvfs->z_unlinkedobj); 492 if (error) 493 return (error); 494 495 /* 496 * Initialize zget mutex's 497 */ 498 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 499 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); 500 501 error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp); 502 if (error) { 503 /* 504 * On error, we destroy the mutexes here since it's not 505 * possible for the caller to determine if the mutexes were 506 * initialized properly. 507 */ 508 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 509 mutex_destroy(&zfsvfs->z_hold_mtx[i]); 510 return (error); 511 } 512 ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root); 513 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, 514 &zfsvfs->z_fuid_obj); 515 if (error == ENOENT) 516 error = 0; 517 518 return (0); 519 } 520 521 /* 522 * define a couple of values we need available 523 * for both 64 and 32 bit environments. 524 */ 525 #ifndef NBITSMINOR64 526 #define NBITSMINOR64 32 527 #endif 528 #ifndef MAXMAJ64 529 #define MAXMAJ64 0xffffffffUL 530 #endif 531 #ifndef MAXMIN64 532 #define MAXMIN64 0xffffffffUL 533 #endif 534 535 /* 536 * Create special expldev for ZFS private use. 537 * Can't use standard expldev since it doesn't do 538 * what we want. The standard expldev() takes a 539 * dev32_t in LP64 and expands it to a long dev_t. 540 * We need an interface that takes a dev32_t in ILP32 541 * and expands it to a long dev_t. 
542 */ 543 static uint64_t 544 zfs_expldev(dev_t dev) 545 { 546 return ((uint64_t)major(dev) << NBITSMINOR64) | 547 (minor_t)minor(dev); 548 } 549 550 /* 551 * Special cmpldev for ZFS private use. 552 * Can't use standard cmpldev since it takes 553 * a long dev_t and compresses it to dev32_t in 554 * LP64. We need to do a compaction of a long dev_t 555 * to a dev32_t in ILP32. 556 */ 557 dev_t 558 zfs_cmpldev(uint64_t dev) 559 { 560 minor_t minor = (minor_t)dev & MAXMIN64; 561 major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64; 562 563 return makedev(minor, major); 564 } 565 566 static void 567 zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db) 568 { 569 znode_t *nzp; 570 571 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs)); 572 ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id))); 573 574 mutex_enter(&zp->z_lock); 575 576 ASSERT(zp->z_dbuf == NULL); 577 zp->z_dbuf = db; 578 nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error); 579 580 /* 581 * there should be no 582 * concurrent zgets on this object. 583 */ 584 if (nzp != NULL) 585 panic("existing znode %p for dbuf %p", (void *)nzp, (void *)db); 586 587 /* 588 * Slap on VROOT if we are the root znode 589 */ 590 if (zp->z_id == zfsvfs->z_root) 591 ZTOV(zp)->v_flag |= VROOT; 592 593 mutex_exit(&zp->z_lock); 594 } 595 596 void 597 zfs_znode_dmu_fini(znode_t *zp) 598 { 599 dmu_buf_t *db = zp->z_dbuf; 600 ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) || 601 zp->z_unlinked || 602 RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock)); 603 ASSERT(zp->z_dbuf != NULL); 604 zp->z_dbuf = NULL; 605 VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL)); 606 dmu_buf_rele(db, NULL); 607 } 608 609 /* 610 * Construct a new znode/vnode and intialize. 
611 * 612 * This does not do a call to dmu_set_user() that is 613 * up to the caller to do, in case you don't want to 614 * return the znode 615 */ 616 617 static znode_t * 618 zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz) 619 { 620 znode_t *zp; 621 vnode_t *vp; 622 int error; 623 624 zp = kmem_cache_alloc(znode_cache, KM_SLEEP); 625 for (;;) { 626 627 error = getnewvnode(VT_ZFS, zfsvfs->z_parent->z_vfs, 628 zfs_vnodeop_p, &zp->z_vnode); 629 if (__predict_true(error == 0)) 630 break; 631 printf("WARNING: zfs_znode_alloc: unable to get vnode, " 632 "error=%d\n", error); 633 (void)kpause("zfsnewvn", false, hz, NULL); 634 } 635 636 ASSERT(zp->z_dirlocks == NULL); 637 ASSERT(zp->z_dbuf == NULL); 638 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); 639 640 /* 641 * Defer setting z_zfsvfs until the znode is ready to be a candidate for 642 * the zfs_znode_move() callback. 643 */ 644 zp->z_phys = NULL; 645 zp->z_unlinked = 0; 646 zp->z_atime_dirty = 0; 647 zp->z_mapcnt = 0; 648 zp->z_last_itx = 0; 649 zp->z_id = db->db_object; 650 zp->z_blksz = blksz; 651 zp->z_seq = 0x7A4653; 652 zp->z_sync_cnt = 0; 653 654 vp = ZTOV(zp); 655 656 zfs_znode_dmu_init(zfsvfs, zp, db); 657 658 zp->z_gen = zp->z_phys->zp_gen; 659 660 vp->v_vfsp = zfsvfs->z_parent->z_vfs; 661 vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode); 662 vp->v_data = zp; 663 switch (vp->v_type) { 664 case VDIR: 665 zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ 666 break; 667 case VBLK: 668 case VCHR: 669 /* XXX NetBSD vp->v_op = zfs_specop_p; */ 670 spec_node_init(vp, zfs_cmpldev(zp->z_phys->zp_rdev)); 671 break; 672 case VFIFO: 673 /* XXX NetBSD vp->v_op = zfs_fifoop_p; */ 674 break; 675 } 676 677 dprintf("zfs_znode_alloc znode %p -- vnode %p\n", zp, vp); 678 dprintf("zfs_znode_alloc z_id %ld\n", zp->z_id); 679 //cpu_Debugger(); 680 681 uvm_vnp_setsize(vp, zp->z_phys->zp_size); 682 683 mutex_enter(&zfsvfs->z_znodes_lock); 684 list_insert_tail(&zfsvfs->z_all_znodes, zp); 685 membar_producer(); 
686 /* 687 * Everything else must be valid before assigning z_zfsvfs makes the 688 * znode eligible for zfs_znode_move(). 689 */ 690 zp->z_zfsvfs = zfsvfs; 691 mutex_exit(&zfsvfs->z_znodes_lock); 692 693 return (zp); 694 } 695 696 /* 697 * Create a new DMU object to hold a zfs znode. 698 * 699 * IN: dzp - parent directory for new znode 700 * vap - file attributes for new znode 701 * tx - dmu transaction id for zap operations 702 * cr - credentials of caller 703 * flag - flags: 704 * IS_ROOT_NODE - new object will be root 705 * IS_XATTR - new object is an attribute 706 * IS_REPLAY - intent log replay 707 * bonuslen - length of bonus buffer 708 * setaclp - File/Dir initial ACL 709 * fuidp - Tracks fuid allocation. 710 * 711 * OUT: zpp - allocated znode 712 * 713 */ 714 void 715 zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, 716 uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_t *setaclp, 717 zfs_fuid_info_t **fuidp) 718 { 719 dmu_buf_t *db; 720 znode_phys_t *pzp; 721 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 722 timestruc_t now; 723 uint64_t gen, obj; 724 int err; 725 726 ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); 727 728 if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */ 729 obj = vap->va_nodeid; 730 flag |= IS_REPLAY; 731 now = vap->va_ctime; /* see zfs_replay_create() */ 732 gen = vap->va_nblocks; /* ditto */ 733 } else { 734 obj = 0; 735 gethrestime(&now); 736 gen = dmu_tx_get_txg(tx); 737 } 738 739 /* 740 * Create a new DMU object. 741 */ 742 /* 743 * There's currently no mechanism for pre-reading the blocks that will 744 * be to needed allocate a new object, so we accept the small chance 745 * that there will be an i/o error and we will fail one of the 746 * assertions below. 
747 */ 748 if (vap->va_type == VDIR) { 749 if (flag & IS_REPLAY) { 750 err = zap_create_claim_norm(zfsvfs->z_os, obj, 751 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, 752 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 753 ASSERT3U(err, ==, 0); 754 } else { 755 obj = zap_create_norm(zfsvfs->z_os, 756 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, 757 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 758 } 759 } else { 760 if (flag & IS_REPLAY) { 761 err = dmu_object_claim(zfsvfs->z_os, obj, 762 DMU_OT_PLAIN_FILE_CONTENTS, 0, 763 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 764 ASSERT3U(err, ==, 0); 765 } else { 766 obj = dmu_object_alloc(zfsvfs->z_os, 767 DMU_OT_PLAIN_FILE_CONTENTS, 0, 768 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 769 } 770 } 771 VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db)); 772 dmu_buf_will_dirty(db, tx); 773 774 /* 775 * Initialize the znode physical data to zero. 776 */ 777 ASSERT(db->db_size >= sizeof (znode_phys_t)); 778 bzero(db->db_data, db->db_size); 779 pzp = db->db_data; 780 781 /* 782 * If this is the root, fix up the half-initialized parent pointer 783 * to reference the just-allocated physical data area. 784 */ 785 if (flag & IS_ROOT_NODE) { 786 dzp->z_dbuf = db; 787 dzp->z_phys = pzp; 788 dzp->z_id = obj; 789 } 790 791 /* 792 * If parent is an xattr, so am I. 793 */ 794 if (dzp->z_phys->zp_flags & ZFS_XATTR) 795 flag |= IS_XATTR; 796 797 if (vap->va_type == VBLK || vap->va_type == VCHR) { 798 pzp->zp_rdev = zfs_expldev(vap->va_rdev); 799 } 800 801 if (zfsvfs->z_use_fuids) 802 pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; 803 804 if (vap->va_type == VDIR) { 805 pzp->zp_size = 2; /* contents ("." and "..") */ 806 pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 
2 : 1; 807 } 808 809 pzp->zp_parent = dzp->z_id; 810 if (flag & IS_XATTR) 811 pzp->zp_flags |= ZFS_XATTR; 812 813 pzp->zp_gen = gen; 814 815 ZFS_TIME_ENCODE(&now, pzp->zp_crtime); 816 ZFS_TIME_ENCODE(&now, pzp->zp_ctime); 817 818 if (vap->va_mask & AT_ATIME) { 819 ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime); 820 } else { 821 ZFS_TIME_ENCODE(&now, pzp->zp_atime); 822 } 823 824 if (vap->va_mask & AT_MTIME) { 825 ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); 826 } else { 827 ZFS_TIME_ENCODE(&now, pzp->zp_mtime); 828 } 829 830 pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode); 831 if (!(flag & IS_ROOT_NODE)) { 832 dprintf("zfs_mknode parent vp %p - zp %p\n", ZTOV(dzp), dzp); 833 dprintf("Going to lock %p with %ld\n", ZFS_OBJ_MUTEX(zfsvfs, obj), obj); 834 835 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); 836 *zpp = zfs_znode_alloc(zfsvfs, db, 0); 837 838 genfs_node_init(ZTOV(*zpp), &zfs_genfsops); 839 840 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); 841 } else { 842 /* 843 * If we are creating the root node, the "parent" we 844 * passed in is the znode for the root. 
845 */ 846 *zpp = dzp; 847 } 848 zfs_perm_init(*zpp, dzp, flag, vap, tx, cr, setaclp, fuidp); 849 } 850 851 void 852 zfs_xvattr_set(znode_t *zp, xvattr_t *xvap) 853 { 854 xoptattr_t *xoap; 855 856 xoap = xva_getxoptattr(xvap); 857 ASSERT(xoap); 858 859 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { 860 ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime); 861 XVA_SET_RTN(xvap, XAT_CREATETIME); 862 } 863 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { 864 ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly); 865 XVA_SET_RTN(xvap, XAT_READONLY); 866 } 867 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { 868 ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden); 869 XVA_SET_RTN(xvap, XAT_HIDDEN); 870 } 871 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { 872 ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system); 873 XVA_SET_RTN(xvap, XAT_SYSTEM); 874 } 875 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { 876 ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive); 877 XVA_SET_RTN(xvap, XAT_ARCHIVE); 878 } 879 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 880 ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable); 881 XVA_SET_RTN(xvap, XAT_IMMUTABLE); 882 } 883 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 884 ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink); 885 XVA_SET_RTN(xvap, XAT_NOUNLINK); 886 } 887 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 888 ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly); 889 XVA_SET_RTN(xvap, XAT_APPENDONLY); 890 } 891 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 892 ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump); 893 XVA_SET_RTN(xvap, XAT_NODUMP); 894 } 895 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { 896 ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque); 897 XVA_SET_RTN(xvap, XAT_OPAQUE); 898 } 899 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 900 ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, 901 xoap->xoa_av_quarantined); 902 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); 903 } 904 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 905 ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified); 906 XVA_SET_RTN(xvap, 
XAT_AV_MODIFIED); 907 } 908 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { 909 (void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp, 910 sizeof (xoap->xoa_av_scanstamp)); 911 zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP; 912 XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); 913 } 914 } 915 916 int 917 zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) 918 { 919 dmu_object_info_t doi; 920 dmu_buf_t *db; 921 znode_t *zp; 922 vnode_t *vp; 923 int err, first = 1; 924 925 *zpp = NULL; 926 again: 927 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); 928 929 err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db); 930 if (err) { 931 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 932 return (err); 933 } 934 935 dmu_object_info_from_db(db, &doi); 936 if (doi.doi_bonus_type != DMU_OT_ZNODE || 937 doi.doi_bonus_size < sizeof (znode_phys_t)) { 938 dmu_buf_rele(db, NULL); 939 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 940 return (EINVAL); 941 } 942 943 zp = dmu_buf_get_user(db); 944 if (zp != NULL) { 945 mutex_enter(&zp->z_lock); 946 947 /* 948 * Since we do immediate eviction of the z_dbuf, we 949 * should never find a dbuf with a znode that doesn't 950 * know about the dbuf. 951 */ 952 ASSERT3P(zp->z_dbuf, ==, db); 953 ASSERT3U(zp->z_id, ==, obj_num); 954 if (zp->z_unlinked) { 955 err = ENOENT; 956 } else { 957 if ((vp = ZTOV(zp)) != NULL) { 958 mutex_enter(&vp->v_interlock); 959 mutex_exit(&zp->z_lock); 960 if (vget(vp, LK_INTERLOCK) != 0) { 961 dmu_buf_rele(db, NULL); 962 mutex_exit(&vp->v_interlock); 963 goto again; 964 } 965 mutex_enter(&zp->z_lock); 966 } else { 967 if (first) { 968 ZFS_LOG(1, "dying znode detected (zp=%p)", zp); 969 first = 0; 970 } 971 /* 972 * znode is dying so we can't reuse it, we must 973 * wait until destruction is completed. 
974 */ 975 dmu_buf_rele(db, NULL); 976 mutex_exit(&zp->z_lock); 977 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 978 kpause("zcollide", 0, 1, NULL); 979 goto again; 980 } 981 *zpp = zp; 982 err = 0; 983 } 984 985 dmu_buf_rele(db, NULL); 986 mutex_exit(&zp->z_lock); 987 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 988 return (err); 989 } 990 991 /* 992 * Not found create new znode/vnode 993 */ 994 zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size); 995 vp = ZTOV(zp); 996 997 genfs_node_init(vp, &zfs_genfsops); 998 999 VOP_UNLOCK(vp, 0); 1000 1001 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 1002 *zpp = zp; 1003 return (0); 1004 } 1005 1006 int 1007 zfs_rezget(znode_t *zp) 1008 { 1009 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1010 dmu_object_info_t doi; 1011 dmu_buf_t *db; 1012 uint64_t obj_num = zp->z_id; 1013 int err; 1014 1015 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); 1016 1017 err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db); 1018 if (err) { 1019 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 1020 return (err); 1021 } 1022 1023 dmu_object_info_from_db(db, &doi); 1024 if (doi.doi_bonus_type != DMU_OT_ZNODE || 1025 doi.doi_bonus_size < sizeof (znode_phys_t)) { 1026 dmu_buf_rele(db, NULL); 1027 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 1028 return (EINVAL); 1029 } 1030 1031 if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) { 1032 dmu_buf_rele(db, NULL); 1033 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 1034 return (EIO); 1035 } 1036 1037 zfs_znode_dmu_init(zfsvfs, zp, db); 1038 zp->z_unlinked = (zp->z_phys->zp_links == 0); 1039 zp->z_blksz = doi.doi_data_block_size; 1040 1041 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 1042 1043 return (0); 1044 } 1045 1046 void 1047 zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) 1048 { 1049 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1050 objset_t *os = zfsvfs->z_os; 1051 uint64_t obj = zp->z_id; 1052 uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj; 1053 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); 1054 if (acl_obj) 1055 VERIFY(0 == dmu_object_free(os, acl_obj, tx)); 1056 VERIFY(0 == dmu_object_free(os, 
	    obj, tx));
	zfs_znode_dmu_fini(zp);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
	zfs_znode_free(zp);
}

/*
 * zfs_zinactive must be called with ZFS_OBJ_HOLD_ENTER held. And this lock
 * will be released in zfs_zinactive.
 *
 * Tear down the in-core state of a znode whose vnode is going inactive.
 * If the file was unlinked while still referenced, pass the znode to
 * zfs_rmnode() so its on-disk state is reclaimed; otherwise just free
 * the in-core znode.
 */
void
zfs_zinactive(znode_t *zp)
{
	vnode_t	*vp = ZTOV(zp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	/* The znode must still be attached to its dbuf/phys at this point. */
	ASSERT(zp->z_dbuf && zp->z_phys);

	mutex_enter(&zp->z_lock);
	/*
	 * If this was the last reference to a file with no links,
	 * remove the file from the file system.
	 */
	if (zp->z_unlinked) {
		/*
		 * Drop z_lock and the object hold before zfs_rmnode();
		 * it takes over destruction of the znode.
		 */
		mutex_exit(&zp->z_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
		zfs_rmnode(zp);
		return;
	}

	mutex_exit(&zp->z_lock);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
	zfs_znode_free(zp);
}

/*
 * Release an in-core znode: unhook it from the per-filesystem list of
 * all znodes and return it to the znode kmem cache.  The caller must
 * already have detached the vnode (ZTOV(zp) == NULL).
 */
void
zfs_znode_free(znode_t *zp)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	ASSERT(ZTOV(zp) == NULL);

	dprintf("destroying znode %p\n", zp);
	mutex_enter(&zfsvfs->z_znodes_lock);
	/* Poison z_zfsvfs so any stale use of this znode is detectable. */
	POINTER_INVALIDATE(&zp->z_zfsvfs);
	list_remove(&zfsvfs->z_all_znodes, zp);
	mutex_exit(&zfsvfs->z_znodes_lock);

	kmem_cache_free(znode_cache, zp);

	/* Release this znode's hold on the vfs. */
	VFS_RELE(zfsvfs->z_vfs);
}

/*
 * Encode the current time into the requested timestamp fields
 * (AT_ATIME/AT_MTIME/AT_CTIME) of the znode's physical data.
 * Caller must hold zp->z_lock.  When a transaction is supplied the
 * dbuf is dirtied so the update reaches disk; otherwise only the
 * in-core copy changes and z_atime_dirty records the pending update.
 */
void
zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
{
	timestruc_t	now;

	ASSERT(MUTEX_HELD(&zp->z_lock));

	gethrestime(&now);

	if (tx) {
		dmu_buf_will_dirty(zp->z_dbuf, tx);
		zp->z_atime_dirty = 0;
		zp->z_seq++;
	} else {
		zp->z_atime_dirty = 1;
	}

	if (flag & AT_ATIME)
		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);

	if (flag & AT_MTIME) {
		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);
		/* FUID-capable filesystems also track archive/AV flags. */
		if (zp->z_zfsvfs->z_use_fuids)
			zp->z_phys->zp_flags |= (ZFS_ARCHIVE |
			    ZFS_AV_MODIFIED);
	}

	if (flag & AT_CTIME) {
		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
		if (zp->z_zfsvfs->z_use_fuids)
			zp->z_phys->zp_flags |= ZFS_ARCHIVE;
	}
}

/*
 * Update the requested znode timestamps with the current time.
 * If we are in a transaction, then go ahead and mark the znode
 * dirty in the transaction so the timestamps will go to disk.
 * Otherwise, we will get pushed next time the znode is updated
 * in a transaction, or when this znode eventually goes inactive.
 *
 * Why is this OK?
 *  1 - Only the ACCESS time is ever updated outside of a transaction.
 *  2 - Multiple consecutive updates will be collapsed into a single
 *	znode update by the transaction grouping semantics of the DMU.
 */
void
zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
{
	mutex_enter(&zp->z_lock);
	zfs_time_stamper_locked(zp, flag, tx);
	mutex_exit(&zp->z_lock);
}

/*
 * Grow the block size for a file.
 *
 *	IN:	zp	- znode of file to free data in.
 *		size	- requested block size
 *		tx	- open transaction.
 *
 * NOTE: this function assumes that the znode is write locked.
 */
void
zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
{
	int		error;
	u_longlong_t	dummy;

	if (size <= zp->z_blksz)
		return;
	/*
	 * If the file size is already greater than the current blocksize,
	 * we will not grow.  If there is more than one block in a file,
	 * the blocksize cannot change.
	 */
	if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
		return;

	error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
	    size, 0, tx);
	/* ENOTSUP means the blocksize can no longer change; not an error. */
	if (error == ENOTSUP)
		return;
	ASSERT3U(error, ==, 0);

	/* What blocksize did we actually get? */
	dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);
}

/*
 * Increase the file length
 *
 *	IN:	zp	- znode of file to free data in.
 *		end	- new end-of-file
 *
 *	RETURN:	0 if success
 *		error code if failure
 */
static int
zfs_extend(znode_t *zp, uint64_t end)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	dmu_tx_t *tx;
	rl_t *rl;
	uint64_t newblksz;
	int error;

	/*
	 * We will change zp_size, lock the whole file.
	 */
	rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (end <= zp->z_phys->zp_size) {
		zfs_range_unlock(rl);
		return (0);
	}
top:
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, zp->z_id);
	if (end > zp->z_blksz &&
	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
		/*
		 * We are growing the file past the current block size.
		 */
		if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
			/* Blocksize already past the max; must be non-pow2. */
			ASSERT(!ISP2(zp->z_blksz));
			newblksz = MIN(end, SPA_MAXBLOCKSIZE);
		} else {
			newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
		}
		dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
	} else {
		newblksz = 0;
	}

	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		/* On ERESTART in NOWAIT mode, wait out the txg and retry. */
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		zfs_range_unlock(rl);
		return (error);
	}
	dmu_buf_will_dirty(zp->z_dbuf, tx);

	if (newblksz)
		zfs_grow_blocksize(zp, newblksz, tx);

	zp->z_phys->zp_size = end;

	zfs_range_unlock(rl);

	dmu_tx_commit(tx);

	/* Tell UVM about the new file size (outside the transaction). */
	rw_enter(&zp->z_map_lock, RW_WRITER);
	uvm_vnp_setsize(ZTOV(zp), end);
	rw_exit(&zp->z_map_lock);

	return (0);
}

/*
 * Free space in a file.
 *
 *	IN:	zp	- znode of file to free data in.
 *		off	- start of section to free.
 *		len	- length of section to free.
 *
 *	RETURN:	0 if success
 *		error code if failure
 */
static int
zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	rl_t *rl;
	int error;

	/*
	 * Lock the range being freed.
	 */
	rl = zfs_range_lock(zp, off, len, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (off >= zp->z_phys->zp_size) {
		zfs_range_unlock(rl);
		return (0);
	}

	/* Clamp the range so it does not extend past end-of-file. */
	if (off + len > zp->z_phys->zp_size)
		len = zp->z_phys->zp_size - off;

	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);

	if (error == 0) {
		/*
		 * In NetBSD we cannot free block in the middle of a file,
		 * but only at the end of a file.
		 */
		rw_enter(&zp->z_map_lock, RW_WRITER);
		uvm_vnp_setsize(ZTOV(zp), off);
		rw_exit(&zp->z_map_lock);
	}

	zfs_range_unlock(rl);

	return (error);
}

/*
 * Truncate a file
 *
 *	IN:	zp	- znode of file to free data in.
 *		end	- new end-of-file.
 *
 *	RETURN:	0 if success
 *		error code if failure
 */
static int
zfs_trunc(znode_t *zp, uint64_t end)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	vnode_t *vp = ZTOV(zp);
	dmu_tx_t *tx;
	rl_t *rl;
	int error;

	/*
	 * We will change zp_size, lock the whole file.
	 */
	rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (end >= zp->z_phys->zp_size) {
		zfs_range_unlock(rl);
		return (0);
	}

	/* Free everything from the new EOF to the end of the object. */
	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1);
	if (error) {
		zfs_range_unlock(rl);
		return (error);
	}
top:
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, zp->z_id);
	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		/* On ERESTART in NOWAIT mode, wait out the txg and retry. */
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		zfs_range_unlock(rl);
		return (error);
	}
	dmu_buf_will_dirty(zp->z_dbuf, tx);

	zp->z_phys->zp_size = end;

	dmu_tx_commit(tx);

	zfs_range_unlock(rl);

	/*
	 * Clear any mapped pages in the truncated region.  This has to
	 * happen outside of the transaction to avoid the possibility of
	 * a deadlock with someone trying to push a page that we are
	 * about to invalidate.
	 */
	rw_enter(&zp->z_map_lock, RW_WRITER);
	uvm_vnp_setsize(vp, end);
	rw_exit(&zp->z_map_lock);

	return (0);
}

/*
 * Free space in a file
 *
 *	IN:	zp	- znode of file to free data in.
 *		off	- start of range
 *		len	- end of range (0 => EOF)
 *		flag	- current file open mode flags.
 *		log	- TRUE if this action should be logged
 *
 *	RETURN:	0 if success
 *		error code if failure
 */
int
zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
{
	vnode_t *vp = ZTOV(zp);
	dmu_tx_t *tx;
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	zilog_t *zilog = zfsvfs->z_log;
	int error;

	/* Writing past EOF: extend the file instead of freeing. */
	if (off > zp->z_phys->zp_size) {
		error = zfs_extend(zp, off+len);
		if (error == 0 && log)
			goto log;
		else
			return (error);
	}

	if (len == 0) {
		/* len == 0 means truncate to offset 'off'. */
		error = zfs_trunc(zp, off);
	} else {
		/* Free the range; extend if it reached past current EOF. */
		if ((error = zfs_free_range(zp, off, len)) == 0 &&
		    off + len > zp->z_phys->zp_size)
			error = zfs_extend(zp, off+len);
	}
	if (error || !log)
		return (error);
log:
	/* Record the truncate in the ZIL and update ctime/mtime. */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, zp->z_id);
	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto log;
		}
		dmu_tx_abort(tx);
		return (error);
	}

	zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
	zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);

	dmu_tx_commit(tx);
	return (0);
}

/*
 * Populate a brand-new objset with the ZPL metadata: master node,
 * version and zpl properties, unlinked (delete) set, and root directory
 * znode.  A minimal throw-away zfsvfs is built on the stack so that
 * zfs_mknode() can operate before the filesystem is mounted.
 */
void
zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
{
	zfsvfs_t	zfsvfs;
	uint64_t	moid, doid, version;
	uint64_t	sense = ZFS_CASE_SENSITIVE;
	uint64_t	norm = 0;
	nvpair_t	*elem;
	int		error;
	znode_t		*rootzp = NULL;
	vnode_t		*vp;
	vattr_t		vattr;
	znode_t		*zp;

	/*
	 * First attempt to create master node.
	 */
	/*
	 * In an empty objset, there are no blocks to read and thus
	 * there can be no i/o errors (which we assert below).
	 */
	moid = MASTER_NODE_OBJ;
	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	/*
	 * Set starting attributes.
	 */
	if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
		version = ZPL_VERSION;
	else
		version = ZPL_VERSION_FUID - 1;
	error = zap_update(os, moid, ZPL_VERSION_STR,
	    8, 1, &version, tx);
	elem = NULL;
	while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
		/* For the moment we expect all zpl props to be uint64_ts */
		uint64_t val;
		char *name;

		ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
		VERIFY(nvpair_value_uint64(elem, &val) == 0);
		name = nvpair_name(elem);
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
			/* An explicit version property overrides the default. */
			version = val;
			error = zap_update(os, moid, ZPL_VERSION_STR,
			    8, 1, &version, tx);
		} else {
			error = zap_update(os, moid, name, 8, 1, &val, tx);
		}
		ASSERT(error == 0);
		/* Remember normalization/case props for the stack zfsvfs. */
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
			norm = val;
		else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
			sense = val;
	}
	ASSERT(version != 0);

	/*
	 * Create a delete queue.
	 */
	doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);

	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx);
	ASSERT(error == 0);

	/*
	 * Create root znode.  Create minimal znode/vnode/zfsvfs
	 * to allow zfs_mknode to work.
	 */
	vattr_null(&vattr);
	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
	vattr.va_type = VDIR;
	vattr.va_mode = S_IFDIR|0755;
	vattr.va_uid = crgetuid(cr);
	vattr.va_gid = crgetgid(cr);

	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	rootzp->z_unlinked = 0;
	rootzp->z_atime_dirty = 0;

	/* Retry until a vnode is available; getnewvnode can fail transiently. */
	for (;;) {
		error = getnewvnode(VT_ZFS, NULL, zfs_vnodeop_p,
		    &rootzp->z_vnode);
		if (error == 0)
			break;
		printf("WARNING: zfs_create_fs: unable to get vnode, "
		    "error=%d\n", error);
		kpause("zfsvn", false, hz, NULL);
	}

	vp = ZTOV(rootzp);
	vp->v_type = VDIR;

	bzero(&zfsvfs, sizeof (zfsvfs_t));

	zfsvfs.z_os = os;
	zfsvfs.z_assign = TXG_NOWAIT;
	zfsvfs.z_parent = &zfsvfs;
	zfsvfs.z_version = version;
	zfsvfs.z_use_fuids = USE_FUIDS(version, os);
	zfsvfs.z_norm = norm;
	/*
	 * Fold case on file systems that are always or sometimes case
	 * insensitive.
	 */
	if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
		zfsvfs.z_norm |= U8_TEXTPREP_TOUPPER;

	mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
	    offsetof(znode_t, z_link_node));

	ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
	rootzp->z_zfsvfs = &zfsvfs;
	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, NULL, NULL);
	ASSERT3P(zp, ==, rootzp);
	/* Record the root object number in the master node. */
	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
	ASSERT(error == 0);
	POINTER_INVALIDATE(&rootzp->z_zfsvfs);

	/* Tear down the temporary znode/vnode used only for creation. */
	dmu_buf_rele(rootzp->z_dbuf, NULL);
	rootzp->z_dbuf = NULL;
	ungetnewvnode(vp);
	kmem_cache_free(znode_cache, rootzp);
}

#endif /* _KERNEL */
/*
 * Given an object number, return its parent object number and whether
 * or not the object is an extended attribute directory.
1590 */ 1591 static int 1592 zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir) 1593 { 1594 dmu_buf_t *db; 1595 dmu_object_info_t doi; 1596 znode_phys_t *zp; 1597 int error; 1598 1599 if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0) 1600 return (error); 1601 1602 dmu_object_info_from_db(db, &doi); 1603 if (doi.doi_bonus_type != DMU_OT_ZNODE || 1604 doi.doi_bonus_size < sizeof (znode_phys_t)) { 1605 dmu_buf_rele(db, FTAG); 1606 return (EINVAL); 1607 } 1608 1609 zp = db->db_data; 1610 *pobjp = zp->zp_parent; 1611 *is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) && 1612 S_ISDIR(zp->zp_mode); 1613 dmu_buf_rele(db, FTAG); 1614 1615 return (0); 1616 } 1617 1618 int 1619 zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) 1620 { 1621 char *path = buf + len - 1; 1622 int error; 1623 1624 *path = '\0'; 1625 1626 for (;;) { 1627 uint64_t pobj; 1628 char component[MAXNAMELEN + 2]; 1629 size_t complen; 1630 int is_xattrdir; 1631 1632 if ((error = zfs_obj_to_pobj(osp, obj, &pobj, 1633 &is_xattrdir)) != 0) 1634 break; 1635 1636 if (pobj == obj) { 1637 if (path[0] != '/') 1638 *--path = '/'; 1639 break; 1640 } 1641 1642 component[0] = '/'; 1643 if (is_xattrdir) { 1644 (void) sprintf(component + 1, "<xattrdir>"); 1645 } else { 1646 error = zap_value_search(osp, pobj, obj, 1647 ZFS_DIRENT_OBJ(-1ULL), component + 1); 1648 if (error != 0) 1649 break; 1650 } 1651 1652 complen = strlen(component); 1653 path -= complen; 1654 ASSERT(path >= buf); 1655 bcopy(component, path, complen); 1656 obj = pobj; 1657 } 1658 1659 if (error == 0) 1660 (void) memmove(buf, path, buf + len - path); 1661 return (error); 1662 } 1663