1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Portions Copyright 2007 Jeremy Teo */ 27 28 #ifdef _KERNEL 29 #include <sys/types.h> 30 #include <sys/param.h> 31 #include <sys/time.h> 32 #include <sys/systm.h> 33 #include <sys/sysmacros.h> 34 #include <sys/resource.h> 35 #include <sys/mntent.h> 36 #include <sys/u8_textprep.h> 37 #include <sys/dsl_dataset.h> 38 #include <sys/vfs.h> 39 #include <sys/vnode.h> 40 #include <sys/file.h> 41 #include <sys/kmem.h> 42 #include <sys/errno.h> 43 #include <sys/unistd.h> 44 #include <sys/atomic.h> 45 #include <sys/zfs_dir.h> 46 #include <sys/zfs_acl.h> 47 #include <sys/zfs_ioctl.h> 48 #include <sys/zfs_rlock.h> 49 #include <sys/zfs_fuid.h> 50 #include <sys/fs/zfs.h> 51 #include <sys/kidmap.h> 52 #endif /* _KERNEL */ 53 54 #include <sys/dmu.h> 55 #include <sys/refcount.h> 56 #include <sys/stat.h> 57 #include <sys/zap.h> 58 #include <sys/zfs_znode.h> 59 60 #include "zfs_prop.h" 61 62 #if defined(_KERNEL) && defined(__NetBSD__) 63 #include <miscfs/specfs/specdev.h> 64 static const struct genfs_ops zfs_genfsops = { 65 .gop_write = genfs_compat_gop_write, 66 }; 67 68 #endif 69 70 extern int (**zfs_vnodeop_p)(void *); 71 extern int (**zfs_fifoop_p)(void *); 72 extern int (**zfs_specop_p)(void *); 73 74 /* 75 * Define ZNODE_STATS to turn on statistic gathering. By default, it is only 76 * turned on when DEBUG is also defined. 77 */ 78 #ifdef DEBUG 79 #define ZNODE_STATS 80 #endif /* DEBUG */ 81 82 #ifdef ZNODE_STATS 83 #define ZNODE_STAT_ADD(stat) ((stat)++) 84 #else 85 #define ZNODE_STAT_ADD(stat) /* nothing */ 86 #endif /* ZNODE_STATS */ 87 88 #define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3)) 89 #define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1)) 90 91 /* 92 * Functions needed for userland (ie: libzpool) are not put under 93 * #ifdef_KERNEL; the rest of the functions have dependencies 94 * (such as VFS logic) that will not compile easily in userland. 95 */ 96 #ifdef _KERNEL 97 /* 98 * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to 99 * be freed before it can be safely accessed. 100 */ 101 krwlock_t zfsvfs_lock; 102 103 static kmem_cache_t *znode_cache = NULL; 104 105 /*ARGSUSED*/ 106 static void 107 znode_evict_error(dmu_buf_t *dbuf, void *user_ptr) 108 { 109 /* 110 * We should never drop all dbuf refs without first clearing 111 * the eviction callback. 112 */ 113 panic("evicting znode %p\n", user_ptr); 114 } 115 116 /*ARGSUSED*/ 117 static int 118 zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) 119 { 120 znode_t *zp = arg; 121 122 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); 123 124 list_link_init(&zp->z_link_node); 125 126 mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); 127 rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); 128 rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL); 129 mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); 130 131 mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL); 132 avl_create(&zp->z_range_avl, zfs_range_compare, 133 sizeof (rl_t), offsetof(rl_t, r_node)); 134 135 zp->z_dbuf = NULL; 136 zp->z_dirlocks = NULL; 137 zp->z_acl_cached = NULL; 138 return (0); 139 } 140 141 /*ARGSUSED*/ 142 static void 143 zfs_znode_cache_destructor(void *buf, void *arg) 144 { 145 znode_t *zp = arg; 146 147 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); 148 ASSERT(ZTOV(zp) == NULL); 149 150 ASSERT(!list_link_active(&zp->z_link_node)); 151 mutex_destroy(&zp->z_lock); 152 rw_destroy(&zp->z_parent_lock); 153 rw_destroy(&zp->z_name_lock); 154 mutex_destroy(&zp->z_acl_lock); 155 avl_destroy(&zp->z_range_avl); 156 mutex_destroy(&zp->z_range_lock); 157 158 ASSERT(zp->z_dbuf == NULL); 159 ASSERT(zp->z_dirlocks == NULL); 160 ASSERT(zp->z_acl_cached == NULL); 161 } 162 163 #ifdef ZNODE_STATS 164 static struct { 165 uint64_t zms_zfsvfs_invalid; 166 uint64_t zms_zfsvfs_recheck1; 167 uint64_t zms_zfsvfs_unmounted; 168 uint64_t zms_zfsvfs_recheck2; 169 uint64_t zms_obj_held; 170 uint64_t zms_vnode_locked; 171 uint64_t zms_not_only_dnlc; 172 } znode_move_stats; 173 #endif /* ZNODE_STATS */ 174 175 static void 176 zfs_znode_move_impl(znode_t *ozp, znode_t *nzp) 177 { 178 vnode_t *vp; 179 180 /* Copy fields. */ 181 nzp->z_zfsvfs = ozp->z_zfsvfs; 182 183 /* Swap vnodes. */ 184 vp = nzp->z_vnode; 185 nzp->z_vnode = ozp->z_vnode; 186 ozp->z_vnode = vp; /* let destructor free the overwritten vnode */ 187 ZTOV(ozp)->v_data = ozp; 188 ZTOV(nzp)->v_data = nzp; 189 190 nzp->z_id = ozp->z_id; 191 ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */ 192 ASSERT(avl_numnodes(&ozp->z_range_avl) == 0); 193 nzp->z_unlinked = ozp->z_unlinked; 194 nzp->z_atime_dirty = ozp->z_atime_dirty; 195 nzp->z_zn_prefetch = ozp->z_zn_prefetch; 196 nzp->z_blksz = ozp->z_blksz; 197 nzp->z_seq = ozp->z_seq; 198 nzp->z_mapcnt = ozp->z_mapcnt; 199 nzp->z_last_itx = ozp->z_last_itx; 200 nzp->z_gen = ozp->z_gen; 201 nzp->z_sync_cnt = ozp->z_sync_cnt; 202 nzp->z_phys = ozp->z_phys; 203 nzp->z_dbuf = ozp->z_dbuf; 204 205 /* 206 * Since this is just an idle znode and kmem is already dealing with 207 * memory pressure, release any cached ACL. 208 */ 209 if (ozp->z_acl_cached) { 210 zfs_acl_free(ozp->z_acl_cached); 211 ozp->z_acl_cached = NULL; 212 } 213 214 /* Update back pointers. */ 215 (void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys, 216 znode_evict_error); 217 218 /* 219 * Invalidate the original znode by clearing fields that provide a 220 * pointer back to the znode. Set the low bit of the vfs pointer to 221 * ensure that zfs_znode_move() recognizes the znode as invalid in any 222 * subsequent callback. 223 */ 224 ozp->z_dbuf = NULL; 225 POINTER_INVALIDATE(&ozp->z_zfsvfs); 226 } 227 228 #ifndef __NetBSD__ 229 /*ARGSUSED*/ 230 static kmem_cbrc_t 231 zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg) 232 { 233 znode_t *ozp = buf, *nzp = newbuf; 234 zfsvfs_t *zfsvfs; 235 vnode_t *vp; 236 237 /* 238 * The znode is on the file system's list of known znodes if the vfs 239 * pointer is valid. We set the low bit of the vfs pointer when freeing 240 * the znode to invalidate it, and the memory patterns written by kmem 241 * (baddcafe and deadbeef) set at least one of the two low bits. A newly 242 * created znode sets the vfs pointer last of all to indicate that the 243 * znode is known and in a valid state to be moved by this function. 244 */ 245 zfsvfs = ozp->z_zfsvfs; 246 if (!POINTER_IS_VALID(zfsvfs)) { 247 ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid); 248 return (KMEM_CBRC_DONT_KNOW); 249 } 250 251 /* 252 * Close a small window in which it's possible that the filesystem could 253 * be unmounted and freed, and zfsvfs, though valid in the previous 254 * statement, could point to unrelated memory by the time we try to 255 * prevent the filesystem from being unmounted. 256 */ 257 rw_enter(&zfsvfs_lock, RW_WRITER); 258 if (zfsvfs != ozp->z_zfsvfs) { 259 rw_exit(&zfsvfs_lock); 260 ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1); 261 return (KMEM_CBRC_DONT_KNOW); 262 } 263 264 /* 265 * If the znode is still valid, then so is the file system. We know that 266 * no valid file system can be freed while we hold zfsvfs_lock, so we 267 * can safely ensure that the filesystem is not and will not be 268 * unmounted. The next statement is equivalent to ZFS_ENTER(). 269 */ 270 rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG); 271 if (zfsvfs->z_unmounted) { 272 ZFS_EXIT(zfsvfs); 273 rw_exit(&zfsvfs_lock); 274 ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted); 275 return (KMEM_CBRC_DONT_KNOW); 276 } 277 rw_exit(&zfsvfs_lock); 278 279 mutex_enter(&zfsvfs->z_znodes_lock); 280 /* 281 * Recheck the vfs pointer in case the znode was removed just before 282 * acquiring the lock. 283 */ 284 if (zfsvfs != ozp->z_zfsvfs) { 285 mutex_exit(&zfsvfs->z_znodes_lock); 286 ZFS_EXIT(zfsvfs); 287 ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2); 288 return (KMEM_CBRC_DONT_KNOW); 289 } 290 291 /* 292 * At this point we know that as long as we hold z_znodes_lock, the 293 * znode cannot be freed and fields within the znode can be safely 294 * accessed. Now, prevent a race with zfs_zget(). 295 */ 296 if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) { 297 mutex_exit(&zfsvfs->z_znodes_lock); 298 ZFS_EXIT(zfsvfs); 299 ZNODE_STAT_ADD(znode_move_stats.zms_obj_held); 300 return (KMEM_CBRC_LATER); 301 } 302 303 vp = ZTOV(ozp); 304 if (mutex_tryenter(&vp->v_lock) == 0) { 305 ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); 306 mutex_exit(&zfsvfs->z_znodes_lock); 307 ZFS_EXIT(zfsvfs); 308 ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked); 309 return (KMEM_CBRC_LATER); 310 } 311 312 /* Only move znodes that are referenced _only_ by the DNLC. */ 313 if (vp->v_count != 1 || !vn_in_dnlc(vp)) { 314 mutex_exit(&vp->v_lock); 315 ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); 316 mutex_exit(&zfsvfs->z_znodes_lock); 317 ZFS_EXIT(zfsvfs); 318 ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc); 319 return (KMEM_CBRC_LATER); 320 } 321 322 /* 323 * The znode is known and in a valid state to move. We're holding the 324 * locks needed to execute the critical section. 325 */ 326 zfs_znode_move_impl(ozp, nzp); 327 mutex_exit(&vp->v_lock); 328 ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); 329 330 list_link_replace(&ozp->z_link_node, &nzp->z_link_node); 331 mutex_exit(&zfsvfs->z_znodes_lock); 332 ZFS_EXIT(zfsvfs); 333 334 return (KMEM_CBRC_YES); 335 } 336 #endif /* !__NetBSD__ */ 337 338 void 339 zfs_znode_init(void) 340 { 341 /* 342 * Initialize zcache 343 */ 344 rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL); 345 ASSERT(znode_cache == NULL); 346 znode_cache = kmem_cache_create("zfs_znode_cache", 347 sizeof (znode_t), 0, zfs_znode_cache_constructor, 348 zfs_znode_cache_destructor, NULL, NULL, NULL, 0); 349 } 350 351 void 352 zfs_znode_fini(void) 353 { 354 355 /* 356 * Cleanup zcache 357 */ 358 if (znode_cache) 359 kmem_cache_destroy(znode_cache); 360 znode_cache = NULL; 361 rw_destroy(&zfsvfs_lock); 362 } 363 364 #ifndef __NetBSD__ 365 struct vnodeops *zfs_dvnodeops; 366 struct vnodeops *zfs_fvnodeops; 367 struct vnodeops *zfs_symvnodeops; 368 struct vnodeops *zfs_xdvnodeops; 369 struct vnodeops *zfs_evnodeops; 370 struct vnodeops *zfs_sharevnodeops; 371 #endif 372 373 void 374 zfs_remove_op_tables() 375 { 376 #ifndef __NetBSD__ 377 /* 378 * Remove vfs ops 379 */ 380 ASSERT(zfsfstype); 381 (void) vfs_freevfsops_by_type(zfsfstype); 382 zfsfstype = 0; 383 384 /* 385 * Remove vnode ops 386 */ 387 if (zfs_dvnodeops) 388 vn_freevnodeops(zfs_dvnodeops); 389 if (zfs_fvnodeops) 390 vn_freevnodeops(zfs_fvnodeops); 391 if (zfs_symvnodeops) 392 vn_freevnodeops(zfs_symvnodeops); 393 if (zfs_xdvnodeops) 394 vn_freevnodeops(zfs_xdvnodeops); 395 if (zfs_evnodeops) 396 vn_freevnodeops(zfs_evnodeops); 397 if (zfs_sharevnodeops) 398 vn_freevnodeops(zfs_sharevnodeops); 399 400 zfs_dvnodeops = NULL; 401 zfs_fvnodeops = NULL; 402 zfs_symvnodeops = NULL; 403 zfs_xdvnodeops = NULL; 404 zfs_evnodeops = NULL; 405 zfs_sharevnodeops = NULL; 406 #endif 407 } 408 409 #ifndef __NetBSD__ 410 extern const fs_operation_def_t zfs_dvnodeops_template[]; 411 extern const fs_operation_def_t zfs_fvnodeops_template[]; 412 extern const fs_operation_def_t zfs_xdvnodeops_template[]; 413 extern const fs_operation_def_t zfs_symvnodeops_template[]; 414 extern const fs_operation_def_t zfs_evnodeops_template[]; 415 extern const fs_operation_def_t zfs_sharevnodeops_template[]; 416 #endif 417 418 int 419 zfs_create_op_tables() 420 { 421 #ifndef __NetBSD__ 422 int error; 423 424 /* 425 * zfs_dvnodeops can be set if mod_remove() calls mod_installfs() 426 * due to a failure to remove the the 2nd modlinkage (zfs_modldrv). 427 * In this case we just return as the ops vectors are already set up. 428 */ 429 if (zfs_dvnodeops) 430 return (0); 431 432 error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template, 433 &zfs_dvnodeops); 434 if (error) 435 return (error); 436 437 error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template, 438 &zfs_fvnodeops); 439 if (error) 440 return (error); 441 442 error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template, 443 &zfs_symvnodeops); 444 if (error) 445 return (error); 446 447 error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template, 448 &zfs_xdvnodeops); 449 if (error) 450 return (error); 451 452 error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template, 453 &zfs_evnodeops); 454 if (error) 455 return (error); 456 457 error = vn_make_ops(MNTTYPE_ZFS, zfs_sharevnodeops_template, 458 &zfs_sharevnodeops); 459 460 return (error); 461 #endif 462 return 0; 463 } 464 465 int 466 zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx) 467 { 468 zfs_acl_ids_t acl_ids; 469 vattr_t vattr; 470 znode_t *sharezp; 471 vnode_t *vp; 472 znode_t *zp; 473 int error; 474 475 vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; 476 vattr.va_type = VDIR; 477 vattr.va_mode = S_IFDIR|0555; 478 vattr.va_uid = crgetuid(kcred); 479 vattr.va_gid = crgetgid(kcred); 480 481 sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP); 482 sharezp->z_unlinked = 0; 483 sharezp->z_atime_dirty = 0; 484 sharezp->z_zfsvfs = zfsvfs; 485 486 vp = ZTOV(sharezp); 487 vn_reinit(vp); 488 vp->v_type = VDIR; 489 490 VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr, 491 kcred, NULL, &acl_ids)); 492 zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, 493 &zp, 0, &acl_ids); 494 ASSERT3P(zp, ==, sharezp); 495 ASSERT(!vn_in_dnlc(ZTOV(sharezp))); /* not valid to move */ 496 POINTER_INVALIDATE(&sharezp->z_zfsvfs); 497 error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, 498 ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx); 499 zfsvfs->z_shares_dir = sharezp->z_id; 500 501 zfs_acl_ids_free(&acl_ids); 502 ZTOV(sharezp)->v_count = 0; 503 dmu_buf_rele(sharezp->z_dbuf, NULL); 504 sharezp->z_dbuf = NULL; 505 kmem_cache_free(znode_cache, sharezp); 506 507 return (error); 508 } 509 510 /* 511 * define a couple of values we need available 512 * for both 64 and 32 bit environments. 513 */ 514 #ifndef NBITSMINOR64 515 #define NBITSMINOR64 32 516 #endif 517 #ifndef MAXMAJ64 518 #define MAXMAJ64 0xffffffffUL 519 #endif 520 #ifndef MAXMIN64 521 #define MAXMIN64 0xffffffffUL 522 #endif 523 524 /* 525 * Create special expldev for ZFS private use. 526 * Can't use standard expldev since it doesn't do 527 * what we want. The standard expldev() takes a 528 * dev32_t in LP64 and expands it to a long dev_t. 529 * We need an interface that takes a dev32_t in ILP32 530 * and expands it to a long dev_t. 531 */ 532 static uint64_t 533 zfs_expldev(dev_t dev) 534 { 535 return ((uint64_t)major(dev) << NBITSMINOR64) | 536 (minor_t)minor(dev); 537 } 538 539 /* 540 * Special cmpldev for ZFS private use. 541 * Can't use standard cmpldev since it takes 542 * a long dev_t and compresses it to dev32_t in 543 * LP64. We need to do a compaction of a long dev_t 544 * to a dev32_t in ILP32. 545 */ 546 dev_t 547 zfs_cmpldev(uint64_t dev) 548 { 549 minor_t minor = (minor_t)dev & MAXMIN64; 550 major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64; 551 552 return makedev(minor, major); 553 } 554 555 static void 556 zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db) 557 { 558 znode_t *nzp; 559 560 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs)); 561 ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id))); 562 563 mutex_enter(&zp->z_lock); 564 565 ASSERT(zp->z_dbuf == NULL); 566 ASSERT(zp->z_acl_cached == NULL); 567 zp->z_dbuf = db; 568 nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error); 569 570 /* 571 * there should be no 572 * concurrent zgets on this object. 573 */ 574 if (nzp != NULL) 575 panic("existing znode %p for dbuf %p", (void *)nzp, (void *)db); 576 577 /* 578 * Slap on VROOT if we are the root znode 579 */ 580 if (zp->z_id == zfsvfs->z_root) 581 ZTOV(zp)->v_flag |= VROOT; 582 583 mutex_exit(&zp->z_lock); 584 vn_exists(ZTOV(zp)); 585 } 586 587 void 588 zfs_znode_dmu_fini(znode_t *zp) 589 { 590 dmu_buf_t *db = zp->z_dbuf; 591 ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) || 592 zp->z_unlinked || 593 RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock)); 594 ASSERT(zp->z_dbuf != NULL); 595 zp->z_dbuf = NULL; 596 VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL)); 597 dmu_buf_rele(db, NULL); 598 } 599 600 /* 601 * Construct a new znode/vnode and intialize. 602 * 603 * This does not do a call to dmu_set_user() that is 604 * up to the caller to do, in case you don't want to 605 * return the znode 606 */ 607 608 static znode_t * 609 zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz) 610 { 611 znode_t *zp; 612 vnode_t *vp; 613 int error; 614 615 zp = kmem_cache_alloc(znode_cache, KM_SLEEP); 616 for (;;) { 617 618 error = getnewvnode(VT_ZFS, zfsvfs->z_parent->z_vfs, 619 zfs_vnodeop_p, &zp->z_vnode); 620 if (__predict_true(error == 0)) 621 break; 622 printf("WARNING: zfs_znode_alloc: unable to get vnode, " 623 "error=%d\n", error); 624 (void)kpause("zfsnewvn", false, hz, NULL); 625 } 626 627 ASSERT(zp->z_dirlocks == NULL); 628 ASSERT(zp->z_dbuf == NULL); 629 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); 630 631 /* 632 * Defer setting z_zfsvfs until the znode is ready to be a candidate for 633 * the zfs_znode_move() callback. 634 */ 635 zp->z_phys = NULL; 636 zp->z_unlinked = 0; 637 zp->z_atime_dirty = 0; 638 zp->z_mapcnt = 0; 639 zp->z_last_itx = 0; 640 zp->z_id = db->db_object; 641 zp->z_blksz = blksz; 642 zp->z_seq = 0x7A4653; 643 zp->z_sync_cnt = 0; 644 645 vp = ZTOV(zp); 646 647 zfs_znode_dmu_init(zfsvfs, zp, db); 648 649 zp->z_gen = zp->z_phys->zp_gen; 650 651 vp->v_vfsp = zfsvfs->z_parent->z_vfs; 652 vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode); 653 vp->v_data = zp; 654 switch (vp->v_type) { 655 case VDIR: 656 zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ 657 break; 658 case VBLK: 659 case VCHR: 660 /* XXX NetBSD vp->v_op = zfs_specop_p; */ 661 spec_node_init(vp, zfs_cmpldev(zp->z_phys->zp_rdev)); 662 break; 663 case VFIFO: 664 /* XXX NetBSD vp->v_op = zfs_fifoop_p; */ 665 break; 666 } 667 668 dprintf("zfs_znode_alloc znode %p -- vnode %p\n", zp, vp); 669 dprintf("zfs_znode_alloc z_id %ld\n", zp->z_id); 670 //cpu_Debugger(); 671 672 uvm_vnp_setsize(vp, zp->z_phys->zp_size); 673 674 mutex_enter(&zfsvfs->z_znodes_lock); 675 list_insert_tail(&zfsvfs->z_all_znodes, zp); 676 membar_producer(); 677 /* 678 * Everything else must be valid before assigning z_zfsvfs makes the 679 * znode eligible for zfs_znode_move(). 680 */ 681 zp->z_zfsvfs = zfsvfs; 682 mutex_exit(&zfsvfs->z_znodes_lock); 683 684 return (zp); 685 } 686 687 /* 688 * Create a new DMU object to hold a zfs znode. 689 * 690 * IN: dzp - parent directory for new znode 691 * vap - file attributes for new znode 692 * tx - dmu transaction id for zap operations 693 * cr - credentials of caller 694 * flag - flags: 695 * IS_ROOT_NODE - new object will be root 696 * IS_XATTR - new object is an attribute 697 * bonuslen - length of bonus buffer 698 * setaclp - File/Dir initial ACL 699 * fuidp - Tracks fuid allocation. 700 * 701 * OUT: zpp - allocated znode 702 * 703 */ 704 void 705 zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, 706 uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_ids_t *acl_ids) 707 { 708 dmu_buf_t *db; 709 znode_phys_t *pzp; 710 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 711 timestruc_t now; 712 uint64_t gen, obj; 713 int err; 714 715 ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); 716 717 if (zfsvfs->z_replay) { 718 obj = vap->va_nodeid; 719 now = vap->va_ctime; /* see zfs_replay_create() */ 720 gen = vap->va_nblocks; /* ditto */ 721 } else { 722 obj = 0; 723 gethrestime(&now); 724 gen = dmu_tx_get_txg(tx); 725 } 726 727 /* 728 * Create a new DMU object. 729 */ 730 /* 731 * There's currently no mechanism for pre-reading the blocks that will 732 * be to needed allocate a new object, so we accept the small chance 733 * that there will be an i/o error and we will fail one of the 734 * assertions below. 735 */ 736 if (vap->va_type == VDIR) { 737 if (zfsvfs->z_replay) { 738 err = zap_create_claim_norm(zfsvfs->z_os, obj, 739 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, 740 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 741 ASSERT3U(err, ==, 0); 742 } else { 743 obj = zap_create_norm(zfsvfs->z_os, 744 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, 745 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 746 } 747 } else { 748 if (zfsvfs->z_replay) { 749 err = dmu_object_claim(zfsvfs->z_os, obj, 750 DMU_OT_PLAIN_FILE_CONTENTS, 0, 751 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 752 ASSERT3U(err, ==, 0); 753 } else { 754 obj = dmu_object_alloc(zfsvfs->z_os, 755 DMU_OT_PLAIN_FILE_CONTENTS, 0, 756 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 757 } 758 } 759 760 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); 761 VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db)); 762 dmu_buf_will_dirty(db, tx); 763 764 /* 765 * Initialize the znode physical data to zero. 766 */ 767 ASSERT(db->db_size >= sizeof (znode_phys_t)); 768 bzero(db->db_data, db->db_size); 769 pzp = db->db_data; 770 771 /* 772 * If this is the root, fix up the half-initialized parent pointer 773 * to reference the just-allocated physical data area. 774 */ 775 if (flag & IS_ROOT_NODE) { 776 dzp->z_dbuf = db; 777 dzp->z_phys = pzp; 778 dzp->z_id = obj; 779 } 780 781 /* 782 * If parent is an xattr, so am I. 783 */ 784 if (dzp->z_phys->zp_flags & ZFS_XATTR) 785 flag |= IS_XATTR; 786 787 if (vap->va_type == VBLK || vap->va_type == VCHR) { 788 pzp->zp_rdev = zfs_expldev(vap->va_rdev); 789 } 790 791 if (zfsvfs->z_use_fuids) 792 pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; 793 794 if (vap->va_type == VDIR) { 795 pzp->zp_size = 2; /* contents ("." and "..") */ 796 pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; 797 } 798 799 pzp->zp_parent = dzp->z_id; 800 if (flag & IS_XATTR) 801 pzp->zp_flags |= ZFS_XATTR; 802 803 pzp->zp_gen = gen; 804 805 ZFS_TIME_ENCODE(&now, pzp->zp_crtime); 806 ZFS_TIME_ENCODE(&now, pzp->zp_ctime); 807 808 if (vap->va_mask & AT_ATIME) { 809 ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime); 810 } else { 811 ZFS_TIME_ENCODE(&now, pzp->zp_atime); 812 } 813 814 if (vap->va_mask & AT_MTIME) { 815 ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); 816 } else { 817 ZFS_TIME_ENCODE(&now, pzp->zp_mtime); 818 } 819 pzp->zp_uid = acl_ids->z_fuid; 820 pzp->zp_gid = acl_ids->z_fgid; 821 pzp->zp_mode = acl_ids->z_mode; 822 if (!(flag & IS_ROOT_NODE)) { 823 *zpp = zfs_znode_alloc(zfsvfs, db, 0); 824 } else { 825 /* 826 * If we are creating the root node, the "parent" we 827 * passed in is the znode for the root. 828 */ 829 *zpp = dzp; 830 } 831 VERIFY(0 == zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); 832 if (vap->va_mask & AT_XVATTR) 833 zfs_xvattr_set(*zpp, (xvattr_t *)vap); 834 835 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); 836 } 837 838 void 839 zfs_xvattr_set(znode_t *zp, xvattr_t *xvap) 840 { 841 xoptattr_t *xoap; 842 843 xoap = xva_getxoptattr(xvap); 844 ASSERT(xoap); 845 846 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { 847 ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime); 848 XVA_SET_RTN(xvap, XAT_CREATETIME); 849 } 850 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { 851 ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly); 852 XVA_SET_RTN(xvap, XAT_READONLY); 853 } 854 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { 855 ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden); 856 XVA_SET_RTN(xvap, XAT_HIDDEN); 857 } 858 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { 859 ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system); 860 XVA_SET_RTN(xvap, XAT_SYSTEM); 861 } 862 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { 863 ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive); 864 XVA_SET_RTN(xvap, XAT_ARCHIVE); 865 } 866 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 867 ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable); 868 XVA_SET_RTN(xvap, XAT_IMMUTABLE); 869 } 870 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 871 ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink); 872 XVA_SET_RTN(xvap, XAT_NOUNLINK); 873 } 874 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 875 ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly); 876 XVA_SET_RTN(xvap, XAT_APPENDONLY); 877 } 878 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 879 ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump); 880 XVA_SET_RTN(xvap, XAT_NODUMP); 881 } 882 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { 883 ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque); 884 XVA_SET_RTN(xvap, XAT_OPAQUE); 885 } 886 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 887 ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, 888 xoap->xoa_av_quarantined); 889 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); 890 } 891 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 892 ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified); 893 XVA_SET_RTN(xvap, XAT_AV_MODIFIED); 894 } 895 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { 896 (void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp, 897 sizeof (xoap->xoa_av_scanstamp)); 898 zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP; 899 XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); 900 } 901 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 902 ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse); 903 XVA_SET_RTN(xvap, XAT_REPARSE); 904 } 905 } 906 907 int 908 zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) 909 { 910 dmu_object_info_t doi; 911 dmu_buf_t *db; 912 znode_t *zp; 913 vnode_t *vp; 914 int err, first = 1; 915 916 *zpp = NULL; 917 again: 918 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); 919 920 err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db); 921 if (err) { 922 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 923 return (err); 924 } 925 926 dmu_object_info_from_db(db, &doi); 927 if (doi.doi_bonus_type != DMU_OT_ZNODE || 928 doi.doi_bonus_size < sizeof (znode_phys_t)) { 929 dmu_buf_rele(db, NULL); 930 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 931 return (EINVAL); 932 } 933 934 zp = dmu_buf_get_user(db); 935 if (zp != NULL) { 936 mutex_enter(&zp->z_lock); 937 938 /* 939 * Since we do immediate eviction of the z_dbuf, we 940 * should never find a dbuf with a znode that doesn't 941 * know about the dbuf. 942 */ 943 ASSERT3P(zp->z_dbuf, ==, db); 944 ASSERT3U(zp->z_id, ==, obj_num); 945 if (zp->z_unlinked) { 946 err = ENOENT; 947 } else { 948 if ((vp = ZTOV(zp)) != NULL) { 949 mutex_enter(&vp->v_interlock); 950 mutex_exit(&zp->z_lock); 951 if (vget(vp, LK_INTERLOCK) != 0) { 952 dmu_buf_rele(db, NULL); 953 mutex_exit(&vp->v_interlock); 954 goto again; 955 } 956 mutex_enter(&zp->z_lock); 957 } else { 958 if (first) { 959 ZFS_LOG(1, "dying znode detected (zp=%p)", zp); 960 first = 0; 961 } 962 /* 963 * znode is dying so we can't reuse it, we must 964 * wait until destruction is completed. 965 */ 966 dmu_buf_rele(db, NULL); 967 mutex_exit(&zp->z_lock); 968 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 969 kpause("zcollide", 0, 1, NULL); 970 goto again; 971 } 972 *zpp = zp; 973 err = 0; 974 } 975 976 dmu_buf_rele(db, NULL); 977 mutex_exit(&zp->z_lock); 978 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 979 return (err); 980 } 981 982 /* 983 * Not found create new znode/vnode 984 * but only if file exists. 985 * 986 * There is a small window where zfs_vget() could 987 * find this object while a file create is still in 988 * progress. Since a gen number can never be zero 989 * we will check that to determine if its an allocated 990 * file. 991 */ 992 993 if (((znode_phys_t *)db->db_data)->zp_gen != 0) { 994 zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size); 995 *zpp = zp; 996 997 vp = ZTOV(zp); 998 genfs_node_init(vp, &zfs_genfsops); 999 VOP_UNLOCK(vp, 0); 1000 1001 err = 0; 1002 } else { 1003 dmu_buf_rele(db, NULL); 1004 err = ENOENT; 1005 } 1006 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 1007 return (err); 1008 } 1009 1010 int 1011 zfs_rezget(znode_t *zp) 1012 { 1013 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1014 dmu_object_info_t doi; 1015 dmu_buf_t *db; 1016 uint64_t obj_num = zp->z_id; 1017 int err; 1018 1019 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); 1020 1021 err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db); 1022 if (err) { 1023 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 1024 return (err); 1025 } 1026 1027 dmu_object_info_from_db(db, &doi); 1028 if (doi.doi_bonus_type != DMU_OT_ZNODE || 1029 doi.doi_bonus_size < sizeof (znode_phys_t)) { 1030 dmu_buf_rele(db, NULL); 1031 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 1032 return (EINVAL); 1033 } 1034 1035 if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) { 1036 dmu_buf_rele(db, NULL); 1037 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 1038 return (EIO); 1039 } 1040 1041 mutex_enter(&zp->z_acl_lock); 1042 if (zp->z_acl_cached) { 1043 zfs_acl_free(zp->z_acl_cached); 1044 zp->z_acl_cached = NULL; 1045 } 1046 mutex_exit(&zp->z_acl_lock); 1047 1048 zfs_znode_dmu_init(zfsvfs, zp, db); 1049 zp->z_unlinked = (zp->z_phys->zp_links == 0); 1050 zp->z_blksz = doi.doi_data_block_size; 1051 1052 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 1053 1054 return (0); 1055 } 1056 1057 void 1058 zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) 1059 { 1060 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1061 objset_t *os = zfsvfs->z_os; 1062 uint64_t obj = zp->z_id; 1063 uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj; 1064 1065 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); 1066 if (acl_obj) 1067 VERIFY(0 == dmu_object_free(os, acl_obj, tx)); 1068 VERIFY(0 == dmu_object_free(os, obj, tx)); 1069 zfs_znode_dmu_fini(zp); 1070 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); 1071 zfs_znode_free(zp); 1072 } 1073 1074 /* 1075 * zfs_zinactive must be called with ZFS_OBJ_HOLD_ENTER held. And this lock 1076 * will be released in zfs_zinactive. 1077 */ 1078 void 1079 zfs_zinactive(znode_t *zp) 1080 { 1081 vnode_t *vp = ZTOV(zp); 1082 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1083 uint64_t z_id = zp->z_id; 1084 1085 ASSERT(zp->z_dbuf && zp->z_phys); 1086 1087 /* 1088 * Don't allow a zfs_zget() while were trying to release this znode 1089 */ 1090 ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id); 1091 1092 mutex_enter(&zp->z_lock); 1093 /* 1094 * If this was the last reference to a file with no links, 1095 * remove the file from the file system. 1096 */ 1097 if (zp->z_unlinked) { 1098 mutex_exit(&zp->z_lock); 1099 ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); 1100 zfs_rmnode(zp); 1101 return; 1102 } 1103 1104 mutex_exit(&zp->z_lock); 1105 /* XXX why disabled zfs_znode_dmu_fini(zp); */ 1106 ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); 1107 zfs_znode_free(zp); 1108 } 1109 1110 void 1111 zfs_znode_free(znode_t *zp) 1112 { 1113 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1114 ASSERT(ZTOV(zp) == NULL); 1115 1116 dprintf("destroying znode %p\n", zp); 1117 //cpu_Debugger(); 1118 mutex_enter(&zfsvfs->z_znodes_lock); 1119 POINTER_INVALIDATE(&zp->z_zfsvfs); 1120 list_remove(&zfsvfs->z_all_znodes, zp); 1121 mutex_exit(&zfsvfs->z_znodes_lock); 1122 1123 if (zp->z_acl_cached) { 1124 zfs_acl_free(zp->z_acl_cached); 1125 zp->z_acl_cached = NULL; 1126 } 1127 1128 kmem_cache_free(znode_cache, zp); 1129 1130 VFS_RELE(zfsvfs->z_vfs); 1131 } 1132 1133 void 1134 zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx) 1135 { 1136 timestruc_t now; 1137 1138 ASSERT(MUTEX_HELD(&zp->z_lock)); 1139 1140 gethrestime(&now); 1141 1142 if (tx) { 1143 dmu_buf_will_dirty(zp->z_dbuf, tx); 1144 zp->z_atime_dirty = 0; 1145 zp->z_seq++; 1146 } else { 1147 zp->z_atime_dirty = 1; 1148 } 1149 1150 if (flag & AT_ATIME) 1151 ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime); 1152 1153 if (flag & AT_MTIME) { 1154 ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime); 1155 if (zp->z_zfsvfs->z_use_fuids) 1156 zp->z_phys->zp_flags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED); 1157 } 1158 1159 if (flag & AT_CTIME) { 1160 ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime); 1161 if (zp->z_zfsvfs->z_use_fuids) 1162 zp->z_phys->zp_flags |= ZFS_ARCHIVE; 1163 } 1164 } 1165 1166 /* 1167 * Update the requested znode timestamps with the current time. 1168 * If we are in a transaction, then go ahead and mark the znode 1169 * dirty in the transaction so the timestamps will go to disk. 1170 * Otherwise, we will get pushed next time the znode is updated 1171 * in a transaction, or when this znode eventually goes inactive. 1172 * 1173 * Why is this OK? 1174 * 1 - Only the ACCESS time is ever updated outside of a transaction. 1175 * 2 - Multiple consecutive updates will be collapsed into a single 1176 * znode update by the transaction grouping semantics of the DMU. 1177 */ 1178 void 1179 zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx) 1180 { 1181 mutex_enter(&zp->z_lock); 1182 zfs_time_stamper_locked(zp, flag, tx); 1183 mutex_exit(&zp->z_lock); 1184 } 1185 1186 /* 1187 * Grow the block size for a file. 1188 * 1189 * IN: zp - znode of file to free data in. 1190 * size - requested block size 1191 * tx - open transaction. 1192 * 1193 * NOTE: this function assumes that the znode is write locked. 1194 */ 1195 void 1196 zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) 1197 { 1198 int error; 1199 u_longlong_t dummy; 1200 1201 if (size <= zp->z_blksz) 1202 return; 1203 /* 1204 * If the file size is already greater than the current blocksize, 1205 * we will not grow. If there is more than one block in a file, 1206 * the blocksize cannot change. 1207 */ 1208 if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz) 1209 return; 1210 1211 error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id, 1212 size, 0, tx); 1213 if (error == ENOTSUP) 1214 return; 1215 ASSERT3U(error, ==, 0); 1216 1217 /* What blocksize did we actually get? */ 1218 dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy); 1219 } 1220 1221 /* 1222 * Increase the file length 1223 * 1224 * IN: zp - znode of file to free data in. 1225 * end - new end-of-file 1226 * 1227 * RETURN: 0 if success 1228 * error code if failure 1229 */ 1230 static int 1231 zfs_extend(znode_t *zp, uint64_t end) 1232 { 1233 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1234 dmu_tx_t *tx; 1235 rl_t *rl; 1236 uint64_t newblksz; 1237 int error; 1238 1239 /* 1240 * We will change zp_size, lock the whole file. 1241 */ 1242 rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); 1243 1244 /* 1245 * Nothing to do if file already at desired length. 1246 */ 1247 if (end <= zp->z_phys->zp_size) { 1248 zfs_range_unlock(rl); 1249 return (0); 1250 } 1251 top: 1252 tx = dmu_tx_create(zfsvfs->z_os); 1253 dmu_tx_hold_bonus(tx, zp->z_id); 1254 if (end > zp->z_blksz && 1255 (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { 1256 /* 1257 * We are growing the file past the current block size. 1258 */ 1259 if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { 1260 ASSERT(!ISP2(zp->z_blksz)); 1261 newblksz = MIN(end, SPA_MAXBLOCKSIZE); 1262 } else { 1263 newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz); 1264 } 1265 dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); 1266 } else { 1267 newblksz = 0; 1268 } 1269 1270 error = dmu_tx_assign(tx, TXG_NOWAIT); 1271 if (error) { 1272 if (error == ERESTART) { 1273 dmu_tx_wait(tx); 1274 dmu_tx_abort(tx); 1275 goto top; 1276 } 1277 dmu_tx_abort(tx); 1278 zfs_range_unlock(rl); 1279 return (error); 1280 } 1281 dmu_buf_will_dirty(zp->z_dbuf, tx); 1282 1283 if (newblksz) 1284 zfs_grow_blocksize(zp, newblksz, tx); 1285 1286 zp->z_phys->zp_size = end; 1287 1288 zfs_range_unlock(rl); 1289 1290 dmu_tx_commit(tx); 1291 1292 uvm_vnp_setsize(ZTOV(zp), end); 1293 1294 return (0); 1295 } 1296 1297 /* 1298 * Free space in a file. 1299 * 1300 * IN: zp - znode of file to free data in. 1301 * off - start of section to free. 1302 * len - length of section to free. 1303 * 1304 * RETURN: 0 if success 1305 * error code if failure 1306 */ 1307 static int 1308 zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) 1309 { 1310 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1311 rl_t *rl; 1312 int error; 1313 1314 /* 1315 * Lock the range being freed. 1316 */ 1317 rl = zfs_range_lock(zp, off, len, RL_WRITER); 1318 1319 /* 1320 * Nothing to do if file already at desired length. 1321 */ 1322 if (off >= zp->z_phys->zp_size) { 1323 zfs_range_unlock(rl); 1324 return (0); 1325 } 1326 1327 if (off + len > zp->z_phys->zp_size) 1328 len = zp->z_phys->zp_size - off; 1329 1330 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); 1331 1332 if (error == 0) { 1333 /* 1334 * In NetBSD we cannot free block in the middle of a file, 1335 * but only at the end of a file. 1336 */ 1337 uvm_vnp_setsize(ZTOV(zp), off); 1338 } 1339 1340 zfs_range_unlock(rl); 1341 1342 return (error); 1343 } 1344 1345 /* 1346 * Truncate a file 1347 * 1348 * IN: zp - znode of file to free data in. 1349 * end - new end-of-file. 1350 * 1351 * RETURN: 0 if success 1352 * error code if failure 1353 */ 1354 static int 1355 zfs_trunc(znode_t *zp, uint64_t end) 1356 { 1357 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1358 vnode_t *vp = ZTOV(zp); 1359 dmu_tx_t *tx; 1360 rl_t *rl; 1361 int error; 1362 1363 /* 1364 * We will change zp_size, lock the whole file. 1365 */ 1366 rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); 1367 1368 /* 1369 * Nothing to do if file already at desired length. 1370 */ 1371 if (end >= zp->z_phys->zp_size) { 1372 zfs_range_unlock(rl); 1373 return (0); 1374 } 1375 1376 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1); 1377 if (error) { 1378 zfs_range_unlock(rl); 1379 return (error); 1380 } 1381 top: 1382 tx = dmu_tx_create(zfsvfs->z_os); 1383 dmu_tx_hold_bonus(tx, zp->z_id); 1384 error = dmu_tx_assign(tx, TXG_NOWAIT); 1385 if (error) { 1386 if (error == ERESTART) { 1387 dmu_tx_wait(tx); 1388 dmu_tx_abort(tx); 1389 goto top; 1390 } 1391 dmu_tx_abort(tx); 1392 zfs_range_unlock(rl); 1393 return (error); 1394 } 1395 dmu_buf_will_dirty(zp->z_dbuf, tx); 1396 1397 zp->z_phys->zp_size = end; 1398 1399 dmu_tx_commit(tx); 1400 1401 /* 1402 * Clear any mapped pages in the truncated region. This has to 1403 * happen outside of the transaction to avoid the possibility of 1404 * a deadlock with someone trying to push a page that we are 1405 * about to invalidate. 1406 */ 1407 1408 uvm_vnp_setsize(vp, end); 1409 1410 return (0); 1411 } 1412 1413 /* 1414 * Free space in a file 1415 * 1416 * IN: zp - znode of file to free data in. 1417 * off - start of range 1418 * len - end of range (0 => EOF) 1419 * flag - current file open mode flags. 1420 * log - TRUE if this action should be logged 1421 * 1422 * RETURN: 0 if success 1423 * error code if failure 1424 */ 1425 int 1426 zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) 1427 { 1428 vnode_t *vp = ZTOV(zp); 1429 dmu_tx_t *tx; 1430 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1431 zilog_t *zilog = zfsvfs->z_log; 1432 int error; 1433 1434 if (off > zp->z_phys->zp_size) { 1435 error = zfs_extend(zp, off+len); 1436 if (error == 0 && log) 1437 goto log; 1438 else 1439 return (error); 1440 } 1441 1442 if (len == 0) { 1443 error = zfs_trunc(zp, off); 1444 } else { 1445 if ((error = zfs_free_range(zp, off, len)) == 0 && 1446 off + len > zp->z_phys->zp_size) 1447 error = zfs_extend(zp, off+len); 1448 } 1449 if (error || !log) 1450 return (error); 1451 log: 1452 tx = dmu_tx_create(zfsvfs->z_os); 1453 dmu_tx_hold_bonus(tx, zp->z_id); 1454 error = dmu_tx_assign(tx, TXG_NOWAIT); 1455 if (error) { 1456 if (error == ERESTART) { 1457 dmu_tx_wait(tx); 1458 dmu_tx_abort(tx); 1459 goto log; 1460 } 1461 dmu_tx_abort(tx); 1462 return (error); 1463 } 1464 1465 zfs_time_stamper(zp, CONTENT_MODIFIED, tx); 1466 zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); 1467 1468 dmu_tx_commit(tx); 1469 return (0); 1470 } 1471 1472 void 1473 zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) 1474 { 1475 zfsvfs_t zfsvfs; 1476 uint64_t moid, obj, version; 1477 uint64_t sense = ZFS_CASE_SENSITIVE; 1478 uint64_t norm = 0; 1479 nvpair_t *elem; 1480 int error; 1481 int i; 1482 znode_t *rootzp = NULL; 1483 vnode_t *vp; 1484 vattr_t vattr; 1485 znode_t *zp; 1486 zfs_acl_ids_t acl_ids; 1487 1488 /* 1489 * First attempt to create master node. 1490 */ 1491 /* 1492 * In an empty objset, there are no blocks to read and thus 1493 * there can be no i/o errors (which we assert below). 1494 */ 1495 moid = MASTER_NODE_OBJ; 1496 error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, 1497 DMU_OT_NONE, 0, tx); 1498 ASSERT(error == 0); 1499 1500 /* 1501 * Set starting attributes. 1502 */ 1503 if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_USERSPACE) 1504 version = ZPL_VERSION; 1505 else if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) 1506 version = ZPL_VERSION_USERSPACE - 1; 1507 else 1508 version = ZPL_VERSION_FUID - 1; 1509 elem = NULL; 1510 while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { 1511 /* For the moment we expect all zpl props to be uint64_ts */ 1512 uint64_t val; 1513 char *name; 1514 1515 ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); 1516 VERIFY(nvpair_value_uint64(elem, &val) == 0); 1517 name = nvpair_name(elem); 1518 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { 1519 if (val < version) 1520 version = val; 1521 } else { 1522 error = zap_update(os, moid, name, 8, 1, &val, tx); 1523 } 1524 ASSERT(error == 0); 1525 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) 1526 norm = val; 1527 else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) 1528 sense = val; 1529 } 1530 ASSERT(version != 0); 1531 error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); 1532 1533 /* 1534 * Create a delete queue. 1535 */ 1536 obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); 1537 1538 error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); 1539 ASSERT(error == 0); 1540 1541 /* 1542 * Create root znode. Create minimal znode/vnode/zfsvfs 1543 * to allow zfs_mknode to work. 1544 */ 1545 vattr_null(&vattr); 1546 vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; 1547 vattr.va_type = VDIR; 1548 vattr.va_mode = S_IFDIR|0755; 1549 vattr.va_uid = crgetuid(cr); 1550 vattr.va_gid = crgetgid(cr); 1551 1552 rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); 1553 rootzp->z_unlinked = 0; 1554 rootzp->z_atime_dirty = 0; 1555 1556 for (;;) { 1557 error = getnewvnode(VT_ZFS, NULL, zfs_vnodeop_p, 1558 &rootzp->z_vnode); 1559 if (error == 0) 1560 break; 1561 printf("WARNING: zfs_create_fs: unable to get vnode, " 1562 "error=%d\n", error); 1563 kpause("zfsvn", false, hz, NULL); 1564 } 1565 1566 vp = ZTOV(rootzp); 1567 vp->v_type = VDIR; 1568 1569 bzero(&zfsvfs, sizeof (zfsvfs_t)); 1570 1571 zfsvfs.z_os = os; 1572 zfsvfs.z_parent = &zfsvfs; 1573 zfsvfs.z_version = version; 1574 zfsvfs.z_use_fuids = USE_FUIDS(version, os); 1575 zfsvfs.z_norm = norm; 1576 /* 1577 * Fold case on file systems that are always or sometimes case 1578 * insensitive. 1579 */ 1580 if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) 1581 zfsvfs.z_norm |= U8_TEXTPREP_TOUPPER; 1582 1583 mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); 1584 list_create(&zfsvfs.z_all_znodes, sizeof (znode_t), 1585 offsetof(znode_t, z_link_node)); 1586 1587 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1588 mutex_init(&zfsvfs.z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); 1589 1590 ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); 1591 rootzp->z_zfsvfs = &zfsvfs; 1592 VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, 1593 cr, NULL, &acl_ids)); 1594 zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, &acl_ids); 1595 ASSERT3P(zp, ==, rootzp); 1596 error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); 1597 ASSERT(error == 0); 1598 zfs_acl_ids_free(&acl_ids); 1599 POINTER_INVALIDATE(&rootzp->z_zfsvfs); 1600 1601 dmu_buf_rele(rootzp->z_dbuf, NULL); 1602 rootzp->z_dbuf = NULL; 1603 ungetnewvnode(vp); 1604 kmem_cache_free(znode_cache, rootzp); 1605 1606 /* 1607 * Create shares directory 1608 */ 1609 1610 error = zfs_create_share_dir(&zfsvfs, tx); 1611 1612 ASSERT(error == 0); 1613 1614 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1615 mutex_destroy(&zfsvfs.z_hold_mtx[i]); 1616 } 1617 1618 #endif /* _KERNEL */ 1619 /* 1620 * Given an object number, return its parent object number and whether 1621 * or not the object is an extended attribute directory. 1622 */ 1623 static int 1624 zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir) 1625 { 1626 dmu_buf_t *db; 1627 dmu_object_info_t doi; 1628 znode_phys_t *zp; 1629 int error; 1630 1631 if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0) 1632 return (error); 1633 1634 dmu_object_info_from_db(db, &doi); 1635 if (doi.doi_bonus_type != DMU_OT_ZNODE || 1636 doi.doi_bonus_size < sizeof (znode_phys_t)) { 1637 dmu_buf_rele(db, FTAG); 1638 return (EINVAL); 1639 } 1640 1641 zp = db->db_data; 1642 *pobjp = zp->zp_parent; 1643 *is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) && 1644 S_ISDIR(zp->zp_mode); 1645 dmu_buf_rele(db, FTAG); 1646 1647 return (0); 1648 } 1649 1650 int 1651 zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) 1652 { 1653 char *path = buf + len - 1; 1654 int error; 1655 1656 *path = '\0'; 1657 1658 for (;;) { 1659 uint64_t pobj; 1660 char component[MAXNAMELEN + 2]; 1661 size_t complen; 1662 int is_xattrdir; 1663 1664 if ((error = zfs_obj_to_pobj(osp, obj, &pobj, 1665 &is_xattrdir)) != 0) 1666 break; 1667 1668 if (pobj == obj) { 1669 if (path[0] != '/') 1670 *--path = '/'; 1671 break; 1672 } 1673 1674 component[0] = '/'; 1675 if (is_xattrdir) { 1676 (void) sprintf(component + 1, "<xattrdir>"); 1677 } else { 1678 error = zap_value_search(osp, pobj, obj, 1679 ZFS_DIRENT_OBJ(-1ULL), component + 1); 1680 if (error != 0) 1681 break; 1682 } 1683 1684 complen = strlen(component); 1685 path -= complen; 1686 ASSERT(path >= buf); 1687 bcopy(component, path, complen); 1688 obj = pobj; 1689 } 1690 1691 if (error == 0) 1692 (void) memmove(buf, path, buf + len - path); 1693 return (error); 1694 } 1695