/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/* Portions Copyright 2007 Jeremy Teo */

#ifdef _KERNEL
#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/mntent.h>
#include <sys/u8_textprep.h>
#include <sys/dsl_dataset.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/errno.h>
#include <sys/unistd.h>
#include <sys/atomic.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_rlock.h>
#include <sys/zfs_fuid.h>
#include <sys/fs/zfs.h>
#include <sys/kidmap.h>
#endif /* _KERNEL */

#include <sys/dmu.h>
#include <sys/refcount.h>
#include <sys/stat.h>
#include <sys/zap.h>
#include <sys/zfs_znode.h>

#include "zfs_prop.h"

#if defined(_KERNEL) && defined(__NetBSD__)
#include <miscfs/specfs/specdev.h>
static const struct genfs_ops zfs_genfsops = {
	.gop_write = genfs_compat_gop_write,
};

#endif

extern int (**zfs_vnodeop_p)(void *);
extern int (**zfs_fifoop_p)(void *);
extern int (**zfs_specop_p)(void *);

/*
 * Define ZNODE_STATS to turn on statistic gathering. By default, it is only
 * turned on when DEBUG is also defined.
 */
#ifdef	DEBUG
#define	ZNODE_STATS
#endif	/* DEBUG */

#ifdef	ZNODE_STATS
#define	ZNODE_STAT_ADD(stat)			((stat)++)
#else
#define	ZNODE_STAT_ADD(stat)			/* nothing */
#endif	/* ZNODE_STATS */

#define	POINTER_IS_VALID(p)	(!((uintptr_t)(p) & 0x3))
#define	POINTER_INVALIDATE(pp)	(*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1))
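/*
 * Illustration only (not compiled): how the two macros above cooperate.
 * Any heap pointer is at least 4-byte aligned, so its two low bits are
 * clear and POINTER_IS_VALID() holds; POINTER_INVALIDATE() sets bit 0,
 * which POINTER_IS_VALID() then rejects.
 */
#if 0
static void
pointer_tag_example(void)
{
	void *p = kmem_alloc(sizeof (uint64_t), KM_SLEEP);

	ASSERT(POINTER_IS_VALID(p));	/* aligned => low bits clear */
	POINTER_INVALIDATE(&p);
	ASSERT(!POINTER_IS_VALID(p));	/* bit 0 now set */
	/* p must not be dereferenced (or freed) through this tagged copy */
}
#endif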
/*
 * Functions needed for userland (i.e., libzpool) are not put under
 * #ifdef _KERNEL; the rest of the functions have dependencies
 * (such as VFS logic) that will not compile easily in userland.
 */
#ifdef _KERNEL
/*
 * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to
 * be freed before it can be safely accessed.
 */
krwlock_t zfsvfs_lock;

static kmem_cache_t *znode_cache = NULL;

/*ARGSUSED*/
static void
znode_evict_error(dmu_buf_t *dbuf, void *user_ptr)
{
	/*
	 * We should never drop all dbuf refs without first clearing
	 * the eviction callback.
	 */
	panic("evicting znode %p\n", user_ptr);
}

/*ARGSUSED*/
static int
zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
{
	znode_t *zp = arg;

	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));

	list_link_init(&zp->z_link_node);

	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);

	mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&zp->z_range_avl, zfs_range_compare,
	    sizeof (rl_t), offsetof(rl_t, r_node));

	zp->z_dbuf = NULL;
	zp->z_dirlocks = NULL;
	zp->z_acl_cached = NULL;
	return (0);
}

/*ARGSUSED*/
static void
zfs_znode_cache_destructor(void *buf, void *arg)
{
	znode_t *zp = arg;

	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
	ASSERT(ZTOV(zp) == NULL);

	ASSERT(!list_link_active(&zp->z_link_node));
	mutex_destroy(&zp->z_lock);
	rw_destroy(&zp->z_parent_lock);
	rw_destroy(&zp->z_name_lock);
	mutex_destroy(&zp->z_acl_lock);
	avl_destroy(&zp->z_range_avl);
	mutex_destroy(&zp->z_range_lock);

	ASSERT(zp->z_dbuf == NULL);
	ASSERT(zp->z_dirlocks == NULL);
	ASSERT(zp->z_acl_cached == NULL);
}

#ifdef ZNODE_STATS
static struct {
	uint64_t zms_zfsvfs_invalid;
	uint64_t zms_zfsvfs_recheck1;
	uint64_t zms_zfsvfs_unmounted;
	uint64_t zms_zfsvfs_recheck2;
	uint64_t zms_obj_held;
	uint64_t zms_vnode_locked;
	uint64_t zms_not_only_dnlc;
} znode_move_stats;
#endif	/* ZNODE_STATS */

static void
zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
{
	vnode_t *vp;

	/* Copy fields. */
	nzp->z_zfsvfs = ozp->z_zfsvfs;

	/* Swap vnodes. */
	vp = nzp->z_vnode;
	nzp->z_vnode = ozp->z_vnode;
	ozp->z_vnode = vp;	/* let destructor free the overwritten vnode */
	ZTOV(ozp)->v_data = ozp;
	ZTOV(nzp)->v_data = nzp;

	nzp->z_id = ozp->z_id;
	ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */
	ASSERT(avl_numnodes(&ozp->z_range_avl) == 0);
	nzp->z_unlinked = ozp->z_unlinked;
	nzp->z_atime_dirty = ozp->z_atime_dirty;
	nzp->z_zn_prefetch = ozp->z_zn_prefetch;
	nzp->z_blksz = ozp->z_blksz;
	nzp->z_seq = ozp->z_seq;
	nzp->z_mapcnt = ozp->z_mapcnt;
	nzp->z_last_itx = ozp->z_last_itx;
	nzp->z_gen = ozp->z_gen;
	nzp->z_sync_cnt = ozp->z_sync_cnt;
	nzp->z_phys = ozp->z_phys;
	nzp->z_dbuf = ozp->z_dbuf;

	/*
	 * Since this is just an idle znode and kmem is already dealing with
	 * memory pressure, release any cached ACL.
	 */
	if (ozp->z_acl_cached) {
		zfs_acl_free(ozp->z_acl_cached);
		ozp->z_acl_cached = NULL;
	}

	/* Update back pointers. */
	(void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys,
	    znode_evict_error);

	/*
	 * Invalidate the original znode by clearing fields that provide a
	 * pointer back to the znode. Set the low bit of the vfs pointer to
	 * ensure that zfs_znode_move() recognizes the znode as invalid in any
	 * subsequent callback.
	 */
	ozp->z_dbuf = NULL;
	POINTER_INVALIDATE(&ozp->z_zfsvfs);
}
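/*
 * Sketch of the dmu_buf_update_user() contract assumed above (and relied
 * on again in zfs_znode_dmu_fini() below): the call replaces the dbuf's
 * user pointer only if it still equals the old pointer, and returns the
 * old pointer on success. Illustration only:
 */
#if 0
	void *winner;

	winner = dmu_buf_update_user(db, old_user, new_user, new_data_pp,
	    evict_func);
	ASSERT(winner == old_user);	/* we expected to win any race */
#endif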
#ifndef __NetBSD__
/*ARGSUSED*/
static kmem_cbrc_t
zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
{
	znode_t *ozp = buf, *nzp = newbuf;
	zfsvfs_t *zfsvfs;
	vnode_t *vp;

	/*
	 * The znode is on the file system's list of known znodes if the vfs
	 * pointer is valid. We set the low bit of the vfs pointer when freeing
	 * the znode to invalidate it, and the memory patterns written by kmem
	 * (baddcafe and deadbeef) set at least one of the two low bits. A newly
	 * created znode sets the vfs pointer last of all to indicate that the
	 * znode is known and in a valid state to be moved by this function.
	 */
	zfsvfs = ozp->z_zfsvfs;
	if (!POINTER_IS_VALID(zfsvfs)) {
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * Close a small window in which it's possible that the filesystem
	 * could be unmounted and freed, and zfsvfs, though valid in the
	 * previous statement, could point to unrelated memory by the time we
	 * try to prevent the filesystem from being unmounted.
	 */
	rw_enter(&zfsvfs_lock, RW_WRITER);
	if (zfsvfs != ozp->z_zfsvfs) {
		rw_exit(&zfsvfs_lock);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * If the znode is still valid, then so is the file system. We know
	 * that no valid file system can be freed while we hold zfsvfs_lock, so
	 * we can safely ensure that the filesystem is not and will not be
	 * unmounted. The next statement is equivalent to ZFS_ENTER().
	 */
	rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
	if (zfsvfs->z_unmounted) {
		ZFS_EXIT(zfsvfs);
		rw_exit(&zfsvfs_lock);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
		return (KMEM_CBRC_DONT_KNOW);
	}
	rw_exit(&zfsvfs_lock);

	mutex_enter(&zfsvfs->z_znodes_lock);
	/*
	 * Recheck the vfs pointer in case the znode was removed just before
	 * acquiring the lock.
	 */
	if (zfsvfs != ozp->z_zfsvfs) {
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * At this point we know that as long as we hold z_znodes_lock, the
	 * znode cannot be freed and fields within the znode can be safely
	 * accessed. Now, prevent a race with zfs_zget().
	 */
	if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) {
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_obj_held);
		return (KMEM_CBRC_LATER);
	}

	vp = ZTOV(ozp);
	if (mutex_tryenter(&vp->v_lock) == 0) {
		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked);
		return (KMEM_CBRC_LATER);
	}

	/* Only move znodes that are referenced _only_ by the DNLC. */
	if (vp->v_count != 1 || !vn_in_dnlc(vp)) {
		mutex_exit(&vp->v_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc);
		return (KMEM_CBRC_LATER);
	}

	/*
	 * The znode is known and in a valid state to move. We're holding the
	 * locks needed to execute the critical section.
	 */
	zfs_znode_move_impl(ozp, nzp);
	mutex_exit(&vp->v_lock);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);

	list_link_replace(&ozp->z_link_node, &nzp->z_link_node);
	mutex_exit(&zfsvfs->z_znodes_lock);
	ZFS_EXIT(zfsvfs);

	return (KMEM_CBRC_YES);
}
#endif	/* !__NetBSD__ */
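/*
 * On OpenSolaris the callback above is wired up when the cache is
 * created; a minimal sketch, assuming the Solaris kmem_cache_set_move()
 * interface (NetBSD's kmem has no equivalent hook, which is why
 * zfs_znode_move() is compiled out under __NetBSD__):
 */
#if 0
	kmem_cache_set_move(znode_cache, zfs_znode_move);
#endif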
void
zfs_znode_init(void)
{
	/*
	 * Initialize zcache
	 */
	rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL);
	ASSERT(znode_cache == NULL);
	znode_cache = kmem_cache_create("zfs_znode_cache",
	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
	    zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
}

void
zfs_znode_fini(void)
{

	/*
	 * Cleanup zcache
	 */
	if (znode_cache)
		kmem_cache_destroy(znode_cache);
	znode_cache = NULL;
	rw_destroy(&zfsvfs_lock);
}

#ifndef __NetBSD__
struct vnodeops *zfs_dvnodeops;
struct vnodeops *zfs_fvnodeops;
struct vnodeops *zfs_symvnodeops;
struct vnodeops *zfs_xdvnodeops;
struct vnodeops *zfs_evnodeops;
struct vnodeops *zfs_sharevnodeops;
#endif

void
zfs_remove_op_tables()
{
#ifndef __NetBSD__
	/*
	 * Remove vfs ops
	 */
	ASSERT(zfsfstype);
	(void) vfs_freevfsops_by_type(zfsfstype);
	zfsfstype = 0;

	/*
	 * Remove vnode ops
	 */
	if (zfs_dvnodeops)
		vn_freevnodeops(zfs_dvnodeops);
	if (zfs_fvnodeops)
		vn_freevnodeops(zfs_fvnodeops);
	if (zfs_symvnodeops)
		vn_freevnodeops(zfs_symvnodeops);
	if (zfs_xdvnodeops)
		vn_freevnodeops(zfs_xdvnodeops);
	if (zfs_evnodeops)
		vn_freevnodeops(zfs_evnodeops);
	if (zfs_sharevnodeops)
		vn_freevnodeops(zfs_sharevnodeops);

	zfs_dvnodeops = NULL;
	zfs_fvnodeops = NULL;
	zfs_symvnodeops = NULL;
	zfs_xdvnodeops = NULL;
	zfs_evnodeops = NULL;
	zfs_sharevnodeops = NULL;
#endif
}

#ifndef __NetBSD__
extern const fs_operation_def_t zfs_dvnodeops_template[];
extern const fs_operation_def_t zfs_fvnodeops_template[];
extern const fs_operation_def_t zfs_xdvnodeops_template[];
extern const fs_operation_def_t zfs_symvnodeops_template[];
extern const fs_operation_def_t zfs_evnodeops_template[];
extern const fs_operation_def_t zfs_sharevnodeops_template[];
#endif

int
zfs_create_op_tables()
{
#ifndef __NetBSD__
	int error;

	/*
	 * zfs_dvnodeops can be set if mod_remove() calls mod_installfs()
	 * due to a failure to remove the 2nd modlinkage (zfs_modldrv).
	 * In this case we just return as the ops vectors are already set up.
	 */
	if (zfs_dvnodeops)
		return (0);

	error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
	    &zfs_dvnodeops);
	if (error)
		return (error);

	error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
	    &zfs_fvnodeops);
	if (error)
		return (error);

	error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template,
	    &zfs_symvnodeops);
	if (error)
		return (error);

	error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template,
	    &zfs_xdvnodeops);
	if (error)
		return (error);

	error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
	    &zfs_evnodeops);
	if (error)
		return (error);

	error = vn_make_ops(MNTTYPE_ZFS, zfs_sharevnodeops_template,
	    &zfs_sharevnodeops);

	return (error);
#endif
	return (0);
}
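/*
 * For reference, a vn_make_ops() template on OpenSolaris is a
 * NULL-terminated array pairing operation names with handlers. A minimal
 * sketch; the zfs_open/zfs_close entries below are placeholders, not the
 * actual template contents from zfs_vnops.c:
 */
#if 0
const fs_operation_def_t example_vnodeops_template[] = {
	VOPNAME_OPEN,	{ .vop_open = zfs_open },
	VOPNAME_CLOSE,	{ .vop_close = zfs_close },
	NULL,		NULL
};
#endif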
int
zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
{
	zfs_acl_ids_t acl_ids;
	vattr_t vattr;
	znode_t *sharezp;
	vnode_t *vp;
	znode_t *zp;
	int error;

	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
	vattr.va_type = VDIR;
	vattr.va_mode = S_IFDIR|0555;
	vattr.va_uid = crgetuid(kcred);
	vattr.va_gid = crgetgid(kcred);

	sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	sharezp->z_unlinked = 0;
	sharezp->z_atime_dirty = 0;
	sharezp->z_zfsvfs = zfsvfs;

	error = getnewvnode(VT_ZFS, zfsvfs->z_parent->z_vfs,
	    zfs_vnodeop_p, &sharezp->z_vnode);
	vp = ZTOV(sharezp);
	vp->v_type = VDIR;

	VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
	    kcred, NULL, &acl_ids));
	zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE,
	    &zp, 0, &acl_ids);
	ASSERT3P(zp, ==, sharezp);
	ASSERT(!vn_in_dnlc(ZTOV(sharezp))); /* not valid to move */
	POINTER_INVALIDATE(&sharezp->z_zfsvfs);
	error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
	    ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
	zfsvfs->z_shares_dir = sharezp->z_id;

	zfs_acl_ids_free(&acl_ids);
	ZTOV(sharezp)->v_count = 0;
	dmu_buf_rele(sharezp->z_dbuf, NULL);
	sharezp->z_dbuf = NULL;
	kmem_cache_free(znode_cache, sharezp);

	return (error);
}

/*
 * Define a couple of values we need available
 * for both 64 and 32 bit environments.
 */
#ifndef NBITSMINOR64
#define	NBITSMINOR64	32
#endif
#ifndef MAXMAJ64
#define	MAXMAJ64	0xffffffffUL
#endif
#ifndef MAXMIN64
#define	MAXMIN64	0xffffffffUL
#endif

/*
 * Create special expldev for ZFS private use.
 * Can't use standard expldev since it doesn't do
 * what we want. The standard expldev() takes a
 * dev32_t in LP64 and expands it to a long dev_t.
 * We need an interface that takes a dev32_t in ILP32
 * and expands it to a long dev_t.
 */
static uint64_t
zfs_expldev(dev_t dev)
{
	return (((uint64_t)major(dev) << NBITSMINOR64) |
	    (minor_t)minor(dev));
}

/*
 * Special cmpldev for ZFS private use.
 * Can't use standard cmpldev since it takes
 * a long dev_t and compresses it to dev32_t in
 * LP64. We need to do a compaction of a long dev_t
 * to a dev32_t in ILP32.
 */
dev_t
zfs_cmpldev(uint64_t dev)
{
	minor_t minor = (minor_t)dev & MAXMIN64;
	major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;

	return (makedev(major, minor));
}
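/*
 * Worked example (illustration only): with NBITSMINOR64 == 32, a device
 * numbered major 13, minor 42 round-trips through the 64-bit on-disk
 * form as follows.
 */
#if 0
	uint64_t d = zfs_expldev(makedev(13, 42));

	ASSERT3U(d, ==, ((uint64_t)13 << NBITSMINOR64) | 42);
	ASSERT(zfs_cmpldev(d) == makedev(13, 42));
#endif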
static void
zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db)
{
	znode_t *nzp;

	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));

	mutex_enter(&zp->z_lock);

	ASSERT(zp->z_dbuf == NULL);
	ASSERT(zp->z_acl_cached == NULL);
	zp->z_dbuf = db;
	nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error);

	/*
	 * There should be no
	 * concurrent zgets on this object.
	 */
	if (nzp != NULL)
		panic("existing znode %p for dbuf %p", (void *)nzp, (void *)db);

	/*
	 * Slap on VROOT if we are the root znode
	 */
	if (zp->z_id == zfsvfs->z_root)
		ZTOV(zp)->v_flag |= VROOT;

	mutex_exit(&zp->z_lock);
	vn_exists(ZTOV(zp));
}

void
zfs_znode_dmu_fini(znode_t *zp)
{
	dmu_buf_t *db = zp->z_dbuf;
	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
	    zp->z_unlinked ||
	    RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock));
	ASSERT(zp->z_dbuf != NULL);
	zp->z_dbuf = NULL;
	VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL));
	dmu_buf_rele(db, NULL);
}
/*
 * Construct a new znode/vnode and initialize.
 *
 * This does not do a call to dmu_set_user(); that is
 * up to the caller to do, in case you don't want to
 * return the znode
 */
static znode_t *
zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
{
	znode_t *zp;
	vnode_t *vp;
	int error;

	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	for (;;) {

		error = getnewvnode(VT_ZFS, zfsvfs->z_parent->z_vfs,
		    zfs_vnodeop_p, &zp->z_vnode);
		if (__predict_true(error == 0))
			break;
		printf("WARNING: zfs_znode_alloc: unable to get vnode, "
		    "error=%d\n", error);
		(void)kpause("zfsnewvn", false, hz, NULL);
	}

	ASSERT(zp->z_dirlocks == NULL);
	ASSERT(zp->z_dbuf == NULL);
	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));

	/*
	 * Defer setting z_zfsvfs until the znode is ready to be a candidate
	 * for the zfs_znode_move() callback.
	 */
	zp->z_phys = NULL;
	zp->z_unlinked = 0;
	zp->z_atime_dirty = 0;
	zp->z_mapcnt = 0;
	zp->z_last_itx = 0;
	zp->z_id = db->db_object;
	zp->z_blksz = blksz;
	zp->z_seq = 0x7A4653;
	zp->z_sync_cnt = 0;

	vp = ZTOV(zp);

	zfs_znode_dmu_init(zfsvfs, zp, db);

	zp->z_gen = zp->z_phys->zp_gen;

	vp->v_vfsp = zfsvfs->z_parent->z_vfs;
	vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
	vp->v_data = zp;
	switch (vp->v_type) {
	case VDIR:
		zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
		break;
	case VBLK:
	case VCHR:
		/* XXX NetBSD	vp->v_op = zfs_specop_p; */
		spec_node_init(vp, zfs_cmpldev(zp->z_phys->zp_rdev));
		break;
	case VFIFO:
		/* XXX NetBSD	vp->v_op = zfs_fifoop_p; */
		break;
	}

	dprintf("zfs_znode_alloc znode %p -- vnode %p\n", zp, vp);
	dprintf("zfs_znode_alloc z_id %ld\n", zp->z_id);
	/* cpu_Debugger(); */

	uvm_vnp_setsize(vp, zp->z_phys->zp_size);

	mutex_enter(&zfsvfs->z_znodes_lock);
	list_insert_tail(&zfsvfs->z_all_znodes, zp);
	membar_producer();
	/*
	 * Everything else must be valid before assigning z_zfsvfs makes the
	 * znode eligible for zfs_znode_move().
	 */
	zp->z_zfsvfs = zfsvfs;
	mutex_exit(&zfsvfs->z_znodes_lock);

	return (zp);
}
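/*
 * The vp->v_data back-pointer assigned above is what makes the
 * ZTOV()/VTOZ() macro pair inverses of each other; a one-line
 * illustration of the invariant (not compiled):
 */
#if 0
	ASSERT3P(VTOZ(ZTOV(zp)), ==, zp);
#endif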
/*
 * Create a new DMU object to hold a zfs znode.
 *
 *	IN:	dzp	- parent directory for new znode
 *		vap	- file attributes for new znode
 *		tx	- dmu transaction id for zap operations
 *		cr	- credentials of caller
 *		flag	- flags:
 *			  IS_ROOT_NODE	- new object will be root
 *			  IS_XATTR	- new object is an attribute
 *		bonuslen - length of bonus buffer
 *		acl_ids	- holds the initial FUIDs, mode and ACL for the
 *			  new znode
 *
 *	OUT:	zpp	- allocated znode
 *
 */
void
zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
    uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_ids_t *acl_ids)
{
	dmu_buf_t	*db;
	znode_phys_t	*pzp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	timestruc_t	now;
	uint64_t	gen, obj;
	int		err;

	ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));

	if (zfsvfs->z_replay) {
		obj = vap->va_nodeid;
		now = vap->va_ctime;		/* see zfs_replay_create() */
		gen = vap->va_nblocks;		/* ditto */
	} else {
		obj = 0;
		gethrestime(&now);
		gen = dmu_tx_get_txg(tx);
	}

	/*
	 * Create a new DMU object.
	 */
	/*
	 * There's currently no mechanism for pre-reading the blocks that will
	 * be needed to allocate a new object, so we accept the small chance
	 * that there will be an i/o error and we will fail one of the
	 * assertions below.
	 */
	if (vap->va_type == VDIR) {
		if (zfsvfs->z_replay) {
			err = zap_create_claim_norm(zfsvfs->z_os, obj,
			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
			ASSERT3U(err, ==, 0);
		} else {
			obj = zap_create_norm(zfsvfs->z_os,
			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
		}
	} else {
		if (zfsvfs->z_replay) {
			err = dmu_object_claim(zfsvfs->z_os, obj,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
			ASSERT3U(err, ==, 0);
		} else {
			obj = dmu_object_alloc(zfsvfs->z_os,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
		}
	}

	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
	VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db));
	dmu_buf_will_dirty(db, tx);

	/*
	 * Initialize the znode physical data to zero.
	 */
	ASSERT(db->db_size >= sizeof (znode_phys_t));
	bzero(db->db_data, db->db_size);
	pzp = db->db_data;

	/*
	 * If this is the root, fix up the half-initialized parent pointer
	 * to reference the just-allocated physical data area.
	 */
	if (flag & IS_ROOT_NODE) {
		dzp->z_dbuf = db;
		dzp->z_phys = pzp;
		dzp->z_id = obj;
	}

	/*
	 * If parent is an xattr, so am I.
	 */
	if (dzp->z_phys->zp_flags & ZFS_XATTR)
		flag |= IS_XATTR;

	if (vap->va_type == VBLK || vap->va_type == VCHR) {
		pzp->zp_rdev = zfs_expldev(vap->va_rdev);
	}

	if (zfsvfs->z_use_fuids)
		pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;

	if (vap->va_type == VDIR) {
		pzp->zp_size = 2;		/* contents ("." and "..") */
		pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
	}

	pzp->zp_parent = dzp->z_id;
	if (flag & IS_XATTR)
		pzp->zp_flags |= ZFS_XATTR;

	pzp->zp_gen = gen;

	ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
	ZFS_TIME_ENCODE(&now, pzp->zp_ctime);

	if (vap->va_mask & AT_ATIME) {
		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
	} else {
		ZFS_TIME_ENCODE(&now, pzp->zp_atime);
	}

	if (vap->va_mask & AT_MTIME) {
		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
	} else {
		ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
	}
	pzp->zp_uid = acl_ids->z_fuid;
	pzp->zp_gid = acl_ids->z_fgid;
	pzp->zp_mode = acl_ids->z_mode;
	if (!(flag & IS_ROOT_NODE)) {
		*zpp = zfs_znode_alloc(zfsvfs, db, 0);
	} else {
		/*
		 * If we are creating the root node, the "parent" we
		 * passed in is the znode for the root.
		 */
		*zpp = dzp;
	}
	VERIFY(0 == zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
	if (vap->va_mask & AT_XVATTR)
		zfs_xvattr_set(*zpp, (xvattr_t *)vap);

	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
}
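/*
 * Worked example (illustration only, assuming the two-element uint64_t
 * encoding that ZFS_TIME_ENCODE() writes into zp_atime and friends:
 * seconds in word 0, nanoseconds in word 1):
 */
#if 0
	timestruc_t t;
	uint64_t stamp[2];

	t.tv_sec = 1234567890;
	t.tv_nsec = 42;
	ZFS_TIME_ENCODE(&t, stamp);
	ASSERT3U(stamp[0], ==, 1234567890);
	ASSERT3U(stamp[1], ==, 42);
#endif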
void
zfs_xvattr_set(znode_t *zp, xvattr_t *xvap)
{
	xoptattr_t *xoap;

	xoap = xva_getxoptattr(xvap);
	ASSERT(xoap);

	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
		ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime);
		XVA_SET_RTN(xvap, XAT_CREATETIME);
	}
	if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
		ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly);
		XVA_SET_RTN(xvap, XAT_READONLY);
	}
	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
		ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden);
		XVA_SET_RTN(xvap, XAT_HIDDEN);
	}
	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
		ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system);
		XVA_SET_RTN(xvap, XAT_SYSTEM);
	}
	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
		ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive);
		XVA_SET_RTN(xvap, XAT_ARCHIVE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable);
		XVA_SET_RTN(xvap, XAT_IMMUTABLE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink);
		XVA_SET_RTN(xvap, XAT_NOUNLINK);
	}
	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly);
		XVA_SET_RTN(xvap, XAT_APPENDONLY);
	}
	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump);
		XVA_SET_RTN(xvap, XAT_NODUMP);
	}
	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque);
		XVA_SET_RTN(xvap, XAT_OPAQUE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
		    xoap->xoa_av_quarantined);
		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified);
		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
		(void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp,
		    sizeof (xoap->xoa_av_scanstamp));
		zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP;
		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
	}
	if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
		ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse);
		XVA_SET_RTN(xvap, XAT_REPARSE);
	}
}
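/*
 * Caller-side sketch (illustration only, assuming the standard Solaris
 * xvattr helpers xva_init()/XVA_SET_REQ()/XVA_ISSET_RTN()): request one
 * optional attribute and check that it was applied.
 */
#if 0
	xvattr_t xva;

	xva_init(&xva);
	XVA_SET_REQ(&xva, XAT_READONLY);
	xva.xva_xoptattrs.xoa_readonly = B_TRUE;
	zfs_xvattr_set(zp, &xva);
	ASSERT(XVA_ISSET_RTN(&xva, XAT_READONLY));
#endif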
int
zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
{
	dmu_object_info_t doi;
	dmu_buf_t	*db;
	znode_t		*zp;
	vnode_t		*vp;
	int err, first = 1;

	*zpp = NULL;
again:
	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);

	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
	if (err) {
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (err);
	}

	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
		dmu_buf_rele(db, NULL);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (EINVAL);
	}

	zp = dmu_buf_get_user(db);
	if (zp != NULL) {
		mutex_enter(&zp->z_lock);

		/*
		 * Since we do immediate eviction of the z_dbuf, we
		 * should never find a dbuf with a znode that doesn't
		 * know about the dbuf.
		 */
		ASSERT3P(zp->z_dbuf, ==, db);
		ASSERT3U(zp->z_id, ==, obj_num);
		if (zp->z_unlinked) {
			err = ENOENT;
		} else {
			if ((vp = ZTOV(zp)) != NULL) {
				mutex_enter(&vp->v_interlock);
				mutex_exit(&zp->z_lock);
				if (vget(vp, 0) != 0) {
					dmu_buf_rele(db, NULL);
					mutex_exit(&vp->v_interlock);
					/* drop the hold so the retry can re-enter it */
					ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
					goto again;
				}
				mutex_enter(&zp->z_lock);
			} else {
				if (first) {
					ZFS_LOG(1, "dying znode detected (zp=%p)", zp);
					first = 0;
				}
				/*
				 * znode is dying so we can't reuse it, we must
				 * wait until destruction is completed.
				 */
				dmu_buf_rele(db, NULL);
				mutex_exit(&zp->z_lock);
				ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
				kpause("zcollide", 0, 1, NULL);
				goto again;
			}
			*zpp = zp;
			err = 0;
		}

		dmu_buf_rele(db, NULL);
		mutex_exit(&zp->z_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (err);
	}

	/*
	 * Not found: create a new znode/vnode, but only if the file exists.
	 *
	 * There is a small window where zfs_vget() could
	 * find this object while a file create is still in
	 * progress. Since a gen number can never be zero
	 * we will check that to determine if it's an allocated
	 * file.
	 */

	if (((znode_phys_t *)db->db_data)->zp_gen != 0) {
		zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size);
		*zpp = zp;

		vp = ZTOV(zp);
		genfs_node_init(vp, &zfs_genfsops);
		VOP_UNLOCK(vp);

		err = 0;
	} else {
		dmu_buf_rele(db, NULL);
		err = ENOENT;
	}
	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
	return (err);
}

int
zfs_rezget(znode_t *zp)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	dmu_object_info_t doi;
	dmu_buf_t *db;
	uint64_t obj_num = zp->z_id;
	int err;

	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);

	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
	if (err) {
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (err);
	}

	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
		dmu_buf_rele(db, NULL);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (EINVAL);
	}

	if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) {
		dmu_buf_rele(db, NULL);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (EIO);
	}

	mutex_enter(&zp->z_acl_lock);
	if (zp->z_acl_cached) {
		zfs_acl_free(zp->z_acl_cached);
		zp->z_acl_cached = NULL;
	}
	mutex_exit(&zp->z_acl_lock);

	zfs_znode_dmu_init(zfsvfs, zp, db);
	zp->z_unlinked = (zp->z_phys->zp_links == 0);
	zp->z_blksz = doi.doi_data_block_size;

	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);

	return (0);
}
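/*
 * Caller-side sketch (illustration only): zfs_zget() returns a held
 * znode; the caller releases it through the attached vnode.
 */
#if 0
	znode_t *zp;
	int err;

	err = zfs_zget(zfsvfs, obj_num, &zp);
	if (err == 0) {
		/* ... use zp ... */
		VN_RELE(ZTOV(zp));
	}
#endif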
void
zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	objset_t *os = zfsvfs->z_os;
	uint64_t obj = zp->z_id;
	uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;

	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
	if (acl_obj)
		VERIFY(0 == dmu_object_free(os, acl_obj, tx));
	VERIFY(0 == dmu_object_free(os, obj, tx));
	zfs_znode_dmu_fini(zp);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
	zfs_znode_free(zp);
}

/*
 * zfs_zinactive is called on the last reference to a znode. It takes
 * ZFS_OBJ_HOLD_ENTER itself (so callers must not already hold it) and
 * releases the hold before returning.
 */
void
zfs_zinactive(znode_t *zp)
{
	vnode_t	*vp = ZTOV(zp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	uint64_t z_id = zp->z_id;

	ASSERT(zp->z_dbuf && zp->z_phys);

	/*
	 * Don't allow a zfs_zget() while we're trying to release this znode
	 */
	ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);

	mutex_enter(&zp->z_lock);
	/*
	 * If this was the last reference to a file with no links,
	 * remove the file from the file system.
	 */
	if (zp->z_unlinked) {
		mutex_exit(&zp->z_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
		zfs_rmnode(zp);
		return;
	}

	mutex_exit(&zp->z_lock);
	/* XXX why disabled? zfs_znode_dmu_fini(zp); */
	ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
	zfs_znode_free(zp);
}

void
zfs_znode_free(znode_t *zp)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	ASSERT(ZTOV(zp) == NULL);

	dprintf("destroying znode %p\n", zp);
	/* cpu_Debugger(); */
	mutex_enter(&zfsvfs->z_znodes_lock);
	POINTER_INVALIDATE(&zp->z_zfsvfs);
	list_remove(&zfsvfs->z_all_znodes, zp);
	mutex_exit(&zfsvfs->z_znodes_lock);

	if (zp->z_acl_cached) {
		zfs_acl_free(zp->z_acl_cached);
		zp->z_acl_cached = NULL;
	}

	kmem_cache_free(znode_cache, zp);

	VFS_RELE(zfsvfs->z_vfs);
}

void
zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
{
	timestruc_t	now;

	ASSERT(MUTEX_HELD(&zp->z_lock));

	gethrestime(&now);

	if (tx) {
		dmu_buf_will_dirty(zp->z_dbuf, tx);
		zp->z_atime_dirty = 0;
		zp->z_seq++;
	} else {
		zp->z_atime_dirty = 1;
	}

	if (flag & AT_ATIME)
		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);

	if (flag & AT_MTIME) {
		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);
		if (zp->z_zfsvfs->z_use_fuids)
			zp->z_phys->zp_flags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED);
	}

	if (flag & AT_CTIME) {
		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
		if (zp->z_zfsvfs->z_use_fuids)
			zp->z_phys->zp_flags |= ZFS_ARCHIVE;
	}
}

/*
 * Update the requested znode timestamps with the current time.
 * If we are in a transaction, then go ahead and mark the znode
 * dirty in the transaction so the timestamps will go to disk.
 * Otherwise, we will get pushed next time the znode is updated
 * in a transaction, or when this znode eventually goes inactive.
 *
 * Why is this OK?
 *  1 - Only the ACCESS time is ever updated outside of a transaction.
 *  2 - Multiple consecutive updates will be collapsed into a single
 *	znode update by the transaction grouping semantics of the DMU.
 */
void
zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
{
	mutex_enter(&zp->z_lock);
	zfs_time_stamper_locked(zp, flag, tx);
	mutex_exit(&zp->z_lock);
}
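/*
 * Usage sketch (illustration only): the flag sets from zfs_znode.h
 * combine the AT_* bits handled above, e.g. a content write updates
 * mtime and ctime inside its transaction, while a plain read only marks
 * atime dirty with no transaction at all.
 */
#if 0
	zfs_time_stamper(zp, CONTENT_MODIFIED, tx);	/* AT_MTIME|AT_CTIME */
	zfs_time_stamper(zp, ACCESSED, NULL);		/* AT_ATIME, deferred */
#endif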
/*
 * Grow the block size for a file.
 *
 *	IN:	zp	- znode of file whose block size is to grow.
 *		size	- requested block size
 *		tx	- open transaction.
 *
 * NOTE: this function assumes that the znode is write locked.
 */
void
zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
{
	int		error;
	u_longlong_t	dummy;

	if (size <= zp->z_blksz)
		return;
	/*
	 * If the file size is already greater than the current blocksize,
	 * we will not grow. If there is more than one block in a file,
	 * the blocksize cannot change.
	 */
	if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
		return;

	error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
	    size, 0, tx);
	if (error == ENOTSUP)
		return;
	ASSERT3U(error, ==, 0);

	/* What blocksize did we actually get? */
	dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);
}

/*
 * Increase the file length
 *
 *	IN:	zp	- znode of file to extend.
 *		end	- new end-of-file
 *
 *	RETURN:	0 if success
 *		error code if failure
 */
static int
zfs_extend(znode_t *zp, uint64_t end)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	dmu_tx_t *tx;
	rl_t *rl;
	uint64_t newblksz;
	int error;

	/*
	 * We will change zp_size, lock the whole file.
	 */
	rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (end <= zp->z_phys->zp_size) {
		zfs_range_unlock(rl);
		return (0);
	}
top:
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, zp->z_id);
	if (end > zp->z_blksz &&
	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
		/*
		 * We are growing the file past the current block size.
		 */
		if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
			ASSERT(!ISP2(zp->z_blksz));
			newblksz = MIN(end, SPA_MAXBLOCKSIZE);
		} else {
			newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
		}
		dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
	} else {
		newblksz = 0;
	}

	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		if (error == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		zfs_range_unlock(rl);
		return (error);
	}
	dmu_buf_will_dirty(zp->z_dbuf, tx);

	if (newblksz)
		zfs_grow_blocksize(zp, newblksz, tx);

	zp->z_phys->zp_size = end;

	zfs_range_unlock(rl);

	dmu_tx_commit(tx);

	uvm_vnp_setsize(ZTOV(zp), end);

	return (0);
}
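/*
 * The assign/wait/retry idiom used in zfs_extend() above reappears in
 * zfs_trunc() and zfs_freesp() below; in isolation (sketch only):
 */
#if 0
top:
	tx = dmu_tx_create(os);
	dmu_tx_hold_bonus(tx, obj);
	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error == ERESTART) {
		dmu_tx_wait(tx);	/* wait for the next open txg */
		dmu_tx_abort(tx);	/* then rebuild the tx from scratch */
		goto top;
	}
#endif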
/*
 * Free space in a file.
 *
 *	IN:	zp	- znode of file to free data in.
 *		off	- start of section to free.
 *		len	- length of section to free.
 *
 *	RETURN:	0 if success
 *		error code if failure
 */
static int
zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	rl_t *rl;
	int error;

	/*
	 * Lock the range being freed.
	 */
	rl = zfs_range_lock(zp, off, len, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (off >= zp->z_phys->zp_size) {
		zfs_range_unlock(rl);
		return (0);
	}

	if (off + len > zp->z_phys->zp_size)
		len = zp->z_phys->zp_size - off;

	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);

	if (error == 0) {
		/*
		 * In NetBSD we cannot free blocks in the middle of a file,
		 * only at the end of a file.
		 */
		uvm_vnp_setsize(ZTOV(zp), off);
	}

	zfs_range_unlock(rl);

	return (error);
}

/*
 * Truncate a file
 *
 *	IN:	zp	- znode of file to truncate.
 *		end	- new end-of-file.
 *
 *	RETURN:	0 if success
 *		error code if failure
 */
static int
zfs_trunc(znode_t *zp, uint64_t end)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	vnode_t *vp = ZTOV(zp);
	dmu_tx_t *tx;
	rl_t *rl;
	int error;

	/*
	 * We will change zp_size, lock the whole file.
	 */
	rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (end >= zp->z_phys->zp_size) {
		zfs_range_unlock(rl);
		return (0);
	}

	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1);
	if (error) {
		zfs_range_unlock(rl);
		return (error);
	}
top:
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, zp->z_id);
	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		if (error == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		zfs_range_unlock(rl);
		return (error);
	}
	dmu_buf_will_dirty(zp->z_dbuf, tx);

	zp->z_phys->zp_size = end;

	dmu_tx_commit(tx);

	zfs_range_unlock(rl);

	/*
	 * Clear any mapped pages in the truncated region. This has to
	 * happen outside of the transaction to avoid the possibility of
	 * a deadlock with someone trying to push a page that we are
	 * about to invalidate.
	 */

	uvm_vnp_setsize(vp, end);

	return (0);
}

/*
 * Free space in a file
 *
 *	IN:	zp	- znode of file to free data in.
 *		off	- start of range
 *		len	- length of range (0 => to end-of-file)
 *		flag	- current file open mode flags.
 *		log	- TRUE if this action should be logged
 *
 *	RETURN:	0 if success
 *		error code if failure
 */
int
zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
{
	vnode_t *vp = ZTOV(zp);
	dmu_tx_t *tx;
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	zilog_t *zilog = zfsvfs->z_log;
	int error;

	if (off > zp->z_phys->zp_size) {
		error = zfs_extend(zp, off+len);
		if (error == 0 && log)
			goto log;
		else
			return (error);
	}

	if (len == 0) {
		error = zfs_trunc(zp, off);
	} else {
		if ((error = zfs_free_range(zp, off, len)) == 0 &&
		    off + len > zp->z_phys->zp_size)
			error = zfs_extend(zp, off+len);
	}
	if (error || !log)
		return (error);
log:
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, zp->z_id);
	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		if (error == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto log;
		}
		dmu_tx_abort(tx);
		return (error);
	}

	zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
	zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);

	dmu_tx_commit(tx);
	return (0);
}
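/*
 * Dispatch summary for zfs_freesp() (illustration only): with len == 0
 * the call truncates or extends the file to `off' depending on the
 * current EOF; with len != 0 it frees the byte range [off, off+len).
 */
#if 0
	error = zfs_freesp(zp, new_eof, 0, FWRITE, B_TRUE);	/* set EOF */
	error = zfs_freesp(zp, off, len, FWRITE, B_TRUE);	/* punch range */
#endif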
void
zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
{
	zfsvfs_t	zfsvfs;
	uint64_t	moid, obj, version;
	uint64_t	sense = ZFS_CASE_SENSITIVE;
	uint64_t	norm = 0;
	nvpair_t	*elem;
	int		error;
	int		i;
	znode_t		*rootzp = NULL;
	vnode_t		*vp;
	vattr_t		vattr;
	znode_t		*zp;
	zfs_acl_ids_t	acl_ids;

	/*
	 * First attempt to create master node.
	 */
	/*
	 * In an empty objset, there are no blocks to read and thus
	 * there can be no i/o errors (which we assert below).
	 */
	moid = MASTER_NODE_OBJ;
	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	/*
	 * Set starting attributes.
	 */
	if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_USERSPACE)
		version = ZPL_VERSION;
	else if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
		version = ZPL_VERSION_USERSPACE - 1;
	else
		version = ZPL_VERSION_FUID - 1;
	elem = NULL;
	while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
		/* For the moment we expect all zpl props to be uint64_ts */
		uint64_t val;
		char *name;

		ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
		VERIFY(nvpair_value_uint64(elem, &val) == 0);
		name = nvpair_name(elem);
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
			if (val < version)
				version = val;
		} else {
			error = zap_update(os, moid, name, 8, 1, &val, tx);
		}
		ASSERT(error == 0);
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
			norm = val;
		else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
			sense = val;
	}
	ASSERT(version != 0);
	error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);

	/*
	 * Create a delete queue.
	 */
	obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);

	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
	ASSERT(error == 0);

	/*
	 * Create root znode. Create minimal znode/vnode/zfsvfs
	 * to allow zfs_mknode to work.
	 */
	vattr_null(&vattr);
	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
	vattr.va_type = VDIR;
	vattr.va_mode = S_IFDIR|0755;
	vattr.va_uid = crgetuid(cr);
	vattr.va_gid = crgetgid(cr);

	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	rootzp->z_unlinked = 0;
	rootzp->z_atime_dirty = 0;

	for (;;) {
		error = getnewvnode(VT_ZFS, NULL, zfs_vnodeop_p,
		    &rootzp->z_vnode);
		if (error == 0)
			break;
		printf("WARNING: zfs_create_fs: unable to get vnode, "
		    "error=%d\n", error);
		kpause("zfsvn", false, hz, NULL);
	}

	vp = ZTOV(rootzp);
	vp->v_type = VDIR;

	bzero(&zfsvfs, sizeof (zfsvfs_t));

	zfsvfs.z_os = os;
	zfsvfs.z_parent = &zfsvfs;
	zfsvfs.z_version = version;
	zfsvfs.z_use_fuids = USE_FUIDS(version, os);
	zfsvfs.z_norm = norm;
	/*
	 * Fold case on file systems that are always or sometimes case
	 * insensitive.
	 */
	if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
		zfsvfs.z_norm |= U8_TEXTPREP_TOUPPER;

	mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
	    offsetof(znode_t, z_link_node));

	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
		mutex_init(&zfsvfs.z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);

	ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
	rootzp->z_zfsvfs = &zfsvfs;
	VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
	    cr, NULL, &acl_ids));
	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, &acl_ids);
	ASSERT3P(zp, ==, rootzp);
	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
	ASSERT(error == 0);
	zfs_acl_ids_free(&acl_ids);
	POINTER_INVALIDATE(&rootzp->z_zfsvfs);

	dmu_buf_rele(rootzp->z_dbuf, NULL);
	rootzp->z_dbuf = NULL;
	ungetnewvnode(vp);
	kmem_cache_free(znode_cache, rootzp);

	/*
	 * Create shares directory
	 */

	error = zfs_create_share_dir(&zfsvfs, tx);

	ASSERT(error == 0);

	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
		mutex_destroy(&zfsvfs.z_hold_mtx[i]);
}
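/*
 * Sketch of the zplprops nvlist consumed by zfs_create_fs() above
 * (illustration only): uint64 properties keyed by their ZFS property
 * names, as built by the caller before the dmu transaction is assigned.
 */
#if 0
	nvlist_t *zplprops;

	VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(zplprops,
	    zfs_prop_to_name(ZFS_PROP_VERSION), ZPL_VERSION) == 0);
	VERIFY(nvlist_add_uint64(zplprops,
	    zfs_prop_to_name(ZFS_PROP_NORMALIZE), 0) == 0);
	zfs_create_fs(os, cr, zplprops, tx);
	nvlist_free(zplprops);
#endif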
#endif /* _KERNEL */

/*
 * Given an object number, return its parent object number and whether
 * or not the object is an extended attribute directory.
 */
static int
zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir)
{
	dmu_buf_t *db;
	dmu_object_info_t doi;
	znode_phys_t *zp;
	int error;

	if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0)
		return (error);

	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
		dmu_buf_rele(db, FTAG);
		return (EINVAL);
	}

	zp = db->db_data;
	*pobjp = zp->zp_parent;
	*is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) &&
	    S_ISDIR(zp->zp_mode);
	dmu_buf_rele(db, FTAG);

	return (0);
}

int
zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
{
	char *path = buf + len - 1;
	int error;

	*path = '\0';

	for (;;) {
		uint64_t pobj;
		char component[MAXNAMELEN + 2];
		size_t complen;
		int is_xattrdir;

		if ((error = zfs_obj_to_pobj(osp, obj, &pobj,
		    &is_xattrdir)) != 0)
			break;

		if (pobj == obj) {
			if (path[0] != '/')
				*--path = '/';
			break;
		}

		component[0] = '/';
		if (is_xattrdir) {
			(void) sprintf(component + 1, "<xattrdir>");
		} else {
			error = zap_value_search(osp, pobj, obj,
			    ZFS_DIRENT_OBJ(-1ULL), component + 1);
			if (error != 0)
				break;
		}

		complen = strlen(component);
		path -= complen;
		ASSERT(path >= buf);
		bcopy(component, path, complen);
		obj = pobj;
	}

	if (error == 0)
		(void) memmove(buf, path, buf + len - path);
	return (error);
}
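/*
 * Caller-side sketch (illustration only): the path is built backwards
 * from the tail of the caller's buffer and memmove()d to the front on
 * success, so the buffer must be large enough for the whole path.
 */
#if 0
	char path[MAXPATHLEN];

	if (zfs_obj_to_path(osp, obj, path, sizeof (path)) == 0)
		printf("object %llu: %s\n", (u_longlong_t)obj, path);
#endif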