/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/* Portions Copyright 2007 Jeremy Teo */

#ifdef _KERNEL
#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/mntent.h>
#include <sys/u8_textprep.h>
#include <sys/dsl_dataset.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/errno.h>
#include <sys/unistd.h>
#include <sys/atomic.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_rlock.h>
#include <sys/zfs_fuid.h>
#include <sys/fs/zfs.h>
#include <sys/kidmap.h>
#endif /* _KERNEL */

#include <sys/dmu.h>
#include <sys/refcount.h>
#include <sys/stat.h>
#include <sys/zap.h>
#include <sys/zfs_znode.h>

#include "zfs_prop.h"

#if defined(_KERNEL) && defined(__NetBSD__)
#include <miscfs/specfs/specdev.h>
static const struct genfs_ops zfs_genfsops = {
	.gop_write = genfs_compat_gop_write,
};

#endif

extern int (**zfs_vnodeop_p)(void *);
extern int (**zfs_fifoop_p)(void *);
extern int (**zfs_specop_p)(void *);

/*
 * Define ZNODE_STATS to turn on statistic gathering. By default, it is only
 * turned on when DEBUG is also defined.
 */
#ifdef DEBUG
#define	ZNODE_STATS
#endif /* DEBUG */

#ifdef ZNODE_STATS
#define	ZNODE_STAT_ADD(stat)	((stat)++)
#else
#define	ZNODE_STAT_ADD(stat)	/* nothing */
#endif /* ZNODE_STATS */

#define	POINTER_IS_VALID(p)	(!((uintptr_t)(p) & 0x3))
#define	POINTER_INVALIDATE(pp)	(*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1))

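/*
 * Note on the validity scheme (worked example): heap-allocated zfsvfs
 * pointers are at least 4-byte aligned, so both low bits are clear and
 * POINTER_IS_VALID() holds.  POINTER_INVALIDATE() sets bit 0, and the
 * kmem debug patterns written over freed or uninitialized memory
 * (0xbaddcafe, 0xdeadbeef) have at least one of the two low bits set,
 * so stale znodes always fail the check.
 */
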
/*
 * Functions needed for userland (ie: libzpool) are not put under
 * #ifdef _KERNEL; the rest of the functions have dependencies
 * (such as VFS logic) that will not compile easily in userland.
 */
#ifdef _KERNEL
/*
 * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to
 * be freed before it can be safely accessed.
 */
krwlock_t zfsvfs_lock;

static kmem_cache_t *znode_cache = NULL;

/*ARGSUSED*/
static void
znode_evict_error(dmu_buf_t *dbuf, void *user_ptr)
{
	/*
	 * We should never drop all dbuf refs without first clearing
	 * the eviction callback.
	 */
	panic("evicting znode %p\n", user_ptr);
}

/*ARGSUSED*/
static int
zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
{
	znode_t *zp = arg;

	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));

	zp->z_vnode = NULL;

	list_link_init(&zp->z_link_node);

	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);

	mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&zp->z_range_avl, zfs_range_compare,
	    sizeof (rl_t), offsetof(rl_t, r_node));

	zp->z_dbuf = NULL;
	zp->z_dirlocks = NULL;
	zp->z_acl_cached = NULL;
	return (0);
}

/*ARGSUSED*/
static void
zfs_znode_cache_destructor(void *buf, void *arg)
{
	znode_t *zp = arg;

	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
	ASSERT(ZTOV(zp) == NULL);

	ASSERT(!list_link_active(&zp->z_link_node));
	mutex_destroy(&zp->z_lock);
	rw_destroy(&zp->z_parent_lock);
	rw_destroy(&zp->z_name_lock);
	mutex_destroy(&zp->z_acl_lock);
	avl_destroy(&zp->z_range_avl);
	mutex_destroy(&zp->z_range_lock);

	ASSERT(zp->z_dbuf == NULL);
	ASSERT(zp->z_dirlocks == NULL);
	ASSERT(zp->z_acl_cached == NULL);
}

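/*
 * Note: cache constructors/destructors run when an object enters or
 * leaves the cache, not on every kmem_cache_alloc()/free(), so the
 * locks and the range-lock AVL tree above stay initialized across
 * allocation cycles; the destructor's ASSERTs document the quiescent
 * state a znode must be in when it is returned to the cache.  On
 * NetBSD, pool_cache(9) hands the object in as the second constructor
 * argument, which is why the znode is taken from "arg" rather than
 * "buf" above.
 */
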
#ifdef ZNODE_STATS
static struct {
	uint64_t zms_zfsvfs_invalid;
	uint64_t zms_zfsvfs_recheck1;
	uint64_t zms_zfsvfs_unmounted;
	uint64_t zms_zfsvfs_recheck2;
	uint64_t zms_obj_held;
	uint64_t zms_vnode_locked;
	uint64_t zms_not_only_dnlc;
} znode_move_stats;
#endif /* ZNODE_STATS */

static void
zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
{
	vnode_t *vp;

	/* Copy fields. */
	nzp->z_zfsvfs = ozp->z_zfsvfs;

	/* Swap vnodes. */
	vp = nzp->z_vnode;
	nzp->z_vnode = ozp->z_vnode;
	ozp->z_vnode = vp;	/* let destructor free the overwritten vnode */
	ZTOV(ozp)->v_data = ozp;
	ZTOV(nzp)->v_data = nzp;

	nzp->z_id = ozp->z_id;
	ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */
	ASSERT(avl_numnodes(&ozp->z_range_avl) == 0);
	nzp->z_unlinked = ozp->z_unlinked;
	nzp->z_atime_dirty = ozp->z_atime_dirty;
	nzp->z_zn_prefetch = ozp->z_zn_prefetch;
	nzp->z_blksz = ozp->z_blksz;
	nzp->z_seq = ozp->z_seq;
	nzp->z_mapcnt = ozp->z_mapcnt;
	nzp->z_last_itx = ozp->z_last_itx;
	nzp->z_gen = ozp->z_gen;
	nzp->z_sync_cnt = ozp->z_sync_cnt;
	nzp->z_phys = ozp->z_phys;
	nzp->z_dbuf = ozp->z_dbuf;

	/*
	 * Since this is just an idle znode and kmem is already dealing with
	 * memory pressure, release any cached ACL.
	 */
	if (ozp->z_acl_cached) {
		zfs_acl_free(ozp->z_acl_cached);
		ozp->z_acl_cached = NULL;
	}

	/* Update back pointers. */
	(void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys,
	    znode_evict_error);

	/*
	 * Invalidate the original znode by clearing fields that provide a
	 * pointer back to the znode. Set the low bit of the vfs pointer to
	 * ensure that zfs_znode_move() recognizes the znode as invalid in any
	 * subsequent callback.
	 */
	ozp->z_dbuf = NULL;
	POINTER_INVALIDATE(&ozp->z_zfsvfs);
}

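/*
 * zfs_znode_move() below is the kmem move callback used during slab
 * defragmentation.  Its return values follow the kmem_cbrc_t contract:
 * KMEM_CBRC_YES means the znode was moved, KMEM_CBRC_LATER means it is
 * transiently busy and the callback may be retried, and
 * KMEM_CBRC_DONT_KNOW means the buffer cannot be identified as a live,
 * movable znode.
 */
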
#ifndef __NetBSD__
/*ARGSUSED*/
static kmem_cbrc_t
zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
{
	znode_t *ozp = buf, *nzp = newbuf;
	zfsvfs_t *zfsvfs;
	vnode_t *vp;

	/*
	 * The znode is on the file system's list of known znodes if the vfs
	 * pointer is valid. We set the low bit of the vfs pointer when freeing
	 * the znode to invalidate it, and the memory patterns written by kmem
	 * (baddcafe and deadbeef) set at least one of the two low bits. A newly
	 * created znode sets the vfs pointer last of all to indicate that the
	 * znode is known and in a valid state to be moved by this function.
	 */
	zfsvfs = ozp->z_zfsvfs;
	if (!POINTER_IS_VALID(zfsvfs)) {
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * Close a small window in which it's possible that the filesystem
	 * could be unmounted and freed, and zfsvfs, though valid in the
	 * previous statement, could point to unrelated memory by the time we
	 * try to prevent the filesystem from being unmounted.
	 */
	rw_enter(&zfsvfs_lock, RW_WRITER);
	if (zfsvfs != ozp->z_zfsvfs) {
		rw_exit(&zfsvfs_lock);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * If the znode is still valid, then so is the file system. We know
	 * that no valid file system can be freed while we hold zfsvfs_lock,
	 * so we can safely ensure that the filesystem is not and will not be
	 * unmounted. The next statement is equivalent to ZFS_ENTER().
	 */
	rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
	if (zfsvfs->z_unmounted) {
		ZFS_EXIT(zfsvfs);
		rw_exit(&zfsvfs_lock);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
		return (KMEM_CBRC_DONT_KNOW);
	}
	rw_exit(&zfsvfs_lock);

	mutex_enter(&zfsvfs->z_znodes_lock);
	/*
	 * Recheck the vfs pointer in case the znode was removed just before
	 * acquiring the lock.
	 */
	if (zfsvfs != ozp->z_zfsvfs) {
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * At this point we know that as long as we hold z_znodes_lock, the
	 * znode cannot be freed and fields within the znode can be safely
	 * accessed. Now, prevent a race with zfs_zget().
	 */
	if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) {
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_obj_held);
		return (KMEM_CBRC_LATER);
	}

	vp = ZTOV(ozp);
	if (mutex_tryenter(&vp->v_lock) == 0) {
		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked);
		return (KMEM_CBRC_LATER);
	}

	/* Only move znodes that are referenced _only_ by the DNLC. */
	if (vp->v_count != 1 || !vn_in_dnlc(vp)) {
		mutex_exit(&vp->v_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc);
		return (KMEM_CBRC_LATER);
	}

	/*
	 * The znode is known and in a valid state to move. We're holding the
	 * locks needed to execute the critical section.
	 */
	zfs_znode_move_impl(ozp, nzp);
	mutex_exit(&vp->v_lock);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);

	list_link_replace(&ozp->z_link_node, &nzp->z_link_node);
	mutex_exit(&zfsvfs->z_znodes_lock);
	ZFS_EXIT(zfsvfs);

	return (KMEM_CBRC_YES);
}
#endif /* !__NetBSD__ */

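/*
 * Lock ordering in zfs_znode_move() above: zfsvfs_lock, then
 * z_teardown_lock (the ZFS_ENTER() equivalent), then z_znodes_lock,
 * then the per-object hold, then v_lock.  The last two are only tried,
 * never waited for, so the callback backs off with KMEM_CBRC_LATER
 * instead of risking a deadlock against zfs_zget() or an active vnode.
 */
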
void
zfs_znode_init(void)
{
	/*
	 * Initialize zcache
	 */
	rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL);
	ASSERT(znode_cache == NULL);
	znode_cache = kmem_cache_create("zfs_znode_cache",
	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
	    zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
}

void
zfs_znode_fini(void)
{

	/*
	 * Cleanup zcache
	 */
	if (znode_cache)
		kmem_cache_destroy(znode_cache);
	znode_cache = NULL;
	rw_destroy(&zfsvfs_lock);
}

#ifndef __NetBSD__
struct vnodeops *zfs_dvnodeops;
struct vnodeops *zfs_fvnodeops;
struct vnodeops *zfs_symvnodeops;
struct vnodeops *zfs_xdvnodeops;
struct vnodeops *zfs_evnodeops;
struct vnodeops *zfs_sharevnodeops;
#endif

void
zfs_remove_op_tables()
{
#ifndef __NetBSD__
	/*
	 * Remove vfs ops
	 */
	ASSERT(zfsfstype);
	(void) vfs_freevfsops_by_type(zfsfstype);
	zfsfstype = 0;

	/*
	 * Remove vnode ops
	 */
	if (zfs_dvnodeops)
		vn_freevnodeops(zfs_dvnodeops);
	if (zfs_fvnodeops)
		vn_freevnodeops(zfs_fvnodeops);
	if (zfs_symvnodeops)
		vn_freevnodeops(zfs_symvnodeops);
	if (zfs_xdvnodeops)
		vn_freevnodeops(zfs_xdvnodeops);
	if (zfs_evnodeops)
		vn_freevnodeops(zfs_evnodeops);
	if (zfs_sharevnodeops)
		vn_freevnodeops(zfs_sharevnodeops);

	zfs_dvnodeops = NULL;
	zfs_fvnodeops = NULL;
	zfs_symvnodeops = NULL;
	zfs_xdvnodeops = NULL;
	zfs_evnodeops = NULL;
	zfs_sharevnodeops = NULL;
#endif
}

#ifndef __NetBSD__
extern const fs_operation_def_t zfs_dvnodeops_template[];
extern const fs_operation_def_t zfs_fvnodeops_template[];
extern const fs_operation_def_t zfs_xdvnodeops_template[];
extern const fs_operation_def_t zfs_symvnodeops_template[];
extern const fs_operation_def_t zfs_evnodeops_template[];
extern const fs_operation_def_t zfs_sharevnodeops_template[];
#endif

int
zfs_create_op_tables()
{
#ifndef __NetBSD__
	int error;

	/*
	 * zfs_dvnodeops can be set if mod_remove() calls mod_installfs()
	 * due to a failure to remove the 2nd modlinkage (zfs_modldrv).
	 * In this case we just return as the ops vectors are already set up.
	 */
	if (zfs_dvnodeops)
		return (0);

	error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
	    &zfs_dvnodeops);
	if (error)
		return (error);

	error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
	    &zfs_fvnodeops);
	if (error)
		return (error);

	error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template,
	    &zfs_symvnodeops);
	if (error)
		return (error);

	error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template,
	    &zfs_xdvnodeops);
	if (error)
		return (error);

	error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
	    &zfs_evnodeops);
	if (error)
		return (error);

	error = vn_make_ops(MNTTYPE_ZFS, zfs_sharevnodeops_template,
	    &zfs_sharevnodeops);

	return (error);
#endif
	return 0;
}

int
zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
{
	zfs_acl_ids_t acl_ids;
	vattr_t vattr;
	znode_t *sharezp;
	znode_t *zp;
	int error;

	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
	vattr.va_type = VDIR;
	vattr.va_mode = S_IFDIR|0555;
	vattr.va_uid = crgetuid(kcred);
	vattr.va_gid = crgetgid(kcred);

	sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	sharezp->z_unlinked = 0;
	sharezp->z_atime_dirty = 0;
	sharezp->z_zfsvfs = zfsvfs;

	VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
	    kcred, NULL, &acl_ids));
	zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE,
	    &zp, 0, &acl_ids);
	ASSERT3P(zp, ==, sharezp);
#ifndef __NetBSD__
	ASSERT(!vn_in_dnlc(ZTOV(sharezp))); /* not valid to move */
#endif
	POINTER_INVALIDATE(&sharezp->z_zfsvfs);
	error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
	    ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
	zfsvfs->z_shares_dir = sharezp->z_id;

	zfs_acl_ids_free(&acl_ids);
	dmu_buf_rele(sharezp->z_dbuf, NULL);
	sharezp->z_dbuf = NULL;
	kmem_cache_free(znode_cache, sharezp);

	return (error);
}

/*
 * define a couple of values we need available
 * for both 64 and 32 bit environments.
 */
#ifndef NBITSMINOR64
#define	NBITSMINOR64	32
#endif
#ifndef MAXMAJ64
#define	MAXMAJ64	0xffffffffUL
#endif
#ifndef MAXMIN64
#define	MAXMIN64	0xffffffffUL
#endif

/*
 * Create special expldev for ZFS private use.
 * Can't use standard expldev since it doesn't do
 * what we want. The standard expldev() takes a
 * dev32_t in LP64 and expands it to a long dev_t.
 * We need an interface that takes a dev32_t in ILP32
 * and expands it to a long dev_t.
 */
static uint64_t
zfs_expldev(dev_t dev)
{
	return ((uint64_t)major(dev) << NBITSMINOR64) |
	    (minor_t)minor(dev);
}

/*
 * Special cmpldev for ZFS private use.
 * Can't use standard cmpldev since it takes
 * a long dev_t and compresses it to dev32_t in
 * LP64. We need to do a compaction of a long dev_t
 * to a dev32_t in ILP32.
 */
dev_t
zfs_cmpldev(uint64_t dev)
{
	minor_t minor = (minor_t)dev & MAXMIN64;
	major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;

	return makedev(major, minor);
}

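/*
 * Encoding example: a device with major 5 and minor 7 is stored by
 * zfs_expldev() as (5ULL << NBITSMINOR64) | 7 == 0x0000000500000007;
 * zfs_cmpldev() recovers major 5 from the high 32 bits and minor 7
 * from the low 32 bits.
 */
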
static void
zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db)
{
	znode_t *nzp;

	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));

	mutex_enter(&zp->z_lock);

	ASSERT(zp->z_dbuf == NULL);
	ASSERT(zp->z_acl_cached == NULL);
	zp->z_dbuf = db;
	nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error);

	/*
	 * There should be no concurrent zgets on this object.
	 */
	if (nzp != NULL)
		panic("existing znode %p for dbuf %p", (void *)nzp, (void *)db);

	/*
	 * Slap on VROOT if we are the root znode
	 */
	if (zp->z_id == zfsvfs->z_root)
		ZTOV(zp)->v_flag |= VROOT;

	mutex_exit(&zp->z_lock);
	vn_exists(ZTOV(zp));
}

void
zfs_znode_dmu_fini(znode_t *zp)
{
	dmu_buf_t *db = zp->z_dbuf;
	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
	    zp->z_unlinked ||
	    RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock));
	ASSERT(zp->z_dbuf != NULL);
	zp->z_dbuf = NULL;
	VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL));
	dmu_buf_rele(db, NULL);
}

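/*
 * The znode is attached to its bonus dbuf as the dbuf user:
 * zfs_znode_dmu_init() registers the znode, a pointer to z_phys (kept
 * up to date by the DMU if the bonus buffer moves), and
 * znode_evict_error() as the eviction callback.  zfs_znode_dmu_fini()
 * must unhook the znode with dmu_buf_update_user() before dropping the
 * last dbuf hold; eviction with the callback still registered is a
 * refcounting bug, hence the panic in znode_evict_error().
 */
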
/*
 * Create a new DMU object to hold a zfs znode.
 *
 *	IN:	dzp	- parent directory for new znode
 *		vap	- file attributes for new znode
 *		tx	- dmu transaction id for zap operations
 *		cr	- credentials of caller
 *		flag	- flags:
 *			  IS_ROOT_NODE	- new object will be root
 *			  IS_XATTR	- new object is an attribute
 *		bonuslen - length of bonus buffer
 *		setaclp  - File/Dir initial ACL
 *		fuidp	 - Tracks fuid allocation.
 *
 *	OUT:	zpp	- allocated znode
 *
 */
void
zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
    uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_ids_t *acl_ids)
{
	dmu_buf_t	*db;
	znode_phys_t	*pzp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	timestruc_t	now;
	uint64_t	gen, obj;
	int		err;

	ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));

	if (zfsvfs->z_replay) {
		obj = vap->va_nodeid;
		now = vap->va_ctime;		/* see zfs_replay_create() */
		gen = vap->va_nblocks;		/* ditto */
	} else {
		obj = 0;
		gethrestime(&now);
		gen = dmu_tx_get_txg(tx);
	}

	/*
	 * Create a new DMU object.
	 */
	/*
	 * There's currently no mechanism for pre-reading the blocks that will
	 * be needed to allocate a new object, so we accept the small chance
	 * that there will be an i/o error and we will fail one of the
	 * assertions below.
	 */
	if (vap->va_type == VDIR) {
		if (zfsvfs->z_replay) {
			err = zap_create_claim_norm(zfsvfs->z_os, obj,
			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
			ASSERT3U(err, ==, 0);
		} else {
			obj = zap_create_norm(zfsvfs->z_os,
			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
		}
	} else {
		if (zfsvfs->z_replay) {
			err = dmu_object_claim(zfsvfs->z_os, obj,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
			ASSERT3U(err, ==, 0);
		} else {
			obj = dmu_object_alloc(zfsvfs->z_os,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
		}
	}

	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
	VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db));
	dmu_buf_will_dirty(db, tx);

	/*
	 * Initialize the znode physical data to zero.
	 */
	ASSERT(db->db_size >= sizeof (znode_phys_t));
	bzero(db->db_data, db->db_size);
	pzp = db->db_data;

	/*
	 * If this is the root, fix up the half-initialized parent pointer
	 * to reference the just-allocated physical data area.
	 */
	if (flag & IS_ROOT_NODE) {
		dzp->z_dbuf = db;
		dzp->z_phys = pzp;
		dzp->z_id = obj;
	}

	/*
	 * If parent is an xattr, so am I.
	 */
	if (dzp->z_phys->zp_flags & ZFS_XATTR)
		flag |= IS_XATTR;

	if (vap->va_type == VBLK || vap->va_type == VCHR) {
		pzp->zp_rdev = zfs_expldev(vap->va_rdev);
	}

	if (zfsvfs->z_use_fuids)
		pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;

	if (vap->va_type == VDIR) {
		pzp->zp_size = 2;		/* contents ("." and "..") */
		pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
	}

	pzp->zp_parent = dzp->z_id;
	if (flag & IS_XATTR)
		pzp->zp_flags |= ZFS_XATTR;

	pzp->zp_gen = gen;

	ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
	ZFS_TIME_ENCODE(&now, pzp->zp_ctime);

	if (vap->va_mask & AT_ATIME) {
		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
	} else {
		ZFS_TIME_ENCODE(&now, pzp->zp_atime);
	}

	if (vap->va_mask & AT_MTIME) {
		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
	} else {
		ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
	}
	pzp->zp_uid = acl_ids->z_fuid;
	pzp->zp_gid = acl_ids->z_fgid;
	pzp->zp_mode = acl_ids->z_mode;
	if (!(flag & IS_ROOT_NODE)) {
		struct vnode *vp;

		err = vcache_get(zfsvfs->z_vfs, &obj, sizeof(obj), &vp);
		ASSERT3U(err, ==, 0);
		*zpp = VTOZ(vp);
		dmu_buf_rele(db, NULL);
	} else {
		/*
		 * If we are creating the root node, the "parent" we
		 * passed in is the znode for the root.
		 */
		*zpp = dzp;
	}
	VERIFY(0 == zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
	if (vap->va_mask & AT_XVATTR)
		zfs_xvattr_set(*zpp, (xvattr_t *)vap);

	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
}

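/*
 * Note that zfs_mknode() holds ZFS_OBJ_HOLD_ENTER(zfsvfs, obj) across
 * the entire setup above, so a concurrent zfs_zget() on the freshly
 * allocated object number cannot observe the znode until its physical
 * data and ACL are fully initialized.
 */
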
void
zfs_xvattr_set(znode_t *zp, xvattr_t *xvap)
{
	xoptattr_t *xoap;

	xoap = xva_getxoptattr(xvap);
	ASSERT(xoap);

	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
		ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime);
		XVA_SET_RTN(xvap, XAT_CREATETIME);
	}
	if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
		ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly);
		XVA_SET_RTN(xvap, XAT_READONLY);
	}
	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
		ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden);
		XVA_SET_RTN(xvap, XAT_HIDDEN);
	}
	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
		ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system);
		XVA_SET_RTN(xvap, XAT_SYSTEM);
	}
	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
		ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive);
		XVA_SET_RTN(xvap, XAT_ARCHIVE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable);
		XVA_SET_RTN(xvap, XAT_IMMUTABLE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink);
		XVA_SET_RTN(xvap, XAT_NOUNLINK);
	}
	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly);
		XVA_SET_RTN(xvap, XAT_APPENDONLY);
	}
	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump);
		XVA_SET_RTN(xvap, XAT_NODUMP);
	}
	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque);
		XVA_SET_RTN(xvap, XAT_OPAQUE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
		    xoap->xoa_av_quarantined);
		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified);
		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
		(void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp,
		    sizeof (xoap->xoa_av_scanstamp));
		zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP;
		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
	}
	if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
		ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse);
		XVA_SET_RTN(xvap, XAT_REPARSE);
	}
}

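/*
 * The xvattr protocol above: the caller flags each optional attribute
 * it wants applied with XVA_ISSET_REQ(), and zfs_xvattr_set() answers
 * with XVA_SET_RTN() for every attribute it actually processed, so the
 * caller can tell which requests were honored.
 */
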
int
zfs_loadvnode(struct mount *mp, struct vnode *vp,
    const void *key, size_t key_len, const void **new_key)
{
	uint64_t obj_num;
	zfsvfs_t *zfsvfs;
	dmu_object_info_t doi;
	dmu_buf_t *db;
	znode_t *zp;
	int err;

	KASSERT(key_len == sizeof(obj_num));
	memcpy(&obj_num, key, key_len);

	zfsvfs = mp->mnt_data;

	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
	if (err) {
		return err;
	}

	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
		dmu_buf_rele(db, NULL);
		return EINVAL;
	}

	KASSERT(dmu_buf_get_user(db) == NULL);

	/*
	 * There is a small window where zfs_vget() could
	 * find this object while a file create is still in
	 * progress. Since a gen number can never be zero
	 * we will check that to determine if it's an allocated
	 * file.
	 */

	if (((znode_phys_t *)db->db_data)->zp_gen == 0) {
		dmu_buf_rele(db, NULL);
		return ENOENT;
	}

	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);

	ASSERT(zp->z_dirlocks == NULL);
	ASSERT(zp->z_dbuf == NULL);
	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));

	/*
	 * Defer setting z_zfsvfs until the znode is ready to be a candidate
	 * for the zfs_znode_move() callback.
	 */
	zp->z_phys = NULL;
	zp->z_unlinked = 0;
	zp->z_atime_dirty = 0;
	zp->z_mapcnt = 0;
	zp->z_last_itx = 0;
	zp->z_id = db->db_object;
	zp->z_blksz = doi.doi_data_block_size;
	zp->z_seq = 0x7A4653;
	zp->z_sync_cnt = 0;
	zp->z_vnode = vp;

	zfs_znode_dmu_init(zfsvfs, zp, db);

	zp->z_gen = zp->z_phys->zp_gen;

	vp->v_op = zfs_vnodeop_p;
	vp->v_tag = VT_ZFS;
	vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
	vp->v_data = zp;
	genfs_node_init(vp, &zfs_genfsops);
	switch (vp->v_type) {
	case VDIR:
		zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
		break;
	case VBLK:
	case VCHR:
		/* XXX NetBSD vp->v_op = zfs_specop_p; */
		spec_node_init(vp, zfs_cmpldev(zp->z_phys->zp_rdev));
		break;
	case VFIFO:
		/* XXX NetBSD vp->v_op = zfs_fifoop_p; */
		break;
	}

	dprintf("zfs_loadvnode znode %p -- vnode %p\n", zp, vp);
	dprintf("zfs_loadvnode z_id %ld\n", zp->z_id);

	uvm_vnp_setsize(vp, zp->z_phys->zp_size);

	mutex_enter(&zfsvfs->z_znodes_lock);
	list_insert_tail(&zfsvfs->z_all_znodes, zp);
	membar_producer();
	/*
	 * Everything else must be valid before assigning z_zfsvfs makes the
	 * znode eligible for zfs_znode_move().
	 */
	zp->z_zfsvfs = zfsvfs;
	mutex_exit(&zfsvfs->z_znodes_lock);

	VFS_HOLD(zfsvfs->z_vfs);

	*new_key = &zp->z_id;

	return 0;
}

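/*
 * zfs_loadvnode() implements the NetBSD vcache(9) load protocol:
 * vcache_get() allocates a fresh vnode and calls this to bind it to
 * the object number given by the key.  On success, *new_key must point
 * at a stable copy of the key; &zp->z_id qualifies because it lives
 * exactly as long as the cached vnode.
 */
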
int
zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
{
	struct vnode *vp;
	int error;

	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);

	error = vcache_get(zfsvfs->z_vfs, &obj_num, sizeof(obj_num), &vp);
	if (error == 0 && VTOZ(vp)->z_unlinked) {
		vrele(vp);
		error = ENOENT;
	}
	if (error)
		*zpp = NULL;
	else
		*zpp = VTOZ(vp);

	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);

	return error;
}

int
zfs_rezget(znode_t *zp)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	dmu_object_info_t doi;
	dmu_buf_t *db;
	uint64_t obj_num = zp->z_id;
	int err;

	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);

	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
	if (err) {
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (err);
	}

	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
		dmu_buf_rele(db, NULL);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (EINVAL);
	}

	if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) {
		dmu_buf_rele(db, NULL);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (EIO);
	}

	mutex_enter(&zp->z_acl_lock);
	if (zp->z_acl_cached) {
		zfs_acl_free(zp->z_acl_cached);
		zp->z_acl_cached = NULL;
	}
	mutex_exit(&zp->z_acl_lock);

	zfs_znode_dmu_init(zfsvfs, zp, db);
	zp->z_unlinked = (zp->z_phys->zp_links == 0);
	zp->z_blksz = doi.doi_data_block_size;

	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);

	return (0);
}

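/*
 * In zfs_rezget(), a generation mismatch means the object number was
 * freed and reused for a different file behind our back (for example,
 * across a rollback), so the cached znode is stale and EIO is returned
 * rather than silently attaching it to the new object.
 */
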
void
zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	objset_t *os = zfsvfs->z_os;
	uint64_t obj = zp->z_id;
	uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;

	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
	if (acl_obj)
		VERIFY(0 == dmu_object_free(os, acl_obj, tx));
	VERIFY(0 == dmu_object_free(os, obj, tx));
	zfs_znode_dmu_fini(zp);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
	zfs_znode_free(zp);
}

void
zfs_zinactive(znode_t *zp)
{
	vnode_t	*vp = ZTOV(zp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	uint64_t z_id = zp->z_id;

	ASSERT(zp->z_dbuf && zp->z_phys);

	/*
	 * Don't allow a zfs_zget() while we're trying to release this znode
	 */
	ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);

	mutex_enter(&zp->z_lock);
	/*
	 * If this was the last reference to a file with no links,
	 * remove the file from the file system.
	 */
	if (zp->z_unlinked) {
		mutex_exit(&zp->z_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
		zfs_rmnode(zp);
		return;
	}

	mutex_exit(&zp->z_lock);
	zfs_znode_dmu_fini(zp);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
	zfs_znode_free(zp);
}

void
zfs_znode_free(znode_t *zp)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	struct vnode *vp;

	mutex_enter(&zp->z_lock);
	vp = ZTOV(zp);
	if (vp != NULL) {
		vcache_remove(vp->v_mount, &zp->z_id, sizeof(zp->z_id));
		genfs_node_destroy(vp);
		/*
		 * To interlock with zfs_sync().
		 */
		mutex_enter(vp->v_interlock);
		vp->v_data = NULL;
		mutex_exit(vp->v_interlock);
	}
	mutex_exit(&zp->z_lock);

	dprintf("destroying znode %p\n", zp);
	mutex_enter(&zfsvfs->z_znodes_lock);
	POINTER_INVALIDATE(&zp->z_zfsvfs);
	list_remove(&zfsvfs->z_all_znodes, zp);
	mutex_exit(&zfsvfs->z_znodes_lock);

	if (zp->z_acl_cached) {
		zfs_acl_free(zp->z_acl_cached);
		zp->z_acl_cached = NULL;
	}

	kmem_cache_free(znode_cache, zp);

	VFS_RELE(zfsvfs->z_vfs);
}

void
zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
{
	timestruc_t	now;

	ASSERT(MUTEX_HELD(&zp->z_lock));

	gethrestime(&now);

	if (tx) {
		dmu_buf_will_dirty(zp->z_dbuf, tx);
		zp->z_atime_dirty = 0;
		zp->z_seq++;
	} else {
		zp->z_atime_dirty = 1;
	}

	if (flag & AT_ATIME)
		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);

	if (flag & AT_MTIME) {
		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);
		if (zp->z_zfsvfs->z_use_fuids)
			zp->z_phys->zp_flags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED);
	}

	if (flag & AT_CTIME) {
		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
		if (zp->z_zfsvfs->z_use_fuids)
			zp->z_phys->zp_flags |= ZFS_ARCHIVE;
	}
}

/*
 * Update the requested znode timestamps with the current time.
 * If we are in a transaction, then go ahead and mark the znode
 * dirty in the transaction so the timestamps will go to disk.
 * Otherwise, we will get pushed next time the znode is updated
 * in a transaction, or when this znode eventually goes inactive.
 *
 * Why is this OK?
 *  1 - Only the ACCESS time is ever updated outside of a transaction.
 *  2 - Multiple consecutive updates will be collapsed into a single
 *	znode update by the transaction grouping semantics of the DMU.
 */
void
zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
{
	mutex_enter(&zp->z_lock);
	zfs_time_stamper_locked(zp, flag, tx);
	mutex_exit(&zp->z_lock);
}

/*
 * Grow the block size for a file.
 *
 *	IN:	zp	- znode of file whose block size is to grow.
 *		size	- requested block size
 *		tx	- open transaction.
 *
 * NOTE: this function assumes that the znode is write locked.
 */
void
zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
{
	int		error;
	u_longlong_t	dummy;

	if (size <= zp->z_blksz)
		return;
	/*
	 * If the file size is already greater than the current blocksize,
	 * we will not grow. If there is more than one block in a file,
	 * the blocksize cannot change.
	 */
	if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
		return;

	error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
	    size, 0, tx);
	if (error == ENOTSUP)
		return;
	ASSERT3U(error, ==, 0);

	/* What blocksize did we actually get? */
	dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);
}

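/*
 * Example: a file still contained in a single 4K block may have its
 * block size grown to a non-power-of-2 value such as 20000 bytes
 * (single-block objects allow odd sizes), but once a file spans more
 * than one block the DMU refuses the change with ENOTSUP, which is
 * deliberately swallowed above.
 */
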
/*
 * Increase the file length
 *
 *	IN:	zp	- znode of file to extend.
 *		end	- new end-of-file
 *
 *	RETURN:	0 if success
 *		error code if failure
 */
static int
zfs_extend(znode_t *zp, uint64_t end)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	dmu_tx_t *tx;
	rl_t *rl;
	uint64_t newblksz;
	int error;

	/*
	 * We will change zp_size, lock the whole file.
	 */
	rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (end <= zp->z_phys->zp_size) {
		zfs_range_unlock(rl);
		return (0);
	}
top:
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, zp->z_id);
	if (end > zp->z_blksz &&
	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
		/*
		 * We are growing the file past the current block size.
		 */
		if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
			ASSERT(!ISP2(zp->z_blksz));
			newblksz = MIN(end, SPA_MAXBLOCKSIZE);
		} else {
			newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
		}
		dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
	} else {
		newblksz = 0;
	}

	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		if (error == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		zfs_range_unlock(rl);
		return (error);
	}
	dmu_buf_will_dirty(zp->z_dbuf, tx);

	if (newblksz)
		zfs_grow_blocksize(zp, newblksz, tx);

	zp->z_phys->zp_size = end;

	zfs_range_unlock(rl);

	dmu_tx_commit(tx);

	uvm_vnp_setsize(ZTOV(zp), end);

	return (0);
}

/*
 * Free space in a file.
 *
 *	IN:	zp	- znode of file to free data in.
 *		off	- start of section to free.
 *		len	- length of section to free.
 *
 *	RETURN:	0 if success
 *		error code if failure
 */
static int
zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	rl_t *rl;
	int error;

	/*
	 * Lock the range being freed.
	 */
	rl = zfs_range_lock(zp, off, len, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (off >= zp->z_phys->zp_size) {
		zfs_range_unlock(rl);
		return (0);
	}

	if (off + len > zp->z_phys->zp_size)
		len = zp->z_phys->zp_size - off;

	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);

	if (error == 0) {
		/*
		 * In NetBSD we cannot free blocks in the middle of a file,
		 * only at the end of a file.
		 */
		uvm_vnp_setsize(ZTOV(zp), off);
	}

	zfs_range_unlock(rl);

	return (error);
}

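/*
 * Locking contrast: zfs_free_range() above range-locks only
 * [off, off + len) because zp_size does not change, whereas
 * zfs_extend() and zfs_trunc() take the range lock over the whole
 * file (offset 0, length UINT64_MAX) because they update zp_size.
 */
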
/*
 * Truncate a file
 *
 *	IN:	zp	- znode of file to free data in.
 *		end	- new end-of-file.
 *
 *	RETURN:	0 if success
 *		error code if failure
 */
static int
zfs_trunc(znode_t *zp, uint64_t end)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	vnode_t *vp = ZTOV(zp);
	dmu_tx_t *tx;
	rl_t *rl;
	int error;

	/*
	 * We will change zp_size, lock the whole file.
	 */
	rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (end >= zp->z_phys->zp_size) {
		zfs_range_unlock(rl);
		return (0);
	}

	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1);
	if (error) {
		zfs_range_unlock(rl);
		return (error);
	}
top:
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, zp->z_id);
	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		if (error == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		zfs_range_unlock(rl);
		return (error);
	}
	dmu_buf_will_dirty(zp->z_dbuf, tx);

	zp->z_phys->zp_size = end;

	dmu_tx_commit(tx);

	zfs_range_unlock(rl);

	/*
	 * Clear any mapped pages in the truncated region. This has to
	 * happen outside of the transaction to avoid the possibility of
	 * a deadlock with someone trying to push a page that we are
	 * about to invalidate.
	 */

	uvm_vnp_setsize(vp, end);

	return (0);
}

/*
 * Free space in a file
 *
 *	IN:	zp	- znode of file to free data in.
 *		off	- start of range
 *		len	- length of range (0 => truncate to off)
 *		flag	- current file open mode flags.
 *		log	- TRUE if this action should be logged
 *
 *	RETURN:	0 if success
 *		error code if failure
 */
int
zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
{
	vnode_t *vp = ZTOV(zp);
	dmu_tx_t *tx;
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	zilog_t *zilog = zfsvfs->z_log;
	int error;

	if (off > zp->z_phys->zp_size) {
		error = zfs_extend(zp, off+len);
		if (error == 0 && log)
			goto log;
		else
			return (error);
	}

	if (len == 0) {
		error = zfs_trunc(zp, off);
	} else {
		if ((error = zfs_free_range(zp, off, len)) == 0 &&
		    off + len > zp->z_phys->zp_size)
			error = zfs_extend(zp, off+len);
	}
	if (error || !log)
		return (error);
log:
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, zp->z_id);
	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		if (error == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto log;
		}
		dmu_tx_abort(tx);
		return (error);
	}

	zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
	zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);

	dmu_tx_commit(tx);
	return (0);
}

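/*
 * zfs_freesp() summary: an off beyond EOF extends the file to
 * off + len; len == 0 truncates the file to off; otherwise the range
 * is freed and, if off + len lands past EOF, the file is extended to
 * cover it.  The TX_TRUNCATE record is only logged when the caller
 * requests it.
 */
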
void
zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
{
	zfsvfs_t	zfsvfs;
	uint64_t	moid, obj, version;
	uint64_t	sense = ZFS_CASE_SENSITIVE;
	uint64_t	norm = 0;
	nvpair_t	*elem;
	int		error;
	int		i;
	znode_t		*rootzp = NULL;
	vattr_t		vattr;
	znode_t		*zp;
	zfs_acl_ids_t	acl_ids;

	/*
	 * First attempt to create master node.
	 */
	/*
	 * In an empty objset, there are no blocks to read and thus
	 * there can be no i/o errors (which we assert below).
	 */
	moid = MASTER_NODE_OBJ;
	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	/*
	 * Set starting attributes.
	 */
	if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_USERSPACE)
		version = ZPL_VERSION;
	else if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
		version = ZPL_VERSION_USERSPACE - 1;
	else
		version = ZPL_VERSION_FUID - 1;
	elem = NULL;
	while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
		/* For the moment we expect all zpl props to be uint64_ts */
		uint64_t val;
		char *name;

		ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
		VERIFY(nvpair_value_uint64(elem, &val) == 0);
		name = nvpair_name(elem);
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
			if (val < version)
				version = val;
		} else {
			error = zap_update(os, moid, name, 8, 1, &val, tx);
		}
		ASSERT(error == 0);
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
			norm = val;
		else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
			sense = val;
	}
	ASSERT(version != 0);
	error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);

	/*
	 * Create a delete queue.
	 */
	obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);

	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
	ASSERT(error == 0);

	/*
	 * Create root znode. Create minimal znode/vnode/zfsvfs
	 * to allow zfs_mknode to work.
	 */
	vattr_null(&vattr);
	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
	vattr.va_type = VDIR;
	vattr.va_mode = S_IFDIR|0755;
	vattr.va_uid = crgetuid(cr);
	vattr.va_gid = crgetgid(cr);

	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	rootzp->z_unlinked = 0;
	rootzp->z_atime_dirty = 0;

	bzero(&zfsvfs, sizeof (zfsvfs_t));

	zfsvfs.z_os = os;
	zfsvfs.z_parent = &zfsvfs;
	zfsvfs.z_version = version;
	zfsvfs.z_use_fuids = USE_FUIDS(version, os);
	zfsvfs.z_norm = norm;
	/*
	 * Fold case on file systems that are always or sometimes case
	 * insensitive.
	 */
	if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
		zfsvfs.z_norm |= U8_TEXTPREP_TOUPPER;

	mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
	    offsetof(znode_t, z_link_node));

	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
		mutex_init(&zfsvfs.z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);

	ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
	rootzp->z_zfsvfs = &zfsvfs;
	VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
	    cr, NULL, &acl_ids));
	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, &acl_ids);
	ASSERT3P(zp, ==, rootzp);
	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
	ASSERT(error == 0);
	zfs_acl_ids_free(&acl_ids);
	POINTER_INVALIDATE(&rootzp->z_zfsvfs);

	dmu_buf_rele(rootzp->z_dbuf, NULL);
	rootzp->z_dbuf = NULL;
	kmem_cache_free(znode_cache, rootzp);

	/*
	 * Create shares directory
	 */

	error = zfs_create_share_dir(&zfsvfs, tx);

	ASSERT(error == 0);

	mutex_destroy(&zfsvfs.z_znodes_lock);
	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
		mutex_destroy(&zfsvfs.z_hold_mtx[i]);
}

#endif /* _KERNEL */
/*
 * Given an object number, return its parent object number and whether
 * or not the object is an extended attribute directory.
 */
static int
zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir)
{
	dmu_buf_t *db;
	dmu_object_info_t doi;
	znode_phys_t *zp;
	int error;

	if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0)
		return (error);

	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
		dmu_buf_rele(db, FTAG);
		return (EINVAL);
	}

	zp = db->db_data;
	*pobjp = zp->zp_parent;
	*is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) &&
	    S_ISDIR(zp->zp_mode);
	dmu_buf_rele(db, FTAG);

	return (0);
}

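/*
 * zfs_obj_to_path() below rebuilds a path by following zp_parent links
 * toward the root, writing each component backwards from the end of
 * the caller's buffer.  For example, an object "c" with parent "b" and
 * grandparent "a" yields "/a/b/c" once the root (pobj == obj) is
 * reached, and the result is then slid to the front of the buffer.
 */
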
int
zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
{
	char *path = buf + len - 1;
	int error;

	*path = '\0';

	for (;;) {
		uint64_t pobj;
		char component[MAXNAMELEN + 2];
		size_t complen;
		int is_xattrdir;

		if ((error = zfs_obj_to_pobj(osp, obj, &pobj,
		    &is_xattrdir)) != 0)
			break;

		if (pobj == obj) {
			if (path[0] != '/')
				*--path = '/';
			break;
		}

		component[0] = '/';
		if (is_xattrdir) {
			(void) snprintf(component + 1, sizeof(component) - 1,
			    "<xattrdir>");
		} else {
			error = zap_value_search(osp, pobj, obj,
			    ZFS_DIRENT_OBJ(-1ULL), component + 1);
			if (error != 0)
				break;
		}

		complen = strlen(component);
		path -= complen;
		ASSERT(path >= buf);
		bcopy(component, path, complen);
		obj = pobj;
	}

	if (error == 0)
		(void) memmove(buf, path, buf + len - path);
	return (error);
}