/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/* Portions Copyright 2007 Jeremy Teo */

#ifdef _KERNEL
#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/mntent.h>
#include <sys/u8_textprep.h>
#include <sys/dsl_dataset.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/errno.h>
#include <sys/unistd.h>
#include <sys/atomic.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_rlock.h>
#include <sys/zfs_fuid.h>
#include <sys/fs/zfs.h>
#include <sys/kidmap.h>
#endif /* _KERNEL */

#include <sys/dmu.h>
#include <sys/refcount.h>
#include <sys/stat.h>
#include <sys/zap.h>
#include <sys/zfs_znode.h>

#include "zfs_prop.h"

#if defined(_KERNEL) && defined(__NetBSD__)
#include <miscfs/specfs/specdev.h>
static const struct genfs_ops zfs_genfsops = {
	.gop_write = genfs_compat_gop_write,
};

#endif

extern int (**zfs_vnodeop_p)(void *);
extern int (**zfs_fifoop_p)(void *);
extern int (**zfs_specop_p)(void *);

/*
 * Define ZNODE_STATS to turn on statistic gathering.  By default, it is only
 * turned on when DEBUG is also defined.
 */
#ifdef DEBUG
#define	ZNODE_STATS
#endif	/* DEBUG */

#ifdef ZNODE_STATS
#define	ZNODE_STAT_ADD(stat)	((stat)++)
#else
#define	ZNODE_STAT_ADD(stat)	/* nothing */
#endif	/* ZNODE_STATS */

#define	POINTER_IS_VALID(p)	(!((uintptr_t)(p) & 0x3))
#define	POINTER_INVALIDATE(pp)	(*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1))

/*
 * Functions needed for userland (i.e. libzpool) are not put under
 * #ifdef _KERNEL; the rest of the functions have dependencies
 * (such as VFS logic) that will not compile easily in userland.
 */
#ifdef _KERNEL
/*
 * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to
 * be freed before it can be safely accessed.
 */
krwlock_t zfsvfs_lock;

static kmem_cache_t *znode_cache = NULL;

/*ARGSUSED*/
static void
znode_evict_error(dmu_buf_t *dbuf, void *user_ptr)
{
	/*
	 * We should never drop all dbuf refs without first clearing
	 * the eviction callback.
	 */
	panic("evicting znode %p\n", user_ptr);
}

/*ARGSUSED*/
static int
zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
{
	znode_t *zp = buf;

	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));

	zp->z_vnode = NULL;

	list_link_init(&zp->z_link_node);

	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);

	mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&zp->z_range_avl, zfs_range_compare,
	    sizeof (rl_t), offsetof(rl_t, r_node));

	zp->z_dbuf = NULL;
	zp->z_dirlocks = NULL;
	zp->z_acl_cached = NULL;
	return (0);
}

/*ARGSUSED*/
static void
zfs_znode_cache_destructor(void *buf, void *arg)
{
	znode_t *zp = buf;

	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
	ASSERT(ZTOV(zp) == NULL);

	ASSERT(!list_link_active(&zp->z_link_node));
	mutex_destroy(&zp->z_lock);
	rw_destroy(&zp->z_parent_lock);
	rw_destroy(&zp->z_name_lock);
	mutex_destroy(&zp->z_acl_lock);
	avl_destroy(&zp->z_range_avl);
	mutex_destroy(&zp->z_range_lock);

	ASSERT(zp->z_dbuf == NULL);
	ASSERT(zp->z_dirlocks == NULL);
	ASSERT(zp->z_acl_cached == NULL);
}

#ifdef ZNODE_STATS
static struct {
	uint64_t zms_zfsvfs_invalid;
	uint64_t zms_zfsvfs_recheck1;
	uint64_t zms_zfsvfs_unmounted;
	uint64_t zms_zfsvfs_recheck2;
	uint64_t zms_obj_held;
	uint64_t zms_vnode_locked;
	uint64_t zms_not_only_dnlc;
} znode_move_stats;
#endif	/* ZNODE_STATS */

static void
zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
{
	vnode_t *vp;

	/* Copy fields. */
	nzp->z_zfsvfs = ozp->z_zfsvfs;

	/* Swap vnodes. */
	vp = nzp->z_vnode;
	nzp->z_vnode = ozp->z_vnode;
	ozp->z_vnode = vp;	/* let destructor free the overwritten vnode */
	ZTOV(ozp)->v_data = ozp;
	ZTOV(nzp)->v_data = nzp;

	nzp->z_id = ozp->z_id;
	ASSERT(ozp->z_dirlocks == NULL);	/* znode not in use */
	ASSERT(avl_numnodes(&ozp->z_range_avl) == 0);
	nzp->z_unlinked = ozp->z_unlinked;
	nzp->z_atime_dirty = ozp->z_atime_dirty;
	nzp->z_zn_prefetch = ozp->z_zn_prefetch;
	nzp->z_blksz = ozp->z_blksz;
	nzp->z_seq = ozp->z_seq;
	nzp->z_mapcnt = ozp->z_mapcnt;
	nzp->z_last_itx = ozp->z_last_itx;
	nzp->z_gen = ozp->z_gen;
	nzp->z_sync_cnt = ozp->z_sync_cnt;
	nzp->z_phys = ozp->z_phys;
	nzp->z_dbuf = ozp->z_dbuf;

	/*
	 * Since this is just an idle znode and kmem is already dealing with
	 * memory pressure, release any cached ACL.
	 */
	if (ozp->z_acl_cached) {
		zfs_acl_free(ozp->z_acl_cached);
		ozp->z_acl_cached = NULL;
	}

	/* Update back pointers. */
	(void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys,
	    znode_evict_error);

	/*
	 * Invalidate the original znode by clearing fields that provide a
	 * pointer back to the znode.  Set the low bit of the vfs pointer to
	 * ensure that zfs_znode_move() recognizes the znode as invalid in any
	 * subsequent callback.
	 */
	ozp->z_dbuf = NULL;
	POINTER_INVALIDATE(&ozp->z_zfsvfs);
}

#ifndef __NetBSD__
/*ARGSUSED*/
static kmem_cbrc_t
zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
{
	znode_t *ozp = buf, *nzp = newbuf;
	zfsvfs_t *zfsvfs;
	vnode_t *vp;

	/*
	 * The znode is on the file system's list of known znodes if the vfs
	 * pointer is valid.  We set the low bit of the vfs pointer when
	 * freeing the znode to invalidate it, and the memory patterns written
	 * by kmem (baddcafe and deadbeef) set at least one of the two low
	 * bits.  A newly created znode sets the vfs pointer last of all to
	 * indicate that the znode is known and in a valid state to be moved
	 * by this function.
	 */
	zfsvfs = ozp->z_zfsvfs;
	if (!POINTER_IS_VALID(zfsvfs)) {
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * Close a small window in which it's possible that the filesystem
	 * could be unmounted and freed, and zfsvfs, though valid in the
	 * previous statement, could point to unrelated memory by the time we
	 * try to prevent the filesystem from being unmounted.
	 */
	rw_enter(&zfsvfs_lock, RW_WRITER);
	if (zfsvfs != ozp->z_zfsvfs) {
		rw_exit(&zfsvfs_lock);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * If the znode is still valid, then so is the file system.  We know
	 * that no valid file system can be freed while we hold zfsvfs_lock,
	 * so we can safely ensure that the filesystem is not and will not be
	 * unmounted.  The next statement is equivalent to ZFS_ENTER().
	 */
	rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
	if (zfsvfs->z_unmounted) {
		ZFS_EXIT(zfsvfs);
		rw_exit(&zfsvfs_lock);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
		return (KMEM_CBRC_DONT_KNOW);
	}
	rw_exit(&zfsvfs_lock);

	mutex_enter(&zfsvfs->z_znodes_lock);
	/*
	 * Recheck the vfs pointer in case the znode was removed just before
	 * acquiring the lock.
	 */
	if (zfsvfs != ozp->z_zfsvfs) {
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * At this point we know that as long as we hold z_znodes_lock, the
	 * znode cannot be freed and fields within the znode can be safely
	 * accessed.  Now, prevent a race with zfs_zget().
	 */
	if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) {
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_obj_held);
		return (KMEM_CBRC_LATER);
	}

	vp = ZTOV(ozp);
	if (mutex_tryenter(&vp->v_lock) == 0) {
		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked);
		return (KMEM_CBRC_LATER);
	}

	/* Only move znodes that are referenced _only_ by the DNLC. */
	if (vp->v_count != 1 || !vn_in_dnlc(vp)) {
		mutex_exit(&vp->v_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc);
		return (KMEM_CBRC_LATER);
	}

	/*
	 * The znode is known and in a valid state to move.  We're holding the
	 * locks needed to execute the critical section.
	 */
	zfs_znode_move_impl(ozp, nzp);
	mutex_exit(&vp->v_lock);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);

	list_link_replace(&ozp->z_link_node, &nzp->z_link_node);
	mutex_exit(&zfsvfs->z_znodes_lock);
	ZFS_EXIT(zfsvfs);

	return (KMEM_CBRC_YES);
}
#endif /* !__NetBSD__ */
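
/*
 * Illustrative sketch (not part of this port): on OpenSolaris the move
 * callback above is wired to the znode cache with kmem_cache_set_move(),
 * and its return value tells kmem how to proceed with defragmentation.
 * kmem_cache_set_move() is an OpenSolaris kmem interface and is assumed
 * here; NetBSD's kmem has no equivalent, which is why zfs_znode_move()
 * is compiled out above.
 */
#if 0
static void
zfs_znode_register_move(void)
{
	/*
	 * After this call kmem may invoke zfs_znode_move(old, new, size, arg)
	 * when it wants to relocate an idle znode:
	 *   KMEM_CBRC_YES       - znode copied; the old buffer may be freed
	 *   KMEM_CBRC_LATER     - transiently busy (locks held); retry later
	 *   KMEM_CBRC_DONT_KNOW - buffer state unknown (e.g. being freed)
	 */
	kmem_cache_set_move(znode_cache, zfs_znode_move);
}
#endif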

void
zfs_znode_init(void)
{
	/*
	 * Initialize zcache
	 */
	rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL);
	ASSERT(znode_cache == NULL);
	znode_cache = kmem_cache_create("zfs_znode_cache",
	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
	    zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
}

void
zfs_znode_fini(void)
{

	/*
	 * Cleanup zcache
	 */
	if (znode_cache)
		kmem_cache_destroy(znode_cache);
	znode_cache = NULL;
	rw_destroy(&zfsvfs_lock);
}

#ifndef __NetBSD__
struct vnodeops *zfs_dvnodeops;
struct vnodeops *zfs_fvnodeops;
struct vnodeops *zfs_symvnodeops;
struct vnodeops *zfs_xdvnodeops;
struct vnodeops *zfs_evnodeops;
struct vnodeops *zfs_sharevnodeops;
#endif

void
zfs_remove_op_tables()
{
#ifndef __NetBSD__
	/*
	 * Remove vfs ops
	 */
	ASSERT(zfsfstype);
	(void) vfs_freevfsops_by_type(zfsfstype);
	zfsfstype = 0;

	/*
	 * Remove vnode ops
	 */
	if (zfs_dvnodeops)
		vn_freevnodeops(zfs_dvnodeops);
	if (zfs_fvnodeops)
		vn_freevnodeops(zfs_fvnodeops);
	if (zfs_symvnodeops)
		vn_freevnodeops(zfs_symvnodeops);
	if (zfs_xdvnodeops)
		vn_freevnodeops(zfs_xdvnodeops);
	if (zfs_evnodeops)
		vn_freevnodeops(zfs_evnodeops);
	if (zfs_sharevnodeops)
		vn_freevnodeops(zfs_sharevnodeops);

	zfs_dvnodeops = NULL;
	zfs_fvnodeops = NULL;
	zfs_symvnodeops = NULL;
	zfs_xdvnodeops = NULL;
	zfs_evnodeops = NULL;
	zfs_sharevnodeops = NULL;
#endif
}

#ifndef __NetBSD__
extern const fs_operation_def_t zfs_dvnodeops_template[];
extern const fs_operation_def_t zfs_fvnodeops_template[];
extern const fs_operation_def_t zfs_xdvnodeops_template[];
extern const fs_operation_def_t zfs_symvnodeops_template[];
extern const fs_operation_def_t zfs_evnodeops_template[];
extern const fs_operation_def_t zfs_sharevnodeops_template[];
#endif

int
zfs_create_op_tables()
{
#ifndef __NetBSD__
	int error;

	/*
	 * zfs_dvnodeops can be set if mod_remove() calls mod_installfs()
	 * due to a failure to remove the 2nd modlinkage (zfs_modldrv).
	 * In this case we just return as the ops vectors are already set up.
	 */
	if (zfs_dvnodeops)
		return (0);

	error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
	    &zfs_dvnodeops);
	if (error)
		return (error);

	error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
	    &zfs_fvnodeops);
	if (error)
		return (error);

	error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template,
	    &zfs_symvnodeops);
	if (error)
		return (error);

	error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template,
	    &zfs_xdvnodeops);
	if (error)
		return (error);

	error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
	    &zfs_evnodeops);
	if (error)
		return (error);

	error = vn_make_ops(MNTTYPE_ZFS, zfs_sharevnodeops_template,
	    &zfs_sharevnodeops);

	return (error);
#endif
	return 0;
}

int
zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
{
	zfs_acl_ids_t acl_ids;
	vattr_t vattr;
	znode_t *sharezp;
	znode_t *zp;
	int error;

	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
	vattr.va_type = VDIR;
	vattr.va_mode = S_IFDIR|0555;
	vattr.va_uid = crgetuid(kcred);
	vattr.va_gid = crgetgid(kcred);

	sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	sharezp->z_unlinked = 0;
	sharezp->z_atime_dirty = 0;
	sharezp->z_zfsvfs = zfsvfs;

	VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
	    kcred, NULL, &acl_ids));
	zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE,
	    &zp, 0, &acl_ids);
	ASSERT3P(zp, ==, sharezp);
#ifndef __NetBSD__
	ASSERT(!vn_in_dnlc(ZTOV(sharezp)));	/* not valid to move */
#endif
	POINTER_INVALIDATE(&sharezp->z_zfsvfs);
	error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
	    ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
	zfsvfs->z_shares_dir = sharezp->z_id;

	zfs_acl_ids_free(&acl_ids);
	dmu_buf_rele(sharezp->z_dbuf, NULL);
	sharezp->z_dbuf = NULL;
	kmem_cache_free(znode_cache, sharezp);

	return (error);
}

/*
 * define a couple of values we need available
 * for both 64 and 32 bit environments.
 */
#ifndef NBITSMINOR64
#define	NBITSMINOR64	32
#endif
#ifndef MAXMAJ64
#define	MAXMAJ64	0xffffffffUL
#endif
#ifndef MAXMIN64
#define	MAXMIN64	0xffffffffUL
#endif

/*
 * Create special expldev for ZFS private use.
 * Can't use standard expldev since it doesn't do
 * what we want.  The standard expldev() takes a
 * dev32_t in LP64 and expands it to a long dev_t.
 * We need an interface that takes a dev32_t in ILP32
 * and expands it to a long dev_t.
 */
static uint64_t
zfs_expldev(dev_t dev)
{
	return ((uint64_t)major(dev) << NBITSMINOR64) |
	    (minor_t)minor(dev);
}

/*
 * Special cmpldev for ZFS private use.
 * Can't use standard cmpldev since it takes
 * a long dev_t and compresses it to dev32_t in
 * LP64.  We need to do a compaction of a long dev_t
 * to a dev32_t in ILP32.
 */
dev_t
zfs_cmpldev(uint64_t dev)
{
	minor_t minor = (minor_t)dev & MAXMIN64;
	major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;

	return makedev(major, minor);
}
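
/*
 * Illustrative sketch (hypothetical device numbers): zfs_expldev() stores
 * the major number in the upper 32 bits and the minor number in the lower
 * 32 bits of the on-disk uint64_t, and zfs_cmpldev() reverses that, so the
 * pair is a lossless round trip for any major/minor that each fit in 32
 * bits.  Not part of the original source.
 */
#if 0
static void
zfs_dev_encoding_example(void)
{
	dev_t dev = makedev(13, 7);		/* example major 13, minor 7 */
	uint64_t packed = zfs_expldev(dev);

	/* packed == ((uint64_t)13 << NBITSMINOR64) | 7 */
	ASSERT3U(major(zfs_cmpldev(packed)), ==, 13);
	ASSERT3U(minor(zfs_cmpldev(packed)), ==, 7);
}
#endif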

static void
zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db)
{
	znode_t *nzp;

	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));

	mutex_enter(&zp->z_lock);

	ASSERT(zp->z_dbuf == NULL);
	ASSERT(zp->z_acl_cached == NULL);
	zp->z_dbuf = db;
	nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error);

	/*
	 * There should be no concurrent zgets on this object.
	 */
	if (nzp != NULL)
		panic("existing znode %p for dbuf %p", (void *)nzp, (void *)db);

	/*
	 * Slap on VROOT if we are the root znode
	 */
	if (zp->z_id == zfsvfs->z_root)
		ZTOV(zp)->v_flag |= VROOT;

	mutex_exit(&zp->z_lock);
	vn_exists(ZTOV(zp));
}

void
zfs_znode_dmu_fini(znode_t *zp)
{
	dmu_buf_t *db = zp->z_dbuf;
	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
	    zp->z_unlinked ||
	    RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock));
	ASSERT(zp->z_dbuf != NULL);
	zp->z_dbuf = NULL;
	VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL));
	dmu_buf_rele(db, NULL);
}

/*
 * Create a new DMU object to hold a zfs znode.
 *
 *	IN:	dzp	- parent directory for new znode
 *		vap	- file attributes for new znode
 *		tx	- dmu transaction id for zap operations
 *		cr	- credentials of caller
 *		flag	- flags:
 *			  IS_ROOT_NODE	- new object will be root
 *			  IS_XATTR	- new object is an attribute
 *		bonuslen - length of bonus buffer
 *		setaclp  - File/Dir initial ACL
 *		fuidp	 - Tracks fuid allocation.
 *
 *	OUT:	zpp	- allocated znode
 *
 */
void
zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
    uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_ids_t *acl_ids)
{
	dmu_buf_t	*db;
	znode_phys_t	*pzp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	timestruc_t	now;
	uint64_t	gen, obj;
	int		err;

	ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));

	if (zfsvfs->z_replay) {
		obj = vap->va_nodeid;
		now = vap->va_ctime;		/* see zfs_replay_create() */
		gen = vap->va_nblocks;		/* ditto */
	} else {
		obj = 0;
		gethrestime(&now);
		gen = dmu_tx_get_txg(tx);
	}

	/*
	 * Create a new DMU object.
	 */
	/*
	 * There's currently no mechanism for pre-reading the blocks that will
	 * be needed to allocate a new object, so we accept the small chance
	 * that there will be an i/o error and we will fail one of the
	 * assertions below.
	 */
	if (vap->va_type == VDIR) {
		if (zfsvfs->z_replay) {
			err = zap_create_claim_norm(zfsvfs->z_os, obj,
			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
			ASSERT3U(err, ==, 0);
		} else {
			obj = zap_create_norm(zfsvfs->z_os,
			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
		}
	} else {
		if (zfsvfs->z_replay) {
			err = dmu_object_claim(zfsvfs->z_os, obj,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
			ASSERT3U(err, ==, 0);
		} else {
			obj = dmu_object_alloc(zfsvfs->z_os,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
		}
	}

	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
	VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db));
	dmu_buf_will_dirty(db, tx);

	/*
	 * Initialize the znode physical data to zero.
	 */
	ASSERT(db->db_size >= sizeof (znode_phys_t));
	bzero(db->db_data, db->db_size);
	pzp = db->db_data;

	/*
	 * If this is the root, fix up the half-initialized parent pointer
	 * to reference the just-allocated physical data area.
	 */
	if (flag & IS_ROOT_NODE) {
		dzp->z_dbuf = db;
		dzp->z_phys = pzp;
		dzp->z_id = obj;
	}

	/*
	 * If parent is an xattr, so am I.
	 */
	if (dzp->z_phys->zp_flags & ZFS_XATTR)
		flag |= IS_XATTR;

	if (vap->va_type == VBLK || vap->va_type == VCHR) {
		pzp->zp_rdev = zfs_expldev(vap->va_rdev);
	}

	if (zfsvfs->z_use_fuids)
		pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;

	if (vap->va_type == VDIR) {
		pzp->zp_size = 2;		/* contents ("." and "..") */
		pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
	}

	pzp->zp_parent = dzp->z_id;
	if (flag & IS_XATTR)
		pzp->zp_flags |= ZFS_XATTR;

	pzp->zp_gen = gen;

	ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
	ZFS_TIME_ENCODE(&now, pzp->zp_ctime);

	if (vap->va_mask & AT_ATIME) {
		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
	} else {
		ZFS_TIME_ENCODE(&now, pzp->zp_atime);
	}

	if (vap->va_mask & AT_MTIME) {
		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
	} else {
		ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
	}
	pzp->zp_uid = acl_ids->z_fuid;
	pzp->zp_gid = acl_ids->z_fgid;
	pzp->zp_mode = acl_ids->z_mode;
	if (!(flag & IS_ROOT_NODE)) {
		struct vnode *vp;

		err = vcache_get(zfsvfs->z_vfs, &obj, sizeof(obj), &vp);
		ASSERT3U(err, ==, 0);
		*zpp = VTOZ(vp);
		dmu_buf_rele(db, NULL);
	} else {
		/*
		 * If we are creating the root node, the "parent" we
		 * passed in is the znode for the root.
		 */
		*zpp = dzp;
	}
	VERIFY(0 == zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
	if (vap->va_mask & AT_XVATTR)
		zfs_xvattr_set(*zpp, (xvattr_t *)vap);

	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
}
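
/*
 * Illustrative sketch (not part of the original source): the typical shape
 * of a zfs_mknode() call, modeled on zfs_create_share_dir() above.  The
 * caller fills in a vattr, derives ACL ids, and must already hold an
 * assigned transaction covering the bonus buffer and ZAP updates; the
 * function name and VREG attributes below are hypothetical.
 */
#if 0
static int
zfs_mknode_usage_sketch(znode_t *dzp, dmu_tx_t *tx, cred_t *cr)
{
	zfs_acl_ids_t acl_ids;
	vattr_t vattr;
	znode_t *zp;
	int error;

	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
	vattr.va_type = VREG;
	vattr.va_mode = S_IFREG|0644;
	vattr.va_uid = crgetuid(cr);
	vattr.va_gid = crgetgid(cr);

	error = zfs_acl_ids_create(dzp, 0, &vattr, cr, NULL, &acl_ids);
	if (error)
		return (error);
	zfs_mknode(dzp, &vattr, tx, cr, 0, &zp, 0, &acl_ids);
	zfs_acl_ids_free(&acl_ids);
	/* ... link zp into the directory, log the create, etc. ... */
	return (0);
}
#endif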

void
zfs_xvattr_set(znode_t *zp, xvattr_t *xvap)
{
	xoptattr_t *xoap;

	xoap = xva_getxoptattr(xvap);
	ASSERT(xoap);

	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
		ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime);
		XVA_SET_RTN(xvap, XAT_CREATETIME);
	}
	if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
		ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly);
		XVA_SET_RTN(xvap, XAT_READONLY);
	}
	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
		ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden);
		XVA_SET_RTN(xvap, XAT_HIDDEN);
	}
	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
		ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system);
		XVA_SET_RTN(xvap, XAT_SYSTEM);
	}
	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
		ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive);
		XVA_SET_RTN(xvap, XAT_ARCHIVE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable);
		XVA_SET_RTN(xvap, XAT_IMMUTABLE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink);
		XVA_SET_RTN(xvap, XAT_NOUNLINK);
	}
	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly);
		XVA_SET_RTN(xvap, XAT_APPENDONLY);
	}
	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump);
		XVA_SET_RTN(xvap, XAT_NODUMP);
	}
	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque);
		XVA_SET_RTN(xvap, XAT_OPAQUE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
		    xoap->xoa_av_quarantined);
		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified);
		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
		(void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp,
		    sizeof (xoap->xoa_av_scanstamp));
		zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP;
		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
	}
	if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
		ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse);
		XVA_SET_RTN(xvap, XAT_REPARSE);
	}
}
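
/*
 * Illustrative sketch (hypothetical caller): how a request arrives at
 * zfs_xvattr_set().  The caller marks each optional attribute it wants
 * applied with XVA_SET_REQ(); zfs_xvattr_set() then acknowledges each one
 * it handled with XVA_SET_RTN().  xva_init() and XVA_SET_REQ() are standard
 * Solaris vnode interfaces assumed to be available here.
 */
#if 0
static void
zfs_xvattr_usage_sketch(znode_t *zp)
{
	xvattr_t xva;

	xva_init(&xva);
	XVA_SET_REQ(&xva, XAT_READONLY);
	xva.xva_xoptattrs.xoa_readonly = B_TRUE;
	zfs_xvattr_set(zp, &xva);
}
#endif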

int
zfs_loadvnode(struct mount *mp, struct vnode *vp,
    const void *key, size_t key_len, const void **new_key)
{
	uint64_t obj_num;
	zfsvfs_t *zfsvfs;
	dmu_object_info_t doi;
	dmu_buf_t *db;
	znode_t *zp;
	int err;

	KASSERT(key_len == sizeof(obj_num));
	memcpy(&obj_num, key, key_len);

	zfsvfs = mp->mnt_data;

	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
	if (err) {
		return err;
	}

	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
		dmu_buf_rele(db, NULL);
		return EINVAL;
	}

	KASSERT(dmu_buf_get_user(db) == NULL);

	/*
	 * There is a small window where zfs_vget() could
	 * find this object while a file create is still in
	 * progress.  Since a gen number can never be zero
	 * we will check that to determine if it's an allocated
	 * file.
	 */

	if (((znode_phys_t *)db->db_data)->zp_gen == 0) {
		dmu_buf_rele(db, NULL);
		return ENOENT;
	}

	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);

	ASSERT(zp->z_dirlocks == NULL);
	ASSERT(zp->z_dbuf == NULL);
	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));

	/*
	 * Defer setting z_zfsvfs until the znode is ready to be a candidate
	 * for the zfs_znode_move() callback.
	 */
	zp->z_phys = NULL;
	zp->z_unlinked = 0;
	zp->z_atime_dirty = 0;
	zp->z_mapcnt = 0;
	zp->z_last_itx = 0;
	zp->z_id = db->db_object;
	zp->z_blksz = doi.doi_data_block_size;
	zp->z_seq = 0x7A4653;
	zp->z_sync_cnt = 0;
	zp->z_vnode = vp;

	zfs_znode_dmu_init(zfsvfs, zp, db);

	zp->z_gen = zp->z_phys->zp_gen;

	vp->v_op = zfs_vnodeop_p;
	vp->v_tag = VT_ZFS;
	vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
	vp->v_data = zp;
	genfs_node_init(vp, &zfs_genfsops);
	switch (vp->v_type) {
	case VDIR:
		zp->z_zn_prefetch = B_TRUE;	/* z_prefetch default is enabled */
		break;
	case VBLK:
	case VCHR:
		/* XXX NetBSD vp->v_op = zfs_specop_p; */
		spec_node_init(vp, zfs_cmpldev(zp->z_phys->zp_rdev));
		break;
	case VFIFO:
		/* XXX NetBSD vp->v_op = zfs_fifoop_p; */
		break;
	}

	dprintf("zfs_loadvnode znode %p -- vnode %p\n", zp, vp);
	dprintf("zfs_loadvnode z_id %ld\n", zp->z_id);

	uvm_vnp_setsize(vp, zp->z_phys->zp_size);

	mutex_enter(&zfsvfs->z_znodes_lock);
	list_insert_tail(&zfsvfs->z_all_znodes, zp);
	membar_producer();
	/*
	 * Everything else must be valid before assigning z_zfsvfs makes the
	 * znode eligible for zfs_znode_move().
	 */
	zp->z_zfsvfs = zfsvfs;
	mutex_exit(&zfsvfs->z_znodes_lock);

	VFS_HOLD(zfsvfs->z_vfs);

	*new_key = &zp->z_id;

	return 0;
}

int
zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
{
	struct vnode *vp;
	int error;

	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);

	error = vcache_get(zfsvfs->z_vfs, &obj_num, sizeof(obj_num), &vp);
	if (error == 0 && VTOZ(vp)->z_unlinked) {
		vrele(vp);
		error = ENOENT;
	}
	if (error)
		*zpp = NULL;
	else
		*zpp = VTOZ(vp);

	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);

	return error;
}

int
zfs_rezget(znode_t *zp)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	dmu_object_info_t doi;
	dmu_buf_t *db;
	uint64_t obj_num = zp->z_id;
	int err;

	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);

	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
	if (err) {
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (err);
	}

	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
		dmu_buf_rele(db, NULL);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (EINVAL);
	}

	if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) {
		dmu_buf_rele(db, NULL);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (EIO);
	}

	mutex_enter(&zp->z_acl_lock);
	if (zp->z_acl_cached) {
		zfs_acl_free(zp->z_acl_cached);
		zp->z_acl_cached = NULL;
	}
	mutex_exit(&zp->z_acl_lock);

	zfs_znode_dmu_init(zfsvfs, zp, db);
	zp->z_unlinked = (zp->z_phys->zp_links == 0);
	zp->z_blksz = doi.doi_data_block_size;

	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);

	return (0);
}

void
zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	objset_t *os = zfsvfs->z_os;
	uint64_t obj = zp->z_id;
	uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;

	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
	if (acl_obj)
		VERIFY(0 == dmu_object_free(os, acl_obj, tx));
	VERIFY(0 == dmu_object_free(os, obj, tx));
	zfs_znode_dmu_fini(zp);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
	zfs_znode_free(zp);
}

void
zfs_zinactive(znode_t *zp)
{
	vnode_t *vp = ZTOV(zp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	uint64_t z_id = zp->z_id;

	ASSERT(zp->z_dbuf && zp->z_phys);

	/*
	 * Don't allow a zfs_zget() while we're trying to release this znode
	 */
	ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);

	mutex_enter(&zp->z_lock);
	/*
	 * If this was the last reference to a file with no links,
	 * remove the file from the file system.
	 */
	if (zp->z_unlinked) {
		mutex_exit(&zp->z_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
		zfs_rmnode(zp);
		return;
	}

	mutex_exit(&zp->z_lock);
	zfs_znode_dmu_fini(zp);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
	zfs_znode_free(zp);
}

void
zfs_znode_free(znode_t *zp)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	struct vnode *vp;

	mutex_enter(&zp->z_lock);
	vp = ZTOV(zp);
	if (vp != NULL) {
		genfs_node_destroy(vp);
		/*
		 * To interlock with zfs_sync().
		 */
		mutex_enter(vp->v_interlock);
		vp->v_data = NULL;
		mutex_exit(vp->v_interlock);
	}
	mutex_exit(&zp->z_lock);

	dprintf("destroying znode %p\n", zp);
	mutex_enter(&zfsvfs->z_znodes_lock);
	POINTER_INVALIDATE(&zp->z_zfsvfs);
	list_remove(&zfsvfs->z_all_znodes, zp);
	mutex_exit(&zfsvfs->z_znodes_lock);

	if (zp->z_acl_cached) {
		zfs_acl_free(zp->z_acl_cached);
		zp->z_acl_cached = NULL;
	}

	kmem_cache_free(znode_cache, zp);

	VFS_RELE(zfsvfs->z_vfs);
}

void
zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
{
	timestruc_t	now;

	ASSERT(MUTEX_HELD(&zp->z_lock));

	gethrestime(&now);

	if (tx) {
		dmu_buf_will_dirty(zp->z_dbuf, tx);
		zp->z_atime_dirty = 0;
		zp->z_seq++;
	} else {
		zp->z_atime_dirty = 1;
	}

	if (flag & AT_ATIME)
		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);

	if (flag & AT_MTIME) {
		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);
		if (zp->z_zfsvfs->z_use_fuids)
			zp->z_phys->zp_flags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED);
	}

	if (flag & AT_CTIME) {
		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
		if (zp->z_zfsvfs->z_use_fuids)
			zp->z_phys->zp_flags |= ZFS_ARCHIVE;
	}
}

/*
 * Update the requested znode timestamps with the current time.
 * If we are in a transaction, then go ahead and mark the znode
 * dirty in the transaction so the timestamps will go to disk.
 * Otherwise, we will get pushed next time the znode is updated
 * in a transaction, or when this znode eventually goes inactive.
 *
 * Why is this OK?
 *  1 - Only the ACCESS time is ever updated outside of a transaction.
 *  2 - Multiple consecutive updates will be collapsed into a single
 *	znode update by the transaction grouping semantics of the DMU.
 */
void
zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
{
	mutex_enter(&zp->z_lock);
	zfs_time_stamper_locked(zp, flag, tx);
	mutex_exit(&zp->z_lock);
}
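
/*
 * Illustrative sketch (hypothetical caller): the two ways the stamper is
 * driven.  Inside an assigned transaction the new timestamps are dirtied
 * into the dbuf; with tx == NULL only z_atime_dirty is set and the update
 * is pushed out later, which per the comment above is safe because only
 * atime is ever updated outside a transaction.
 */
#if 0
static void
zfs_time_stamper_usage_sketch(znode_t *zp, dmu_tx_t *tx)
{
	/* Content modified inside a transaction: mtime + ctime to disk. */
	zfs_time_stamper(zp, CONTENT_MODIFIED, tx);

	/* Access time outside a transaction: marked dirty, written later. */
	zfs_time_stamper(zp, AT_ATIME, NULL);
}
#endif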

/*
 * Grow the block size for a file.
 *
 *	IN:	zp	- znode of file whose block size is to be grown.
 *		size	- requested block size
 *		tx	- open transaction.
 *
 * NOTE: this function assumes that the znode is write locked.
 */
void
zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
{
	int		error;
	u_longlong_t	dummy;

	if (size <= zp->z_blksz)
		return;
	/*
	 * If the file size is already greater than the current blocksize,
	 * we will not grow.  If there is more than one block in a file,
	 * the blocksize cannot change.
	 */
	if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
		return;

	error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
	    size, 0, tx);
	if (error == ENOTSUP)
		return;
	ASSERT3U(error, ==, 0);

	/* What blocksize did we actually get? */
	dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);
}

/*
 * Increase the file length
 *
 *	IN:	zp	- znode of file to extend.
 *		end	- new end-of-file
 *
 *	RETURN:	0 if success
 *		error code if failure
 */
static int
zfs_extend(znode_t *zp, uint64_t end)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	dmu_tx_t *tx;
	rl_t *rl;
	uint64_t newblksz;
	int error;

	/*
	 * We will change zp_size, lock the whole file.
	 */
	rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (end <= zp->z_phys->zp_size) {
		zfs_range_unlock(rl);
		return (0);
	}
top:
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, zp->z_id);
	if (end > zp->z_blksz &&
	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
		/*
		 * We are growing the file past the current block size.
		 */
		if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
			ASSERT(!ISP2(zp->z_blksz));
			newblksz = MIN(end, SPA_MAXBLOCKSIZE);
		} else {
			newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
		}
		dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
	} else {
		newblksz = 0;
	}

	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		if (error == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		zfs_range_unlock(rl);
		return (error);
	}
	dmu_buf_will_dirty(zp->z_dbuf, tx);

	if (newblksz)
		zfs_grow_blocksize(zp, newblksz, tx);

	zp->z_phys->zp_size = end;

	zfs_range_unlock(rl);

	dmu_tx_commit(tx);

	uvm_vnp_setsize(ZTOV(zp), end);

	return (0);
}

/*
 * Free space in a file.
 *
 *	IN:	zp	- znode of file to free data in.
 *		off	- start of section to free.
 *		len	- length of section to free.
 *
 *	RETURN:	0 if success
 *		error code if failure
 */
static int
zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	rl_t *rl;
	int error;

	/*
	 * Lock the range being freed.
	 */
	rl = zfs_range_lock(zp, off, len, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (off >= zp->z_phys->zp_size) {
		zfs_range_unlock(rl);
		return (0);
	}

	if (off + len > zp->z_phys->zp_size)
		len = zp->z_phys->zp_size - off;

	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);

	if (error == 0) {
		/*
		 * In NetBSD we cannot free blocks in the middle of a file,
		 * but only at the end of a file.
		 */
		uvm_vnp_setsize(ZTOV(zp), off);
	}

	zfs_range_unlock(rl);

	return (error);
}

/*
 * Truncate a file
 *
 *	IN:	zp	- znode of file to free data in.
 *		end	- new end-of-file.
 *
 *	RETURN:	0 if success
 *		error code if failure
 */
static int
zfs_trunc(znode_t *zp, uint64_t end)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	vnode_t *vp = ZTOV(zp);
	dmu_tx_t *tx;
	rl_t *rl;
	int error;

	/*
	 * We will change zp_size, lock the whole file.
	 */
	rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (end >= zp->z_phys->zp_size) {
		zfs_range_unlock(rl);
		return (0);
	}

	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1);
	if (error) {
		zfs_range_unlock(rl);
		return (error);
	}
top:
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, zp->z_id);
	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		if (error == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		zfs_range_unlock(rl);
		return (error);
	}
	dmu_buf_will_dirty(zp->z_dbuf, tx);

	zp->z_phys->zp_size = end;

	dmu_tx_commit(tx);

	zfs_range_unlock(rl);

	/*
	 * Clear any mapped pages in the truncated region.  This has to
	 * happen outside of the transaction to avoid the possibility of
	 * a deadlock with someone trying to push a page that we are
	 * about to invalidate.
	 */

	uvm_vnp_setsize(vp, end);

	return (0);
}

/*
 * Free space in a file
 *
 *	IN:	zp	- znode of file to free data in.
 *		off	- start of range
 *		len	- end of range (0 => EOF)
 *		flag	- current file open mode flags.
 *		log	- TRUE if this action should be logged
 *
 *	RETURN:	0 if success
 *		error code if failure
 */
int
zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
{
	vnode_t *vp = ZTOV(zp);
	dmu_tx_t *tx;
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	zilog_t *zilog = zfsvfs->z_log;
	int error;

	if (off > zp->z_phys->zp_size) {
		error = zfs_extend(zp, off+len);
		if (error == 0 && log)
			goto log;
		else
			return (error);
	}

	if (len == 0) {
		error = zfs_trunc(zp, off);
	} else {
		if ((error = zfs_free_range(zp, off, len)) == 0 &&
		    off + len > zp->z_phys->zp_size)
			error = zfs_extend(zp, off+len);
	}
	if (error || !log)
		return (error);
log:
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, zp->z_id);
	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		if (error == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto log;
		}
		dmu_tx_abort(tx);
		return (error);
	}

	zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
	zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);

	dmu_tx_commit(tx);
	return (0);
}

void
zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
{
	zfsvfs_t	zfsvfs;
	uint64_t	moid, obj, version;
	uint64_t	sense = ZFS_CASE_SENSITIVE;
	uint64_t	norm = 0;
	nvpair_t	*elem;
	int		error;
	int		i;
	znode_t		*rootzp = NULL;
	vattr_t		vattr;
	znode_t		*zp;
	zfs_acl_ids_t	acl_ids;

	/*
	 * First attempt to create master node.
	 */
	/*
	 * In an empty objset, there are no blocks to read and thus
	 * there can be no i/o errors (which we assert below).
	 */
	moid = MASTER_NODE_OBJ;
	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	/*
	 * Set starting attributes.
	 */
	if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_USERSPACE)
		version = ZPL_VERSION;
	else if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
		version = ZPL_VERSION_USERSPACE - 1;
	else
		version = ZPL_VERSION_FUID - 1;
	elem = NULL;
	while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
		/* For the moment we expect all zpl props to be uint64_ts */
		uint64_t val;
		char *name;

		ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
		VERIFY(nvpair_value_uint64(elem, &val) == 0);
		name = nvpair_name(elem);
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
			if (val < version)
				version = val;
		} else {
			error = zap_update(os, moid, name, 8, 1, &val, tx);
		}
		ASSERT(error == 0);
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
			norm = val;
		else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
			sense = val;
	}
	ASSERT(version != 0);
	error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);

	/*
	 * Create a delete queue.
	 */
	obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);

	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
	ASSERT(error == 0);

	/*
	 * Create root znode.  Create minimal znode/vnode/zfsvfs
	 * to allow zfs_mknode to work.
	 */
	vattr_null(&vattr);
	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
	vattr.va_type = VDIR;
	vattr.va_mode = S_IFDIR|0755;
	vattr.va_uid = crgetuid(cr);
	vattr.va_gid = crgetgid(cr);

	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	rootzp->z_unlinked = 0;
	rootzp->z_atime_dirty = 0;

	bzero(&zfsvfs, sizeof (zfsvfs_t));

	zfsvfs.z_os = os;
	zfsvfs.z_parent = &zfsvfs;
	zfsvfs.z_version = version;
	zfsvfs.z_use_fuids = USE_FUIDS(version, os);
	zfsvfs.z_norm = norm;
	/*
	 * Fold case on file systems that are always or sometimes case
	 * insensitive.
	 */
	if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
		zfsvfs.z_norm |= U8_TEXTPREP_TOUPPER;

	mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
	    offsetof(znode_t, z_link_node));

	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
		mutex_init(&zfsvfs.z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);

	ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
	rootzp->z_zfsvfs = &zfsvfs;
	VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
	    cr, NULL, &acl_ids));
	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, &acl_ids);
	ASSERT3P(zp, ==, rootzp);
	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
	ASSERT(error == 0);
	zfs_acl_ids_free(&acl_ids);
	POINTER_INVALIDATE(&rootzp->z_zfsvfs);

	dmu_buf_rele(rootzp->z_dbuf, NULL);
	rootzp->z_dbuf = NULL;
	kmem_cache_free(znode_cache, rootzp);

	/*
	 * Create shares directory
	 */

	error = zfs_create_share_dir(&zfsvfs, tx);

	ASSERT(error == 0);

	mutex_destroy(&zfsvfs.z_znodes_lock);
	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
		mutex_destroy(&zfsvfs.z_hold_mtx[i]);
}

#endif /* _KERNEL */
/*
 * Given an object number, return its parent object number and whether
 * or not the object is an extended attribute directory.
 */
static int
zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir)
{
	dmu_buf_t *db;
	dmu_object_info_t doi;
	znode_phys_t *zp;
	int error;

	if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0)
		return (error);

	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
		dmu_buf_rele(db, FTAG);
		return (EINVAL);
	}

	zp = db->db_data;
	*pobjp = zp->zp_parent;
	*is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) &&
	    S_ISDIR(zp->zp_mode);
	dmu_buf_rele(db, FTAG);

	return (0);
}

int
zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
{
	char *path = buf + len - 1;
	int error;

	*path = '\0';

	for (;;) {
		uint64_t pobj;
		char component[MAXNAMELEN + 2];
		size_t complen;
		int is_xattrdir;

		if ((error = zfs_obj_to_pobj(osp, obj, &pobj,
		    &is_xattrdir)) != 0)
			break;

		if (pobj == obj) {
			if (path[0] != '/')
				*--path = '/';
			break;
		}

		component[0] = '/';
		if (is_xattrdir) {
			(void) snprintf(component + 1, sizeof(component) - 1,
			    "<xattrdir>");
		} else {
			error = zap_value_search(osp, pobj, obj,
			    ZFS_DIRENT_OBJ(-1ULL), component + 1);
			if (error != 0)
				break;
		}

		complen = strlen(component);
		path -= complen;
		ASSERT(path >= buf);
		bcopy(component, path, complen);
		obj = pobj;
	}

	if (error == 0)
		(void) memmove(buf, path, buf + len - path);
	return (error);
}
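
/*
 * Illustrative sketch (hypothetical caller): zfs_obj_to_path() builds the
 * path component by component from the end of the buffer backwards, walking
 * zp_parent pointers toward the root, then memmove()s the finished string
 * to the front, so the caller just supplies an ordinary buffer/length pair.
 */
#if 0
static void
zfs_obj_to_path_usage_sketch(objset_t *osp, uint64_t obj)
{
	char path[MAXPATHLEN];

	if (zfs_obj_to_path(osp, obj, path, sizeof (path)) == 0)
		dprintf("object %llu is %s\n", (u_longlong_t)obj, path);
}
#endif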