1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/param.h> 28 #include <sys/systm.h> 29 #include <sys/sysmacros.h> 30 #include <sys/kmem.h> 31 #include <sys/pathname.h> 32 #include <sys/vnode.h> 33 #include <sys/vfs.h> 34 #include <sys/vfs_opreg.h> 35 #include <sys/mntent.h> 36 #include <sys/mount.h> 37 #include <sys/cmn_err.h> 38 #include <sys/zfs_znode.h> 39 #include <sys/zfs_dir.h> 40 #include <sys/zil.h> 41 #include <sys/fs/zfs.h> 42 #include <sys/dmu.h> 43 #include <sys/dsl_prop.h> 44 #include <sys/dsl_dataset.h> 45 #include <sys/dsl_deleg.h> 46 #include <sys/spa.h> 47 #include <sys/zap.h> 48 #include <sys/varargs.h> 49 #include <sys/policy.h> 50 #include <sys/atomic.h> 51 #include <sys/mkdev.h> 52 #include <sys/modctl.h> 53 #include <sys/zfs_ioctl.h> 54 #include <sys/zfs_ctldir.h> 55 #include <sys/zfs_fuid.h> 56 #include <sys/sunddi.h> 57 #include <sys/dnlc.h> 58 #include <sys/dmu_objset.h> 59 #include <sys/spa_boot.h> 60 61 #ifdef __NetBSD__ 62 /* include ddi_name_to_major function is there better place for it ?*/ 
#include <sys/ddi.h>
#include <sys/systm.h>
#endif

/* File-scope state shared by the ZFS VFS entry points in this file. */
int zfsfstype;
vfsops_t *zfs_vfsops = NULL;
static major_t zfs_major;
static minor_t zfs_minor;
/* Held while zfs_major/zfs_minor are updated in zfs_create_unique_device(). */
static kmutex_t zfs_dev_mtx;

int zfs_debug_level;
kmutex_t zfs_debug_mtx;

/* XXX NetBSD static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);*/
static int zfs_mount(vfs_t *vfsp, const char *path, void *data, size_t *data_len);
static int zfs_umount(vfs_t *vfsp, int fflag);
static int zfs_root(vfs_t *vfsp, vnode_t **vpp);
static int zfs_statvfs(vfs_t *vfsp, struct statvfs *statp);
static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp);
static int zfs_vget(vfs_t *vfsp, ino_t ino, vnode_t **vpp);
static int zfs_start(vfs_t *vfsp, int flags);
static void zfs_freevfs(vfs_t *vfsp);

void zfs_init(void);
void zfs_fini(void);


extern const struct vnodeopv_desc zfs_vnodeop_opv_desc;

/* NULL-terminated list of vnode operation vectors referenced by the vfsops. */
static const struct vnodeopv_desc * const zfs_vnodeop_descs[] = {
	&zfs_vnodeop_opv_desc,
	NULL,
};

/*
 * VFS operations table for ZFS.  Operations that are not implemented
 * are wired to nullop (no-op) or eopnotsupp.
 *
 * NOTE(review): .vfs_fhtovp is initialized twice below.  With designated
 * initializers the later entry overrides the earlier one, so the
 * effective handler is eopnotsupp and zfs_fhtovp is never installed --
 * confirm that this is intended.
 */
static struct vfsops zfs_vfsops_template = {
	.vfs_name = MOUNT_ZFS,
	.vfs_min_mount_data = sizeof(struct zfs_args),
	.vfs_opv_descs = zfs_vnodeop_descs,
	.vfs_mount = zfs_mount,
	.vfs_unmount = zfs_umount,
	.vfs_root = zfs_root,
	.vfs_statvfs = zfs_statvfs,
	.vfs_sync = zfs_sync,
	.vfs_vget = zfs_vget,
	.vfs_fhtovp = zfs_fhtovp,
	.vfs_init = zfs_init,
	.vfs_done = zfs_fini,
	.vfs_start = zfs_start,
	.vfs_renamelock_enter = (void*)nullop,
	.vfs_renamelock_exit = (void*)nullop,
	.vfs_reinit = (void *)nullop,
	.vfs_vptofh = (void *)eopnotsupp,
	.vfs_fhtovp = (void *)eopnotsupp,
	.vfs_quotactl = (void *)eopnotsupp,
	.vfs_extattrctl = (void *)eopnotsupp,
	.vfs_snapshot = (void *)eopnotsupp,
	.vfs_fsync = (void *)eopnotsupp,
};

/*
 * We need to keep a count of active fs's.
124 * This is necessary to prevent our module 125 * from being unloaded after a umount -f 126 */ 127 static uint32_t zfs_active_fs_count = 0; 128 129 static char *noatime_cancel[] = { MNTOPT_ATIME, NULL }; 130 static char *atime_cancel[] = { MNTOPT_NOATIME, NULL }; 131 static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL }; 132 static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL }; 133 134 /* 135 * MO_DEFAULT is not used since the default value is determined 136 * by the equivalent property. 137 */ 138 static mntopt_t mntopts[] = { 139 { MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL }, 140 { MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL }, 141 { MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL }, 142 { MNTOPT_ATIME, atime_cancel, NULL, 0, NULL } 143 }; 144 145 static mntopts_t zfs_mntopts = { 146 sizeof (mntopts) / sizeof (mntopt_t), 147 mntopts 148 }; 149 150 /*ARGSUSED*/ 151 int 152 zfs_sync(vfs_t *vfsp, int flag, cred_t *cr) 153 { 154 zfsvfs_t *zfsvfs = vfsp->vfs_data; 155 znode_t *zp; 156 vnode_t *vp, *nvp, *mvp; 157 dmu_tx_t *tx; 158 int error; 159 160 161 error = 0; 162 163 /* 164 * Data integrity is job one. We don't want a compromised kernel 165 * writing to the storage pool, so we never sync during panic. 166 */ 167 if (panicstr) 168 return (0); 169 170 /* Allocate a marker vnode. */ 171 mvp = vnalloc(vfsp); 172 173 /* 174 * On NetBSD, we need to push out atime updates. Solaris does 175 * this during VOP_INACTIVE, but that does not work well with the 176 * BSD VFS, so we do it in batch here. 177 */ 178 mutex_enter(&mntvnode_lock); 179 loop: 180 for (vp = TAILQ_FIRST(&vfsp->mnt_vnodelist); vp; vp = nvp) { 181 nvp = TAILQ_NEXT(vp, v_mntvnodes); 182 /* 183 * If the vnode that we are about to sync is no 184 * longer associated with this mount point, start 185 * over. 186 */ 187 if (vp->v_mount != vfsp) 188 goto loop; 189 /* 190 * Don't interfere with concurrent scans of this FS. 
191 */ 192 if (vismarker(vp)) 193 continue; 194 /* 195 * Skip the vnode/inode if inaccessible, or if the 196 * atime is clean. 197 */ 198 mutex_enter(&vp->v_interlock); 199 zp = VTOZ(vp); 200 if (zp == NULL || vp->v_type == VNON || 201 (vp->v_iflag & (VI_XLOCK | VI_CLEAN)) != 0 || 202 zp->z_atime_dirty == 0 || zp->z_unlinked) { 203 mutex_exit(&vp->v_interlock); 204 continue; 205 } 206 vmark(mvp, vp); 207 mutex_exit(&mntvnode_lock); 208 error = vget(vp, LK_EXCLUSIVE); 209 if (error) { 210 mutex_enter(&mntvnode_lock); 211 nvp = vunmark(mvp); 212 if (error == ENOENT) { 213 goto loop; 214 } 215 continue; 216 } 217 tx = dmu_tx_create(zfsvfs->z_os); 218 dmu_tx_hold_bonus(tx, zp->z_id); 219 error = dmu_tx_assign(tx, TXG_WAIT); 220 if (error) { 221 dmu_tx_abort(tx); 222 } else { 223 dmu_buf_will_dirty(zp->z_dbuf, tx); 224 mutex_enter(&zp->z_lock); 225 zp->z_atime_dirty = 0; 226 mutex_exit(&zp->z_lock); 227 dmu_tx_commit(tx); 228 } 229 vput(vp); 230 mutex_enter(&mntvnode_lock); 231 nvp = vunmark(mvp); 232 } 233 mutex_exit(&mntvnode_lock); 234 235 /* 236 * SYNC_ATTR is used by fsflush() to force old filesystems like UFS 237 * to sync metadata, which they would otherwise cache indefinitely. 238 * Semantically, the only requirement is that the sync be initiated. 239 * The DMU syncs out txgs frequently, so there's nothing to do. 240 */ 241 if ((flag & MNT_LAZY) != 0) 242 return (0); 243 244 if (vfsp != NULL) { 245 /* 246 * Sync a specific filesystem. 247 */ 248 zfsvfs_t *zfsvfs = vfsp->vfs_data; 249 dsl_pool_t *dp; 250 251 ZFS_ENTER(zfsvfs); 252 dp = dmu_objset_pool(zfsvfs->z_os); 253 254 /* 255 * If the system is shutting down, then skip any 256 * filesystems which may exist on a suspended pool. 
257 */ 258 if (sys_shutdown && spa_suspended(dp->dp_spa)) { 259 ZFS_EXIT(zfsvfs); 260 return (0); 261 } 262 263 if (zfsvfs->z_log != NULL) 264 zil_commit(zfsvfs->z_log, UINT64_MAX, 0); 265 else 266 txg_wait_synced(dp, 0); 267 ZFS_EXIT(zfsvfs); 268 } else { 269 /* 270 * Sync all ZFS filesystems. This is what happens when you 271 * run sync(1M). Unlike other filesystems, ZFS honors the 272 * request by waiting for all pools to commit all dirty data. 273 */ 274 spa_sync_allpools(); 275 } 276 277 vnfree(nvp); 278 279 return (0); 280 } 281 282 static int 283 zfs_create_unique_device(dev_t *dev) 284 { 285 major_t new_major; 286 287 do { 288 ASSERT3U(zfs_minor, <=, MAXMIN); 289 minor_t start = zfs_minor; 290 do { 291 mutex_enter(&zfs_dev_mtx); 292 if (zfs_minor >= MAXMIN) { 293 /* 294 * If we're still using the real major 295 * keep out of /dev/zfs and /dev/zvol minor 296 * number space. If we're using a getudev()'ed 297 * major number, we can use all of its minors. 298 */ 299 if (zfs_major == ddi_name_to_major(ZFS_DRIVER)) 300 zfs_minor = ZFS_MIN_MINOR; 301 else 302 zfs_minor = 0; 303 } else { 304 zfs_minor++; 305 } 306 *dev = makedevice(zfs_major, zfs_minor); 307 mutex_exit(&zfs_dev_mtx); 308 } while (vfs_devismounted(*dev) && zfs_minor != start); 309 break; 310 #ifndef __NetBSD__ 311 if (zfs_minor == start) { 312 /* 313 * We are using all ~262,000 minor numbers for the 314 * current major number. Create a new major number. 
315 */ 316 if ((new_major = getudev()) == (major_t)-1) { 317 cmn_err(CE_WARN, 318 "zfs_mount: Can't get unique major " 319 "device number."); 320 return (-1); 321 } 322 mutex_enter(&zfs_dev_mtx); 323 zfs_major = new_major; 324 zfs_minor = 0; 325 326 mutex_exit(&zfs_dev_mtx); 327 } else { 328 break; 329 } 330 /* CONSTANTCONDITION */ 331 #endif 332 } while (1); 333 334 return (0); 335 } 336 337 static void 338 atime_changed_cb(void *arg, uint64_t newval) 339 { 340 zfsvfs_t *zfsvfs = arg; 341 342 if (newval == TRUE) { 343 zfsvfs->z_atime = TRUE; 344 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); 345 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); 346 } else { 347 zfsvfs->z_atime = FALSE; 348 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); 349 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); 350 } 351 } 352 353 static void 354 xattr_changed_cb(void *arg, uint64_t newval) 355 { 356 zfsvfs_t *zfsvfs = arg; 357 358 if (newval == TRUE) { 359 /* XXX locking on vfs_flag? */ 360 #ifdef TODO 361 zfsvfs->z_vfs->vfs_flag |= VFS_XATTR; 362 #endif 363 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR); 364 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0); 365 } else { 366 /* XXX locking on vfs_flag? */ 367 #ifdef TODO 368 zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR; 369 #endif 370 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR); 371 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0); 372 } 373 } 374 375 static void 376 blksz_changed_cb(void *arg, uint64_t newval) 377 { 378 zfsvfs_t *zfsvfs = arg; 379 380 if (newval < SPA_MINBLOCKSIZE || 381 newval > SPA_MAXBLOCKSIZE || !ISP2(newval)) 382 newval = SPA_MAXBLOCKSIZE; 383 384 zfsvfs->z_max_blksz = newval; 385 zfsvfs->z_vfs->vfs_bsize = newval; 386 } 387 388 static void 389 readonly_changed_cb(void *arg, uint64_t newval) 390 { 391 zfsvfs_t *zfsvfs = arg; 392 393 if (newval) { 394 /* XXX locking on vfs_flag? 
*/ 395 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; 396 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); 397 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); 398 } else { 399 /* XXX locking on vfs_flag? */ 400 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; 401 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); 402 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); 403 } 404 } 405 406 static void 407 devices_changed_cb(void *arg, uint64_t newval) 408 { 409 zfsvfs_t *zfsvfs = arg; 410 411 if (newval == FALSE) { 412 zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES; 413 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES); 414 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0); 415 } else { 416 zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES; 417 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES); 418 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0); 419 } 420 } 421 422 static void 423 setuid_changed_cb(void *arg, uint64_t newval) 424 { 425 zfsvfs_t *zfsvfs = arg; 426 427 if (newval == FALSE) { 428 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; 429 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); 430 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); 431 } else { 432 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; 433 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); 434 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); 435 } 436 } 437 438 static void 439 exec_changed_cb(void *arg, uint64_t newval) 440 { 441 zfsvfs_t *zfsvfs = arg; 442 443 if (newval == FALSE) { 444 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; 445 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); 446 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); 447 } else { 448 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; 449 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); 450 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); 451 } 452 } 453 454 /* 455 * The nbmand mount option can be changed at mount time. 
456 * We can't allow it to be toggled on live file systems or incorrect 457 * behavior may be seen from cifs clients 458 * 459 * This property isn't registered via dsl_prop_register(), but this callback 460 * will be called when a file system is first mounted 461 */ 462 static void 463 nbmand_changed_cb(void *arg, uint64_t newval) 464 { 465 zfsvfs_t *zfsvfs = arg; 466 if (newval == FALSE) { 467 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); 468 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); 469 } else { 470 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); 471 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); 472 } 473 } 474 475 static void 476 snapdir_changed_cb(void *arg, uint64_t newval) 477 { 478 zfsvfs_t *zfsvfs = arg; 479 480 zfsvfs->z_show_ctldir = newval; 481 } 482 483 static void 484 vscan_changed_cb(void *arg, uint64_t newval) 485 { 486 zfsvfs_t *zfsvfs = arg; 487 488 zfsvfs->z_vscan = newval; 489 } 490 491 static void 492 acl_mode_changed_cb(void *arg, uint64_t newval) 493 { 494 zfsvfs_t *zfsvfs = arg; 495 496 zfsvfs->z_acl_mode = newval; 497 } 498 499 static void 500 acl_inherit_changed_cb(void *arg, uint64_t newval) 501 { 502 zfsvfs_t *zfsvfs = arg; 503 504 zfsvfs->z_acl_inherit = newval; 505 } 506 507 static int 508 zfs_register_callbacks(vfs_t *vfsp) 509 { 510 struct dsl_dataset *ds = NULL; 511 objset_t *os = NULL; 512 zfsvfs_t *zfsvfs = NULL; 513 uint64_t nbmand; 514 int readonly, do_readonly = B_FALSE; 515 int setuid, do_setuid = B_FALSE; 516 int exec, do_exec = B_FALSE; 517 int devices, do_devices = B_FALSE; 518 int xattr, do_xattr = B_FALSE; 519 int atime, do_atime = B_FALSE; 520 int error = 0; 521 522 ASSERT(vfsp); 523 zfsvfs = vfsp->vfs_data; 524 ASSERT(zfsvfs); 525 os = zfsvfs->z_os; 526 527 /* 528 * The act of registering our callbacks will destroy any mount 529 * options we may have. 
In order to enable temporary overrides 530 * of mount options, we stash away the current values and 531 * restore them after we register the callbacks. 532 */ 533 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { 534 readonly = B_TRUE; 535 do_readonly = B_TRUE; 536 } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { 537 readonly = B_FALSE; 538 do_readonly = B_TRUE; 539 } 540 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { 541 devices = B_FALSE; 542 setuid = B_FALSE; 543 do_devices = B_TRUE; 544 do_setuid = B_TRUE; 545 } else { 546 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) { 547 devices = B_FALSE; 548 do_devices = B_TRUE; 549 } else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) { 550 devices = B_TRUE; 551 do_devices = B_TRUE; 552 } 553 554 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { 555 setuid = B_FALSE; 556 do_setuid = B_TRUE; 557 } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { 558 setuid = B_TRUE; 559 do_setuid = B_TRUE; 560 } 561 } 562 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { 563 exec = B_FALSE; 564 do_exec = B_TRUE; 565 } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { 566 exec = B_TRUE; 567 do_exec = B_TRUE; 568 } 569 if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { 570 xattr = B_FALSE; 571 do_xattr = B_TRUE; 572 } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { 573 xattr = B_TRUE; 574 do_xattr = B_TRUE; 575 } 576 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { 577 atime = B_FALSE; 578 do_atime = B_TRUE; 579 } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { 580 atime = B_TRUE; 581 do_atime = B_TRUE; 582 } 583 584 /* 585 * nbmand is a special property. It can only be changed at 586 * mount time. 587 * 588 * This is weird, but it is documented to only be changeable 589 * at mount time. 
590 */ 591 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { 592 nbmand = B_FALSE; 593 } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { 594 nbmand = B_TRUE; 595 } else { 596 char osname[MAXNAMELEN]; 597 598 dmu_objset_name(os, osname); 599 if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand, 600 NULL)) { 601 return (error); 602 } 603 } 604 605 /* 606 * Register property callbacks. 607 * 608 * It would probably be fine to just check for i/o error from 609 * the first prop_register(), but I guess I like to go 610 * overboard... 611 */ 612 ds = dmu_objset_ds(os); 613 error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs); 614 error = error ? error : dsl_prop_register(ds, 615 "xattr", xattr_changed_cb, zfsvfs); 616 error = error ? error : dsl_prop_register(ds, 617 "recordsize", blksz_changed_cb, zfsvfs); 618 error = error ? error : dsl_prop_register(ds, 619 "readonly", readonly_changed_cb, zfsvfs); 620 error = error ? error : dsl_prop_register(ds, 621 "devices", devices_changed_cb, zfsvfs); 622 error = error ? error : dsl_prop_register(ds, 623 "setuid", setuid_changed_cb, zfsvfs); 624 error = error ? error : dsl_prop_register(ds, 625 "exec", exec_changed_cb, zfsvfs); 626 error = error ? error : dsl_prop_register(ds, 627 "snapdir", snapdir_changed_cb, zfsvfs); 628 error = error ? error : dsl_prop_register(ds, 629 "aclmode", acl_mode_changed_cb, zfsvfs); 630 error = error ? error : dsl_prop_register(ds, 631 "aclinherit", acl_inherit_changed_cb, zfsvfs); 632 error = error ? error : dsl_prop_register(ds, 633 "vscan", vscan_changed_cb, zfsvfs); 634 if (error) 635 goto unregister; 636 637 /* 638 * Invoke our callbacks to restore temporary mount options. 
639 */ 640 if (do_readonly) 641 readonly_changed_cb(zfsvfs, readonly); 642 if (do_setuid) 643 setuid_changed_cb(zfsvfs, setuid); 644 if (do_exec) 645 exec_changed_cb(zfsvfs, exec); 646 if (do_devices) 647 devices_changed_cb(zfsvfs, devices); 648 if (do_xattr) 649 xattr_changed_cb(zfsvfs, xattr); 650 if (do_atime) 651 atime_changed_cb(zfsvfs, atime); 652 653 nbmand_changed_cb(zfsvfs, nbmand); 654 655 return (0); 656 657 unregister: 658 /* 659 * We may attempt to unregister some callbacks that are not 660 * registered, but this is OK; it will simply return ENOMSG, 661 * which we will ignore. 662 */ 663 (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs); 664 (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs); 665 (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs); 666 (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs); 667 (void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zfsvfs); 668 (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs); 669 (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs); 670 (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs); 671 (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs); 672 (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, 673 zfsvfs); 674 (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs); 675 return (error); 676 677 } 678 679 static void 680 uidacct(objset_t *os, boolean_t isgroup, uint64_t fuid, 681 int64_t delta, dmu_tx_t *tx) 682 { 683 uint64_t used = 0; 684 char buf[32]; 685 int err; 686 uint64_t obj = isgroup ? 
DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; 687 688 if (delta == 0) 689 return; 690 691 (void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)fuid); 692 err = zap_lookup(os, obj, buf, 8, 1, &used); 693 ASSERT(err == 0 || err == ENOENT); 694 /* no underflow/overflow */ 695 ASSERT(delta > 0 || used >= -delta); 696 ASSERT(delta < 0 || used + delta > used); 697 used += delta; 698 if (used == 0) 699 err = zap_remove(os, obj, buf, tx); 700 else 701 err = zap_update(os, obj, buf, 8, 1, &used, tx); 702 ASSERT(err == 0); 703 } 704 705 static int 706 zfs_space_delta_cb(dmu_object_type_t bonustype, void *bonus, 707 uint64_t *userp, uint64_t *groupp) 708 { 709 znode_phys_t *znp = bonus; 710 711 if (bonustype != DMU_OT_ZNODE) 712 return (ENOENT); 713 714 *userp = znp->zp_uid; 715 *groupp = znp->zp_gid; 716 return (0); 717 } 718 719 static void 720 fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr, 721 char *domainbuf, int buflen, uid_t *ridp) 722 { 723 uint64_t fuid; 724 const char *domain; 725 726 fuid = strtonum(fuidstr, NULL); 727 728 domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid)); 729 if (domain) 730 (void) strlcpy(domainbuf, domain, buflen); 731 else 732 domainbuf[0] = '\0'; 733 *ridp = FUID_RID(fuid); 734 } 735 736 static uint64_t 737 zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type) 738 { 739 switch (type) { 740 case ZFS_PROP_USERUSED: 741 return (DMU_USERUSED_OBJECT); 742 case ZFS_PROP_GROUPUSED: 743 return (DMU_GROUPUSED_OBJECT); 744 case ZFS_PROP_USERQUOTA: 745 return (zfsvfs->z_userquota_obj); 746 case ZFS_PROP_GROUPQUOTA: 747 return (zfsvfs->z_groupquota_obj); 748 } 749 return (0); 750 } 751 752 int 753 zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, 754 uint64_t *cookiep, void *vbuf, uint64_t *bufsizep) 755 { 756 int error; 757 zap_cursor_t zc; 758 zap_attribute_t za; 759 zfs_useracct_t *buf = vbuf; 760 uint64_t obj; 761 762 if (!dmu_objset_userspace_present(zfsvfs->z_os)) 763 return (ENOTSUP); 764 765 obj = 
zfs_userquota_prop_to_obj(zfsvfs, type); 766 if (obj == 0) { 767 *bufsizep = 0; 768 return (0); 769 } 770 771 for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep); 772 (error = zap_cursor_retrieve(&zc, &za)) == 0; 773 zap_cursor_advance(&zc)) { 774 if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) > 775 *bufsizep) 776 break; 777 778 fuidstr_to_sid(zfsvfs, za.za_name, 779 buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid); 780 781 buf->zu_space = za.za_first_integer; 782 buf++; 783 } 784 if (error == ENOENT) 785 error = 0; 786 787 ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep); 788 *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf; 789 *cookiep = zap_cursor_serialize(&zc); 790 zap_cursor_fini(&zc); 791 return (error); 792 } 793 794 /* 795 * buf must be big enough (eg, 32 bytes) 796 */ 797 static int 798 id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid, 799 char *buf, boolean_t addok) 800 { 801 uint64_t fuid; 802 int domainid = 0; 803 804 if (domain && domain[0]) { 805 domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok); 806 if (domainid == -1) 807 return (ENOENT); 808 } 809 fuid = FUID_ENCODE(domainid, rid); 810 (void) sprintf(buf, "%llx", (longlong_t)fuid); 811 return (0); 812 } 813 814 int 815 zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, 816 const char *domain, uint64_t rid, uint64_t *valp) 817 { 818 char buf[32]; 819 int err; 820 uint64_t obj; 821 822 *valp = 0; 823 824 if (!dmu_objset_userspace_present(zfsvfs->z_os)) 825 return (ENOTSUP); 826 827 obj = zfs_userquota_prop_to_obj(zfsvfs, type); 828 if (obj == 0) 829 return (0); 830 831 err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE); 832 if (err) 833 return (err); 834 835 err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp); 836 if (err == ENOENT) 837 err = 0; 838 return (err); 839 } 840 841 int 842 zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, 843 const char *domain, uint64_t rid, uint64_t quota) 844 { 845 char 
buf[32]; 846 int err; 847 dmu_tx_t *tx; 848 uint64_t *objp; 849 boolean_t fuid_dirtied; 850 851 if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA) 852 return (EINVAL); 853 854 if (zfsvfs->z_version < ZPL_VERSION_USERSPACE) 855 return (ENOTSUP); 856 857 objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj : 858 &zfsvfs->z_groupquota_obj; 859 860 err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE); 861 if (err) 862 return (err); 863 fuid_dirtied = zfsvfs->z_fuid_dirty; 864 865 tx = dmu_tx_create(zfsvfs->z_os); 866 dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL); 867 if (*objp == 0) { 868 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, 869 zfs_userquota_prop_prefixes[type]); 870 } 871 if (fuid_dirtied) 872 zfs_fuid_txhold(zfsvfs, tx); 873 err = dmu_tx_assign(tx, TXG_WAIT); 874 if (err) { 875 dmu_tx_abort(tx); 876 return (err); 877 } 878 879 mutex_enter(&zfsvfs->z_lock); 880 if (*objp == 0) { 881 *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA, 882 DMU_OT_NONE, 0, tx); 883 VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, 884 zfs_userquota_prop_prefixes[type], 8, 1, objp, tx)); 885 } 886 mutex_exit(&zfsvfs->z_lock); 887 888 if (quota == 0) { 889 err = zap_remove(zfsvfs->z_os, *objp, buf, tx); 890 if (err == ENOENT) 891 err = 0; 892 } else { 893 err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, "a, tx); 894 } 895 ASSERT(err == 0); 896 if (fuid_dirtied) 897 zfs_fuid_sync(zfsvfs, tx); 898 dmu_tx_commit(tx); 899 return (err); 900 } 901 902 boolean_t 903 zfs_usergroup_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid) 904 { 905 char buf[32]; 906 uint64_t used, quota, usedobj, quotaobj; 907 int err; 908 909 usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; 910 quotaobj = isgroup ? 
zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; 911 912 if (quotaobj == 0 || zfsvfs->z_replay) 913 return (B_FALSE); 914 915 (void) sprintf(buf, "%llx", (longlong_t)fuid); 916 err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, "a); 917 if (err != 0) 918 return (B_FALSE); 919 920 err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); 921 if (err != 0) 922 return (B_FALSE); 923 return (used >= quota); 924 } 925 926 int 927 zfsvfs_create(const char *osname, zfsvfs_t **zfvp) 928 { 929 objset_t *os; 930 zfsvfs_t *zfsvfs; 931 uint64_t zval; 932 int i, error; 933 934 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); 935 936 /* 937 * We claim to always be readonly so we can open snapshots; 938 * other ZPL code will prevent us from writing to snapshots. 939 */ 940 error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os); 941 if (error) { 942 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 943 return (error); 944 } 945 946 /* 947 * Initialize the zfs-specific filesystem structure. 948 * Should probably make this a kmem cache, shuffle fields, 949 * and just bzero up to z_hold_mtx[]. 
950 */ 951 zfsvfs->z_vfs = NULL; 952 zfsvfs->z_parent = zfsvfs; 953 zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; 954 zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; 955 zfsvfs->z_os = os; 956 957 error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); 958 if (error) { 959 goto out; 960 } else if (zfsvfs->z_version > ZPL_VERSION) { 961 (void) printf("Mismatched versions: File system " 962 "is version %llu on-disk format, which is " 963 "incompatible with this software version %lld!", 964 (u_longlong_t)zfsvfs->z_version, ZPL_VERSION); 965 error = ENOTSUP; 966 goto out; 967 } 968 969 if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0) 970 goto out; 971 zfsvfs->z_norm = (int)zval; 972 973 if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0) 974 goto out; 975 zfsvfs->z_utf8 = (zval != 0); 976 977 if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0) 978 goto out; 979 zfsvfs->z_case = (uint_t)zval; 980 981 /* 982 * Fold case on file systems that are always or sometimes case 983 * insensitive. 
984 */ 985 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 986 zfsvfs->z_case == ZFS_CASE_MIXED) 987 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; 988 989 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); 990 991 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, 992 &zfsvfs->z_root); 993 if (error) 994 goto out; 995 ASSERT(zfsvfs->z_root != 0); 996 997 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, 998 &zfsvfs->z_unlinkedobj); 999 if (error) 1000 goto out; 1001 1002 error = zap_lookup(os, MASTER_NODE_OBJ, 1003 zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], 1004 8, 1, &zfsvfs->z_userquota_obj); 1005 if (error && error != ENOENT) 1006 goto out; 1007 1008 error = zap_lookup(os, MASTER_NODE_OBJ, 1009 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], 1010 8, 1, &zfsvfs->z_groupquota_obj); 1011 if (error && error != ENOENT) 1012 goto out; 1013 1014 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, 1015 &zfsvfs->z_fuid_obj); 1016 if (error && error != ENOENT) 1017 goto out; 1018 1019 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, 1020 &zfsvfs->z_shares_dir); 1021 if (error && error != ENOENT) 1022 goto out; 1023 1024 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); 1025 mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); 1026 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), 1027 offsetof(znode_t, z_link_node)); 1028 rrw_init(&zfsvfs->z_teardown_lock); 1029 rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); 1030 rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); 1031 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1032 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); 1033 1034 *zfvp = zfsvfs; 1035 return (0); 1036 1037 out: 1038 dmu_objset_disown(os, zfsvfs); 1039 *zfvp = NULL; 1040 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1041 return (error); 1042 } 1043 1044 static int 1045 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) 1046 { 1047 int error; 1048 1049 error = 
zfs_register_callbacks(zfsvfs->z_vfs); 1050 if (error) 1051 return (error); 1052 1053 /* 1054 * Set the objset user_ptr to track its zfsvfs. 1055 */ 1056 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); 1057 dmu_objset_set_user(zfsvfs->z_os, zfsvfs); 1058 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); 1059 1060 zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); 1061 if (zil_disable) { 1062 zil_destroy(zfsvfs->z_log, B_FALSE); 1063 zfsvfs->z_log = NULL; 1064 } 1065 1066 /* 1067 * If we are not mounting (ie: online recv), then we don't 1068 * have to worry about replaying the log as we blocked all 1069 * operations out since we closed the ZIL. 1070 */ 1071 if (mounting) { 1072 boolean_t readonly; 1073 1074 /* 1075 * During replay we remove the read only flag to 1076 * allow replays to succeed. 1077 */ 1078 readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; 1079 if (readonly != 0) 1080 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; 1081 else 1082 zfs_unlinked_drain(zfsvfs); 1083 1084 if (zfsvfs->z_log) { 1085 /* 1086 * Parse and replay the intent log. 1087 * 1088 * Because of ziltest, this must be done after 1089 * zfs_unlinked_drain(). (Further note: ziltest 1090 * doesn't use readonly mounts, where 1091 * zfs_unlinked_drain() isn't called.) This is because 1092 * ziltest causes spa_sync() to think it's committed, 1093 * but actually it is not, so the intent log contains 1094 * many txg's worth of changes. 1095 * 1096 * In particular, if object N is in the unlinked set in 1097 * the last txg to actually sync, then it could be 1098 * actually freed in a later txg and then reallocated 1099 * in a yet later txg. This would write a "create 1100 * object N" record to the intent log. Normally, this 1101 * would be fine because the spa_sync() would have 1102 * written out the fact that object N is free, before 1103 * we could write the "create object N" intent log 1104 * record. 
1105 * 1106 * But when we are in ziltest mode, we advance the "open 1107 * txg" without actually spa_sync()-ing the changes to 1108 * disk. So we would see that object N is still 1109 * allocated and in the unlinked set, and there is an 1110 * intent log record saying to allocate it. 1111 */ 1112 zfsvfs->z_replay = B_TRUE; 1113 zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector); 1114 zfsvfs->z_replay = B_FALSE; 1115 } 1116 zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ 1117 } 1118 1119 return (0); 1120 } 1121 1122 void 1123 zfsvfs_free(zfsvfs_t *zfsvfs) 1124 { 1125 int i; 1126 extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */ 1127 1128 /* 1129 * This is a barrier to prevent the filesystem from going away in 1130 * zfs_znode_move() until we can safely ensure that the filesystem is 1131 * not unmounted. We consider the filesystem valid before the barrier 1132 * and invalid after the barrier. 1133 */ 1134 rw_enter(&zfsvfs_lock, RW_READER); 1135 rw_exit(&zfsvfs_lock); 1136 1137 zfs_fuid_destroy(zfsvfs); 1138 mutex_destroy(&zfsvfs->z_znodes_lock); 1139 mutex_destroy(&zfsvfs->z_lock); 1140 list_destroy(&zfsvfs->z_all_znodes); 1141 rrw_destroy(&zfsvfs->z_teardown_lock); 1142 rw_destroy(&zfsvfs->z_teardown_inactive_lock); 1143 rw_destroy(&zfsvfs->z_fuid_lock); 1144 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1145 mutex_destroy(&zfsvfs->z_hold_mtx[i]); 1146 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1147 } 1148 1149 static void 1150 zfs_set_fuid_feature(zfsvfs_t *zfsvfs) 1151 { 1152 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); 1153 if (zfsvfs->z_use_fuids && zfsvfs->z_vfs) { 1154 vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); 1155 vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); 1156 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); 1157 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); 1158 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); 1159 vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); 1160 } 1161 } 1162 1163 static int 1164 
zfs_domount(vfs_t *vfsp, char *osname) 1165 { 1166 dev_t mount_dev; 1167 uint64_t recordsize, fsid_guid; 1168 int error = 0; 1169 zfsvfs_t *zfsvfs; 1170 1171 ASSERT(vfsp); 1172 ASSERT(osname); 1173 1174 error = zfsvfs_create(osname, &zfsvfs); 1175 if (error) 1176 return (error); 1177 zfsvfs->z_vfs = vfsp; 1178 zfsvfs->z_parent = zfsvfs; 1179 zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; 1180 zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; 1181 1182 /* Initialize the generic filesystem structure. */ 1183 vfsp->vfs_data = NULL; 1184 1185 if (zfs_create_unique_device(&mount_dev) == -1) { 1186 error = ENODEV; 1187 goto out; 1188 } 1189 ASSERT(vfs_devismounted(mount_dev) == 0); 1190 1191 if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, 1192 NULL)) 1193 goto out; 1194 1195 vfsp->vfs_bsize = DEV_BSIZE; 1196 vfsp->vfs_flag |= VFS_NOTRUNC; 1197 vfsp->vfs_data = zfsvfs; 1198 1199 /* 1200 * The fsid is 64 bits, composed of an 8-bit fs type, which 1201 * separates our fsid from any other filesystem types, and a 1202 * 56-bit objset unique ID. The objset unique ID is unique to 1203 * all objsets open on this system, provided by unique_create(). 1204 * The 8-bit fs type must be put in the low bits of fsid[1] 1205 * because that's where other Solaris filesystems put it. 1206 */ 1207 fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); 1208 ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); 1209 vfsp->mnt_stat.f_fsidx.__fsid_val[0] = fsid_guid; 1210 vfsp->mnt_stat.f_fsidx.__fsid_val[1] = ((fsid_guid>>32) << 8) | 1211 zfsfstype & 0xFF; 1212 1213 dprintf("zfs_domount vrele after vfsp->vfs_count %d\n", vfsp->vfs_count); 1214 /* 1215 * Set features for file system. 
1216 */ 1217 zfs_set_fuid_feature(zfsvfs); 1218 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { 1219 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); 1220 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); 1221 vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); 1222 } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { 1223 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); 1224 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); 1225 } 1226 vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); 1227 1228 if (dmu_objset_is_snapshot(zfsvfs->z_os)) { 1229 uint64_t pval; 1230 1231 atime_changed_cb(zfsvfs, B_FALSE); 1232 readonly_changed_cb(zfsvfs, B_TRUE); 1233 if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL)) 1234 goto out; 1235 xattr_changed_cb(zfsvfs, pval); 1236 zfsvfs->z_issnap = B_TRUE; 1237 1238 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); 1239 dmu_objset_set_user(zfsvfs->z_os, zfsvfs); 1240 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); 1241 } else { 1242 error = zfsvfs_setup(zfsvfs, B_TRUE); 1243 } 1244 1245 dprintf("zfs_vfsops.c zfs_domount called\n"); 1246 dprintf("vfsp->vfs_count %d\n", vfsp->vfs_count); 1247 1248 if (!zfsvfs->z_issnap) 1249 zfsctl_create(zfsvfs); 1250 out: 1251 if (error) { 1252 dmu_objset_disown(zfsvfs->z_os, zfsvfs); 1253 zfsvfs_free(zfsvfs); 1254 } else { 1255 atomic_add_32(&zfs_active_fs_count, 1); 1256 } 1257 return (error); 1258 } 1259 1260 void 1261 zfs_unregister_callbacks(zfsvfs_t *zfsvfs) 1262 { 1263 objset_t *os = zfsvfs->z_os; 1264 struct dsl_dataset *ds; 1265 1266 /* 1267 * Unregister properties. 
1268 */ 1269 if (!dmu_objset_is_snapshot(os)) { 1270 ds = dmu_objset_ds(os); 1271 VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb, 1272 zfsvfs) == 0); 1273 1274 VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb, 1275 zfsvfs) == 0); 1276 1277 VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, 1278 zfsvfs) == 0); 1279 1280 VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb, 1281 zfsvfs) == 0); 1282 1283 VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb, 1284 zfsvfs) == 0); 1285 1286 VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb, 1287 zfsvfs) == 0); 1288 1289 VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb, 1290 zfsvfs) == 0); 1291 1292 VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, 1293 zfsvfs) == 0); 1294 1295 VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, 1296 zfsvfs) == 0); 1297 1298 VERIFY(dsl_prop_unregister(ds, "aclinherit", 1299 acl_inherit_changed_cb, zfsvfs) == 0); 1300 1301 VERIFY(dsl_prop_unregister(ds, "vscan", 1302 vscan_changed_cb, zfsvfs) == 0); 1303 } 1304 } 1305 1306 /* 1307 * Convert a decimal digit string to a uint64_t integer. 1308 */ 1309 static int 1310 str_to_uint64(char *str, uint64_t *objnum) 1311 { 1312 uint64_t num = 0; 1313 1314 while (*str) { 1315 if (*str < '0' || *str > '9') 1316 return (EINVAL); 1317 1318 num = num*10 + *str++ - '0'; 1319 } 1320 1321 *objnum = num; 1322 return (0); 1323 } 1324 1325 /* 1326 * The boot path passed from the boot loader is in the form of 1327 * "rootpool-name/root-filesystem-object-number'. Convert this 1328 * string to a dataset name: "rootpool-name/root-filesystem-name". 
1329 */ 1330 static int 1331 zfs_parse_bootfs(char *bpath, char *outpath) 1332 { 1333 char *slashp; 1334 uint64_t objnum; 1335 int error; 1336 1337 if (*bpath == 0 || *bpath == '/') 1338 return (EINVAL); 1339 1340 (void) strcpy(outpath, bpath); 1341 1342 slashp = strchr(bpath, '/'); 1343 1344 /* if no '/', just return the pool name */ 1345 if (slashp == NULL) { 1346 return (0); 1347 } 1348 1349 /* if not a number, just return the root dataset name */ 1350 if (str_to_uint64(slashp+1, &objnum)) { 1351 return (0); 1352 } 1353 1354 *slashp = '\0'; 1355 error = dsl_dsobj_to_dsname(bpath, objnum, outpath); 1356 *slashp = '/'; 1357 1358 return (error); 1359 } 1360 1361 1362 /* 1363 * zfs_check_global_label: 1364 * Check that the hex label string is appropriate for the dataset 1365 * being mounted into the global_zone proper. 1366 * 1367 * Return an error if the hex label string is not default or 1368 * admin_low/admin_high. For admin_low labels, the corresponding 1369 * dataset must be readonly. 1370 */ 1371 int 1372 zfs_check_global_label(const char *dsname, const char *hexsl) 1373 { 1374 #ifdef PORT_SOLARIS 1375 if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) 1376 return (0); 1377 if (strcasecmp(hexsl, ADMIN_HIGH) == 0) 1378 return (0); 1379 if (strcasecmp(hexsl, ADMIN_LOW) == 0) { 1380 /* must be readonly */ 1381 uint64_t rdonly; 1382 1383 if (dsl_prop_get_integer(dsname, 1384 zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) 1385 return (EACCES); 1386 return (rdonly ? 0 : EACCES); 1387 } 1388 return (EACCES); 1389 #else 1390 return 0; 1391 #endif 1392 } 1393 1394 /* 1395 * zfs_mount_label_policy: 1396 * Determine whether the mount is allowed according to MAC check. 1397 * by comparing (where appropriate) label of the dataset against 1398 * the label of the zone being mounted into. If the dataset has 1399 * no label, create one. 
1400 * 1401 * Returns: 1402 * 0 : access allowed 1403 * >0 : error code, such as EACCES 1404 */ 1405 static int 1406 zfs_mount_label_policy(vfs_t *vfsp, char *osname) 1407 { 1408 #ifdef PORT_SOLARIS 1409 int error, retv; 1410 zone_t *mntzone = NULL; 1411 ts_label_t *mnt_tsl; 1412 bslabel_t *mnt_sl; 1413 bslabel_t ds_sl; 1414 char ds_hexsl[MAXNAMELEN]; 1415 1416 retv = EACCES; /* assume the worst */ 1417 1418 /* 1419 * Start by getting the dataset label if it exists. 1420 */ 1421 error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1422 1, sizeof (ds_hexsl), &ds_hexsl, NULL); 1423 if (error) 1424 return (EACCES); 1425 1426 /* 1427 * If labeling is NOT enabled, then disallow the mount of datasets 1428 * which have a non-default label already. No other label checks 1429 * are needed. 1430 */ 1431 if (!is_system_labeled()) { 1432 if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) 1433 return (0); 1434 return (EACCES); 1435 } 1436 1437 /* 1438 * Get the label of the mountpoint. If mounting into the global 1439 * zone (i.e. mountpoint is not within an active zone and the 1440 * zoned property is off), the label must be default or 1441 * admin_low/admin_high only; no other checks are needed. 1442 */ 1443 mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE); 1444 if (mntzone->zone_id == GLOBAL_ZONEID) { 1445 uint64_t zoned; 1446 1447 zone_rele(mntzone); 1448 1449 if (dsl_prop_get_integer(osname, 1450 zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) 1451 return (EACCES); 1452 if (!zoned) 1453 return (zfs_check_global_label(osname, ds_hexsl)); 1454 else 1455 /* 1456 * This is the case of a zone dataset being mounted 1457 * initially, before the zone has been fully created; 1458 * allow this mount into global zone. 
1459 */ 1460 return (0); 1461 } 1462 1463 mnt_tsl = mntzone->zone_slabel; 1464 ASSERT(mnt_tsl != NULL); 1465 label_hold(mnt_tsl); 1466 mnt_sl = label2bslabel(mnt_tsl); 1467 1468 if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) { 1469 /* 1470 * The dataset doesn't have a real label, so fabricate one. 1471 */ 1472 char *str = NULL; 1473 1474 if (l_to_str_internal(mnt_sl, &str) == 0 && 1475 dsl_prop_set(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1476 ZPROP_SRC_LOCAL, 1, strlen(str) + 1, str) == 0) 1477 retv = 0; 1478 if (str != NULL) 1479 kmem_free(str, strlen(str) + 1); 1480 } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) { 1481 /* 1482 * Now compare labels to complete the MAC check. If the 1483 * labels are equal then allow access. If the mountpoint 1484 * label dominates the dataset label, allow readonly access. 1485 * Otherwise, access is denied. 1486 */ 1487 if (blequal(mnt_sl, &ds_sl)) 1488 retv = 0; 1489 else if (bldominates(mnt_sl, &ds_sl)) { 1490 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); 1491 retv = 0; 1492 } 1493 } 1494 1495 label_rele(mnt_tsl); 1496 zone_rele(mntzone); 1497 return (retv); 1498 #else /* PORT_SOLARIS */ 1499 return (0); 1500 #endif 1501 } 1502 1503 #ifndef __NetBSD__ 1504 static int 1505 zfs_mountroot(vfs_t *vfsp, enum whymountroot why) 1506 { 1507 int error = 0; 1508 static int zfsrootdone = 0; 1509 zfsvfs_t *zfsvfs = NULL; 1510 znode_t *zp = NULL; 1511 vnode_t *vp = NULL; 1512 char *zfs_bootfs; 1513 char *zfs_devid; 1514 1515 ASSERT(vfsp); 1516 1517 /* 1518 * The filesystem that we mount as root is defined in the 1519 * boot property "zfs-bootfs" with a format of 1520 * "poolname/root-dataset-objnum". 1521 */ 1522 if (why == ROOT_INIT) { 1523 if (zfsrootdone++) 1524 return (EBUSY); 1525 /* 1526 * the process of doing a spa_load will require the 1527 * clock to be set before we could (for example) do 1528 * something better by looking at the timestamp on 1529 * an uberblock, so just set it to -1. 
1530 */ 1531 clkset(-1); 1532 1533 if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) { 1534 cmn_err(CE_NOTE, "spa_get_bootfs: can not get " 1535 "bootfs name"); 1536 return (EINVAL); 1537 } 1538 zfs_devid = spa_get_bootprop("diskdevid"); 1539 error = spa_import_rootpool(rootfs.bo_name, zfs_devid); 1540 if (zfs_devid) 1541 spa_free_bootprop(zfs_devid); 1542 if (error) { 1543 spa_free_bootprop(zfs_bootfs); 1544 cmn_err(CE_NOTE, "spa_import_rootpool: error %d", 1545 error); 1546 return (error); 1547 } 1548 if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) { 1549 spa_free_bootprop(zfs_bootfs); 1550 cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d", 1551 error); 1552 return (error); 1553 } 1554 1555 spa_free_bootprop(zfs_bootfs); 1556 1557 if (error = vfs_lock(vfsp)) 1558 return (error); 1559 1560 if (error = zfs_domount(vfsp, rootfs.bo_name)) { 1561 cmn_err(CE_NOTE, "zfs_domount: error %d", error); 1562 goto out; 1563 } 1564 1565 zfsvfs = (zfsvfs_t *)vfsp->vfs_data; 1566 ASSERT(zfsvfs); 1567 if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) { 1568 cmn_err(CE_NOTE, "zfs_zget: error %d", error); 1569 goto out; 1570 } 1571 1572 vp = ZTOV(zp); 1573 mutex_enter(&vp->v_lock); 1574 vp->v_flag |= VROOT; 1575 mutex_exit(&vp->v_lock); 1576 rootvp = vp; 1577 1578 /* 1579 * Leave rootvp held. The root file system is never unmounted. 1580 */ 1581 1582 vfs_add((struct vnode *)0, vfsp, 1583 (vfsp->vfs_flag & VFS_RDONLY) ? 
MS_RDONLY : 0); 1584 out: 1585 vfs_unlock(vfsp); 1586 return (error); 1587 } else if (why == ROOT_REMOUNT) { 1588 readonly_changed_cb(vfsp->vfs_data, B_FALSE); 1589 vfsp->vfs_flag |= VFS_REMOUNT; 1590 1591 /* refresh mount options */ 1592 zfs_unregister_callbacks(vfsp->vfs_data); 1593 return (zfs_register_callbacks(vfsp)); 1594 1595 } else if (why == ROOT_UNMOUNT) { 1596 zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data); 1597 (void) zfs_sync(vfsp, 0, 0); 1598 return (0); 1599 } 1600 1601 /* 1602 * if "why" is equal to anything else other than ROOT_INIT, 1603 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it. 1604 */ 1605 return (ENOTSUP); 1606 } 1607 #endif /*__NetBSD__ */ 1608 1609 /*ARGSUSED*/ 1610 static int 1611 zfs_mount(vfs_t *vfsp, const char *path, void *data, size_t *data_len) 1612 { 1613 char *osname; 1614 pathname_t spn; 1615 vnode_t *mvp = vfsp->mnt_vnodecovered; 1616 struct mounta *uap = data; 1617 int error = 0; 1618 int canwrite; 1619 cred_t *cr; 1620 1621 crget(cr); 1622 dprintf("zfs_vfsops.c zfs_mount called\n"); 1623 dprintf("vfsp->vfs_count %d\n", vfsp->vfs_count); 1624 if (mvp->v_type != VDIR) 1625 return (ENOTDIR); 1626 1627 mutex_enter(&mvp->v_interlock); 1628 if ((uap->flags & MS_REMOUNT) == 0 && 1629 (uap->flags & MS_OVERLAY) == 0 && 1630 (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { 1631 mutex_exit(&mvp->v_interlock); 1632 return (EBUSY); 1633 } 1634 mutex_exit(&mvp->v_interlock); 1635 1636 /* 1637 * ZFS does not support passing unparsed data in via MS_DATA. 1638 * Users should use the MS_OPTIONSTR interface; this means 1639 * that all option parsing is already done and the options struct 1640 * can be interrogated. 1641 */ 1642 if ((uap->flags & MS_DATA) && uap->datalen > 0) 1643 return (EINVAL); 1644 1645 osname = PNBUF_GET(); 1646 1647 strlcpy(osname, uap->fspec, strlen(uap->fspec) + 1); 1648 1649 /* 1650 * Check for mount privilege? 
1651 * 1652 * If we don't have privilege then see if 1653 * we have local permission to allow it 1654 */ 1655 error = secpolicy_fs_mount(cr, mvp, vfsp); 1656 if (error) { 1657 error = dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr); 1658 if (error == 0) { 1659 vattr_t vattr; 1660 1661 /* 1662 * Make sure user is the owner of the mount point 1663 * or has sufficient privileges. 1664 */ 1665 1666 vattr.va_mask = AT_UID; 1667 1668 if (error = VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) { 1669 goto out; 1670 } 1671 1672 if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 && 1673 VOP_ACCESS(mvp, VWRITE, cr) != 0) { 1674 error = EPERM; 1675 goto out; 1676 } 1677 1678 /* XXX NetBSD secpolicy_fs_mount_clearopts(cr, vfsp);*/ 1679 } else { 1680 goto out; 1681 } 1682 } 1683 1684 /* 1685 * Refuse to mount a filesystem if we are in a local zone and the 1686 * dataset is not visible. 1687 */ 1688 if (!INGLOBALZONE(curproc) && 1689 (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { 1690 error = EPERM; 1691 goto out; 1692 } 1693 1694 error = zfs_mount_label_policy(vfsp, osname); 1695 if (error) 1696 goto out; 1697 1698 /* 1699 * When doing a remount, we simply refresh our temporary properties 1700 * according to those options set in the current VFS options. 1701 */ 1702 if (uap->flags & MS_REMOUNT) { 1703 /* refresh mount options */ 1704 zfs_unregister_callbacks(vfsp->vfs_data); 1705 error = zfs_register_callbacks(vfsp); 1706 goto out; 1707 } 1708 1709 /* Mark ZFS as MP SAFE */ 1710 vfsp->mnt_iflag |= IMNT_MPSAFE; 1711 1712 error = zfs_domount(vfsp, osname); 1713 1714 vfs_getnewfsid(vfsp); 1715 1716 /* setup zfs mount info */ 1717 strlcpy(vfsp->mnt_stat.f_mntfromname, osname, 1718 sizeof(vfsp->mnt_stat.f_mntfromname)); 1719 set_statvfs_info(path, UIO_USERSPACE, vfsp->mnt_stat.f_mntfromname, 1720 UIO_SYSSPACE, vfsp->mnt_op->vfs_name, vfsp, curlwp); 1721 1722 /* 1723 * Add an extra VFS_HOLD on our parent vfs so that it can't 1724 * disappear due to a forced unmount. 
1725 */ 1726 if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap) 1727 VFS_HOLD(mvp->v_vfsp); 1728 1729 out: 1730 PNBUF_PUT(osname); 1731 return (error); 1732 } 1733 1734 static int 1735 zfs_statvfs(vfs_t *vfsp, struct statvfs *statp) 1736 { 1737 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1738 dev_t dev; 1739 uint64_t refdbytes, availbytes, usedobjs, availobjs; 1740 1741 ZFS_ENTER(zfsvfs); 1742 1743 dmu_objset_space(zfsvfs->z_os, 1744 &refdbytes, &availbytes, &usedobjs, &availobjs); 1745 1746 /* 1747 * The underlying storage pool actually uses multiple block sizes. 1748 * We report the fragsize as the smallest block size we support, 1749 * and we report our blocksize as the filesystem's maximum blocksize. 1750 */ 1751 statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT; 1752 statp->f_bsize = zfsvfs->z_max_blksz; 1753 1754 /* 1755 * The following report "total" blocks of various kinds in the 1756 * file system, but reported in terms of f_frsize - the 1757 * "fragment" size. 1758 */ 1759 1760 statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; 1761 statp->f_bfree = availbytes >> SPA_MINBLOCKSHIFT; 1762 statp->f_bavail = statp->f_bfree; /* no root reservation */ 1763 1764 /* 1765 * statvfs() should really be called statufs(), because it assumes 1766 * static metadata. ZFS doesn't preallocate files, so the best 1767 * we can do is report the max that could possibly fit in f_files, 1768 * and that minus the number actually used in f_ffree. 1769 * For f_ffree, report the smaller of the number of object available 1770 * and the number of blocks (each object will take at least a block). 1771 */ 1772 statp->f_ffree = MIN(availobjs, statp->f_bfree); 1773 statp->f_favail = statp->f_ffree; /* no "root reservation" */ 1774 statp->f_files = statp->f_ffree + usedobjs; 1775 1776 statp->f_fsid = vfsp->mnt_stat.f_fsidx.__fsid_val[0]; 1777 1778 /* 1779 * We're a zfs filesystem. 
1780 */ 1781 (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename)); 1782 (void) strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, 1783 sizeof(statp->f_mntfromname)); 1784 (void) strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, 1785 sizeof(statp->f_mntonname)); 1786 1787 statp->f_namemax = ZFS_MAXNAMELEN; 1788 1789 /* 1790 * We have all of 32 characters to stuff a string here. 1791 * Is there anything useful we could/should provide? 1792 */ 1793 #ifndef __NetBSD__ 1794 bzero(statp->f_fstr, sizeof (statp->f_fstr)); 1795 #endif 1796 ZFS_EXIT(zfsvfs); 1797 return (0); 1798 } 1799 1800 static int 1801 zfs_root(vfs_t *vfsp, vnode_t **vpp) 1802 { 1803 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1804 znode_t *rootzp; 1805 int error; 1806 1807 ZFS_ENTER(zfsvfs); 1808 dprintf("zfs_root called\n"); 1809 error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); 1810 if (error == 0) 1811 *vpp = ZTOV(rootzp); 1812 dprintf("vpp -> %d, error %d -- %p\n", (*vpp)->v_type, error, *vpp); 1813 ZFS_EXIT(zfsvfs); 1814 return (error); 1815 } 1816 1817 /* 1818 * Teardown the zfsvfs::z_os. 1819 * 1820 * Note, if 'unmounting' if FALSE, we return with the 'z_teardown_lock' 1821 * and 'z_teardown_inactive_lock' held. 1822 */ 1823 static int 1824 zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) 1825 { 1826 znode_t *zp; 1827 1828 rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); 1829 1830 if (!unmounting) { 1831 /* 1832 * We purge the parent filesystem's vfsp as the parent 1833 * filesystem and all of its snapshots have their vnode's 1834 * v_vfsp set to the parent's filesystem's vfsp. Note, 1835 * 'z_parent' is self referential for non-snapshots. 1836 */ 1837 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); 1838 } 1839 1840 /* 1841 * Close the zil. NB: Can't close the zil while zfs_inactive 1842 * threads are blocked as zil_close can call zfs_inactive. 
1843 */ 1844 if (zfsvfs->z_log) { 1845 zil_close(zfsvfs->z_log); 1846 zfsvfs->z_log = NULL; 1847 } 1848 1849 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); 1850 1851 /* 1852 * If we are not unmounting (ie: online recv) and someone already 1853 * unmounted this file system while we were doing the switcheroo, 1854 * or a reopen of z_os failed then just bail out now. 1855 */ 1856 if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { 1857 rw_exit(&zfsvfs->z_teardown_inactive_lock); 1858 rrw_exit(&zfsvfs->z_teardown_lock, FTAG); 1859 return (EIO); 1860 } 1861 1862 /* 1863 * At this point there are no vops active, and any new vops will 1864 * fail with EIO since we have z_teardown_lock for writer (only 1865 * relavent for forced unmount). 1866 * 1867 * Release all holds on dbufs. 1868 */ 1869 mutex_enter(&zfsvfs->z_znodes_lock); 1870 for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; 1871 zp = list_next(&zfsvfs->z_all_znodes, zp)) 1872 if (zp->z_dbuf) { 1873 ASSERT(ZTOV(zp)->v_count > 0); 1874 zfs_znode_dmu_fini(zp); 1875 } 1876 mutex_exit(&zfsvfs->z_znodes_lock); 1877 1878 /* 1879 * If we are unmounting, set the unmounted flag and let new vops 1880 * unblock. zfs_inactive will have the unmounted behavior, and all 1881 * other vops will fail with EIO. 1882 */ 1883 if (unmounting) { 1884 zfsvfs->z_unmounted = B_TRUE; 1885 rrw_exit(&zfsvfs->z_teardown_lock, FTAG); 1886 rw_exit(&zfsvfs->z_teardown_inactive_lock); 1887 } 1888 1889 /* 1890 * z_os will be NULL if there was an error in attempting to reopen 1891 * zfsvfs, so just return as the properties had already been 1892 * unregistered and cached data had been evicted before. 1893 */ 1894 if (zfsvfs->z_os == NULL) 1895 return (0); 1896 1897 /* 1898 * Unregister properties. 
1899 */ 1900 zfs_unregister_callbacks(zfsvfs); 1901 1902 /* 1903 * Evict cached data 1904 */ 1905 if (dmu_objset_evict_dbufs(zfsvfs->z_os)) { 1906 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); 1907 (void) dmu_objset_evict_dbufs(zfsvfs->z_os); 1908 } 1909 1910 return (0); 1911 } 1912 1913 /*ARGSUSED*/ 1914 static int 1915 zfs_umount(vfs_t *vfsp, int fflag) 1916 { 1917 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1918 objset_t *os; 1919 int ret, flags = 0; 1920 cred_t *cr; 1921 1922 vnode_t *vpp; 1923 int counter; 1924 1925 counter = 0; 1926 1927 dprintf("ZFS_UMOUNT called\n"); 1928 1929 /*TAILQ_FOREACH(vpp, &vfsp->mnt_vnodelist, v_mntvnodes) { 1930 printf("vnode list vnode number %d -- vnode address %p\n", counter, vpp); 1931 vprint("ZFS vfsp vnode list", vpp); 1932 counter++; 1933 } */ 1934 1935 crget(cr); 1936 #ifdef TODO 1937 ret = secpolicy_fs_unmount(cr, vfsp); 1938 if (ret) { 1939 ret = dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource), 1940 ZFS_DELEG_PERM_MOUNT, cr); 1941 if (ret) 1942 return (ret); 1943 } 1944 #endif 1945 /* 1946 * We purge the parent filesystem's vfsp as the parent filesystem 1947 * and all of its snapshots have their vnode's v_vfsp set to the 1948 * parent's filesystem's vfsp. Note, 'z_parent' is self 1949 * referential for non-snapshots. 1950 */ 1951 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); 1952 1953 /* 1954 * Unmount any snapshots mounted under .zfs before unmounting the 1955 * dataset itself. 1956 */ 1957 if (zfsvfs->z_ctldir != NULL && 1958 (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) { 1959 return (ret); 1960 } 1961 1962 #if 0 1963 if (!(fflag & MS_FORCE)) { 1964 /* 1965 * Check the number of active vnodes in the file system. 1966 * Our count is maintained in the vfs structure, but the 1967 * number is off by 1 to indicate a hold on the vfs 1968 * structure itself. 
1969 * 1970 * The '.zfs' directory maintains a reference of its 1971 * own, and any active references underneath are 1972 * reflected in the vnode count. 1973 */ 1974 if (zfsvfs->z_ctldir == NULL) { 1975 if (vfsp->vfs_count > 1){ 1976 return (EBUSY); 1977 } 1978 } else { 1979 if (vfsp->vfs_count > 2 || 1980 zfsvfs->z_ctldir->v_count > 1) { 1981 return (EBUSY); 1982 } 1983 } 1984 } 1985 #endif 1986 vfsp->vfs_flag |= VFS_UNMOUNTED; 1987 1988 VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); 1989 os = zfsvfs->z_os; 1990 1991 /* 1992 * z_os will be NULL if there was an error in 1993 * attempting to reopen zfsvfs. 1994 */ 1995 if (os != NULL) { 1996 /* 1997 * Unset the objset user_ptr. 1998 */ 1999 mutex_enter(&os->os_user_ptr_lock); 2000 dmu_objset_set_user(os, NULL); 2001 mutex_exit(&os->os_user_ptr_lock); 2002 2003 /* 2004 * Finally release the objset 2005 */ 2006 dmu_objset_disown(os, zfsvfs); 2007 } 2008 2009 /* 2010 * We can now safely destroy the '.zfs' directory node. 2011 */ 2012 if (zfsvfs->z_ctldir != NULL) 2013 zfsctl_destroy(zfsvfs); 2014 2015 if (fflag & MS_FORCE) 2016 flags |= FORCECLOSE; 2017 2018 ret = vflush(vfsp, NULL, 0); 2019 if (ret != 0) 2020 return ret; 2021 2022 return (0); 2023 } 2024 2025 static int 2026 zfs_vget(vfs_t *vfsp, ino_t ino, vnode_t **vpp) 2027 { 2028 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2029 znode_t *zp; 2030 int err; 2031 2032 dprintf("zfs_vget called\n"); 2033 dprintf("vfsp->vfs_count %d\n", vfsp->vfs_count); 2034 2035 ZFS_ENTER(zfsvfs); 2036 err = zfs_zget(zfsvfs, ino, &zp); 2037 if (err == 0 && zp->z_unlinked) { 2038 VN_RELE(ZTOV(zp)); 2039 err = EINVAL; 2040 } 2041 if (err != 0) 2042 *vpp = NULL; 2043 else { 2044 *vpp = ZTOV(zp); 2045 /* XXX NetBSD how to get flags for vn_lock ? 
*/ 2046 vn_lock(*vpp, 0); 2047 } 2048 ZFS_EXIT(zfsvfs); 2049 return (err); 2050 } 2051 2052 static int 2053 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp) 2054 { 2055 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2056 znode_t *zp; 2057 uint64_t object = 0; 2058 uint64_t fid_gen = 0; 2059 uint64_t gen_mask; 2060 uint64_t zp_gen; 2061 int i, err; 2062 2063 *vpp = NULL; 2064 2065 dprintf("zfs_fhtovp called\n"); 2066 dprintf("vfsp->vfs_count %d\n", vfsp->vfs_count); 2067 2068 ZFS_ENTER(zfsvfs); 2069 2070 if (fidp->fid_len == LONG_FID_LEN) { 2071 zfid_long_t *zlfid = (zfid_long_t *)fidp; 2072 uint64_t objsetid = 0; 2073 uint64_t setgen = 0; 2074 2075 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 2076 objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); 2077 2078 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 2079 setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); 2080 2081 ZFS_EXIT(zfsvfs); 2082 2083 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); 2084 if (err) 2085 return (EINVAL); 2086 ZFS_ENTER(zfsvfs); 2087 } 2088 2089 if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { 2090 zfid_short_t *zfid = (zfid_short_t *)fidp; 2091 2092 for (i = 0; i < sizeof (zfid->zf_object); i++) 2093 object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); 2094 2095 for (i = 0; i < sizeof (zfid->zf_gen); i++) 2096 fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); 2097 } else { 2098 ZFS_EXIT(zfsvfs); 2099 return (EINVAL); 2100 } 2101 2102 /* A zero fid_gen means we are in the .zfs control directories */ 2103 if (fid_gen == 0 && 2104 (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { 2105 *vpp = zfsvfs->z_ctldir; 2106 ASSERT(*vpp != NULL); 2107 if (object == ZFSCTL_INO_SNAPDIR) { 2108 VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL, 2109 0, NULL, NULL, NULL, NULL, NULL) == 0); 2110 } else { 2111 VN_HOLD(*vpp); 2112 } 2113 ZFS_EXIT(zfsvfs); 2114 /* XXX: LK_RETRY? 
*/ 2115 vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); 2116 return (0); 2117 } 2118 2119 gen_mask = -1ULL >> (64 - 8 * i); 2120 2121 dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); 2122 if (err = zfs_zget(zfsvfs, object, &zp)) { 2123 ZFS_EXIT(zfsvfs); 2124 return (err); 2125 } 2126 zp_gen = zp->z_phys->zp_gen & gen_mask; 2127 if (zp_gen == 0) 2128 zp_gen = 1; 2129 if (zp->z_unlinked || zp_gen != fid_gen) { 2130 dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); 2131 VN_RELE(ZTOV(zp)); 2132 ZFS_EXIT(zfsvfs); 2133 return (EINVAL); 2134 } 2135 2136 *vpp = ZTOV(zp); 2137 /* XXX: LK_RETRY? */ 2138 vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); 2139 ZFS_EXIT(zfsvfs); 2140 return (0); 2141 } 2142 2143 /* 2144 * Block out VOPs and close zfsvfs_t::z_os 2145 * 2146 * Note, if successful, then we return with the 'z_teardown_lock' and 2147 * 'z_teardown_inactive_lock' write held. 2148 */ 2149 int 2150 zfs_suspend_fs(zfsvfs_t *zfsvfs) 2151 { 2152 int error; 2153 2154 if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) 2155 return (error); 2156 dmu_objset_disown(zfsvfs->z_os, zfsvfs); 2157 2158 return (0); 2159 } 2160 2161 /* 2162 * Reopen zfsvfs_t::z_os and release VOPs. 2163 */ 2164 int 2165 zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname) 2166 { 2167 int err; 2168 2169 ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock)); 2170 ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); 2171 2172 err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zfsvfs, 2173 &zfsvfs->z_os); 2174 if (err) { 2175 zfsvfs->z_os = NULL; 2176 } else { 2177 znode_t *zp; 2178 2179 VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); 2180 2181 /* 2182 * Attempt to re-establish all the active znodes with 2183 * their dbufs. If a zfs_rezget() fails, then we'll let 2184 * any potential callers discover that via ZFS_ENTER_VERIFY_VP 2185 * when they try to use their znode. 
2186 */ 2187 mutex_enter(&zfsvfs->z_znodes_lock); 2188 for (zp = list_head(&zfsvfs->z_all_znodes); zp; 2189 zp = list_next(&zfsvfs->z_all_znodes, zp)) { 2190 (void) zfs_rezget(zp); 2191 } 2192 mutex_exit(&zfsvfs->z_znodes_lock); 2193 2194 } 2195 2196 /* release the VOPs */ 2197 rw_exit(&zfsvfs->z_teardown_inactive_lock); 2198 rrw_exit(&zfsvfs->z_teardown_lock, FTAG); 2199 2200 if (err) { 2201 /* 2202 * Since we couldn't reopen zfsvfs::z_os, force 2203 * unmount this file system. 2204 */ 2205 if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) 2206 (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curlwp); 2207 } 2208 return (err); 2209 } 2210 2211 static void 2212 zfs_freevfs(vfs_t *vfsp) 2213 { 2214 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2215 2216 /* 2217 * If this is a snapshot, we have an extra VFS_HOLD on our parent 2218 * from zfs_mount(). Release it here. 2219 */ 2220 if (zfsvfs->z_issnap) 2221 VFS_RELE(zfsvfs->z_parent->z_vfs); 2222 2223 zfsvfs_free(zfsvfs); 2224 2225 atomic_add_32(&zfs_active_fs_count, -1); 2226 } 2227 2228 /* 2229 * VFS_INIT() initialization. Note that there is no VFS_FINI(), 2230 * so we can't safely do any non-idempotent initialization here. 2231 * Leave that to zfs_init() and zfs_fini(), which are called 2232 * from the module's _init() and _fini() entry points. 2233 */ 2234 /*ARGSUSED*/ 2235 int 2236 zfs_vfsinit(int fstype, char *name) 2237 { 2238 int error; 2239 2240 zfsfstype = fstype; 2241 2242 /* 2243 * Setup vfsops and vnodeops tables. 2244 */ 2245 error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops); 2246 2247 error = zfs_create_op_tables(); 2248 if (error) { 2249 zfs_remove_op_tables(); 2250 cmn_err(CE_WARN, "zfs: bad vnode ops template"); 2251 vfs_freevfsops_by_type(zfsfstype); 2252 return (error); 2253 } 2254 2255 mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL); 2256 mutex_init(&zfs_debug_mtx, NULL, MUTEX_DEFAULT, NULL); 2257 2258 /* 2259 * Unique major number for all zfs mounts. 
2260 * If we run out of 32-bit minors, we'll getudev() another major. 2261 */ 2262 zfs_major = ddi_name_to_major(ZFS_DRIVER); 2263 zfs_minor = ZFS_MIN_MINOR; 2264 2265 return (0); 2266 } 2267 2268 int 2269 zfs_vfsfini(void) 2270 { 2271 int err; 2272 2273 err = vfs_detach(&zfs_vfsops_template); 2274 if (err != 0) 2275 return err; 2276 2277 mutex_destroy(&zfs_debug_mtx); 2278 mutex_destroy(&zfs_dev_mtx); 2279 2280 return 0; 2281 } 2282 2283 void 2284 zfs_init(void) 2285 { 2286 /* 2287 * Initialize .zfs directory structures 2288 */ 2289 zfsctl_init(); 2290 2291 /* 2292 * Initialize znode cache, vnode ops, etc... 2293 */ 2294 zfs_znode_init(); 2295 2296 dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); 2297 } 2298 2299 void 2300 zfs_fini(void) 2301 { 2302 zfsctl_fini(); 2303 zfs_znode_fini(); 2304 } 2305 2306 int 2307 zfs_busy(void) 2308 { 2309 return (zfs_active_fs_count != 0); 2310 } 2311 2312 int 2313 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) 2314 { 2315 int error; 2316 objset_t *os = zfsvfs->z_os; 2317 dmu_tx_t *tx; 2318 2319 if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) 2320 return (EINVAL); 2321 2322 if (newvers < zfsvfs->z_version) 2323 return (EINVAL); 2324 2325 tx = dmu_tx_create(os); 2326 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); 2327 error = dmu_tx_assign(tx, TXG_WAIT); 2328 if (error) { 2329 dmu_tx_abort(tx); 2330 return (error); 2331 } 2332 error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 2333 8, 1, &newvers, tx); 2334 2335 if (error) { 2336 dmu_tx_commit(tx); 2337 return (error); 2338 } 2339 2340 spa_history_internal_log(LOG_DS_UPGRADE, 2341 dmu_objset_spa(os), tx, CRED(), 2342 "oldver=%llu newver=%llu dataset = %llu", 2343 zfsvfs->z_version, newvers, dmu_objset_id(os)); 2344 2345 dmu_tx_commit(tx); 2346 2347 zfsvfs->z_version = newvers; 2348 2349 if (zfsvfs->z_version >= ZPL_VERSION_FUID) 2350 zfs_set_fuid_feature(zfsvfs); 2351 2352 return (0); 2353 } 2354 2355 /* 2356 * Read a property 
stored within the master node. 2357 */ 2358 int 2359 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) 2360 { 2361 const char *pname; 2362 int error = ENOENT; 2363 2364 /* 2365 * Look up the file system's value for the property. For the 2366 * version property, we look up a slightly different string. 2367 */ 2368 if (prop == ZFS_PROP_VERSION) 2369 pname = ZPL_VERSION_STR; 2370 else 2371 pname = zfs_prop_to_name(prop); 2372 2373 if (os != NULL) 2374 error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); 2375 2376 if (error == ENOENT) { 2377 /* No value set, use the default value */ 2378 switch (prop) { 2379 case ZFS_PROP_VERSION: 2380 *value = ZPL_VERSION; 2381 break; 2382 case ZFS_PROP_NORMALIZE: 2383 case ZFS_PROP_UTF8ONLY: 2384 *value = 0; 2385 break; 2386 case ZFS_PROP_CASE: 2387 *value = ZFS_CASE_SENSITIVE; 2388 break; 2389 default: 2390 return (error); 2391 } 2392 error = 0; 2393 } 2394 return (error); 2395 } 2396 2397 static int 2398 zfs_start(vfs_t *vfsp, int flags) 2399 { 2400 2401 return (0); 2402 } 2403 2404 2405 #ifdef TODO 2406 static vfsdef_t vfw = { 2407 VFSDEF_VERSION, 2408 MNTTYPE_ZFS, 2409 zfs_vfsinit, 2410 VSW_HASPROTO|VSW_CANRWRO|VSW_CANREMOUNT|VSW_VOLATILEDEV|VSW_STATS| 2411 VSW_XID, 2412 &zfs_mntopts 2413 }; 2414 2415 struct modlfs zfs_modlfs = { 2416 &mod_fsops, "ZFS filesystem version " SPA_VERSION_STRING, &vfw 2417 }; 2418 #endif 2419