1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>. 24 * All rights reserved. 25 * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 26 * Copyright (c) 2014 Integros [integros.com] 27 */ 28 29 /* Portions Copyright 2010 Robert Milkowski */ 30 31 #include <sys/types.h> 32 #include <sys/param.h> 33 #include <sys/systm.h> 34 #include <sys/kernel.h> 35 #include <sys/sysmacros.h> 36 #include <sys/kmem.h> 37 #include <sys/acl.h> 38 #include <sys/vnode.h> 39 #include <sys/vfs.h> 40 #include <sys/mntent.h> 41 #include <sys/mount.h> 42 #include <sys/cmn_err.h> 43 #include <sys/zfs_znode.h> 44 #include <sys/zfs_dir.h> 45 #include <sys/zil.h> 46 #include <sys/fs/zfs.h> 47 #include <sys/dmu.h> 48 #include <sys/dsl_prop.h> 49 #include <sys/dsl_dataset.h> 50 #include <sys/dsl_deleg.h> 51 #include <sys/spa.h> 52 #include <sys/zap.h> 53 #include <sys/sa.h> 54 #include <sys/sa_impl.h> 55 #include <sys/varargs.h> 56 #include <sys/policy.h> 57 #include <sys/atomic.h> 58 #include <sys/zfs_ioctl.h> 59 #include <sys/zfs_ctldir.h> 60 #include <sys/zfs_fuid.h> 61 #include <sys/sunddi.h> 62 #include <sys/dnlc.h> 63 #include <sys/dmu_objset.h> 64 #include <sys/spa_boot.h> 65 #include "zfs_comutil.h" 66 67 #ifdef __FreeBSD_kernel__ 68 69 #include <sys/jail.h> 70 71 struct mtx zfs_debug_mtx; 72 MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); 73 74 SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); 75 76 int zfs_super_owner; 77 SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, 78 "File system owner can perform privileged operation on his file systems"); 79 80 int zfs_debug_level; 81 SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, 82 "Debug level"); 83 84 SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions"); 85 static int zfs_version_acl = ZFS_ACL_VERSION; 86 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, 87 "ZFS_ACL_VERSION"); 88 static int zfs_version_spa = SPA_VERSION; 89 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, 90 "SPA_VERSION"); 91 static int zfs_version_zpl = ZPL_VERSION; 92 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, 93 "ZPL_VERSION"); 94 95 static int zfs_mount(vfs_t *vfsp); 96 static int zfs_umount(vfs_t *vfsp, int fflag); 97 static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp); 98 static int zfs_statfs(vfs_t *vfsp, struct statfs *statp); 99 static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); 100 static int zfs_sync(vfs_t *vfsp, int 
waitfor); 101 static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, 102 struct ucred **credanonp, int *numsecflavors, int **secflavors); 103 static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp); 104 static void zfs_objset_close(zfsvfs_t *zfsvfs); 105 static void zfs_freevfs(vfs_t *vfsp); 106 107 struct vfsops zfs_vfsops = { 108 .vfs_mount = zfs_mount, 109 .vfs_unmount = zfs_umount, 110 .vfs_root = zfs_root, 111 .vfs_statfs = zfs_statfs, 112 .vfs_vget = zfs_vget, 113 .vfs_sync = zfs_sync, 114 .vfs_checkexp = zfs_checkexp, 115 .vfs_fhtovp = zfs_fhtovp, 116 }; 117 118 VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN); 119 120 #endif /* __FreeBSD_kernel__ */ 121 122 #ifdef __NetBSD__ 123 124 #include <sys/fstrans.h> 125 #include <sys/mkdev.h> 126 #include <miscfs/genfs/genfs.h> 127 128 int zfs_debug_level; 129 kmutex_t zfs_debug_mtx; 130 131 #define DROP_GIANT() /* nothing */ 132 #define PICKUP_GIANT() /* nothing */ 133 #define vfs_stdsync(a, b) 0 134 135 static int zfs_mount(vfs_t *vfsp, const char *path, void *data, size_t *data_len); 136 static int zfs_umount(vfs_t *vfsp, int fflag); 137 static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp); 138 static int zfs_statvfs(vfs_t *vfsp, struct statvfs *statp); 139 static int zfs_netbsd_vptofh(vnode_t *vp, fid_t *fidp, size_t *fh_size); 140 static int zfs_netbsd_fhtovp(vfs_t *vfsp, fid_t *fidp, int lktype, vnode_t **vpp); 141 static int zfs_vget(vfs_t *vfsp, ino_t ino, int lktype, vnode_t **vpp); 142 static int zfs_sync(vfs_t *vfsp, int waitfor); 143 static int zfs_netbsd_sync(vfs_t *vfsp, int waitfor, cred_t *cr); 144 static void zfs_freevfs(vfs_t *vfsp); 145 146 void zfs_init(void); 147 void zfs_fini(void); 148 149 extern const struct vnodeopv_desc zfs_vnodeop_opv_desc; 150 extern const struct vnodeopv_desc zfs_specop_opv_desc; 151 extern const struct vnodeopv_desc zfs_fifoop_opv_desc; 152 extern const struct vnodeopv_desc zfs_sfsop_opv_desc; 153 154 static const struct vnodeopv_desc * const zfs_vnodeop_descs[] = { 155 &zfs_vnodeop_opv_desc, 156 &zfs_specop_opv_desc, 157 &zfs_fifoop_opv_desc, 158 &zfs_sfsop_opv_desc, 159 NULL, 160 }; 161 162 struct vfsops zfs_vfsops = { 163 .vfs_name = MOUNT_ZFS, 164 .vfs_min_mount_data = sizeof(struct zfs_args), 165 .vfs_opv_descs = zfs_vnodeop_descs, 166 .vfs_mount = zfs_mount, 167 .vfs_unmount = zfs_umount, 168 .vfs_root = zfs_root, 169 .vfs_statvfs = zfs_statvfs, 170 .vfs_sync = zfs_netbsd_sync, 171 .vfs_vget = zfs_vget, 172 .vfs_loadvnode = zfs_loadvnode, 173 .vfs_newvnode = zfs_newvnode, 174 .vfs_init = zfs_init, 175 .vfs_done = zfs_fini, 176 .vfs_start = (void *)nullop, 177 .vfs_renamelock_enter = genfs_renamelock_enter, 178 .vfs_renamelock_exit = genfs_renamelock_exit, 179 .vfs_reinit = (void *)nullop, 180 .vfs_vptofh = zfs_netbsd_vptofh, 181 .vfs_fhtovp = zfs_netbsd_fhtovp, 182 .vfs_quotactl = (void *)eopnotsupp, 183 .vfs_extattrctl = (void *)eopnotsupp, 184 .vfs_suspendctl = genfs_suspendctl, 185 .vfs_snapshot = (void *)eopnotsupp, 186 .vfs_fsync = (void *)eopnotsupp, 187 }; 188 189 static bool 190 zfs_sync_selector(void *cl, struct vnode *vp) 191 { 192 znode_t *zp; 193 194 /* 195 * Skip the vnode/inode if inaccessible, is control node or if the 196 * atime is clean. 
197 */ 198 if (zfsctl_is_node(vp)) 199 return false; 200 zp = VTOZ(vp); 201 return zp != NULL && vp->v_type != VNON && zp->z_atime_dirty != 0 202 && !zp->z_unlinked; 203 } 204 205 static int 206 zfs_netbsd_sync(vfs_t *vfsp, int waitfor, cred_t *cr) 207 { 208 struct vnode_iterator *marker; 209 zfsvfs_t *zfsvfs = vfsp->vfs_data; 210 vnode_t *vp; 211 212 /* 213 * On NetBSD, we need to push out atime updates. Solaris does 214 * this during VOP_INACTIVE, but that does not work well with the 215 * BSD VFS, so we do it in batch here. 216 */ 217 vfs_vnode_iterator_init(vfsp, &marker); 218 while ((vp = vfs_vnode_iterator_next(marker, zfs_sync_selector, NULL))) 219 { 220 znode_t *zp; 221 dmu_buf_t *dbp; 222 dmu_tx_t *tx; 223 int error; 224 225 error = vn_lock(vp, LK_EXCLUSIVE); 226 if (error) { 227 VN_RELE(vp); 228 continue; 229 } 230 ZFS_ENTER(zfsvfs); 231 zp = VTOZ(vp); 232 tx = dmu_tx_create(zfsvfs->z_os); 233 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 234 zfs_sa_upgrade_txholds(tx, zp); 235 error = dmu_tx_assign(tx, TXG_WAIT); 236 if (error) { 237 dmu_tx_abort(tx); 238 } else { 239 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), 240 (void *)&zp->z_atime, sizeof (zp->z_atime), tx); 241 zp->z_atime_dirty = 0; 242 dmu_tx_commit(tx); 243 } 244 ZFS_EXIT(zfsvfs); 245 vput(vp); 246 } 247 vfs_vnode_iterator_destroy(marker); 248 249 /* 250 * Then do the regular ZFS stuff. 251 */ 252 return zfs_sync(vfsp, waitfor); 253 } 254 255 static int 256 zfs_netbsd_vptofh(vnode_t *vp, fid_t *fidp, size_t *fh_size) 257 { 258 znode_t *zp; 259 zfsvfs_t *zfsvfs; 260 uint32_t gen; 261 uint64_t gen64; 262 uint64_t object; 263 zfid_short_t *zfid; 264 int size, i, error; 265 266 if (zfsctl_is_node(vp)) 267 return zfsctl_vptofh(vp, fidp, fh_size); 268 269 zp = VTOZ(vp); 270 zfsvfs = zp->z_zfsvfs; 271 object = zp->z_id; 272 273 ZFS_ENTER(zfsvfs); 274 ZFS_VERIFY_ZP(zp); 275 276 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), 277 &gen64, sizeof (uint64_t))) != 0) { 278 ZFS_EXIT(zfsvfs); 279 return (error); 280 } 281 282 gen = (uint32_t)gen64; 283 284 size = (zfsvfs->z_parent != zfsvfs) ? 
LONG_FID_LEN : SHORT_FID_LEN; 285 286 if (*fh_size < size) { 287 ZFS_EXIT(zfsvfs); 288 *fh_size = size; 289 return SET_ERROR(E2BIG); 290 } 291 *fh_size = size; 292 293 zfid = (zfid_short_t *)fidp; 294 295 zfid->zf_len = size; 296 297 for (i = 0; i < sizeof (zfid->zf_object); i++) 298 zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); 299 300 /* Must have a non-zero generation number to distinguish from .zfs */ 301 if (gen == 0) 302 gen = 1; 303 for (i = 0; i < sizeof (zfid->zf_gen); i++) 304 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); 305 306 if (size == LONG_FID_LEN) { 307 uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); 308 zfid_long_t *zlfid; 309 310 zlfid = (zfid_long_t *)fidp; 311 312 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 313 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); 314 315 /* XXX - this should be the generation number for the objset */ 316 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 317 zlfid->zf_setgen[i] = 0; 318 } 319 320 ZFS_EXIT(zfsvfs); 321 return 0; 322 } 323 324 static int 325 zfs_netbsd_fhtovp(vfs_t *vfsp, fid_t *fidp, int lktype, vnode_t **vpp) 326 { 327 zfsvfs_t *zfsvfs = vfsp->vfs_data; 328 znode_t *zp; 329 vnode_t *dvp; 330 uint64_t object = 0; 331 uint64_t fid_gen = 0; 332 uint64_t gen_mask; 333 uint64_t zp_gen; 334 int i, err; 335 336 *vpp = NULL; 337 338 ZFS_ENTER(zfsvfs); 339 340 if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) { 341 zfid_long_t *zlfid = (zfid_long_t *)fidp; 342 uint64_t objsetid = 0; 343 uint64_t setgen = 0; 344 345 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 346 objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); 347 348 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 349 setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); 350 351 ZFS_EXIT(zfsvfs); 352 353 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); 354 if (err) 355 return (SET_ERROR(EINVAL)); 356 ZFS_ENTER(zfsvfs); 357 } 358 359 if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { 360 zfid_short_t *zfid = (zfid_short_t *)fidp; 361 362 for (i = 0; i < sizeof (zfid->zf_object); i++) 363 object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); 364 365 for (i = 0; i < sizeof (zfid->zf_gen); i++) 366 fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); 367 } else { 368 ZFS_EXIT(zfsvfs); 369 return (SET_ERROR(EINVAL)); 370 } 371 372 /* A zero fid_gen means we are in the .zfs control directories */ 373 if (fid_gen == 0 && 374 (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { 375 ZFS_EXIT(zfsvfs); 376 if (object == ZFSCTL_INO_ROOT) 377 err = zfsctl_root(zfsvfs, vpp); 378 else 379 err = zfsctl_snapshot(zfsvfs, vpp); 380 if (err) 381 return err; 382 err = vn_lock(*vpp, LK_EXCLUSIVE); 383 if (err) { 384 vrele(*vpp); 385 *vpp = NULL; 386 return err; 387 } 388 return 0; 389 } 390 391 gen_mask = -1ULL >> (64 - 8 * i); 392 393 dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); 394 if (err = zfs_zget(zfsvfs, object, &zp)) { 395 ZFS_EXIT(zfsvfs); 396 return SET_ERROR(ESTALE); 397 } 398 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, 399 sizeof (uint64_t)); 400 zp_gen = zp_gen & gen_mask; 401 if (zp_gen == 0) 402 zp_gen = 1; 403 if (zp->z_unlinked || zp_gen != fid_gen) { 404 dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); 405 VN_RELE(ZTOV(zp)); 406 ZFS_EXIT(zfsvfs); 407 return SET_ERROR(ESTALE); 408 } 409 410 *vpp = ZTOV(zp); 411 ZFS_EXIT(zfsvfs); 412 err = vn_lock(*vpp, lktype); 413 if (err) { 414 vrele(*vpp); 415 *vpp = NULL; 416 return err; 417 } 418 return 0; 419 } 420 #endif /* 
__NetBSD__ */ 421 422 /* 423 * We need to keep a count of active fs's. 424 * This is necessary to prevent our module 425 * from being unloaded after a umount -f 426 */ 427 static uint32_t zfs_active_fs_count = 0; 428 429 /*ARGSUSED*/ 430 static int 431 zfs_sync(vfs_t *vfsp, int waitfor) 432 { 433 /* 434 * Data integrity is job one. We don't want a compromised kernel 435 * writing to the storage pool, so we never sync during panic. 436 */ 437 if (panicstr) 438 return (0); 439 440 /* 441 * Ignore the system syncher. ZFS already commits async data 442 * at zfs_txg_timeout intervals. 443 */ 444 if (waitfor == MNT_LAZY) 445 return (0); 446 447 if (vfsp != NULL) { 448 /* 449 * Sync a specific filesystem. 450 */ 451 zfsvfs_t *zfsvfs = vfsp->vfs_data; 452 dsl_pool_t *dp; 453 int error; 454 455 error = vfs_stdsync(vfsp, waitfor); 456 if (error != 0) 457 return (error); 458 459 ZFS_ENTER(zfsvfs); 460 dp = dmu_objset_pool(zfsvfs->z_os); 461 462 /* 463 * If the system is shutting down, then skip any 464 * filesystems which may exist on a suspended pool. 465 */ 466 if (sys_shutdown && spa_suspended(dp->dp_spa)) { 467 ZFS_EXIT(zfsvfs); 468 return (0); 469 } 470 471 if (zfsvfs->z_log != NULL) 472 zil_commit(zfsvfs->z_log, 0); 473 474 ZFS_EXIT(zfsvfs); 475 } else { 476 /* 477 * Sync all ZFS filesystems. This is what happens when you 478 * run sync(1M). Unlike other filesystems, ZFS honors the 479 * request by waiting for all pools to commit all dirty data. 480 */ 481 spa_sync_allpools(); 482 } 483 484 return (0); 485 } 486 487 #ifdef illumos 488 static int 489 zfs_create_unique_device(dev_t *dev) 490 { 491 major_t new_major; 492 493 do { 494 ASSERT3U(zfs_minor, <=, MAXMIN32); 495 minor_t start = zfs_minor; 496 do { 497 mutex_enter(&zfs_dev_mtx); 498 if (zfs_minor >= MAXMIN32) { 499 /* 500 * If we're still using the real major 501 * keep out of /dev/zfs and /dev/zvol minor 502 * number space. If we're using a getudev()'ed 503 * major number, we can use all of its minors. 504 */ 505 if (zfs_major == ddi_name_to_major(ZFS_DRIVER)) 506 zfs_minor = ZFS_MIN_MINOR; 507 else 508 zfs_minor = 0; 509 } else { 510 zfs_minor++; 511 } 512 *dev = makedevice(zfs_major, zfs_minor); 513 mutex_exit(&zfs_dev_mtx); 514 } while (vfs_devismounted(*dev) && zfs_minor != start); 515 #ifdef illumos 516 if (zfs_minor == start) { 517 /* 518 * We are using all ~262,000 minor numbers for the 519 * current major number. Create a new major number. 520 */ 521 if ((new_major = getudev()) == (major_t)-1) { 522 cmn_err(CE_WARN, 523 "zfs_mount: Can't get unique major " 524 "device number."); 525 return (-1); 526 } 527 mutex_enter(&zfs_dev_mtx); 528 zfs_major = new_major; 529 zfs_minor = 0; 530 531 mutex_exit(&zfs_dev_mtx); 532 } else { 533 break; 534 } 535 /* CONSTANTCONDITION */ 536 #endif 537 } while (1); 538 539 return (0); 540 } 541 #endif /* illumos */ 542 543 544 static void 545 atime_changed_cb(void *arg, uint64_t newval) 546 { 547 zfsvfs_t *zfsvfs = arg; 548 549 if (newval == TRUE) { 550 zfsvfs->z_atime = TRUE; 551 zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME; 552 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); 553 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); 554 } else { 555 zfsvfs->z_atime = FALSE; 556 zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME; 557 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); 558 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); 559 } 560 } 561 562 static void 563 xattr_changed_cb(void *arg, uint64_t newval) 564 { 565 zfsvfs_t *zfsvfs = arg; 566 567 if (newval == TRUE) { 568 /* XXX locking on vfs_flag? 
*/ 569 #ifdef TODO 570 zfsvfs->z_vfs->vfs_flag |= VFS_XATTR; 571 #endif 572 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR); 573 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0); 574 } else { 575 /* XXX locking on vfs_flag? */ 576 #ifdef TODO 577 zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR; 578 #endif 579 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR); 580 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0); 581 } 582 } 583 584 static void 585 blksz_changed_cb(void *arg, uint64_t newval) 586 { 587 zfsvfs_t *zfsvfs = arg; 588 ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); 589 ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); 590 ASSERT(ISP2(newval)); 591 592 zfsvfs->z_max_blksz = newval; 593 zfsvfs->z_vfs->mnt_stat.f_iosize = newval; 594 } 595 596 static void 597 readonly_changed_cb(void *arg, uint64_t newval) 598 { 599 zfsvfs_t *zfsvfs = arg; 600 601 if (newval) { 602 /* XXX locking on vfs_flag? */ 603 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; 604 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); 605 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); 606 } else { 607 /* XXX locking on vfs_flag? */ 608 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; 609 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); 610 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); 611 } 612 } 613 614 static void 615 setuid_changed_cb(void *arg, uint64_t newval) 616 { 617 zfsvfs_t *zfsvfs = arg; 618 619 if (newval == FALSE) { 620 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; 621 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); 622 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); 623 } else { 624 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; 625 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); 626 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); 627 } 628 } 629 630 static void 631 exec_changed_cb(void *arg, uint64_t newval) 632 { 633 zfsvfs_t *zfsvfs = arg; 634 635 if (newval == FALSE) { 636 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; 637 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); 638 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); 639 } else { 640 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; 641 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); 642 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); 643 } 644 } 645 646 /* 647 * The nbmand mount option can be changed at mount time. 
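 * (nbmand selects non-blocking mandatory locking semantics, which CIFS/SMB
 * clients rely on, so it is only honored here as a mount-time option.)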
 * We can't allow it to be toggled on live file systems, or incorrect
 * behavior may be seen from CIFS clients.
 *
 * This property isn't registered via dsl_prop_register(), but this callback
 * will be called when a file system is first mounted.
 */
static void
nbmand_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	if (newval == FALSE) {
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
	} else {
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
	}
}

static void
snapdir_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	zfsvfs->z_show_ctldir = newval;
}

static void
vscan_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	zfsvfs->z_vscan = newval;
}

static void
acl_mode_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	zfsvfs->z_acl_mode = newval;
}

static void
acl_inherit_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	zfsvfs->z_acl_inherit = newval;
}

static int
zfs_register_callbacks(vfs_t *vfsp)
{
	struct dsl_dataset *ds = NULL;
	objset_t *os = NULL;
	zfsvfs_t *zfsvfs = NULL;
	uint64_t nbmand;
	boolean_t readonly = B_FALSE;
	boolean_t do_readonly = B_FALSE;
	boolean_t setuid = B_FALSE;
	boolean_t do_setuid = B_FALSE;
	boolean_t exec = B_FALSE;
	boolean_t do_exec = B_FALSE;
#ifdef illumos
	boolean_t devices = B_FALSE;
	boolean_t do_devices = B_FALSE;
#endif
	boolean_t xattr = B_FALSE;
	boolean_t do_xattr = B_FALSE;
	boolean_t atime = B_FALSE;
	boolean_t do_atime = B_FALSE;
	int error = 0;

	ASSERT(vfsp);
	zfsvfs = vfsp->vfs_data;
	ASSERT(zfsvfs);
	os = zfsvfs->z_os;

	/*
	 * This function can be called for a snapshot when we update the
	 * snapshot's mount point, which isn't really supported.
	 */
	if (dmu_objset_is_snapshot(os))
		return (EOPNOTSUPP);

	/*
	 * The act of registering our callbacks will destroy any mount
	 * options we may have. In order to enable temporary overrides
	 * of mount options, we stash away the current values and
	 * restore them after we register the callbacks.
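	 * For example, a temporary "mount -o ro,noatime" should keep winning
	 * over the dataset's readonly=off and atime=on properties for the
	 * lifetime of this mount; the do_* flags gathered below arrange that.
	 * (Illustrative option names; the full set handled is listed below.)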
	 */
	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
	    !spa_writeable(dmu_objset_spa(os))) {
		readonly = B_TRUE;
		do_readonly = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
		readonly = B_FALSE;
		do_readonly = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
		setuid = B_FALSE;
		do_setuid = B_TRUE;
	} else {
		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
			setuid = B_FALSE;
			do_setuid = B_TRUE;
		} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
			setuid = B_TRUE;
			do_setuid = B_TRUE;
		}
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
		exec = B_FALSE;
		do_exec = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
		exec = B_TRUE;
		do_exec = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
		xattr = B_FALSE;
		do_xattr = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
		xattr = B_TRUE;
		do_xattr = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
		atime = B_FALSE;
		do_atime = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
		atime = B_TRUE;
		do_atime = B_TRUE;
	}

	/*
	 * We need to enter pool configuration here, so that we can use
	 * dsl_prop_get_int_ds() to handle the special nbmand property below.
	 * dsl_prop_get_integer() cannot be used, because it has to acquire
	 * spa_namespace_lock and we cannot do that because we already hold
	 * z_teardown_lock. The problem is that spa_config_sync() is called
	 * with spa_namespace_lock held and the function calls ZFS vnode
	 * operations to write the cache file and thus z_teardown_lock is
	 * acquired after spa_namespace_lock.
	 */
	ds = dmu_objset_ds(os);
	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);

	/*
	 * nbmand is a special property. It can only be changed at
	 * mount time.
	 *
	 * This is weird, but it is documented to only be changeable
	 * at mount time.
	 */
	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
		nbmand = B_FALSE;
	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
		nbmand = B_TRUE;
	} else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand)) != 0) {
		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
		return (error);
	}

	/*
	 * Register property callbacks.
	 *
	 * It would probably be fine to just check for i/o error from
	 * the first prop_register(), but I guess I like to go
	 * overboard...
	 */
	error = dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
#ifdef illumos
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs);
#endif
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
	error = error ?
error : dsl_prop_register(ds, 837 zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); 838 error = error ? error : dsl_prop_register(ds, 839 zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, 840 zfsvfs); 841 error = error ? error : dsl_prop_register(ds, 842 zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs); 843 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 844 if (error) 845 goto unregister; 846 847 /* 848 * Invoke our callbacks to restore temporary mount options. 849 */ 850 if (do_readonly) 851 readonly_changed_cb(zfsvfs, readonly); 852 if (do_setuid) 853 setuid_changed_cb(zfsvfs, setuid); 854 if (do_exec) 855 exec_changed_cb(zfsvfs, exec); 856 if (do_xattr) 857 xattr_changed_cb(zfsvfs, xattr); 858 if (do_atime) 859 atime_changed_cb(zfsvfs, atime); 860 861 nbmand_changed_cb(zfsvfs, nbmand); 862 863 return (0); 864 865 unregister: 866 dsl_prop_unregister_all(ds, zfsvfs); 867 return (error); 868 } 869 870 static int 871 zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, 872 uint64_t *userp, uint64_t *groupp) 873 { 874 /* 875 * Is it a valid type of object to track? 876 */ 877 if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) 878 return (SET_ERROR(ENOENT)); 879 880 /* 881 * If we have a NULL data pointer 882 * then assume the id's aren't changing and 883 * return EEXIST to the dmu to let it know to 884 * use the same ids 885 */ 886 if (data == NULL) 887 return (SET_ERROR(EEXIST)); 888 889 if (bonustype == DMU_OT_ZNODE) { 890 znode_phys_t *znp = data; 891 *userp = znp->zp_uid; 892 *groupp = znp->zp_gid; 893 } else { 894 int hdrsize; 895 sa_hdr_phys_t *sap = data; 896 sa_hdr_phys_t sa = *sap; 897 boolean_t swap = B_FALSE; 898 899 ASSERT(bonustype == DMU_OT_SA); 900 901 if (sa.sa_magic == 0) { 902 /* 903 * This should only happen for newly created 904 * files that haven't had the znode data filled 905 * in yet. 
906 */ 907 *userp = 0; 908 *groupp = 0; 909 return (0); 910 } 911 if (sa.sa_magic == BSWAP_32(SA_MAGIC)) { 912 sa.sa_magic = SA_MAGIC; 913 sa.sa_layout_info = BSWAP_16(sa.sa_layout_info); 914 swap = B_TRUE; 915 } else { 916 VERIFY3U(sa.sa_magic, ==, SA_MAGIC); 917 } 918 919 hdrsize = sa_hdrsize(&sa); 920 VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t)); 921 *userp = *((uint64_t *)((uintptr_t)data + hdrsize + 922 SA_UID_OFFSET)); 923 *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + 924 SA_GID_OFFSET)); 925 if (swap) { 926 *userp = BSWAP_64(*userp); 927 *groupp = BSWAP_64(*groupp); 928 } 929 } 930 return (0); 931 } 932 933 static void 934 fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr, 935 char *domainbuf, int buflen, uid_t *ridp) 936 { 937 uint64_t fuid; 938 const char *domain; 939 940 fuid = strtonum(fuidstr, NULL); 941 942 domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid)); 943 if (domain) 944 (void) strlcpy(domainbuf, domain, buflen); 945 else 946 domainbuf[0] = '\0'; 947 *ridp = FUID_RID(fuid); 948 } 949 950 static uint64_t 951 zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type) 952 { 953 switch (type) { 954 case ZFS_PROP_USERUSED: 955 return (DMU_USERUSED_OBJECT); 956 case ZFS_PROP_GROUPUSED: 957 return (DMU_GROUPUSED_OBJECT); 958 case ZFS_PROP_USERQUOTA: 959 return (zfsvfs->z_userquota_obj); 960 case ZFS_PROP_GROUPQUOTA: 961 return (zfsvfs->z_groupquota_obj); 962 } 963 return (0); 964 } 965 966 int 967 zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, 968 uint64_t *cookiep, void *vbuf, uint64_t *bufsizep) 969 { 970 int error; 971 zap_cursor_t zc; 972 zap_attribute_t za; 973 zfs_useracct_t *buf = vbuf; 974 uint64_t obj; 975 976 if (!dmu_objset_userspace_present(zfsvfs->z_os)) 977 return (SET_ERROR(ENOTSUP)); 978 979 obj = zfs_userquota_prop_to_obj(zfsvfs, type); 980 if (obj == 0) { 981 *bufsizep = 0; 982 return (0); 983 } 984 985 for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep); 986 (error = zap_cursor_retrieve(&zc, &za)) == 0; 987 zap_cursor_advance(&zc)) { 988 if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) > 989 *bufsizep) 990 break; 991 992 fuidstr_to_sid(zfsvfs, za.za_name, 993 buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid); 994 995 buf->zu_space = za.za_first_integer; 996 buf++; 997 } 998 if (error == ENOENT) 999 error = 0; 1000 1001 ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep); 1002 *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf; 1003 *cookiep = zap_cursor_serialize(&zc); 1004 zap_cursor_fini(&zc); 1005 return (error); 1006 } 1007 1008 /* 1009 * buf must be big enough (eg, 32 bytes) 1010 */ 1011 static int 1012 id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid, 1013 char *buf, boolean_t addok) 1014 { 1015 uint64_t fuid; 1016 int domainid = 0; 1017 1018 if (domain && domain[0]) { 1019 domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok); 1020 if (domainid == -1) 1021 return (SET_ERROR(ENOENT)); 1022 } 1023 fuid = FUID_ENCODE(domainid, rid); 1024 (void) sprintf(buf, "%llx", (longlong_t)fuid); 1025 return (0); 1026 } 1027 1028 int 1029 zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, 1030 const char *domain, uint64_t rid, uint64_t *valp) 1031 { 1032 char buf[32]; 1033 int err; 1034 uint64_t obj; 1035 1036 *valp = 0; 1037 1038 if (!dmu_objset_userspace_present(zfsvfs->z_os)) 1039 return (SET_ERROR(ENOTSUP)); 1040 1041 obj = zfs_userquota_prop_to_obj(zfsvfs, type); 1042 if (obj == 0) 1043 return (0); 1044 1045 err = id_to_fuidstr(zfsvfs, domain, rid, 
buf, B_FALSE); 1046 if (err) 1047 return (err); 1048 1049 err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp); 1050 if (err == ENOENT) 1051 err = 0; 1052 return (err); 1053 } 1054 1055 int 1056 zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, 1057 const char *domain, uint64_t rid, uint64_t quota) 1058 { 1059 char buf[32]; 1060 int err; 1061 dmu_tx_t *tx; 1062 uint64_t *objp; 1063 boolean_t fuid_dirtied; 1064 1065 if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA) 1066 return (SET_ERROR(EINVAL)); 1067 1068 if (zfsvfs->z_version < ZPL_VERSION_USERSPACE) 1069 return (SET_ERROR(ENOTSUP)); 1070 1071 objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj : 1072 &zfsvfs->z_groupquota_obj; 1073 1074 err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE); 1075 if (err) 1076 return (err); 1077 fuid_dirtied = zfsvfs->z_fuid_dirty; 1078 1079 tx = dmu_tx_create(zfsvfs->z_os); 1080 dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL); 1081 if (*objp == 0) { 1082 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, 1083 zfs_userquota_prop_prefixes[type]); 1084 } 1085 if (fuid_dirtied) 1086 zfs_fuid_txhold(zfsvfs, tx); 1087 err = dmu_tx_assign(tx, TXG_WAIT); 1088 if (err) { 1089 dmu_tx_abort(tx); 1090 return (err); 1091 } 1092 1093 mutex_enter(&zfsvfs->z_lock); 1094 if (*objp == 0) { 1095 *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA, 1096 DMU_OT_NONE, 0, tx); 1097 VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, 1098 zfs_userquota_prop_prefixes[type], 8, 1, objp, tx)); 1099 } 1100 mutex_exit(&zfsvfs->z_lock); 1101 1102 if (quota == 0) { 1103 err = zap_remove(zfsvfs->z_os, *objp, buf, tx); 1104 if (err == ENOENT) 1105 err = 0; 1106 } else { 1107 err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, "a, tx); 1108 } 1109 ASSERT(err == 0); 1110 if (fuid_dirtied) 1111 zfs_fuid_sync(zfsvfs, tx); 1112 dmu_tx_commit(tx); 1113 return (err); 1114 } 1115 1116 boolean_t 1117 zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid) 1118 { 1119 char buf[32]; 1120 uint64_t used, quota, usedobj, quotaobj; 1121 int err; 1122 1123 usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; 1124 quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; 1125 1126 if (quotaobj == 0 || zfsvfs->z_replay) 1127 return (B_FALSE); 1128 1129 (void) sprintf(buf, "%llx", (longlong_t)fuid); 1130 err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, "a); 1131 if (err != 0) 1132 return (B_FALSE); 1133 1134 err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); 1135 if (err != 0) 1136 return (B_FALSE); 1137 return (used >= quota); 1138 } 1139 1140 boolean_t 1141 zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup) 1142 { 1143 uint64_t fuid; 1144 uint64_t quotaobj; 1145 1146 quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; 1147 1148 fuid = isgroup ? zp->z_gid : zp->z_uid; 1149 1150 if (quotaobj == 0 || zfsvfs->z_replay) 1151 return (B_FALSE); 1152 1153 return (zfs_fuid_overquota(zfsvfs, isgroup, fuid)); 1154 } 1155 1156 /* 1157 * Associate this zfsvfs with the given objset, which must be owned. 1158 * This will cache a bunch of on-disk state from the objset in the 1159 * zfsvfs. 
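 * (For example: the ZPL version, normalization and case-sensitivity flags,
 * the root and unlinked-set object numbers, and the user/group quota,
 * FUID-table and shares-directory objects looked up below.)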
 */
static int
zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
{
	int error;
	uint64_t val;

	zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
	zfsvfs->z_os = os;

	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
	if (error != 0)
		return (error);
	if (zfsvfs->z_version >
	    zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
		(void) printf("Can't mount a version %lld file system "
		    "on a version %lld pool. Pool must be upgraded to mount "
		    "this file system.\n", (u_longlong_t)zfsvfs->z_version,
		    (u_longlong_t)spa_version(dmu_objset_spa(os)));
		return (SET_ERROR(ENOTSUP));
	}
	error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
	if (error != 0)
		return (error);
	zfsvfs->z_norm = (int)val;

	error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
	if (error != 0)
		return (error);
	zfsvfs->z_utf8 = (val != 0);

	error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
	if (error != 0)
		return (error);
	zfsvfs->z_case = (uint_t)val;

	/*
	 * Fold case on file systems that are always or sometimes case
	 * insensitive.
	 */
	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
	    zfsvfs->z_case == ZFS_CASE_MIXED)
		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;

	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);

	uint64_t sa_obj = 0;
	if (zfsvfs->z_use_sa) {
		/* should either have both of these objects or none */
		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
		    &sa_obj);
		if (error != 0)
			return (error);
	}

	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
	    &zfsvfs->z_attr_table);
	if (error != 0)
		return (error);

	if (zfsvfs->z_version >= ZPL_VERSION_SA)
		sa_register_update_callback(os, zfs_sa_upgrade);

	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
	    &zfsvfs->z_root);
	if (error != 0)
		return (error);
	ASSERT(zfsvfs->z_root != 0);

	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
	    &zfsvfs->z_unlinkedobj);
	if (error != 0)
		return (error);

	error = zap_lookup(os, MASTER_NODE_OBJ,
	    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
	    8, 1, &zfsvfs->z_userquota_obj);
	if (error == ENOENT)
		zfsvfs->z_userquota_obj = 0;
	else if (error != 0)
		return (error);

	error = zap_lookup(os, MASTER_NODE_OBJ,
	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
	    8, 1, &zfsvfs->z_groupquota_obj);
	if (error == ENOENT)
		zfsvfs->z_groupquota_obj = 0;
	else if (error != 0)
		return (error);

	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
	    &zfsvfs->z_fuid_obj);
	if (error == ENOENT)
		zfsvfs->z_fuid_obj = 0;
	else if (error != 0)
		return (error);

	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
	    &zfsvfs->z_shares_dir);
	if (error == ENOENT)
		zfsvfs->z_shares_dir = 0;
	else if (error != 0)
		return (error);

	/*
	 * Only use the name cache if we are looking for a
	 * name on a file system that does not require normalization
	 * or case folding.
We can also look there if we happen to be 1270 * on a non-normalizing, mixed sensitivity file system IF we 1271 * are looking for the exact name (which is always the case on 1272 * FreeBSD). 1273 */ 1274 zfsvfs->z_use_namecache = !zfsvfs->z_norm || 1275 ((zfsvfs->z_case == ZFS_CASE_MIXED) && 1276 !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER)); 1277 1278 return (0); 1279 } 1280 1281 int 1282 zfsvfs_create(const char *osname, zfsvfs_t **zfvp) 1283 { 1284 objset_t *os; 1285 zfsvfs_t *zfsvfs; 1286 int error; 1287 1288 /* 1289 * XXX: Fix struct statfs so this isn't necessary! 1290 * 1291 * The 'osname' is used as the filesystem's special node, which means 1292 * it must fit in statfs.f_mntfromname, or else it can't be 1293 * enumerated, so libzfs_mnttab_find() returns NULL, which causes 1294 * 'zfs unmount' to think it's not mounted when it is. 1295 */ 1296 if (strlen(osname) >= MNAMELEN) 1297 return (SET_ERROR(ENAMETOOLONG)); 1298 1299 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); 1300 1301 /* 1302 * We claim to always be readonly so we can open snapshots; 1303 * other ZPL code will prevent us from writing to snapshots. 1304 */ 1305 error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os); 1306 if (error) { 1307 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1308 return (error); 1309 } 1310 1311 zfsvfs->z_vfs = NULL; 1312 zfsvfs->z_parent = zfsvfs; 1313 1314 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); 1315 mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); 1316 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), 1317 offsetof(znode_t, z_link_node)); 1318 #ifdef DIAGNOSTIC 1319 rrm_init(&zfsvfs->z_teardown_lock, B_TRUE); 1320 #else 1321 rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); 1322 #endif 1323 rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); 1324 rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); 1325 for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1326 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); 1327 1328 error = zfsvfs_init(zfsvfs, os); 1329 if (error != 0) { 1330 dmu_objset_disown(os, zfsvfs); 1331 *zfvp = NULL; 1332 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1333 return (error); 1334 } 1335 1336 *zfvp = zfsvfs; 1337 return (0); 1338 } 1339 1340 static int 1341 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) 1342 { 1343 int error; 1344 1345 error = zfs_register_callbacks(zfsvfs->z_vfs); 1346 if (error) 1347 return (error); 1348 1349 zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); 1350 1351 /* 1352 * If we are not mounting (ie: online recv), then we don't 1353 * have to worry about replaying the log as we blocked all 1354 * operations out since we closed the ZIL. 1355 */ 1356 if (mounting) { 1357 boolean_t readonly; 1358 1359 /* 1360 * During replay we remove the read only flag to 1361 * allow replays to succeed. 1362 */ 1363 readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; 1364 if (readonly != 0) 1365 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; 1366 else 1367 zfs_unlinked_drain(zfsvfs); 1368 1369 /* 1370 * Parse and replay the intent log. 1371 * 1372 * Because of ziltest, this must be done after 1373 * zfs_unlinked_drain(). (Further note: ziltest 1374 * doesn't use readonly mounts, where 1375 * zfs_unlinked_drain() isn't called.) This is because 1376 * ziltest causes spa_sync() to think it's committed, 1377 * but actually it is not, so the intent log contains 1378 * many txg's worth of changes. 
1379 * 1380 * In particular, if object N is in the unlinked set in 1381 * the last txg to actually sync, then it could be 1382 * actually freed in a later txg and then reallocated 1383 * in a yet later txg. This would write a "create 1384 * object N" record to the intent log. Normally, this 1385 * would be fine because the spa_sync() would have 1386 * written out the fact that object N is free, before 1387 * we could write the "create object N" intent log 1388 * record. 1389 * 1390 * But when we are in ziltest mode, we advance the "open 1391 * txg" without actually spa_sync()-ing the changes to 1392 * disk. So we would see that object N is still 1393 * allocated and in the unlinked set, and there is an 1394 * intent log record saying to allocate it. 1395 */ 1396 if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { 1397 if (zil_replay_disable) { 1398 zil_destroy(zfsvfs->z_log, B_FALSE); 1399 } else { 1400 zfsvfs->z_replay = B_TRUE; 1401 zil_replay(zfsvfs->z_os, zfsvfs, 1402 zfs_replay_vector); 1403 zfsvfs->z_replay = B_FALSE; 1404 } 1405 } 1406 zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ 1407 } 1408 1409 /* 1410 * Set the objset user_ptr to track its zfsvfs. 1411 */ 1412 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); 1413 dmu_objset_set_user(zfsvfs->z_os, zfsvfs); 1414 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); 1415 1416 return (0); 1417 } 1418 1419 extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */ 1420 1421 void 1422 zfsvfs_free(zfsvfs_t *zfsvfs) 1423 { 1424 int i; 1425 1426 /* 1427 * This is a barrier to prevent the filesystem from going away in 1428 * zfs_znode_move() until we can safely ensure that the filesystem is 1429 * not unmounted. We consider the filesystem valid before the barrier 1430 * and invalid after the barrier. 
1431 */ 1432 rw_enter(&zfsvfs_lock, RW_READER); 1433 rw_exit(&zfsvfs_lock); 1434 1435 zfs_fuid_destroy(zfsvfs); 1436 1437 mutex_destroy(&zfsvfs->z_znodes_lock); 1438 mutex_destroy(&zfsvfs->z_lock); 1439 list_destroy(&zfsvfs->z_all_znodes); 1440 rrm_destroy(&zfsvfs->z_teardown_lock); 1441 rw_destroy(&zfsvfs->z_teardown_inactive_lock); 1442 rw_destroy(&zfsvfs->z_fuid_lock); 1443 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1444 mutex_destroy(&zfsvfs->z_hold_mtx[i]); 1445 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1446 } 1447 1448 static void 1449 zfs_set_fuid_feature(zfsvfs_t *zfsvfs) 1450 { 1451 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); 1452 if (zfsvfs->z_vfs) { 1453 if (zfsvfs->z_use_fuids) { 1454 vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); 1455 vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); 1456 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); 1457 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); 1458 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); 1459 vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); 1460 } else { 1461 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR); 1462 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); 1463 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); 1464 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); 1465 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); 1466 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE); 1467 } 1468 } 1469 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); 1470 } 1471 1472 #ifdef __NetBSD__ 1473 int 1474 #else 1475 static int 1476 #endif 1477 zfs_domount(vfs_t *vfsp, char *osname) 1478 { 1479 uint64_t recordsize, fsid_guid; 1480 int error = 0; 1481 zfsvfs_t *zfsvfs; 1482 vnode_t *vp; 1483 1484 ASSERT(vfsp); 1485 ASSERT(osname); 1486 1487 error = zfsvfs_create(osname, &zfsvfs); 1488 if (error) 1489 return (error); 1490 zfsvfs->z_vfs = vfsp; 1491 1492 #ifdef illumos 1493 /* Initialize the generic filesystem structure. */ 1494 vfsp->vfs_bcount = 0; 1495 vfsp->vfs_data = NULL; 1496 1497 if (zfs_create_unique_device(&mount_dev) == -1) { 1498 error = SET_ERROR(ENODEV); 1499 goto out; 1500 } 1501 ASSERT(vfs_devismounted(mount_dev) == 0); 1502 #endif 1503 1504 if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, 1505 NULL)) 1506 goto out; 1507 zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE; 1508 zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize; 1509 1510 vfsp->vfs_data = zfsvfs; 1511 #ifdef __FreeBSD_kernel__ 1512 vfsp->mnt_flag |= MNT_LOCAL; 1513 vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; 1514 vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES; 1515 vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED; 1516 vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */ 1517 #endif 1518 #ifdef __NetBSD__ 1519 vfsp->mnt_flag |= MNT_LOCAL; 1520 vfsp->mnt_iflag |= IMNT_MPSAFE | IMNT_NCLOOKUP; 1521 #endif 1522 1523 /* 1524 * The fsid is 64 bits, composed of an 8-bit fs type, which 1525 * separates our fsid from any other filesystem types, and a 1526 * 56-bit objset unique ID. The objset unique ID is unique to 1527 * all objsets open on this system, provided by unique_create(). 1528 * The 8-bit fs type must be put in the low bits of fsid[1] 1529 * because that's where other Solaris filesystems put it. 
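	 * A worked example with made-up numbers: for fsid_guid =
	 * 0x123456789abcd and an 8-bit fs type of 0xde, the FreeBSD branch
	 * below yields val[0] = 0x6789abcd and
	 * val[1] = ((0x123456789abcd >> 32) << 8) | 0xde = 0x12345de.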
1530 */ 1531 fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); 1532 ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); 1533 #ifdef __FreeBSD_kernel__ 1534 vfsp->vfs_fsid.val[0] = fsid_guid; 1535 vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | 1536 vfsp->mnt_vfc->vfc_typenum & 0xFF; 1537 #endif 1538 #ifdef __NetBSD__ 1539 vfsp->mnt_stat.f_fsidx.__fsid_val[0] = fsid_guid; 1540 vfsp->mnt_stat.f_fsidx.__fsid_val[1] = ((fsid_guid>>32) << 8) | 1541 makefstype(vfsp->mnt_op->vfs_name) & 0xFF; 1542 vfsp->mnt_stat.f_fsid = fsid_guid; 1543 #endif 1544 1545 /* 1546 * Set features for file system. 1547 */ 1548 zfs_set_fuid_feature(zfsvfs); 1549 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { 1550 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); 1551 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); 1552 vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); 1553 } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { 1554 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); 1555 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); 1556 } 1557 vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); 1558 1559 if (dmu_objset_is_snapshot(zfsvfs->z_os)) { 1560 uint64_t pval; 1561 1562 atime_changed_cb(zfsvfs, B_FALSE); 1563 readonly_changed_cb(zfsvfs, B_TRUE); 1564 if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL)) 1565 goto out; 1566 xattr_changed_cb(zfsvfs, pval); 1567 zfsvfs->z_issnap = B_TRUE; 1568 zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; 1569 1570 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); 1571 dmu_objset_set_user(zfsvfs->z_os, zfsvfs); 1572 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); 1573 } else { 1574 error = zfsvfs_setup(zfsvfs, B_TRUE); 1575 } 1576 1577 #ifdef __FreeBSD_kernel__ 1578 vfs_mountedfrom(vfsp, osname); 1579 #endif 1580 #ifdef __NetBSD__ 1581 set_statvfs_info("on-name", UIO_SYSSPACE, osname, UIO_SYSSPACE, "zfs", vfsp, curlwp); 1582 #endif 1583 1584 if (!zfsvfs->z_issnap) 1585 zfsctl_create(zfsvfs); 1586 out: 1587 if (error) { 1588 dmu_objset_disown(zfsvfs->z_os, zfsvfs); 1589 zfsvfs_free(zfsvfs); 1590 } else { 1591 atomic_inc_32(&zfs_active_fs_count); 1592 } 1593 1594 return (error); 1595 } 1596 1597 void 1598 zfs_unregister_callbacks(zfsvfs_t *zfsvfs) 1599 { 1600 objset_t *os = zfsvfs->z_os; 1601 1602 if (!dmu_objset_is_snapshot(os)) 1603 dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); 1604 } 1605 1606 #ifdef SECLABEL 1607 /* 1608 * Convert a decimal digit string to a uint64_t integer. 1609 */ 1610 static int 1611 str_to_uint64(char *str, uint64_t *objnum) 1612 { 1613 uint64_t num = 0; 1614 1615 while (*str) { 1616 if (*str < '0' || *str > '9') 1617 return (SET_ERROR(EINVAL)); 1618 1619 num = num*10 + *str++ - '0'; 1620 } 1621 1622 *objnum = num; 1623 return (0); 1624 } 1625 1626 /* 1627 * The boot path passed from the boot loader is in the form of 1628 * "rootpool-name/root-filesystem-object-number'. Convert this 1629 * string to a dataset name: "rootpool-name/root-filesystem-name". 
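 * For example (hypothetical names): a boot property of "rpool/85" becomes
 * "rpool/ROOT/default" if object number 85 is that root dataset; a bare
 * pool name or a non-numeric suffix is passed through unchanged.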
1630 */ 1631 static int 1632 zfs_parse_bootfs(char *bpath, char *outpath) 1633 { 1634 char *slashp; 1635 uint64_t objnum; 1636 int error; 1637 1638 if (*bpath == 0 || *bpath == '/') 1639 return (SET_ERROR(EINVAL)); 1640 1641 (void) strcpy(outpath, bpath); 1642 1643 slashp = strchr(bpath, '/'); 1644 1645 /* if no '/', just return the pool name */ 1646 if (slashp == NULL) { 1647 return (0); 1648 } 1649 1650 /* if not a number, just return the root dataset name */ 1651 if (str_to_uint64(slashp+1, &objnum)) { 1652 return (0); 1653 } 1654 1655 *slashp = '\0'; 1656 error = dsl_dsobj_to_dsname(bpath, objnum, outpath); 1657 *slashp = '/'; 1658 1659 return (error); 1660 } 1661 1662 /* 1663 * Check that the hex label string is appropriate for the dataset being 1664 * mounted into the global_zone proper. 1665 * 1666 * Return an error if the hex label string is not default or 1667 * admin_low/admin_high. For admin_low labels, the corresponding 1668 * dataset must be readonly. 1669 */ 1670 int 1671 zfs_check_global_label(const char *dsname, const char *hexsl) 1672 { 1673 if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) 1674 return (0); 1675 if (strcasecmp(hexsl, ADMIN_HIGH) == 0) 1676 return (0); 1677 if (strcasecmp(hexsl, ADMIN_LOW) == 0) { 1678 /* must be readonly */ 1679 uint64_t rdonly; 1680 1681 if (dsl_prop_get_integer(dsname, 1682 zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) 1683 return (SET_ERROR(EACCES)); 1684 return (rdonly ? 0 : EACCES); 1685 } 1686 return (SET_ERROR(EACCES)); 1687 } 1688 1689 /* 1690 * Determine whether the mount is allowed according to MAC check. 1691 * by comparing (where appropriate) label of the dataset against 1692 * the label of the zone being mounted into. If the dataset has 1693 * no label, create one. 1694 * 1695 * Returns 0 if access allowed, error otherwise (e.g. EACCES) 1696 */ 1697 static int 1698 zfs_mount_label_policy(vfs_t *vfsp, char *osname) 1699 { 1700 int error, retv; 1701 zone_t *mntzone = NULL; 1702 ts_label_t *mnt_tsl; 1703 bslabel_t *mnt_sl; 1704 bslabel_t ds_sl; 1705 char ds_hexsl[MAXNAMELEN]; 1706 1707 retv = EACCES; /* assume the worst */ 1708 1709 /* 1710 * Start by getting the dataset label if it exists. 1711 */ 1712 error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1713 1, sizeof (ds_hexsl), &ds_hexsl, NULL); 1714 if (error) 1715 return (SET_ERROR(EACCES)); 1716 1717 /* 1718 * If labeling is NOT enabled, then disallow the mount of datasets 1719 * which have a non-default label already. No other label checks 1720 * are needed. 1721 */ 1722 if (!is_system_labeled()) { 1723 if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) 1724 return (0); 1725 return (SET_ERROR(EACCES)); 1726 } 1727 1728 /* 1729 * Get the label of the mountpoint. If mounting into the global 1730 * zone (i.e. mountpoint is not within an active zone and the 1731 * zoned property is off), the label must be default or 1732 * admin_low/admin_high only; no other checks are needed. 1733 */ 1734 mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE); 1735 if (mntzone->zone_id == GLOBAL_ZONEID) { 1736 uint64_t zoned; 1737 1738 zone_rele(mntzone); 1739 1740 if (dsl_prop_get_integer(osname, 1741 zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) 1742 return (SET_ERROR(EACCES)); 1743 if (!zoned) 1744 return (zfs_check_global_label(osname, ds_hexsl)); 1745 else 1746 /* 1747 * This is the case of a zone dataset being mounted 1748 * initially, before the zone has been fully created; 1749 * allow this mount into global zone. 
1750 */ 1751 return (0); 1752 } 1753 1754 mnt_tsl = mntzone->zone_slabel; 1755 ASSERT(mnt_tsl != NULL); 1756 label_hold(mnt_tsl); 1757 mnt_sl = label2bslabel(mnt_tsl); 1758 1759 if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) { 1760 /* 1761 * The dataset doesn't have a real label, so fabricate one. 1762 */ 1763 char *str = NULL; 1764 1765 if (l_to_str_internal(mnt_sl, &str) == 0 && 1766 dsl_prop_set_string(osname, 1767 zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1768 ZPROP_SRC_LOCAL, str) == 0) 1769 retv = 0; 1770 if (str != NULL) 1771 kmem_free(str, strlen(str) + 1); 1772 } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) { 1773 /* 1774 * Now compare labels to complete the MAC check. If the 1775 * labels are equal then allow access. If the mountpoint 1776 * label dominates the dataset label, allow readonly access. 1777 * Otherwise, access is denied. 1778 */ 1779 if (blequal(mnt_sl, &ds_sl)) 1780 retv = 0; 1781 else if (bldominates(mnt_sl, &ds_sl)) { 1782 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); 1783 retv = 0; 1784 } 1785 } 1786 1787 label_rele(mnt_tsl); 1788 zone_rele(mntzone); 1789 return (retv); 1790 } 1791 #endif /* SECLABEL */ 1792 1793 #ifdef OPENSOLARIS_MOUNTROOT 1794 static int 1795 zfs_mountroot(vfs_t *vfsp, enum whymountroot why) 1796 { 1797 int error = 0; 1798 static int zfsrootdone = 0; 1799 zfsvfs_t *zfsvfs = NULL; 1800 znode_t *zp = NULL; 1801 vnode_t *vp = NULL; 1802 char *zfs_bootfs; 1803 char *zfs_devid; 1804 1805 ASSERT(vfsp); 1806 1807 /* 1808 * The filesystem that we mount as root is defined in the 1809 * boot property "zfs-bootfs" with a format of 1810 * "poolname/root-dataset-objnum". 1811 */ 1812 if (why == ROOT_INIT) { 1813 if (zfsrootdone++) 1814 return (SET_ERROR(EBUSY)); 1815 /* 1816 * the process of doing a spa_load will require the 1817 * clock to be set before we could (for example) do 1818 * something better by looking at the timestamp on 1819 * an uberblock, so just set it to -1. 1820 */ 1821 clkset(-1); 1822 1823 if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) { 1824 cmn_err(CE_NOTE, "spa_get_bootfs: can not get " 1825 "bootfs name"); 1826 return (SET_ERROR(EINVAL)); 1827 } 1828 zfs_devid = spa_get_bootprop("diskdevid"); 1829 error = spa_import_rootpool(rootfs.bo_name, zfs_devid); 1830 if (zfs_devid) 1831 spa_free_bootprop(zfs_devid); 1832 if (error) { 1833 spa_free_bootprop(zfs_bootfs); 1834 cmn_err(CE_NOTE, "spa_import_rootpool: error %d", 1835 error); 1836 return (error); 1837 } 1838 if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) { 1839 spa_free_bootprop(zfs_bootfs); 1840 cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d", 1841 error); 1842 return (error); 1843 } 1844 1845 spa_free_bootprop(zfs_bootfs); 1846 1847 if (error = vfs_lock(vfsp)) 1848 return (error); 1849 1850 if (error = zfs_domount(vfsp, rootfs.bo_name)) { 1851 cmn_err(CE_NOTE, "zfs_domount: error %d", error); 1852 goto out; 1853 } 1854 1855 zfsvfs = (zfsvfs_t *)vfsp->vfs_data; 1856 ASSERT(zfsvfs); 1857 if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) { 1858 cmn_err(CE_NOTE, "zfs_zget: error %d", error); 1859 goto out; 1860 } 1861 1862 vp = ZTOV(zp); 1863 mutex_enter(&vp->v_lock); 1864 vp->v_flag |= VROOT; 1865 mutex_exit(&vp->v_lock); 1866 rootvp = vp; 1867 1868 /* 1869 * Leave rootvp held. The root file system is never unmounted. 1870 */ 1871 1872 vfs_add((struct vnode *)0, vfsp, 1873 (vfsp->vfs_flag & VFS_RDONLY) ? 
MS_RDONLY : 0); 1874 out: 1875 vfs_unlock(vfsp); 1876 return (error); 1877 } else if (why == ROOT_REMOUNT) { 1878 readonly_changed_cb(vfsp->vfs_data, B_FALSE); 1879 vfsp->vfs_flag |= VFS_REMOUNT; 1880 1881 /* refresh mount options */ 1882 zfs_unregister_callbacks(vfsp->vfs_data); 1883 return (zfs_register_callbacks(vfsp)); 1884 1885 } else if (why == ROOT_UNMOUNT) { 1886 zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data); 1887 (void) zfs_sync(vfsp, 0, 0); 1888 return (0); 1889 } 1890 1891 /* 1892 * if "why" is equal to anything else other than ROOT_INIT, 1893 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it. 1894 */ 1895 return (SET_ERROR(ENOTSUP)); 1896 } 1897 #endif /* OPENSOLARIS_MOUNTROOT */ 1898 1899 static int 1900 getpoolname(const char *osname, char *poolname) 1901 { 1902 char *p; 1903 1904 p = strchr(osname, '/'); 1905 if (p == NULL) { 1906 if (strlen(osname) >= MAXNAMELEN) 1907 return (ENAMETOOLONG); 1908 (void) strcpy(poolname, osname); 1909 } else { 1910 if (p - osname >= MAXNAMELEN) 1911 return (ENAMETOOLONG); 1912 (void) strncpy(poolname, osname, p - osname); 1913 poolname[p - osname] = '\0'; 1914 } 1915 return (0); 1916 } 1917 1918 /*ARGSUSED*/ 1919 #ifdef illumos 1920 static int 1921 zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) 1922 #endif 1923 #ifdef __FreeBSD_kernel__ 1924 static int 1925 zfs_mount(vfs_t *vfsp) 1926 #endif 1927 #ifdef __NetBSD__ 1928 static int 1929 zfs_mount(vfs_t *vfsp, const char *path, void *data, size_t *data_len) 1930 #endif 1931 { 1932 vnode_t *mvp = vfsp->mnt_vnodecovered; 1933 char *osname; 1934 int error = 0; 1935 int canwrite; 1936 1937 #ifdef illumos 1938 if (mvp->v_type != VDIR) 1939 return (SET_ERROR(ENOTDIR)); 1940 1941 mutex_enter(&mvp->v_lock); 1942 if ((uap->flags & MS_REMOUNT) == 0 && 1943 (uap->flags & MS_OVERLAY) == 0 && 1944 (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { 1945 mutex_exit(&mvp->v_lock); 1946 return (SET_ERROR(EBUSY)); 1947 } 1948 mutex_exit(&mvp->v_lock); 1949 1950 /* 1951 * ZFS does not support passing unparsed data in via MS_DATA. 1952 * Users should use the MS_OPTIONSTR interface; this means 1953 * that all option parsing is already done and the options struct 1954 * can be interrogated. 1955 */ 1956 if ((uap->flags & MS_DATA) && uap->datalen > 0) 1957 return (SET_ERROR(EINVAL)); 1958 #endif /* illumos */ 1959 1960 #ifdef __FreeBSD_kernel__ 1961 kthread_t *td = curthread; 1962 cred_t *cr = td->td_ucred; 1963 1964 if (!prison_allow(td->td_ucred, PR_ALLOW_MOUNT_ZFS)) 1965 return (SET_ERROR(EPERM)); 1966 1967 if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL)) 1968 return (SET_ERROR(EINVAL)); 1969 1970 /* 1971 * If full-owner-access is enabled and delegated administration is 1972 * turned on, we must set nosuid. 
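	 * For example (illustrative only): with the vfs.zfs.super_owner
	 * sysctl set and "zfs allow -u alice mount tank/home/alice" granted,
	 * alice can mount her own dataset, but the mount options are
	 * sanitized (nosuid) by secpolicy_fs_mount_clearopts().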
1973 */ 1974 if (zfs_super_owner && 1975 dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) { 1976 secpolicy_fs_mount_clearopts(cr, vfsp); 1977 } 1978 1979 #endif /* __FreeBSD_kernel__ */ 1980 1981 #ifdef __NetBSD__ 1982 cred_t *cr = CRED(); 1983 struct mounta *uap = data; 1984 1985 if (uap == NULL) 1986 return (SET_ERROR(EINVAL)); 1987 1988 if (*data_len < sizeof *uap) 1989 return (SET_ERROR(EINVAL)); 1990 1991 if (mvp->v_type != VDIR) 1992 return (SET_ERROR(ENOTDIR)); 1993 1994 mutex_enter(mvp->v_interlock); 1995 if ((uap->flags & MS_REMOUNT) == 0 && 1996 (uap->flags & MS_OVERLAY) == 0 && 1997 (vrefcnt(mvp) != 1 || (mvp->v_flag & VROOT))) { 1998 mutex_exit(mvp->v_interlock); 1999 return (SET_ERROR(EBUSY)); 2000 } 2001 mutex_exit(mvp->v_interlock); 2002 2003 osname = PNBUF_GET(); 2004 strlcpy(osname, uap->fspec, strlen(uap->fspec) + 1); 2005 #endif /* __NetBSD__ */ 2006 2007 /* 2008 * Check for mount privilege? 2009 * 2010 * If we don't have privilege then see if 2011 * we have local permission to allow it 2012 */ 2013 error = secpolicy_fs_mount(cr, mvp, vfsp); 2014 if (error) { 2015 if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0) 2016 goto out; 2017 2018 if (!(vfsp->vfs_flag & MS_REMOUNT)) { 2019 vattr_t vattr; 2020 2021 /* 2022 * Make sure user is the owner of the mount point 2023 * or has sufficient privileges. 2024 */ 2025 2026 vattr.va_mask = AT_UID; 2027 2028 #ifdef __FreeBSD_kernel__ 2029 vn_lock(mvp, LK_SHARED | LK_RETRY); 2030 if (VOP_GETATTR(mvp, &vattr, cr)) { 2031 VOP_UNLOCK(mvp, 0); 2032 goto out; 2033 } 2034 2035 if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && 2036 VOP_ACCESS(mvp, VWRITE, cr, td) != 0) { 2037 VOP_UNLOCK(mvp, 0); 2038 goto out; 2039 } 2040 VOP_UNLOCK(mvp, 0); 2041 #endif 2042 #ifdef __NetBSD__ 2043 vn_lock(mvp, LK_SHARED | LK_RETRY); 2044 if (VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) { 2045 VOP_UNLOCK(mvp, 0); 2046 goto out; 2047 } 2048 2049 if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && 2050 VOP_ACCESS(mvp, VWRITE, cr) != 0) { 2051 VOP_UNLOCK(mvp, 0); 2052 goto out; 2053 } 2054 VOP_UNLOCK(mvp, 0); 2055 #endif 2056 } 2057 2058 secpolicy_fs_mount_clearopts(cr, vfsp); 2059 } 2060 2061 /* 2062 * Refuse to mount a filesystem if we are in a local zone and the 2063 * dataset is not visible. 2064 */ 2065 if (!INGLOBALZONE(curthread) && 2066 (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { 2067 error = SET_ERROR(EPERM); 2068 goto out; 2069 } 2070 2071 #ifdef SECLABEL 2072 error = zfs_mount_label_policy(vfsp, osname); 2073 if (error) 2074 goto out; 2075 #endif 2076 2077 #ifdef __FreeBSD_kernel__ 2078 vfsp->vfs_flag |= MNT_NFS4ACLS; 2079 #endif 2080 #ifdef __NetBSD__ 2081 vfsp->mnt_iflag |= IMNT_MPSAFE | IMNT_NCLOOKUP; 2082 #endif 2083 2084 /* 2085 * When doing a remount, we simply refresh our temporary properties 2086 * according to those options set in the current VFS options. 2087 */ 2088 if (vfsp->vfs_flag & MS_REMOUNT) { 2089 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2090 2091 /* 2092 * Refresh mount options with z_teardown_lock blocking I/O while 2093 * the filesystem is in an inconsistent state. 2094 * The lock also serializes this code with filesystem 2095 * manipulations between entry to zfs_suspend_fs() and return 2096 * from zfs_resume_fs(). 
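	 * (The other vops in this file enter via ZFS_ENTER(), which takes
	 * z_teardown_lock as a reader, so taking it as a writer here holds
	 * new vops off while the callbacks are re-registered.)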
2097 */ 2098 rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); 2099 zfs_unregister_callbacks(zfsvfs); 2100 error = zfs_register_callbacks(vfsp); 2101 rrm_exit(&zfsvfs->z_teardown_lock, FTAG); 2102 goto out; 2103 } 2104 2105 #ifdef __FreeBSD_kernel__ 2106 /* Initial root mount: try hard to import the requested root pool. */ 2107 if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 && 2108 (vfsp->vfs_flag & MNT_UPDATE) == 0) { 2109 char pname[MAXNAMELEN]; 2110 2111 error = getpoolname(osname, pname); 2112 if (error == 0) 2113 error = spa_import_rootpool(pname); 2114 if (error) 2115 goto out; 2116 } 2117 #endif 2118 2119 DROP_GIANT(); 2120 error = zfs_domount(vfsp, osname); 2121 PICKUP_GIANT(); 2122 2123 #ifdef illumos 2124 /* 2125 * Add an extra VFS_HOLD on our parent vfs so that it can't 2126 * disappear due to a forced unmount. 2127 */ 2128 if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap) 2129 VFS_HOLD(mvp->v_vfsp); 2130 #endif 2131 2132 #ifdef __NetBSD__ 2133 /* setup zfs mount info */ 2134 strlcpy(vfsp->mnt_stat.f_mntfromname, osname, 2135 sizeof(vfsp->mnt_stat.f_mntfromname)); 2136 set_statvfs_info(path, UIO_USERSPACE, vfsp->mnt_stat.f_mntfromname, 2137 UIO_SYSSPACE, vfsp->mnt_op->vfs_name, vfsp, curlwp); 2138 #endif 2139 2140 out: 2141 return (error); 2142 } 2143 2144 #ifdef __FreeBSD_kernel__ 2145 static int 2146 zfs_statfs(vfs_t *vfsp, struct statfs *statp) 2147 #endif 2148 #ifdef __NetBSD__ 2149 static int 2150 zfs_statvfs(vfs_t *vfsp, struct statvfs *statp) 2151 #endif 2152 { 2153 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2154 uint64_t refdbytes, availbytes, usedobjs, availobjs; 2155 2156 #ifdef __FreeBSD_kernel__ 2157 statp->f_version = STATFS_VERSION; 2158 #endif 2159 2160 ZFS_ENTER(zfsvfs); 2161 2162 dmu_objset_space(zfsvfs->z_os, 2163 &refdbytes, &availbytes, &usedobjs, &availobjs); 2164 2165 /* 2166 * The underlying storage pool actually uses multiple block sizes. 2167 * We report the fragsize as the smallest block size we support, 2168 * and we report our blocksize as the filesystem's maximum blocksize. 2169 */ 2170 statp->f_bsize = SPA_MINBLOCKSIZE; 2171 #ifdef __NetBSD__ 2172 statp->f_frsize = SPA_MINBLOCKSIZE; 2173 #endif 2174 statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize; 2175 2176 /* 2177 * The following report "total" blocks of various kinds in the 2178 * file system, but reported in terms of f_frsize - the 2179 * "fragment" size. 2180 */ 2181 2182 statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; 2183 statp->f_bfree = availbytes / statp->f_bsize; 2184 statp->f_bavail = statp->f_bfree; /* no root reservation */ 2185 2186 /* 2187 * statvfs() should really be called statufs(), because it assumes 2188 * static metadata. ZFS doesn't preallocate files, so the best 2189 * we can do is report the max that could possibly fit in f_files, 2190 * and that minus the number actually used in f_ffree. 2191 * For f_ffree, report the smaller of the number of object available 2192 * and the number of blocks (each object will take at least a block). 2193 */ 2194 statp->f_ffree = MIN(availobjs, statp->f_bfree); 2195 #ifndef __FreeBSD__ 2196 statp->f_favail = statp->f_ffree; /* no "root reservation" */ 2197 #endif 2198 statp->f_files = statp->f_ffree + usedobjs; 2199 2200 #ifdef __FreeBSD__ 2201 (void) cmpldev(&d32, vfsp->vfs_dev); 2202 statp->f_fsid = d32; 2203 #endif 2204 #ifdef __NetBSD__ 2205 statp->f_fsid = vfsp->mnt_stat.f_fsid; 2206 statp->f_fsidx = vfsp->mnt_stat.f_fsidx; 2207 #endif 2208 2209 /* 2210 * We're a zfs filesystem. 
2211 */ 2212 (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename)); 2213 2214 strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, 2215 sizeof(statp->f_mntfromname)); 2216 strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, 2217 sizeof(statp->f_mntonname)); 2218 2219 #ifdef __FreeBSD_kernel__ 2220 statp->f_namemax = MAXNAMELEN - 1; 2221 #endif 2222 #ifdef __NetBSD__ 2223 statp->f_namemax = ZFS_MAXNAMELEN; 2224 #endif 2225 2226 ZFS_EXIT(zfsvfs); 2227 return (0); 2228 } 2229 2230 static int 2231 zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) 2232 { 2233 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2234 znode_t *rootzp; 2235 int error; 2236 2237 ZFS_ENTER(zfsvfs); 2238 2239 error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); 2240 if (error == 0) 2241 *vpp = ZTOV(rootzp); 2242 2243 ZFS_EXIT(zfsvfs); 2244 2245 if (error == 0) { 2246 error = vn_lock(*vpp, flags); 2247 if (error != 0) { 2248 VN_RELE(*vpp); 2249 *vpp = NULL; 2250 } 2251 } 2252 return (error); 2253 } 2254 2255 /* 2256 * Teardown the zfsvfs::z_os. 2257 * 2258 * Note, if 'unmounting' if FALSE, we return with the 'z_teardown_lock' 2259 * and 'z_teardown_inactive_lock' held. 2260 */ 2261 static int 2262 zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) 2263 { 2264 znode_t *zp; 2265 2266 rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); 2267 2268 if (!unmounting) { 2269 /* 2270 * We purge the parent filesystem's vfsp as the parent 2271 * filesystem and all of its snapshots have their vnode's 2272 * v_vfsp set to the parent's filesystem's vfsp. Note, 2273 * 'z_parent' is self referential for non-snapshots. 2274 */ 2275 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); 2276 #ifdef FREEBSD_NAMECACHE 2277 cache_purgevfs(zfsvfs->z_parent->z_vfs, true); 2278 #endif 2279 } 2280 2281 /* 2282 * Close the zil. NB: Can't close the zil while zfs_inactive 2283 * threads are blocked as zil_close can call zfs_inactive. 2284 */ 2285 if (zfsvfs->z_log) { 2286 zil_close(zfsvfs->z_log); 2287 zfsvfs->z_log = NULL; 2288 } 2289 2290 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); 2291 2292 /* 2293 * If we are not unmounting (ie: online recv) and someone already 2294 * unmounted this file system while we were doing the switcheroo, 2295 * or a reopen of z_os failed then just bail out now. 2296 */ 2297 if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { 2298 rw_exit(&zfsvfs->z_teardown_inactive_lock); 2299 rrm_exit(&zfsvfs->z_teardown_lock, FTAG); 2300 return (SET_ERROR(EIO)); 2301 } 2302 2303 /* 2304 * At this point there are no vops active, and any new vops will 2305 * fail with EIO since we have z_teardown_lock for writer (only 2306 * relavent for forced unmount). 2307 * 2308 * Release all holds on dbufs. 2309 */ 2310 mutex_enter(&zfsvfs->z_znodes_lock); 2311 for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; 2312 zp = list_next(&zfsvfs->z_all_znodes, zp)) 2313 if (zp->z_sa_hdl) { 2314 #ifdef __NetBSD__ 2315 ASSERT(vrefcnt(ZTOV(zp)) >= 0); 2316 #else 2317 ASSERT(ZTOV(zp)->v_count >= 0); 2318 #endif 2319 zfs_znode_dmu_fini(zp); 2320 } 2321 mutex_exit(&zfsvfs->z_znodes_lock); 2322 2323 /* 2324 * If we are unmounting, set the unmounted flag and let new vops 2325 * unblock. zfs_inactive will have the unmounted behavior, and all 2326 * other vops will fail with EIO. 
2327 */ 2328 if (unmounting) { 2329 zfsvfs->z_unmounted = B_TRUE; 2330 rrm_exit(&zfsvfs->z_teardown_lock, FTAG); 2331 rw_exit(&zfsvfs->z_teardown_inactive_lock); 2332 } 2333 2334 /* 2335 * z_os will be NULL if there was an error in attempting to reopen 2336 * zfsvfs, so just return as the properties had already been 2337 * unregistered and cached data had been evicted before. 2338 */ 2339 if (zfsvfs->z_os == NULL) 2340 return (0); 2341 2342 /* 2343 * Unregister properties. 2344 */ 2345 zfs_unregister_callbacks(zfsvfs); 2346 2347 /* 2348 * Evict cached data 2349 */ 2350 if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) && 2351 !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)) 2352 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); 2353 dmu_objset_evict_dbufs(zfsvfs->z_os); 2354 2355 return (0); 2356 } 2357 2358 /*ARGSUSED*/ 2359 static int 2360 zfs_umount(vfs_t *vfsp, int fflag) 2361 { 2362 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2363 objset_t *os; 2364 int ret; 2365 #ifdef __FreeBSD_kernel__ 2366 kthread_t *td = curthread; 2367 cred_t *cr = td->td_ucred; 2368 #endif 2369 #ifdef __NetBSD__ 2370 cred_t *cr = CRED(); 2371 #endif 2372 2373 ret = secpolicy_fs_unmount(cr, vfsp); 2374 if (ret) { 2375 if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource), 2376 ZFS_DELEG_PERM_MOUNT, cr)) 2377 return (ret); 2378 } 2379 2380 /* 2381 * We purge the parent filesystem's vfsp as the parent filesystem 2382 * and all of its snapshots have their vnode's v_vfsp set to the 2383 * parent's filesystem's vfsp. Note, 'z_parent' is self 2384 * referential for non-snapshots. 2385 */ 2386 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); 2387 2388 /* 2389 * Unmount any snapshots mounted under .zfs before unmounting the 2390 * dataset itself. 2391 */ 2392 if (zfsvfs->z_ctldir != NULL) { 2393 if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) 2394 return (ret); 2395 } 2396 2397 if (fflag & MS_FORCE) { 2398 /* 2399 * Mark file system as unmounted before calling 2400 * vflush(FORCECLOSE). This way we ensure no future vnops 2401 * will be called and risk operating on DOOMED vnodes. 2402 */ 2403 rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); 2404 zfsvfs->z_unmounted = B_TRUE; 2405 rrm_exit(&zfsvfs->z_teardown_lock, FTAG); 2406 } 2407 2408 /* 2409 * Flush all the files. 2410 */ 2411 #ifdef __FreeBSD_kernel__ 2412 ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td); 2413 #endif 2414 #ifdef __NetBSD__ 2415 ret = vflush(vfsp, NULL, (fflag & MS_FORCE) ? FORCECLOSE : 0); 2416 #endif 2417 if (ret != 0) 2418 return (ret); 2419 2420 #ifdef illumos 2421 if (!(fflag & MS_FORCE)) { 2422 /* 2423 * Check the number of active vnodes in the file system. 2424 * Our count is maintained in the vfs structure, but the 2425 * number is off by 1 to indicate a hold on the vfs 2426 * structure itself. 2427 * 2428 * The '.zfs' directory maintains a reference of its 2429 * own, and any active references underneath are 2430 * reflected in the vnode count. 2431 */ 2432 if (zfsvfs->z_ctldir == NULL) { 2433 if (vfsp->vfs_count > 1) 2434 return (SET_ERROR(EBUSY)); 2435 } else { 2436 if (vfsp->vfs_count > 2 || 2437 zfsvfs->z_ctldir->v_count > 1) 2438 return (SET_ERROR(EBUSY)); 2439 } 2440 } 2441 #endif 2442 2443 VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); 2444 os = zfsvfs->z_os; 2445 2446 /* 2447 * z_os will be NULL if there was an error in 2448 * attempting to reopen zfsvfs. 2449 */ 2450 if (os != NULL) { 2451 /* 2452 * Unset the objset user_ptr. 
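	 * (The user pointer is what lets an objset be mapped back to its
	 * zfsvfs; clearing it under os_user_ptr_lock prevents later lookups
	 * from handing out a zfsvfs that is being torn down.)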
2453 */ 2454 mutex_enter(&os->os_user_ptr_lock); 2455 dmu_objset_set_user(os, NULL); 2456 mutex_exit(&os->os_user_ptr_lock); 2457 2458 /* 2459 * Finally release the objset 2460 */ 2461 dmu_objset_disown(os, zfsvfs); 2462 } 2463 2464 /* 2465 * We can now safely destroy the '.zfs' directory node. 2466 */ 2467 if (zfsvfs->z_ctldir != NULL) 2468 zfsctl_destroy(zfsvfs); 2469 zfs_freevfs(vfsp); 2470 2471 return (0); 2472 } 2473 2474 static int 2475 zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) 2476 { 2477 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2478 znode_t *zp; 2479 int err; 2480 2481 /* 2482 * zfs_zget() can't operate on virtual entries like .zfs/ or 2483 * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP. 2484 * This will make NFS to switch to LOOKUP instead of using VGET. 2485 */ 2486 if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR || 2487 (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir)) 2488 return (EOPNOTSUPP); 2489 2490 ZFS_ENTER(zfsvfs); 2491 err = zfs_zget(zfsvfs, ino, &zp); 2492 if (err == 0 && zp->z_unlinked) { 2493 VN_RELE(ZTOV(zp)); 2494 err = EINVAL; 2495 } 2496 if (err == 0) 2497 *vpp = ZTOV(zp); 2498 ZFS_EXIT(zfsvfs); 2499 if (err == 0) 2500 err = vn_lock(*vpp, flags); 2501 if (err != 0) 2502 *vpp = NULL; 2503 2504 return (err); 2505 } 2506 2507 #ifdef __FreeBSD_kernel__ 2508 static int 2509 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, 2510 struct ucred **credanonp, int *numsecflavors, int **secflavors) 2511 { 2512 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2513 2514 /* 2515 * If this is regular file system vfsp is the same as 2516 * zfsvfs->z_parent->z_vfs, but if it is snapshot, 2517 * zfsvfs->z_parent->z_vfs represents parent file system 2518 * which we have to use here, because only this file system 2519 * has mnt_export configured. 2520 */ 2521 return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp, 2522 credanonp, numsecflavors, secflavors)); 2523 } 2524 2525 CTASSERT(SHORT_FID_LEN <= sizeof(struct fid)); 2526 CTASSERT(LONG_FID_LEN <= sizeof(struct fid)); 2527 #endif 2528 2529 #ifdef __FreeBSD_kernel__ 2530 static int 2531 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) 2532 { 2533 struct componentname cn; 2534 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2535 znode_t *zp; 2536 vnode_t *dvp; 2537 uint64_t object = 0; 2538 uint64_t fid_gen = 0; 2539 uint64_t gen_mask; 2540 uint64_t zp_gen; 2541 int i, err; 2542 2543 *vpp = NULL; 2544 2545 ZFS_ENTER(zfsvfs); 2546 2547 /* 2548 * On FreeBSD we can get snapshot's mount point or its parent file 2549 * system mount point depending if snapshot is already mounted or not. 
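	 *
	 * A short fid carries only the object number and generation; a long
	 * fid, generated for objects that live in a snapshot, additionally
	 * carries the snapshot's objset id and generation.  The multi-byte
	 * fields are stored a byte at a time in little-endian order, which
	 * is why they are reassembled below with shifts of (8 * i); for
	 * example (illustrative values only) zf_setid[] = { 0x2a, 0x01, 0,
	 * 0, 0, 0 } decodes to objsetid 0x12a.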
2550 */ 2551 if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) { 2552 zfid_long_t *zlfid = (zfid_long_t *)fidp; 2553 uint64_t objsetid = 0; 2554 uint64_t setgen = 0; 2555 2556 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 2557 objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); 2558 2559 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 2560 setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); 2561 2562 ZFS_EXIT(zfsvfs); 2563 2564 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); 2565 if (err) 2566 return (SET_ERROR(EINVAL)); 2567 ZFS_ENTER(zfsvfs); 2568 } 2569 2570 if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { 2571 zfid_short_t *zfid = (zfid_short_t *)fidp; 2572 2573 for (i = 0; i < sizeof (zfid->zf_object); i++) 2574 object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); 2575 2576 for (i = 0; i < sizeof (zfid->zf_gen); i++) 2577 fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); 2578 } else { 2579 ZFS_EXIT(zfsvfs); 2580 return (SET_ERROR(EINVAL)); 2581 } 2582 2583 /* 2584 * A zero fid_gen means we are in .zfs or the .zfs/snapshot 2585 * directory tree. If the object == zfsvfs->z_shares_dir, then 2586 * we are in the .zfs/shares directory tree. 2587 */ 2588 if ((fid_gen == 0 && 2589 (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) || 2590 (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) { 2591 ZFS_EXIT(zfsvfs); 2592 VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp)); 2593 if (object == ZFSCTL_INO_SNAPDIR) { 2594 cn.cn_nameptr = "snapshot"; 2595 cn.cn_namelen = strlen(cn.cn_nameptr); 2596 cn.cn_nameiop = LOOKUP; 2597 cn.cn_flags = ISLASTCN | LOCKLEAF; 2598 cn.cn_lkflags = flags; 2599 VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); 2600 vput(dvp); 2601 } else if (object == zfsvfs->z_shares_dir) { 2602 /* 2603 * XXX This branch must not be taken, 2604 * if it is, then the lookup below will 2605 * explode. 2606 */ 2607 cn.cn_nameptr = "shares"; 2608 cn.cn_namelen = strlen(cn.cn_nameptr); 2609 cn.cn_nameiop = LOOKUP; 2610 cn.cn_flags = ISLASTCN; 2611 cn.cn_lkflags = flags; 2612 VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); 2613 vput(dvp); 2614 } else { 2615 *vpp = dvp; 2616 } 2617 return (err); 2618 } 2619 2620 gen_mask = -1ULL >> (64 - 8 * i); 2621 2622 dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); 2623 if (err = zfs_zget(zfsvfs, object, &zp)) { 2624 ZFS_EXIT(zfsvfs); 2625 return (err); 2626 } 2627 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, 2628 sizeof (uint64_t)); 2629 zp_gen = zp_gen & gen_mask; 2630 if (zp_gen == 0) 2631 zp_gen = 1; 2632 if (zp->z_unlinked || zp_gen != fid_gen) { 2633 dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); 2634 VN_RELE(ZTOV(zp)); 2635 ZFS_EXIT(zfsvfs); 2636 return (SET_ERROR(EINVAL)); 2637 } 2638 2639 *vpp = ZTOV(zp); 2640 ZFS_EXIT(zfsvfs); 2641 err = vn_lock(*vpp, flags); 2642 if (err == 0) 2643 vnode_create_vobject(*vpp, zp->z_size, curthread); 2644 else 2645 *vpp = NULL; 2646 return (err); 2647 } 2648 #endif /* __FreeBSD_kernel__ */ 2649 2650 /* 2651 * Block out VOPs and close zfsvfs_t::z_os 2652 * 2653 * Note, if successful, then we return with the 'z_teardown_lock' and 2654 * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying 2655 * dataset and objset intact so that they can be atomically handed off during 2656 * a subsequent rollback or recv operation and the resume thereafter. 
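 *
 * A sketch of the intended calling pattern (illustrative only):
 *
 *	error = zfs_suspend_fs(zfsvfs);
 *	if (error == 0) {
 *		...roll back or receive into the dataset...
 *		error = zfs_resume_fs(zfsvfs, ds);
 *	}
 *
 * where 'ds' is the dataset already owned and long-held by this zfsvfs,
 * as verified at the top of zfs_resume_fs() below.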
2657 */ 2658 int 2659 zfs_suspend_fs(zfsvfs_t *zfsvfs) 2660 { 2661 int error; 2662 2663 #ifdef __NetBSD__ 2664 if ((error = vfs_suspend(zfsvfs->z_vfs, 0)) != 0) 2665 return error; 2666 if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) { 2667 vfs_resume(zfsvfs->z_vfs); 2668 return (error); 2669 } 2670 #else 2671 if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) 2672 return (error); 2673 #endif 2674 2675 return (0); 2676 } 2677 2678 /* 2679 * Rebuild SA and release VOPs. Note that ownership of the underlying dataset 2680 * is an invariant across any of the operations that can be performed while the 2681 * filesystem was suspended. Whether it succeeded or failed, the preconditions 2682 * are the same: the relevant objset and associated dataset are owned by 2683 * zfsvfs, held, and long held on entry. 2684 */ 2685 #ifdef __NetBSD__ 2686 static bool 2687 zfs_resume_selector(void *cl, struct vnode *vp) 2688 { 2689 2690 if (zfsctl_is_node(vp)) 2691 return false; 2692 return (VTOZ(vp)->z_sa_hdl == NULL); 2693 } 2694 #endif 2695 int 2696 zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) 2697 { 2698 int err; 2699 znode_t *zp; 2700 2701 ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); 2702 ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); 2703 2704 /* 2705 * We already own this, so just update the objset_t, as the one we 2706 * had before may have been evicted. 2707 */ 2708 objset_t *os; 2709 VERIFY3P(ds->ds_owner, ==, zfsvfs); 2710 VERIFY(dsl_dataset_long_held(ds)); 2711 VERIFY0(dmu_objset_from_ds(ds, &os)); 2712 2713 err = zfsvfs_init(zfsvfs, os); 2714 if (err != 0) 2715 goto bail; 2716 2717 VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); 2718 2719 zfs_set_fuid_feature(zfsvfs); 2720 2721 /* 2722 * Attempt to re-establish all the active znodes with 2723 * their dbufs. If a zfs_rezget() fails, then we'll let 2724 * any potential callers discover that via ZFS_ENTER_VERIFY_VP 2725 * when they try to use their znode. 2726 */ 2727 mutex_enter(&zfsvfs->z_znodes_lock); 2728 for (zp = list_head(&zfsvfs->z_all_znodes); zp; 2729 zp = list_next(&zfsvfs->z_all_znodes, zp)) { 2730 (void) zfs_rezget(zp); 2731 } 2732 mutex_exit(&zfsvfs->z_znodes_lock); 2733 2734 bail: 2735 /* release the VOPs */ 2736 rw_exit(&zfsvfs->z_teardown_inactive_lock); 2737 rrm_exit(&zfsvfs->z_teardown_lock, FTAG); 2738 #ifdef __NetBSD__ 2739 struct vnode_iterator *marker; 2740 vnode_t *vp; 2741 2742 vfs_vnode_iterator_init(zfsvfs->z_vfs, &marker); 2743 while ((vp = vfs_vnode_iterator_next(marker, 2744 zfs_resume_selector, NULL))) { 2745 vgone(vp); 2746 } 2747 vfs_vnode_iterator_destroy(marker); 2748 vfs_resume(zfsvfs->z_vfs); 2749 #endif 2750 2751 if (err) { 2752 /* 2753 * Since we couldn't setup the sa framework, try to force 2754 * unmount this file system. 2755 */ 2756 if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) { 2757 vfs_ref(zfsvfs->z_vfs); 2758 (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread); 2759 } 2760 } 2761 return (err); 2762 } 2763 2764 static void 2765 zfs_freevfs(vfs_t *vfsp) 2766 { 2767 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2768 2769 #ifdef illumos 2770 /* 2771 * If this is a snapshot, we have an extra VFS_HOLD on our parent 2772 * from zfs_mount(). Release it here. If we came through 2773 * zfs_mountroot() instead, we didn't grab an extra hold, so 2774 * skip the VFS_RELE for rootvfs. 
2775 */ 2776 if (zfsvfs->z_issnap && (vfsp != rootvfs)) 2777 VFS_RELE(zfsvfs->z_parent->z_vfs); 2778 #endif 2779 2780 zfsvfs_free(zfsvfs); 2781 2782 atomic_dec_32(&zfs_active_fs_count); 2783 } 2784 2785 #ifdef __FreeBSD_kernel__ 2786 #ifdef __i386__ 2787 static int desiredvnodes_backup; 2788 #endif 2789 2790 static void 2791 zfs_vnodes_adjust(void) 2792 { 2793 #ifdef __i386__ 2794 int newdesiredvnodes; 2795 2796 desiredvnodes_backup = desiredvnodes; 2797 2798 /* 2799 * We calculate newdesiredvnodes the same way it is done in 2800 * vntblinit(). If it is equal to desiredvnodes, it means that 2801 * it wasn't tuned by the administrator and we can tune it down. 2802 */ 2803 newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 * 2804 vm_kmem_size / (5 * (sizeof(struct vm_object) + 2805 sizeof(struct vnode)))); 2806 if (newdesiredvnodes == desiredvnodes) 2807 desiredvnodes = (3 * newdesiredvnodes) / 4; 2808 #endif 2809 } 2810 2811 static void 2812 zfs_vnodes_adjust_back(void) 2813 { 2814 2815 #ifdef __i386__ 2816 desiredvnodes = desiredvnodes_backup; 2817 #endif 2818 } 2819 #endif /* __FreeBSD_kernel__ */ 2820 2821 #ifdef __NetBSD__ 2822 static void 2823 zfs_vnodes_adjust(void) 2824 { 2825 } 2826 2827 static void 2828 zfs_vnodes_adjust_back(void) 2829 { 2830 } 2831 #endif 2832 2833 void 2834 zfs_init(void) 2835 { 2836 2837 printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n"); 2838 2839 /* 2840 * Initialize .zfs directory structures 2841 */ 2842 zfsctl_init(); 2843 2844 /* 2845 * Initialize znode cache, vnode ops, etc... 2846 */ 2847 zfs_znode_init(); 2848 2849 /* 2850 * Reduce number of vnodes. Originally number of vnodes is calculated 2851 * with UFS inode in mind. We reduce it here, because it's too big for 2852 * ZFS/i386. 2853 */ 2854 zfs_vnodes_adjust(); 2855 2856 dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); 2857 } 2858 2859 void 2860 zfs_fini(void) 2861 { 2862 zfsctl_fini(); 2863 zfs_znode_fini(); 2864 zfs_vnodes_adjust_back(); 2865 } 2866 2867 int 2868 zfs_busy(void) 2869 { 2870 return (zfs_active_fs_count != 0); 2871 } 2872 2873 int 2874 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) 2875 { 2876 int error; 2877 objset_t *os = zfsvfs->z_os; 2878 dmu_tx_t *tx; 2879 2880 if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) 2881 return (SET_ERROR(EINVAL)); 2882 2883 if (newvers < zfsvfs->z_version) 2884 return (SET_ERROR(EINVAL)); 2885 2886 if (zfs_spa_version_map(newvers) > 2887 spa_version(dmu_objset_spa(zfsvfs->z_os))) 2888 return (SET_ERROR(ENOTSUP)); 2889 2890 tx = dmu_tx_create(os); 2891 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); 2892 if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { 2893 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, 2894 ZFS_SA_ATTRS); 2895 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 2896 } 2897 error = dmu_tx_assign(tx, TXG_WAIT); 2898 if (error) { 2899 dmu_tx_abort(tx); 2900 return (error); 2901 } 2902 2903 error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 2904 8, 1, &newvers, tx); 2905 2906 if (error) { 2907 dmu_tx_commit(tx); 2908 return (error); 2909 } 2910 2911 if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { 2912 uint64_t sa_obj; 2913 2914 ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, 2915 SPA_VERSION_SA); 2916 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, 2917 DMU_OT_NONE, 0, tx); 2918 2919 error = zap_add(os, MASTER_NODE_OBJ, 2920 ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); 2921 ASSERT0(error); 2922 2923 VERIFY(0 == sa_set_sa_object(os, sa_obj)); 2924 
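		/*
		 * Register the SA update callback so that existing znodes
		 * are migrated to system-attribute layouts the next time
		 * they are modified (see zfs_sa_upgrade()).
		 */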
sa_register_update_callback(os, zfs_sa_upgrade); 2925 } 2926 2927 spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, 2928 "from %llu to %llu", zfsvfs->z_version, newvers); 2929 2930 dmu_tx_commit(tx); 2931 2932 zfsvfs->z_version = newvers; 2933 2934 zfs_set_fuid_feature(zfsvfs); 2935 2936 return (0); 2937 } 2938 2939 /* 2940 * Read a property stored within the master node. 2941 */ 2942 int 2943 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) 2944 { 2945 const char *pname; 2946 int error = ENOENT; 2947 2948 /* 2949 * Look up the file system's value for the property. For the 2950 * version property, we look up a slightly different string. 2951 */ 2952 if (prop == ZFS_PROP_VERSION) 2953 pname = ZPL_VERSION_STR; 2954 else 2955 pname = zfs_prop_to_name(prop); 2956 2957 if (os != NULL) 2958 error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); 2959 2960 if (error == ENOENT) { 2961 /* No value set, use the default value */ 2962 switch (prop) { 2963 case ZFS_PROP_VERSION: 2964 *value = ZPL_VERSION; 2965 break; 2966 case ZFS_PROP_NORMALIZE: 2967 case ZFS_PROP_UTF8ONLY: 2968 *value = 0; 2969 break; 2970 case ZFS_PROP_CASE: 2971 *value = ZFS_CASE_SENSITIVE; 2972 break; 2973 default: 2974 return (error); 2975 } 2976 error = 0; 2977 } 2978 return (error); 2979 } 2980 2981 #if defined(__FreeBSD_kernel__) || defined(__NetBSD__) 2982 #ifdef _KERNEL 2983 void 2984 zfsvfs_update_fromname(const char *oldname, const char *newname) 2985 { 2986 char tmpbuf[MAXPATHLEN]; 2987 struct mount *mp; 2988 char *fromname; 2989 size_t oldlen; 2990 2991 oldlen = strlen(oldname); 2992 2993 #ifdef __NetBSD__ 2994 mount_iterator_t *iter; 2995 mountlist_iterator_init(&iter); 2996 while ((mp = mountlist_iterator_next(iter)) != NULL) { 2997 #else 2998 mtx_lock(&mountlist_mtx); 2999 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 3000 #endif 3001 fromname = mp->mnt_stat.f_mntfromname; 3002 if (strcmp(fromname, oldname) == 0) { 3003 (void)strlcpy(fromname, newname, 3004 sizeof(mp->mnt_stat.f_mntfromname)); 3005 continue; 3006 } 3007 if (strncmp(fromname, oldname, oldlen) == 0 && 3008 (fromname[oldlen] == '/' || fromname[oldlen] == '@')) { 3009 (void)snprintf(tmpbuf, sizeof(tmpbuf), "%s%s", 3010 newname, fromname + oldlen); 3011 (void)strlcpy(fromname, tmpbuf, 3012 sizeof(mp->mnt_stat.f_mntfromname)); 3013 continue; 3014 } 3015 } 3016 #ifdef __NetBSD__ 3017 mountlist_iterator_destroy(iter); 3018 #else 3019 mtx_unlock(&mountlist_mtx); 3020 #endif 3021 } 3022 #endif 3023 #endif 3024
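/*
 * Example of the effect of zfsvfs_update_fromname() above (illustrative
 * names only): after the dataset "tank/a" is renamed to "tank/b", a mount
 * whose f_mntfromname was exactly "tank/a" becomes "tank/b", and
 * "tank/a/child" or "tank/a@snap" become "tank/b/child" and "tank/b@snap".
 * A name such as "tank/ab" is left untouched because the character
 * following the old name must be '/' or '@'.
 */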