1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/param.h> 28 #include <sys/systm.h> 29 #include <sys/sysmacros.h> 30 #include <sys/kmem.h> 31 #include <sys/pathname.h> 32 #include <sys/vnode.h> 33 #include <sys/vfs.h> 34 #include <sys/vfs_opreg.h> 35 #include <sys/mntent.h> 36 #include <sys/mount.h> 37 #include <sys/cmn_err.h> 38 #include <sys/zfs_znode.h> 39 #include <sys/zfs_dir.h> 40 #include <sys/zil.h> 41 #include <sys/fs/zfs.h> 42 #include <sys/dmu.h> 43 #include <sys/dsl_prop.h> 44 #include <sys/dsl_dataset.h> 45 #include <sys/dsl_deleg.h> 46 #include <sys/spa.h> 47 #include <sys/zap.h> 48 #include <sys/varargs.h> 49 #include <sys/policy.h> 50 #include <sys/atomic.h> 51 #include <sys/mkdev.h> 52 #include <sys/modctl.h> 53 #include <sys/zfs_ioctl.h> 54 #include <sys/zfs_ctldir.h> 55 #include <sys/zfs_fuid.h> 56 #include <sys/sunddi.h> 57 #include <sys/dnlc.h> 58 #include <sys/dmu_objset.h> 59 #include <sys/spa_boot.h> 60 61 #ifdef __NetBSD__ 62 /* include ddi_name_to_major function is there better place for it ?*/ 
63 #include <sys/ddi.h> 64 #include <sys/systm.h> 65 #endif 66 67 int zfsfstype; 68 vfsops_t *zfs_vfsops = NULL; 69 static major_t zfs_major; 70 static minor_t zfs_minor; 71 static kmutex_t zfs_dev_mtx; 72 73 int zfs_debug_level; 74 kmutex_t zfs_debug_mtx; 75 76 /* XXX NetBSD static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);*/ 77 static int zfs_mount(vfs_t *vfsp, const char *path, void *data, size_t *data_len); 78 static int zfs_umount(vfs_t *vfsp, int fflag); 79 static int zfs_root(vfs_t *vfsp, vnode_t **vpp); 80 static int zfs_statvfs(vfs_t *vfsp, struct statvfs *statp); 81 static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp); 82 static int zfs_vget(vfs_t *vfsp, ino_t ino, vnode_t **vpp); 83 static int zfs_start(vfs_t *vfsp, int flags); 84 static void zfs_freevfs(vfs_t *vfsp); 85 86 void zfs_init(void); 87 void zfs_fini(void); 88 89 90 extern const struct vnodeopv_desc zfs_vnodeop_opv_desc; 91 92 static const struct vnodeopv_desc * const zfs_vnodeop_descs[] = { 93 &zfs_vnodeop_opv_desc, 94 NULL, 95 }; 96 97 static struct vfsops zfs_vfsops_template = { 98 .vfs_name = MOUNT_ZFS, 99 .vfs_min_mount_data = sizeof(struct zfs_args), 100 .vfs_opv_descs = zfs_vnodeop_descs, 101 .vfs_mount = zfs_mount, 102 .vfs_unmount = zfs_umount, 103 .vfs_root = zfs_root, 104 .vfs_statvfs = zfs_statvfs, 105 .vfs_sync = zfs_sync, 106 .vfs_vget = zfs_vget, 107 .vfs_loadvnode = zfs_loadvnode, 108 .vfs_fhtovp = zfs_fhtovp, 109 .vfs_init = zfs_init, 110 .vfs_done = zfs_fini, 111 .vfs_start = zfs_start, 112 .vfs_renamelock_enter = (void*)nullop, 113 .vfs_renamelock_exit = (void*)nullop, 114 .vfs_reinit = (void *)nullop, 115 .vfs_vptofh = (void *)eopnotsupp, 116 .vfs_fhtovp = (void *)eopnotsupp, 117 .vfs_quotactl = (void *)eopnotsupp, 118 .vfs_extattrctl = (void *)eopnotsupp, 119 .vfs_snapshot = (void *)eopnotsupp, 120 .vfs_fsync = (void *)eopnotsupp, 121 }; 122 123 /* 124 * We need to keep a count of active fs's. 
 * This is necessary to prevent our module
 * from being unloaded after a umount -f
 */
static uint32_t zfs_active_fs_count = 0;

/* Mutually-exclusive mount-option pairs: setting one cancels the other. */
static char *noatime_cancel[] = { MNTOPT_ATIME, NULL };
static char *atime_cancel[] = { MNTOPT_NOATIME, NULL };
static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };

/*
 * MO_DEFAULT is not used since the default value is determined
 * by the equivalent property.
 */
static mntopt_t mntopts[] = {
	{ MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL },
	{ MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL },
	{ MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL },
	{ MNTOPT_ATIME, atime_cancel, NULL, 0, NULL }
};

static mntopts_t zfs_mntopts = {
	sizeof (mntopts) / sizeof (mntopt_t),
	mntopts
};

/*
 * Vnode-iterator predicate used by zfs_sync() below: select vnodes whose
 * znode carries a dirty atime that needs to be written back.
 */
static bool
zfs_sync_selector(void *cl, struct vnode *vp)
{
	znode_t *zp;

	/*
	 * Skip the vnode/inode if inaccessible, or if the
	 * atime is clean.
	 */
	zp = VTOZ(vp);
	return zp != NULL && vp->v_type != VNON && zp->z_atime_dirty != 0
	    && !zp->z_unlinked;
}

/*
 * VFS sync entry point.  First pushes out dirty atimes in batch (see the
 * comment below), then commits the filesystem's ZIL -- or waits for the
 * pool's current txg when there is no ZIL -- so dirty data reaches disk.
 * Returns 0 on success; per-vnode errors are skipped, not propagated.
 */
/*ARGSUSED*/
int
zfs_sync(vfs_t *vfsp, int flag, cred_t *cr)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	znode_t *zp;
	vnode_t *vp;
	struct vnode_iterator *marker;
	dmu_tx_t *tx;
	int error;

	error = 0;

	/*
	 * Data integrity is job one.  We don't want a compromised kernel
	 * writing to the storage pool, so we never sync during panic.
	 */
	if (panicstr)
		return (0);

	/*
	 * On NetBSD, we need to push out atime updates.  Solaris does
	 * this during VOP_INACTIVE, but that does not work well with the
	 * BSD VFS, so we do it in batch here.
	 */
	vfs_vnode_iterator_init(vfsp, &marker);
	while ((vp = vfs_vnode_iterator_next(marker, zfs_sync_selector, NULL)))
	{
		/* Failure to lock just skips this vnode; atime stays dirty. */
		error = vn_lock(vp, LK_EXCLUSIVE);
		if (error) {
			vrele(vp);
			continue;
		}
		zp = VTOZ(vp);
		/* One tx per vnode: dirty the bonus buffer holding the atime. */
		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_bonus(tx, zp->z_id);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
		} else {
			dmu_buf_will_dirty(zp->z_dbuf, tx);
			mutex_enter(&zp->z_lock);
			zp->z_atime_dirty = 0;
			mutex_exit(&zp->z_lock);
			dmu_tx_commit(tx);
		}
		vput(vp);
	}
	vfs_vnode_iterator_destroy(marker);

	/*
	 * SYNC_ATTR is used by fsflush() to force old filesystems like UFS
	 * to sync metadata, which they would otherwise cache indefinitely.
	 * Semantically, the only requirement is that the sync be initiated.
	 * The DMU syncs out txgs frequently, so there's nothing to do.
	 *
	 * NOTE(review): the comment refers to Solaris SYNC_ATTR but the
	 * code tests NetBSD's MNT_LAZY -- presumably the intended NetBSD
	 * equivalent; confirm against the NetBSD VFS_SYNC contract.
	 */
	if ((flag & MNT_LAZY) != 0)
		return (0);

	/*
	 * NOTE(review): vfsp was already dereferenced at the top of this
	 * function, so the NULL branch below is effectively unreachable
	 * here; it is kept from the Solaris code where vfsp may be NULL.
	 */
	if (vfsp != NULL) {
		/*
		 * Sync a specific filesystem.
		 * (This inner zfsvfs intentionally shadows the outer one.)
		 */
		zfsvfs_t *zfsvfs = vfsp->vfs_data;
		dsl_pool_t *dp;

		ZFS_ENTER(zfsvfs);
		dp = dmu_objset_pool(zfsvfs->z_os);

		/*
		 * If the system is shutting down, then skip any
		 * filesystems which may exist on a suspended pool.
		 */
		if (sys_shutdown && spa_suspended(dp->dp_spa)) {
			ZFS_EXIT(zfsvfs);
			return (0);
		}

		if (zfsvfs->z_log != NULL)
			zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
		else
			txg_wait_synced(dp, 0);
		ZFS_EXIT(zfsvfs);
	} else {
		/*
		 * Sync all ZFS filesystems.  This is what happens when you
		 * run sync(1M).  Unlike other filesystems, ZFS honors the
		 * request by waiting for all pools to commit all dirty data.
		 */
		spa_sync_allpools();
	}

	return (0);
}

/*
 * Pick a dev_t for a new mount that is not already in use by another
 * mounted filesystem.  On NetBSD only the minor-number scan is used;
 * the Solaris "allocate a new major on exhaustion" path is compiled out.
 * Returns 0 on success, -1 on failure.
 */
static int
zfs_create_unique_device(dev_t *dev)
{
	major_t new_major;

	do {
		ASSERT3U(zfs_minor, <=, MAXMIN);
		minor_t start = zfs_minor;
		do {
			mutex_enter(&zfs_dev_mtx);
			if (zfs_minor >= MAXMIN) {
				/*
				 * If we're still using the real major
				 * keep out of /dev/zfs and /dev/zvol minor
				 * number space.  If we're using a getudev()'ed
				 * major number, we can use all of its minors.
				 */
				if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
					zfs_minor = ZFS_MIN_MINOR;
				else
					zfs_minor = 0;
			} else {
				zfs_minor++;
			}
			*dev = makedevice(zfs_major, zfs_minor);
			mutex_exit(&zfs_dev_mtx);
		} while (vfs_devismounted(*dev) && zfs_minor != start);
		/* NetBSD: accept whatever we found; no major rollover. */
		break;
#ifndef __NetBSD__
		if (zfs_minor == start) {
			/*
			 * We are using all ~262,000 minor numbers for the
			 * current major number.  Create a new major number.
			 */
			if ((new_major = getudev()) == (major_t)-1) {
				cmn_err(CE_WARN,
				    "zfs_mount: Can't get unique major "
				    "device number.");
				return (-1);
			}
			mutex_enter(&zfs_dev_mtx);
			zfs_major = new_major;
			zfs_minor = 0;

			mutex_exit(&zfs_dev_mtx);
		} else {
			break;
		}
		/* CONSTANTCONDITION */
#endif
	} while (1);

	return (0);
}

/*
 * dsl_prop_register() callback: mirror the "atime" property into the
 * zfsvfs flag and the visible mount options.
 */
static void
atime_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	if (newval == TRUE) {
		zfsvfs->z_atime = TRUE;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
	} else {
		zfsvfs->z_atime = FALSE;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
	}
}

/*
 * "xattr" property callback.  Only the mount options are updated on
 * NetBSD; the VFS_XATTR flag manipulation is not yet wired up (TODO).
 */
static void
xattr_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	if (newval == TRUE) {
		/* XXX locking on vfs_flag? */
#ifdef TODO
		zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
#endif
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
	} else {
		/* XXX locking on vfs_flag? */
#ifdef TODO
		zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
#endif
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
	}
}

/*
 * "recordsize" property callback.  Clamp out-of-range or non-power-of-2
 * values to SPA_MAXBLOCKSIZE, then publish the new maximum block size.
 */
static void
blksz_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	if (newval < SPA_MINBLOCKSIZE ||
	    newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
		newval = SPA_MAXBLOCKSIZE;

	zfsvfs->z_max_blksz = newval;
	zfsvfs->z_vfs->vfs_bsize = newval;
}

/* "readonly" property callback: toggle VFS_RDONLY and the ro/rw options. */
static void
readonly_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	if (newval) {
		/* XXX locking on vfs_flag? */
		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
	} else {
		/* XXX locking on vfs_flag? */
		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
	}
}

/* "devices" property callback: toggle VFS_NODEVICES and its options. */
static void
devices_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	if (newval == FALSE) {
		zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0);
	} else {
		zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0);
	}
}

/* "setuid" property callback: toggle VFS_NOSETUID and its options. */
static void
setuid_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	if (newval == FALSE) {
		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
	} else {
		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
	}
}

/* "exec" property callback: toggle VFS_NOEXEC and its options. */
static void
exec_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	if (newval == FALSE) {
		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
	} else {
		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
	}
}

/*
 * The nbmand mount option can be changed at mount time.
 * We can't allow it to be toggled on live file systems or incorrect
 * behavior may be seen from cifs clients
 *
 * This property isn't registered via dsl_prop_register(), but this callback
 * will be called when a file system is first mounted
 */
static void
nbmand_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;
	if (newval == FALSE) {
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
	} else {
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
	}
}

/* "snapdir" property callback: controls .zfs directory visibility. */
static void
snapdir_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	zfsvfs->z_show_ctldir = newval;
}

/* "vscan" property callback. */
static void
vscan_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	zfsvfs->z_vscan = newval;
}

/* "aclmode" property callback. */
static void
acl_mode_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	zfsvfs->z_acl_mode = newval;
}

/* "aclinherit" property callback. */
static void
acl_inherit_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	zfsvfs->z_acl_inherit = newval;
}

/*
 * Register all property-change callbacks for the dataset backing vfsp,
 * then re-apply any temporary ("-o opt") mount options that registration
 * clobbered.  Returns 0 on success or an errno; on failure every callback
 * that may have been registered is unregistered again.
 */
static int
zfs_register_callbacks(vfs_t *vfsp)
{
	struct dsl_dataset *ds = NULL;
	objset_t *os = NULL;
	zfsvfs_t *zfsvfs = NULL;
	uint64_t nbmand;
	int readonly, do_readonly = B_FALSE;
	int setuid, do_setuid = B_FALSE;
	int exec, do_exec = B_FALSE;
	int devices, do_devices = B_FALSE;
	int xattr, do_xattr = B_FALSE;
	int atime, do_atime = B_FALSE;
	int error = 0;

	ASSERT(vfsp);
	zfsvfs = vfsp->vfs_data;
	ASSERT(zfsvfs);
	os = zfsvfs->z_os;

	/*
	 * The act of registering our callbacks will destroy any mount
	 * options we may have.  In order to enable temporary overrides
	 * of mount options, we stash away the current values and
	 * restore them after we register the callbacks.
	 */
	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
		readonly = B_TRUE;
		do_readonly = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
		readonly = B_FALSE;
		do_readonly = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
		/* "nosuid" implies both nodevices and nosetuid. */
		devices = B_FALSE;
		setuid = B_FALSE;
		do_devices = B_TRUE;
		do_setuid = B_TRUE;
	} else {
		if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
			devices = B_FALSE;
			do_devices = B_TRUE;
		} else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) {
			devices = B_TRUE;
			do_devices = B_TRUE;
		}

		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
			setuid = B_FALSE;
			do_setuid = B_TRUE;
		} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
			setuid = B_TRUE;
			do_setuid = B_TRUE;
		}
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
		exec = B_FALSE;
		do_exec = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
		exec = B_TRUE;
		do_exec = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
		xattr = B_FALSE;
		do_xattr = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
		xattr = B_TRUE;
		do_xattr = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
		atime = B_FALSE;
		do_atime = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
		atime = B_TRUE;
		do_atime = B_TRUE;
	}

	/*
	 * nbmand is a special property.  It can only be changed at
	 * mount time.
	 *
	 * This is weird, but it is documented to only be changeable
	 * at mount time.
	 */
	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
		nbmand = B_FALSE;
	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
		nbmand = B_TRUE;
	} else {
		char osname[MAXNAMELEN];

		dmu_objset_name(os, osname);
		/* Intentional Solaris-style assignment-in-condition. */
		if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand,
		    NULL)) {
			return (error);
		}
	}

	/*
	 * Register property callbacks.
	 *
	 * It would probably be fine to just check for i/o error from
	 * the first prop_register(), but I guess I like to go
	 * overboard...
	 */
	ds = dmu_objset_ds(os);
	error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "xattr", xattr_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "recordsize", blksz_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "readonly", readonly_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "devices", devices_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "setuid", setuid_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "exec", exec_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "snapdir", snapdir_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "aclmode", acl_mode_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "aclinherit", acl_inherit_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "vscan", vscan_changed_cb, zfsvfs);
	if (error)
		goto unregister;

	/*
	 * Invoke our callbacks to restore temporary mount options.
	 */
	if (do_readonly)
		readonly_changed_cb(zfsvfs, readonly);
	if (do_setuid)
		setuid_changed_cb(zfsvfs, setuid);
	if (do_exec)
		exec_changed_cb(zfsvfs, exec);
	if (do_devices)
		devices_changed_cb(zfsvfs, devices);
	if (do_xattr)
		xattr_changed_cb(zfsvfs, xattr);
	if (do_atime)
		atime_changed_cb(zfsvfs, atime);

	nbmand_changed_cb(zfsvfs, nbmand);

	return (0);

unregister:
	/*
	 * We may attempt to unregister some callbacks that are not
	 * registered, but this is OK; it will simply return ENOMSG,
	 * which we will ignore.
	 */
	(void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
	    zfsvfs);
	(void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs);
	return (error);

}

/*
 * Apply a space-usage delta to the per-user or per-group accounting ZAP.
 * The entry is keyed by the fuid printed in hex; a zero result removes
 * the entry.  Asserts (debug only) that no under/overflow occurs.
 */
static void
uidacct(objset_t *os, boolean_t isgroup, uint64_t fuid,
    int64_t delta, dmu_tx_t *tx)
{
	uint64_t used = 0;
	char buf[32];
	int err;
	uint64_t obj = isgroup ?
	    DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;

	if (delta == 0)
		return;

	(void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)fuid);
	err = zap_lookup(os, obj, buf, 8, 1, &used);
	ASSERT(err == 0 || err == ENOENT);
	/* no underflow/overflow */
	ASSERT(delta > 0 || used >= -delta);
	ASSERT(delta < 0 || used + delta > used);
	used += delta;
	if (used == 0)
		err = zap_remove(os, obj, buf, tx);
	else
		err = zap_update(os, obj, buf, 8, 1, &used, tx);
	ASSERT(err == 0);
}

/*
 * DMU space-delta callback: extract the owning uid/gid from a znode's
 * bonus buffer.  Only DMU_OT_ZNODE bonus types are handled; anything
 * else returns ENOENT.
 */
static int
zfs_space_delta_cb(dmu_object_type_t bonustype, void *bonus,
    uint64_t *userp, uint64_t *groupp)
{
	znode_phys_t *znp = bonus;

	if (bonustype != DMU_OT_ZNODE)
		return (ENOENT);

	*userp = znp->zp_uid;
	*groupp = znp->zp_gid;
	return (0);
}

/*
 * Split a hex fuid string into its SID domain (copied into domainbuf,
 * empty string if no domain table entry) and rid (*ridp).
 */
static void
fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
    char *domainbuf, int buflen, uid_t *ridp)
{
	uint64_t fuid;
	const char *domain;

	fuid = strtonum(fuidstr, NULL);

	domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
	if (domain)
		(void) strlcpy(domainbuf, domain, buflen);
	else
		domainbuf[0] = '\0';
	*ridp = FUID_RID(fuid);
}

/*
 * Map a userquota property type to its backing ZAP object, or 0 when
 * the object does not exist (e.g. no quota has ever been set).
 */
static uint64_t
zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
{
	switch (type) {
	case ZFS_PROP_USERUSED:
		return (DMU_USERUSED_OBJECT);
	case ZFS_PROP_GROUPUSED:
		return (DMU_GROUPUSED_OBJECT);
	case ZFS_PROP_USERQUOTA:
		return (zfsvfs->z_userquota_obj);
	case ZFS_PROP_GROUPQUOTA:
		return (zfsvfs->z_groupquota_obj);
	}
	/* Unknown property type: treat as "no object". */
	return (0);
}

/*
 * Iterate the requested usage/quota ZAP and fill vbuf with zfs_useracct_t
 * records.  *cookiep is an opaque resume cursor; *bufsizep is the buffer
 * size in and the bytes actually written out.  Returns 0 (possibly with
 * a partial buffer) or an errno; ENOTSUP if userspace accounting is not
 * present on this objset.
 */
int
zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
    uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
{
	int error;
	zap_cursor_t zc;
	zap_attribute_t za;
	zfs_useracct_t *buf = vbuf;
	uint64_t obj;

	if (!dmu_objset_userspace_present(zfsvfs->z_os))
		return (ENOTSUP);

	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
	if (obj == 0) {
		*bufsizep = 0;
		return (0);
	}

	for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
	    (error = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		/* Stop (without error) when the next record would overflow. */
		if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) >
		    *bufsizep)
			break;

		fuidstr_to_sid(zfsvfs, za.za_name,
		    buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);

		buf->zu_space = za.za_first_integer;
		buf++;
	}
	if (error == ENOENT)
		error = 0;

	ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
	*bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
	*cookiep = zap_cursor_serialize(&zc);
	zap_cursor_fini(&zc);
	return (error);
}

/*
 * buf must be big enough (eg, 32 bytes)
 */
static int
id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
    char *buf, size_t buflen, boolean_t addok)
{
	uint64_t fuid;
	int domainid = 0;

	if (domain && domain[0]) {
		domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
		if (domainid == -1)
			return (ENOENT);
	}
	fuid = FUID_ENCODE(domainid, rid);
	(void) snprintf(buf, buflen, "%llx", (longlong_t)fuid);
	return (0);
}

/*
 * Look up a single usage or quota value for (domain, rid) into *valp.
 * A missing ZAP object or entry yields *valp == 0 and success.
 */
int
zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
    const char *domain, uint64_t rid, uint64_t *valp)
{
	char buf[32];
	int err;
	uint64_t obj;

	*valp = 0;

	if (!dmu_objset_userspace_present(zfsvfs->z_os))
		return (ENOTSUP);

	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
	if (obj == 0)
		return (0);

	err = id_to_fuidstr(zfsvfs, domain, rid, buf, sizeof(buf), FALSE);
	if (err)
		return (err);

	err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
	if (err == ENOENT)
		err = 0;
	return (err);
}

/*
 * Set (or, with quota == 0, clear) a user/group quota.  Creates the
 * quota ZAP object on first use and syncs dirty fuid tables in the same
 * transaction.  Returns 0 or an errno (EINVAL for a non-quota property,
 * ENOTSUP if the ZPL version predates userspace accounting).
 */
int
zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
    const char *domain, uint64_t rid, uint64_t quota)
{
	char buf[32];
	int err;
	dmu_tx_t *tx;
	uint64_t *objp;
	boolean_t fuid_dirtied;

	if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA)
		return (EINVAL);

	if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
		return (ENOTSUP);

	objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj :
	    &zfsvfs->z_groupquota_obj;

	err = id_to_fuidstr(zfsvfs, domain, rid, buf, sizeof(buf), B_TRUE);
	if (err)
		return (err);
	fuid_dirtied = zfsvfs->z_fuid_dirty;

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
	if (*objp == 0) {
		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
		    zfs_userquota_prop_prefixes[type]);
	}
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		return (err);
	}

	mutex_enter(&zfsvfs->z_lock);
	if (*objp == 0) {
		/* First quota for this type: create and register the ZAP. */
		*objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
		    DMU_OT_NONE, 0, tx);
		VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
		    zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
	}
	mutex_exit(&zfsvfs->z_lock);

	if (quota == 0) {
		err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
		if (err == ENOENT)
			err = 0;
	} else {
		err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
	}
	ASSERT(err == 0);
	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);
	dmu_tx_commit(tx);
	return (err);
}

/*
 * Return B_TRUE when the given user/group fuid is at or over its quota.
 * Always B_FALSE when no quota object exists or during log replay.
 */
boolean_t
zfs_usergroup_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
{
	char buf[32];
	uint64_t used, quota, usedobj, quotaobj;
	int err;

	usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
	quotaobj = isgroup ?
	    zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;

	if (quotaobj == 0 || zfsvfs->z_replay)
		return (B_FALSE);

	(void) snprintf(buf, sizeof(buf), "%llx", (longlong_t)fuid);
	err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
	if (err != 0)
		return (B_FALSE);

	err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
	if (err != 0)
		return (B_FALSE);
	return (used >= quota);
}

/*
 * Allocate and initialize a zfsvfs_t for the objset named osname: own
 * the objset, validate the ZPL version, load normalization/case/quota
 * state from the master node, and set up locks and lists.  On success
 * *zfvp holds the new structure; on failure the objset is disowned,
 * everything is freed and *zfvp is NULL.
 */
int
zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
{
	objset_t *os;
	zfsvfs_t *zfsvfs;
	uint64_t zval;
	int i, error;

	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);

	/*
	 * We claim to always be readonly so we can open snapshots;
	 * other ZPL code will prevent us from writing to snapshots.
	 */
	error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
	if (error) {
		kmem_free(zfsvfs, sizeof (zfsvfs_t));
		return (error);
	}

	/*
	 * Initialize the zfs-specific filesystem structure.
	 * Should probably make this a kmem cache, shuffle fields,
	 * and just bzero up to z_hold_mtx[].
	 */
	zfsvfs->z_vfs = NULL;
	zfsvfs->z_parent = zfsvfs;
	zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
	zfsvfs->z_os = os;

	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
	if (error) {
		goto out;
	} else if (zfsvfs->z_version > ZPL_VERSION) {
		(void) printf("Mismatched versions:  File system "
		    "is version %llu on-disk format, which is "
		    "incompatible with this software version %lld!",
		    (u_longlong_t)zfsvfs->z_version, ZPL_VERSION);
		error = ENOTSUP;
		goto out;
	}

	if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0)
		goto out;
	zfsvfs->z_norm = (int)zval;

	if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0)
		goto out;
	zfsvfs->z_utf8 = (zval != 0);

	if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0)
		goto out;
	zfsvfs->z_case = (uint_t)zval;

	/*
	 * Fold case on file systems that are always or sometimes case
	 * insensitive.
	 */
	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
	    zfsvfs->z_case == ZFS_CASE_MIXED)
		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;

	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);

	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
	    &zfsvfs->z_root);
	if (error)
		goto out;
	ASSERT(zfsvfs->z_root != 0);

	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
	    &zfsvfs->z_unlinkedobj);
	if (error)
		goto out;

	/* The following master-node entries are all optional (ENOENT ok). */
	error = zap_lookup(os, MASTER_NODE_OBJ,
	    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
	    8, 1, &zfsvfs->z_userquota_obj);
	if (error && error != ENOENT)
		goto out;

	error = zap_lookup(os, MASTER_NODE_OBJ,
	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
	    8, 1, &zfsvfs->z_groupquota_obj);
	if (error && error != ENOENT)
		goto out;

	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
	    &zfsvfs->z_fuid_obj);
	if (error && error != ENOENT)
		goto out;

	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
	    &zfsvfs->z_shares_dir);
	if (error && error != ENOENT)
		goto out;

	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
	    offsetof(znode_t, z_link_node));
	rrw_init(&zfsvfs->z_teardown_lock);
	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);

	*zfvp = zfsvfs;
	return (0);

out:
	dmu_objset_disown(os, zfsvfs);
	*zfvp = NULL;
	kmem_free(zfsvfs, sizeof (zfsvfs_t));
	return (error);
}

/*
 * Second-stage setup: register property callbacks, publish the zfsvfs
 * as the objset user pointer, open (or disable) the ZIL and -- when
 * mounting, as opposed to an online recv -- drain the unlinked set and
 * replay the intent log.  Returns 0 or an errno.
 */
static int
zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
{
	int error;

	error = zfs_register_callbacks(zfsvfs->z_vfs);
	if (error)
		return (error);

	/*
	 * Set the objset user_ptr to track its zfsvfs.
	 */
	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);

	zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
	if (zil_disable) {
		zil_destroy(zfsvfs->z_log, B_FALSE);
		zfsvfs->z_log = NULL;
	}

	/*
	 * If we are not mounting (ie: online recv), then we don't
	 * have to worry about replaying the log as we blocked all
	 * operations out since we closed the ZIL.
	 */
	if (mounting) {
		boolean_t readonly;

		/*
		 * During replay we remove the read only flag to
		 * allow replays to succeed.
		 */
		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
		if (readonly != 0)
			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
		else
			zfs_unlinked_drain(zfsvfs);

		if (zfsvfs->z_log) {
			/*
			 * Parse and replay the intent log.
			 *
			 * Because of ziltest, this must be done after
			 * zfs_unlinked_drain().  (Further note: ziltest
			 * doesn't use readonly mounts, where
			 * zfs_unlinked_drain() isn't called.)  This is because
			 * ziltest causes spa_sync() to think it's committed,
			 * but actually it is not, so the intent log contains
			 * many txg's worth of changes.
			 *
			 * In particular, if object N is in the unlinked set in
			 * the last txg to actually sync, then it could be
			 * actually freed in a later txg and then reallocated
			 * in a yet later txg.  This would write a "create
			 * object N" record to the intent log.  Normally, this
			 * would be fine because the spa_sync() would have
			 * written out the fact that object N is free, before
			 * we could write the "create object N" intent log
			 * record.
			 *
			 * But when we are in ziltest mode, we advance the "open
			 * txg" without actually spa_sync()-ing the changes to
			 * disk.  So we would see that object N is still
			 * allocated and in the unlinked set, and there is an
			 * intent log record saying to allocate it.
			 */
			zfsvfs->z_replay = B_TRUE;
			zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector);
			zfsvfs->z_replay = B_FALSE;
		}
		zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
	}

	return (0);
}

/*
 * Tear down and free a zfsvfs_t allocated by zfsvfs_create().  The
 * caller must already have disowned the objset.
 */
void
zfsvfs_free(zfsvfs_t *zfsvfs)
{
	int i;
	extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */

	/*
	 * This is a barrier to prevent the filesystem from going away in
	 * zfs_znode_move() until we can safely ensure that the filesystem is
	 * not unmounted. We consider the filesystem valid before the barrier
	 * and invalid after the barrier.
	 */
	rw_enter(&zfsvfs_lock, RW_READER);
	rw_exit(&zfsvfs_lock);

	zfs_fuid_destroy(zfsvfs);
	mutex_destroy(&zfsvfs->z_znodes_lock);
	mutex_destroy(&zfsvfs->z_lock);
	list_destroy(&zfsvfs->z_all_znodes);
	rrw_destroy(&zfsvfs->z_teardown_lock);
	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
	rw_destroy(&zfsvfs->z_fuid_lock);
	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
	kmem_free(zfsvfs, sizeof (zfsvfs_t));
}

/*
 * Recompute z_use_fuids and, when fuids are in use and a vfs is attached,
 * advertise the corresponding VFS feature flags.
 */
static void
zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
{
	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
	if (zfsvfs->z_use_fuids && zfsvfs->z_vfs) {
		vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
		vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
		vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
		vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
		vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
		vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
	}
}

static int
zfs_domount(vfs_t *vfsp, char *osname) 1144 { 1145 dev_t mount_dev; 1146 uint64_t recordsize, fsid_guid; 1147 int error = 0; 1148 zfsvfs_t *zfsvfs; 1149 1150 ASSERT(vfsp); 1151 ASSERT(osname); 1152 1153 error = zfsvfs_create(osname, &zfsvfs); 1154 if (error) 1155 return (error); 1156 zfsvfs->z_vfs = vfsp; 1157 zfsvfs->z_parent = zfsvfs; 1158 zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; 1159 zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; 1160 1161 /* Initialize the generic filesystem structure. */ 1162 vfsp->vfs_data = NULL; 1163 1164 if (zfs_create_unique_device(&mount_dev) == -1) { 1165 error = ENODEV; 1166 goto out; 1167 } 1168 ASSERT(vfs_devismounted(mount_dev) == 0); 1169 1170 if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, 1171 NULL)) 1172 goto out; 1173 1174 vfsp->vfs_bsize = DEV_BSIZE; 1175 vfsp->vfs_flag |= VFS_NOTRUNC; 1176 vfsp->vfs_data = zfsvfs; 1177 1178 /* 1179 * The fsid is 64 bits, composed of an 8-bit fs type, which 1180 * separates our fsid from any other filesystem types, and a 1181 * 56-bit objset unique ID. The objset unique ID is unique to 1182 * all objsets open on this system, provided by unique_create(). 1183 * The 8-bit fs type must be put in the low bits of fsid[1] 1184 * because that's where other Solaris filesystems put it. 1185 */ 1186 fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); 1187 ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); 1188 vfsp->mnt_stat.f_fsidx.__fsid_val[0] = fsid_guid; 1189 vfsp->mnt_stat.f_fsidx.__fsid_val[1] = ((fsid_guid>>32) << 8) | 1190 zfsfstype & 0xFF; 1191 1192 dprintf("zfs_domount vrele after vfsp->vfs_count %d\n", vfsp->vfs_count); 1193 /* 1194 * Set features for file system. 
1195 */ 1196 zfs_set_fuid_feature(zfsvfs); 1197 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { 1198 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); 1199 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); 1200 vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); 1201 } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { 1202 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); 1203 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); 1204 } 1205 vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); 1206 1207 if (dmu_objset_is_snapshot(zfsvfs->z_os)) { 1208 uint64_t pval; 1209 1210 atime_changed_cb(zfsvfs, B_FALSE); 1211 readonly_changed_cb(zfsvfs, B_TRUE); 1212 if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL)) 1213 goto out; 1214 xattr_changed_cb(zfsvfs, pval); 1215 zfsvfs->z_issnap = B_TRUE; 1216 1217 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); 1218 dmu_objset_set_user(zfsvfs->z_os, zfsvfs); 1219 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); 1220 } else { 1221 error = zfsvfs_setup(zfsvfs, B_TRUE); 1222 } 1223 1224 dprintf("zfs_vfsops.c zfs_domount called\n"); 1225 dprintf("vfsp->vfs_count %d\n", vfsp->vfs_count); 1226 1227 if (!zfsvfs->z_issnap) 1228 zfsctl_create(zfsvfs); 1229 out: 1230 if (error) { 1231 dmu_objset_disown(zfsvfs->z_os, zfsvfs); 1232 zfsvfs_free(zfsvfs); 1233 } else { 1234 atomic_add_32(&zfs_active_fs_count, 1); 1235 } 1236 return (error); 1237 } 1238 1239 void 1240 zfs_unregister_callbacks(zfsvfs_t *zfsvfs) 1241 { 1242 objset_t *os = zfsvfs->z_os; 1243 struct dsl_dataset *ds; 1244 1245 /* 1246 * Unregister properties. 
1247 */ 1248 if (!dmu_objset_is_snapshot(os)) { 1249 ds = dmu_objset_ds(os); 1250 VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb, 1251 zfsvfs) == 0); 1252 1253 VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb, 1254 zfsvfs) == 0); 1255 1256 VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, 1257 zfsvfs) == 0); 1258 1259 VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb, 1260 zfsvfs) == 0); 1261 1262 VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb, 1263 zfsvfs) == 0); 1264 1265 VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb, 1266 zfsvfs) == 0); 1267 1268 VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb, 1269 zfsvfs) == 0); 1270 1271 VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, 1272 zfsvfs) == 0); 1273 1274 VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, 1275 zfsvfs) == 0); 1276 1277 VERIFY(dsl_prop_unregister(ds, "aclinherit", 1278 acl_inherit_changed_cb, zfsvfs) == 0); 1279 1280 VERIFY(dsl_prop_unregister(ds, "vscan", 1281 vscan_changed_cb, zfsvfs) == 0); 1282 } 1283 } 1284 1285 /* 1286 * Convert a decimal digit string to a uint64_t integer. 1287 */ 1288 static int 1289 str_to_uint64(char *str, uint64_t *objnum) 1290 { 1291 uint64_t num = 0; 1292 1293 while (*str) { 1294 if (*str < '0' || *str > '9') 1295 return (EINVAL); 1296 1297 num = num*10 + *str++ - '0'; 1298 } 1299 1300 *objnum = num; 1301 return (0); 1302 } 1303 1304 /* 1305 * The boot path passed from the boot loader is in the form of 1306 * "rootpool-name/root-filesystem-object-number'. Convert this 1307 * string to a dataset name: "rootpool-name/root-filesystem-name". 
1308 */ 1309 static int 1310 zfs_parse_bootfs(char *bpath, char *outpath) 1311 { 1312 char *slashp; 1313 uint64_t objnum; 1314 int error; 1315 1316 if (*bpath == 0 || *bpath == '/') 1317 return (EINVAL); 1318 1319 (void) strcpy(outpath, bpath); 1320 1321 slashp = strchr(bpath, '/'); 1322 1323 /* if no '/', just return the pool name */ 1324 if (slashp == NULL) { 1325 return (0); 1326 } 1327 1328 /* if not a number, just return the root dataset name */ 1329 if (str_to_uint64(slashp+1, &objnum)) { 1330 return (0); 1331 } 1332 1333 *slashp = '\0'; 1334 error = dsl_dsobj_to_dsname(bpath, objnum, outpath); 1335 *slashp = '/'; 1336 1337 return (error); 1338 } 1339 1340 1341 /* 1342 * zfs_check_global_label: 1343 * Check that the hex label string is appropriate for the dataset 1344 * being mounted into the global_zone proper. 1345 * 1346 * Return an error if the hex label string is not default or 1347 * admin_low/admin_high. For admin_low labels, the corresponding 1348 * dataset must be readonly. 1349 */ 1350 int 1351 zfs_check_global_label(const char *dsname, const char *hexsl) 1352 { 1353 #ifdef PORT_SOLARIS 1354 if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) 1355 return (0); 1356 if (strcasecmp(hexsl, ADMIN_HIGH) == 0) 1357 return (0); 1358 if (strcasecmp(hexsl, ADMIN_LOW) == 0) { 1359 /* must be readonly */ 1360 uint64_t rdonly; 1361 1362 if (dsl_prop_get_integer(dsname, 1363 zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) 1364 return (EACCES); 1365 return (rdonly ? 0 : EACCES); 1366 } 1367 return (EACCES); 1368 #else 1369 return 0; 1370 #endif 1371 } 1372 1373 /* 1374 * zfs_mount_label_policy: 1375 * Determine whether the mount is allowed according to MAC check. 1376 * by comparing (where appropriate) label of the dataset against 1377 * the label of the zone being mounted into. If the dataset has 1378 * no label, create one. 
1379 * 1380 * Returns: 1381 * 0 : access allowed 1382 * >0 : error code, such as EACCES 1383 */ 1384 static int 1385 zfs_mount_label_policy(vfs_t *vfsp, char *osname) 1386 { 1387 #ifdef PORT_SOLARIS 1388 int error, retv; 1389 zone_t *mntzone = NULL; 1390 ts_label_t *mnt_tsl; 1391 bslabel_t *mnt_sl; 1392 bslabel_t ds_sl; 1393 char ds_hexsl[MAXNAMELEN]; 1394 1395 retv = EACCES; /* assume the worst */ 1396 1397 /* 1398 * Start by getting the dataset label if it exists. 1399 */ 1400 error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1401 1, sizeof (ds_hexsl), &ds_hexsl, NULL); 1402 if (error) 1403 return (EACCES); 1404 1405 /* 1406 * If labeling is NOT enabled, then disallow the mount of datasets 1407 * which have a non-default label already. No other label checks 1408 * are needed. 1409 */ 1410 if (!is_system_labeled()) { 1411 if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) 1412 return (0); 1413 return (EACCES); 1414 } 1415 1416 /* 1417 * Get the label of the mountpoint. If mounting into the global 1418 * zone (i.e. mountpoint is not within an active zone and the 1419 * zoned property is off), the label must be default or 1420 * admin_low/admin_high only; no other checks are needed. 1421 */ 1422 mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE); 1423 if (mntzone->zone_id == GLOBAL_ZONEID) { 1424 uint64_t zoned; 1425 1426 zone_rele(mntzone); 1427 1428 if (dsl_prop_get_integer(osname, 1429 zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) 1430 return (EACCES); 1431 if (!zoned) 1432 return (zfs_check_global_label(osname, ds_hexsl)); 1433 else 1434 /* 1435 * This is the case of a zone dataset being mounted 1436 * initially, before the zone has been fully created; 1437 * allow this mount into global zone. 
1438 */ 1439 return (0); 1440 } 1441 1442 mnt_tsl = mntzone->zone_slabel; 1443 ASSERT(mnt_tsl != NULL); 1444 label_hold(mnt_tsl); 1445 mnt_sl = label2bslabel(mnt_tsl); 1446 1447 if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) { 1448 /* 1449 * The dataset doesn't have a real label, so fabricate one. 1450 */ 1451 char *str = NULL; 1452 1453 if (l_to_str_internal(mnt_sl, &str) == 0 && 1454 dsl_prop_set(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1455 ZPROP_SRC_LOCAL, 1, strlen(str) + 1, str) == 0) 1456 retv = 0; 1457 if (str != NULL) 1458 kmem_free(str, strlen(str) + 1); 1459 } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) { 1460 /* 1461 * Now compare labels to complete the MAC check. If the 1462 * labels are equal then allow access. If the mountpoint 1463 * label dominates the dataset label, allow readonly access. 1464 * Otherwise, access is denied. 1465 */ 1466 if (blequal(mnt_sl, &ds_sl)) 1467 retv = 0; 1468 else if (bldominates(mnt_sl, &ds_sl)) { 1469 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); 1470 retv = 0; 1471 } 1472 } 1473 1474 label_rele(mnt_tsl); 1475 zone_rele(mntzone); 1476 return (retv); 1477 #else /* PORT_SOLARIS */ 1478 return (0); 1479 #endif 1480 } 1481 1482 #ifndef __NetBSD__ 1483 static int 1484 zfs_mountroot(vfs_t *vfsp, enum whymountroot why) 1485 { 1486 int error = 0; 1487 static int zfsrootdone = 0; 1488 zfsvfs_t *zfsvfs = NULL; 1489 znode_t *zp = NULL; 1490 vnode_t *vp = NULL; 1491 char *zfs_bootfs; 1492 char *zfs_devid; 1493 1494 ASSERT(vfsp); 1495 1496 /* 1497 * The filesystem that we mount as root is defined in the 1498 * boot property "zfs-bootfs" with a format of 1499 * "poolname/root-dataset-objnum". 1500 */ 1501 if (why == ROOT_INIT) { 1502 if (zfsrootdone++) 1503 return (EBUSY); 1504 /* 1505 * the process of doing a spa_load will require the 1506 * clock to be set before we could (for example) do 1507 * something better by looking at the timestamp on 1508 * an uberblock, so just set it to -1. 
1509 */ 1510 clkset(-1); 1511 1512 if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) { 1513 cmn_err(CE_NOTE, "spa_get_bootfs: can not get " 1514 "bootfs name"); 1515 return (EINVAL); 1516 } 1517 zfs_devid = spa_get_bootprop("diskdevid"); 1518 error = spa_import_rootpool(rootfs.bo_name, zfs_devid); 1519 if (zfs_devid) 1520 spa_free_bootprop(zfs_devid); 1521 if (error) { 1522 spa_free_bootprop(zfs_bootfs); 1523 cmn_err(CE_NOTE, "spa_import_rootpool: error %d", 1524 error); 1525 return (error); 1526 } 1527 if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) { 1528 spa_free_bootprop(zfs_bootfs); 1529 cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d", 1530 error); 1531 return (error); 1532 } 1533 1534 spa_free_bootprop(zfs_bootfs); 1535 1536 if (error = vfs_lock(vfsp)) 1537 return (error); 1538 1539 if (error = zfs_domount(vfsp, rootfs.bo_name)) { 1540 cmn_err(CE_NOTE, "zfs_domount: error %d", error); 1541 goto out; 1542 } 1543 1544 zfsvfs = (zfsvfs_t *)vfsp->vfs_data; 1545 ASSERT(zfsvfs); 1546 if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) { 1547 cmn_err(CE_NOTE, "zfs_zget: error %d", error); 1548 goto out; 1549 } 1550 1551 vp = ZTOV(zp); 1552 mutex_enter(&vp->v_lock); 1553 vp->v_flag |= VROOT; 1554 mutex_exit(&vp->v_lock); 1555 rootvp = vp; 1556 1557 /* 1558 * Leave rootvp held. The root file system is never unmounted. 1559 */ 1560 1561 vfs_add((struct vnode *)0, vfsp, 1562 (vfsp->vfs_flag & VFS_RDONLY) ? 
MS_RDONLY : 0); 1563 out: 1564 vfs_unlock(vfsp); 1565 return (error); 1566 } else if (why == ROOT_REMOUNT) { 1567 readonly_changed_cb(vfsp->vfs_data, B_FALSE); 1568 vfsp->vfs_flag |= VFS_REMOUNT; 1569 1570 /* refresh mount options */ 1571 zfs_unregister_callbacks(vfsp->vfs_data); 1572 return (zfs_register_callbacks(vfsp)); 1573 1574 } else if (why == ROOT_UNMOUNT) { 1575 zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data); 1576 (void) zfs_sync(vfsp, 0, 0); 1577 return (0); 1578 } 1579 1580 /* 1581 * if "why" is equal to anything else other than ROOT_INIT, 1582 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it. 1583 */ 1584 return (ENOTSUP); 1585 } 1586 #endif /*__NetBSD__ */ 1587 1588 /*ARGSUSED*/ 1589 static int 1590 zfs_mount(vfs_t *vfsp, const char *path, void *data, size_t *data_len) 1591 { 1592 char *osname; 1593 pathname_t spn; 1594 vnode_t *mvp = vfsp->mnt_vnodecovered; 1595 struct mounta *uap = data; 1596 int error = 0; 1597 int canwrite; 1598 cred_t *cr; 1599 1600 crget(cr); 1601 dprintf("zfs_vfsops.c zfs_mount called\n"); 1602 dprintf("vfsp->vfs_count %d\n", vfsp->vfs_count); 1603 if (mvp->v_type != VDIR) 1604 return (ENOTDIR); 1605 1606 if (uap == NULL) 1607 return (EINVAL); 1608 1609 mutex_enter(mvp->v_interlock); 1610 if ((uap->flags & MS_REMOUNT) == 0 && 1611 (uap->flags & MS_OVERLAY) == 0 && 1612 (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { 1613 mutex_exit(mvp->v_interlock); 1614 return (EBUSY); 1615 } 1616 mutex_exit(mvp->v_interlock); 1617 1618 /* 1619 * ZFS does not support passing unparsed data in via MS_DATA. 1620 * Users should use the MS_OPTIONSTR interface; this means 1621 * that all option parsing is already done and the options struct 1622 * can be interrogated. 1623 */ 1624 if ((uap->flags & MS_DATA) && uap->datalen > 0) 1625 return (EINVAL); 1626 1627 osname = PNBUF_GET(); 1628 1629 strlcpy(osname, uap->fspec, strlen(uap->fspec) + 1); 1630 1631 /* 1632 * Check for mount privilege? 
1633 * 1634 * If we don't have privilege then see if 1635 * we have local permission to allow it 1636 */ 1637 error = secpolicy_fs_mount(cr, mvp, vfsp); 1638 if (error) { 1639 error = dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr); 1640 if (error == 0) { 1641 vattr_t vattr; 1642 1643 /* 1644 * Make sure user is the owner of the mount point 1645 * or has sufficient privileges. 1646 */ 1647 1648 vattr.va_mask = AT_UID; 1649 1650 if (error = VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) { 1651 goto out; 1652 } 1653 1654 if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 && 1655 VOP_ACCESS(mvp, VWRITE, cr) != 0) { 1656 error = EPERM; 1657 goto out; 1658 } 1659 1660 /* XXX NetBSD secpolicy_fs_mount_clearopts(cr, vfsp);*/ 1661 } else { 1662 goto out; 1663 } 1664 } 1665 1666 /* 1667 * Refuse to mount a filesystem if we are in a local zone and the 1668 * dataset is not visible. 1669 */ 1670 if (!INGLOBALZONE(curproc) && 1671 (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { 1672 error = EPERM; 1673 goto out; 1674 } 1675 1676 error = zfs_mount_label_policy(vfsp, osname); 1677 if (error) 1678 goto out; 1679 1680 /* 1681 * When doing a remount, we simply refresh our temporary properties 1682 * according to those options set in the current VFS options. 1683 */ 1684 if (uap->flags & MS_REMOUNT) { 1685 /* refresh mount options */ 1686 zfs_unregister_callbacks(vfsp->vfs_data); 1687 error = zfs_register_callbacks(vfsp); 1688 goto out; 1689 } 1690 1691 /* Mark ZFS as MP SAFE */ 1692 vfsp->mnt_iflag |= IMNT_MPSAFE; 1693 1694 error = zfs_domount(vfsp, osname); 1695 1696 vfs_getnewfsid(vfsp); 1697 1698 /* setup zfs mount info */ 1699 strlcpy(vfsp->mnt_stat.f_mntfromname, osname, 1700 sizeof(vfsp->mnt_stat.f_mntfromname)); 1701 set_statvfs_info(path, UIO_USERSPACE, vfsp->mnt_stat.f_mntfromname, 1702 UIO_SYSSPACE, vfsp->mnt_op->vfs_name, vfsp, curlwp); 1703 1704 /* 1705 * Add an extra VFS_HOLD on our parent vfs so that it can't 1706 * disappear due to a forced unmount. 
1707 */ 1708 if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap) 1709 VFS_HOLD(mvp->v_vfsp); 1710 1711 out: 1712 PNBUF_PUT(osname); 1713 return (error); 1714 } 1715 1716 static int 1717 zfs_statvfs(vfs_t *vfsp, struct statvfs *statp) 1718 { 1719 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1720 dev_t dev; 1721 uint64_t refdbytes, availbytes, usedobjs, availobjs; 1722 1723 ZFS_ENTER(zfsvfs); 1724 1725 dmu_objset_space(zfsvfs->z_os, 1726 &refdbytes, &availbytes, &usedobjs, &availobjs); 1727 1728 /* 1729 * The underlying storage pool actually uses multiple block sizes. 1730 * We report the fragsize as the smallest block size we support, 1731 * and we report our blocksize as the filesystem's maximum blocksize. 1732 */ 1733 statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT; 1734 statp->f_bsize = zfsvfs->z_max_blksz; 1735 1736 /* 1737 * The following report "total" blocks of various kinds in the 1738 * file system, but reported in terms of f_frsize - the 1739 * "fragment" size. 1740 */ 1741 1742 statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; 1743 statp->f_bfree = availbytes >> SPA_MINBLOCKSHIFT; 1744 statp->f_bavail = statp->f_bfree; /* no root reservation */ 1745 1746 /* 1747 * statvfs() should really be called statufs(), because it assumes 1748 * static metadata. ZFS doesn't preallocate files, so the best 1749 * we can do is report the max that could possibly fit in f_files, 1750 * and that minus the number actually used in f_ffree. 1751 * For f_ffree, report the smaller of the number of object available 1752 * and the number of blocks (each object will take at least a block). 1753 */ 1754 statp->f_ffree = MIN(availobjs, statp->f_bfree); 1755 statp->f_favail = statp->f_ffree; /* no "root reservation" */ 1756 statp->f_files = statp->f_ffree + usedobjs; 1757 1758 statp->f_fsid = vfsp->mnt_stat.f_fsidx.__fsid_val[0]; 1759 1760 /* 1761 * We're a zfs filesystem. 
1762 */ 1763 (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename)); 1764 (void) strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, 1765 sizeof(statp->f_mntfromname)); 1766 (void) strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, 1767 sizeof(statp->f_mntonname)); 1768 1769 statp->f_namemax = ZFS_MAXNAMELEN; 1770 1771 /* 1772 * We have all of 32 characters to stuff a string here. 1773 * Is there anything useful we could/should provide? 1774 */ 1775 #ifndef __NetBSD__ 1776 bzero(statp->f_fstr, sizeof (statp->f_fstr)); 1777 #endif 1778 ZFS_EXIT(zfsvfs); 1779 return (0); 1780 } 1781 1782 static int 1783 zfs_root(vfs_t *vfsp, vnode_t **vpp) 1784 { 1785 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1786 znode_t *rootzp; 1787 int error; 1788 1789 ZFS_ENTER(zfsvfs); 1790 dprintf("zfs_root called\n"); 1791 error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); 1792 if (error == 0) 1793 *vpp = ZTOV(rootzp); 1794 dprintf("vpp -> %d, error %d -- %p\n", (*vpp)->v_type, error, *vpp); 1795 ZFS_EXIT(zfsvfs); 1796 if (error == 0) 1797 vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); 1798 KASSERT((error != 0) || (*vpp != NULL)); 1799 KASSERT((error != 0) || (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE)); 1800 return (error); 1801 } 1802 1803 /* 1804 * Teardown the zfsvfs::z_os. 1805 * 1806 * Note, if 'unmounting' if FALSE, we return with the 'z_teardown_lock' 1807 * and 'z_teardown_inactive_lock' held. 1808 */ 1809 static int 1810 zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) 1811 { 1812 znode_t *zp; 1813 1814 rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); 1815 1816 if (!unmounting) { 1817 /* 1818 * We purge the parent filesystem's vfsp as the parent 1819 * filesystem and all of its snapshots have their vnode's 1820 * v_vfsp set to the parent's filesystem's vfsp. Note, 1821 * 'z_parent' is self referential for non-snapshots. 1822 */ 1823 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); 1824 } 1825 1826 /* 1827 * Close the zil. 
NB: Can't close the zil while zfs_inactive 1828 * threads are blocked as zil_close can call zfs_inactive. 1829 */ 1830 if (zfsvfs->z_log) { 1831 zil_close(zfsvfs->z_log); 1832 zfsvfs->z_log = NULL; 1833 } 1834 1835 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); 1836 1837 /* 1838 * If we are not unmounting (ie: online recv) and someone already 1839 * unmounted this file system while we were doing the switcheroo, 1840 * or a reopen of z_os failed then just bail out now. 1841 */ 1842 if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { 1843 rw_exit(&zfsvfs->z_teardown_inactive_lock); 1844 rrw_exit(&zfsvfs->z_teardown_lock, FTAG); 1845 return (EIO); 1846 } 1847 1848 /* 1849 * At this point there are no vops active, and any new vops will 1850 * fail with EIO since we have z_teardown_lock for writer (only 1851 * relavent for forced unmount). 1852 * 1853 * Release all holds on dbufs. 1854 */ 1855 mutex_enter(&zfsvfs->z_znodes_lock); 1856 for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; 1857 zp = list_next(&zfsvfs->z_all_znodes, zp)) 1858 if (zp->z_dbuf) { 1859 ASSERT(ZTOV(zp)->v_count > 0); 1860 zfs_znode_dmu_fini(zp); 1861 } 1862 mutex_exit(&zfsvfs->z_znodes_lock); 1863 1864 /* 1865 * If we are unmounting, set the unmounted flag and let new vops 1866 * unblock. zfs_inactive will have the unmounted behavior, and all 1867 * other vops will fail with EIO. 1868 */ 1869 if (unmounting) { 1870 zfsvfs->z_unmounted = B_TRUE; 1871 rrw_exit(&zfsvfs->z_teardown_lock, FTAG); 1872 rw_exit(&zfsvfs->z_teardown_inactive_lock); 1873 } 1874 1875 /* 1876 * z_os will be NULL if there was an error in attempting to reopen 1877 * zfsvfs, so just return as the properties had already been 1878 * unregistered and cached data had been evicted before. 1879 */ 1880 if (zfsvfs->z_os == NULL) 1881 return (0); 1882 1883 /* 1884 * Unregister properties. 
1885 */ 1886 zfs_unregister_callbacks(zfsvfs); 1887 1888 /* 1889 * Evict cached data 1890 */ 1891 if (dmu_objset_evict_dbufs(zfsvfs->z_os)) { 1892 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); 1893 (void) dmu_objset_evict_dbufs(zfsvfs->z_os); 1894 } 1895 1896 return (0); 1897 } 1898 1899 /*ARGSUSED*/ 1900 static int 1901 zfs_umount(vfs_t *vfsp, int fflag) 1902 { 1903 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1904 objset_t *os; 1905 int ret, flags = 0; 1906 cred_t *cr; 1907 1908 vnode_t *vpp; 1909 int counter; 1910 1911 counter = 0; 1912 1913 dprintf("ZFS_UMOUNT called\n"); 1914 1915 /*TAILQ_FOREACH(vpp, &vfsp->mnt_vnodelist, v_mntvnodes) { 1916 printf("vnode list vnode number %d -- vnode address %p\n", counter, vpp); 1917 vprint("ZFS vfsp vnode list", vpp); 1918 counter++; 1919 } */ 1920 1921 crget(cr); 1922 #ifdef TODO 1923 ret = secpolicy_fs_unmount(cr, vfsp); 1924 if (ret) { 1925 ret = dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource), 1926 ZFS_DELEG_PERM_MOUNT, cr); 1927 if (ret) 1928 return (ret); 1929 } 1930 #endif 1931 /* 1932 * We purge the parent filesystem's vfsp as the parent filesystem 1933 * and all of its snapshots have their vnode's v_vfsp set to the 1934 * parent's filesystem's vfsp. Note, 'z_parent' is self 1935 * referential for non-snapshots. 1936 */ 1937 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); 1938 1939 /* 1940 * Unmount any snapshots mounted under .zfs before unmounting the 1941 * dataset itself. 1942 */ 1943 if (zfsvfs->z_ctldir != NULL && 1944 (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) { 1945 return (ret); 1946 } 1947 1948 #if 0 1949 if (!(fflag & MS_FORCE)) { 1950 /* 1951 * Check the number of active vnodes in the file system. 1952 * Our count is maintained in the vfs structure, but the 1953 * number is off by 1 to indicate a hold on the vfs 1954 * structure itself. 
1955 * 1956 * The '.zfs' directory maintains a reference of its 1957 * own, and any active references underneath are 1958 * reflected in the vnode count. 1959 */ 1960 if (zfsvfs->z_ctldir == NULL) { 1961 if (vfsp->vfs_count > 1){ 1962 return (EBUSY); 1963 } 1964 } else { 1965 if (vfsp->vfs_count > 2 || 1966 zfsvfs->z_ctldir->v_count > 1) { 1967 return (EBUSY); 1968 } 1969 } 1970 } 1971 #endif 1972 ret = vflush(vfsp, NULL, (ISSET(fflag, MS_FORCE)? FORCECLOSE : 0)); 1973 if (ret != 0) 1974 return ret; 1975 vfsp->vfs_flag |= VFS_UNMOUNTED; 1976 1977 VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); 1978 os = zfsvfs->z_os; 1979 1980 /* 1981 * z_os will be NULL if there was an error in 1982 * attempting to reopen zfsvfs. 1983 */ 1984 if (os != NULL) { 1985 /* 1986 * Unset the objset user_ptr. 1987 */ 1988 mutex_enter(&os->os_user_ptr_lock); 1989 dmu_objset_set_user(os, NULL); 1990 mutex_exit(&os->os_user_ptr_lock); 1991 1992 /* 1993 * Finally release the objset 1994 */ 1995 dmu_objset_disown(os, zfsvfs); 1996 } 1997 1998 /* 1999 * We can now safely destroy the '.zfs' directory node. 2000 */ 2001 if (zfsvfs->z_ctldir != NULL) 2002 zfsctl_destroy(zfsvfs); 2003 2004 return (0); 2005 } 2006 2007 static int 2008 zfs_vget(vfs_t *vfsp, ino_t ino, vnode_t **vpp) 2009 { 2010 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2011 znode_t *zp; 2012 int err; 2013 2014 dprintf("zfs_vget called\n"); 2015 dprintf("vfsp->vfs_count %d\n", vfsp->vfs_count); 2016 2017 ZFS_ENTER(zfsvfs); 2018 err = zfs_zget(zfsvfs, ino, &zp); 2019 if (err == 0 && zp->z_unlinked) { 2020 VN_RELE(ZTOV(zp)); 2021 err = EINVAL; 2022 } 2023 if (err != 0) 2024 *vpp = NULL; 2025 else { 2026 *vpp = ZTOV(zp); 2027 /* XXX NetBSD how to get flags for vn_lock ? 
*/ 2028 vn_lock(*vpp, 0); 2029 } 2030 ZFS_EXIT(zfsvfs); 2031 return (err); 2032 } 2033 2034 static int 2035 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp) 2036 { 2037 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2038 znode_t *zp; 2039 uint64_t object = 0; 2040 uint64_t fid_gen = 0; 2041 uint64_t gen_mask; 2042 uint64_t zp_gen; 2043 int i, err; 2044 2045 *vpp = NULL; 2046 2047 dprintf("zfs_fhtovp called\n"); 2048 dprintf("vfsp->vfs_count %d\n", vfsp->vfs_count); 2049 2050 ZFS_ENTER(zfsvfs); 2051 2052 if (fidp->fid_len == LONG_FID_LEN) { 2053 zfid_long_t *zlfid = (zfid_long_t *)fidp; 2054 uint64_t objsetid = 0; 2055 uint64_t setgen = 0; 2056 2057 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 2058 objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); 2059 2060 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 2061 setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); 2062 2063 ZFS_EXIT(zfsvfs); 2064 2065 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); 2066 if (err) 2067 return (EINVAL); 2068 ZFS_ENTER(zfsvfs); 2069 } 2070 2071 if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { 2072 zfid_short_t *zfid = (zfid_short_t *)fidp; 2073 2074 for (i = 0; i < sizeof (zfid->zf_object); i++) 2075 object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); 2076 2077 for (i = 0; i < sizeof (zfid->zf_gen); i++) 2078 fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); 2079 } else { 2080 ZFS_EXIT(zfsvfs); 2081 return (EINVAL); 2082 } 2083 2084 /* A zero fid_gen means we are in the .zfs control directories */ 2085 if (fid_gen == 0 && 2086 (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { 2087 *vpp = zfsvfs->z_ctldir; 2088 ASSERT(*vpp != NULL); 2089 if (object == ZFSCTL_INO_SNAPDIR) { 2090 VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL, 2091 0, NULL, NULL, NULL, NULL, NULL) == 0); 2092 } else { 2093 VN_HOLD(*vpp); 2094 } 2095 ZFS_EXIT(zfsvfs); 2096 /* XXX: LK_RETRY? 
*/ 2097 vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); 2098 return (0); 2099 } 2100 2101 gen_mask = -1ULL >> (64 - 8 * i); 2102 2103 dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); 2104 if (err = zfs_zget(zfsvfs, object, &zp)) { 2105 ZFS_EXIT(zfsvfs); 2106 return (err); 2107 } 2108 zp_gen = zp->z_phys->zp_gen & gen_mask; 2109 if (zp_gen == 0) 2110 zp_gen = 1; 2111 if (zp->z_unlinked || zp_gen != fid_gen) { 2112 dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); 2113 VN_RELE(ZTOV(zp)); 2114 ZFS_EXIT(zfsvfs); 2115 return (EINVAL); 2116 } 2117 2118 *vpp = ZTOV(zp); 2119 /* XXX: LK_RETRY? */ 2120 vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); 2121 ZFS_EXIT(zfsvfs); 2122 return (0); 2123 } 2124 2125 /* 2126 * Block out VOPs and close zfsvfs_t::z_os 2127 * 2128 * Note, if successful, then we return with the 'z_teardown_lock' and 2129 * 'z_teardown_inactive_lock' write held. 2130 */ 2131 int 2132 zfs_suspend_fs(zfsvfs_t *zfsvfs) 2133 { 2134 int error; 2135 2136 if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) 2137 return (error); 2138 dmu_objset_disown(zfsvfs->z_os, zfsvfs); 2139 2140 return (0); 2141 } 2142 2143 /* 2144 * Reopen zfsvfs_t::z_os and release VOPs. 2145 */ 2146 int 2147 zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname) 2148 { 2149 int err; 2150 2151 ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock)); 2152 ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); 2153 2154 err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zfsvfs, 2155 &zfsvfs->z_os); 2156 if (err) { 2157 zfsvfs->z_os = NULL; 2158 } else { 2159 znode_t *zp; 2160 2161 VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); 2162 2163 /* 2164 * Attempt to re-establish all the active znodes with 2165 * their dbufs. If a zfs_rezget() fails, then we'll let 2166 * any potential callers discover that via ZFS_ENTER_VERIFY_VP 2167 * when they try to use their znode. 
2168 */ 2169 mutex_enter(&zfsvfs->z_znodes_lock); 2170 for (zp = list_head(&zfsvfs->z_all_znodes); zp; 2171 zp = list_next(&zfsvfs->z_all_znodes, zp)) { 2172 (void) zfs_rezget(zp); 2173 } 2174 mutex_exit(&zfsvfs->z_znodes_lock); 2175 2176 } 2177 2178 /* release the VOPs */ 2179 rw_exit(&zfsvfs->z_teardown_inactive_lock); 2180 rrw_exit(&zfsvfs->z_teardown_lock, FTAG); 2181 2182 if (err) { 2183 /* 2184 * Since we couldn't reopen zfsvfs::z_os, force 2185 * unmount this file system. 2186 */ 2187 if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) 2188 (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curlwp); 2189 } 2190 return (err); 2191 } 2192 2193 static void 2194 zfs_freevfs(vfs_t *vfsp) 2195 { 2196 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2197 2198 /* 2199 * If this is a snapshot, we have an extra VFS_HOLD on our parent 2200 * from zfs_mount(). Release it here. 2201 */ 2202 if (zfsvfs->z_issnap) 2203 VFS_RELE(zfsvfs->z_parent->z_vfs); 2204 2205 zfsvfs_free(zfsvfs); 2206 2207 atomic_add_32(&zfs_active_fs_count, -1); 2208 } 2209 2210 /* 2211 * VFS_INIT() initialization. Note that there is no VFS_FINI(), 2212 * so we can't safely do any non-idempotent initialization here. 2213 * Leave that to zfs_init() and zfs_fini(), which are called 2214 * from the module's _init() and _fini() entry points. 2215 */ 2216 /*ARGSUSED*/ 2217 int 2218 zfs_vfsinit(int fstype, char *name) 2219 { 2220 int error; 2221 2222 zfsfstype = fstype; 2223 2224 /* 2225 * Setup vfsops and vnodeops tables. 2226 */ 2227 error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops); 2228 2229 error = zfs_create_op_tables(); 2230 if (error) { 2231 zfs_remove_op_tables(); 2232 cmn_err(CE_WARN, "zfs: bad vnode ops template"); 2233 vfs_freevfsops_by_type(zfsfstype); 2234 return (error); 2235 } 2236 2237 mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL); 2238 mutex_init(&zfs_debug_mtx, NULL, MUTEX_DEFAULT, NULL); 2239 2240 /* 2241 * Unique major number for all zfs mounts. 
2242 * If we run out of 32-bit minors, we'll getudev() another major. 2243 */ 2244 zfs_major = ddi_name_to_major(ZFS_DRIVER); 2245 zfs_minor = ZFS_MIN_MINOR; 2246 2247 return (0); 2248 } 2249 2250 int 2251 zfs_vfsfini(void) 2252 { 2253 int err; 2254 2255 err = vfs_detach(&zfs_vfsops_template); 2256 if (err != 0) 2257 return err; 2258 2259 mutex_destroy(&zfs_debug_mtx); 2260 mutex_destroy(&zfs_dev_mtx); 2261 2262 return 0; 2263 } 2264 2265 void 2266 zfs_init(void) 2267 { 2268 /* 2269 * Initialize .zfs directory structures 2270 */ 2271 zfsctl_init(); 2272 2273 /* 2274 * Initialize znode cache, vnode ops, etc... 2275 */ 2276 zfs_znode_init(); 2277 2278 dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); 2279 } 2280 2281 void 2282 zfs_fini(void) 2283 { 2284 zfsctl_fini(); 2285 zfs_znode_fini(); 2286 } 2287 2288 int 2289 zfs_busy(void) 2290 { 2291 return (zfs_active_fs_count != 0); 2292 } 2293 2294 int 2295 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) 2296 { 2297 int error; 2298 objset_t *os = zfsvfs->z_os; 2299 dmu_tx_t *tx; 2300 2301 if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) 2302 return (EINVAL); 2303 2304 if (newvers < zfsvfs->z_version) 2305 return (EINVAL); 2306 2307 tx = dmu_tx_create(os); 2308 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); 2309 error = dmu_tx_assign(tx, TXG_WAIT); 2310 if (error) { 2311 dmu_tx_abort(tx); 2312 return (error); 2313 } 2314 error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 2315 8, 1, &newvers, tx); 2316 2317 if (error) { 2318 dmu_tx_commit(tx); 2319 return (error); 2320 } 2321 2322 spa_history_internal_log(LOG_DS_UPGRADE, 2323 dmu_objset_spa(os), tx, CRED(), 2324 "oldver=%llu newver=%llu dataset = %llu", 2325 zfsvfs->z_version, newvers, dmu_objset_id(os)); 2326 2327 dmu_tx_commit(tx); 2328 2329 zfsvfs->z_version = newvers; 2330 2331 if (zfsvfs->z_version >= ZPL_VERSION_FUID) 2332 zfs_set_fuid_feature(zfsvfs); 2333 2334 return (0); 2335 } 2336 2337 /* 2338 * Read a property 
stored within the master node. 2339 */ 2340 int 2341 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) 2342 { 2343 const char *pname; 2344 int error = ENOENT; 2345 2346 /* 2347 * Look up the file system's value for the property. For the 2348 * version property, we look up a slightly different string. 2349 */ 2350 if (prop == ZFS_PROP_VERSION) 2351 pname = ZPL_VERSION_STR; 2352 else 2353 pname = zfs_prop_to_name(prop); 2354 2355 if (os != NULL) 2356 error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); 2357 2358 if (error == ENOENT) { 2359 /* No value set, use the default value */ 2360 switch (prop) { 2361 case ZFS_PROP_VERSION: 2362 *value = ZPL_VERSION; 2363 break; 2364 case ZFS_PROP_NORMALIZE: 2365 case ZFS_PROP_UTF8ONLY: 2366 *value = 0; 2367 break; 2368 case ZFS_PROP_CASE: 2369 *value = ZFS_CASE_SENSITIVE; 2370 break; 2371 default: 2372 return (error); 2373 } 2374 error = 0; 2375 } 2376 return (error); 2377 } 2378 2379 static int 2380 zfs_start(vfs_t *vfsp, int flags) 2381 { 2382 2383 return (0); 2384 } 2385 2386 2387 #ifdef TODO 2388 static vfsdef_t vfw = { 2389 VFSDEF_VERSION, 2390 MNTTYPE_ZFS, 2391 zfs_vfsinit, 2392 VSW_HASPROTO|VSW_CANRWRO|VSW_CANREMOUNT|VSW_VOLATILEDEV|VSW_STATS| 2393 VSW_XID, 2394 &zfs_mntopts 2395 }; 2396 2397 struct modlfs zfs_modlfs = { 2398 &mod_fsops, "ZFS filesystem version " SPA_VERSION_STRING, &vfw 2399 }; 2400 #endif 2401