1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>. 24 * All rights reserved. 25 * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 26 * Copyright (c) 2014 Integros [integros.com] 27 * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 28 */ 29 30 /* Portions Copyright 2010 Robert Milkowski */ 31 32 #include <sys/types.h> 33 #include <sys/param.h> 34 #include <sys/systm.h> 35 #include <sys/kernel.h> 36 #include <sys/sysmacros.h> 37 #include <sys/kmem.h> 38 #include <sys/acl.h> 39 #include <sys/vnode.h> 40 #include <sys/vfs.h> 41 #include <sys/mntent.h> 42 #include <sys/mount.h> 43 #include <sys/cmn_err.h> 44 #include <sys/zfs_znode.h> 45 #include <sys/zfs_dir.h> 46 #include <sys/zil.h> 47 #include <sys/fs/zfs.h> 48 #include <sys/dmu.h> 49 #include <sys/dsl_prop.h> 50 #include <sys/dsl_dataset.h> 51 #include <sys/dsl_deleg.h> 52 #include <sys/spa.h> 53 #include <sys/zap.h> 54 #include <sys/sa.h> 55 #include <sys/sa_impl.h> 56 #include <sys/policy.h> 57 #include <sys/atomic.h> 58 #include <sys/zfs_ioctl.h> 59 #include <sys/zfs_ctldir.h> 60 #include <sys/zfs_fuid.h> 61 #include <sys/sunddi.h> 62 #include <sys/dmu_objset.h> 63 #include <sys/dsl_dir.h> 64 #include <sys/spa_boot.h> 65 #include <sys/jail.h> 66 #include <ufs/ufs/quota.h> 67 #include <sys/zfs_quota.h> 68 69 #include "zfs_comutil.h" 70 71 #ifndef MNTK_VMSETSIZE_BUG 72 #define MNTK_VMSETSIZE_BUG 0 73 #endif 74 #ifndef MNTK_NOMSYNC 75 #define MNTK_NOMSYNC 8 76 #endif 77 78 /* BEGIN CSTYLED */ 79 struct mtx zfs_debug_mtx; 80 MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); 81 82 SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); 83 84 int zfs_super_owner; 85 SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, 86 "File system owner can perform privileged operation on his file systems"); 87 88 int zfs_debug_level; 89 SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, 90 "Debug level"); 91 92 SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions"); 93 static int zfs_version_acl = ZFS_ACL_VERSION; 94 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, 95 "ZFS_ACL_VERSION"); 96 static int zfs_version_spa = SPA_VERSION; 97 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, 98 "SPA_VERSION"); 99 static int zfs_version_zpl = ZPL_VERSION; 100 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, 101 "ZPL_VERSION"); 102 /* END CSTYLED */ 103 104 static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg); 105 static int zfs_mount(vfs_t *vfsp); 106 static int zfs_umount(vfs_t *vfsp, int fflag); 107 static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp); 108 static int zfs_statfs(vfs_t *vfsp, struct statfs *statp); 109 static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); 110 static int zfs_sync(vfs_t *vfsp, int waitfor); 111 #if __FreeBSD_version >= 1300098 112 static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp, 113 struct ucred **credanonp, int *numsecflavors, int *secflavors); 114 #else 115 static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, 116 struct ucred **credanonp, int *numsecflavors, int **secflavors); 117 #endif 118 static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp); 119 static void zfs_freevfs(vfs_t *vfsp); 120 121 struct vfsops zfs_vfsops = { 122 .vfs_mount = zfs_mount, 123 .vfs_unmount = zfs_umount, 124 #if __FreeBSD_version >= 1300049 125 .vfs_root = vfs_cache_root, 126 .vfs_cachedroot = zfs_root, 127 #else 128 .vfs_root = zfs_root, 129 #endif 130 .vfs_statfs = zfs_statfs, 131 .vfs_vget = zfs_vget, 132 .vfs_sync = zfs_sync, 133 .vfs_checkexp = zfs_checkexp, 134 .vfs_fhtovp = zfs_fhtovp, 135 .vfs_quotactl = zfs_quotactl, 136 }; 137 138 VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN); 139 140 /* 141 * We need to keep a count of active fs's. 142 * This is necessary to prevent our module 143 * from being unloaded after a umount -f 144 */ 145 static uint32_t zfs_active_fs_count = 0; 146 147 int 148 zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val, 149 char *setpoint) 150 { 151 int error; 152 zfsvfs_t *zfvp; 153 vfs_t *vfsp; 154 objset_t *os; 155 uint64_t tmp = *val; 156 157 error = dmu_objset_from_ds(ds, &os); 158 if (error != 0) 159 return (error); 160 161 error = getzfsvfs_impl(os, &zfvp); 162 if (error != 0) 163 return (error); 164 if (zfvp == NULL) 165 return (ENOENT); 166 vfsp = zfvp->z_vfs; 167 switch (zfs_prop) { 168 case ZFS_PROP_ATIME: 169 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) 170 tmp = 0; 171 if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) 172 tmp = 1; 173 break; 174 case ZFS_PROP_DEVICES: 175 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) 176 tmp = 0; 177 if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) 178 tmp = 1; 179 break; 180 case ZFS_PROP_EXEC: 181 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) 182 tmp = 0; 183 if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) 184 tmp = 1; 185 break; 186 case ZFS_PROP_SETUID: 187 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) 188 tmp = 0; 189 if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) 190 tmp = 1; 191 break; 192 case ZFS_PROP_READONLY: 193 if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) 194 tmp = 0; 195 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) 196 tmp = 1; 197 break; 198 case ZFS_PROP_XATTR: 199 if (zfvp->z_flags & ZSB_XATTR) 200 tmp = zfvp->z_xattr; 201 break; 202 case ZFS_PROP_NBMAND: 203 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) 204 tmp = 0; 205 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) 206 tmp = 1; 207 break; 208 default: 209 vfs_unbusy(vfsp); 210 return (ENOENT); 211 } 212 213 vfs_unbusy(vfsp); 214 if (tmp != *val) { 215 (void) strcpy(setpoint, "temporary"); 216 *val = tmp; 217 } 218 return (0); 219 } 220 221 static int 222 zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp) 223 { 224 int error = 0; 225 char buf[32]; 226 uint64_t usedobj, quotaobj; 227 uint64_t quota, used = 0; 228 timespec_t now; 229 230 usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; 231 quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; 232 233 if (quotaobj == 0 || zfsvfs->z_replay) { 234 error = ENOENT; 235 goto done; 236 } 237 (void) sprintf(buf, "%llx", (longlong_t)id); 238 if ((error = zap_lookup(zfsvfs->z_os, quotaobj, 239 buf, sizeof (quota), 1, "a)) != 0) { 240 dprintf("%s(%d): quotaobj lookup failed\n", 241 __FUNCTION__, __LINE__); 242 goto done; 243 } 244 /* 245 * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit". 246 * So we set them to be the same. 247 */ 248 dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota); 249 error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used); 250 if (error && error != ENOENT) { 251 dprintf("%s(%d): usedobj failed; %d\n", 252 __FUNCTION__, __LINE__, error); 253 goto done; 254 } 255 dqp->dqb_curblocks = btodb(used); 256 dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0; 257 vfs_timestamp(&now); 258 /* 259 * Setting this to 0 causes FreeBSD quota(8) to print 260 * the number of days since the epoch, which isn't 261 * particularly useful. 262 */ 263 dqp->dqb_btime = dqp->dqb_itime = now.tv_sec; 264 done: 265 return (error); 266 } 267 268 static int 269 zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg) 270 { 271 zfsvfs_t *zfsvfs = vfsp->vfs_data; 272 struct thread *td; 273 int cmd, type, error = 0; 274 int bitsize; 275 zfs_userquota_prop_t quota_type; 276 struct dqblk64 dqblk = { 0 }; 277 278 td = curthread; 279 cmd = cmds >> SUBCMDSHIFT; 280 type = cmds & SUBCMDMASK; 281 282 ZFS_ENTER(zfsvfs); 283 if (id == -1) { 284 switch (type) { 285 case USRQUOTA: 286 id = td->td_ucred->cr_ruid; 287 break; 288 case GRPQUOTA: 289 id = td->td_ucred->cr_rgid; 290 break; 291 default: 292 error = EINVAL; 293 if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF) 294 vfs_unbusy(vfsp); 295 goto done; 296 } 297 } 298 /* 299 * Map BSD type to: 300 * ZFS_PROP_USERUSED, 301 * ZFS_PROP_USERQUOTA, 302 * ZFS_PROP_GROUPUSED, 303 * ZFS_PROP_GROUPQUOTA 304 */ 305 switch (cmd) { 306 case Q_SETQUOTA: 307 case Q_SETQUOTA32: 308 if (type == USRQUOTA) 309 quota_type = ZFS_PROP_USERQUOTA; 310 else if (type == GRPQUOTA) 311 quota_type = ZFS_PROP_GROUPQUOTA; 312 else 313 error = EINVAL; 314 break; 315 case Q_GETQUOTA: 316 case Q_GETQUOTA32: 317 if (type == USRQUOTA) 318 quota_type = ZFS_PROP_USERUSED; 319 else if (type == GRPQUOTA) 320 quota_type = ZFS_PROP_GROUPUSED; 321 else 322 error = EINVAL; 323 break; 324 } 325 326 /* 327 * Depending on the cmd, we may need to get 328 * the ruid and domain (see fuidstr_to_sid?), 329 * the fuid (how?), or other information. 330 * Create fuid using zfs_fuid_create(zfsvfs, id, 331 * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)? 332 * I think I can use just the id? 333 * 334 * Look at zfs_id_overquota() to look up a quota. 335 * zap_lookup(something, quotaobj, fuidstring, 336 * sizeof (long long), 1, "a) 337 * 338 * See zfs_set_userquota() to set a quota. 339 */ 340 if ((uint32_t)type >= MAXQUOTAS) { 341 error = EINVAL; 342 goto done; 343 } 344 345 switch (cmd) { 346 case Q_GETQUOTASIZE: 347 bitsize = 64; 348 error = copyout(&bitsize, arg, sizeof (int)); 349 break; 350 case Q_QUOTAON: 351 // As far as I can tell, you can't turn quotas on or off on zfs 352 error = 0; 353 vfs_unbusy(vfsp); 354 break; 355 case Q_QUOTAOFF: 356 error = ENOTSUP; 357 vfs_unbusy(vfsp); 358 break; 359 case Q_SETQUOTA: 360 error = copyin(arg, &dqblk, sizeof (dqblk)); 361 if (error == 0) 362 error = zfs_set_userquota(zfsvfs, quota_type, 363 "", id, dbtob(dqblk.dqb_bhardlimit)); 364 break; 365 case Q_GETQUOTA: 366 error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk); 367 if (error == 0) 368 error = copyout(&dqblk, arg, sizeof (dqblk)); 369 break; 370 default: 371 error = EINVAL; 372 break; 373 } 374 done: 375 ZFS_EXIT(zfsvfs); 376 return (error); 377 } 378 379 380 boolean_t 381 zfs_is_readonly(zfsvfs_t *zfsvfs) 382 { 383 return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)); 384 } 385 386 /*ARGSUSED*/ 387 static int 388 zfs_sync(vfs_t *vfsp, int waitfor) 389 { 390 391 /* 392 * Data integrity is job one. We don't want a compromised kernel 393 * writing to the storage pool, so we never sync during panic. 394 */ 395 if (panicstr) 396 return (0); 397 398 /* 399 * Ignore the system syncher. ZFS already commits async data 400 * at zfs_txg_timeout intervals. 401 */ 402 if (waitfor == MNT_LAZY) 403 return (0); 404 405 if (vfsp != NULL) { 406 /* 407 * Sync a specific filesystem. 408 */ 409 zfsvfs_t *zfsvfs = vfsp->vfs_data; 410 dsl_pool_t *dp; 411 int error; 412 413 error = vfs_stdsync(vfsp, waitfor); 414 if (error != 0) 415 return (error); 416 417 ZFS_ENTER(zfsvfs); 418 dp = dmu_objset_pool(zfsvfs->z_os); 419 420 /* 421 * If the system is shutting down, then skip any 422 * filesystems which may exist on a suspended pool. 423 */ 424 if (rebooting && spa_suspended(dp->dp_spa)) { 425 ZFS_EXIT(zfsvfs); 426 return (0); 427 } 428 429 if (zfsvfs->z_log != NULL) 430 zil_commit(zfsvfs->z_log, 0); 431 432 ZFS_EXIT(zfsvfs); 433 } else { 434 /* 435 * Sync all ZFS filesystems. This is what happens when you 436 * run sync(1M). Unlike other filesystems, ZFS honors the 437 * request by waiting for all pools to commit all dirty data. 438 */ 439 spa_sync_allpools(); 440 } 441 442 return (0); 443 } 444 445 static void 446 atime_changed_cb(void *arg, uint64_t newval) 447 { 448 zfsvfs_t *zfsvfs = arg; 449 450 if (newval == TRUE) { 451 zfsvfs->z_atime = TRUE; 452 zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME; 453 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); 454 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); 455 } else { 456 zfsvfs->z_atime = FALSE; 457 zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME; 458 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); 459 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); 460 } 461 } 462 463 static void 464 xattr_changed_cb(void *arg, uint64_t newval) 465 { 466 zfsvfs_t *zfsvfs = arg; 467 468 if (newval == ZFS_XATTR_OFF) { 469 zfsvfs->z_flags &= ~ZSB_XATTR; 470 } else { 471 zfsvfs->z_flags |= ZSB_XATTR; 472 473 if (newval == ZFS_XATTR_SA) 474 zfsvfs->z_xattr_sa = B_TRUE; 475 else 476 zfsvfs->z_xattr_sa = B_FALSE; 477 } 478 } 479 480 static void 481 blksz_changed_cb(void *arg, uint64_t newval) 482 { 483 zfsvfs_t *zfsvfs = arg; 484 ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); 485 ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); 486 ASSERT(ISP2(newval)); 487 488 zfsvfs->z_max_blksz = newval; 489 zfsvfs->z_vfs->mnt_stat.f_iosize = newval; 490 } 491 492 static void 493 readonly_changed_cb(void *arg, uint64_t newval) 494 { 495 zfsvfs_t *zfsvfs = arg; 496 497 if (newval) { 498 /* XXX locking on vfs_flag? */ 499 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; 500 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); 501 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); 502 } else { 503 /* XXX locking on vfs_flag? */ 504 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; 505 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); 506 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); 507 } 508 } 509 510 static void 511 setuid_changed_cb(void *arg, uint64_t newval) 512 { 513 zfsvfs_t *zfsvfs = arg; 514 515 if (newval == FALSE) { 516 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; 517 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); 518 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); 519 } else { 520 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; 521 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); 522 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); 523 } 524 } 525 526 static void 527 exec_changed_cb(void *arg, uint64_t newval) 528 { 529 zfsvfs_t *zfsvfs = arg; 530 531 if (newval == FALSE) { 532 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; 533 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); 534 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); 535 } else { 536 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; 537 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); 538 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); 539 } 540 } 541 542 /* 543 * The nbmand mount option can be changed at mount time. 544 * We can't allow it to be toggled on live file systems or incorrect 545 * behavior may be seen from cifs clients 546 * 547 * This property isn't registered via dsl_prop_register(), but this callback 548 * will be called when a file system is first mounted 549 */ 550 static void 551 nbmand_changed_cb(void *arg, uint64_t newval) 552 { 553 zfsvfs_t *zfsvfs = arg; 554 if (newval == FALSE) { 555 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); 556 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); 557 } else { 558 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); 559 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); 560 } 561 } 562 563 static void 564 snapdir_changed_cb(void *arg, uint64_t newval) 565 { 566 zfsvfs_t *zfsvfs = arg; 567 568 zfsvfs->z_show_ctldir = newval; 569 } 570 571 static void 572 vscan_changed_cb(void *arg, uint64_t newval) 573 { 574 zfsvfs_t *zfsvfs = arg; 575 576 zfsvfs->z_vscan = newval; 577 } 578 579 static void 580 acl_mode_changed_cb(void *arg, uint64_t newval) 581 { 582 zfsvfs_t *zfsvfs = arg; 583 584 zfsvfs->z_acl_mode = newval; 585 } 586 587 static void 588 acl_inherit_changed_cb(void *arg, uint64_t newval) 589 { 590 zfsvfs_t *zfsvfs = arg; 591 592 zfsvfs->z_acl_inherit = newval; 593 } 594 595 static void 596 acl_type_changed_cb(void *arg, uint64_t newval) 597 { 598 zfsvfs_t *zfsvfs = arg; 599 600 zfsvfs->z_acl_type = newval; 601 } 602 603 static int 604 zfs_register_callbacks(vfs_t *vfsp) 605 { 606 struct dsl_dataset *ds = NULL; 607 objset_t *os = NULL; 608 zfsvfs_t *zfsvfs = NULL; 609 uint64_t nbmand; 610 boolean_t readonly = B_FALSE; 611 boolean_t do_readonly = B_FALSE; 612 boolean_t setuid = B_FALSE; 613 boolean_t do_setuid = B_FALSE; 614 boolean_t exec = B_FALSE; 615 boolean_t do_exec = B_FALSE; 616 boolean_t xattr = B_FALSE; 617 boolean_t atime = B_FALSE; 618 boolean_t do_atime = B_FALSE; 619 boolean_t do_xattr = B_FALSE; 620 int error = 0; 621 622 ASSERT(vfsp); 623 zfsvfs = vfsp->vfs_data; 624 ASSERT(zfsvfs); 625 os = zfsvfs->z_os; 626 627 /* 628 * This function can be called for a snapshot when we update snapshot's 629 * mount point, which isn't really supported. 630 */ 631 if (dmu_objset_is_snapshot(os)) 632 return (EOPNOTSUPP); 633 634 /* 635 * The act of registering our callbacks will destroy any mount 636 * options we may have. In order to enable temporary overrides 637 * of mount options, we stash away the current values and 638 * restore them after we register the callbacks. 639 */ 640 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || 641 !spa_writeable(dmu_objset_spa(os))) { 642 readonly = B_TRUE; 643 do_readonly = B_TRUE; 644 } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { 645 readonly = B_FALSE; 646 do_readonly = B_TRUE; 647 } 648 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { 649 setuid = B_FALSE; 650 do_setuid = B_TRUE; 651 } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { 652 setuid = B_TRUE; 653 do_setuid = B_TRUE; 654 } 655 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { 656 exec = B_FALSE; 657 do_exec = B_TRUE; 658 } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { 659 exec = B_TRUE; 660 do_exec = B_TRUE; 661 } 662 if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { 663 zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF; 664 do_xattr = B_TRUE; 665 } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { 666 zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR; 667 do_xattr = B_TRUE; 668 } else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) { 669 zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR; 670 do_xattr = B_TRUE; 671 } else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) { 672 zfsvfs->z_xattr = xattr = ZFS_XATTR_SA; 673 do_xattr = B_TRUE; 674 } 675 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { 676 atime = B_FALSE; 677 do_atime = B_TRUE; 678 } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { 679 atime = B_TRUE; 680 do_atime = B_TRUE; 681 } 682 683 /* 684 * We need to enter pool configuration here, so that we can use 685 * dsl_prop_get_int_ds() to handle the special nbmand property below. 686 * dsl_prop_get_integer() can not be used, because it has to acquire 687 * spa_namespace_lock and we can not do that because we already hold 688 * z_teardown_lock. The problem is that spa_write_cachefile() is called 689 * with spa_namespace_lock held and the function calls ZFS vnode 690 * operations to write the cache file and thus z_teardown_lock is 691 * acquired after spa_namespace_lock. 692 */ 693 ds = dmu_objset_ds(os); 694 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 695 696 /* 697 * nbmand is a special property. It can only be changed at 698 * mount time. 699 * 700 * This is weird, but it is documented to only be changeable 701 * at mount time. 702 */ 703 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { 704 nbmand = B_FALSE; 705 } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { 706 nbmand = B_TRUE; 707 } else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0)) { 708 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 709 return (error); 710 } 711 712 /* 713 * Register property callbacks. 714 * 715 * It would probably be fine to just check for i/o error from 716 * the first prop_register(), but I guess I like to go 717 * overboard... 718 */ 719 error = dsl_prop_register(ds, 720 zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); 721 error = error ? error : dsl_prop_register(ds, 722 zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); 723 error = error ? error : dsl_prop_register(ds, 724 zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); 725 error = error ? error : dsl_prop_register(ds, 726 zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); 727 error = error ? error : dsl_prop_register(ds, 728 zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); 729 error = error ? error : dsl_prop_register(ds, 730 zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); 731 error = error ? error : dsl_prop_register(ds, 732 zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); 733 error = error ? error : dsl_prop_register(ds, 734 zfs_prop_to_name(ZFS_PROP_ACLTYPE), acl_type_changed_cb, zfsvfs); 735 error = error ? error : dsl_prop_register(ds, 736 zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); 737 error = error ? error : dsl_prop_register(ds, 738 zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, 739 zfsvfs); 740 error = error ? error : dsl_prop_register(ds, 741 zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs); 742 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 743 if (error) 744 goto unregister; 745 746 /* 747 * Invoke our callbacks to restore temporary mount options. 748 */ 749 if (do_readonly) 750 readonly_changed_cb(zfsvfs, readonly); 751 if (do_setuid) 752 setuid_changed_cb(zfsvfs, setuid); 753 if (do_exec) 754 exec_changed_cb(zfsvfs, exec); 755 if (do_xattr) 756 xattr_changed_cb(zfsvfs, xattr); 757 if (do_atime) 758 atime_changed_cb(zfsvfs, atime); 759 760 nbmand_changed_cb(zfsvfs, nbmand); 761 762 return (0); 763 764 unregister: 765 dsl_prop_unregister_all(ds, zfsvfs); 766 return (error); 767 } 768 769 /* 770 * Associate this zfsvfs with the given objset, which must be owned. 771 * This will cache a bunch of on-disk state from the objset in the 772 * zfsvfs. 773 */ 774 static int 775 zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) 776 { 777 int error; 778 uint64_t val; 779 780 zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; 781 zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; 782 zfsvfs->z_os = os; 783 784 error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); 785 if (error != 0) 786 return (error); 787 if (zfsvfs->z_version > 788 zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { 789 (void) printf("Can't mount a version %lld file system " 790 "on a version %lld pool\n. Pool must be upgraded to mount " 791 "this file system.", (u_longlong_t)zfsvfs->z_version, 792 (u_longlong_t)spa_version(dmu_objset_spa(os))); 793 return (SET_ERROR(ENOTSUP)); 794 } 795 error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); 796 if (error != 0) 797 return (error); 798 zfsvfs->z_norm = (int)val; 799 800 error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); 801 if (error != 0) 802 return (error); 803 zfsvfs->z_utf8 = (val != 0); 804 805 error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); 806 if (error != 0) 807 return (error); 808 zfsvfs->z_case = (uint_t)val; 809 810 error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val); 811 if (error != 0) 812 return (error); 813 zfsvfs->z_acl_type = (uint_t)val; 814 815 /* 816 * Fold case on file systems that are always or sometimes case 817 * insensitive. 818 */ 819 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 820 zfsvfs->z_case == ZFS_CASE_MIXED) 821 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; 822 823 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); 824 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); 825 826 uint64_t sa_obj = 0; 827 if (zfsvfs->z_use_sa) { 828 /* should either have both of these objects or none */ 829 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, 830 &sa_obj); 831 if (error != 0) 832 return (error); 833 } 834 835 error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, 836 &zfsvfs->z_attr_table); 837 if (error != 0) 838 return (error); 839 840 if (zfsvfs->z_version >= ZPL_VERSION_SA) 841 sa_register_update_callback(os, zfs_sa_upgrade); 842 843 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, 844 &zfsvfs->z_root); 845 if (error != 0) 846 return (error); 847 ASSERT(zfsvfs->z_root != 0); 848 849 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, 850 &zfsvfs->z_unlinkedobj); 851 if (error != 0) 852 return (error); 853 854 error = zap_lookup(os, MASTER_NODE_OBJ, 855 zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], 856 8, 1, &zfsvfs->z_userquota_obj); 857 if (error == ENOENT) 858 zfsvfs->z_userquota_obj = 0; 859 else if (error != 0) 860 return (error); 861 862 error = zap_lookup(os, MASTER_NODE_OBJ, 863 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], 864 8, 1, &zfsvfs->z_groupquota_obj); 865 if (error == ENOENT) 866 zfsvfs->z_groupquota_obj = 0; 867 else if (error != 0) 868 return (error); 869 870 error = zap_lookup(os, MASTER_NODE_OBJ, 871 zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA], 872 8, 1, &zfsvfs->z_projectquota_obj); 873 if (error == ENOENT) 874 zfsvfs->z_projectquota_obj = 0; 875 else if (error != 0) 876 return (error); 877 878 error = zap_lookup(os, MASTER_NODE_OBJ, 879 zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA], 880 8, 1, &zfsvfs->z_userobjquota_obj); 881 if (error == ENOENT) 882 zfsvfs->z_userobjquota_obj = 0; 883 else if (error != 0) 884 return (error); 885 886 error = zap_lookup(os, MASTER_NODE_OBJ, 887 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA], 888 8, 1, &zfsvfs->z_groupobjquota_obj); 889 if (error == ENOENT) 890 zfsvfs->z_groupobjquota_obj = 0; 891 else if (error != 0) 892 return (error); 893 894 error = zap_lookup(os, MASTER_NODE_OBJ, 895 zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA], 896 8, 1, &zfsvfs->z_projectobjquota_obj); 897 if (error == ENOENT) 898 zfsvfs->z_projectobjquota_obj = 0; 899 else if (error != 0) 900 return (error); 901 902 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, 903 &zfsvfs->z_fuid_obj); 904 if (error == ENOENT) 905 zfsvfs->z_fuid_obj = 0; 906 else if (error != 0) 907 return (error); 908 909 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, 910 &zfsvfs->z_shares_dir); 911 if (error == ENOENT) 912 zfsvfs->z_shares_dir = 0; 913 else if (error != 0) 914 return (error); 915 916 /* 917 * Only use the name cache if we are looking for a 918 * name on a file system that does not require normalization 919 * or case folding. We can also look there if we happen to be 920 * on a non-normalizing, mixed sensitivity file system IF we 921 * are looking for the exact name (which is always the case on 922 * FreeBSD). 923 */ 924 zfsvfs->z_use_namecache = !zfsvfs->z_norm || 925 ((zfsvfs->z_case == ZFS_CASE_MIXED) && 926 !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER)); 927 928 return (0); 929 } 930 931 taskq_t *zfsvfs_taskq; 932 933 static void 934 zfsvfs_task_unlinked_drain(void *context, int pending __unused) 935 { 936 937 zfs_unlinked_drain((zfsvfs_t *)context); 938 } 939 940 int 941 zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp) 942 { 943 objset_t *os; 944 zfsvfs_t *zfsvfs; 945 int error; 946 boolean_t ro = (readonly || (strchr(osname, '@') != NULL)); 947 948 /* 949 * XXX: Fix struct statfs so this isn't necessary! 950 * 951 * The 'osname' is used as the filesystem's special node, which means 952 * it must fit in statfs.f_mntfromname, or else it can't be 953 * enumerated, so libzfs_mnttab_find() returns NULL, which causes 954 * 'zfs unmount' to think it's not mounted when it is. 955 */ 956 if (strlen(osname) >= MNAMELEN) 957 return (SET_ERROR(ENAMETOOLONG)); 958 959 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); 960 961 error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, 962 &os); 963 if (error != 0) { 964 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 965 return (error); 966 } 967 968 error = zfsvfs_create_impl(zfvp, zfsvfs, os); 969 970 return (error); 971 } 972 973 974 int 975 zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) 976 { 977 int error; 978 979 zfsvfs->z_vfs = NULL; 980 zfsvfs->z_parent = zfsvfs; 981 982 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); 983 mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); 984 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), 985 offsetof(znode_t, z_link_node)); 986 TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0, 987 zfsvfs_task_unlinked_drain, zfsvfs); 988 ZFS_TEARDOWN_INIT(zfsvfs); 989 ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs); 990 rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); 991 for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) 992 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); 993 994 error = zfsvfs_init(zfsvfs, os); 995 if (error != 0) { 996 dmu_objset_disown(os, B_TRUE, zfsvfs); 997 *zfvp = NULL; 998 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 999 return (error); 1000 } 1001 1002 *zfvp = zfsvfs; 1003 return (0); 1004 } 1005 1006 static int 1007 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) 1008 { 1009 int error; 1010 1011 /* 1012 * Check for a bad on-disk format version now since we 1013 * lied about owning the dataset readonly before. 1014 */ 1015 if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && 1016 dmu_objset_incompatible_encryption_version(zfsvfs->z_os)) 1017 return (SET_ERROR(EROFS)); 1018 1019 error = zfs_register_callbacks(zfsvfs->z_vfs); 1020 if (error) 1021 return (error); 1022 1023 zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); 1024 1025 /* 1026 * If we are not mounting (ie: online recv), then we don't 1027 * have to worry about replaying the log as we blocked all 1028 * operations out since we closed the ZIL. 1029 */ 1030 if (mounting) { 1031 boolean_t readonly; 1032 1033 ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL); 1034 dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os); 1035 1036 /* 1037 * During replay we remove the read only flag to 1038 * allow replays to succeed. 1039 */ 1040 readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; 1041 if (readonly != 0) { 1042 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; 1043 } else { 1044 dsl_dir_t *dd; 1045 zap_stats_t zs; 1046 1047 if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj, 1048 &zs) == 0) { 1049 dataset_kstats_update_nunlinks_kstat( 1050 &zfsvfs->z_kstat, zs.zs_num_entries); 1051 dprintf_ds(zfsvfs->z_os->os_dsl_dataset, 1052 "num_entries in unlinked set: %llu", 1053 zs.zs_num_entries); 1054 } 1055 1056 zfs_unlinked_drain(zfsvfs); 1057 dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; 1058 dd->dd_activity_cancelled = B_FALSE; 1059 } 1060 1061 /* 1062 * Parse and replay the intent log. 1063 * 1064 * Because of ziltest, this must be done after 1065 * zfs_unlinked_drain(). (Further note: ziltest 1066 * doesn't use readonly mounts, where 1067 * zfs_unlinked_drain() isn't called.) This is because 1068 * ziltest causes spa_sync() to think it's committed, 1069 * but actually it is not, so the intent log contains 1070 * many txg's worth of changes. 1071 * 1072 * In particular, if object N is in the unlinked set in 1073 * the last txg to actually sync, then it could be 1074 * actually freed in a later txg and then reallocated 1075 * in a yet later txg. This would write a "create 1076 * object N" record to the intent log. Normally, this 1077 * would be fine because the spa_sync() would have 1078 * written out the fact that object N is free, before 1079 * we could write the "create object N" intent log 1080 * record. 1081 * 1082 * But when we are in ziltest mode, we advance the "open 1083 * txg" without actually spa_sync()-ing the changes to 1084 * disk. So we would see that object N is still 1085 * allocated and in the unlinked set, and there is an 1086 * intent log record saying to allocate it. 1087 */ 1088 if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { 1089 if (zil_replay_disable) { 1090 zil_destroy(zfsvfs->z_log, B_FALSE); 1091 } else { 1092 boolean_t use_nc = zfsvfs->z_use_namecache; 1093 zfsvfs->z_use_namecache = B_FALSE; 1094 zfsvfs->z_replay = B_TRUE; 1095 zil_replay(zfsvfs->z_os, zfsvfs, 1096 zfs_replay_vector); 1097 zfsvfs->z_replay = B_FALSE; 1098 zfsvfs->z_use_namecache = use_nc; 1099 } 1100 } 1101 1102 /* restore readonly bit */ 1103 if (readonly != 0) 1104 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; 1105 } 1106 1107 /* 1108 * Set the objset user_ptr to track its zfsvfs. 1109 */ 1110 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); 1111 dmu_objset_set_user(zfsvfs->z_os, zfsvfs); 1112 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); 1113 1114 return (0); 1115 } 1116 1117 void 1118 zfsvfs_free(zfsvfs_t *zfsvfs) 1119 { 1120 int i; 1121 1122 zfs_fuid_destroy(zfsvfs); 1123 1124 mutex_destroy(&zfsvfs->z_znodes_lock); 1125 mutex_destroy(&zfsvfs->z_lock); 1126 ASSERT(zfsvfs->z_nr_znodes == 0); 1127 list_destroy(&zfsvfs->z_all_znodes); 1128 ZFS_TEARDOWN_DESTROY(zfsvfs); 1129 ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs); 1130 rw_destroy(&zfsvfs->z_fuid_lock); 1131 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1132 mutex_destroy(&zfsvfs->z_hold_mtx[i]); 1133 dataset_kstats_destroy(&zfsvfs->z_kstat); 1134 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1135 } 1136 1137 static void 1138 zfs_set_fuid_feature(zfsvfs_t *zfsvfs) 1139 { 1140 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); 1141 if (zfsvfs->z_vfs) { 1142 if (zfsvfs->z_use_fuids) { 1143 vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); 1144 vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); 1145 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); 1146 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); 1147 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); 1148 vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); 1149 } else { 1150 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR); 1151 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); 1152 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); 1153 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); 1154 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); 1155 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE); 1156 } 1157 } 1158 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); 1159 } 1160 1161 static int 1162 zfs_domount(vfs_t *vfsp, char *osname) 1163 { 1164 uint64_t recordsize, fsid_guid; 1165 int error = 0; 1166 zfsvfs_t *zfsvfs; 1167 1168 ASSERT(vfsp); 1169 ASSERT(osname); 1170 1171 error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs); 1172 if (error) 1173 return (error); 1174 zfsvfs->z_vfs = vfsp; 1175 1176 if ((error = dsl_prop_get_integer(osname, 1177 "recordsize", &recordsize, NULL))) 1178 goto out; 1179 zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE; 1180 zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize; 1181 1182 vfsp->vfs_data = zfsvfs; 1183 vfsp->mnt_flag |= MNT_LOCAL; 1184 vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; 1185 vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES; 1186 vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED; 1187 /* 1188 * This can cause a loss of coherence between ARC and page cache 1189 * on ZoF - unclear if the problem is in FreeBSD or ZoF 1190 */ 1191 vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */ 1192 vfsp->mnt_kern_flag |= MNTK_NOMSYNC; 1193 vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG; 1194 1195 #if defined(_KERNEL) && !defined(KMEM_DEBUG) 1196 vfsp->mnt_kern_flag |= MNTK_FPLOOKUP; 1197 #endif 1198 /* 1199 * The fsid is 64 bits, composed of an 8-bit fs type, which 1200 * separates our fsid from any other filesystem types, and a 1201 * 56-bit objset unique ID. The objset unique ID is unique to 1202 * all objsets open on this system, provided by unique_create(). 1203 * The 8-bit fs type must be put in the low bits of fsid[1] 1204 * because that's where other Solaris filesystems put it. 1205 */ 1206 fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); 1207 ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); 1208 vfsp->vfs_fsid.val[0] = fsid_guid; 1209 vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | 1210 (vfsp->mnt_vfc->vfc_typenum & 0xFF); 1211 1212 /* 1213 * Set features for file system. 1214 */ 1215 zfs_set_fuid_feature(zfsvfs); 1216 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { 1217 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); 1218 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); 1219 vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); 1220 } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { 1221 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); 1222 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); 1223 } 1224 vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); 1225 1226 if (dmu_objset_is_snapshot(zfsvfs->z_os)) { 1227 uint64_t pval; 1228 1229 atime_changed_cb(zfsvfs, B_FALSE); 1230 readonly_changed_cb(zfsvfs, B_TRUE); 1231 if ((error = dsl_prop_get_integer(osname, 1232 "xattr", &pval, NULL))) 1233 goto out; 1234 xattr_changed_cb(zfsvfs, pval); 1235 if ((error = dsl_prop_get_integer(osname, 1236 "acltype", &pval, NULL))) 1237 goto out; 1238 acl_type_changed_cb(zfsvfs, pval); 1239 zfsvfs->z_issnap = B_TRUE; 1240 zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; 1241 1242 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); 1243 dmu_objset_set_user(zfsvfs->z_os, zfsvfs); 1244 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); 1245 } else { 1246 if ((error = zfsvfs_setup(zfsvfs, B_TRUE))) 1247 goto out; 1248 } 1249 1250 vfs_mountedfrom(vfsp, osname); 1251 1252 if (!zfsvfs->z_issnap) 1253 zfsctl_create(zfsvfs); 1254 out: 1255 if (error) { 1256 dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); 1257 zfsvfs_free(zfsvfs); 1258 } else { 1259 atomic_inc_32(&zfs_active_fs_count); 1260 } 1261 1262 return (error); 1263 } 1264 1265 static void 1266 zfs_unregister_callbacks(zfsvfs_t *zfsvfs) 1267 { 1268 objset_t *os = zfsvfs->z_os; 1269 1270 if (!dmu_objset_is_snapshot(os)) 1271 dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); 1272 } 1273 1274 static int 1275 getpoolname(const char *osname, char *poolname) 1276 { 1277 char *p; 1278 1279 p = strchr(osname, '/'); 1280 if (p == NULL) { 1281 if (strlen(osname) >= MAXNAMELEN) 1282 return (ENAMETOOLONG); 1283 (void) strcpy(poolname, osname); 1284 } else { 1285 if (p - osname >= MAXNAMELEN) 1286 return (ENAMETOOLONG); 1287 (void) strncpy(poolname, osname, p - osname); 1288 poolname[p - osname] = '\0'; 1289 } 1290 return (0); 1291 } 1292 1293 /*ARGSUSED*/ 1294 static int 1295 zfs_mount(vfs_t *vfsp) 1296 { 1297 kthread_t *td = curthread; 1298 vnode_t *mvp = vfsp->mnt_vnodecovered; 1299 cred_t *cr = td->td_ucred; 1300 char *osname; 1301 int error = 0; 1302 int canwrite; 1303 1304 if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL)) 1305 return (SET_ERROR(EINVAL)); 1306 1307 /* 1308 * If full-owner-access is enabled and delegated administration is 1309 * turned on, we must set nosuid. 1310 */ 1311 if (zfs_super_owner && 1312 dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) { 1313 secpolicy_fs_mount_clearopts(cr, vfsp); 1314 } 1315 1316 /* 1317 * Check for mount privilege? 1318 * 1319 * If we don't have privilege then see if 1320 * we have local permission to allow it 1321 */ 1322 error = secpolicy_fs_mount(cr, mvp, vfsp); 1323 if (error) { 1324 if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0) 1325 goto out; 1326 1327 if (!(vfsp->vfs_flag & MS_REMOUNT)) { 1328 vattr_t vattr; 1329 1330 /* 1331 * Make sure user is the owner of the mount point 1332 * or has sufficient privileges. 1333 */ 1334 1335 vattr.va_mask = AT_UID; 1336 1337 vn_lock(mvp, LK_SHARED | LK_RETRY); 1338 if (VOP_GETATTR(mvp, &vattr, cr)) { 1339 VOP_UNLOCK1(mvp); 1340 goto out; 1341 } 1342 1343 if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && 1344 VOP_ACCESS(mvp, VWRITE, cr, td) != 0) { 1345 VOP_UNLOCK1(mvp); 1346 goto out; 1347 } 1348 VOP_UNLOCK1(mvp); 1349 } 1350 1351 secpolicy_fs_mount_clearopts(cr, vfsp); 1352 } 1353 1354 /* 1355 * Refuse to mount a filesystem if we are in a local zone and the 1356 * dataset is not visible. 1357 */ 1358 if (!INGLOBALZONE(curproc) && 1359 (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { 1360 error = SET_ERROR(EPERM); 1361 goto out; 1362 } 1363 1364 vfsp->vfs_flag |= MNT_NFS4ACLS; 1365 1366 /* 1367 * When doing a remount, we simply refresh our temporary properties 1368 * according to those options set in the current VFS options. 1369 */ 1370 if (vfsp->vfs_flag & MS_REMOUNT) { 1371 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1372 1373 /* 1374 * Refresh mount options with z_teardown_lock blocking I/O while 1375 * the filesystem is in an inconsistent state. 1376 * The lock also serializes this code with filesystem 1377 * manipulations between entry to zfs_suspend_fs() and return 1378 * from zfs_resume_fs(). 1379 */ 1380 ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); 1381 zfs_unregister_callbacks(zfsvfs); 1382 error = zfs_register_callbacks(vfsp); 1383 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); 1384 goto out; 1385 } 1386 1387 /* Initial root mount: try hard to import the requested root pool. */ 1388 if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 && 1389 (vfsp->vfs_flag & MNT_UPDATE) == 0) { 1390 char pname[MAXNAMELEN]; 1391 1392 error = getpoolname(osname, pname); 1393 if (error == 0) 1394 error = spa_import_rootpool(pname, false); 1395 if (error) 1396 goto out; 1397 } 1398 DROP_GIANT(); 1399 error = zfs_domount(vfsp, osname); 1400 PICKUP_GIANT(); 1401 1402 out: 1403 return (error); 1404 } 1405 1406 static int 1407 zfs_statfs(vfs_t *vfsp, struct statfs *statp) 1408 { 1409 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1410 uint64_t refdbytes, availbytes, usedobjs, availobjs; 1411 1412 statp->f_version = STATFS_VERSION; 1413 1414 ZFS_ENTER(zfsvfs); 1415 1416 dmu_objset_space(zfsvfs->z_os, 1417 &refdbytes, &availbytes, &usedobjs, &availobjs); 1418 1419 /* 1420 * The underlying storage pool actually uses multiple block sizes. 1421 * We report the fragsize as the smallest block size we support, 1422 * and we report our blocksize as the filesystem's maximum blocksize. 1423 */ 1424 statp->f_bsize = SPA_MINBLOCKSIZE; 1425 statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize; 1426 1427 /* 1428 * The following report "total" blocks of various kinds in the 1429 * file system, but reported in terms of f_frsize - the 1430 * "fragment" size. 1431 */ 1432 1433 statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; 1434 statp->f_bfree = availbytes / statp->f_bsize; 1435 statp->f_bavail = statp->f_bfree; /* no root reservation */ 1436 1437 /* 1438 * statvfs() should really be called statufs(), because it assumes 1439 * static metadata. ZFS doesn't preallocate files, so the best 1440 * we can do is report the max that could possibly fit in f_files, 1441 * and that minus the number actually used in f_ffree. 1442 * For f_ffree, report the smaller of the number of object available 1443 * and the number of blocks (each object will take at least a block). 1444 */ 1445 statp->f_ffree = MIN(availobjs, statp->f_bfree); 1446 statp->f_files = statp->f_ffree + usedobjs; 1447 1448 /* 1449 * We're a zfs filesystem. 1450 */ 1451 strlcpy(statp->f_fstypename, "zfs", 1452 sizeof (statp->f_fstypename)); 1453 1454 strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, 1455 sizeof (statp->f_mntfromname)); 1456 strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, 1457 sizeof (statp->f_mntonname)); 1458 1459 statp->f_namemax = MAXNAMELEN - 1; 1460 1461 ZFS_EXIT(zfsvfs); 1462 return (0); 1463 } 1464 1465 static int 1466 zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) 1467 { 1468 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1469 znode_t *rootzp; 1470 int error; 1471 1472 ZFS_ENTER(zfsvfs); 1473 1474 error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); 1475 if (error == 0) 1476 *vpp = ZTOV(rootzp); 1477 1478 ZFS_EXIT(zfsvfs); 1479 1480 if (error == 0) { 1481 error = vn_lock(*vpp, flags); 1482 if (error != 0) { 1483 VN_RELE(*vpp); 1484 *vpp = NULL; 1485 } 1486 } 1487 return (error); 1488 } 1489 1490 /* 1491 * Teardown the zfsvfs::z_os. 1492 * 1493 * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' 1494 * and 'z_teardown_inactive_lock' held. 1495 */ 1496 static int 1497 zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) 1498 { 1499 znode_t *zp; 1500 dsl_dir_t *dd; 1501 1502 /* 1503 * If someone has not already unmounted this file system, 1504 * drain the zrele_taskq to ensure all active references to the 1505 * zfsvfs_t have been handled only then can it be safely destroyed. 1506 */ 1507 if (zfsvfs->z_os) { 1508 /* 1509 * If we're unmounting we have to wait for the list to 1510 * drain completely. 1511 * 1512 * If we're not unmounting there's no guarantee the list 1513 * will drain completely, but zreles run from the taskq 1514 * may add the parents of dir-based xattrs to the taskq 1515 * so we want to wait for these. 1516 * 1517 * We can safely read z_nr_znodes without locking because the 1518 * VFS has already blocked operations which add to the 1519 * z_all_znodes list and thus increment z_nr_znodes. 1520 */ 1521 int round = 0; 1522 while (zfsvfs->z_nr_znodes > 0) { 1523 taskq_wait_outstanding(dsl_pool_zrele_taskq( 1524 dmu_objset_pool(zfsvfs->z_os)), 0); 1525 if (++round > 1 && !unmounting) 1526 break; 1527 } 1528 } 1529 ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); 1530 1531 if (!unmounting) { 1532 /* 1533 * We purge the parent filesystem's vfsp as the parent 1534 * filesystem and all of its snapshots have their vnode's 1535 * v_vfsp set to the parent's filesystem's vfsp. Note, 1536 * 'z_parent' is self referential for non-snapshots. 1537 */ 1538 #ifdef FREEBSD_NAMECACHE 1539 #if __FreeBSD_version >= 1300117 1540 cache_purgevfs(zfsvfs->z_parent->z_vfs); 1541 #else 1542 cache_purgevfs(zfsvfs->z_parent->z_vfs, true); 1543 #endif 1544 #endif 1545 } 1546 1547 /* 1548 * Close the zil. NB: Can't close the zil while zfs_inactive 1549 * threads are blocked as zil_close can call zfs_inactive. 1550 */ 1551 if (zfsvfs->z_log) { 1552 zil_close(zfsvfs->z_log); 1553 zfsvfs->z_log = NULL; 1554 } 1555 1556 ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs); 1557 1558 /* 1559 * If we are not unmounting (ie: online recv) and someone already 1560 * unmounted this file system while we were doing the switcheroo, 1561 * or a reopen of z_os failed then just bail out now. 1562 */ 1563 if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { 1564 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); 1565 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); 1566 return (SET_ERROR(EIO)); 1567 } 1568 1569 /* 1570 * At this point there are no vops active, and any new vops will 1571 * fail with EIO since we have z_teardown_lock for writer (only 1572 * relevant for forced unmount). 1573 * 1574 * Release all holds on dbufs. 1575 */ 1576 mutex_enter(&zfsvfs->z_znodes_lock); 1577 for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; 1578 zp = list_next(&zfsvfs->z_all_znodes, zp)) 1579 if (zp->z_sa_hdl) { 1580 ASSERT(ZTOV(zp)->v_count >= 0); 1581 zfs_znode_dmu_fini(zp); 1582 } 1583 mutex_exit(&zfsvfs->z_znodes_lock); 1584 1585 /* 1586 * If we are unmounting, set the unmounted flag and let new vops 1587 * unblock. zfs_inactive will have the unmounted behavior, and all 1588 * other vops will fail with EIO. 1589 */ 1590 if (unmounting) { 1591 zfsvfs->z_unmounted = B_TRUE; 1592 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); 1593 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); 1594 } 1595 1596 /* 1597 * z_os will be NULL if there was an error in attempting to reopen 1598 * zfsvfs, so just return as the properties had already been 1599 * unregistered and cached data had been evicted before. 1600 */ 1601 if (zfsvfs->z_os == NULL) 1602 return (0); 1603 1604 /* 1605 * Unregister properties. 1606 */ 1607 zfs_unregister_callbacks(zfsvfs); 1608 1609 /* 1610 * Evict cached data 1611 */ 1612 if (!zfs_is_readonly(zfsvfs)) 1613 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); 1614 dmu_objset_evict_dbufs(zfsvfs->z_os); 1615 dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; 1616 dsl_dir_cancel_waiters(dd); 1617 1618 return (0); 1619 } 1620 1621 /*ARGSUSED*/ 1622 static int 1623 zfs_umount(vfs_t *vfsp, int fflag) 1624 { 1625 kthread_t *td = curthread; 1626 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1627 objset_t *os; 1628 cred_t *cr = td->td_ucred; 1629 int ret; 1630 1631 ret = secpolicy_fs_unmount(cr, vfsp); 1632 if (ret) { 1633 if (dsl_deleg_access((char *)vfsp->vfs_resource, 1634 ZFS_DELEG_PERM_MOUNT, cr)) 1635 return (ret); 1636 } 1637 1638 /* 1639 * Unmount any snapshots mounted under .zfs before unmounting the 1640 * dataset itself. 1641 */ 1642 if (zfsvfs->z_ctldir != NULL) { 1643 if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) 1644 return (ret); 1645 } 1646 1647 if (fflag & MS_FORCE) { 1648 /* 1649 * Mark file system as unmounted before calling 1650 * vflush(FORCECLOSE). This way we ensure no future vnops 1651 * will be called and risk operating on DOOMED vnodes. 1652 */ 1653 ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); 1654 zfsvfs->z_unmounted = B_TRUE; 1655 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); 1656 } 1657 1658 /* 1659 * Flush all the files. 1660 */ 1661 ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td); 1662 if (ret != 0) 1663 return (ret); 1664 while (taskqueue_cancel(zfsvfs_taskq->tq_queue, 1665 &zfsvfs->z_unlinked_drain_task, NULL) != 0) 1666 taskqueue_drain(zfsvfs_taskq->tq_queue, 1667 &zfsvfs->z_unlinked_drain_task); 1668 1669 VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); 1670 os = zfsvfs->z_os; 1671 1672 /* 1673 * z_os will be NULL if there was an error in 1674 * attempting to reopen zfsvfs. 1675 */ 1676 if (os != NULL) { 1677 /* 1678 * Unset the objset user_ptr. 1679 */ 1680 mutex_enter(&os->os_user_ptr_lock); 1681 dmu_objset_set_user(os, NULL); 1682 mutex_exit(&os->os_user_ptr_lock); 1683 1684 /* 1685 * Finally release the objset 1686 */ 1687 dmu_objset_disown(os, B_TRUE, zfsvfs); 1688 } 1689 1690 /* 1691 * We can now safely destroy the '.zfs' directory node. 1692 */ 1693 if (zfsvfs->z_ctldir != NULL) 1694 zfsctl_destroy(zfsvfs); 1695 zfs_freevfs(vfsp); 1696 1697 return (0); 1698 } 1699 1700 static int 1701 zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) 1702 { 1703 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1704 znode_t *zp; 1705 int err; 1706 1707 /* 1708 * zfs_zget() can't operate on virtual entries like .zfs/ or 1709 * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP. 1710 * This will make NFS to switch to LOOKUP instead of using VGET. 1711 */ 1712 if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR || 1713 (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir)) 1714 return (EOPNOTSUPP); 1715 1716 ZFS_ENTER(zfsvfs); 1717 err = zfs_zget(zfsvfs, ino, &zp); 1718 if (err == 0 && zp->z_unlinked) { 1719 vrele(ZTOV(zp)); 1720 err = EINVAL; 1721 } 1722 if (err == 0) 1723 *vpp = ZTOV(zp); 1724 ZFS_EXIT(zfsvfs); 1725 if (err == 0) { 1726 err = vn_lock(*vpp, flags); 1727 if (err != 0) 1728 vrele(*vpp); 1729 } 1730 if (err != 0) 1731 *vpp = NULL; 1732 return (err); 1733 } 1734 1735 static int 1736 #if __FreeBSD_version >= 1300098 1737 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp, 1738 struct ucred **credanonp, int *numsecflavors, int *secflavors) 1739 #else 1740 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, 1741 struct ucred **credanonp, int *numsecflavors, int **secflavors) 1742 #endif 1743 { 1744 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1745 1746 /* 1747 * If this is regular file system vfsp is the same as 1748 * zfsvfs->z_parent->z_vfs, but if it is snapshot, 1749 * zfsvfs->z_parent->z_vfs represents parent file system 1750 * which we have to use here, because only this file system 1751 * has mnt_export configured. 1752 */ 1753 return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp, 1754 credanonp, numsecflavors, secflavors)); 1755 } 1756 1757 CTASSERT(SHORT_FID_LEN <= sizeof (struct fid)); 1758 CTASSERT(LONG_FID_LEN <= sizeof (struct fid)); 1759 1760 static int 1761 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) 1762 { 1763 struct componentname cn; 1764 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1765 znode_t *zp; 1766 vnode_t *dvp; 1767 uint64_t object = 0; 1768 uint64_t fid_gen = 0; 1769 uint64_t gen_mask; 1770 uint64_t zp_gen; 1771 int i, err; 1772 1773 *vpp = NULL; 1774 1775 ZFS_ENTER(zfsvfs); 1776 1777 /* 1778 * On FreeBSD we can get snapshot's mount point or its parent file 1779 * system mount point depending if snapshot is already mounted or not. 1780 */ 1781 if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) { 1782 zfid_long_t *zlfid = (zfid_long_t *)fidp; 1783 uint64_t objsetid = 0; 1784 uint64_t setgen = 0; 1785 1786 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 1787 objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); 1788 1789 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 1790 setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); 1791 1792 ZFS_EXIT(zfsvfs); 1793 1794 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); 1795 if (err) 1796 return (SET_ERROR(EINVAL)); 1797 ZFS_ENTER(zfsvfs); 1798 } 1799 1800 if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { 1801 zfid_short_t *zfid = (zfid_short_t *)fidp; 1802 1803 for (i = 0; i < sizeof (zfid->zf_object); i++) 1804 object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); 1805 1806 for (i = 0; i < sizeof (zfid->zf_gen); i++) 1807 fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); 1808 } else { 1809 ZFS_EXIT(zfsvfs); 1810 return (SET_ERROR(EINVAL)); 1811 } 1812 1813 /* 1814 * A zero fid_gen means we are in .zfs or the .zfs/snapshot 1815 * directory tree. If the object == zfsvfs->z_shares_dir, then 1816 * we are in the .zfs/shares directory tree. 1817 */ 1818 if ((fid_gen == 0 && 1819 (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) || 1820 (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) { 1821 ZFS_EXIT(zfsvfs); 1822 VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp)); 1823 if (object == ZFSCTL_INO_SNAPDIR) { 1824 cn.cn_nameptr = "snapshot"; 1825 cn.cn_namelen = strlen(cn.cn_nameptr); 1826 cn.cn_nameiop = LOOKUP; 1827 cn.cn_flags = ISLASTCN | LOCKLEAF; 1828 cn.cn_lkflags = flags; 1829 VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); 1830 vput(dvp); 1831 } else if (object == zfsvfs->z_shares_dir) { 1832 /* 1833 * XXX This branch must not be taken, 1834 * if it is, then the lookup below will 1835 * explode. 1836 */ 1837 cn.cn_nameptr = "shares"; 1838 cn.cn_namelen = strlen(cn.cn_nameptr); 1839 cn.cn_nameiop = LOOKUP; 1840 cn.cn_flags = ISLASTCN; 1841 cn.cn_lkflags = flags; 1842 VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); 1843 vput(dvp); 1844 } else { 1845 *vpp = dvp; 1846 } 1847 return (err); 1848 } 1849 1850 gen_mask = -1ULL >> (64 - 8 * i); 1851 1852 dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); 1853 if ((err = zfs_zget(zfsvfs, object, &zp))) { 1854 ZFS_EXIT(zfsvfs); 1855 return (err); 1856 } 1857 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, 1858 sizeof (uint64_t)); 1859 zp_gen = zp_gen & gen_mask; 1860 if (zp_gen == 0) 1861 zp_gen = 1; 1862 if (zp->z_unlinked || zp_gen != fid_gen) { 1863 dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); 1864 vrele(ZTOV(zp)); 1865 ZFS_EXIT(zfsvfs); 1866 return (SET_ERROR(EINVAL)); 1867 } 1868 1869 *vpp = ZTOV(zp); 1870 ZFS_EXIT(zfsvfs); 1871 err = vn_lock(*vpp, flags); 1872 if (err == 0) 1873 vnode_create_vobject(*vpp, zp->z_size, curthread); 1874 else 1875 *vpp = NULL; 1876 return (err); 1877 } 1878 1879 /* 1880 * Block out VOPs and close zfsvfs_t::z_os 1881 * 1882 * Note, if successful, then we return with the 'z_teardown_lock' and 1883 * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying 1884 * dataset and objset intact so that they can be atomically handed off during 1885 * a subsequent rollback or recv operation and the resume thereafter. 1886 */ 1887 int 1888 zfs_suspend_fs(zfsvfs_t *zfsvfs) 1889 { 1890 int error; 1891 1892 if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) 1893 return (error); 1894 1895 return (0); 1896 } 1897 1898 /* 1899 * Rebuild SA and release VOPs. Note that ownership of the underlying dataset 1900 * is an invariant across any of the operations that can be performed while the 1901 * filesystem was suspended. Whether it succeeded or failed, the preconditions 1902 * are the same: the relevant objset and associated dataset are owned by 1903 * zfsvfs, held, and long held on entry. 1904 */ 1905 int 1906 zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) 1907 { 1908 int err; 1909 znode_t *zp; 1910 1911 ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs)); 1912 ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs)); 1913 1914 /* 1915 * We already own this, so just update the objset_t, as the one we 1916 * had before may have been evicted. 1917 */ 1918 objset_t *os; 1919 VERIFY3P(ds->ds_owner, ==, zfsvfs); 1920 VERIFY(dsl_dataset_long_held(ds)); 1921 dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); 1922 dsl_pool_config_enter(dp, FTAG); 1923 VERIFY0(dmu_objset_from_ds(ds, &os)); 1924 dsl_pool_config_exit(dp, FTAG); 1925 1926 err = zfsvfs_init(zfsvfs, os); 1927 if (err != 0) 1928 goto bail; 1929 1930 ds->ds_dir->dd_activity_cancelled = B_FALSE; 1931 VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); 1932 1933 zfs_set_fuid_feature(zfsvfs); 1934 1935 /* 1936 * Attempt to re-establish all the active znodes with 1937 * their dbufs. If a zfs_rezget() fails, then we'll let 1938 * any potential callers discover that via ZFS_ENTER_VERIFY_VP 1939 * when they try to use their znode. 1940 */ 1941 mutex_enter(&zfsvfs->z_znodes_lock); 1942 for (zp = list_head(&zfsvfs->z_all_znodes); zp; 1943 zp = list_next(&zfsvfs->z_all_znodes, zp)) { 1944 (void) zfs_rezget(zp); 1945 } 1946 mutex_exit(&zfsvfs->z_znodes_lock); 1947 1948 bail: 1949 /* release the VOPs */ 1950 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); 1951 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); 1952 1953 if (err) { 1954 /* 1955 * Since we couldn't setup the sa framework, try to force 1956 * unmount this file system. 1957 */ 1958 if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) { 1959 vfs_ref(zfsvfs->z_vfs); 1960 (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread); 1961 } 1962 } 1963 return (err); 1964 } 1965 1966 static void 1967 zfs_freevfs(vfs_t *vfsp) 1968 { 1969 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1970 1971 zfsvfs_free(zfsvfs); 1972 1973 atomic_dec_32(&zfs_active_fs_count); 1974 } 1975 1976 #ifdef __i386__ 1977 static int desiredvnodes_backup; 1978 #include <sys/vmmeter.h> 1979 1980 1981 #include <vm/vm_page.h> 1982 #include <vm/vm_object.h> 1983 #include <vm/vm_kern.h> 1984 #include <vm/vm_map.h> 1985 #endif 1986 1987 static void 1988 zfs_vnodes_adjust(void) 1989 { 1990 #ifdef __i386__ 1991 int newdesiredvnodes; 1992 1993 desiredvnodes_backup = desiredvnodes; 1994 1995 /* 1996 * We calculate newdesiredvnodes the same way it is done in 1997 * vntblinit(). If it is equal to desiredvnodes, it means that 1998 * it wasn't tuned by the administrator and we can tune it down. 1999 */ 2000 newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 * 2001 vm_kmem_size / (5 * (sizeof (struct vm_object) + 2002 sizeof (struct vnode)))); 2003 if (newdesiredvnodes == desiredvnodes) 2004 desiredvnodes = (3 * newdesiredvnodes) / 4; 2005 #endif 2006 } 2007 2008 static void 2009 zfs_vnodes_adjust_back(void) 2010 { 2011 2012 #ifdef __i386__ 2013 desiredvnodes = desiredvnodes_backup; 2014 #endif 2015 } 2016 2017 void 2018 zfs_init(void) 2019 { 2020 2021 printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n"); 2022 2023 /* 2024 * Initialize .zfs directory structures 2025 */ 2026 zfsctl_init(); 2027 2028 /* 2029 * Initialize znode cache, vnode ops, etc... 2030 */ 2031 zfs_znode_init(); 2032 2033 /* 2034 * Reduce number of vnodes. Originally number of vnodes is calculated 2035 * with UFS inode in mind. We reduce it here, because it's too big for 2036 * ZFS/i386. 2037 */ 2038 zfs_vnodes_adjust(); 2039 2040 dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info); 2041 2042 zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0); 2043 } 2044 2045 void 2046 zfs_fini(void) 2047 { 2048 taskq_destroy(zfsvfs_taskq); 2049 zfsctl_fini(); 2050 zfs_znode_fini(); 2051 zfs_vnodes_adjust_back(); 2052 } 2053 2054 int 2055 zfs_busy(void) 2056 { 2057 return (zfs_active_fs_count != 0); 2058 } 2059 2060 /* 2061 * Release VOPs and unmount a suspended filesystem. 2062 */ 2063 int 2064 zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) 2065 { 2066 ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs)); 2067 ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs)); 2068 2069 /* 2070 * We already own this, so just hold and rele it to update the 2071 * objset_t, as the one we had before may have been evicted. 2072 */ 2073 objset_t *os; 2074 VERIFY3P(ds->ds_owner, ==, zfsvfs); 2075 VERIFY(dsl_dataset_long_held(ds)); 2076 dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); 2077 dsl_pool_config_enter(dp, FTAG); 2078 VERIFY0(dmu_objset_from_ds(ds, &os)); 2079 dsl_pool_config_exit(dp, FTAG); 2080 zfsvfs->z_os = os; 2081 2082 /* release the VOPs */ 2083 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); 2084 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); 2085 2086 /* 2087 * Try to force unmount this file system. 2088 */ 2089 (void) zfs_umount(zfsvfs->z_vfs, 0); 2090 zfsvfs->z_unmounted = B_TRUE; 2091 return (0); 2092 } 2093 2094 int 2095 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) 2096 { 2097 int error; 2098 objset_t *os = zfsvfs->z_os; 2099 dmu_tx_t *tx; 2100 2101 if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) 2102 return (SET_ERROR(EINVAL)); 2103 2104 if (newvers < zfsvfs->z_version) 2105 return (SET_ERROR(EINVAL)); 2106 2107 if (zfs_spa_version_map(newvers) > 2108 spa_version(dmu_objset_spa(zfsvfs->z_os))) 2109 return (SET_ERROR(ENOTSUP)); 2110 2111 tx = dmu_tx_create(os); 2112 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); 2113 if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { 2114 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, 2115 ZFS_SA_ATTRS); 2116 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 2117 } 2118 error = dmu_tx_assign(tx, TXG_WAIT); 2119 if (error) { 2120 dmu_tx_abort(tx); 2121 return (error); 2122 } 2123 2124 error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 2125 8, 1, &newvers, tx); 2126 2127 if (error) { 2128 dmu_tx_commit(tx); 2129 return (error); 2130 } 2131 2132 if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { 2133 uint64_t sa_obj; 2134 2135 ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, 2136 SPA_VERSION_SA); 2137 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, 2138 DMU_OT_NONE, 0, tx); 2139 2140 error = zap_add(os, MASTER_NODE_OBJ, 2141 ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); 2142 ASSERT0(error); 2143 2144 VERIFY(0 == sa_set_sa_object(os, sa_obj)); 2145 sa_register_update_callback(os, zfs_sa_upgrade); 2146 } 2147 2148 spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, 2149 "from %ju to %ju", (uintmax_t)zfsvfs->z_version, 2150 (uintmax_t)newvers); 2151 dmu_tx_commit(tx); 2152 2153 zfsvfs->z_version = newvers; 2154 os->os_version = newvers; 2155 2156 zfs_set_fuid_feature(zfsvfs); 2157 2158 return (0); 2159 } 2160 2161 /* 2162 * Read a property stored within the master node. 2163 */ 2164 int 2165 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) 2166 { 2167 uint64_t *cached_copy = NULL; 2168 2169 /* 2170 * Figure out where in the objset_t the cached copy would live, if it 2171 * is available for the requested property. 2172 */ 2173 if (os != NULL) { 2174 switch (prop) { 2175 case ZFS_PROP_VERSION: 2176 cached_copy = &os->os_version; 2177 break; 2178 case ZFS_PROP_NORMALIZE: 2179 cached_copy = &os->os_normalization; 2180 break; 2181 case ZFS_PROP_UTF8ONLY: 2182 cached_copy = &os->os_utf8only; 2183 break; 2184 case ZFS_PROP_CASE: 2185 cached_copy = &os->os_casesensitivity; 2186 break; 2187 default: 2188 break; 2189 } 2190 } 2191 if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { 2192 *value = *cached_copy; 2193 return (0); 2194 } 2195 2196 /* 2197 * If the property wasn't cached, look up the file system's value for 2198 * the property. For the version property, we look up a slightly 2199 * different string. 2200 */ 2201 const char *pname; 2202 int error = ENOENT; 2203 if (prop == ZFS_PROP_VERSION) { 2204 pname = ZPL_VERSION_STR; 2205 } else { 2206 pname = zfs_prop_to_name(prop); 2207 } 2208 2209 if (os != NULL) { 2210 ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); 2211 error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); 2212 } 2213 2214 if (error == ENOENT) { 2215 /* No value set, use the default value */ 2216 switch (prop) { 2217 case ZFS_PROP_VERSION: 2218 *value = ZPL_VERSION; 2219 break; 2220 case ZFS_PROP_NORMALIZE: 2221 case ZFS_PROP_UTF8ONLY: 2222 *value = 0; 2223 break; 2224 case ZFS_PROP_CASE: 2225 *value = ZFS_CASE_SENSITIVE; 2226 break; 2227 case ZFS_PROP_ACLTYPE: 2228 *value = ZFS_ACLTYPE_NFSV4; 2229 break; 2230 default: 2231 return (error); 2232 } 2233 error = 0; 2234 } 2235 2236 /* 2237 * If one of the methods for getting the property value above worked, 2238 * copy it into the objset_t's cache. 2239 */ 2240 if (error == 0 && cached_copy != NULL) { 2241 *cached_copy = *value; 2242 } 2243 2244 return (error); 2245 } 2246 2247 /* 2248 * Return true if the corresponding vfs's unmounted flag is set. 2249 * Otherwise return false. 2250 * If this function returns true we know VFS unmount has been initiated. 2251 */ 2252 boolean_t 2253 zfs_get_vfs_flag_unmounted(objset_t *os) 2254 { 2255 zfsvfs_t *zfvp; 2256 boolean_t unmounted = B_FALSE; 2257 2258 ASSERT(dmu_objset_type(os) == DMU_OST_ZFS); 2259 2260 mutex_enter(&os->os_user_ptr_lock); 2261 zfvp = dmu_objset_get_user(os); 2262 if (zfvp != NULL && zfvp->z_vfs != NULL && 2263 (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT)) 2264 unmounted = B_TRUE; 2265 mutex_exit(&os->os_user_ptr_lock); 2266 2267 return (unmounted); 2268 } 2269 2270 #ifdef _KERNEL 2271 void 2272 zfsvfs_update_fromname(const char *oldname, const char *newname) 2273 { 2274 char tmpbuf[MAXPATHLEN]; 2275 struct mount *mp; 2276 char *fromname; 2277 size_t oldlen; 2278 2279 oldlen = strlen(oldname); 2280 2281 mtx_lock(&mountlist_mtx); 2282 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 2283 fromname = mp->mnt_stat.f_mntfromname; 2284 if (strcmp(fromname, oldname) == 0) { 2285 (void) strlcpy(fromname, newname, 2286 sizeof (mp->mnt_stat.f_mntfromname)); 2287 continue; 2288 } 2289 if (strncmp(fromname, oldname, oldlen) == 0 && 2290 (fromname[oldlen] == '/' || fromname[oldlen] == '@')) { 2291 (void) snprintf(tmpbuf, sizeof (tmpbuf), "%s%s", 2292 newname, fromname + oldlen); 2293 (void) strlcpy(fromname, tmpbuf, 2294 sizeof (mp->mnt_stat.f_mntfromname)); 2295 continue; 2296 } 2297 } 2298 mtx_unlock(&mountlist_mtx); 2299 } 2300 #endif 2301