1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>. 24 * All rights reserved. 25 * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 26 * Copyright (c) 2014 Integros [integros.com] 27 * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 
28 */ 29 30 /* Portions Copyright 2010 Robert Milkowski */ 31 32 #include <sys/types.h> 33 #include <sys/param.h> 34 #include <sys/systm.h> 35 #include <sys/kernel.h> 36 #include <sys/sysmacros.h> 37 #include <sys/kmem.h> 38 #include <sys/acl.h> 39 #include <sys/vnode.h> 40 #include <sys/vfs.h> 41 #include <sys/mntent.h> 42 #include <sys/mount.h> 43 #include <sys/cmn_err.h> 44 #include <sys/zfs_znode.h> 45 #include <sys/zfs_vnops.h> 46 #include <sys/zfs_dir.h> 47 #include <sys/zil.h> 48 #include <sys/fs/zfs.h> 49 #include <sys/dmu.h> 50 #include <sys/dsl_prop.h> 51 #include <sys/dsl_dataset.h> 52 #include <sys/dsl_deleg.h> 53 #include <sys/spa.h> 54 #include <sys/zap.h> 55 #include <sys/sa.h> 56 #include <sys/sa_impl.h> 57 #include <sys/policy.h> 58 #include <sys/atomic.h> 59 #include <sys/zfs_ioctl.h> 60 #include <sys/zfs_ctldir.h> 61 #include <sys/zfs_fuid.h> 62 #include <sys/sunddi.h> 63 #include <sys/dmu_objset.h> 64 #include <sys/dsl_dir.h> 65 #include <sys/spa_boot.h> 66 #include <sys/jail.h> 67 #include <ufs/ufs/quota.h> 68 #include <sys/zfs_quota.h> 69 70 #include "zfs_comutil.h" 71 72 #ifndef MNTK_VMSETSIZE_BUG 73 #define MNTK_VMSETSIZE_BUG 0 74 #endif 75 #ifndef MNTK_NOMSYNC 76 #define MNTK_NOMSYNC 8 77 #endif 78 79 /* BEGIN CSTYLED */ 80 struct mtx zfs_debug_mtx; 81 MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); 82 83 SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); 84 85 int zfs_super_owner; 86 SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, 87 "File system owner can perform privileged operation on his file systems"); 88 89 int zfs_debug_level; 90 SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, 91 "Debug level"); 92 93 SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions"); 94 static int zfs_version_acl = ZFS_ACL_VERSION; 95 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, 96 "ZFS_ACL_VERSION"); 97 static int 
zfs_version_spa = SPA_VERSION; 98 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, 99 "SPA_VERSION"); 100 static int zfs_version_zpl = ZPL_VERSION; 101 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, 102 "ZPL_VERSION"); 103 /* END CSTYLED */ 104 105 static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg); 106 static int zfs_mount(vfs_t *vfsp); 107 static int zfs_umount(vfs_t *vfsp, int fflag); 108 static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp); 109 static int zfs_statfs(vfs_t *vfsp, struct statfs *statp); 110 static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); 111 static int zfs_sync(vfs_t *vfsp, int waitfor); 112 #if __FreeBSD_version >= 1300098 113 static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp, 114 struct ucred **credanonp, int *numsecflavors, int *secflavors); 115 #else 116 static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, 117 struct ucred **credanonp, int *numsecflavors, int **secflavors); 118 #endif 119 static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp); 120 static void zfs_freevfs(vfs_t *vfsp); 121 122 struct vfsops zfs_vfsops = { 123 .vfs_mount = zfs_mount, 124 .vfs_unmount = zfs_umount, 125 #if __FreeBSD_version >= 1300049 126 .vfs_root = vfs_cache_root, 127 .vfs_cachedroot = zfs_root, 128 #else 129 .vfs_root = zfs_root, 130 #endif 131 .vfs_statfs = zfs_statfs, 132 .vfs_vget = zfs_vget, 133 .vfs_sync = zfs_sync, 134 .vfs_checkexp = zfs_checkexp, 135 .vfs_fhtovp = zfs_fhtovp, 136 .vfs_quotactl = zfs_quotactl, 137 }; 138 139 VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN); 140 141 /* 142 * We need to keep a count of active fs's. 
 * This is necessary to prevent our module
 * from being unloaded after a umount -f
 */
static uint32_t zfs_active_fs_count = 0;

/*
 * Fetch the effective value of a ZFS property for the dataset, taking any
 * temporary mount-option override ("-o atime=off" and friends) into account.
 * *val carries the stored property value in and the effective value out;
 * when a mount option overrides it, the string "temporary" is copied into
 * setpoint so callers can report where the value came from.
 * Returns 0 on success or an errno-style error (ENOENT for properties this
 * function does not handle, or when no mounted filesystem is found).
 */
int
zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
    char *setpoint)
{
	int error;
	zfsvfs_t *zfvp;
	vfs_t *vfsp;
	objset_t *os;
	uint64_t tmp = *val;

	error = dmu_objset_from_ds(ds, &os);
	if (error != 0)
		return (error);

	/*
	 * NOTE(review): getzfsvfs_impl() appears to busy the vfs on
	 * success -- every exit path below calls vfs_unbusy() -- confirm
	 * against its definition.
	 */
	error = getzfsvfs_impl(os, &zfvp);
	if (error != 0)
		return (error);
	if (zfvp == NULL)
		return (ENOENT);
	vfsp = zfvp->z_vfs;
	switch (zfs_prop) {
	case ZFS_PROP_ATIME:
		if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
			tmp = 0;
		if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
			tmp = 1;
		break;
	case ZFS_PROP_DEVICES:
		if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
			tmp = 0;
		if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
			tmp = 1;
		break;
	case ZFS_PROP_EXEC:
		if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
			tmp = 0;
		if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
			tmp = 1;
		break;
	case ZFS_PROP_SETUID:
		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
			tmp = 0;
		if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
			tmp = 1;
		break;
	case ZFS_PROP_READONLY:
		if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
			tmp = 0;
		if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
			tmp = 1;
		break;
	case ZFS_PROP_XATTR:
		/* xattr is multi-valued (off/dir/sa), not a boolean. */
		if (zfvp->z_flags & ZSB_XATTR)
			tmp = zfvp->z_xattr;
		break;
	case ZFS_PROP_NBMAND:
		if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
			tmp = 0;
		if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
			tmp = 1;
		break;
	default:
		vfs_unbusy(vfsp);
		return (ENOENT);
	}

	vfs_unbusy(vfsp);
	if (tmp != *val) {
		/* A mount option overrode the stored property value. */
		(void) strcpy(setpoint, "temporary");
		*val = tmp;
	}
	return (0);
}

/*
 * Fill in a FreeBSD struct dqblk64 for the given user or group id from the
 * dataset's {user,group}quota and {user,group}used ZAP objects.
 * Returns ENOENT when no quota object exists (or during ZIL replay),
 * otherwise 0 or a zap_lookup() error.
 */
static int
zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp)
{
	int error = 0;
	char buf[32];
	uint64_t usedobj, quotaobj;
	uint64_t quota, used = 0;
	timespec_t now;

	usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;

	if (quotaobj == 0 || zfsvfs->z_replay) {
		error = ENOENT;
		goto done;
	}
	/* Quota ZAP entries are keyed by the numeric id rendered in hex. */
	(void) sprintf(buf, "%llx", (longlong_t)id);
	if ((error = zap_lookup(zfsvfs->z_os, quotaobj,
	    buf, sizeof (quota), 1, &quota)) != 0) {
		dprintf("%s(%d): quotaobj lookup failed\n",
		    __FUNCTION__, __LINE__);
		goto done;
	}
	/*
	 * quota(8) uses bsoftlimit as "quota", and hardlimit as "limit".
	 * So we set them to be the same.
	 */
	dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota);
	error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used);
	if (error && error != ENOENT) {
		dprintf("%s(%d): usedobj failed; %d\n",
		    __FUNCTION__, __LINE__, error);
		goto done;
	}
	dqp->dqb_curblocks = btodb(used);
	/* No i-node limits are reported. */
	dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0;
	vfs_timestamp(&now);
	/*
	 * Setting this to 0 causes FreeBSD quota(8) to print
	 * the number of days since the epoch, which isn't
	 * particularly useful.
 */
	dqp->dqb_btime = dqp->dqb_itime = now.tv_sec;
done:
	return (error);
}

/*
 * VFS_QUOTACTL(9) entry point: translate FreeBSD quotactl commands onto
 * the ZFS user/group quota properties.
 */
static int
zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	struct thread *td;
	int cmd, type, error = 0;
	int bitsize;
	zfs_userquota_prop_t quota_type;
	struct dqblk64 dqblk = { 0 };

	td = curthread;
	cmd = cmds >> SUBCMDSHIFT;
	type = cmds & SUBCMDMASK;

	ZFS_ENTER(zfsvfs);
	if (id == -1) {
		/* id == -1 means "the caller": use the real uid/gid. */
		switch (type) {
		case USRQUOTA:
			id = td->td_ucred->cr_ruid;
			break;
		case GRPQUOTA:
			id = td->td_ucred->cr_rgid;
			break;
		default:
			error = EINVAL;
			/*
			 * NOTE(review): Q_QUOTAON/Q_QUOTAOFF apparently
			 * arrive with the vfs busied (they drop it again
			 * in the command switch below), so drop that
			 * reference on this early-error path too --
			 * confirm against the quotactl(2) caller.
			 */
			if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF)
				vfs_unbusy(vfsp);
			goto done;
		}
	}
	/*
	 * Map BSD type to:
	 * ZFS_PROP_USERUSED,
	 * ZFS_PROP_USERQUOTA,
	 * ZFS_PROP_GROUPUSED,
	 * ZFS_PROP_GROUPQUOTA
	 */
	switch (cmd) {
	case Q_SETQUOTA:
	case Q_SETQUOTA32:
		if (type == USRQUOTA)
			quota_type = ZFS_PROP_USERQUOTA;
		else if (type == GRPQUOTA)
			quota_type = ZFS_PROP_GROUPQUOTA;
		else
			error = EINVAL;
		break;
	case Q_GETQUOTA:
	case Q_GETQUOTA32:
		if (type == USRQUOTA)
			quota_type = ZFS_PROP_USERUSED;
		else if (type == GRPQUOTA)
			quota_type = ZFS_PROP_GROUPUSED;
		else
			error = EINVAL;
		break;
	}

	/*
	 * Depending on the cmd, we may need to get
	 * the ruid and domain (see fuidstr_to_sid?),
	 * the fuid (how?), or other information.
	 * Create fuid using zfs_fuid_create(zfsvfs, id,
	 * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)?
	 * I think I can use just the id?
	 *
	 * Look at zfs_id_overquota() to look up a quota.
	 * zap_lookup(something, quotaobj, fuidstring,
	 * sizeof (long long), 1, &quota)
	 *
	 * See zfs_set_userquota() to set a quota.
	 */
	if ((uint32_t)type >= MAXQUOTAS) {
		error = EINVAL;
		goto done;
	}

	switch (cmd) {
	case Q_GETQUOTASIZE:
		/* Report 64-bit quota values to userland. */
		bitsize = 64;
		error = copyout(&bitsize, arg, sizeof (int));
		break;
	case Q_QUOTAON:
		// As far as I can tell, you can't turn quotas on or off on zfs
		error = 0;
		vfs_unbusy(vfsp);
		break;
	case Q_QUOTAOFF:
		error = ENOTSUP;
		vfs_unbusy(vfsp);
		break;
	case Q_SETQUOTA:
		/* Only the block hard limit is honored on set. */
		error = copyin(arg, &dqblk, sizeof (dqblk));
		if (error == 0)
			error = zfs_set_userquota(zfsvfs, quota_type,
			    "", id, dbtob(dqblk.dqb_bhardlimit));
		break;
	case Q_GETQUOTA:
		error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk);
		if (error == 0)
			error = copyout(&dqblk, arg, sizeof (dqblk));
		break;
	default:
		error = EINVAL;
		break;
	}
done:
	ZFS_EXIT(zfsvfs);
	return (error);
}


/* True when the file system is mounted read-only. */
boolean_t
zfs_is_readonly(zfsvfs_t *zfsvfs)
{
	return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY));
}

/*
 * VFS_SYNC(9) entry point: commit dirty data for one filesystem, or for
 * all pools when called without a vfs (sync(8)).
 */
/*ARGSUSED*/
static int
zfs_sync(vfs_t *vfsp, int waitfor)
{

	/*
	 * Data integrity is job one. We don't want a compromised kernel
	 * writing to the storage pool, so we never sync during panic.
	 */
	if (panicstr)
		return (0);

	/*
	 * Ignore the system syncher. ZFS already commits async data
	 * at zfs_txg_timeout intervals.
	 */
	if (waitfor == MNT_LAZY)
		return (0);

	if (vfsp != NULL) {
		/*
		 * Sync a specific filesystem.
		 */
		zfsvfs_t *zfsvfs = vfsp->vfs_data;
		dsl_pool_t *dp;
		int error;

		error = vfs_stdsync(vfsp, waitfor);
		if (error != 0)
			return (error);

		ZFS_ENTER(zfsvfs);
		dp = dmu_objset_pool(zfsvfs->z_os);

		/*
		 * If the system is shutting down, then skip any
		 * filesystems which may exist on a suspended pool.
 */
		if (rebooting && spa_suspended(dp->dp_spa)) {
			ZFS_EXIT(zfsvfs);
			return (0);
		}

		if (zfsvfs->z_log != NULL)
			zil_commit(zfsvfs->z_log, 0);

		ZFS_EXIT(zfsvfs);
	} else {
		/*
		 * Sync all ZFS filesystems. This is what happens when you
		 * run sync(8). Unlike other filesystems, ZFS honors the
		 * request by waiting for all pools to commit all dirty data.
		 */
		spa_sync_allpools();
	}

	return (0);
}

/*
 * Property callback: keep the in-core atime flag and the atime/noatime
 * mount options in sync with the dataset property.
 */
static void
atime_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	if (newval == TRUE) {
		zfsvfs->z_atime = TRUE;
		zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
	} else {
		zfsvfs->z_atime = FALSE;
		zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
	}
}

/*
 * Property callback: track the xattr property (off / dir / sa).
 */
static void
xattr_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	if (newval == ZFS_XATTR_OFF) {
		zfsvfs->z_flags &= ~ZSB_XATTR;
	} else {
		zfsvfs->z_flags |= ZSB_XATTR;

		if (newval == ZFS_XATTR_SA)
			zfsvfs->z_xattr_sa = B_TRUE;
		else
			zfsvfs->z_xattr_sa = B_FALSE;
	}
}

/*
 * Property callback: recordsize changed; also propagate it as the
 * preferred I/O size reported through statfs (f_iosize).
 */
static void
blksz_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;
	ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
	ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
	ASSERT(ISP2(newval));

	zfsvfs->z_max_blksz = newval;
	zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
}

/*
 * Property callback: readonly changed; flip VFS_RDONLY and the ro/rw
 * mount options to match.
 */
static void
readonly_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	if (newval) {
		/* XXX locking on vfs_flag? */
		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
	} else {
		/* XXX locking on vfs_flag? */
		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
	}
}

/*
 * Property callback: setuid changed; flip VFS_NOSETUID and the
 * corresponding mount options.
 */
static void
setuid_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	if (newval == FALSE) {
		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
	} else {
		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
	}
}

/*
 * Property callback: exec changed; flip VFS_NOEXEC and the
 * corresponding mount options.
 */
static void
exec_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	if (newval == FALSE) {
		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
	} else {
		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
	}
}

/*
 * The nbmand mount option can be changed at mount time.
545 * We can't allow it to be toggled on live file systems or incorrect 546 * behavior may be seen from cifs clients 547 * 548 * This property isn't registered via dsl_prop_register(), but this callback 549 * will be called when a file system is first mounted 550 */ 551 static void 552 nbmand_changed_cb(void *arg, uint64_t newval) 553 { 554 zfsvfs_t *zfsvfs = arg; 555 if (newval == FALSE) { 556 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); 557 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); 558 } else { 559 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); 560 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); 561 } 562 } 563 564 static void 565 snapdir_changed_cb(void *arg, uint64_t newval) 566 { 567 zfsvfs_t *zfsvfs = arg; 568 569 zfsvfs->z_show_ctldir = newval; 570 } 571 572 static void 573 vscan_changed_cb(void *arg, uint64_t newval) 574 { 575 zfsvfs_t *zfsvfs = arg; 576 577 zfsvfs->z_vscan = newval; 578 } 579 580 static void 581 acl_mode_changed_cb(void *arg, uint64_t newval) 582 { 583 zfsvfs_t *zfsvfs = arg; 584 585 zfsvfs->z_acl_mode = newval; 586 } 587 588 static void 589 acl_inherit_changed_cb(void *arg, uint64_t newval) 590 { 591 zfsvfs_t *zfsvfs = arg; 592 593 zfsvfs->z_acl_inherit = newval; 594 } 595 596 static void 597 acl_type_changed_cb(void *arg, uint64_t newval) 598 { 599 zfsvfs_t *zfsvfs = arg; 600 601 zfsvfs->z_acl_type = newval; 602 } 603 604 static int 605 zfs_register_callbacks(vfs_t *vfsp) 606 { 607 struct dsl_dataset *ds = NULL; 608 objset_t *os = NULL; 609 zfsvfs_t *zfsvfs = NULL; 610 uint64_t nbmand; 611 boolean_t readonly = B_FALSE; 612 boolean_t do_readonly = B_FALSE; 613 boolean_t setuid = B_FALSE; 614 boolean_t do_setuid = B_FALSE; 615 boolean_t exec = B_FALSE; 616 boolean_t do_exec = B_FALSE; 617 boolean_t xattr = B_FALSE; 618 boolean_t atime = B_FALSE; 619 boolean_t do_atime = B_FALSE; 620 boolean_t do_xattr = B_FALSE; 621 int error = 0; 622 623 ASSERT(vfsp); 624 zfsvfs = vfsp->vfs_data; 625 ASSERT(zfsvfs); 626 os = 
zfsvfs->z_os; 627 628 /* 629 * This function can be called for a snapshot when we update snapshot's 630 * mount point, which isn't really supported. 631 */ 632 if (dmu_objset_is_snapshot(os)) 633 return (EOPNOTSUPP); 634 635 /* 636 * The act of registering our callbacks will destroy any mount 637 * options we may have. In order to enable temporary overrides 638 * of mount options, we stash away the current values and 639 * restore them after we register the callbacks. 640 */ 641 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || 642 !spa_writeable(dmu_objset_spa(os))) { 643 readonly = B_TRUE; 644 do_readonly = B_TRUE; 645 } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { 646 readonly = B_FALSE; 647 do_readonly = B_TRUE; 648 } 649 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { 650 setuid = B_FALSE; 651 do_setuid = B_TRUE; 652 } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { 653 setuid = B_TRUE; 654 do_setuid = B_TRUE; 655 } 656 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { 657 exec = B_FALSE; 658 do_exec = B_TRUE; 659 } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { 660 exec = B_TRUE; 661 do_exec = B_TRUE; 662 } 663 if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { 664 zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF; 665 do_xattr = B_TRUE; 666 } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { 667 zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR; 668 do_xattr = B_TRUE; 669 } else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) { 670 zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR; 671 do_xattr = B_TRUE; 672 } else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) { 673 zfsvfs->z_xattr = xattr = ZFS_XATTR_SA; 674 do_xattr = B_TRUE; 675 } 676 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { 677 atime = B_FALSE; 678 do_atime = B_TRUE; 679 } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { 680 atime = B_TRUE; 681 do_atime = B_TRUE; 682 } 683 684 /* 685 * We need to enter pool configuration here, so that we can use 686 * dsl_prop_get_int_ds() to handle the 
special nbmand property below. 687 * dsl_prop_get_integer() can not be used, because it has to acquire 688 * spa_namespace_lock and we can not do that because we already hold 689 * z_teardown_lock. The problem is that spa_write_cachefile() is called 690 * with spa_namespace_lock held and the function calls ZFS vnode 691 * operations to write the cache file and thus z_teardown_lock is 692 * acquired after spa_namespace_lock. 693 */ 694 ds = dmu_objset_ds(os); 695 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 696 697 /* 698 * nbmand is a special property. It can only be changed at 699 * mount time. 700 * 701 * This is weird, but it is documented to only be changeable 702 * at mount time. 703 */ 704 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { 705 nbmand = B_FALSE; 706 } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { 707 nbmand = B_TRUE; 708 } else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0)) { 709 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 710 return (error); 711 } 712 713 /* 714 * Register property callbacks. 715 * 716 * It would probably be fine to just check for i/o error from 717 * the first prop_register(), but I guess I like to go 718 * overboard... 719 */ 720 error = dsl_prop_register(ds, 721 zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); 722 error = error ? error : dsl_prop_register(ds, 723 zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); 724 error = error ? error : dsl_prop_register(ds, 725 zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); 726 error = error ? error : dsl_prop_register(ds, 727 zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); 728 error = error ? error : dsl_prop_register(ds, 729 zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); 730 error = error ? error : dsl_prop_register(ds, 731 zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); 732 error = error ? 
error : dsl_prop_register(ds, 733 zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); 734 error = error ? error : dsl_prop_register(ds, 735 zfs_prop_to_name(ZFS_PROP_ACLTYPE), acl_type_changed_cb, zfsvfs); 736 error = error ? error : dsl_prop_register(ds, 737 zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); 738 error = error ? error : dsl_prop_register(ds, 739 zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, 740 zfsvfs); 741 error = error ? error : dsl_prop_register(ds, 742 zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs); 743 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 744 if (error) 745 goto unregister; 746 747 /* 748 * Invoke our callbacks to restore temporary mount options. 749 */ 750 if (do_readonly) 751 readonly_changed_cb(zfsvfs, readonly); 752 if (do_setuid) 753 setuid_changed_cb(zfsvfs, setuid); 754 if (do_exec) 755 exec_changed_cb(zfsvfs, exec); 756 if (do_xattr) 757 xattr_changed_cb(zfsvfs, xattr); 758 if (do_atime) 759 atime_changed_cb(zfsvfs, atime); 760 761 nbmand_changed_cb(zfsvfs, nbmand); 762 763 return (0); 764 765 unregister: 766 dsl_prop_unregister_all(ds, zfsvfs); 767 return (error); 768 } 769 770 /* 771 * Associate this zfsvfs with the given objset, which must be owned. 772 * This will cache a bunch of on-disk state from the objset in the 773 * zfsvfs. 774 */ 775 static int 776 zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) 777 { 778 int error; 779 uint64_t val; 780 781 zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; 782 zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; 783 zfsvfs->z_os = os; 784 785 error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); 786 if (error != 0) 787 return (error); 788 if (zfsvfs->z_version > 789 zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { 790 (void) printf("Can't mount a version %lld file system " 791 "on a version %lld pool\n. 
Pool must be upgraded to mount " 792 "this file system.", (u_longlong_t)zfsvfs->z_version, 793 (u_longlong_t)spa_version(dmu_objset_spa(os))); 794 return (SET_ERROR(ENOTSUP)); 795 } 796 error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); 797 if (error != 0) 798 return (error); 799 zfsvfs->z_norm = (int)val; 800 801 error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); 802 if (error != 0) 803 return (error); 804 zfsvfs->z_utf8 = (val != 0); 805 806 error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); 807 if (error != 0) 808 return (error); 809 zfsvfs->z_case = (uint_t)val; 810 811 error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val); 812 if (error != 0) 813 return (error); 814 zfsvfs->z_acl_type = (uint_t)val; 815 816 /* 817 * Fold case on file systems that are always or sometimes case 818 * insensitive. 819 */ 820 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 821 zfsvfs->z_case == ZFS_CASE_MIXED) 822 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; 823 824 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); 825 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); 826 827 uint64_t sa_obj = 0; 828 if (zfsvfs->z_use_sa) { 829 /* should either have both of these objects or none */ 830 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, 831 &sa_obj); 832 if (error != 0) 833 return (error); 834 } 835 836 error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, 837 &zfsvfs->z_attr_table); 838 if (error != 0) 839 return (error); 840 841 if (zfsvfs->z_version >= ZPL_VERSION_SA) 842 sa_register_update_callback(os, zfs_sa_upgrade); 843 844 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, 845 &zfsvfs->z_root); 846 if (error != 0) 847 return (error); 848 ASSERT(zfsvfs->z_root != 0); 849 850 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, 851 &zfsvfs->z_unlinkedobj); 852 if (error != 0) 853 return (error); 854 855 error = zap_lookup(os, MASTER_NODE_OBJ, 856 zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], 857 8, 1, 
&zfsvfs->z_userquota_obj); 858 if (error == ENOENT) 859 zfsvfs->z_userquota_obj = 0; 860 else if (error != 0) 861 return (error); 862 863 error = zap_lookup(os, MASTER_NODE_OBJ, 864 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], 865 8, 1, &zfsvfs->z_groupquota_obj); 866 if (error == ENOENT) 867 zfsvfs->z_groupquota_obj = 0; 868 else if (error != 0) 869 return (error); 870 871 error = zap_lookup(os, MASTER_NODE_OBJ, 872 zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA], 873 8, 1, &zfsvfs->z_projectquota_obj); 874 if (error == ENOENT) 875 zfsvfs->z_projectquota_obj = 0; 876 else if (error != 0) 877 return (error); 878 879 error = zap_lookup(os, MASTER_NODE_OBJ, 880 zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA], 881 8, 1, &zfsvfs->z_userobjquota_obj); 882 if (error == ENOENT) 883 zfsvfs->z_userobjquota_obj = 0; 884 else if (error != 0) 885 return (error); 886 887 error = zap_lookup(os, MASTER_NODE_OBJ, 888 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA], 889 8, 1, &zfsvfs->z_groupobjquota_obj); 890 if (error == ENOENT) 891 zfsvfs->z_groupobjquota_obj = 0; 892 else if (error != 0) 893 return (error); 894 895 error = zap_lookup(os, MASTER_NODE_OBJ, 896 zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA], 897 8, 1, &zfsvfs->z_projectobjquota_obj); 898 if (error == ENOENT) 899 zfsvfs->z_projectobjquota_obj = 0; 900 else if (error != 0) 901 return (error); 902 903 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, 904 &zfsvfs->z_fuid_obj); 905 if (error == ENOENT) 906 zfsvfs->z_fuid_obj = 0; 907 else if (error != 0) 908 return (error); 909 910 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, 911 &zfsvfs->z_shares_dir); 912 if (error == ENOENT) 913 zfsvfs->z_shares_dir = 0; 914 else if (error != 0) 915 return (error); 916 917 /* 918 * Only use the name cache if we are looking for a 919 * name on a file system that does not require normalization 920 * or case folding. 
We can also look there if we happen to be 921 * on a non-normalizing, mixed sensitivity file system IF we 922 * are looking for the exact name (which is always the case on 923 * FreeBSD). 924 */ 925 zfsvfs->z_use_namecache = !zfsvfs->z_norm || 926 ((zfsvfs->z_case == ZFS_CASE_MIXED) && 927 !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER)); 928 929 return (0); 930 } 931 932 taskq_t *zfsvfs_taskq; 933 934 static void 935 zfsvfs_task_unlinked_drain(void *context, int pending __unused) 936 { 937 938 zfs_unlinked_drain((zfsvfs_t *)context); 939 } 940 941 int 942 zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp) 943 { 944 objset_t *os; 945 zfsvfs_t *zfsvfs; 946 int error; 947 boolean_t ro = (readonly || (strchr(osname, '@') != NULL)); 948 949 /* 950 * XXX: Fix struct statfs so this isn't necessary! 951 * 952 * The 'osname' is used as the filesystem's special node, which means 953 * it must fit in statfs.f_mntfromname, or else it can't be 954 * enumerated, so libzfs_mnttab_find() returns NULL, which causes 955 * 'zfs unmount' to think it's not mounted when it is. 
956 */ 957 if (strlen(osname) >= MNAMELEN) 958 return (SET_ERROR(ENAMETOOLONG)); 959 960 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); 961 962 error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, 963 &os); 964 if (error != 0) { 965 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 966 return (error); 967 } 968 969 error = zfsvfs_create_impl(zfvp, zfsvfs, os); 970 971 return (error); 972 } 973 974 975 int 976 zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) 977 { 978 int error; 979 980 zfsvfs->z_vfs = NULL; 981 zfsvfs->z_parent = zfsvfs; 982 983 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); 984 mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); 985 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), 986 offsetof(znode_t, z_link_node)); 987 TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0, 988 zfsvfs_task_unlinked_drain, zfsvfs); 989 ZFS_TEARDOWN_INIT(zfsvfs); 990 ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs); 991 rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); 992 for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) 993 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); 994 995 error = zfsvfs_init(zfsvfs, os); 996 if (error != 0) { 997 dmu_objset_disown(os, B_TRUE, zfsvfs); 998 *zfvp = NULL; 999 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1000 return (error); 1001 } 1002 1003 *zfvp = zfsvfs; 1004 return (0); 1005 } 1006 1007 static int 1008 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) 1009 { 1010 int error; 1011 1012 /* 1013 * Check for a bad on-disk format version now since we 1014 * lied about owning the dataset readonly before. 
 */
	if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
	    dmu_objset_incompatible_encryption_version(zfsvfs->z_os))
		return (SET_ERROR(EROFS));

	error = zfs_register_callbacks(zfsvfs->z_vfs);
	if (error)
		return (error);

	zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);

	/*
	 * If we are not mounting (ie: online recv), then we don't
	 * have to worry about replaying the log as we blocked all
	 * operations out since we closed the ZIL.
	 */
	if (mounting) {
		boolean_t readonly;

		ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
		dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);

		/*
		 * During replay we remove the read only flag to
		 * allow replays to succeed.
		 */
		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
		if (readonly != 0) {
			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
		} else {
			dsl_dir_t *dd;
			zap_stats_t zs;

			/* Seed the nunlinks kstat from the unlinked set. */
			if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
			    &zs) == 0) {
				dataset_kstats_update_nunlinks_kstat(
				    &zfsvfs->z_kstat, zs.zs_num_entries);
				dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
				    "num_entries in unlinked set: %llu",
				    zs.zs_num_entries);
			}

			zfs_unlinked_drain(zfsvfs);
			dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
			dd->dd_activity_cancelled = B_FALSE;
		}

		/*
		 * Parse and replay the intent log.
		 *
		 * Because of ziltest, this must be done after
		 * zfs_unlinked_drain().  (Further note: ziltest
		 * doesn't use readonly mounts, where
		 * zfs_unlinked_drain() isn't called.)  This is because
		 * ziltest causes spa_sync() to think it's committed,
		 * but actually it is not, so the intent log contains
		 * many txg's worth of changes.
		 *
		 * In particular, if object N is in the unlinked set in
		 * the last txg to actually sync, then it could be
		 * actually freed in a later txg and then reallocated
		 * in a yet later txg.  This would write a "create
		 * object N" record to the intent log.  Normally, this
		 * would be fine because the spa_sync() would have
		 * written out the fact that object N is free, before
		 * we could write the "create object N" intent log
		 * record.
		 *
		 * But when we are in ziltest mode, we advance the "open
		 * txg" without actually spa_sync()-ing the changes to
		 * disk.  So we would see that object N is still
		 * allocated and in the unlinked set, and there is an
		 * intent log record saying to allocate it.
		 */
		if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
			if (zil_replay_disable) {
				zil_destroy(zfsvfs->z_log, B_FALSE);
			} else {
				/*
				 * NOTE(review): the namecache is turned off
				 * for the duration of the replay, presumably
				 * so replayed ops don't populate it -- confirm.
				 */
				boolean_t use_nc = zfsvfs->z_use_namecache;
				zfsvfs->z_use_namecache = B_FALSE;
				zfsvfs->z_replay = B_TRUE;
				zil_replay(zfsvfs->z_os, zfsvfs,
				    zfs_replay_vector);
				zfsvfs->z_replay = B_FALSE;
				zfsvfs->z_use_namecache = use_nc;
			}
		}

		/* restore readonly bit */
		if (readonly != 0)
			zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
	}

	/*
	 * Set the objset user_ptr to track its zfsvfs.
	 */
	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);

	return (0);
}

/*
 * Destroy all locks, lists and kstats of a zfsvfs_t and free the structure
 * itself.  Asserts that no znodes remain on z_all_znodes; the objset must
 * already have been disowned by the caller (see zfs_domount() error path).
 */
void
zfsvfs_free(zfsvfs_t *zfsvfs)
{
	int i;

	zfs_fuid_destroy(zfsvfs);

	mutex_destroy(&zfsvfs->z_znodes_lock);
	mutex_destroy(&zfsvfs->z_lock);
	ASSERT(zfsvfs->z_nr_znodes == 0);
	list_destroy(&zfsvfs->z_all_znodes);
	ZFS_TEARDOWN_DESTROY(zfsvfs);
	ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs);
	rw_destroy(&zfsvfs->z_fuid_lock);
	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
	dataset_kstats_destroy(&zfsvfs->z_kstat);
	kmem_free(zfsvfs, sizeof (zfsvfs_t));
}

/*
 * Recompute z_use_fuids/z_use_sa from the current ZPL version and mirror
 * the FUID capability into the attached vfs_t's feature flags (if any).
 */
static void
zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
{
	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
	if (zfsvfs->z_vfs) {
		if (zfsvfs->z_use_fuids) {
			vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
			vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
			vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
		} else {
			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
		}
	}
	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
}

/*
 * Mount objset 'osname' on 'vfsp': create and publish the zfsvfs_t,
 * derive the fsid, advertise VFS features, then either finish minimal
 * (read-only) setup for a snapshot or run full zfsvfs_setup() for a
 * regular dataset.  On error the objset is disowned and the zfsvfs_t
 * freed; on success zfs_active_fs_count is bumped.
 */
static int
zfs_domount(vfs_t *vfsp, char *osname)
{
	uint64_t recordsize, fsid_guid;
	int error = 0;
	zfsvfs_t *zfsvfs;

	ASSERT(vfsp);
	ASSERT(osname);

	error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs);
	if (error)
		return (error);
	zfsvfs->z_vfs = vfsp;

	if ((error = dsl_prop_get_integer(osname,
	    "recordsize", &recordsize, NULL)))
		goto out;
	zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
	zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;

	vfsp->vfs_data = zfsvfs;
	vfsp->mnt_flag |= MNT_LOCAL;
	vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
	vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
	vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
	/*
	 * This can cause a loss of coherence between ARC and page cache
	 * on ZoF - unclear if the problem is in FreeBSD or ZoF
	 */
	vfsp->mnt_kern_flag |= MNTK_NO_IOPF;	/* vn_io_fault can be used */
	vfsp->mnt_kern_flag |= MNTK_NOMSYNC;
	vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG;

#if defined(_KERNEL) && !defined(KMEM_DEBUG)
	vfsp->mnt_kern_flag |= MNTK_FPLOOKUP;
#endif
	/*
	 * The fsid is 64 bits, composed of an 8-bit fs type, which
	 * separates our fsid from any other filesystem types, and a
	 * 56-bit objset unique ID.  The objset unique ID is unique to
	 * all objsets open on this system, provided by unique_create().
	 * The 8-bit fs type must be put in the low bits of fsid[1]
	 * because that's where other Solaris filesystems put it.
	 */
	fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
	vfsp->vfs_fsid.val[0] = fsid_guid;
	vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
	    (vfsp->mnt_vfc->vfc_typenum & 0xFF);

	/*
	 * Set features for file system.
	 */
	zfs_set_fuid_feature(zfsvfs);
	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
		vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
	} else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
	}
	vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);

	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
		uint64_t pval;

		/* Snapshots are always read-only with atime disabled. */
		atime_changed_cb(zfsvfs, B_FALSE);
		readonly_changed_cb(zfsvfs, B_TRUE);
		if ((error = dsl_prop_get_integer(osname,
		    "xattr", &pval, NULL)))
			goto out;
		xattr_changed_cb(zfsvfs, pval);
		if ((error = dsl_prop_get_integer(osname,
		    "acltype", &pval, NULL)))
			goto out;
		acl_type_changed_cb(zfsvfs, pval);
		zfsvfs->z_issnap = B_TRUE;
		zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;

		mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
		mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
	} else {
		if ((error = zfsvfs_setup(zfsvfs, B_TRUE)))
			goto out;
	}

	vfs_mountedfrom(vfsp, osname);

	if (!zfsvfs->z_issnap)
		zfsctl_create(zfsvfs);
out:
	if (error) {
		dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
		zfsvfs_free(zfsvfs);
	} else {
		atomic_inc_32(&zfs_active_fs_count);
	}

	return (error);
}

/*
 * Undo zfs_register_callbacks(): drop every property callback registered
 * on the dataset.  Snapshots never register callbacks, so skip them.
 */
static void
zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
{
	objset_t *os = zfsvfs->z_os;

	if (!dmu_objset_is_snapshot(os))
		dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
}

/*
 * Extract the pool component of dataset name 'osname' (the text before
 * the first '/') into 'poolname', which must hold MAXNAMELEN bytes.
 * Returns 0 on success or ENAMETOOLONG.
 */
static int
getpoolname(const char *osname, char *poolname)
{
	char *p;

	p = strchr(osname, '/');
	if (p == NULL) {
		if (strlen(osname) >= MAXNAMELEN)
			return (ENAMETOOLONG);
		(void) strcpy(poolname, osname);
	} else {
		if
		    (p - osname >= MAXNAMELEN)
			return (ENAMETOOLONG);
		/* Copy the pool prefix only; NUL-terminate explicitly. */
		(void) strncpy(poolname, osname, p - osname);
		poolname[p - osname] = '\0';
	}
	return (0);
}

/*
 * VFS_MOUNT entry point.  Validates the "from" option and caller
 * privileges, handles remounts by refreshing temporary properties,
 * imports the pool for an initial root mount, and finally calls
 * zfs_domount().
 */
/*ARGSUSED*/
static int
zfs_mount(vfs_t *vfsp)
{
	kthread_t *td = curthread;
	vnode_t *mvp = vfsp->mnt_vnodecovered;
	cred_t *cr = td->td_ucred;
	char *osname;
	int error = 0;
	int canwrite;

	if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
		return (SET_ERROR(EINVAL));

	/*
	 * If full-owner-access is enabled and delegated administration is
	 * turned on, we must set nosuid.
	 */
	if (zfs_super_owner &&
	    dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
		secpolicy_fs_mount_clearopts(cr, vfsp);
	}

	/*
	 * Check for mount privilege?
	 *
	 * If we don't have privilege then see if
	 * we have local permission to allow it
	 */
	error = secpolicy_fs_mount(cr, mvp, vfsp);
	if (error) {
		if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
			goto out;

		if (!(vfsp->vfs_flag & MS_REMOUNT)) {
			vattr_t vattr;

			/*
			 * Make sure user is the owner of the mount point
			 * or has sufficient privileges.
			 */

			vattr.va_mask = AT_UID;

			vn_lock(mvp, LK_SHARED | LK_RETRY);
			if (VOP_GETATTR(mvp, &vattr, cr)) {
				VOP_UNLOCK1(mvp);
				goto out;
			}

			if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
			    VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
				VOP_UNLOCK1(mvp);
				goto out;
			}
			VOP_UNLOCK1(mvp);
		}

		secpolicy_fs_mount_clearopts(cr, vfsp);
	}

	/*
	 * Refuse to mount a filesystem if we are in a local zone and the
	 * dataset is not visible.
	 */
	if (!INGLOBALZONE(curproc) &&
	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
		error = SET_ERROR(EPERM);
		goto out;
	}

	vfsp->vfs_flag |= MNT_NFS4ACLS;

	/*
	 * When doing a remount, we simply refresh our temporary properties
	 * according to those options set in the current VFS options.
	 */
	if (vfsp->vfs_flag & MS_REMOUNT) {
		zfsvfs_t *zfsvfs = vfsp->vfs_data;

		/*
		 * Refresh mount options with z_teardown_lock blocking I/O while
		 * the filesystem is in an inconsistent state.
		 * The lock also serializes this code with filesystem
		 * manipulations between entry to zfs_suspend_fs() and return
		 * from zfs_resume_fs().
		 */
		ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
		zfs_unregister_callbacks(zfsvfs);
		error = zfs_register_callbacks(vfsp);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
		goto out;
	}

	/* Initial root mount: try hard to import the requested root pool. */
	if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
	    (vfsp->vfs_flag & MNT_UPDATE) == 0) {
		char pname[MAXNAMELEN];

		error = getpoolname(osname, pname);
		if (error == 0)
			error = spa_import_rootpool(pname, false);
		if (error)
			goto out;
	}
	DROP_GIANT();
	error = zfs_domount(vfsp, osname);
	PICKUP_GIANT();

out:
	return (error);
}

/*
 * VFS_STATFS entry point: report space and file counts for the dataset.
 * Block counts are expressed in SPA_MINBLOCKSIZE units.
 */
static int
zfs_statfs(vfs_t *vfsp, struct statfs *statp)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	uint64_t refdbytes, availbytes, usedobjs, availobjs;

	statp->f_version = STATFS_VERSION;

	ZFS_ENTER(zfsvfs);

	dmu_objset_space(zfsvfs->z_os,
	    &refdbytes, &availbytes, &usedobjs, &availobjs);

	/*
	 * The underlying storage pool actually uses multiple block sizes.
	 * We report the fragsize as the smallest block size we support,
	 * and we report our blocksize as the filesystem's maximum blocksize.
	 */
	statp->f_bsize = SPA_MINBLOCKSIZE;
	statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;

	/*
	 * The following report "total" blocks of various kinds in the
	 * file system, but reported in terms of f_frsize - the
	 * "fragment" size.
	 */

	statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
	statp->f_bfree = availbytes / statp->f_bsize;
	statp->f_bavail = statp->f_bfree; /* no root reservation */

	/*
	 * statvfs() should really be called statufs(), because it assumes
	 * static metadata.  ZFS doesn't preallocate files, so the best
	 * we can do is report the max that could possibly fit in f_files,
	 * and that minus the number actually used in f_ffree.
	 * For f_ffree, report the smaller of the number of object available
	 * and the number of blocks (each object will take at least a block).
	 */
	statp->f_ffree = MIN(availobjs, statp->f_bfree);
	statp->f_files = statp->f_ffree + usedobjs;

	/*
	 * We're a zfs filesystem.
	 */
	strlcpy(statp->f_fstypename, "zfs",
	    sizeof (statp->f_fstypename));

	strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
	    sizeof (statp->f_mntfromname));
	strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
	    sizeof (statp->f_mntonname));

	statp->f_namemax = MAXNAMELEN - 1;

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * VFS_ROOT entry point: return the filesystem's root vnode, locked with
 * 'flags'.  The vnode lock is taken after ZFS_EXIT.
 */
static int
zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	znode_t *rootzp;
	int error;

	ZFS_ENTER(zfsvfs);

	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
	if (error == 0)
		*vpp = ZTOV(rootzp);

	ZFS_EXIT(zfsvfs);

	if (error == 0) {
		error = vn_lock(*vpp, flags);
		if (error != 0) {
			VN_RELE(*vpp);
			*vpp = NULL;
		}
	}
	return (error);
}

/*
 * Teardown the zfsvfs::z_os.
 *
 * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
 * and 'z_teardown_inactive_lock' held.
 */
static int
zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
{
	znode_t *zp;
	dsl_dir_t *dd;

	/*
	 * If someone has not already unmounted this file system,
	 * drain the zrele_taskq to ensure all active references to the
	 * zfsvfs_t have been handled only then can it be safely destroyed.
	 */
	if (zfsvfs->z_os) {
		/*
		 * If we're unmounting we have to wait for the list to
		 * drain completely.
		 *
		 * If we're not unmounting there's no guarantee the list
		 * will drain completely, but zreles run from the taskq
		 * may add the parents of dir-based xattrs to the taskq
		 * so we want to wait for these.
		 *
		 * We can safely read z_nr_znodes without locking because the
		 * VFS has already blocked operations which add to the
		 * z_all_znodes list and thus increment z_nr_znodes.
		 */
		int round = 0;
		while (zfsvfs->z_nr_znodes > 0) {
			taskq_wait_outstanding(dsl_pool_zrele_taskq(
			    dmu_objset_pool(zfsvfs->z_os)), 0);
			/* Non-unmount path: wait at most two rounds. */
			if (++round > 1 && !unmounting)
				break;
		}
	}
	ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);

	if (!unmounting) {
		/*
		 * We purge the parent filesystem's vfsp as the parent
		 * filesystem and all of its snapshots have their vnode's
		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
		 * 'z_parent' is self referential for non-snapshots.
		 */
#ifdef FREEBSD_NAMECACHE
#if __FreeBSD_version >= 1300117
		cache_purgevfs(zfsvfs->z_parent->z_vfs);
#else
		cache_purgevfs(zfsvfs->z_parent->z_vfs, true);
#endif
#endif
	}

	/*
	 * Close the zil.  NB: Can't close the zil while zfs_inactive
	 * threads are blocked as zil_close can call zfs_inactive.
	 */
	if (zfsvfs->z_log) {
		zil_close(zfsvfs->z_log);
		zfsvfs->z_log = NULL;
	}

	ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs);

	/*
	 * If we are not unmounting (ie: online recv) and someone already
	 * unmounted this file system while we were doing the switcheroo,
	 * or a reopen of z_os failed then just bail out now.
	 */
	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
		ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
		return (SET_ERROR(EIO));
	}

	/*
	 * At this point there are no vops active, and any new vops will
	 * fail with EIO since we have z_teardown_lock for writer (only
	 * relevant for forced unmount).
	 *
	 * Release all holds on dbufs.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
	    zp = list_next(&zfsvfs->z_all_znodes, zp))
		if (zp->z_sa_hdl) {
			ASSERT(ZTOV(zp)->v_count >= 0);
			zfs_znode_dmu_fini(zp);
		}
	mutex_exit(&zfsvfs->z_znodes_lock);

	/*
	 * If we are unmounting, set the unmounted flag and let new vops
	 * unblock.  zfs_inactive will have the unmounted behavior, and all
	 * other vops will fail with EIO.
	 */
	if (unmounting) {
		zfsvfs->z_unmounted = B_TRUE;
		ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
	}

	/*
	 * z_os will be NULL if there was an error in attempting to reopen
	 * zfsvfs, so just return as the properties had already been
	 * unregistered and cached data had been evicted before.
	 */
	if (zfsvfs->z_os == NULL)
		return (0);

	/*
	 * Unregister properties.
	 */
	zfs_unregister_callbacks(zfsvfs);

	/*
	 * Evict cached data
	 */
	if (!zfs_is_readonly(zfsvfs))
		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
	dmu_objset_evict_dbufs(zfsvfs->z_os);
	dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
	dsl_dir_cancel_waiters(dd);

	return (0);
}

/*
 * VFS_UNMOUNT entry point: unmount snapshots under .zfs, flush vnodes,
 * cancel the unlinked-drain task, tear down the zfsvfs and disown the
 * objset.
 */
/*ARGSUSED*/
static int
zfs_umount(vfs_t *vfsp, int fflag)
{
	kthread_t *td = curthread;
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	objset_t *os;
	cred_t *cr = td->td_ucred;
	int ret;

	ret = secpolicy_fs_unmount(cr, vfsp);
	if (ret) {
		/* Delegated ZFS_DELEG_PERM_MOUNT also authorizes unmount. */
		if (dsl_deleg_access((char *)vfsp->vfs_resource,
		    ZFS_DELEG_PERM_MOUNT, cr))
			return (ret);
	}

	/*
	 * Unmount any snapshots mounted under .zfs before unmounting the
	 * dataset itself.
	 */
	if (zfsvfs->z_ctldir != NULL) {
		if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
			return (ret);
	}

	if (fflag & MS_FORCE) {
		/*
		 * Mark file system as unmounted before calling
		 * vflush(FORCECLOSE).  This way we ensure no future vnops
		 * will be called and risk operating on DOOMED vnodes.
		 */
		ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
		zfsvfs->z_unmounted = B_TRUE;
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
	}

	/*
	 * Flush all the files.
	 */
	ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
	if (ret != 0)
		return (ret);
	/* Make sure the queued unlinked-drain task is stopped and gone. */
	while (taskqueue_cancel(zfsvfs_taskq->tq_queue,
	    &zfsvfs->z_unlinked_drain_task, NULL) != 0)
		taskqueue_drain(zfsvfs_taskq->tq_queue,
		    &zfsvfs->z_unlinked_drain_task);

	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
	os = zfsvfs->z_os;

	/*
	 * z_os will be NULL if there was an error in
	 * attempting to reopen zfsvfs.
	 */
	if (os != NULL) {
		/*
		 * Unset the objset user_ptr.
		 */
		mutex_enter(&os->os_user_ptr_lock);
		dmu_objset_set_user(os, NULL);
		mutex_exit(&os->os_user_ptr_lock);

		/*
		 * Finally release the objset
		 */
		dmu_objset_disown(os, B_TRUE, zfsvfs);
	}

	/*
	 * We can now safely destroy the '.zfs' directory node.
	 */
	if (zfsvfs->z_ctldir != NULL)
		zfsctl_destroy(zfsvfs);
	zfs_freevfs(vfsp);

	return (0);
}

/*
 * VFS_VGET entry point: translate an inode number into a vnode locked
 * with 'flags'.  Used primarily by the NFS server.
 */
static int
zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	znode_t *zp;
	int err;

	/*
	 * zfs_zget() can't operate on virtual entries like .zfs/ or
	 * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
	 * This will make NFS to switch to LOOKUP instead of using VGET.
	 */
	if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
	    (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
		return (EOPNOTSUPP);

	ZFS_ENTER(zfsvfs);
	err = zfs_zget(zfsvfs, ino, &zp);
	/* Files on the unlinked (delete) queue are not resolvable. */
	if (err == 0 && zp->z_unlinked) {
		vrele(ZTOV(zp));
		err = EINVAL;
	}
	if (err == 0)
		*vpp = ZTOV(zp);
	ZFS_EXIT(zfsvfs);
	if (err == 0) {
		err = vn_lock(*vpp, flags);
		if (err != 0)
			vrele(*vpp);
	}
	if (err != 0)
		*vpp = NULL;
	return (err);
}

/*
 * VFS_CHECKEXP entry point: NFS export check, delegated to the standard
 * implementation against the parent filesystem's vfs_t (see comment in
 * the body).
 */
static int
#if __FreeBSD_version >= 1300098
zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
    struct ucred **credanonp, int *numsecflavors, int *secflavors)
#else
zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
    struct ucred **credanonp, int *numsecflavors, int **secflavors)
#endif
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;

	/*
	 * If this is regular file system vfsp is the same as
	 * zfsvfs->z_parent->z_vfs, but if it is snapshot,
	 * zfsvfs->z_parent->z_vfs represents parent file system
	 * which we have to use here, because only this file system
	 * has mnt_export configured.
	 */
	return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
	    credanonp, numsecflavors, secflavors));
}

/* Both ZFS FID formats must fit inside the generic struct fid. */
CTASSERT(SHORT_FID_LEN <= sizeof (struct fid));
CTASSERT(LONG_FID_LEN <= sizeof (struct fid));

/*
 * VFS_FHTOVP entry point: convert an NFS file handle (short or long ZFS
 * FID) into a vnode locked with 'flags'.
 */
static int
zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
{
	struct componentname cn;
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	znode_t *zp;
	vnode_t *dvp;
	uint64_t object = 0;
	uint64_t fid_gen = 0;
	uint64_t gen_mask;
	uint64_t zp_gen;
	int i, err;

	*vpp = NULL;

	ZFS_ENTER(zfsvfs);

	/*
	 * On FreeBSD we can get snapshot's mount point or its parent file
	 * system mount point depending if snapshot is already mounted or not.
	 */
	if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
		zfid_long_t *zlfid = (zfid_long_t *)fidp;
		uint64_t objsetid = 0;
		uint64_t setgen = 0;

		/* Long FIDs carry an objset id/gen in little-endian bytes. */
		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);

		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);

		ZFS_EXIT(zfsvfs);

		/* Re-point zfsvfs at the snapshot's own filesystem. */
		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
		if (err)
			return (SET_ERROR(EINVAL));
		ZFS_ENTER(zfsvfs);
	}

	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
		zfid_short_t *zfid = (zfid_short_t *)fidp;

		for (i = 0; i < sizeof (zfid->zf_object); i++)
			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);

		for (i = 0; i < sizeof (zfid->zf_gen); i++)
			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
	} else {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * A zero fid_gen means we are in .zfs or the .zfs/snapshot
	 * directory tree.  If the object == zfsvfs->z_shares_dir, then
	 * we are in the .zfs/shares directory tree.
	 */
	if ((fid_gen == 0 &&
	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
	    (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
		ZFS_EXIT(zfsvfs);
		VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
		if (object == ZFSCTL_INO_SNAPDIR) {
			cn.cn_nameptr = "snapshot";
			cn.cn_namelen = strlen(cn.cn_nameptr);
			cn.cn_nameiop = LOOKUP;
			cn.cn_flags = ISLASTCN | LOCKLEAF;
			cn.cn_lkflags = flags;
			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
			vput(dvp);
		} else if (object == zfsvfs->z_shares_dir) {
			/*
			 * XXX This branch must not be taken,
			 * if it is, then the lookup below will
			 * explode.
			 */
			cn.cn_nameptr = "shares";
			cn.cn_namelen = strlen(cn.cn_nameptr);
			cn.cn_nameiop = LOOKUP;
			cn.cn_flags = ISLASTCN;
			cn.cn_lkflags = flags;
			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
			vput(dvp);
		} else {
			*vpp = dvp;
		}
		/*
		 * NOTE(review): 'err' is only assigned when the long-FID
		 * branch above was taken; on a short FID this returns an
		 * uninitialized value -- verify against upstream.
		 */
		return (err);
	}

	/* Mask of the generation bits actually present in the FID. */
	gen_mask = -1ULL >> (64 - 8 * i);

	dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
	if ((err = zfs_zget(zfsvfs, object, &zp))) {
		ZFS_EXIT(zfsvfs);
		return (err);
	}
	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
	    sizeof (uint64_t));
	zp_gen = zp_gen & gen_mask;
	if (zp_gen == 0)
		zp_gen = 1;
	if (zp->z_unlinked || zp_gen != fid_gen) {
		dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
		vrele(ZTOV(zp));
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	*vpp = ZTOV(zp);
	ZFS_EXIT(zfsvfs);
	err = vn_lock(*vpp, flags);
	if (err == 0)
		vnode_create_vobject(*vpp, zp->z_size, curthread);
	else
		*vpp = NULL;
	return (err);
}

/*
 * Block out VOPs and close zfsvfs_t::z_os
 *
 * Note, if successful, then we return with the 'z_teardown_lock' and
 * 'z_teardown_inactive_lock' write held.  We leave ownership of the underlying
 * dataset and objset intact so that they can be atomically handed off during
 * a subsequent rollback or recv operation and the resume thereafter.
 */
int
zfs_suspend_fs(zfsvfs_t *zfsvfs)
{
	int error;

	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
		return (error);

	return (0);
}

/*
 * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
 * is an invariant across any of the operations that can be performed while the
 * filesystem was suspended.  Whether it succeeded or failed, the preconditions
 * are the same: the relevant objset and associated dataset are owned by
 * zfsvfs, held, and long held on entry.
 */
int
zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
{
	int err;
	znode_t *zp;

	ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
	ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));

	/*
	 * We already own this, so just update the objset_t, as the one we
	 * had before may have been evicted.
	 */
	objset_t *os;
	VERIFY3P(ds->ds_owner, ==, zfsvfs);
	VERIFY(dsl_dataset_long_held(ds));
	dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
	dsl_pool_config_enter(dp, FTAG);
	VERIFY0(dmu_objset_from_ds(ds, &os));
	dsl_pool_config_exit(dp, FTAG);

	err = zfsvfs_init(zfsvfs, os);
	if (err != 0)
		goto bail;

	ds->ds_dir->dd_activity_cancelled = B_FALSE;
	VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);

	zfs_set_fuid_feature(zfsvfs);

	/*
	 * Attempt to re-establish all the active znodes with
	 * their dbufs.  If a zfs_rezget() fails, then we'll let
	 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
	 * when they try to use their znode.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp;
	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
		(void) zfs_rezget(zp);
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

bail:
	/* release the VOPs */
	ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
	ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);

	if (err) {
		/*
		 * Since we couldn't setup the sa framework, try to force
		 * unmount this file system.
		 */
		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
			vfs_ref(zfsvfs->z_vfs);
			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
		}
	}
	return (err);
}

/*
 * VFS_FREEVFS entry point: release the zfsvfs_t once the unmount is
 * complete and drop the active-filesystem count.
 */
static void
zfs_freevfs(vfs_t *vfsp)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;

	zfsvfs_free(zfsvfs);

	atomic_dec_32(&zfs_active_fs_count);
}

#ifdef __i386__
static int desiredvnodes_backup;
#include <sys/vmmeter.h>


#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#endif

/*
 * On i386, shrink the system-wide vnode limit (see comment in the body);
 * a no-op on other architectures.
 */
static void
zfs_vnodes_adjust(void)
{
#ifdef __i386__
	int newdesiredvnodes;

	desiredvnodes_backup = desiredvnodes;

	/*
	 * We calculate newdesiredvnodes the same way it is done in
	 * vntblinit().  If it is equal to desiredvnodes, it means that
	 * it wasn't tuned by the administrator and we can tune it down.
	 */
	newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 *
	    vm_kmem_size / (5 * (sizeof (struct vm_object) +
	    sizeof (struct vnode))));
	if (newdesiredvnodes == desiredvnodes)
		desiredvnodes = (3 * newdesiredvnodes) / 4;
#endif
}

/* Restore the vnode limit saved by zfs_vnodes_adjust(). */
static void
zfs_vnodes_adjust_back(void)
{

#ifdef __i386__
	desiredvnodes = desiredvnodes_backup;
#endif
}

/*
 * Module-wide ZPL initialization: .zfs control directory structures,
 * znode cache, vnode-limit tuning, DMU objset type registration and the
 * zfsvfs taskq.
 */
void
zfs_init(void)
{

	printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");

	/*
	 * Initialize .zfs directory structures
	 */
	zfsctl_init();

	/*
	 * Initialize znode cache, vnode ops, etc...
	 */
	zfs_znode_init();

	/*
	 * Reduce number of vnodes. Originally number of vnodes is calculated
	 * with UFS inode in mind. We reduce it here, because it's too big for
	 * ZFS/i386.
	 */
	zfs_vnodes_adjust();

	dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info);

	zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0);
}

/*
 * Undo zfs_init(), tearing the pieces down in reverse order.
 */
void
zfs_fini(void)
{
	taskq_destroy(zfsvfs_taskq);
	zfsctl_fini();
	zfs_znode_fini();
	zfs_vnodes_adjust_back();
}

/*
 * Return non-zero while any ZFS filesystem is mounted.
 */
int
zfs_busy(void)
{
	return (zfs_active_fs_count != 0);
}

/*
 * Release VOPs and unmount a suspended filesystem.
 */
int
zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
{
	ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
	ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));

	/*
	 * We already own this, so just hold and rele it to update the
	 * objset_t, as the one we had before may have been evicted.
	 */
	objset_t *os;
	VERIFY3P(ds->ds_owner, ==, zfsvfs);
	VERIFY(dsl_dataset_long_held(ds));
	dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
	dsl_pool_config_enter(dp, FTAG);
	VERIFY0(dmu_objset_from_ds(ds, &os));
	dsl_pool_config_exit(dp, FTAG);
	zfsvfs->z_os = os;

	/* release the VOPs */
	ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
	ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);

	/*
	 * Try to force unmount this file system.
	 */
	(void) zfs_umount(zfsvfs->z_vfs, 0);
	zfsvfs->z_unmounted = B_TRUE;
	return (0);
}

/*
 * Upgrade the on-disk ZPL version of the mounted filesystem to 'newvers'.
 * Returns EINVAL for downgrades or out-of-range versions and ENOTSUP when
 * the pool's SPA version is too old for the requested ZPL version.  When
 * crossing into the SA-capable versions, also creates the SA master node.
 */
int
zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
{
	int error;
	objset_t *os = zfsvfs->z_os;
	dmu_tx_t *tx;

	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
		return (SET_ERROR(EINVAL));

	if (newvers < zfsvfs->z_version)
		return (SET_ERROR(EINVAL));

	if (zfs_spa_version_map(newvers) >
	    spa_version(dmu_objset_spa(zfsvfs->z_os)))
		return (SET_ERROR(ENOTSUP));

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
		    ZFS_SA_ATTRS);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	}
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}

	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
	    8, 1, &newvers, tx);

	if (error) {
		/* The tx is already assigned, so commit (not abort) it. */
		dmu_tx_commit(tx);
		return (error);
	}

	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
		uint64_t sa_obj;

		ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
		    SPA_VERSION_SA);
		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
		    DMU_OT_NONE, 0, tx);

		error = zap_add(os, MASTER_NODE_OBJ,
		    ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
		ASSERT0(error);

		VERIFY(0 == sa_set_sa_object(os, sa_obj));
		sa_register_update_callback(os, zfs_sa_upgrade);
	}

	spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
	    "from %ju to %ju", (uintmax_t)zfsvfs->z_version,
	    (uintmax_t)newvers);
	dmu_tx_commit(tx);

	zfsvfs->z_version = newvers;
	os->os_version = newvers;

	zfs_set_fuid_feature(zfsvfs);

	return (0);
}

/*
 * Read a property stored within the master node.
 */
int
zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
{
	uint64_t *cached_copy = NULL;

	/*
	 * Figure out where in the objset_t the cached copy would live, if it
	 * is available for the requested property.
	 */
	if (os != NULL) {
		switch (prop) {
		case ZFS_PROP_VERSION:
			cached_copy = &os->os_version;
			break;
		case ZFS_PROP_NORMALIZE:
			cached_copy = &os->os_normalization;
			break;
		case ZFS_PROP_UTF8ONLY:
			cached_copy = &os->os_utf8only;
			break;
		case ZFS_PROP_CASE:
			cached_copy = &os->os_casesensitivity;
			break;
		default:
			break;
		}
	}
	if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
		*value = *cached_copy;
		return (0);
	}

	/*
	 * If the property wasn't cached, look up the file system's value for
	 * the property.  For the version property, we look up a slightly
	 * different string.
	 */
	const char *pname;
	int error = ENOENT;
	if (prop == ZFS_PROP_VERSION) {
		pname = ZPL_VERSION_STR;
	} else {
		pname = zfs_prop_to_name(prop);
	}

	if (os != NULL) {
		ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
		error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
	}

	if (error == ENOENT) {
		/* No value set, use the default value */
		switch (prop) {
		case ZFS_PROP_VERSION:
			*value = ZPL_VERSION;
			break;
		case ZFS_PROP_NORMALIZE:
		case ZFS_PROP_UTF8ONLY:
			*value = 0;
			break;
		case ZFS_PROP_CASE:
			*value = ZFS_CASE_SENSITIVE;
			break;
		case ZFS_PROP_ACLTYPE:
			*value = ZFS_ACLTYPE_NFSV4;
			break;
		default:
			return (error);
		}
		error = 0;
	}

	/*
	 * If one of the methods for getting the property value above worked,
	 * copy it into the objset_t's cache.
	 */
	if (error == 0 && cached_copy != NULL) {
		*cached_copy = *value;
	}

	return (error);
}

/*
 * Return true if the corresponding vfs's unmounted flag is set.
 * Otherwise return false.
 * If this function returns true we know VFS unmount has been initiated.
 */
boolean_t
zfs_get_vfs_flag_unmounted(objset_t *os)
{
	zfsvfs_t *zfvp;
	boolean_t unmounted = B_FALSE;

	ASSERT(dmu_objset_type(os) == DMU_OST_ZFS);

	mutex_enter(&os->os_user_ptr_lock);
	zfvp = dmu_objset_get_user(os);
	if (zfvp != NULL && zfvp->z_vfs != NULL &&
	    (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT))
		unmounted = B_TRUE;
	mutex_exit(&os->os_user_ptr_lock);

	return (unmounted);
}

#ifdef _KERNEL
/*
 * Rewrite the f_mntfromname of every mount whose source is 'oldname'
 * itself, or a descendant ('/') or snapshot ('@') of it, to use 'newname'
 * instead.  Walks the global mount list under mountlist_mtx.
 */
void
zfsvfs_update_fromname(const char *oldname, const char *newname)
{
	char tmpbuf[MAXPATHLEN];
	struct mount *mp;
	char *fromname;
	size_t oldlen;

	oldlen = strlen(oldname);

	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		fromname = mp->mnt_stat.f_mntfromname;
		/* Exact match: replace the whole name. */
		if (strcmp(fromname, oldname) == 0) {
			(void) strlcpy(fromname, newname,
			    sizeof (mp->mnt_stat.f_mntfromname));
			continue;
		}
		/* Prefix match on a child or snapshot: splice in newname. */
		if (strncmp(fromname, oldname, oldlen) == 0 &&
		    (fromname[oldlen] == '/' || fromname[oldlen] == '@')) {
			(void) snprintf(tmpbuf, sizeof (tmpbuf), "%s%s",
			    newname, fromname + oldlen);
			(void) strlcpy(fromname, tmpbuf,
			    sizeof (mp->mnt_stat.f_mntfromname));
			continue;
		}
	}
	mtx_unlock(&mountlist_mtx);
}
#endif