1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>. 24 * All rights reserved. 25 * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 26 * Copyright (c) 2014 Integros [integros.com] 27 * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 
28 */ 29 30 /* Portions Copyright 2010 Robert Milkowski */ 31 32 #include <sys/types.h> 33 #include <sys/param.h> 34 #include <sys/systm.h> 35 #include <sys/kernel.h> 36 #include <sys/sysmacros.h> 37 #include <sys/kmem.h> 38 #include <sys/acl.h> 39 #include <sys/vnode.h> 40 #include <sys/vfs.h> 41 #include <sys/mntent.h> 42 #include <sys/mount.h> 43 #include <sys/cmn_err.h> 44 #include <sys/zfs_znode.h> 45 #include <sys/zfs_vnops.h> 46 #include <sys/zfs_dir.h> 47 #include <sys/zil.h> 48 #include <sys/fs/zfs.h> 49 #include <sys/dmu.h> 50 #include <sys/dsl_prop.h> 51 #include <sys/dsl_dataset.h> 52 #include <sys/dsl_deleg.h> 53 #include <sys/spa.h> 54 #include <sys/zap.h> 55 #include <sys/sa.h> 56 #include <sys/sa_impl.h> 57 #include <sys/policy.h> 58 #include <sys/atomic.h> 59 #include <sys/zfs_ioctl.h> 60 #include <sys/zfs_ctldir.h> 61 #include <sys/zfs_fuid.h> 62 #include <sys/sunddi.h> 63 #include <sys/dmu_objset.h> 64 #include <sys/dsl_dir.h> 65 #include <sys/jail.h> 66 #include <sys/osd.h> 67 #include <ufs/ufs/quota.h> 68 #include <sys/zfs_quota.h> 69 70 #include "zfs_comutil.h" 71 72 #ifndef MNTK_VMSETSIZE_BUG 73 #define MNTK_VMSETSIZE_BUG 0 74 #endif 75 #ifndef MNTK_NOMSYNC 76 #define MNTK_NOMSYNC 8 77 #endif 78 79 struct mtx zfs_debug_mtx; 80 MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); 81 82 SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); 83 84 int zfs_super_owner; 85 SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, 86 "File system owners can perform privileged operation on file systems"); 87 88 int zfs_debug_level; 89 SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, 90 "Debug level"); 91 92 struct zfs_jailparam { 93 int mount_snapshot; 94 }; 95 96 static struct zfs_jailparam zfs_jailparam0 = { 97 .mount_snapshot = 0, 98 }; 99 100 static int zfs_jailparam_slot; 101 102 SYSCTL_JAIL_PARAM_SYS_NODE(zfs, CTLFLAG_RW, "Jail ZFS parameters"); 103 SYSCTL_JAIL_PARAM(_zfs, 
mount_snapshot, CTLTYPE_INT | CTLFLAG_RW, "I", 104 "Allow mounting snapshots in the .zfs directory for unjailed datasets"); 105 106 SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions"); 107 static int zfs_version_acl = ZFS_ACL_VERSION; 108 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, 109 "ZFS_ACL_VERSION"); 110 static int zfs_version_spa = SPA_VERSION; 111 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, 112 "SPA_VERSION"); 113 static int zfs_version_zpl = ZPL_VERSION; 114 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, 115 "ZPL_VERSION"); 116 117 #if __FreeBSD_version >= 1400018 118 static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg, 119 bool *mp_busy); 120 #else 121 static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg); 122 #endif 123 static int zfs_mount(vfs_t *vfsp); 124 static int zfs_umount(vfs_t *vfsp, int fflag); 125 static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp); 126 static int zfs_statfs(vfs_t *vfsp, struct statfs *statp); 127 static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); 128 static int zfs_sync(vfs_t *vfsp, int waitfor); 129 static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp, 130 struct ucred **credanonp, int *numsecflavors, int *secflavors); 131 static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp); 132 static void zfs_freevfs(vfs_t *vfsp); 133 134 struct vfsops zfs_vfsops = { 135 .vfs_mount = zfs_mount, 136 .vfs_unmount = zfs_umount, 137 .vfs_root = vfs_cache_root, 138 .vfs_cachedroot = zfs_root, 139 .vfs_statfs = zfs_statfs, 140 .vfs_vget = zfs_vget, 141 .vfs_sync = zfs_sync, 142 .vfs_checkexp = zfs_checkexp, 143 .vfs_fhtovp = zfs_fhtovp, 144 .vfs_quotactl = zfs_quotactl, 145 }; 146 147 #ifdef VFCF_CROSS_COPY_FILE_RANGE 148 VFS_SET(zfs_vfsops, zfs, 149 VFCF_DELEGADMIN | VFCF_JAIL | VFCF_CROSS_COPY_FILE_RANGE); 150 #else 151 
VFS_SET(zfs_vfsops, zfs, VFCF_DELEGADMIN | VFCF_JAIL); 152 #endif 153 154 /* 155 * We need to keep a count of active fs's. 156 * This is necessary to prevent our module 157 * from being unloaded after a umount -f 158 */ 159 static uint32_t zfs_active_fs_count = 0; 160 161 int 162 zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val, 163 char *setpoint) 164 { 165 int error; 166 zfsvfs_t *zfvp; 167 vfs_t *vfsp; 168 objset_t *os; 169 uint64_t tmp = *val; 170 171 error = dmu_objset_from_ds(ds, &os); 172 if (error != 0) 173 return (error); 174 175 error = getzfsvfs_impl(os, &zfvp); 176 if (error != 0) 177 return (error); 178 if (zfvp == NULL) 179 return (ENOENT); 180 vfsp = zfvp->z_vfs; 181 switch (zfs_prop) { 182 case ZFS_PROP_ATIME: 183 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) 184 tmp = 0; 185 if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) 186 tmp = 1; 187 break; 188 case ZFS_PROP_DEVICES: 189 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) 190 tmp = 0; 191 if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) 192 tmp = 1; 193 break; 194 case ZFS_PROP_EXEC: 195 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) 196 tmp = 0; 197 if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) 198 tmp = 1; 199 break; 200 case ZFS_PROP_SETUID: 201 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) 202 tmp = 0; 203 if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) 204 tmp = 1; 205 break; 206 case ZFS_PROP_READONLY: 207 if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) 208 tmp = 0; 209 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) 210 tmp = 1; 211 break; 212 case ZFS_PROP_XATTR: 213 if (zfvp->z_flags & ZSB_XATTR) 214 tmp = zfvp->z_xattr; 215 break; 216 case ZFS_PROP_NBMAND: 217 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) 218 tmp = 0; 219 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) 220 tmp = 1; 221 break; 222 default: 223 vfs_unbusy(vfsp); 224 return (ENOENT); 225 } 226 227 vfs_unbusy(vfsp); 228 if (tmp != *val) { 229 if (setpoint) 230 (void) strcpy(setpoint, 
"temporary"); 231 *val = tmp; 232 } 233 return (0); 234 } 235 236 static int 237 zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp) 238 { 239 int error = 0; 240 char buf[32]; 241 uint64_t usedobj, quotaobj; 242 uint64_t quota, used = 0; 243 timespec_t now; 244 245 usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; 246 quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; 247 248 if (quotaobj == 0 || zfsvfs->z_replay) { 249 error = ENOENT; 250 goto done; 251 } 252 (void) sprintf(buf, "%llx", (longlong_t)id); 253 if ((error = zap_lookup(zfsvfs->z_os, quotaobj, 254 buf, sizeof (quota), 1, "a)) != 0) { 255 dprintf("%s(%d): quotaobj lookup failed\n", 256 __FUNCTION__, __LINE__); 257 goto done; 258 } 259 /* 260 * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit". 261 * So we set them to be the same. 262 */ 263 dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota); 264 error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used); 265 if (error && error != ENOENT) { 266 dprintf("%s(%d): usedobj failed; %d\n", 267 __FUNCTION__, __LINE__, error); 268 goto done; 269 } 270 dqp->dqb_curblocks = btodb(used); 271 dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0; 272 vfs_timestamp(&now); 273 /* 274 * Setting this to 0 causes FreeBSD quota(8) to print 275 * the number of days since the epoch, which isn't 276 * particularly useful. 
277 */ 278 dqp->dqb_btime = dqp->dqb_itime = now.tv_sec; 279 done: 280 return (error); 281 } 282 283 static int 284 #if __FreeBSD_version >= 1400018 285 zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg, bool *mp_busy) 286 #else 287 zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg) 288 #endif 289 { 290 zfsvfs_t *zfsvfs = vfsp->vfs_data; 291 struct thread *td; 292 int cmd, type, error = 0; 293 int bitsize; 294 zfs_userquota_prop_t quota_type; 295 struct dqblk64 dqblk = { 0 }; 296 297 td = curthread; 298 cmd = cmds >> SUBCMDSHIFT; 299 type = cmds & SUBCMDMASK; 300 301 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) 302 return (error); 303 if (id == -1) { 304 switch (type) { 305 case USRQUOTA: 306 id = td->td_ucred->cr_ruid; 307 break; 308 case GRPQUOTA: 309 id = td->td_ucred->cr_rgid; 310 break; 311 default: 312 error = EINVAL; 313 #if __FreeBSD_version < 1400018 314 if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF) 315 vfs_unbusy(vfsp); 316 #endif 317 goto done; 318 } 319 } 320 /* 321 * Map BSD type to: 322 * ZFS_PROP_USERUSED, 323 * ZFS_PROP_USERQUOTA, 324 * ZFS_PROP_GROUPUSED, 325 * ZFS_PROP_GROUPQUOTA 326 */ 327 switch (cmd) { 328 case Q_SETQUOTA: 329 case Q_SETQUOTA32: 330 if (type == USRQUOTA) 331 quota_type = ZFS_PROP_USERQUOTA; 332 else if (type == GRPQUOTA) 333 quota_type = ZFS_PROP_GROUPQUOTA; 334 else 335 error = EINVAL; 336 break; 337 case Q_GETQUOTA: 338 case Q_GETQUOTA32: 339 if (type == USRQUOTA) 340 quota_type = ZFS_PROP_USERUSED; 341 else if (type == GRPQUOTA) 342 quota_type = ZFS_PROP_GROUPUSED; 343 else 344 error = EINVAL; 345 break; 346 } 347 348 /* 349 * Depending on the cmd, we may need to get 350 * the ruid and domain (see fuidstr_to_sid?), 351 * the fuid (how?), or other information. 352 * Create fuid using zfs_fuid_create(zfsvfs, id, 353 * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)? 354 * I think I can use just the id? 355 * 356 * Look at zfs_id_overquota() to look up a quota. 
357 * zap_lookup(something, quotaobj, fuidstring, 358 * sizeof (long long), 1, "a) 359 * 360 * See zfs_set_userquota() to set a quota. 361 */ 362 if ((uint32_t)type >= MAXQUOTAS) { 363 error = EINVAL; 364 goto done; 365 } 366 367 switch (cmd) { 368 case Q_GETQUOTASIZE: 369 bitsize = 64; 370 error = copyout(&bitsize, arg, sizeof (int)); 371 break; 372 case Q_QUOTAON: 373 // As far as I can tell, you can't turn quotas on or off on zfs 374 error = 0; 375 #if __FreeBSD_version < 1400018 376 vfs_unbusy(vfsp); 377 #endif 378 break; 379 case Q_QUOTAOFF: 380 error = ENOTSUP; 381 #if __FreeBSD_version < 1400018 382 vfs_unbusy(vfsp); 383 #endif 384 break; 385 case Q_SETQUOTA: 386 error = copyin(arg, &dqblk, sizeof (dqblk)); 387 if (error == 0) 388 error = zfs_set_userquota(zfsvfs, quota_type, 389 "", id, dbtob(dqblk.dqb_bhardlimit)); 390 break; 391 case Q_GETQUOTA: 392 error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk); 393 if (error == 0) 394 error = copyout(&dqblk, arg, sizeof (dqblk)); 395 break; 396 default: 397 error = EINVAL; 398 break; 399 } 400 done: 401 zfs_exit(zfsvfs, FTAG); 402 return (error); 403 } 404 405 406 boolean_t 407 zfs_is_readonly(zfsvfs_t *zfsvfs) 408 { 409 return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)); 410 } 411 412 static int 413 zfs_sync(vfs_t *vfsp, int waitfor) 414 { 415 416 /* 417 * Data integrity is job one. We don't want a compromised kernel 418 * writing to the storage pool, so we never sync during panic. 419 */ 420 if (panicstr) 421 return (0); 422 423 /* 424 * Ignore the system syncher. ZFS already commits async data 425 * at zfs_txg_timeout intervals. 426 */ 427 if (waitfor == MNT_LAZY) 428 return (0); 429 430 if (vfsp != NULL) { 431 /* 432 * Sync a specific filesystem. 
433 */ 434 zfsvfs_t *zfsvfs = vfsp->vfs_data; 435 dsl_pool_t *dp; 436 int error; 437 438 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) 439 return (error); 440 dp = dmu_objset_pool(zfsvfs->z_os); 441 442 /* 443 * If the system is shutting down, then skip any 444 * filesystems which may exist on a suspended pool. 445 */ 446 if (rebooting && spa_suspended(dp->dp_spa)) { 447 zfs_exit(zfsvfs, FTAG); 448 return (0); 449 } 450 451 if (zfsvfs->z_log != NULL) 452 zil_commit(zfsvfs->z_log, 0); 453 454 zfs_exit(zfsvfs, FTAG); 455 } else { 456 /* 457 * Sync all ZFS filesystems. This is what happens when you 458 * run sync(8). Unlike other filesystems, ZFS honors the 459 * request by waiting for all pools to commit all dirty data. 460 */ 461 spa_sync_allpools(); 462 } 463 464 return (0); 465 } 466 467 static void 468 atime_changed_cb(void *arg, uint64_t newval) 469 { 470 zfsvfs_t *zfsvfs = arg; 471 472 if (newval == TRUE) { 473 zfsvfs->z_atime = TRUE; 474 zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME; 475 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); 476 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); 477 } else { 478 zfsvfs->z_atime = FALSE; 479 zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME; 480 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); 481 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); 482 } 483 } 484 485 static void 486 xattr_changed_cb(void *arg, uint64_t newval) 487 { 488 zfsvfs_t *zfsvfs = arg; 489 490 if (newval == ZFS_XATTR_OFF) { 491 zfsvfs->z_flags &= ~ZSB_XATTR; 492 } else { 493 zfsvfs->z_flags |= ZSB_XATTR; 494 495 if (newval == ZFS_XATTR_SA) 496 zfsvfs->z_xattr_sa = B_TRUE; 497 else 498 zfsvfs->z_xattr_sa = B_FALSE; 499 } 500 } 501 502 static void 503 blksz_changed_cb(void *arg, uint64_t newval) 504 { 505 zfsvfs_t *zfsvfs = arg; 506 ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); 507 ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); 508 ASSERT(ISP2(newval)); 509 510 zfsvfs->z_max_blksz = newval; 511 zfsvfs->z_vfs->mnt_stat.f_iosize = newval; 512 } 
513 514 static void 515 readonly_changed_cb(void *arg, uint64_t newval) 516 { 517 zfsvfs_t *zfsvfs = arg; 518 519 if (newval) { 520 /* XXX locking on vfs_flag? */ 521 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; 522 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); 523 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); 524 } else { 525 /* XXX locking on vfs_flag? */ 526 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; 527 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); 528 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); 529 } 530 } 531 532 static void 533 setuid_changed_cb(void *arg, uint64_t newval) 534 { 535 zfsvfs_t *zfsvfs = arg; 536 537 if (newval == FALSE) { 538 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; 539 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); 540 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); 541 } else { 542 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; 543 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); 544 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); 545 } 546 } 547 548 static void 549 exec_changed_cb(void *arg, uint64_t newval) 550 { 551 zfsvfs_t *zfsvfs = arg; 552 553 if (newval == FALSE) { 554 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; 555 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); 556 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); 557 } else { 558 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; 559 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); 560 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); 561 } 562 } 563 564 /* 565 * The nbmand mount option can be changed at mount time. 
566 * We can't allow it to be toggled on live file systems or incorrect 567 * behavior may be seen from cifs clients 568 * 569 * This property isn't registered via dsl_prop_register(), but this callback 570 * will be called when a file system is first mounted 571 */ 572 static void 573 nbmand_changed_cb(void *arg, uint64_t newval) 574 { 575 zfsvfs_t *zfsvfs = arg; 576 if (newval == FALSE) { 577 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); 578 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); 579 } else { 580 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); 581 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); 582 } 583 } 584 585 static void 586 snapdir_changed_cb(void *arg, uint64_t newval) 587 { 588 zfsvfs_t *zfsvfs = arg; 589 590 zfsvfs->z_show_ctldir = newval; 591 } 592 593 static void 594 acl_mode_changed_cb(void *arg, uint64_t newval) 595 { 596 zfsvfs_t *zfsvfs = arg; 597 598 zfsvfs->z_acl_mode = newval; 599 } 600 601 static void 602 acl_inherit_changed_cb(void *arg, uint64_t newval) 603 { 604 zfsvfs_t *zfsvfs = arg; 605 606 zfsvfs->z_acl_inherit = newval; 607 } 608 609 static void 610 acl_type_changed_cb(void *arg, uint64_t newval) 611 { 612 zfsvfs_t *zfsvfs = arg; 613 614 zfsvfs->z_acl_type = newval; 615 } 616 617 static void 618 longname_changed_cb(void *arg, uint64_t newval) 619 { 620 zfsvfs_t *zfsvfs = arg; 621 622 zfsvfs->z_longname = newval; 623 } 624 625 static int 626 zfs_register_callbacks(vfs_t *vfsp) 627 { 628 struct dsl_dataset *ds = NULL; 629 objset_t *os = NULL; 630 zfsvfs_t *zfsvfs = NULL; 631 uint64_t nbmand; 632 boolean_t readonly = B_FALSE; 633 boolean_t do_readonly = B_FALSE; 634 boolean_t setuid = B_FALSE; 635 boolean_t do_setuid = B_FALSE; 636 boolean_t exec = B_FALSE; 637 boolean_t do_exec = B_FALSE; 638 boolean_t xattr = B_FALSE; 639 boolean_t atime = B_FALSE; 640 boolean_t do_atime = B_FALSE; 641 boolean_t do_xattr = B_FALSE; 642 int error = 0; 643 644 ASSERT3P(vfsp, !=, NULL); 645 zfsvfs = vfsp->vfs_data; 646 
ASSERT3P(zfsvfs, !=, NULL); 647 os = zfsvfs->z_os; 648 649 /* 650 * This function can be called for a snapshot when we update snapshot's 651 * mount point, which isn't really supported. 652 */ 653 if (dmu_objset_is_snapshot(os)) 654 return (EOPNOTSUPP); 655 656 /* 657 * The act of registering our callbacks will destroy any mount 658 * options we may have. In order to enable temporary overrides 659 * of mount options, we stash away the current values and 660 * restore them after we register the callbacks. 661 */ 662 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || 663 !spa_writeable(dmu_objset_spa(os))) { 664 readonly = B_TRUE; 665 do_readonly = B_TRUE; 666 } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { 667 readonly = B_FALSE; 668 do_readonly = B_TRUE; 669 } 670 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { 671 setuid = B_FALSE; 672 do_setuid = B_TRUE; 673 } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { 674 setuid = B_TRUE; 675 do_setuid = B_TRUE; 676 } 677 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { 678 exec = B_FALSE; 679 do_exec = B_TRUE; 680 } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { 681 exec = B_TRUE; 682 do_exec = B_TRUE; 683 } 684 if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { 685 zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF; 686 do_xattr = B_TRUE; 687 } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { 688 zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR; 689 do_xattr = B_TRUE; 690 } else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) { 691 zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR; 692 do_xattr = B_TRUE; 693 } else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) { 694 zfsvfs->z_xattr = xattr = ZFS_XATTR_SA; 695 do_xattr = B_TRUE; 696 } 697 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { 698 atime = B_FALSE; 699 do_atime = B_TRUE; 700 } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { 701 atime = B_TRUE; 702 do_atime = B_TRUE; 703 } 704 705 /* 706 * We need to enter pool configuration here, so that we can use 707 * 
dsl_prop_get_int_ds() to handle the special nbmand property below. 708 * dsl_prop_get_integer() can not be used, because it has to acquire 709 * spa_namespace_lock and we can not do that because we already hold 710 * z_teardown_lock. The problem is that spa_write_cachefile() is called 711 * with spa_namespace_lock held and the function calls ZFS vnode 712 * operations to write the cache file and thus z_teardown_lock is 713 * acquired after spa_namespace_lock. 714 */ 715 ds = dmu_objset_ds(os); 716 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 717 718 /* 719 * nbmand is a special property. It can only be changed at 720 * mount time. 721 * 722 * This is weird, but it is documented to only be changeable 723 * at mount time. 724 */ 725 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { 726 nbmand = B_FALSE; 727 } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { 728 nbmand = B_TRUE; 729 } else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand)) != 0) { 730 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 731 return (error); 732 } 733 734 /* 735 * Register property callbacks. 736 * 737 * It would probably be fine to just check for i/o error from 738 * the first prop_register(), but I guess I like to go 739 * overboard... 740 */ 741 error = dsl_prop_register(ds, 742 zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); 743 error = error ? error : dsl_prop_register(ds, 744 zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); 745 error = error ? error : dsl_prop_register(ds, 746 zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); 747 error = error ? error : dsl_prop_register(ds, 748 zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); 749 error = error ? error : dsl_prop_register(ds, 750 zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); 751 error = error ? error : dsl_prop_register(ds, 752 zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); 753 error = error ? 
error : dsl_prop_register(ds, 754 zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); 755 error = error ? error : dsl_prop_register(ds, 756 zfs_prop_to_name(ZFS_PROP_ACLTYPE), acl_type_changed_cb, zfsvfs); 757 error = error ? error : dsl_prop_register(ds, 758 zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); 759 error = error ? error : dsl_prop_register(ds, 760 zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, 761 zfsvfs); 762 error = error ? error : dsl_prop_register(ds, 763 zfs_prop_to_name(ZFS_PROP_LONGNAME), longname_changed_cb, zfsvfs); 764 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 765 if (error) 766 goto unregister; 767 768 /* 769 * Invoke our callbacks to restore temporary mount options. 770 */ 771 if (do_readonly) 772 readonly_changed_cb(zfsvfs, readonly); 773 if (do_setuid) 774 setuid_changed_cb(zfsvfs, setuid); 775 if (do_exec) 776 exec_changed_cb(zfsvfs, exec); 777 if (do_xattr) 778 xattr_changed_cb(zfsvfs, xattr); 779 if (do_atime) 780 atime_changed_cb(zfsvfs, atime); 781 782 nbmand_changed_cb(zfsvfs, nbmand); 783 784 return (0); 785 786 unregister: 787 dsl_prop_unregister_all(ds, zfsvfs); 788 return (error); 789 } 790 791 /* 792 * Associate this zfsvfs with the given objset, which must be owned. 793 * This will cache a bunch of on-disk state from the objset in the 794 * zfsvfs. 795 */ 796 static int 797 zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) 798 { 799 int error; 800 uint64_t val; 801 802 zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; 803 zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; 804 zfsvfs->z_os = os; 805 806 error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); 807 if (error != 0) 808 return (error); 809 if (zfsvfs->z_version > 810 zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { 811 (void) printf("Can't mount a version %lld file system " 812 "on a version %lld pool\n. 
Pool must be upgraded to mount " 813 "this file system.", (u_longlong_t)zfsvfs->z_version, 814 (u_longlong_t)spa_version(dmu_objset_spa(os))); 815 return (SET_ERROR(ENOTSUP)); 816 } 817 error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); 818 if (error != 0) 819 return (error); 820 zfsvfs->z_norm = (int)val; 821 822 error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); 823 if (error != 0) 824 return (error); 825 zfsvfs->z_utf8 = (val != 0); 826 827 error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); 828 if (error != 0) 829 return (error); 830 zfsvfs->z_case = (uint_t)val; 831 832 error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val); 833 if (error != 0) 834 return (error); 835 zfsvfs->z_acl_type = (uint_t)val; 836 837 /* 838 * Fold case on file systems that are always or sometimes case 839 * insensitive. 840 */ 841 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 842 zfsvfs->z_case == ZFS_CASE_MIXED) 843 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; 844 845 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); 846 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); 847 848 uint64_t sa_obj = 0; 849 if (zfsvfs->z_use_sa) { 850 /* should either have both of these objects or none */ 851 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, 852 &sa_obj); 853 if (error != 0) 854 return (error); 855 856 error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val); 857 if (error == 0 && val == ZFS_XATTR_SA) 858 zfsvfs->z_xattr_sa = B_TRUE; 859 } 860 861 error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, 862 &zfsvfs->z_attr_table); 863 if (error != 0) 864 return (error); 865 866 if (zfsvfs->z_version >= ZPL_VERSION_SA) 867 sa_register_update_callback(os, zfs_sa_upgrade); 868 869 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, 870 &zfsvfs->z_root); 871 if (error != 0) 872 return (error); 873 ASSERT3U(zfsvfs->z_root, !=, 0); 874 875 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, 876 &zfsvfs->z_unlinkedobj); 877 if (error != 0) 878 return 
(error); 879 880 error = zap_lookup(os, MASTER_NODE_OBJ, 881 zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], 882 8, 1, &zfsvfs->z_userquota_obj); 883 if (error == ENOENT) 884 zfsvfs->z_userquota_obj = 0; 885 else if (error != 0) 886 return (error); 887 888 error = zap_lookup(os, MASTER_NODE_OBJ, 889 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], 890 8, 1, &zfsvfs->z_groupquota_obj); 891 if (error == ENOENT) 892 zfsvfs->z_groupquota_obj = 0; 893 else if (error != 0) 894 return (error); 895 896 error = zap_lookup(os, MASTER_NODE_OBJ, 897 zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA], 898 8, 1, &zfsvfs->z_projectquota_obj); 899 if (error == ENOENT) 900 zfsvfs->z_projectquota_obj = 0; 901 else if (error != 0) 902 return (error); 903 904 error = zap_lookup(os, MASTER_NODE_OBJ, 905 zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA], 906 8, 1, &zfsvfs->z_userobjquota_obj); 907 if (error == ENOENT) 908 zfsvfs->z_userobjquota_obj = 0; 909 else if (error != 0) 910 return (error); 911 912 error = zap_lookup(os, MASTER_NODE_OBJ, 913 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA], 914 8, 1, &zfsvfs->z_groupobjquota_obj); 915 if (error == ENOENT) 916 zfsvfs->z_groupobjquota_obj = 0; 917 else if (error != 0) 918 return (error); 919 920 error = zap_lookup(os, MASTER_NODE_OBJ, 921 zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA], 922 8, 1, &zfsvfs->z_projectobjquota_obj); 923 if (error == ENOENT) 924 zfsvfs->z_projectobjquota_obj = 0; 925 else if (error != 0) 926 return (error); 927 928 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, 929 &zfsvfs->z_fuid_obj); 930 if (error == ENOENT) 931 zfsvfs->z_fuid_obj = 0; 932 else if (error != 0) 933 return (error); 934 935 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, 936 &zfsvfs->z_shares_dir); 937 if (error == ENOENT) 938 zfsvfs->z_shares_dir = 0; 939 else if (error != 0) 940 return (error); 941 942 /* 943 * Only use the name cache if we are looking for a 944 * name on a file system that 
does not require normalization 945 * or case folding. We can also look there if we happen to be 946 * on a non-normalizing, mixed sensitivity file system IF we 947 * are looking for the exact name (which is always the case on 948 * FreeBSD). 949 */ 950 zfsvfs->z_use_namecache = !zfsvfs->z_norm || 951 ((zfsvfs->z_case == ZFS_CASE_MIXED) && 952 !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER)); 953 954 return (0); 955 } 956 957 taskq_t *zfsvfs_taskq; 958 959 static void 960 zfsvfs_task_unlinked_drain(void *context, int pending __unused) 961 { 962 963 zfs_unlinked_drain((zfsvfs_t *)context); 964 } 965 966 int 967 zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp) 968 { 969 objset_t *os; 970 zfsvfs_t *zfsvfs; 971 int error; 972 boolean_t ro = (readonly || (strchr(osname, '@') != NULL)); 973 974 /* 975 * XXX: Fix struct statfs so this isn't necessary! 976 * 977 * The 'osname' is used as the filesystem's special node, which means 978 * it must fit in statfs.f_mntfromname, or else it can't be 979 * enumerated, so libzfs_mnttab_find() returns NULL, which causes 980 * 'zfs unmount' to think it's not mounted when it is. 
981 */ 982 if (strlen(osname) >= MNAMELEN) 983 return (SET_ERROR(ENAMETOOLONG)); 984 985 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); 986 987 error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, 988 &os); 989 if (error != 0) { 990 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 991 return (error); 992 } 993 994 error = zfsvfs_create_impl(zfvp, zfsvfs, os); 995 996 return (error); 997 } 998 999 1000 int 1001 zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) 1002 { 1003 int error; 1004 1005 zfsvfs->z_vfs = NULL; 1006 zfsvfs->z_parent = zfsvfs; 1007 1008 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); 1009 mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); 1010 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), 1011 offsetof(znode_t, z_link_node)); 1012 TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0, 1013 zfsvfs_task_unlinked_drain, zfsvfs); 1014 ZFS_TEARDOWN_INIT(zfsvfs); 1015 ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs); 1016 rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); 1017 for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1018 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); 1019 1020 error = zfsvfs_init(zfsvfs, os); 1021 if (error != 0) { 1022 dmu_objset_disown(os, B_TRUE, zfsvfs); 1023 *zfvp = NULL; 1024 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1025 return (error); 1026 } 1027 1028 *zfvp = zfsvfs; 1029 return (0); 1030 } 1031 1032 static int 1033 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) 1034 { 1035 int error; 1036 1037 /* 1038 * Check for a bad on-disk format version now since we 1039 * lied about owning the dataset readonly before. 
1040 */ 1041 if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && 1042 dmu_objset_incompatible_encryption_version(zfsvfs->z_os)) 1043 return (SET_ERROR(EROFS)); 1044 1045 error = zfs_register_callbacks(zfsvfs->z_vfs); 1046 if (error) 1047 return (error); 1048 1049 /* 1050 * If we are not mounting (ie: online recv), then we don't 1051 * have to worry about replaying the log as we blocked all 1052 * operations out since we closed the ZIL. 1053 */ 1054 if (mounting) { 1055 boolean_t readonly; 1056 1057 ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL); 1058 error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os); 1059 if (error) 1060 return (error); 1061 zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data, 1062 &zfsvfs->z_kstat.dk_zil_sums); 1063 1064 /* 1065 * During replay we remove the read only flag to 1066 * allow replays to succeed. 1067 */ 1068 readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; 1069 if (readonly != 0) { 1070 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; 1071 } else { 1072 dsl_dir_t *dd; 1073 zap_stats_t zs; 1074 1075 if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj, 1076 &zs) == 0) { 1077 dataset_kstats_update_nunlinks_kstat( 1078 &zfsvfs->z_kstat, zs.zs_num_entries); 1079 dprintf_ds(zfsvfs->z_os->os_dsl_dataset, 1080 "num_entries in unlinked set: %llu", 1081 (u_longlong_t)zs.zs_num_entries); 1082 } 1083 1084 zfs_unlinked_drain(zfsvfs); 1085 dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; 1086 dd->dd_activity_cancelled = B_FALSE; 1087 } 1088 1089 /* 1090 * Parse and replay the intent log. 1091 * 1092 * Because of ziltest, this must be done after 1093 * zfs_unlinked_drain(). (Further note: ziltest 1094 * doesn't use readonly mounts, where 1095 * zfs_unlinked_drain() isn't called.) This is because 1096 * ziltest causes spa_sync() to think it's committed, 1097 * but actually it is not, so the intent log contains 1098 * many txg's worth of changes. 
1099 * 1100 * In particular, if object N is in the unlinked set in 1101 * the last txg to actually sync, then it could be 1102 * actually freed in a later txg and then reallocated 1103 * in a yet later txg. This would write a "create 1104 * object N" record to the intent log. Normally, this 1105 * would be fine because the spa_sync() would have 1106 * written out the fact that object N is free, before 1107 * we could write the "create object N" intent log 1108 * record. 1109 * 1110 * But when we are in ziltest mode, we advance the "open 1111 * txg" without actually spa_sync()-ing the changes to 1112 * disk. So we would see that object N is still 1113 * allocated and in the unlinked set, and there is an 1114 * intent log record saying to allocate it. 1115 */ 1116 if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { 1117 if (zil_replay_disable) { 1118 zil_destroy(zfsvfs->z_log, B_FALSE); 1119 } else { 1120 boolean_t use_nc = zfsvfs->z_use_namecache; 1121 zfsvfs->z_use_namecache = B_FALSE; 1122 zfsvfs->z_replay = B_TRUE; 1123 zil_replay(zfsvfs->z_os, zfsvfs, 1124 zfs_replay_vector); 1125 zfsvfs->z_replay = B_FALSE; 1126 zfsvfs->z_use_namecache = use_nc; 1127 } 1128 } 1129 1130 /* restore readonly bit */ 1131 if (readonly != 0) 1132 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; 1133 } else { 1134 ASSERT3P(zfsvfs->z_kstat.dk_kstats, !=, NULL); 1135 zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data, 1136 &zfsvfs->z_kstat.dk_zil_sums); 1137 } 1138 1139 /* 1140 * Set the objset user_ptr to track its zfsvfs. 
1141 */ 1142 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); 1143 dmu_objset_set_user(zfsvfs->z_os, zfsvfs); 1144 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); 1145 1146 return (0); 1147 } 1148 1149 void 1150 zfsvfs_free(zfsvfs_t *zfsvfs) 1151 { 1152 int i; 1153 1154 zfs_fuid_destroy(zfsvfs); 1155 1156 mutex_destroy(&zfsvfs->z_znodes_lock); 1157 mutex_destroy(&zfsvfs->z_lock); 1158 list_destroy(&zfsvfs->z_all_znodes); 1159 ZFS_TEARDOWN_DESTROY(zfsvfs); 1160 ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs); 1161 rw_destroy(&zfsvfs->z_fuid_lock); 1162 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1163 mutex_destroy(&zfsvfs->z_hold_mtx[i]); 1164 dataset_kstats_destroy(&zfsvfs->z_kstat); 1165 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1166 } 1167 1168 static void 1169 zfs_set_fuid_feature(zfsvfs_t *zfsvfs) 1170 { 1171 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); 1172 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); 1173 } 1174 1175 static int 1176 zfs_domount(vfs_t *vfsp, char *osname) 1177 { 1178 uint64_t recordsize, fsid_guid; 1179 int error = 0; 1180 zfsvfs_t *zfsvfs; 1181 1182 ASSERT3P(vfsp, !=, NULL); 1183 ASSERT3P(osname, !=, NULL); 1184 1185 error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs); 1186 if (error) 1187 return (error); 1188 zfsvfs->z_vfs = vfsp; 1189 1190 if ((error = dsl_prop_get_integer(osname, 1191 "recordsize", &recordsize, NULL))) 1192 goto out; 1193 zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE; 1194 zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize; 1195 1196 vfsp->vfs_data = zfsvfs; 1197 vfsp->mnt_flag |= MNT_LOCAL; 1198 vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; 1199 vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES; 1200 vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED; 1201 /* 1202 * This can cause a loss of coherence between ARC and page cache 1203 * on ZoF - unclear if the problem is in FreeBSD or ZoF 1204 */ 1205 vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */ 1206 vfsp->mnt_kern_flag |= MNTK_NOMSYNC; 1207 
vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG; 1208 1209 #if defined(_KERNEL) && !defined(KMEM_DEBUG) 1210 vfsp->mnt_kern_flag |= MNTK_FPLOOKUP; 1211 #endif 1212 /* 1213 * The fsid is 64 bits, composed of an 8-bit fs type, which 1214 * separates our fsid from any other filesystem types, and a 1215 * 56-bit objset unique ID. The objset unique ID is unique to 1216 * all objsets open on this system, provided by unique_create(). 1217 * The 8-bit fs type must be put in the low bits of fsid[1] 1218 * because that's where other Solaris filesystems put it. 1219 */ 1220 fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); 1221 ASSERT3U((fsid_guid & ~((1ULL << 56) - 1)), ==, 0); 1222 vfsp->vfs_fsid.val[0] = fsid_guid; 1223 vfsp->vfs_fsid.val[1] = ((fsid_guid >> 32) << 8) | 1224 (vfsp->mnt_vfc->vfc_typenum & 0xFF); 1225 1226 /* 1227 * Set features for file system. 1228 */ 1229 zfs_set_fuid_feature(zfsvfs); 1230 1231 if (dmu_objset_is_snapshot(zfsvfs->z_os)) { 1232 uint64_t pval; 1233 1234 atime_changed_cb(zfsvfs, B_FALSE); 1235 readonly_changed_cb(zfsvfs, B_TRUE); 1236 if ((error = dsl_prop_get_integer(osname, 1237 "xattr", &pval, NULL))) 1238 goto out; 1239 xattr_changed_cb(zfsvfs, pval); 1240 if ((error = dsl_prop_get_integer(osname, 1241 "acltype", &pval, NULL))) 1242 goto out; 1243 acl_type_changed_cb(zfsvfs, pval); 1244 zfsvfs->z_issnap = B_TRUE; 1245 zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; 1246 1247 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); 1248 dmu_objset_set_user(zfsvfs->z_os, zfsvfs); 1249 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); 1250 } else { 1251 if ((error = zfsvfs_setup(zfsvfs, B_TRUE))) 1252 goto out; 1253 } 1254 1255 vfs_mountedfrom(vfsp, osname); 1256 1257 if (!zfsvfs->z_issnap) 1258 zfsctl_create(zfsvfs); 1259 out: 1260 if (error) { 1261 dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); 1262 zfsvfs_free(zfsvfs); 1263 } else { 1264 atomic_inc_32(&zfs_active_fs_count); 1265 } 1266 1267 return (error); 1268 } 1269 1270 static void 1271 
/*
 * Parse the option prefix of a "from" name.
 *
 * A leading '!' requests a checkpoint rewind: *checkpointrewind is set to
 * true and the '!' is stripped from 'name' in place.  Otherwise 'name' is
 * left untouched and *checkpointrewind is set to false.
 */
static void
fetch_osname_options(char *name, bool *checkpointrewind)
{
	bool rewind = (name[0] == '!');

	if (rewind) {
		/* Shift the remainder, including its terminating NUL. */
		memmove(name, name + 1, strlen(name + 1) + 1);
	}
	*checkpointrewind = rewind;
}
1338 * 1339 * If we don't have privilege then see if 1340 * we have local permission to allow it 1341 */ 1342 error = secpolicy_fs_mount(cr, mvp, vfsp); 1343 if (error && isctlsnap) { 1344 secpolicy_fs_mount_clearopts(cr, vfsp); 1345 } else if (error) { 1346 if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0) 1347 goto out; 1348 1349 if (!(vfsp->vfs_flag & MS_REMOUNT)) { 1350 vattr_t vattr; 1351 1352 /* 1353 * Make sure user is the owner of the mount point 1354 * or has sufficient privileges. 1355 */ 1356 1357 vattr.va_mask = AT_UID; 1358 1359 vn_lock(mvp, LK_SHARED | LK_RETRY); 1360 if (VOP_GETATTR(mvp, &vattr, cr)) { 1361 VOP_UNLOCK(mvp); 1362 goto out; 1363 } 1364 1365 if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && 1366 VOP_ACCESS(mvp, VWRITE, cr, td) != 0) { 1367 VOP_UNLOCK(mvp); 1368 goto out; 1369 } 1370 VOP_UNLOCK(mvp); 1371 } 1372 1373 secpolicy_fs_mount_clearopts(cr, vfsp); 1374 } 1375 1376 /* 1377 * Refuse to mount a filesystem if we are in a local zone and the 1378 * dataset is not visible. 1379 */ 1380 if (!INGLOBALZONE(curproc) && 1381 (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { 1382 boolean_t mount_snapshot = B_FALSE; 1383 1384 /* 1385 * Snapshots may be mounted in .zfs for unjailed datasets 1386 * if allowed by the jail param zfs.mount_snapshot. 1387 */ 1388 if (isctlsnap) { 1389 struct prison *pr; 1390 struct zfs_jailparam *zjp; 1391 1392 pr = curthread->td_ucred->cr_prison; 1393 mtx_lock(&pr->pr_mtx); 1394 zjp = osd_jail_get(pr, zfs_jailparam_slot); 1395 mtx_unlock(&pr->pr_mtx); 1396 if (zjp && zjp->mount_snapshot) 1397 mount_snapshot = B_TRUE; 1398 } 1399 if (!mount_snapshot) { 1400 error = SET_ERROR(EPERM); 1401 goto out; 1402 } 1403 } 1404 1405 vfsp->vfs_flag |= MNT_NFS4ACLS; 1406 1407 /* 1408 * When doing a remount, we simply refresh our temporary properties 1409 * according to those options set in the current VFS options. 
1410 */ 1411 if (vfsp->vfs_flag & MS_REMOUNT) { 1412 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1413 1414 /* 1415 * Refresh mount options with z_teardown_lock blocking I/O while 1416 * the filesystem is in an inconsistent state. 1417 * The lock also serializes this code with filesystem 1418 * manipulations between entry to zfs_suspend_fs() and return 1419 * from zfs_resume_fs(). 1420 */ 1421 ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); 1422 zfs_unregister_callbacks(zfsvfs); 1423 error = zfs_register_callbacks(vfsp); 1424 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); 1425 goto out; 1426 } 1427 1428 /* Initial root mount: try hard to import the requested root pool. */ 1429 if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 && 1430 (vfsp->vfs_flag & MNT_UPDATE) == 0) { 1431 char pname[MAXNAMELEN]; 1432 1433 error = getpoolname(osname, pname); 1434 if (error == 0) 1435 error = spa_import_rootpool(pname, checkpointrewind); 1436 if (error) 1437 goto out; 1438 } 1439 DROP_GIANT(); 1440 error = zfs_domount(vfsp, osname); 1441 PICKUP_GIANT(); 1442 1443 out: 1444 return (error); 1445 } 1446 1447 static int 1448 zfs_statfs(vfs_t *vfsp, struct statfs *statp) 1449 { 1450 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1451 uint64_t refdbytes, availbytes, usedobjs, availobjs; 1452 int error; 1453 1454 statp->f_version = STATFS_VERSION; 1455 1456 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) 1457 return (error); 1458 1459 dmu_objset_space(zfsvfs->z_os, 1460 &refdbytes, &availbytes, &usedobjs, &availobjs); 1461 1462 /* 1463 * The underlying storage pool actually uses multiple block sizes. 1464 * We report the fragsize as the smallest block size we support, 1465 * and we report our blocksize as the filesystem's maximum blocksize. 1466 */ 1467 statp->f_bsize = SPA_MINBLOCKSIZE; 1468 statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize; 1469 1470 /* 1471 * The following report "total" blocks of various kinds in the 1472 * file system, but reported in terms of f_frsize - the 1473 * "fragment" size. 
1474 */ 1475 1476 statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; 1477 statp->f_bfree = availbytes / statp->f_bsize; 1478 statp->f_bavail = statp->f_bfree; /* no root reservation */ 1479 1480 /* 1481 * statvfs() should really be called statufs(), because it assumes 1482 * static metadata. ZFS doesn't preallocate files, so the best 1483 * we can do is report the max that could possibly fit in f_files, 1484 * and that minus the number actually used in f_ffree. 1485 * For f_ffree, report the smaller of the number of object available 1486 * and the number of blocks (each object will take at least a block). 1487 */ 1488 statp->f_ffree = MIN(availobjs, statp->f_bfree); 1489 statp->f_files = statp->f_ffree + usedobjs; 1490 1491 /* 1492 * We're a zfs filesystem. 1493 */ 1494 strlcpy(statp->f_fstypename, "zfs", 1495 sizeof (statp->f_fstypename)); 1496 1497 strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, 1498 sizeof (statp->f_mntfromname)); 1499 strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, 1500 sizeof (statp->f_mntonname)); 1501 1502 statp->f_namemax = 1503 zfsvfs->z_longname ? (ZAP_MAXNAMELEN_NEW - 1) : (MAXNAMELEN - 1); 1504 1505 zfs_exit(zfsvfs, FTAG); 1506 return (0); 1507 } 1508 1509 static int 1510 zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) 1511 { 1512 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1513 znode_t *rootzp; 1514 int error; 1515 1516 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) 1517 return (error); 1518 1519 error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); 1520 if (error == 0) 1521 *vpp = ZTOV(rootzp); 1522 1523 zfs_exit(zfsvfs, FTAG); 1524 1525 if (error == 0) { 1526 error = vn_lock(*vpp, flags); 1527 if (error != 0) { 1528 VN_RELE(*vpp); 1529 *vpp = NULL; 1530 } 1531 } 1532 return (error); 1533 } 1534 1535 /* 1536 * Teardown the zfsvfs::z_os. 1537 * 1538 * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' 1539 * and 'z_teardown_inactive_lock' held. 
/*
 * Teardown the zfsvfs::z_os.
 *
 * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
 * and 'z_teardown_inactive_lock' held.
 *
 * Returns 0 on success.  Returns EIO, with both teardown locks released,
 * when 'unmounting' is FALSE and the file system has already been
 * unmounted or has no objset (z_os == NULL).  When 'unmounting' is TRUE
 * both locks are released before returning and z_unmounted is set.
 */
static int
zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
{
	znode_t *zp;
	dsl_dir_t *dd;

	/*
	 * If someone has not already unmounted this file system,
	 * drain the zrele_taskq to ensure all active references to the
	 * zfsvfs_t have been handled only then can it be safely destroyed.
	 */
	if (zfsvfs->z_os) {
		/*
		 * If we're unmounting we have to wait for the list to
		 * drain completely.
		 *
		 * If we're not unmounting there's no guarantee the list
		 * will drain completely, but zreles run from the taskq
		 * may add the parents of dir-based xattrs to the taskq
		 * so we want to wait for these.
		 *
		 * We can safely check z_all_znodes for being empty because the
		 * VFS has already blocked operations which add to it.
		 */
		int round = 0;
		while (!list_is_empty(&zfsvfs->z_all_znodes)) {
			taskq_wait_outstanding(dsl_pool_zrele_taskq(
			    dmu_objset_pool(zfsvfs->z_os)), 0);
			/* When not unmounting, give up after two passes. */
			if (++round > 1 && !unmounting)
				break;
		}
	}
	ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);

	if (!unmounting) {
		/*
		 * We purge the parent filesystem's vfsp as the parent
		 * filesystem and all of its snapshots have their vnode's
		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
		 * 'z_parent' is self referential for non-snapshots.
		 */
#ifdef FREEBSD_NAMECACHE
		cache_purgevfs(zfsvfs->z_parent->z_vfs);
#endif
	}

	/*
	 * Close the zil. NB: Can't close the zil while zfs_inactive
	 * threads are blocked as zil_close can call zfs_inactive.
	 */
	if (zfsvfs->z_log) {
		zil_close(zfsvfs->z_log);
		zfsvfs->z_log = NULL;
	}

	ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs);

	/*
	 * If we are not unmounting (ie: online recv) and someone already
	 * unmounted this file system while we were doing the switcheroo,
	 * or a reopen of z_os failed then just bail out now.
	 */
	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
		ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
		return (SET_ERROR(EIO));
	}

	/*
	 * At this point there are no vops active, and any new vops will
	 * fail with EIO since we have z_teardown_lock for writer (only
	 * relevant for forced unmount).
	 *
	 * Release all holds on dbufs.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
		/* Only znodes that still have an SA handle need this. */
		if (zp->z_sa_hdl != NULL) {
			zfs_znode_dmu_fini(zp);
		}
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

	/*
	 * If we are unmounting, set the unmounted flag and let new vops
	 * unblock.  zfs_inactive will have the unmounted behavior, and all
	 * other vops will fail with EIO.
	 */
	if (unmounting) {
		zfsvfs->z_unmounted = B_TRUE;
		ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
	}

	/*
	 * z_os will be NULL if there was an error in attempting to reopen
	 * zfsvfs, so just return as the properties had already been
	 * unregistered and cached data had been evicted before.
	 */
	if (zfsvfs->z_os == NULL)
		return (0);

	/*
	 * Unregister properties.
	 */
	zfs_unregister_callbacks(zfsvfs);

	/*
	 * Evict cached data. We must write out any dirty data before
	 * disowning the dataset.
	 */
	objset_t *os = zfsvfs->z_os;
	boolean_t os_dirty = B_FALSE;
	/* The objset counts as dirty if it is dirty in any txg slot. */
	for (int t = 0; t < TXG_SIZE; t++) {
		if (dmu_objset_is_dirty(os, t)) {
			os_dirty = B_TRUE;
			break;
		}
	}
	/* Only wait for a sync when writable and something is dirty. */
	if (!zfs_is_readonly(zfsvfs) && os_dirty)
		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
	dmu_objset_evict_dbufs(zfsvfs->z_os);
	dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
	dsl_dir_cancel_waiters(dd);

	return (0);
}
1723 */ 1724 if (os != NULL) { 1725 /* 1726 * Unset the objset user_ptr. 1727 */ 1728 mutex_enter(&os->os_user_ptr_lock); 1729 dmu_objset_set_user(os, NULL); 1730 mutex_exit(&os->os_user_ptr_lock); 1731 1732 /* 1733 * Finally release the objset 1734 */ 1735 dmu_objset_disown(os, B_TRUE, zfsvfs); 1736 } 1737 1738 /* 1739 * We can now safely destroy the '.zfs' directory node. 1740 */ 1741 if (zfsvfs->z_ctldir != NULL) 1742 zfsctl_destroy(zfsvfs); 1743 zfs_freevfs(vfsp); 1744 1745 return (0); 1746 } 1747 1748 static int 1749 zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) 1750 { 1751 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1752 znode_t *zp; 1753 int err; 1754 1755 /* 1756 * zfs_zget() can't operate on virtual entries like .zfs/ or 1757 * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP. 1758 * This will make NFS to switch to LOOKUP instead of using VGET. 1759 */ 1760 if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR || 1761 (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir)) 1762 return (EOPNOTSUPP); 1763 1764 if ((err = zfs_enter(zfsvfs, FTAG)) != 0) 1765 return (err); 1766 err = zfs_zget(zfsvfs, ino, &zp); 1767 if (err == 0 && zp->z_unlinked) { 1768 vrele(ZTOV(zp)); 1769 err = EINVAL; 1770 } 1771 if (err == 0) 1772 *vpp = ZTOV(zp); 1773 zfs_exit(zfsvfs, FTAG); 1774 if (err == 0) { 1775 err = vn_lock(*vpp, flags); 1776 if (err != 0) 1777 vrele(*vpp); 1778 } 1779 if (err != 0) 1780 *vpp = NULL; 1781 return (err); 1782 } 1783 1784 static int 1785 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp, 1786 struct ucred **credanonp, int *numsecflavors, int *secflavors) 1787 { 1788 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1789 1790 /* 1791 * If this is regular file system vfsp is the same as 1792 * zfsvfs->z_parent->z_vfs, but if it is snapshot, 1793 * zfsvfs->z_parent->z_vfs represents parent file system 1794 * which we have to use here, because only this file system 1795 * has mnt_export configured. 
1796 */ 1797 return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp, 1798 credanonp, numsecflavors, secflavors)); 1799 } 1800 1801 _Static_assert(sizeof (struct fid) >= SHORT_FID_LEN, 1802 "struct fid bigger than SHORT_FID_LEN"); 1803 _Static_assert(sizeof (struct fid) >= LONG_FID_LEN, 1804 "struct fid bigger than LONG_FID_LEN"); 1805 1806 static int 1807 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) 1808 { 1809 struct componentname cn; 1810 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1811 znode_t *zp; 1812 vnode_t *dvp; 1813 uint64_t object = 0; 1814 uint64_t fid_gen = 0; 1815 uint64_t setgen = 0; 1816 uint64_t gen_mask; 1817 uint64_t zp_gen; 1818 int i, err; 1819 1820 *vpp = NULL; 1821 1822 if ((err = zfs_enter(zfsvfs, FTAG)) != 0) 1823 return (err); 1824 1825 /* 1826 * On FreeBSD we can get snapshot's mount point or its parent file 1827 * system mount point depending if snapshot is already mounted or not. 1828 */ 1829 if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) { 1830 zfid_long_t *zlfid = (zfid_long_t *)fidp; 1831 uint64_t objsetid = 0; 1832 1833 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 1834 objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); 1835 1836 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 1837 setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); 1838 1839 zfs_exit(zfsvfs, FTAG); 1840 1841 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); 1842 if (err) 1843 return (SET_ERROR(EINVAL)); 1844 if ((err = zfs_enter(zfsvfs, FTAG)) != 0) 1845 return (err); 1846 } 1847 1848 if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { 1849 zfid_short_t *zfid = (zfid_short_t *)fidp; 1850 1851 for (i = 0; i < sizeof (zfid->zf_object); i++) 1852 object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); 1853 1854 for (i = 0; i < sizeof (zfid->zf_gen); i++) 1855 fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); 1856 } else { 1857 zfs_exit(zfsvfs, FTAG); 1858 return (SET_ERROR(EINVAL)); 1859 } 1860 1861 if 
(fidp->fid_len == LONG_FID_LEN && setgen != 0) { 1862 zfs_exit(zfsvfs, FTAG); 1863 dprintf("snapdir fid: fid_gen (%llu) and setgen (%llu)\n", 1864 (u_longlong_t)fid_gen, (u_longlong_t)setgen); 1865 return (SET_ERROR(EINVAL)); 1866 } 1867 1868 /* 1869 * A zero fid_gen means we are in .zfs or the .zfs/snapshot 1870 * directory tree. If the object == zfsvfs->z_shares_dir, then 1871 * we are in the .zfs/shares directory tree. 1872 */ 1873 if ((fid_gen == 0 && 1874 (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) || 1875 (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) { 1876 zfs_exit(zfsvfs, FTAG); 1877 VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp)); 1878 if (object == ZFSCTL_INO_SNAPDIR) { 1879 cn.cn_nameptr = "snapshot"; 1880 cn.cn_namelen = strlen(cn.cn_nameptr); 1881 cn.cn_nameiop = LOOKUP; 1882 cn.cn_flags = ISLASTCN | LOCKLEAF; 1883 cn.cn_lkflags = flags; 1884 VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); 1885 vput(dvp); 1886 } else if (object == zfsvfs->z_shares_dir) { 1887 /* 1888 * XXX This branch must not be taken, 1889 * if it is, then the lookup below will 1890 * explode. 
1891 */ 1892 cn.cn_nameptr = "shares"; 1893 cn.cn_namelen = strlen(cn.cn_nameptr); 1894 cn.cn_nameiop = LOOKUP; 1895 cn.cn_flags = ISLASTCN; 1896 cn.cn_lkflags = flags; 1897 VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); 1898 vput(dvp); 1899 } else { 1900 *vpp = dvp; 1901 } 1902 return (err); 1903 } 1904 1905 gen_mask = -1ULL >> (64 - 8 * i); 1906 1907 dprintf("getting %llu [%llu mask %llx]\n", (u_longlong_t)object, 1908 (u_longlong_t)fid_gen, 1909 (u_longlong_t)gen_mask); 1910 if ((err = zfs_zget(zfsvfs, object, &zp))) { 1911 zfs_exit(zfsvfs, FTAG); 1912 return (err); 1913 } 1914 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, 1915 sizeof (uint64_t)); 1916 zp_gen = zp_gen & gen_mask; 1917 if (zp_gen == 0) 1918 zp_gen = 1; 1919 if (zp->z_unlinked || zp_gen != fid_gen) { 1920 dprintf("znode gen (%llu) != fid gen (%llu)\n", 1921 (u_longlong_t)zp_gen, (u_longlong_t)fid_gen); 1922 vrele(ZTOV(zp)); 1923 zfs_exit(zfsvfs, FTAG); 1924 return (SET_ERROR(EINVAL)); 1925 } 1926 1927 *vpp = ZTOV(zp); 1928 zfs_exit(zfsvfs, FTAG); 1929 err = vn_lock(*vpp, flags); 1930 if (err == 0) 1931 vnode_create_vobject(*vpp, zp->z_size, curthread); 1932 else 1933 *vpp = NULL; 1934 return (err); 1935 } 1936 1937 /* 1938 * Block out VOPs and close zfsvfs_t::z_os 1939 * 1940 * Note, if successful, then we return with the 'z_teardown_lock' and 1941 * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying 1942 * dataset and objset intact so that they can be atomically handed off during 1943 * a subsequent rollback or recv operation and the resume thereafter. 1944 */ 1945 int 1946 zfs_suspend_fs(zfsvfs_t *zfsvfs) 1947 { 1948 int error; 1949 1950 if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) 1951 return (error); 1952 1953 return (0); 1954 } 1955 1956 /* 1957 * Rebuild SA and release VOPs. Note that ownership of the underlying dataset 1958 * is an invariant across any of the operations that can be performed while the 1959 * filesystem was suspended. 
/*
 * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
 * is an invariant across any of the operations that can be performed while the
 * filesystem was suspended.  Whether it succeeded or failed, the preconditions
 * are the same: the relevant objset and associated dataset are owned by
 * zfsvfs, held, and long held on entry.
 *
 * Must be called with both teardown locks write held (asserted below);
 * both are released before returning, on success and on failure.
 *
 * Returns 0 on success or the error from zfsvfs_init().  On error a
 * forced unmount of the file system is attempted.
 */
int
zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
{
	int err;
	znode_t *zp;

	ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
	ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));

	/*
	 * We already own this, so just update the objset_t, as the one we
	 * had before may have been evicted.
	 */
	objset_t *os;
	VERIFY3P(ds->ds_owner, ==, zfsvfs);
	VERIFY(dsl_dataset_long_held(ds));
	dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
	/* dmu_objset_from_ds() is called with the pool config entered. */
	dsl_pool_config_enter(dp, FTAG);
	VERIFY0(dmu_objset_from_ds(ds, &os));
	dsl_pool_config_exit(dp, FTAG);

	err = zfsvfs_init(zfsvfs, os);
	if (err != 0)
		goto bail;

	ds->ds_dir->dd_activity_cancelled = B_FALSE;
	/* B_FALSE: not mounting, so zfsvfs_setup() skips the log replay. */
	VERIFY0(zfsvfs_setup(zfsvfs, B_FALSE));

	zfs_set_fuid_feature(zfsvfs);

	/*
	 * Attempt to re-establish all the active znodes with
	 * their dbufs.  If a zfs_rezget() fails, then we'll let
	 * any potential callers discover that via zfs_enter_verify_zp
	 * when they try to use their znode.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp;
	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
		(void) zfs_rezget(zp);
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

bail:
	/* release the VOPs */
	ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
	ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);

	if (err) {
		/*
		 * Since we couldn't setup the sa framework, try to force
		 * unmount this file system.
		 */
		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
			vfs_ref(zfsvfs->z_vfs);
			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
		}
	}
	return (err);
}
zfsctl_init(); 2099 2100 /* 2101 * Initialize znode cache, vnode ops, etc... 2102 */ 2103 zfs_znode_init(); 2104 2105 /* 2106 * Reduce number of vnodes. Originally number of vnodes is calculated 2107 * with UFS inode in mind. We reduce it here, because it's too big for 2108 * ZFS/i386. 2109 */ 2110 zfs_vnodes_adjust(); 2111 2112 dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info); 2113 2114 zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0); 2115 2116 zfs_vnlru_marker = vnlru_alloc_marker(); 2117 sx_init(&zfs_vnlru_lock, "zfs vnlru lock"); 2118 zfs_prune = arc_add_prune_callback(zfs_prune_task, NULL); 2119 } 2120 2121 void 2122 zfs_fini(void) 2123 { 2124 arc_remove_prune_callback(zfs_prune); 2125 vnlru_free_marker(zfs_vnlru_marker); 2126 sx_destroy(&zfs_vnlru_lock); 2127 2128 taskq_destroy(zfsvfs_taskq); 2129 zfsctl_fini(); 2130 zfs_znode_fini(); 2131 zfs_vnodes_adjust_back(); 2132 } 2133 2134 int 2135 zfs_busy(void) 2136 { 2137 return (zfs_active_fs_count != 0); 2138 } 2139 2140 /* 2141 * Release VOPs and unmount a suspended filesystem. 2142 */ 2143 int 2144 zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) 2145 { 2146 ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs)); 2147 ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs)); 2148 2149 /* 2150 * We already own this, so just hold and rele it to update the 2151 * objset_t, as the one we had before may have been evicted. 2152 */ 2153 objset_t *os; 2154 VERIFY3P(ds->ds_owner, ==, zfsvfs); 2155 VERIFY(dsl_dataset_long_held(ds)); 2156 dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); 2157 dsl_pool_config_enter(dp, FTAG); 2158 VERIFY0(dmu_objset_from_ds(ds, &os)); 2159 dsl_pool_config_exit(dp, FTAG); 2160 zfsvfs->z_os = os; 2161 2162 /* release the VOPs */ 2163 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); 2164 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); 2165 2166 /* 2167 * Try to force unmount this file system. 
2168 */ 2169 (void) zfs_umount(zfsvfs->z_vfs, 0); 2170 zfsvfs->z_unmounted = B_TRUE; 2171 return (0); 2172 } 2173 2174 int 2175 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) 2176 { 2177 int error; 2178 objset_t *os = zfsvfs->z_os; 2179 dmu_tx_t *tx; 2180 2181 if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) 2182 return (SET_ERROR(EINVAL)); 2183 2184 if (newvers < zfsvfs->z_version) 2185 return (SET_ERROR(EINVAL)); 2186 2187 if (zfs_spa_version_map(newvers) > 2188 spa_version(dmu_objset_spa(zfsvfs->z_os))) 2189 return (SET_ERROR(ENOTSUP)); 2190 2191 tx = dmu_tx_create(os); 2192 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); 2193 if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { 2194 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, 2195 ZFS_SA_ATTRS); 2196 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 2197 } 2198 error = dmu_tx_assign(tx, TXG_WAIT); 2199 if (error) { 2200 dmu_tx_abort(tx); 2201 return (error); 2202 } 2203 2204 error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 2205 8, 1, &newvers, tx); 2206 2207 if (error) { 2208 dmu_tx_commit(tx); 2209 return (error); 2210 } 2211 2212 if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { 2213 uint64_t sa_obj; 2214 2215 ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, 2216 SPA_VERSION_SA); 2217 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, 2218 DMU_OT_NONE, 0, tx); 2219 2220 error = zap_add(os, MASTER_NODE_OBJ, 2221 ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); 2222 ASSERT0(error); 2223 2224 VERIFY0(sa_set_sa_object(os, sa_obj)); 2225 sa_register_update_callback(os, zfs_sa_upgrade); 2226 } 2227 2228 spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, 2229 "from %ju to %ju", (uintmax_t)zfsvfs->z_version, 2230 (uintmax_t)newvers); 2231 dmu_tx_commit(tx); 2232 2233 zfsvfs->z_version = newvers; 2234 os->os_version = newvers; 2235 2236 zfs_set_fuid_feature(zfsvfs); 2237 2238 return (0); 2239 } 2240 2241 /* 2242 * Return true if the corresponding vfs's unmounted flag is 
set. 2243 * Otherwise return false. 2244 * If this function returns true we know VFS unmount has been initiated. 2245 */ 2246 boolean_t 2247 zfs_get_vfs_flag_unmounted(objset_t *os) 2248 { 2249 zfsvfs_t *zfvp; 2250 boolean_t unmounted = B_FALSE; 2251 2252 ASSERT3U(dmu_objset_type(os), ==, DMU_OST_ZFS); 2253 2254 mutex_enter(&os->os_user_ptr_lock); 2255 zfvp = dmu_objset_get_user(os); 2256 if (zfvp != NULL && zfvp->z_vfs != NULL && 2257 (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT)) 2258 unmounted = B_TRUE; 2259 mutex_exit(&os->os_user_ptr_lock); 2260 2261 return (unmounted); 2262 } 2263 2264 #ifdef _KERNEL 2265 void 2266 zfsvfs_update_fromname(const char *oldname, const char *newname) 2267 { 2268 char tmpbuf[MAXPATHLEN]; 2269 struct mount *mp; 2270 char *fromname; 2271 size_t oldlen; 2272 2273 oldlen = strlen(oldname); 2274 2275 mtx_lock(&mountlist_mtx); 2276 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 2277 fromname = mp->mnt_stat.f_mntfromname; 2278 if (strcmp(fromname, oldname) == 0) { 2279 (void) strlcpy(fromname, newname, 2280 sizeof (mp->mnt_stat.f_mntfromname)); 2281 continue; 2282 } 2283 if (strncmp(fromname, oldname, oldlen) == 0 && 2284 (fromname[oldlen] == '/' || fromname[oldlen] == '@')) { 2285 (void) snprintf(tmpbuf, sizeof (tmpbuf), "%s%s", 2286 newname, fromname + oldlen); 2287 (void) strlcpy(fromname, tmpbuf, 2288 sizeof (mp->mnt_stat.f_mntfromname)); 2289 continue; 2290 } 2291 } 2292 mtx_unlock(&mountlist_mtx); 2293 } 2294 #endif 2295 2296 /* 2297 * Find a prison with ZFS info. 2298 * Return the ZFS info and the (locked) prison. 
2299 */ 2300 static struct zfs_jailparam * 2301 zfs_jailparam_find(struct prison *spr, struct prison **prp) 2302 { 2303 struct prison *pr; 2304 struct zfs_jailparam *zjp; 2305 2306 for (pr = spr; ; pr = pr->pr_parent) { 2307 mtx_lock(&pr->pr_mtx); 2308 if (pr == &prison0) { 2309 zjp = &zfs_jailparam0; 2310 break; 2311 } 2312 zjp = osd_jail_get(pr, zfs_jailparam_slot); 2313 if (zjp != NULL) 2314 break; 2315 mtx_unlock(&pr->pr_mtx); 2316 } 2317 *prp = pr; 2318 2319 return (zjp); 2320 } 2321 2322 /* 2323 * Ensure a prison has its own ZFS info. If zjpp is non-null, point it to the 2324 * ZFS info and lock the prison. 2325 */ 2326 static void 2327 zfs_jailparam_alloc(struct prison *pr, struct zfs_jailparam **zjpp) 2328 { 2329 struct prison *ppr; 2330 struct zfs_jailparam *zjp, *nzjp; 2331 void **rsv; 2332 2333 /* If this prison already has ZFS info, return that. */ 2334 zjp = zfs_jailparam_find(pr, &ppr); 2335 if (ppr == pr) 2336 goto done; 2337 2338 /* 2339 * Allocate a new info record. Then check again, in case something 2340 * changed during the allocation. 2341 */ 2342 mtx_unlock(&ppr->pr_mtx); 2343 nzjp = malloc(sizeof (struct zfs_jailparam), M_PRISON, M_WAITOK); 2344 rsv = osd_reserve(zfs_jailparam_slot); 2345 zjp = zfs_jailparam_find(pr, &ppr); 2346 if (ppr == pr) { 2347 free(nzjp, M_PRISON); 2348 osd_free_reserved(rsv); 2349 goto done; 2350 } 2351 /* Inherit the initial values from the ancestor. */ 2352 mtx_lock(&pr->pr_mtx); 2353 (void) osd_jail_set_reserved(pr, zfs_jailparam_slot, rsv, nzjp); 2354 (void) memcpy(nzjp, zjp, sizeof (*zjp)); 2355 zjp = nzjp; 2356 mtx_unlock(&ppr->pr_mtx); 2357 done: 2358 if (zjpp != NULL) 2359 *zjpp = zjp; 2360 else 2361 mtx_unlock(&pr->pr_mtx); 2362 } 2363 2364 /* 2365 * Jail OSD methods for ZFS VFS info. 
2366 */ 2367 static int 2368 zfs_jailparam_create(void *obj, void *data) 2369 { 2370 struct prison *pr = obj; 2371 struct vfsoptlist *opts = data; 2372 int jsys; 2373 2374 if (vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys)) == 0 && 2375 jsys == JAIL_SYS_INHERIT) 2376 return (0); 2377 /* 2378 * Inherit a prison's initial values from its parent 2379 * (different from JAIL_SYS_INHERIT which also inherits changes). 2380 */ 2381 zfs_jailparam_alloc(pr, NULL); 2382 return (0); 2383 } 2384 2385 static int 2386 zfs_jailparam_get(void *obj, void *data) 2387 { 2388 struct prison *ppr, *pr = obj; 2389 struct vfsoptlist *opts = data; 2390 struct zfs_jailparam *zjp; 2391 int jsys, error; 2392 2393 zjp = zfs_jailparam_find(pr, &ppr); 2394 jsys = (ppr == pr) ? JAIL_SYS_NEW : JAIL_SYS_INHERIT; 2395 error = vfs_setopt(opts, "zfs", &jsys, sizeof (jsys)); 2396 if (error != 0 && error != ENOENT) 2397 goto done; 2398 if (jsys == JAIL_SYS_NEW) { 2399 error = vfs_setopt(opts, "zfs.mount_snapshot", 2400 &zjp->mount_snapshot, sizeof (zjp->mount_snapshot)); 2401 if (error != 0 && error != ENOENT) 2402 goto done; 2403 } else { 2404 /* 2405 * If this prison is inheriting its ZFS info, report 2406 * empty/zero parameters. 2407 */ 2408 static int mount_snapshot = 0; 2409 2410 error = vfs_setopt(opts, "zfs.mount_snapshot", 2411 &mount_snapshot, sizeof (mount_snapshot)); 2412 if (error != 0 && error != ENOENT) 2413 goto done; 2414 } 2415 error = 0; 2416 done: 2417 mtx_unlock(&ppr->pr_mtx); 2418 return (error); 2419 } 2420 2421 static int 2422 zfs_jailparam_set(void *obj, void *data) 2423 { 2424 struct prison *pr = obj; 2425 struct prison *ppr; 2426 struct vfsoptlist *opts = data; 2427 int error, jsys, mount_snapshot; 2428 2429 /* Set the parameters, which should be correct. 
*/ 2430 error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys)); 2431 if (error == ENOENT) 2432 jsys = -1; 2433 error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot, 2434 sizeof (mount_snapshot)); 2435 if (error == ENOENT) 2436 mount_snapshot = -1; 2437 else 2438 jsys = JAIL_SYS_NEW; 2439 switch (jsys) { 2440 case JAIL_SYS_NEW: 2441 { 2442 /* "zfs=new" or "zfs.*": the prison gets its own ZFS info. */ 2443 struct zfs_jailparam *zjp; 2444 2445 /* 2446 * A child jail cannot have more permissions than its parent 2447 */ 2448 if (pr->pr_parent != &prison0) { 2449 zjp = zfs_jailparam_find(pr->pr_parent, &ppr); 2450 mtx_unlock(&ppr->pr_mtx); 2451 if (zjp->mount_snapshot < mount_snapshot) { 2452 return (EPERM); 2453 } 2454 } 2455 zfs_jailparam_alloc(pr, &zjp); 2456 if (mount_snapshot != -1) 2457 zjp->mount_snapshot = mount_snapshot; 2458 mtx_unlock(&pr->pr_mtx); 2459 break; 2460 } 2461 case JAIL_SYS_INHERIT: 2462 /* "zfs=inherit": inherit the parent's ZFS info. */ 2463 mtx_lock(&pr->pr_mtx); 2464 osd_jail_del(pr, zfs_jailparam_slot); 2465 mtx_unlock(&pr->pr_mtx); 2466 break; 2467 case -1: 2468 /* 2469 * If the setting being changed is not ZFS related 2470 * then do nothing. 2471 */ 2472 break; 2473 } 2474 2475 return (0); 2476 } 2477 2478 static int 2479 zfs_jailparam_check(void *obj __unused, void *data) 2480 { 2481 struct vfsoptlist *opts = data; 2482 int error, jsys, mount_snapshot; 2483 2484 /* Check that the parameters are correct. 
*/ 2485 error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys)); 2486 if (error != ENOENT) { 2487 if (error != 0) 2488 return (error); 2489 if (jsys != JAIL_SYS_NEW && jsys != JAIL_SYS_INHERIT) 2490 return (EINVAL); 2491 } 2492 error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot, 2493 sizeof (mount_snapshot)); 2494 if (error != ENOENT) { 2495 if (error != 0) 2496 return (error); 2497 if (mount_snapshot != 0 && mount_snapshot != 1) 2498 return (EINVAL); 2499 } 2500 return (0); 2501 } 2502 2503 static void 2504 zfs_jailparam_destroy(void *data) 2505 { 2506 2507 free(data, M_PRISON); 2508 } 2509 2510 static void 2511 zfs_jailparam_sysinit(void *arg __unused) 2512 { 2513 struct prison *pr; 2514 osd_method_t methods[PR_MAXMETHOD] = { 2515 [PR_METHOD_CREATE] = zfs_jailparam_create, 2516 [PR_METHOD_GET] = zfs_jailparam_get, 2517 [PR_METHOD_SET] = zfs_jailparam_set, 2518 [PR_METHOD_CHECK] = zfs_jailparam_check, 2519 }; 2520 2521 zfs_jailparam_slot = osd_jail_register(zfs_jailparam_destroy, methods); 2522 /* Copy the defaults to any existing prisons. */ 2523 sx_slock(&allprison_lock); 2524 TAILQ_FOREACH(pr, &allprison, pr_list) 2525 zfs_jailparam_alloc(pr, NULL); 2526 sx_sunlock(&allprison_lock); 2527 } 2528 2529 static void 2530 zfs_jailparam_sysuninit(void *arg __unused) 2531 { 2532 2533 osd_jail_deregister(zfs_jailparam_slot); 2534 } 2535 2536 SYSINIT(zfs_jailparam_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY, 2537 zfs_jailparam_sysinit, NULL); 2538 SYSUNINIT(zfs_jailparam_sysuninit, SI_SUB_DRIVERS, SI_ORDER_ANY, 2539 zfs_jailparam_sysuninit, NULL); 2540