1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>. 24 * All rights reserved. 25 * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 26 * Copyright (c) 2014 Integros [integros.com] 27 * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 28 */ 29 30 /* Portions Copyright 2010 Robert Milkowski */ 31 32 #include <sys/types.h> 33 #include <sys/param.h> 34 #include <sys/systm.h> 35 #include <sys/kernel.h> 36 #include <sys/sysmacros.h> 37 #include <sys/kmem.h> 38 #include <sys/acl.h> 39 #include <sys/vnode.h> 40 #include <sys/vfs.h> 41 #include <sys/mntent.h> 42 #include <sys/mount.h> 43 #include <sys/cmn_err.h> 44 #include <sys/zfs_znode.h> 45 #include <sys/zfs_vnops.h> 46 #include <sys/zfs_dir.h> 47 #include <sys/zil.h> 48 #include <sys/fs/zfs.h> 49 #include <sys/dmu.h> 50 #include <sys/dsl_prop.h> 51 #include <sys/dsl_dataset.h> 52 #include <sys/dsl_deleg.h> 53 #include <sys/spa.h> 54 #include <sys/zap.h> 55 #include <sys/sa.h> 56 #include <sys/sa_impl.h> 57 #include <sys/policy.h> 58 #include <sys/atomic.h> 59 #include <sys/zfs_ioctl.h> 60 #include <sys/zfs_ctldir.h> 61 #include <sys/zfs_fuid.h> 62 #include <sys/sunddi.h> 63 #include <sys/dmu_objset.h> 64 #include <sys/dsl_dir.h> 65 #include <sys/jail.h> 66 #include <sys/osd.h> 67 #include <ufs/ufs/quota.h> 68 #include <sys/zfs_quota.h> 69 70 #include "zfs_comutil.h" 71 72 #ifndef MNTK_VMSETSIZE_BUG 73 #define MNTK_VMSETSIZE_BUG 0 74 #endif 75 #ifndef MNTK_NOMSYNC 76 #define MNTK_NOMSYNC 8 77 #endif 78 79 struct mtx zfs_debug_mtx; 80 MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); 81 82 SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); 83 84 int zfs_super_owner; 85 SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, 86 "File system owners can perform privileged operation on file systems"); 87 88 int zfs_debug_level; 89 SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, 90 "Debug level"); 91 92 struct zfs_jailparam { 93 int mount_snapshot; 94 }; 95 96 static struct zfs_jailparam zfs_jailparam0 = { 97 .mount_snapshot = 0, 98 }; 99 100 static int zfs_jailparam_slot; 101 102 SYSCTL_JAIL_PARAM_SYS_NODE(zfs, CTLFLAG_RW, "Jail ZFS parameters"); 103 SYSCTL_JAIL_PARAM(_zfs, mount_snapshot, CTLTYPE_INT | CTLFLAG_RW, "I", 104 "Allow mounting snapshots in the .zfs directory for unjailed datasets"); 105 106 SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions"); 107 static int zfs_version_acl = ZFS_ACL_VERSION; 108 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, 109 "ZFS_ACL_VERSION"); 110 static int zfs_version_spa = SPA_VERSION; 111 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, 112 "SPA_VERSION"); 113 static int zfs_version_zpl = ZPL_VERSION; 114 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, 115 "ZPL_VERSION"); 116 117 #if __FreeBSD_version >= 1400018 118 static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg, 119 bool *mp_busy); 120 #else 121 static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg); 122 #endif 123 static int zfs_mount(vfs_t *vfsp); 124 static int zfs_umount(vfs_t *vfsp, int fflag); 125 static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp); 126 static int zfs_statfs(vfs_t *vfsp, struct statfs *statp); 127 static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); 128 static int zfs_sync(vfs_t *vfsp, int waitfor); 129 static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp, 130 struct ucred **credanonp, int *numsecflavors, int *secflavors); 131 static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp); 132 static void zfs_freevfs(vfs_t *vfsp); 133 134 struct vfsops zfs_vfsops = { 135 .vfs_mount = zfs_mount, 136 .vfs_unmount = zfs_umount, 137 .vfs_root = vfs_cache_root, 138 .vfs_cachedroot = zfs_root, 139 .vfs_statfs = zfs_statfs, 140 .vfs_vget = zfs_vget, 141 .vfs_sync = zfs_sync, 142 .vfs_checkexp = zfs_checkexp, 143 .vfs_fhtovp = zfs_fhtovp, 144 .vfs_quotactl = zfs_quotactl, 145 }; 146 147 VFS_SET(zfs_vfsops, zfs, VFCF_DELEGADMIN | VFCF_JAIL 148 #ifdef VFCF_CROSS_COPY_FILE_RANGE 149 | VFCF_CROSS_COPY_FILE_RANGE 150 #endif 151 #ifdef VFCF_FILEREVINC 152 | VFCF_FILEREVINC 153 #endif 154 ); 155 156 /* 157 * We need to keep a count of active fs's. 158 * This is necessary to prevent our module 159 * from being unloaded after a umount -f 160 */ 161 static uint32_t zfs_active_fs_count = 0; 162 163 int 164 zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val, 165 char *setpoint) 166 { 167 int error; 168 zfsvfs_t *zfvp; 169 vfs_t *vfsp; 170 objset_t *os; 171 uint64_t tmp = *val; 172 173 error = dmu_objset_from_ds(ds, &os); 174 if (error != 0) 175 return (error); 176 177 error = getzfsvfs_impl(os, &zfvp); 178 if (error != 0) 179 return (error); 180 if (zfvp == NULL) 181 return (ENOENT); 182 vfsp = zfvp->z_vfs; 183 switch (zfs_prop) { 184 case ZFS_PROP_ATIME: 185 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) 186 tmp = 0; 187 if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) 188 tmp = 1; 189 break; 190 case ZFS_PROP_DEVICES: 191 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) 192 tmp = 0; 193 if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) 194 tmp = 1; 195 break; 196 case ZFS_PROP_EXEC: 197 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) 198 tmp = 0; 199 if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) 200 tmp = 1; 201 break; 202 case ZFS_PROP_SETUID: 203 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) 204 tmp = 0; 205 if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) 206 tmp = 1; 207 break; 208 case ZFS_PROP_READONLY: 209 if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) 210 tmp = 0; 211 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) 212 tmp = 1; 213 break; 214 case ZFS_PROP_XATTR: 215 if (zfvp->z_flags & ZSB_XATTR) 216 tmp = zfvp->z_xattr; 217 break; 218 case ZFS_PROP_NBMAND: 219 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) 220 tmp = 0; 221 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) 222 tmp = 1; 223 break; 224 default: 225 vfs_unbusy(vfsp); 226 return (ENOENT); 227 } 228 229 vfs_unbusy(vfsp); 230 if (tmp != *val) { 231 if (setpoint) 232 (void) strcpy(setpoint, "temporary"); 233 *val = tmp; 234 } 235 return (0); 236 } 237 238 static int 239 zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp) 240 { 241 int error = 0; 242 char buf[32]; 243 uint64_t usedobj, quotaobj; 244 uint64_t quota, used = 0; 245 timespec_t now; 246 247 usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; 248 quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; 249 250 if (quotaobj == 0 || zfsvfs->z_replay) { 251 error = ENOENT; 252 goto done; 253 } 254 (void) sprintf(buf, "%llx", (longlong_t)id); 255 if ((error = zap_lookup(zfsvfs->z_os, quotaobj, 256 buf, sizeof (quota), 1, "a)) != 0) { 257 dprintf("%s(%d): quotaobj lookup failed\n", 258 __FUNCTION__, __LINE__); 259 goto done; 260 } 261 /* 262 * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit". 263 * So we set them to be the same. 264 */ 265 dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota); 266 error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used); 267 if (error && error != ENOENT) { 268 dprintf("%s(%d): usedobj failed; %d\n", 269 __FUNCTION__, __LINE__, error); 270 goto done; 271 } 272 dqp->dqb_curblocks = btodb(used); 273 dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0; 274 vfs_timestamp(&now); 275 /* 276 * Setting this to 0 causes FreeBSD quota(8) to print 277 * the number of days since the epoch, which isn't 278 * particularly useful. 279 */ 280 dqp->dqb_btime = dqp->dqb_itime = now.tv_sec; 281 done: 282 return (error); 283 } 284 285 static int 286 #if __FreeBSD_version >= 1400018 287 zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg, bool *mp_busy) 288 #else 289 zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg) 290 #endif 291 { 292 zfsvfs_t *zfsvfs = vfsp->vfs_data; 293 struct thread *td; 294 int cmd, type, error = 0; 295 int bitsize; 296 zfs_userquota_prop_t quota_type; 297 struct dqblk64 dqblk = { 0 }; 298 299 td = curthread; 300 cmd = cmds >> SUBCMDSHIFT; 301 type = cmds & SUBCMDMASK; 302 303 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) 304 return (error); 305 if (id == -1) { 306 switch (type) { 307 case USRQUOTA: 308 id = td->td_ucred->cr_ruid; 309 break; 310 case GRPQUOTA: 311 id = td->td_ucred->cr_rgid; 312 break; 313 default: 314 error = EINVAL; 315 #if __FreeBSD_version < 1400018 316 if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF) 317 vfs_unbusy(vfsp); 318 #endif 319 goto done; 320 } 321 } 322 /* 323 * Map BSD type to: 324 * ZFS_PROP_USERUSED, 325 * ZFS_PROP_USERQUOTA, 326 * ZFS_PROP_GROUPUSED, 327 * ZFS_PROP_GROUPQUOTA 328 */ 329 switch (cmd) { 330 case Q_SETQUOTA: 331 case Q_SETQUOTA32: 332 if (type == USRQUOTA) 333 quota_type = ZFS_PROP_USERQUOTA; 334 else if (type == GRPQUOTA) 335 quota_type = ZFS_PROP_GROUPQUOTA; 336 else 337 error = EINVAL; 338 break; 339 case Q_GETQUOTA: 340 case Q_GETQUOTA32: 341 if (type == USRQUOTA) 342 quota_type = ZFS_PROP_USERUSED; 343 else if (type == GRPQUOTA) 344 quota_type = ZFS_PROP_GROUPUSED; 345 else 346 error = EINVAL; 347 break; 348 } 349 350 /* 351 * Depending on the cmd, we may need to get 352 * the ruid and domain (see fuidstr_to_sid?), 353 * the fuid (how?), or other information. 354 * Create fuid using zfs_fuid_create(zfsvfs, id, 355 * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)? 356 * I think I can use just the id? 357 * 358 * Look at zfs_id_overquota() to look up a quota. 359 * zap_lookup(something, quotaobj, fuidstring, 360 * sizeof (long long), 1, "a) 361 * 362 * See zfs_set_userquota() to set a quota. 363 */ 364 if ((uint32_t)type >= MAXQUOTAS) { 365 error = EINVAL; 366 goto done; 367 } 368 369 switch (cmd) { 370 case Q_GETQUOTASIZE: 371 bitsize = 64; 372 error = copyout(&bitsize, arg, sizeof (int)); 373 break; 374 case Q_QUOTAON: 375 // As far as I can tell, you can't turn quotas on or off on zfs 376 error = 0; 377 #if __FreeBSD_version < 1400018 378 vfs_unbusy(vfsp); 379 #endif 380 break; 381 case Q_QUOTAOFF: 382 error = ENOTSUP; 383 #if __FreeBSD_version < 1400018 384 vfs_unbusy(vfsp); 385 #endif 386 break; 387 case Q_SETQUOTA: 388 error = copyin(arg, &dqblk, sizeof (dqblk)); 389 if (error == 0) 390 error = zfs_set_userquota(zfsvfs, quota_type, 391 "", id, dbtob(dqblk.dqb_bhardlimit)); 392 break; 393 case Q_GETQUOTA: 394 error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk); 395 if (error == 0) 396 error = copyout(&dqblk, arg, sizeof (dqblk)); 397 break; 398 default: 399 error = EINVAL; 400 break; 401 } 402 done: 403 zfs_exit(zfsvfs, FTAG); 404 return (error); 405 } 406 407 408 boolean_t 409 zfs_is_readonly(zfsvfs_t *zfsvfs) 410 { 411 return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)); 412 } 413 414 static int 415 zfs_sync(vfs_t *vfsp, int waitfor) 416 { 417 418 /* 419 * Data integrity is job one. We don't want a compromised kernel 420 * writing to the storage pool, so we never sync during panic. 421 */ 422 if (panicstr) 423 return (0); 424 425 /* 426 * Ignore the system syncher. ZFS already commits async data 427 * at zfs_txg_timeout intervals. 428 */ 429 if (waitfor == MNT_LAZY) 430 return (0); 431 432 if (vfsp != NULL) { 433 /* 434 * Sync a specific filesystem. 435 */ 436 zfsvfs_t *zfsvfs = vfsp->vfs_data; 437 dsl_pool_t *dp; 438 int error; 439 440 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) 441 return (error); 442 dp = dmu_objset_pool(zfsvfs->z_os); 443 444 /* 445 * If the system is shutting down, then skip any 446 * filesystems which may exist on a suspended pool. 447 */ 448 if (rebooting && spa_suspended(dp->dp_spa)) { 449 zfs_exit(zfsvfs, FTAG); 450 return (0); 451 } 452 453 if (zfsvfs->z_log != NULL) 454 zil_commit(zfsvfs->z_log, 0); 455 456 zfs_exit(zfsvfs, FTAG); 457 } else { 458 /* 459 * Sync all ZFS filesystems. This is what happens when you 460 * run sync(8). Unlike other filesystems, ZFS honors the 461 * request by waiting for all pools to commit all dirty data. 462 */ 463 spa_sync_allpools(); 464 } 465 466 return (0); 467 } 468 469 static void 470 atime_changed_cb(void *arg, uint64_t newval) 471 { 472 zfsvfs_t *zfsvfs = arg; 473 474 if (newval == TRUE) { 475 zfsvfs->z_atime = TRUE; 476 zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME; 477 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); 478 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); 479 } else { 480 zfsvfs->z_atime = FALSE; 481 zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME; 482 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); 483 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); 484 } 485 } 486 487 static void 488 xattr_changed_cb(void *arg, uint64_t newval) 489 { 490 zfsvfs_t *zfsvfs = arg; 491 492 if (newval == ZFS_XATTR_OFF) { 493 zfsvfs->z_flags &= ~ZSB_XATTR; 494 } else { 495 zfsvfs->z_flags |= ZSB_XATTR; 496 497 if (newval == ZFS_XATTR_SA) 498 zfsvfs->z_xattr_sa = B_TRUE; 499 else 500 zfsvfs->z_xattr_sa = B_FALSE; 501 } 502 } 503 504 static void 505 blksz_changed_cb(void *arg, uint64_t newval) 506 { 507 zfsvfs_t *zfsvfs = arg; 508 ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); 509 ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); 510 ASSERT(ISP2(newval)); 511 512 zfsvfs->z_max_blksz = newval; 513 zfsvfs->z_vfs->mnt_stat.f_iosize = newval; 514 } 515 516 static void 517 readonly_changed_cb(void *arg, uint64_t newval) 518 { 519 zfsvfs_t *zfsvfs = arg; 520 521 if (newval) { 522 /* XXX locking on vfs_flag? */ 523 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; 524 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); 525 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); 526 } else { 527 /* XXX locking on vfs_flag? */ 528 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; 529 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); 530 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); 531 } 532 } 533 534 static void 535 setuid_changed_cb(void *arg, uint64_t newval) 536 { 537 zfsvfs_t *zfsvfs = arg; 538 539 if (newval == FALSE) { 540 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; 541 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); 542 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); 543 } else { 544 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; 545 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); 546 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); 547 } 548 } 549 550 static void 551 exec_changed_cb(void *arg, uint64_t newval) 552 { 553 zfsvfs_t *zfsvfs = arg; 554 555 if (newval == FALSE) { 556 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; 557 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); 558 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); 559 } else { 560 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; 561 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); 562 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); 563 } 564 } 565 566 /* 567 * The nbmand mount option can be changed at mount time. 568 * We can't allow it to be toggled on live file systems or incorrect 569 * behavior may be seen from cifs clients 570 * 571 * This property isn't registered via dsl_prop_register(), but this callback 572 * will be called when a file system is first mounted 573 */ 574 static void 575 nbmand_changed_cb(void *arg, uint64_t newval) 576 { 577 zfsvfs_t *zfsvfs = arg; 578 if (newval == FALSE) { 579 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); 580 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); 581 } else { 582 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); 583 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); 584 } 585 } 586 587 static void 588 snapdir_changed_cb(void *arg, uint64_t newval) 589 { 590 zfsvfs_t *zfsvfs = arg; 591 592 zfsvfs->z_show_ctldir = newval; 593 } 594 595 static void 596 acl_mode_changed_cb(void *arg, uint64_t newval) 597 { 598 zfsvfs_t *zfsvfs = arg; 599 600 zfsvfs->z_acl_mode = newval; 601 } 602 603 static void 604 acl_inherit_changed_cb(void *arg, uint64_t newval) 605 { 606 zfsvfs_t *zfsvfs = arg; 607 608 zfsvfs->z_acl_inherit = newval; 609 } 610 611 static void 612 acl_type_changed_cb(void *arg, uint64_t newval) 613 { 614 zfsvfs_t *zfsvfs = arg; 615 616 zfsvfs->z_acl_type = newval; 617 } 618 619 static void 620 longname_changed_cb(void *arg, uint64_t newval) 621 { 622 zfsvfs_t *zfsvfs = arg; 623 624 zfsvfs->z_longname = newval; 625 } 626 627 static int 628 zfs_register_callbacks(vfs_t *vfsp) 629 { 630 struct dsl_dataset *ds = NULL; 631 objset_t *os = NULL; 632 zfsvfs_t *zfsvfs = NULL; 633 uint64_t nbmand; 634 boolean_t readonly = B_FALSE; 635 boolean_t do_readonly = B_FALSE; 636 boolean_t setuid = B_FALSE; 637 boolean_t do_setuid = B_FALSE; 638 boolean_t exec = B_FALSE; 639 boolean_t do_exec = B_FALSE; 640 boolean_t xattr = B_FALSE; 641 boolean_t atime = B_FALSE; 642 boolean_t do_atime = B_FALSE; 643 boolean_t do_xattr = B_FALSE; 644 int error = 0; 645 646 ASSERT3P(vfsp, !=, NULL); 647 zfsvfs = vfsp->vfs_data; 648 ASSERT3P(zfsvfs, !=, NULL); 649 os = zfsvfs->z_os; 650 651 /* 652 * This function can be called for a snapshot when we update snapshot's 653 * mount point, which isn't really supported. 654 */ 655 if (dmu_objset_is_snapshot(os)) 656 return (EOPNOTSUPP); 657 658 /* 659 * The act of registering our callbacks will destroy any mount 660 * options we may have. In order to enable temporary overrides 661 * of mount options, we stash away the current values and 662 * restore them after we register the callbacks. 663 */ 664 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || 665 !spa_writeable(dmu_objset_spa(os))) { 666 readonly = B_TRUE; 667 do_readonly = B_TRUE; 668 } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { 669 readonly = B_FALSE; 670 do_readonly = B_TRUE; 671 } 672 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { 673 setuid = B_FALSE; 674 do_setuid = B_TRUE; 675 } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { 676 setuid = B_TRUE; 677 do_setuid = B_TRUE; 678 } 679 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { 680 exec = B_FALSE; 681 do_exec = B_TRUE; 682 } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { 683 exec = B_TRUE; 684 do_exec = B_TRUE; 685 } 686 if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { 687 zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF; 688 do_xattr = B_TRUE; 689 } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { 690 zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR; 691 do_xattr = B_TRUE; 692 } else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) { 693 zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR; 694 do_xattr = B_TRUE; 695 } else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) { 696 zfsvfs->z_xattr = xattr = ZFS_XATTR_SA; 697 do_xattr = B_TRUE; 698 } 699 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { 700 atime = B_FALSE; 701 do_atime = B_TRUE; 702 } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { 703 atime = B_TRUE; 704 do_atime = B_TRUE; 705 } 706 707 /* 708 * We need to enter pool configuration here, so that we can use 709 * dsl_prop_get_int_ds() to handle the special nbmand property below. 710 * dsl_prop_get_integer() can not be used, because it has to acquire 711 * spa_namespace_lock and we can not do that because we already hold 712 * z_teardown_lock. The problem is that spa_write_cachefile() is called 713 * with spa_namespace_lock held and the function calls ZFS vnode 714 * operations to write the cache file and thus z_teardown_lock is 715 * acquired after spa_namespace_lock. 716 */ 717 ds = dmu_objset_ds(os); 718 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 719 720 /* 721 * nbmand is a special property. It can only be changed at 722 * mount time. 723 * 724 * This is weird, but it is documented to only be changeable 725 * at mount time. 726 */ 727 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { 728 nbmand = B_FALSE; 729 } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { 730 nbmand = B_TRUE; 731 } else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand)) != 0) { 732 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 733 return (error); 734 } 735 736 /* 737 * Register property callbacks. 738 * 739 * It would probably be fine to just check for i/o error from 740 * the first prop_register(), but I guess I like to go 741 * overboard... 742 */ 743 error = dsl_prop_register(ds, 744 zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); 745 error = error ? error : dsl_prop_register(ds, 746 zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); 747 error = error ? error : dsl_prop_register(ds, 748 zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); 749 error = error ? error : dsl_prop_register(ds, 750 zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); 751 error = error ? error : dsl_prop_register(ds, 752 zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); 753 error = error ? error : dsl_prop_register(ds, 754 zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); 755 error = error ? error : dsl_prop_register(ds, 756 zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); 757 error = error ? error : dsl_prop_register(ds, 758 zfs_prop_to_name(ZFS_PROP_ACLTYPE), acl_type_changed_cb, zfsvfs); 759 error = error ? error : dsl_prop_register(ds, 760 zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); 761 error = error ? error : dsl_prop_register(ds, 762 zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, 763 zfsvfs); 764 error = error ? error : dsl_prop_register(ds, 765 zfs_prop_to_name(ZFS_PROP_LONGNAME), longname_changed_cb, zfsvfs); 766 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 767 if (error) 768 goto unregister; 769 770 /* 771 * Invoke our callbacks to restore temporary mount options. 772 */ 773 if (do_readonly) 774 readonly_changed_cb(zfsvfs, readonly); 775 if (do_setuid) 776 setuid_changed_cb(zfsvfs, setuid); 777 if (do_exec) 778 exec_changed_cb(zfsvfs, exec); 779 if (do_xattr) 780 xattr_changed_cb(zfsvfs, xattr); 781 if (do_atime) 782 atime_changed_cb(zfsvfs, atime); 783 784 nbmand_changed_cb(zfsvfs, nbmand); 785 786 return (0); 787 788 unregister: 789 dsl_prop_unregister_all(ds, zfsvfs); 790 return (error); 791 } 792 793 /* 794 * Associate this zfsvfs with the given objset, which must be owned. 795 * This will cache a bunch of on-disk state from the objset in the 796 * zfsvfs. 797 */ 798 static int 799 zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) 800 { 801 int error; 802 uint64_t val; 803 804 zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; 805 zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; 806 zfsvfs->z_os = os; 807 808 error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); 809 if (error != 0) 810 return (error); 811 if (zfsvfs->z_version > 812 zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { 813 (void) printf("Can't mount a version %lld file system " 814 "on a version %lld pool\n. Pool must be upgraded to mount " 815 "this file system.", (u_longlong_t)zfsvfs->z_version, 816 (u_longlong_t)spa_version(dmu_objset_spa(os))); 817 return (SET_ERROR(ENOTSUP)); 818 } 819 error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); 820 if (error != 0) 821 return (error); 822 zfsvfs->z_norm = (int)val; 823 824 error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); 825 if (error != 0) 826 return (error); 827 zfsvfs->z_utf8 = (val != 0); 828 829 error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); 830 if (error != 0) 831 return (error); 832 zfsvfs->z_case = (uint_t)val; 833 834 error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val); 835 if (error != 0) 836 return (error); 837 zfsvfs->z_acl_type = (uint_t)val; 838 839 /* 840 * Fold case on file systems that are always or sometimes case 841 * insensitive. 842 */ 843 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 844 zfsvfs->z_case == ZFS_CASE_MIXED) 845 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; 846 847 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); 848 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); 849 850 uint64_t sa_obj = 0; 851 if (zfsvfs->z_use_sa) { 852 /* should either have both of these objects or none */ 853 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, 854 &sa_obj); 855 if (error != 0) 856 return (error); 857 858 error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val); 859 if (error == 0 && val == ZFS_XATTR_SA) 860 zfsvfs->z_xattr_sa = B_TRUE; 861 } 862 863 error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, 864 &zfsvfs->z_attr_table); 865 if (error != 0) 866 return (error); 867 868 if (zfsvfs->z_version >= ZPL_VERSION_SA) 869 sa_register_update_callback(os, zfs_sa_upgrade); 870 871 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, 872 &zfsvfs->z_root); 873 if (error != 0) 874 return (error); 875 ASSERT3U(zfsvfs->z_root, !=, 0); 876 877 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, 878 &zfsvfs->z_unlinkedobj); 879 if (error != 0) 880 return (error); 881 882 error = zap_lookup(os, MASTER_NODE_OBJ, 883 zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], 884 8, 1, &zfsvfs->z_userquota_obj); 885 if (error == ENOENT) 886 zfsvfs->z_userquota_obj = 0; 887 else if (error != 0) 888 return (error); 889 890 error = zap_lookup(os, MASTER_NODE_OBJ, 891 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], 892 8, 1, &zfsvfs->z_groupquota_obj); 893 if (error == ENOENT) 894 zfsvfs->z_groupquota_obj = 0; 895 else if (error != 0) 896 return (error); 897 898 error = zap_lookup(os, MASTER_NODE_OBJ, 899 zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA], 900 8, 1, &zfsvfs->z_projectquota_obj); 901 if (error == ENOENT) 902 zfsvfs->z_projectquota_obj = 0; 903 else if (error != 0) 904 return (error); 905 906 error = zap_lookup(os, MASTER_NODE_OBJ, 907 zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA], 908 8, 1, &zfsvfs->z_userobjquota_obj); 909 if (error == ENOENT) 910 zfsvfs->z_userobjquota_obj = 0; 911 else if (error != 0) 912 return (error); 913 914 error = zap_lookup(os, MASTER_NODE_OBJ, 915 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA], 916 8, 1, &zfsvfs->z_groupobjquota_obj); 917 if (error == ENOENT) 918 zfsvfs->z_groupobjquota_obj = 0; 919 else if (error != 0) 920 return (error); 921 922 error = zap_lookup(os, MASTER_NODE_OBJ, 923 zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA], 924 8, 1, &zfsvfs->z_projectobjquota_obj); 925 if (error == ENOENT) 926 zfsvfs->z_projectobjquota_obj = 0; 927 else if (error != 0) 928 return (error); 929 930 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, 931 &zfsvfs->z_fuid_obj); 932 if (error == ENOENT) 933 zfsvfs->z_fuid_obj = 0; 934 else if (error != 0) 935 return (error); 936 937 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, 938 &zfsvfs->z_shares_dir); 939 if (error == ENOENT) 940 zfsvfs->z_shares_dir = 0; 941 else if (error != 0) 942 return (error); 943 944 /* 945 * Only use the name cache if we are looking for a 946 * name on a file system that does not require normalization 947 * or case folding. We can also look there if we happen to be 948 * on a non-normalizing, mixed sensitivity file system IF we 949 * are looking for the exact name (which is always the case on 950 * FreeBSD). 951 */ 952 zfsvfs->z_use_namecache = !zfsvfs->z_norm || 953 ((zfsvfs->z_case == ZFS_CASE_MIXED) && 954 !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER)); 955 956 return (0); 957 } 958 959 taskq_t *zfsvfs_taskq; 960 961 static void 962 zfsvfs_task_unlinked_drain(void *context, int pending __unused) 963 { 964 965 zfs_unlinked_drain((zfsvfs_t *)context); 966 } 967 968 int 969 zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp) 970 { 971 objset_t *os; 972 zfsvfs_t *zfsvfs; 973 int error; 974 boolean_t ro = (readonly || (strchr(osname, '@') != NULL)); 975 976 /* 977 * XXX: Fix struct statfs so this isn't necessary! 978 * 979 * The 'osname' is used as the filesystem's special node, which means 980 * it must fit in statfs.f_mntfromname, or else it can't be 981 * enumerated, so libzfs_mnttab_find() returns NULL, which causes 982 * 'zfs unmount' to think it's not mounted when it is. 983 */ 984 if (strlen(osname) >= MNAMELEN) 985 return (SET_ERROR(ENAMETOOLONG)); 986 987 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); 988 989 error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, 990 &os); 991 if (error != 0) { 992 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 993 return (error); 994 } 995 996 error = zfsvfs_create_impl(zfvp, zfsvfs, os); 997 998 return (error); 999 } 1000 1001 1002 int 1003 zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) 1004 { 1005 int error; 1006 1007 zfsvfs->z_vfs = NULL; 1008 zfsvfs->z_parent = zfsvfs; 1009 1010 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); 1011 mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); 1012 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), 1013 offsetof(znode_t, z_link_node)); 1014 TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0, 1015 zfsvfs_task_unlinked_drain, zfsvfs); 1016 ZFS_TEARDOWN_INIT(zfsvfs); 1017 ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs); 1018 rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); 1019 for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1020 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); 1021 1022 error = zfsvfs_init(zfsvfs, os); 1023 if (error != 0) { 1024 dmu_objset_disown(os, B_TRUE, zfsvfs); 1025 *zfvp = NULL; 1026 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1027 return (error); 1028 } 1029 1030 *zfvp = zfsvfs; 1031 return (0); 1032 } 1033 1034 static int 1035 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) 1036 { 1037 int error; 1038 1039 /* 1040 * Check for a bad on-disk format version now since we 1041 * lied about owning the dataset readonly before. 1042 */ 1043 if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && 1044 dmu_objset_incompatible_encryption_version(zfsvfs->z_os)) 1045 return (SET_ERROR(EROFS)); 1046 1047 error = zfs_register_callbacks(zfsvfs->z_vfs); 1048 if (error) 1049 return (error); 1050 1051 /* 1052 * If we are not mounting (ie: online recv), then we don't 1053 * have to worry about replaying the log as we blocked all 1054 * operations out since we closed the ZIL. 1055 */ 1056 if (mounting) { 1057 boolean_t readonly; 1058 1059 ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL); 1060 error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os); 1061 if (error) 1062 return (error); 1063 zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data, 1064 &zfsvfs->z_kstat.dk_zil_sums); 1065 1066 /* 1067 * During replay we remove the read only flag to 1068 * allow replays to succeed. 1069 */ 1070 readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; 1071 if (readonly != 0) { 1072 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; 1073 } else { 1074 dsl_dir_t *dd; 1075 zap_stats_t zs; 1076 1077 if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj, 1078 &zs) == 0) { 1079 dataset_kstats_update_nunlinks_kstat( 1080 &zfsvfs->z_kstat, zs.zs_num_entries); 1081 dprintf_ds(zfsvfs->z_os->os_dsl_dataset, 1082 "num_entries in unlinked set: %llu", 1083 (u_longlong_t)zs.zs_num_entries); 1084 } 1085 1086 zfs_unlinked_drain(zfsvfs); 1087 dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; 1088 dd->dd_activity_cancelled = B_FALSE; 1089 } 1090 1091 /* 1092 * Parse and replay the intent log. 1093 * 1094 * Because of ziltest, this must be done after 1095 * zfs_unlinked_drain(). (Further note: ziltest 1096 * doesn't use readonly mounts, where 1097 * zfs_unlinked_drain() isn't called.) This is because 1098 * ziltest causes spa_sync() to think it's committed, 1099 * but actually it is not, so the intent log contains 1100 * many txg's worth of changes. 1101 * 1102 * In particular, if object N is in the unlinked set in 1103 * the last txg to actually sync, then it could be 1104 * actually freed in a later txg and then reallocated 1105 * in a yet later txg. This would write a "create 1106 * object N" record to the intent log. Normally, this 1107 * would be fine because the spa_sync() would have 1108 * written out the fact that object N is free, before 1109 * we could write the "create object N" intent log 1110 * record. 1111 * 1112 * But when we are in ziltest mode, we advance the "open 1113 * txg" without actually spa_sync()-ing the changes to 1114 * disk. So we would see that object N is still 1115 * allocated and in the unlinked set, and there is an 1116 * intent log record saying to allocate it. 1117 */ 1118 if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { 1119 if (zil_replay_disable) { 1120 zil_destroy(zfsvfs->z_log, B_FALSE); 1121 } else { 1122 boolean_t use_nc = zfsvfs->z_use_namecache; 1123 zfsvfs->z_use_namecache = B_FALSE; 1124 zfsvfs->z_replay = B_TRUE; 1125 zil_replay(zfsvfs->z_os, zfsvfs, 1126 zfs_replay_vector); 1127 zfsvfs->z_replay = B_FALSE; 1128 zfsvfs->z_use_namecache = use_nc; 1129 } 1130 } 1131 1132 /* restore readonly bit */ 1133 if (readonly != 0) 1134 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; 1135 } else { 1136 ASSERT3P(zfsvfs->z_kstat.dk_kstats, !=, NULL); 1137 zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data, 1138 &zfsvfs->z_kstat.dk_zil_sums); 1139 } 1140 1141 /* 1142 * Set the objset user_ptr to track its zfsvfs. 1143 */ 1144 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); 1145 dmu_objset_set_user(zfsvfs->z_os, zfsvfs); 1146 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); 1147 1148 return (0); 1149 } 1150 1151 void 1152 zfsvfs_free(zfsvfs_t *zfsvfs) 1153 { 1154 int i; 1155 1156 zfs_fuid_destroy(zfsvfs); 1157 1158 mutex_destroy(&zfsvfs->z_znodes_lock); 1159 mutex_destroy(&zfsvfs->z_lock); 1160 list_destroy(&zfsvfs->z_all_znodes); 1161 ZFS_TEARDOWN_DESTROY(zfsvfs); 1162 ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs); 1163 rw_destroy(&zfsvfs->z_fuid_lock); 1164 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1165 mutex_destroy(&zfsvfs->z_hold_mtx[i]); 1166 dataset_kstats_destroy(&zfsvfs->z_kstat); 1167 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1168 } 1169 1170 static void 1171 zfs_set_fuid_feature(zfsvfs_t *zfsvfs) 1172 { 1173 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); 1174 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); 1175 } 1176 1177 static int 1178 zfs_domount(vfs_t *vfsp, char *osname) 1179 { 1180 uint64_t recordsize, fsid_guid; 1181 int error = 0; 1182 zfsvfs_t *zfsvfs; 1183 1184 ASSERT3P(vfsp, !=, NULL); 1185 ASSERT3P(osname, !=, NULL); 1186 1187 error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs); 1188 if (error) 1189 return (error); 1190 zfsvfs->z_vfs = vfsp; 1191 1192 if ((error = dsl_prop_get_integer(osname, 1193 "recordsize", &recordsize, NULL))) 1194 goto out; 1195 zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE; 1196 zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize; 1197 1198 vfsp->vfs_data = zfsvfs; 1199 vfsp->mnt_flag |= MNT_LOCAL; 1200 vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; 1201 vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES; 1202 vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED; 1203 /* 1204 * This can cause a loss of coherence between ARC and page cache 1205 * on ZoF - unclear if the problem is in FreeBSD or ZoF 1206 */ 1207 vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */ 1208 vfsp->mnt_kern_flag |= MNTK_NOMSYNC; 1209 vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG; 1210 1211 #if defined(_KERNEL) && !defined(KMEM_DEBUG) 1212 vfsp->mnt_kern_flag |= MNTK_FPLOOKUP; 1213 #endif 1214 /* 1215 * The fsid is 64 bits, composed of an 8-bit fs type, which 1216 * separates our fsid from any other filesystem types, and a 1217 * 56-bit objset unique ID. The objset unique ID is unique to 1218 * all objsets open on this system, provided by unique_create(). 1219 * The 8-bit fs type must be put in the low bits of fsid[1] 1220 * because that's where other Solaris filesystems put it. 1221 */ 1222 fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); 1223 ASSERT3U((fsid_guid & ~((1ULL << 56) - 1)), ==, 0); 1224 vfsp->vfs_fsid.val[0] = fsid_guid; 1225 vfsp->vfs_fsid.val[1] = ((fsid_guid >> 32) << 8) | 1226 (vfsp->mnt_vfc->vfc_typenum & 0xFF); 1227 1228 /* 1229 * Set features for file system. 1230 */ 1231 zfs_set_fuid_feature(zfsvfs); 1232 1233 if (dmu_objset_is_snapshot(zfsvfs->z_os)) { 1234 uint64_t pval; 1235 1236 atime_changed_cb(zfsvfs, B_FALSE); 1237 readonly_changed_cb(zfsvfs, B_TRUE); 1238 if ((error = dsl_prop_get_integer(osname, 1239 "xattr", &pval, NULL))) 1240 goto out; 1241 xattr_changed_cb(zfsvfs, pval); 1242 if ((error = dsl_prop_get_integer(osname, 1243 "acltype", &pval, NULL))) 1244 goto out; 1245 acl_type_changed_cb(zfsvfs, pval); 1246 zfsvfs->z_issnap = B_TRUE; 1247 zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; 1248 1249 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); 1250 dmu_objset_set_user(zfsvfs->z_os, zfsvfs); 1251 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); 1252 } else { 1253 if ((error = zfsvfs_setup(zfsvfs, B_TRUE))) 1254 goto out; 1255 } 1256 1257 vfs_mountedfrom(vfsp, osname); 1258 1259 if (!zfsvfs->z_issnap) 1260 zfsctl_create(zfsvfs); 1261 out: 1262 if (error) { 1263 dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); 1264 zfsvfs_free(zfsvfs); 1265 } else { 1266 atomic_inc_32(&zfs_active_fs_count); 1267 } 1268 1269 return (error); 1270 } 1271 1272 static void 1273 zfs_unregister_callbacks(zfsvfs_t *zfsvfs) 1274 { 1275 objset_t *os = zfsvfs->z_os; 1276 1277 if (!dmu_objset_is_snapshot(os)) 1278 dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); 1279 } 1280 1281 static int 1282 getpoolname(const char *osname, char *poolname) 1283 { 1284 char *p; 1285 1286 p = strchr(osname, '/'); 1287 if (p == NULL) { 1288 if (strlen(osname) >= MAXNAMELEN) 1289 return (ENAMETOOLONG); 1290 (void) strcpy(poolname, osname); 1291 } else { 1292 if (p - osname >= MAXNAMELEN) 1293 return (ENAMETOOLONG); 1294 (void) strlcpy(poolname, osname, p - osname + 1); 1295 } 1296 return (0); 1297 } 1298 1299 static void 1300 fetch_osname_options(char *name, bool *checkpointrewind) 1301 { 1302 1303 if (name[0] == '!') { 1304 *checkpointrewind = true; 1305 memmove(name, name + 1, strlen(name)); 1306 } else { 1307 *checkpointrewind = false; 1308 } 1309 } 1310 1311 static int 1312 zfs_mount(vfs_t *vfsp) 1313 { 1314 kthread_t *td = curthread; 1315 vnode_t *mvp = vfsp->mnt_vnodecovered; 1316 cred_t *cr = td->td_ucred; 1317 char *osname; 1318 int error = 0; 1319 int canwrite; 1320 bool checkpointrewind, isctlsnap = false; 1321 1322 if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL)) 1323 return (SET_ERROR(EINVAL)); 1324 1325 /* 1326 * If full-owner-access is enabled and delegated administration is 1327 * turned on, we must set nosuid. 1328 */ 1329 if (zfs_super_owner && 1330 dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) { 1331 secpolicy_fs_mount_clearopts(cr, vfsp); 1332 } 1333 1334 fetch_osname_options(osname, &checkpointrewind); 1335 isctlsnap = (mvp != NULL && zfsctl_is_node(mvp) && 1336 strchr(osname, '@') != NULL); 1337 1338 /* 1339 * Check for mount privilege? 1340 * 1341 * If we don't have privilege then see if 1342 * we have local permission to allow it 1343 */ 1344 error = secpolicy_fs_mount(cr, mvp, vfsp); 1345 if (error && isctlsnap) { 1346 secpolicy_fs_mount_clearopts(cr, vfsp); 1347 } else if (error) { 1348 if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0) 1349 goto out; 1350 1351 if (!(vfsp->vfs_flag & MS_REMOUNT)) { 1352 vattr_t vattr; 1353 1354 /* 1355 * Make sure user is the owner of the mount point 1356 * or has sufficient privileges. 1357 */ 1358 1359 vattr.va_mask = AT_UID; 1360 1361 vn_lock(mvp, LK_SHARED | LK_RETRY); 1362 if (VOP_GETATTR(mvp, &vattr, cr)) { 1363 VOP_UNLOCK(mvp); 1364 goto out; 1365 } 1366 1367 if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && 1368 VOP_ACCESS(mvp, VWRITE, cr, td) != 0) { 1369 VOP_UNLOCK(mvp); 1370 goto out; 1371 } 1372 VOP_UNLOCK(mvp); 1373 } 1374 1375 secpolicy_fs_mount_clearopts(cr, vfsp); 1376 } 1377 1378 /* 1379 * Refuse to mount a filesystem if we are in a local zone and the 1380 * dataset is not visible. 1381 */ 1382 if (!INGLOBALZONE(curproc) && 1383 (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { 1384 boolean_t mount_snapshot = B_FALSE; 1385 1386 /* 1387 * Snapshots may be mounted in .zfs for unjailed datasets 1388 * if allowed by the jail param zfs.mount_snapshot. 1389 */ 1390 if (isctlsnap) { 1391 struct prison *pr; 1392 struct zfs_jailparam *zjp; 1393 1394 pr = curthread->td_ucred->cr_prison; 1395 mtx_lock(&pr->pr_mtx); 1396 zjp = osd_jail_get(pr, zfs_jailparam_slot); 1397 mtx_unlock(&pr->pr_mtx); 1398 if (zjp && zjp->mount_snapshot) 1399 mount_snapshot = B_TRUE; 1400 } 1401 if (!mount_snapshot) { 1402 error = SET_ERROR(EPERM); 1403 goto out; 1404 } 1405 } 1406 1407 vfsp->vfs_flag |= MNT_NFS4ACLS; 1408 1409 /* 1410 * When doing a remount, we simply refresh our temporary properties 1411 * according to those options set in the current VFS options. 1412 */ 1413 if (vfsp->vfs_flag & MS_REMOUNT) { 1414 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1415 1416 /* 1417 * Refresh mount options with z_teardown_lock blocking I/O while 1418 * the filesystem is in an inconsistent state. 1419 * The lock also serializes this code with filesystem 1420 * manipulations between entry to zfs_suspend_fs() and return 1421 * from zfs_resume_fs(). 1422 */ 1423 ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); 1424 zfs_unregister_callbacks(zfsvfs); 1425 error = zfs_register_callbacks(vfsp); 1426 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); 1427 goto out; 1428 } 1429 1430 /* Initial root mount: try hard to import the requested root pool. */ 1431 if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 && 1432 (vfsp->vfs_flag & MNT_UPDATE) == 0) { 1433 char pname[MAXNAMELEN]; 1434 1435 error = getpoolname(osname, pname); 1436 if (error == 0) 1437 error = spa_import_rootpool(pname, checkpointrewind); 1438 if (error) 1439 goto out; 1440 } 1441 DROP_GIANT(); 1442 error = zfs_domount(vfsp, osname); 1443 PICKUP_GIANT(); 1444 1445 out: 1446 return (error); 1447 } 1448 1449 static int 1450 zfs_statfs(vfs_t *vfsp, struct statfs *statp) 1451 { 1452 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1453 uint64_t refdbytes, availbytes, usedobjs, availobjs; 1454 int error; 1455 1456 statp->f_version = STATFS_VERSION; 1457 1458 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) 1459 return (error); 1460 1461 dmu_objset_space(zfsvfs->z_os, 1462 &refdbytes, &availbytes, &usedobjs, &availobjs); 1463 1464 /* 1465 * The underlying storage pool actually uses multiple block sizes. 1466 * We report the fragsize as the smallest block size we support, 1467 * and we report our blocksize as the filesystem's maximum blocksize. 1468 */ 1469 statp->f_bsize = SPA_MINBLOCKSIZE; 1470 statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize; 1471 1472 /* 1473 * The following report "total" blocks of various kinds in the 1474 * file system, but reported in terms of f_frsize - the 1475 * "fragment" size. 1476 */ 1477 1478 statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; 1479 statp->f_bfree = availbytes / statp->f_bsize; 1480 statp->f_bavail = statp->f_bfree; /* no root reservation */ 1481 1482 /* 1483 * statvfs() should really be called statufs(), because it assumes 1484 * static metadata. ZFS doesn't preallocate files, so the best 1485 * we can do is report the max that could possibly fit in f_files, 1486 * and that minus the number actually used in f_ffree. 1487 * For f_ffree, report the smaller of the number of object available 1488 * and the number of blocks (each object will take at least a block). 1489 */ 1490 statp->f_ffree = MIN(availobjs, statp->f_bfree); 1491 statp->f_files = statp->f_ffree + usedobjs; 1492 1493 /* 1494 * We're a zfs filesystem. 1495 */ 1496 strlcpy(statp->f_fstypename, "zfs", 1497 sizeof (statp->f_fstypename)); 1498 1499 strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, 1500 sizeof (statp->f_mntfromname)); 1501 strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, 1502 sizeof (statp->f_mntonname)); 1503 1504 statp->f_namemax = 1505 zfsvfs->z_longname ? (ZAP_MAXNAMELEN_NEW - 1) : (MAXNAMELEN - 1); 1506 1507 zfs_exit(zfsvfs, FTAG); 1508 return (0); 1509 } 1510 1511 static int 1512 zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) 1513 { 1514 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1515 znode_t *rootzp; 1516 int error; 1517 1518 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) 1519 return (error); 1520 1521 error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); 1522 if (error == 0) 1523 *vpp = ZTOV(rootzp); 1524 1525 zfs_exit(zfsvfs, FTAG); 1526 1527 if (error == 0) { 1528 error = vn_lock(*vpp, flags); 1529 if (error != 0) { 1530 VN_RELE(*vpp); 1531 *vpp = NULL; 1532 } 1533 } 1534 return (error); 1535 } 1536 1537 /* 1538 * Teardown the zfsvfs::z_os. 1539 * 1540 * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' 1541 * and 'z_teardown_inactive_lock' held. 1542 */ 1543 static int 1544 zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) 1545 { 1546 znode_t *zp; 1547 dsl_dir_t *dd; 1548 1549 /* 1550 * If someone has not already unmounted this file system, 1551 * drain the zrele_taskq to ensure all active references to the 1552 * zfsvfs_t have been handled only then can it be safely destroyed. 1553 */ 1554 if (zfsvfs->z_os) { 1555 /* 1556 * If we're unmounting we have to wait for the list to 1557 * drain completely. 1558 * 1559 * If we're not unmounting there's no guarantee the list 1560 * will drain completely, but zreles run from the taskq 1561 * may add the parents of dir-based xattrs to the taskq 1562 * so we want to wait for these. 1563 * 1564 * We can safely check z_all_znodes for being empty because the 1565 * VFS has already blocked operations which add to it. 1566 */ 1567 int round = 0; 1568 while (!list_is_empty(&zfsvfs->z_all_znodes)) { 1569 taskq_wait_outstanding(dsl_pool_zrele_taskq( 1570 dmu_objset_pool(zfsvfs->z_os)), 0); 1571 if (++round > 1 && !unmounting) 1572 break; 1573 } 1574 } 1575 ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); 1576 1577 if (!unmounting) { 1578 /* 1579 * We purge the parent filesystem's vfsp as the parent 1580 * filesystem and all of its snapshots have their vnode's 1581 * v_vfsp set to the parent's filesystem's vfsp. Note, 1582 * 'z_parent' is self referential for non-snapshots. 1583 */ 1584 #ifdef FREEBSD_NAMECACHE 1585 cache_purgevfs(zfsvfs->z_parent->z_vfs); 1586 #endif 1587 } 1588 1589 /* 1590 * Close the zil. NB: Can't close the zil while zfs_inactive 1591 * threads are blocked as zil_close can call zfs_inactive. 1592 */ 1593 if (zfsvfs->z_log) { 1594 zil_close(zfsvfs->z_log); 1595 zfsvfs->z_log = NULL; 1596 } 1597 1598 ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs); 1599 1600 /* 1601 * If we are not unmounting (ie: online recv) and someone already 1602 * unmounted this file system while we were doing the switcheroo, 1603 * or a reopen of z_os failed then just bail out now. 1604 */ 1605 if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { 1606 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); 1607 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); 1608 return (SET_ERROR(EIO)); 1609 } 1610 1611 /* 1612 * At this point there are no vops active, and any new vops will 1613 * fail with EIO since we have z_teardown_lock for writer (only 1614 * relevant for forced unmount). 1615 * 1616 * Release all holds on dbufs. 1617 */ 1618 mutex_enter(&zfsvfs->z_znodes_lock); 1619 for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; 1620 zp = list_next(&zfsvfs->z_all_znodes, zp)) { 1621 if (zp->z_sa_hdl != NULL) { 1622 zfs_znode_dmu_fini(zp); 1623 } 1624 } 1625 mutex_exit(&zfsvfs->z_znodes_lock); 1626 1627 /* 1628 * If we are unmounting, set the unmounted flag and let new vops 1629 * unblock. zfs_inactive will have the unmounted behavior, and all 1630 * other vops will fail with EIO. 1631 */ 1632 if (unmounting) { 1633 zfsvfs->z_unmounted = B_TRUE; 1634 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); 1635 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); 1636 } 1637 1638 /* 1639 * z_os will be NULL if there was an error in attempting to reopen 1640 * zfsvfs, so just return as the properties had already been 1641 * unregistered and cached data had been evicted before. 1642 */ 1643 if (zfsvfs->z_os == NULL) 1644 return (0); 1645 1646 /* 1647 * Unregister properties. 1648 */ 1649 zfs_unregister_callbacks(zfsvfs); 1650 1651 /* 1652 * Evict cached data. We must write out any dirty data before 1653 * disowning the dataset. 1654 */ 1655 objset_t *os = zfsvfs->z_os; 1656 boolean_t os_dirty = B_FALSE; 1657 for (int t = 0; t < TXG_SIZE; t++) { 1658 if (dmu_objset_is_dirty(os, t)) { 1659 os_dirty = B_TRUE; 1660 break; 1661 } 1662 } 1663 if (!zfs_is_readonly(zfsvfs) && os_dirty) 1664 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); 1665 dmu_objset_evict_dbufs(zfsvfs->z_os); 1666 dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; 1667 dsl_dir_cancel_waiters(dd); 1668 1669 return (0); 1670 } 1671 1672 static int 1673 zfs_umount(vfs_t *vfsp, int fflag) 1674 { 1675 kthread_t *td = curthread; 1676 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1677 objset_t *os; 1678 cred_t *cr = td->td_ucred; 1679 int ret; 1680 1681 ret = secpolicy_fs_unmount(cr, vfsp); 1682 if (ret) { 1683 if (dsl_deleg_access((char *)vfsp->vfs_resource, 1684 ZFS_DELEG_PERM_MOUNT, cr)) 1685 return (ret); 1686 } 1687 1688 /* 1689 * Unmount any snapshots mounted under .zfs before unmounting the 1690 * dataset itself. 1691 */ 1692 if (zfsvfs->z_ctldir != NULL) { 1693 if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) 1694 return (ret); 1695 } 1696 1697 if (fflag & MS_FORCE) { 1698 /* 1699 * Mark file system as unmounted before calling 1700 * vflush(FORCECLOSE). This way we ensure no future vnops 1701 * will be called and risk operating on DOOMED vnodes. 1702 */ 1703 ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); 1704 zfsvfs->z_unmounted = B_TRUE; 1705 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); 1706 } 1707 1708 /* 1709 * Flush all the files. 1710 */ 1711 ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td); 1712 if (ret != 0) 1713 return (ret); 1714 while (taskqueue_cancel(zfsvfs_taskq->tq_queue, 1715 &zfsvfs->z_unlinked_drain_task, NULL) != 0) 1716 taskqueue_drain(zfsvfs_taskq->tq_queue, 1717 &zfsvfs->z_unlinked_drain_task); 1718 1719 VERIFY0(zfsvfs_teardown(zfsvfs, B_TRUE)); 1720 os = zfsvfs->z_os; 1721 1722 /* 1723 * z_os will be NULL if there was an error in 1724 * attempting to reopen zfsvfs. 1725 */ 1726 if (os != NULL) { 1727 /* 1728 * Unset the objset user_ptr. 1729 */ 1730 mutex_enter(&os->os_user_ptr_lock); 1731 dmu_objset_set_user(os, NULL); 1732 mutex_exit(&os->os_user_ptr_lock); 1733 1734 /* 1735 * Finally release the objset 1736 */ 1737 dmu_objset_disown(os, B_TRUE, zfsvfs); 1738 } 1739 1740 /* 1741 * We can now safely destroy the '.zfs' directory node. 1742 */ 1743 if (zfsvfs->z_ctldir != NULL) 1744 zfsctl_destroy(zfsvfs); 1745 zfs_freevfs(vfsp); 1746 1747 return (0); 1748 } 1749 1750 static int 1751 zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) 1752 { 1753 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1754 znode_t *zp; 1755 int err; 1756 1757 /* 1758 * zfs_zget() can't operate on virtual entries like .zfs/ or 1759 * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP. 1760 * This will make NFS to switch to LOOKUP instead of using VGET. 1761 */ 1762 if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR || 1763 (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir)) 1764 return (EOPNOTSUPP); 1765 1766 if ((err = zfs_enter(zfsvfs, FTAG)) != 0) 1767 return (err); 1768 err = zfs_zget(zfsvfs, ino, &zp); 1769 if (err == 0 && zp->z_unlinked) { 1770 vrele(ZTOV(zp)); 1771 err = EINVAL; 1772 } 1773 if (err == 0) 1774 *vpp = ZTOV(zp); 1775 zfs_exit(zfsvfs, FTAG); 1776 if (err == 0) { 1777 err = vn_lock(*vpp, flags); 1778 if (err != 0) 1779 vrele(*vpp); 1780 } 1781 if (err != 0) 1782 *vpp = NULL; 1783 return (err); 1784 } 1785 1786 static int 1787 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp, 1788 struct ucred **credanonp, int *numsecflavors, int *secflavors) 1789 { 1790 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1791 1792 /* 1793 * If this is regular file system vfsp is the same as 1794 * zfsvfs->z_parent->z_vfs, but if it is snapshot, 1795 * zfsvfs->z_parent->z_vfs represents parent file system 1796 * which we have to use here, because only this file system 1797 * has mnt_export configured. 1798 */ 1799 return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp, 1800 credanonp, numsecflavors, secflavors)); 1801 } 1802 1803 _Static_assert(sizeof (struct fid) >= SHORT_FID_LEN, 1804 "struct fid bigger than SHORT_FID_LEN"); 1805 _Static_assert(sizeof (struct fid) >= LONG_FID_LEN, 1806 "struct fid bigger than LONG_FID_LEN"); 1807 1808 static int 1809 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) 1810 { 1811 struct componentname cn; 1812 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1813 znode_t *zp; 1814 vnode_t *dvp; 1815 uint64_t object = 0; 1816 uint64_t fid_gen = 0; 1817 uint64_t setgen = 0; 1818 uint64_t gen_mask; 1819 uint64_t zp_gen; 1820 int i, err; 1821 1822 *vpp = NULL; 1823 1824 if ((err = zfs_enter(zfsvfs, FTAG)) != 0) 1825 return (err); 1826 1827 /* 1828 * On FreeBSD we can get snapshot's mount point or its parent file 1829 * system mount point depending if snapshot is already mounted or not. 1830 */ 1831 if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) { 1832 zfid_long_t *zlfid = (zfid_long_t *)fidp; 1833 uint64_t objsetid = 0; 1834 1835 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 1836 objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); 1837 1838 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 1839 setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); 1840 1841 zfs_exit(zfsvfs, FTAG); 1842 1843 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); 1844 if (err) 1845 return (SET_ERROR(EINVAL)); 1846 if ((err = zfs_enter(zfsvfs, FTAG)) != 0) 1847 return (err); 1848 } 1849 1850 if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { 1851 zfid_short_t *zfid = (zfid_short_t *)fidp; 1852 1853 for (i = 0; i < sizeof (zfid->zf_object); i++) 1854 object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); 1855 1856 for (i = 0; i < sizeof (zfid->zf_gen); i++) 1857 fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); 1858 } else { 1859 zfs_exit(zfsvfs, FTAG); 1860 return (SET_ERROR(EINVAL)); 1861 } 1862 1863 if (fidp->fid_len == LONG_FID_LEN && setgen != 0) { 1864 zfs_exit(zfsvfs, FTAG); 1865 dprintf("snapdir fid: fid_gen (%llu) and setgen (%llu)\n", 1866 (u_longlong_t)fid_gen, (u_longlong_t)setgen); 1867 return (SET_ERROR(EINVAL)); 1868 } 1869 1870 /* 1871 * A zero fid_gen means we are in .zfs or the .zfs/snapshot 1872 * directory tree. If the object == zfsvfs->z_shares_dir, then 1873 * we are in the .zfs/shares directory tree. 1874 */ 1875 if ((fid_gen == 0 && 1876 (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) || 1877 (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) { 1878 zfs_exit(zfsvfs, FTAG); 1879 VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp)); 1880 if (object == ZFSCTL_INO_SNAPDIR) { 1881 cn.cn_nameptr = "snapshot"; 1882 cn.cn_namelen = strlen(cn.cn_nameptr); 1883 cn.cn_nameiop = LOOKUP; 1884 cn.cn_flags = ISLASTCN | LOCKLEAF; 1885 cn.cn_lkflags = flags; 1886 VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); 1887 vput(dvp); 1888 } else if (object == zfsvfs->z_shares_dir) { 1889 /* 1890 * XXX This branch must not be taken, 1891 * if it is, then the lookup below will 1892 * explode. 1893 */ 1894 cn.cn_nameptr = "shares"; 1895 cn.cn_namelen = strlen(cn.cn_nameptr); 1896 cn.cn_nameiop = LOOKUP; 1897 cn.cn_flags = ISLASTCN; 1898 cn.cn_lkflags = flags; 1899 VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); 1900 vput(dvp); 1901 } else { 1902 *vpp = dvp; 1903 } 1904 return (err); 1905 } 1906 1907 gen_mask = -1ULL >> (64 - 8 * i); 1908 1909 dprintf("getting %llu [%llu mask %llx]\n", (u_longlong_t)object, 1910 (u_longlong_t)fid_gen, 1911 (u_longlong_t)gen_mask); 1912 if ((err = zfs_zget(zfsvfs, object, &zp))) { 1913 zfs_exit(zfsvfs, FTAG); 1914 return (err); 1915 } 1916 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, 1917 sizeof (uint64_t)); 1918 zp_gen = zp_gen & gen_mask; 1919 if (zp_gen == 0) 1920 zp_gen = 1; 1921 if (zp->z_unlinked || zp_gen != fid_gen) { 1922 dprintf("znode gen (%llu) != fid gen (%llu)\n", 1923 (u_longlong_t)zp_gen, (u_longlong_t)fid_gen); 1924 vrele(ZTOV(zp)); 1925 zfs_exit(zfsvfs, FTAG); 1926 return (SET_ERROR(EINVAL)); 1927 } 1928 1929 *vpp = ZTOV(zp); 1930 zfs_exit(zfsvfs, FTAG); 1931 err = vn_lock(*vpp, flags); 1932 if (err == 0) 1933 vnode_create_vobject(*vpp, zp->z_size, curthread); 1934 else 1935 *vpp = NULL; 1936 return (err); 1937 } 1938 1939 /* 1940 * Block out VOPs and close zfsvfs_t::z_os 1941 * 1942 * Note, if successful, then we return with the 'z_teardown_lock' and 1943 * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying 1944 * dataset and objset intact so that they can be atomically handed off during 1945 * a subsequent rollback or recv operation and the resume thereafter. 1946 */ 1947 int 1948 zfs_suspend_fs(zfsvfs_t *zfsvfs) 1949 { 1950 int error; 1951 1952 if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) 1953 return (error); 1954 1955 return (0); 1956 } 1957 1958 /* 1959 * Rebuild SA and release VOPs. Note that ownership of the underlying dataset 1960 * is an invariant across any of the operations that can be performed while the 1961 * filesystem was suspended. Whether it succeeded or failed, the preconditions 1962 * are the same: the relevant objset and associated dataset are owned by 1963 * zfsvfs, held, and long held on entry. 1964 */ 1965 int 1966 zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) 1967 { 1968 int err; 1969 znode_t *zp; 1970 1971 ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs)); 1972 ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs)); 1973 1974 /* 1975 * We already own this, so just update the objset_t, as the one we 1976 * had before may have been evicted. 1977 */ 1978 objset_t *os; 1979 VERIFY3P(ds->ds_owner, ==, zfsvfs); 1980 VERIFY(dsl_dataset_long_held(ds)); 1981 dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); 1982 dsl_pool_config_enter(dp, FTAG); 1983 VERIFY0(dmu_objset_from_ds(ds, &os)); 1984 dsl_pool_config_exit(dp, FTAG); 1985 1986 err = zfsvfs_init(zfsvfs, os); 1987 if (err != 0) 1988 goto bail; 1989 1990 ds->ds_dir->dd_activity_cancelled = B_FALSE; 1991 VERIFY0(zfsvfs_setup(zfsvfs, B_FALSE)); 1992 1993 zfs_set_fuid_feature(zfsvfs); 1994 1995 /* 1996 * Attempt to re-establish all the active znodes with 1997 * their dbufs. If a zfs_rezget() fails, then we'll let 1998 * any potential callers discover that via zfs_enter_verify_zp 1999 * when they try to use their znode. 2000 */ 2001 mutex_enter(&zfsvfs->z_znodes_lock); 2002 for (zp = list_head(&zfsvfs->z_all_znodes); zp; 2003 zp = list_next(&zfsvfs->z_all_znodes, zp)) { 2004 (void) zfs_rezget(zp); 2005 } 2006 mutex_exit(&zfsvfs->z_znodes_lock); 2007 2008 bail: 2009 /* release the VOPs */ 2010 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); 2011 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); 2012 2013 if (err) { 2014 /* 2015 * Since we couldn't setup the sa framework, try to force 2016 * unmount this file system. 2017 */ 2018 if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) { 2019 vfs_ref(zfsvfs->z_vfs); 2020 (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread); 2021 } 2022 } 2023 return (err); 2024 } 2025 2026 static void 2027 zfs_freevfs(vfs_t *vfsp) 2028 { 2029 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2030 2031 zfsvfs_free(zfsvfs); 2032 2033 atomic_dec_32(&zfs_active_fs_count); 2034 } 2035 2036 #ifdef __i386__ 2037 static int desiredvnodes_backup; 2038 #include <sys/vmmeter.h> 2039 2040 2041 #include <vm/vm_page.h> 2042 #include <vm/vm_object.h> 2043 #include <vm/vm_kern.h> 2044 #include <vm/vm_map.h> 2045 #endif 2046 2047 static void 2048 zfs_vnodes_adjust(void) 2049 { 2050 #ifdef __i386__ 2051 int newdesiredvnodes; 2052 2053 desiredvnodes_backup = desiredvnodes; 2054 2055 /* 2056 * We calculate newdesiredvnodes the same way it is done in 2057 * vntblinit(). If it is equal to desiredvnodes, it means that 2058 * it wasn't tuned by the administrator and we can tune it down. 2059 */ 2060 newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 * 2061 vm_kmem_size / (5 * (sizeof (struct vm_object) + 2062 sizeof (struct vnode)))); 2063 if (newdesiredvnodes == desiredvnodes) 2064 desiredvnodes = (3 * newdesiredvnodes) / 4; 2065 #endif 2066 } 2067 2068 static void 2069 zfs_vnodes_adjust_back(void) 2070 { 2071 2072 #ifdef __i386__ 2073 desiredvnodes = desiredvnodes_backup; 2074 #endif 2075 } 2076 2077 static struct sx zfs_vnlru_lock; 2078 static struct vnode *zfs_vnlru_marker; 2079 static arc_prune_t *zfs_prune; 2080 2081 static void 2082 zfs_prune_task(uint64_t nr_to_scan, void *arg __unused) 2083 { 2084 if (nr_to_scan > INT_MAX) 2085 nr_to_scan = INT_MAX; 2086 sx_xlock(&zfs_vnlru_lock); 2087 vnlru_free_vfsops(nr_to_scan, &zfs_vfsops, zfs_vnlru_marker); 2088 sx_xunlock(&zfs_vnlru_lock); 2089 } 2090 2091 void 2092 zfs_init(void) 2093 { 2094 2095 printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n"); 2096 2097 /* 2098 * Initialize .zfs directory structures 2099 */ 2100 zfsctl_init(); 2101 2102 /* 2103 * Initialize znode cache, vnode ops, etc... 2104 */ 2105 zfs_znode_init(); 2106 2107 /* 2108 * Reduce number of vnodes. Originally number of vnodes is calculated 2109 * with UFS inode in mind. We reduce it here, because it's too big for 2110 * ZFS/i386. 2111 */ 2112 zfs_vnodes_adjust(); 2113 2114 dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info); 2115 2116 zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0); 2117 2118 zfs_vnlru_marker = vnlru_alloc_marker(); 2119 sx_init(&zfs_vnlru_lock, "zfs vnlru lock"); 2120 zfs_prune = arc_add_prune_callback(zfs_prune_task, NULL); 2121 } 2122 2123 void 2124 zfs_fini(void) 2125 { 2126 arc_remove_prune_callback(zfs_prune); 2127 vnlru_free_marker(zfs_vnlru_marker); 2128 sx_destroy(&zfs_vnlru_lock); 2129 2130 taskq_destroy(zfsvfs_taskq); 2131 zfsctl_fini(); 2132 zfs_znode_fini(); 2133 zfs_vnodes_adjust_back(); 2134 } 2135 2136 int 2137 zfs_busy(void) 2138 { 2139 return (zfs_active_fs_count != 0); 2140 } 2141 2142 /* 2143 * Release VOPs and unmount a suspended filesystem. 2144 */ 2145 int 2146 zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) 2147 { 2148 ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs)); 2149 ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs)); 2150 2151 /* 2152 * We already own this, so just hold and rele it to update the 2153 * objset_t, as the one we had before may have been evicted. 2154 */ 2155 objset_t *os; 2156 VERIFY3P(ds->ds_owner, ==, zfsvfs); 2157 VERIFY(dsl_dataset_long_held(ds)); 2158 dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); 2159 dsl_pool_config_enter(dp, FTAG); 2160 VERIFY0(dmu_objset_from_ds(ds, &os)); 2161 dsl_pool_config_exit(dp, FTAG); 2162 zfsvfs->z_os = os; 2163 2164 /* release the VOPs */ 2165 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); 2166 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); 2167 2168 /* 2169 * Try to force unmount this file system. 2170 */ 2171 (void) zfs_umount(zfsvfs->z_vfs, 0); 2172 zfsvfs->z_unmounted = B_TRUE; 2173 return (0); 2174 } 2175 2176 int 2177 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) 2178 { 2179 int error; 2180 objset_t *os = zfsvfs->z_os; 2181 dmu_tx_t *tx; 2182 2183 if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) 2184 return (SET_ERROR(EINVAL)); 2185 2186 if (newvers < zfsvfs->z_version) 2187 return (SET_ERROR(EINVAL)); 2188 2189 if (zfs_spa_version_map(newvers) > 2190 spa_version(dmu_objset_spa(zfsvfs->z_os))) 2191 return (SET_ERROR(ENOTSUP)); 2192 2193 tx = dmu_tx_create(os); 2194 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); 2195 if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { 2196 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, 2197 ZFS_SA_ATTRS); 2198 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 2199 } 2200 error = dmu_tx_assign(tx, TXG_WAIT); 2201 if (error) { 2202 dmu_tx_abort(tx); 2203 return (error); 2204 } 2205 2206 error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 2207 8, 1, &newvers, tx); 2208 2209 if (error) { 2210 dmu_tx_commit(tx); 2211 return (error); 2212 } 2213 2214 if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { 2215 uint64_t sa_obj; 2216 2217 ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, 2218 SPA_VERSION_SA); 2219 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, 2220 DMU_OT_NONE, 0, tx); 2221 2222 error = zap_add(os, MASTER_NODE_OBJ, 2223 ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); 2224 ASSERT0(error); 2225 2226 VERIFY0(sa_set_sa_object(os, sa_obj)); 2227 sa_register_update_callback(os, zfs_sa_upgrade); 2228 } 2229 2230 spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, 2231 "from %ju to %ju", (uintmax_t)zfsvfs->z_version, 2232 (uintmax_t)newvers); 2233 dmu_tx_commit(tx); 2234 2235 zfsvfs->z_version = newvers; 2236 os->os_version = newvers; 2237 2238 zfs_set_fuid_feature(zfsvfs); 2239 2240 return (0); 2241 } 2242 2243 /* 2244 * Return true if the corresponding vfs's unmounted flag is set. 2245 * Otherwise return false. 2246 * If this function returns true we know VFS unmount has been initiated. 2247 */ 2248 boolean_t 2249 zfs_get_vfs_flag_unmounted(objset_t *os) 2250 { 2251 zfsvfs_t *zfvp; 2252 boolean_t unmounted = B_FALSE; 2253 2254 ASSERT3U(dmu_objset_type(os), ==, DMU_OST_ZFS); 2255 2256 mutex_enter(&os->os_user_ptr_lock); 2257 zfvp = dmu_objset_get_user(os); 2258 if (zfvp != NULL && zfvp->z_vfs != NULL && 2259 (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT)) 2260 unmounted = B_TRUE; 2261 mutex_exit(&os->os_user_ptr_lock); 2262 2263 return (unmounted); 2264 } 2265 2266 #ifdef _KERNEL 2267 void 2268 zfsvfs_update_fromname(const char *oldname, const char *newname) 2269 { 2270 char tmpbuf[MAXPATHLEN]; 2271 struct mount *mp; 2272 char *fromname; 2273 size_t oldlen; 2274 2275 oldlen = strlen(oldname); 2276 2277 mtx_lock(&mountlist_mtx); 2278 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 2279 fromname = mp->mnt_stat.f_mntfromname; 2280 if (strcmp(fromname, oldname) == 0) { 2281 (void) strlcpy(fromname, newname, 2282 sizeof (mp->mnt_stat.f_mntfromname)); 2283 continue; 2284 } 2285 if (strncmp(fromname, oldname, oldlen) == 0 && 2286 (fromname[oldlen] == '/' || fromname[oldlen] == '@')) { 2287 (void) snprintf(tmpbuf, sizeof (tmpbuf), "%s%s", 2288 newname, fromname + oldlen); 2289 (void) strlcpy(fromname, tmpbuf, 2290 sizeof (mp->mnt_stat.f_mntfromname)); 2291 continue; 2292 } 2293 } 2294 mtx_unlock(&mountlist_mtx); 2295 } 2296 #endif 2297 2298 /* 2299 * Find a prison with ZFS info. 2300 * Return the ZFS info and the (locked) prison. 2301 */ 2302 static struct zfs_jailparam * 2303 zfs_jailparam_find(struct prison *spr, struct prison **prp) 2304 { 2305 struct prison *pr; 2306 struct zfs_jailparam *zjp; 2307 2308 for (pr = spr; ; pr = pr->pr_parent) { 2309 mtx_lock(&pr->pr_mtx); 2310 if (pr == &prison0) { 2311 zjp = &zfs_jailparam0; 2312 break; 2313 } 2314 zjp = osd_jail_get(pr, zfs_jailparam_slot); 2315 if (zjp != NULL) 2316 break; 2317 mtx_unlock(&pr->pr_mtx); 2318 } 2319 *prp = pr; 2320 2321 return (zjp); 2322 } 2323 2324 /* 2325 * Ensure a prison has its own ZFS info. If zjpp is non-null, point it to the 2326 * ZFS info and lock the prison. 2327 */ 2328 static void 2329 zfs_jailparam_alloc(struct prison *pr, struct zfs_jailparam **zjpp) 2330 { 2331 struct prison *ppr; 2332 struct zfs_jailparam *zjp, *nzjp; 2333 void **rsv; 2334 2335 /* If this prison already has ZFS info, return that. */ 2336 zjp = zfs_jailparam_find(pr, &ppr); 2337 if (ppr == pr) 2338 goto done; 2339 2340 /* 2341 * Allocate a new info record. Then check again, in case something 2342 * changed during the allocation. 2343 */ 2344 mtx_unlock(&ppr->pr_mtx); 2345 nzjp = malloc(sizeof (struct zfs_jailparam), M_PRISON, M_WAITOK); 2346 rsv = osd_reserve(zfs_jailparam_slot); 2347 zjp = zfs_jailparam_find(pr, &ppr); 2348 if (ppr == pr) { 2349 free(nzjp, M_PRISON); 2350 osd_free_reserved(rsv); 2351 goto done; 2352 } 2353 /* Inherit the initial values from the ancestor. */ 2354 mtx_lock(&pr->pr_mtx); 2355 (void) osd_jail_set_reserved(pr, zfs_jailparam_slot, rsv, nzjp); 2356 (void) memcpy(nzjp, zjp, sizeof (*zjp)); 2357 zjp = nzjp; 2358 mtx_unlock(&ppr->pr_mtx); 2359 done: 2360 if (zjpp != NULL) 2361 *zjpp = zjp; 2362 else 2363 mtx_unlock(&pr->pr_mtx); 2364 } 2365 2366 /* 2367 * Jail OSD methods for ZFS VFS info. 2368 */ 2369 static int 2370 zfs_jailparam_create(void *obj, void *data) 2371 { 2372 struct prison *pr = obj; 2373 struct vfsoptlist *opts = data; 2374 int jsys; 2375 2376 if (vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys)) == 0 && 2377 jsys == JAIL_SYS_INHERIT) 2378 return (0); 2379 /* 2380 * Inherit a prison's initial values from its parent 2381 * (different from JAIL_SYS_INHERIT which also inherits changes). 2382 */ 2383 zfs_jailparam_alloc(pr, NULL); 2384 return (0); 2385 } 2386 2387 static int 2388 zfs_jailparam_get(void *obj, void *data) 2389 { 2390 struct prison *ppr, *pr = obj; 2391 struct vfsoptlist *opts = data; 2392 struct zfs_jailparam *zjp; 2393 int jsys, error; 2394 2395 zjp = zfs_jailparam_find(pr, &ppr); 2396 jsys = (ppr == pr) ? JAIL_SYS_NEW : JAIL_SYS_INHERIT; 2397 error = vfs_setopt(opts, "zfs", &jsys, sizeof (jsys)); 2398 if (error != 0 && error != ENOENT) 2399 goto done; 2400 if (jsys == JAIL_SYS_NEW) { 2401 error = vfs_setopt(opts, "zfs.mount_snapshot", 2402 &zjp->mount_snapshot, sizeof (zjp->mount_snapshot)); 2403 if (error != 0 && error != ENOENT) 2404 goto done; 2405 } else { 2406 /* 2407 * If this prison is inheriting its ZFS info, report 2408 * empty/zero parameters. 2409 */ 2410 static int mount_snapshot = 0; 2411 2412 error = vfs_setopt(opts, "zfs.mount_snapshot", 2413 &mount_snapshot, sizeof (mount_snapshot)); 2414 if (error != 0 && error != ENOENT) 2415 goto done; 2416 } 2417 error = 0; 2418 done: 2419 mtx_unlock(&ppr->pr_mtx); 2420 return (error); 2421 } 2422 2423 static int 2424 zfs_jailparam_set(void *obj, void *data) 2425 { 2426 struct prison *pr = obj; 2427 struct prison *ppr; 2428 struct vfsoptlist *opts = data; 2429 int error, jsys, mount_snapshot; 2430 2431 /* Set the parameters, which should be correct. */ 2432 error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys)); 2433 if (error == ENOENT) 2434 jsys = -1; 2435 error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot, 2436 sizeof (mount_snapshot)); 2437 if (error == ENOENT) 2438 mount_snapshot = -1; 2439 else 2440 jsys = JAIL_SYS_NEW; 2441 switch (jsys) { 2442 case JAIL_SYS_NEW: 2443 { 2444 /* "zfs=new" or "zfs.*": the prison gets its own ZFS info. */ 2445 struct zfs_jailparam *zjp; 2446 2447 /* 2448 * A child jail cannot have more permissions than its parent 2449 */ 2450 if (pr->pr_parent != &prison0) { 2451 zjp = zfs_jailparam_find(pr->pr_parent, &ppr); 2452 mtx_unlock(&ppr->pr_mtx); 2453 if (zjp->mount_snapshot < mount_snapshot) { 2454 return (EPERM); 2455 } 2456 } 2457 zfs_jailparam_alloc(pr, &zjp); 2458 if (mount_snapshot != -1) 2459 zjp->mount_snapshot = mount_snapshot; 2460 mtx_unlock(&pr->pr_mtx); 2461 break; 2462 } 2463 case JAIL_SYS_INHERIT: 2464 /* "zfs=inherit": inherit the parent's ZFS info. */ 2465 mtx_lock(&pr->pr_mtx); 2466 osd_jail_del(pr, zfs_jailparam_slot); 2467 mtx_unlock(&pr->pr_mtx); 2468 break; 2469 case -1: 2470 /* 2471 * If the setting being changed is not ZFS related 2472 * then do nothing. 2473 */ 2474 break; 2475 } 2476 2477 return (0); 2478 } 2479 2480 static int 2481 zfs_jailparam_check(void *obj __unused, void *data) 2482 { 2483 struct vfsoptlist *opts = data; 2484 int error, jsys, mount_snapshot; 2485 2486 /* Check that the parameters are correct. */ 2487 error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys)); 2488 if (error != ENOENT) { 2489 if (error != 0) 2490 return (error); 2491 if (jsys != JAIL_SYS_NEW && jsys != JAIL_SYS_INHERIT) 2492 return (EINVAL); 2493 } 2494 error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot, 2495 sizeof (mount_snapshot)); 2496 if (error != ENOENT) { 2497 if (error != 0) 2498 return (error); 2499 if (mount_snapshot != 0 && mount_snapshot != 1) 2500 return (EINVAL); 2501 } 2502 return (0); 2503 } 2504 2505 static void 2506 zfs_jailparam_destroy(void *data) 2507 { 2508 2509 free(data, M_PRISON); 2510 } 2511 2512 static void 2513 zfs_jailparam_sysinit(void *arg __unused) 2514 { 2515 struct prison *pr; 2516 osd_method_t methods[PR_MAXMETHOD] = { 2517 [PR_METHOD_CREATE] = zfs_jailparam_create, 2518 [PR_METHOD_GET] = zfs_jailparam_get, 2519 [PR_METHOD_SET] = zfs_jailparam_set, 2520 [PR_METHOD_CHECK] = zfs_jailparam_check, 2521 }; 2522 2523 zfs_jailparam_slot = osd_jail_register(zfs_jailparam_destroy, methods); 2524 /* Copy the defaults to any existing prisons. */ 2525 sx_slock(&allprison_lock); 2526 TAILQ_FOREACH(pr, &allprison, pr_list) 2527 zfs_jailparam_alloc(pr, NULL); 2528 sx_sunlock(&allprison_lock); 2529 } 2530 2531 static void 2532 zfs_jailparam_sysuninit(void *arg __unused) 2533 { 2534 2535 osd_jail_deregister(zfs_jailparam_slot); 2536 } 2537 2538 SYSINIT(zfs_jailparam_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY, 2539 zfs_jailparam_sysinit, NULL); 2540 SYSUNINIT(zfs_jailparam_sysuninit, SI_SUB_DRIVERS, SI_ORDER_ANY, 2541 zfs_jailparam_sysuninit, NULL); 2542