xref: /freebsd-src/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c (revision 46e6e290975f19ea62d03f90ac3e523af4dae557)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
24  * All rights reserved.
25  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
26  * Copyright (c) 2014 Integros [integros.com]
27  * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
28  */
29 
30 /* Portions Copyright 2010 Robert Milkowski */
31 
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/kernel.h>
36 #include <sys/sysmacros.h>
37 #include <sys/kmem.h>
38 #include <sys/acl.h>
39 #include <sys/vnode.h>
40 #include <sys/vfs.h>
41 #include <sys/mntent.h>
42 #include <sys/mount.h>
43 #include <sys/cmn_err.h>
44 #include <sys/zfs_znode.h>
45 #include <sys/zfs_vnops.h>
46 #include <sys/zfs_dir.h>
47 #include <sys/zil.h>
48 #include <sys/fs/zfs.h>
49 #include <sys/dmu.h>
50 #include <sys/dsl_prop.h>
51 #include <sys/dsl_dataset.h>
52 #include <sys/dsl_deleg.h>
53 #include <sys/spa.h>
54 #include <sys/zap.h>
55 #include <sys/sa.h>
56 #include <sys/sa_impl.h>
57 #include <sys/policy.h>
58 #include <sys/atomic.h>
59 #include <sys/zfs_ioctl.h>
60 #include <sys/zfs_ctldir.h>
61 #include <sys/zfs_fuid.h>
62 #include <sys/sunddi.h>
63 #include <sys/dmu_objset.h>
64 #include <sys/dsl_dir.h>
65 #include <sys/jail.h>
66 #include <sys/osd.h>
67 #include <ufs/ufs/quota.h>
68 #include <sys/zfs_quota.h>
69 
70 #include "zfs_comutil.h"
71 
72 #ifndef	MNTK_VMSETSIZE_BUG
73 #define	MNTK_VMSETSIZE_BUG	0
74 #endif
75 #ifndef	MNTK_NOMSYNC
76 #define	MNTK_NOMSYNC	8
77 #endif
78 
/* Serializes ZFS debug printing (see the zfs_debug_level sysctl below). */
struct mtx zfs_debug_mtx;
MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);

/* Root of the vfs.zfs sysctl tree. */
SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");

/* When nonzero, a filesystem's owner may perform privileged operations. */
int zfs_super_owner;
SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
	"File system owners can perform privileged operation on file systems");

int zfs_debug_level;
SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
	"Debug level");

/* Per-jail ZFS parameters (attached via the OSD jail slot below). */
struct zfs_jailparam {
	int mount_snapshot;
};

/* Defaults for the base system (prison0). */
static struct zfs_jailparam zfs_jailparam0 = {
	.mount_snapshot = 0,
};

/* OSD slot used to hang a struct zfs_jailparam off each prison. */
static int zfs_jailparam_slot;

SYSCTL_JAIL_PARAM_SYS_NODE(zfs, CTLFLAG_RW, "Jail ZFS parameters");
SYSCTL_JAIL_PARAM(_zfs, mount_snapshot, CTLTYPE_INT | CTLFLAG_RW, "I",
	"Allow mounting snapshots in the .zfs directory for unjailed datasets");

/* Read-only version reporting under vfs.zfs.version. */
SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
static int zfs_version_acl = ZFS_ACL_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
	"ZFS_ACL_VERSION");
static int zfs_version_spa = SPA_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
	"SPA_VERSION");
static int zfs_version_zpl = ZPL_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
	"ZPL_VERSION");
116 
/*
 * Forward declarations for the VFS operations vector.  The quotactl and
 * checkexp prototypes changed across FreeBSD versions, hence the #ifs.
 */
#if __FreeBSD_version >= 1400018
static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg,
    bool *mp_busy);
#else
static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg);
#endif
static int zfs_mount(vfs_t *vfsp);
static int zfs_umount(vfs_t *vfsp, int fflag);
static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
static int zfs_sync(vfs_t *vfsp, int waitfor);
#if __FreeBSD_version >= 1300098
static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
    struct ucred **credanonp, int *numsecflavors, int *secflavors);
#else
static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
    struct ucred **credanonp, int *numsecflavors, int **secflavors);
#endif
static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
static void zfs_freevfs(vfs_t *vfsp);

struct vfsops zfs_vfsops = {
	.vfs_mount =		zfs_mount,
	.vfs_unmount =		zfs_umount,
#if __FreeBSD_version >= 1300049
	/* Newer kernels cache the root vnode; zfs_root is the cache filler. */
	.vfs_root =		vfs_cache_root,
	.vfs_cachedroot = zfs_root,
#else
	.vfs_root =		zfs_root,
#endif
	.vfs_statfs =		zfs_statfs,
	.vfs_vget =		zfs_vget,
	.vfs_sync =		zfs_sync,
	.vfs_checkexp =		zfs_checkexp,
	.vfs_fhtovp =		zfs_fhtovp,
	.vfs_quotactl =		zfs_quotactl,
};

/* Register the "zfs" filesystem type; usable inside jails. */
VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);
157 
158 /*
159  * We need to keep a count of active fs's.
160  * This is necessary to prevent our module
161  * from being unloaded after a umount -f
162  */
163 static uint32_t	zfs_active_fs_count = 0;
164 
/*
 * Fetch the "temporary" (mount-option override) value of a ZFS property
 * for the filesystem mounted from dataset ds.
 *
 * On entry *val holds the persistent property value; if a mount option
 * overrides it, *val is replaced and, when setpoint is non-NULL, the
 * setpoint string is set to "temporary".  Returns 0 on success, ENOENT
 * if the dataset is not mounted or the property has no mount-option
 * counterpart, or an error from dmu_objset_from_ds()/getzfsvfs_impl().
 */
int
zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
    char *setpoint)
{
	int error;
	zfsvfs_t *zfvp;
	vfs_t *vfsp;
	objset_t *os;
	uint64_t tmp = *val;	/* start from the caller-provided value */

	error = dmu_objset_from_ds(ds, &os);
	if (error != 0)
		return (error);

	/*
	 * getzfsvfs_impl() busies the vfs on success, so every exit path
	 * after this point must vfs_unbusy() it.
	 * NOTE(review): the zfvp == NULL return assumes nothing was busied
	 * in that case — confirm against getzfsvfs_impl().
	 */
	error = getzfsvfs_impl(os, &zfvp);
	if (error != 0)
		return (error);
	if (zfvp == NULL)
		return (ENOENT);
	vfsp = zfvp->z_vfs;
	switch (zfs_prop) {
	case ZFS_PROP_ATIME:
		if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
			tmp = 0;
		if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
			tmp = 1;
		break;
	case ZFS_PROP_DEVICES:
		if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
			tmp = 0;
		if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
			tmp = 1;
		break;
	case ZFS_PROP_EXEC:
		if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
			tmp = 0;
		if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
			tmp = 1;
		break;
	case ZFS_PROP_SETUID:
		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
			tmp = 0;
		if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
			tmp = 1;
		break;
	case ZFS_PROP_READONLY:
		if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
			tmp = 0;
		if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
			tmp = 1;
		break;
	case ZFS_PROP_XATTR:
		/* xattr is cached on the zfsvfs rather than as a mntopt. */
		if (zfvp->z_flags & ZSB_XATTR)
			tmp = zfvp->z_xattr;
		break;
	case ZFS_PROP_NBMAND:
		if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
			tmp = 0;
		if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
			tmp = 1;
		break;
	default:
		vfs_unbusy(vfsp);
		return (ENOENT);
	}

	vfs_unbusy(vfsp);
	/* Only report a temporary value if it differs from the stored one. */
	if (tmp != *val) {
		if (setpoint)
			(void) strcpy(setpoint, "temporary");
		*val = tmp;
	}
	return (0);
}
239 
/*
 * Fill in a BSD dqblk64 for user/group `id` from the ZFS quota and
 * space-used ZAP objects.  Returns 0 on success, ENOENT if no quota is
 * set for the id (or quotas are not in use / replay is in progress),
 * or another errno from the ZAP lookups.
 */
static int
zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp)
{
	int error = 0;
	char buf[32];
	uint64_t usedobj, quotaobj;
	uint64_t quota, used = 0;
	timespec_t now;

	usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;

	if (quotaobj == 0 || zfsvfs->z_replay) {
		error = ENOENT;
		goto done;
	}
	/* ZAP keys for quota/used objects are the id in hex. */
	(void) sprintf(buf, "%llx", (longlong_t)id);
	if ((error = zap_lookup(zfsvfs->z_os, quotaobj,
	    buf, sizeof (quota), 1, &quota)) != 0) {
		dprintf("%s(%d): quotaobj lookup failed\n",
		    __FUNCTION__, __LINE__);
		goto done;
	}
	/*
	 * quota(8) uses bsoftlimit as "quota", and hardlimit as "limit".
	 * So we set them to be the same.
	 */
	dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota);
	/* A missing "used" entry just means zero blocks are charged. */
	error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used);
	if (error && error != ENOENT) {
		dprintf("%s(%d):  usedobj failed; %d\n",
		    __FUNCTION__, __LINE__, error);
		goto done;
	}
	dqp->dqb_curblocks = btodb(used);
	/* ZFS has no inode quotas; report them as unlimited. */
	dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0;
	vfs_timestamp(&now);
	/*
	 * Setting this to 0 causes FreeBSD quota(8) to print
	 * the number of days since the epoch, which isn't
	 * particularly useful.
	 */
	dqp->dqb_btime = dqp->dqb_itime = now.tv_sec;
done:
	return (error);
}
286 
/*
 * VFS_QUOTACTL entry point: translate BSD quotactl(2) subcommands onto
 * ZFS user/group quota properties.
 *
 * On FreeBSD >= 1400018 the VFS layer manages the mount busy state via
 * mp_busy; on older kernels this function must vfs_unbusy() the mount
 * itself for Q_QUOTAON/Q_QUOTAOFF — hence the version-conditional
 * vfs_unbusy() calls below.  NOTE(review): that contract comes from
 * sys/kern vfs_quotactl handling — confirm when changing the #ifs.
 */
static int
#if __FreeBSD_version >= 1400018
zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg, bool *mp_busy)
#else
zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg)
#endif
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	struct thread *td;
	int cmd, type, error = 0;
	int bitsize;
	zfs_userquota_prop_t quota_type;
	struct dqblk64 dqblk = { 0 };

	td = curthread;
	cmd = cmds >> SUBCMDSHIFT;
	type = cmds & SUBCMDMASK;

	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
		return (error);
	/* id == -1 means "the calling thread's own uid/gid". */
	if (id == -1) {
		switch (type) {
		case USRQUOTA:
			id = td->td_ucred->cr_ruid;
			break;
		case GRPQUOTA:
			id = td->td_ucred->cr_rgid;
			break;
		default:
			error = EINVAL;
#if __FreeBSD_version < 1400018
			if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF)
				vfs_unbusy(vfsp);
#endif
			goto done;
		}
	}
	/*
	 * Map BSD type to:
	 * ZFS_PROP_USERUSED,
	 * ZFS_PROP_USERQUOTA,
	 * ZFS_PROP_GROUPUSED,
	 * ZFS_PROP_GROUPQUOTA
	 */
	switch (cmd) {
	case Q_SETQUOTA:
	case Q_SETQUOTA32:
		if (type == USRQUOTA)
			quota_type = ZFS_PROP_USERQUOTA;
		else if (type == GRPQUOTA)
			quota_type = ZFS_PROP_GROUPQUOTA;
		else
			error = EINVAL;
		break;
	case Q_GETQUOTA:
	case Q_GETQUOTA32:
		if (type == USRQUOTA)
			quota_type = ZFS_PROP_USERUSED;
		else if (type == GRPQUOTA)
			quota_type = ZFS_PROP_GROUPUSED;
		else
			error = EINVAL;
		break;
	}

	/*
	 * Depending on the cmd, we may need to get
	 * the ruid and domain (see fuidstr_to_sid?),
	 * the fuid (how?), or other information.
	 * Create fuid using zfs_fuid_create(zfsvfs, id,
	 * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)?
	 * I think I can use just the id?
	 *
	 * Look at zfs_id_overquota() to look up a quota.
	 * zap_lookup(something, quotaobj, fuidstring,
	 *     sizeof (long long), 1, &quota)
	 *
	 * See zfs_set_userquota() to set a quota.
	 */
	if ((uint32_t)type >= MAXQUOTAS) {
		error = EINVAL;
		goto done;
	}

	switch (cmd) {
	case Q_GETQUOTASIZE:
		/* ZFS quotas are 64-bit; report that to userland. */
		bitsize = 64;
		error = copyout(&bitsize, arg, sizeof (int));
		break;
	case Q_QUOTAON:
		// As far as I can tell, you can't turn quotas on or off on zfs
		error = 0;
#if __FreeBSD_version < 1400018
		vfs_unbusy(vfsp);
#endif
		break;
	case Q_QUOTAOFF:
		error = ENOTSUP;
#if __FreeBSD_version < 1400018
		vfs_unbusy(vfsp);
#endif
		break;
	case Q_SETQUOTA:
		/* Only the block hard limit is honored; ZFS has one limit. */
		error = copyin(arg, &dqblk, sizeof (dqblk));
		if (error == 0)
			error = zfs_set_userquota(zfsvfs, quota_type,
			    "", id, dbtob(dqblk.dqb_bhardlimit));
		break;
	case Q_GETQUOTA:
		error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk);
		if (error == 0)
			error = copyout(&dqblk, arg, sizeof (dqblk));
		break;
	default:
		error = EINVAL;
		break;
	}
done:
	zfs_exit(zfsvfs, FTAG);
	return (error);
}
408 
409 
/* Return B_TRUE if the filesystem is mounted read-only. */
boolean_t
zfs_is_readonly(zfsvfs_t *zfsvfs)
{
	return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY));
}
415 
/*
 * VFS_SYNC entry point.  With a non-NULL vfsp, commit the intent log of
 * that one filesystem; with vfsp == NULL (sync(8)), wait for every pool
 * to commit all dirty data.  Always returns 0 except on zfs_enter()
 * failure.
 */
static int
zfs_sync(vfs_t *vfsp, int waitfor)
{

	/*
	 * Data integrity is job one.  We don't want a compromised kernel
	 * writing to the storage pool, so we never sync during panic.
	 */
	if (panicstr)
		return (0);

	/*
	 * Ignore the system syncher.  ZFS already commits async data
	 * at zfs_txg_timeout intervals.
	 */
	if (waitfor == MNT_LAZY)
		return (0);

	if (vfsp != NULL) {
		/*
		 * Sync a specific filesystem.
		 */
		zfsvfs_t *zfsvfs = vfsp->vfs_data;
		dsl_pool_t *dp;
		int error;

		if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
			return (error);
		dp = dmu_objset_pool(zfsvfs->z_os);

		/*
		 * If the system is shutting down, then skip any
		 * filesystems which may exist on a suspended pool.
		 */
		if (rebooting && spa_suspended(dp->dp_spa)) {
			zfs_exit(zfsvfs, FTAG);
			return (0);
		}

		/* z_log is NULL e.g. for read-only mounts; nothing to do. */
		if (zfsvfs->z_log != NULL)
			zil_commit(zfsvfs->z_log, 0);

		zfs_exit(zfsvfs, FTAG);
	} else {
		/*
		 * Sync all ZFS filesystems.  This is what happens when you
		 * run sync(8).  Unlike other filesystems, ZFS honors the
		 * request by waiting for all pools to commit all dirty data.
		 */
		spa_sync_allpools();
	}

	return (0);
}
470 
471 static void
472 atime_changed_cb(void *arg, uint64_t newval)
473 {
474 	zfsvfs_t *zfsvfs = arg;
475 
476 	if (newval == TRUE) {
477 		zfsvfs->z_atime = TRUE;
478 		zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
479 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
480 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
481 	} else {
482 		zfsvfs->z_atime = FALSE;
483 		zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
484 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
485 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
486 	}
487 }
488 
489 static void
490 xattr_changed_cb(void *arg, uint64_t newval)
491 {
492 	zfsvfs_t *zfsvfs = arg;
493 
494 	if (newval == ZFS_XATTR_OFF) {
495 		zfsvfs->z_flags &= ~ZSB_XATTR;
496 	} else {
497 		zfsvfs->z_flags |= ZSB_XATTR;
498 
499 		if (newval == ZFS_XATTR_SA)
500 			zfsvfs->z_xattr_sa = B_TRUE;
501 		else
502 			zfsvfs->z_xattr_sa = B_FALSE;
503 	}
504 }
505 
/*
 * Property callback: record the new recordsize as the maximum block size
 * and propagate it to the mount's preferred I/O size (f_iosize).
 */
static void
blksz_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;
	/* recordsize must be a power of two within the pool's limits */
	ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
	ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
	ASSERT(ISP2(newval));

	zfsvfs->z_max_blksz = newval;
	zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
}
517 
518 static void
519 readonly_changed_cb(void *arg, uint64_t newval)
520 {
521 	zfsvfs_t *zfsvfs = arg;
522 
523 	if (newval) {
524 		/* XXX locking on vfs_flag? */
525 		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
526 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
527 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
528 	} else {
529 		/* XXX locking on vfs_flag? */
530 		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
531 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
532 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
533 	}
534 }
535 
536 static void
537 setuid_changed_cb(void *arg, uint64_t newval)
538 {
539 	zfsvfs_t *zfsvfs = arg;
540 
541 	if (newval == FALSE) {
542 		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
543 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
544 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
545 	} else {
546 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
547 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
548 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
549 	}
550 }
551 
552 static void
553 exec_changed_cb(void *arg, uint64_t newval)
554 {
555 	zfsvfs_t *zfsvfs = arg;
556 
557 	if (newval == FALSE) {
558 		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
559 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
560 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
561 	} else {
562 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
563 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
564 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
565 	}
566 }
567 
568 /*
569  * The nbmand mount option can be changed at mount time.
570  * We can't allow it to be toggled on live file systems or incorrect
571  * behavior may be seen from cifs clients
572  *
573  * This property isn't registered via dsl_prop_register(), but this callback
574  * will be called when a file system is first mounted
575  */
576 static void
577 nbmand_changed_cb(void *arg, uint64_t newval)
578 {
579 	zfsvfs_t *zfsvfs = arg;
580 	if (newval == FALSE) {
581 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
582 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
583 	} else {
584 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
585 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
586 	}
587 }
588 
/* Property callback: cache the "snapdir" (.zfs visibility) setting. */
static void
snapdir_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	zfsvfs->z_show_ctldir = newval;
}
596 
/* Property callback: cache the "aclmode" setting on the zfsvfs. */
static void
acl_mode_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	zfsvfs->z_acl_mode = newval;
}
604 
/* Property callback: cache the "aclinherit" setting on the zfsvfs. */
static void
acl_inherit_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	zfsvfs->z_acl_inherit = newval;
}
612 
/* Property callback: cache the "acltype" setting on the zfsvfs. */
static void
acl_type_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	zfsvfs->z_acl_type = newval;
}
620 
/*
 * Register the dsl property callbacks for a mounted filesystem and then
 * re-apply any temporary mount-option overrides (which registration would
 * otherwise clobber).  Called at mount time and after online recv.
 * Returns 0 on success; on failure all registered callbacks are
 * unregistered before the error is returned.
 */
static int
zfs_register_callbacks(vfs_t *vfsp)
{
	struct dsl_dataset *ds = NULL;
	objset_t *os = NULL;
	zfsvfs_t *zfsvfs = NULL;
	uint64_t nbmand;
	boolean_t readonly = B_FALSE;
	boolean_t do_readonly = B_FALSE;
	boolean_t setuid = B_FALSE;
	boolean_t do_setuid = B_FALSE;
	boolean_t exec = B_FALSE;
	boolean_t do_exec = B_FALSE;
	boolean_t xattr = B_FALSE;
	boolean_t atime = B_FALSE;
	boolean_t do_atime = B_FALSE;
	boolean_t do_xattr = B_FALSE;
	int error = 0;

	ASSERT3P(vfsp, !=, NULL);
	zfsvfs = vfsp->vfs_data;
	ASSERT3P(zfsvfs, !=, NULL);
	os = zfsvfs->z_os;

	/*
	 * This function can be called for a snapshot when we update snapshot's
	 * mount point, which isn't really supported.
	 */
	if (dmu_objset_is_snapshot(os))
		return (EOPNOTSUPP);

	/*
	 * The act of registering our callbacks will destroy any mount
	 * options we may have.  In order to enable temporary overrides
	 * of mount options, we stash away the current values and
	 * restore them after we register the callbacks.
	 */
	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
	    !spa_writeable(dmu_objset_spa(os))) {
		readonly = B_TRUE;
		do_readonly = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
		readonly = B_FALSE;
		do_readonly = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
		setuid = B_FALSE;
		do_setuid = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
		setuid = B_TRUE;
		do_setuid = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
		exec = B_FALSE;
		do_exec = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
		exec = B_TRUE;
		do_exec = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
		zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF;
		do_xattr = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
		zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
		do_xattr = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) {
		zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
		do_xattr = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) {
		zfsvfs->z_xattr = xattr = ZFS_XATTR_SA;
		do_xattr = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
		atime = B_FALSE;
		do_atime = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
		atime = B_TRUE;
		do_atime = B_TRUE;
	}

	/*
	 * We need to enter pool configuration here, so that we can use
	 * dsl_prop_get_int_ds() to handle the special nbmand property below.
	 * dsl_prop_get_integer() can not be used, because it has to acquire
	 * spa_namespace_lock and we can not do that because we already hold
	 * z_teardown_lock.  The problem is that spa_write_cachefile() is called
	 * with spa_namespace_lock held and the function calls ZFS vnode
	 * operations to write the cache file and thus z_teardown_lock is
	 * acquired after spa_namespace_lock.
	 */
	ds = dmu_objset_ds(os);
	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);

	/*
	 * nbmand is a special property.  It can only be changed at
	 * mount time.
	 *
	 * This is weird, but it is documented to only be changeable
	 * at mount time.
	 */
	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
		nbmand = B_FALSE;
	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
		nbmand = B_TRUE;
	} else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand)) != 0) {
		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
		return (error);
	}

	/*
	 * Register property callbacks.
	 *
	 * It would probably be fine to just check for i/o error from
	 * the first prop_register(), but I guess I like to go
	 * overboard...
	 */
	error = dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_ACLTYPE), acl_type_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
	    zfsvfs);
	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
	if (error)
		goto unregister;

	/*
	 * Invoke our callbacks to restore temporary mount options.
	 */
	if (do_readonly)
		readonly_changed_cb(zfsvfs, readonly);
	if (do_setuid)
		setuid_changed_cb(zfsvfs, setuid);
	if (do_exec)
		exec_changed_cb(zfsvfs, exec);
	if (do_xattr)
		xattr_changed_cb(zfsvfs, xattr);
	if (do_atime)
		atime_changed_cb(zfsvfs, atime);

	/* nbmand isn't dsl-registered (see above); apply it directly. */
	nbmand_changed_cb(zfsvfs, nbmand);

	return (0);

unregister:
	dsl_prop_unregister_all(ds, zfsvfs);
	return (error);
}
784 
785 /*
786  * Associate this zfsvfs with the given objset, which must be owned.
787  * This will cache a bunch of on-disk state from the objset in the
788  * zfsvfs.
789  */
790 static int
791 zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
792 {
793 	int error;
794 	uint64_t val;
795 
796 	zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
797 	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
798 	zfsvfs->z_os = os;
799 
800 	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
801 	if (error != 0)
802 		return (error);
803 	if (zfsvfs->z_version >
804 	    zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
805 		(void) printf("Can't mount a version %lld file system "
806 		    "on a version %lld pool\n. Pool must be upgraded to mount "
807 		    "this file system.", (u_longlong_t)zfsvfs->z_version,
808 		    (u_longlong_t)spa_version(dmu_objset_spa(os)));
809 		return (SET_ERROR(ENOTSUP));
810 	}
811 	error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
812 	if (error != 0)
813 		return (error);
814 	zfsvfs->z_norm = (int)val;
815 
816 	error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
817 	if (error != 0)
818 		return (error);
819 	zfsvfs->z_utf8 = (val != 0);
820 
821 	error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
822 	if (error != 0)
823 		return (error);
824 	zfsvfs->z_case = (uint_t)val;
825 
826 	error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val);
827 	if (error != 0)
828 		return (error);
829 	zfsvfs->z_acl_type = (uint_t)val;
830 
831 	/*
832 	 * Fold case on file systems that are always or sometimes case
833 	 * insensitive.
834 	 */
835 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
836 	    zfsvfs->z_case == ZFS_CASE_MIXED)
837 		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
838 
839 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
840 	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
841 
842 	uint64_t sa_obj = 0;
843 	if (zfsvfs->z_use_sa) {
844 		/* should either have both of these objects or none */
845 		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
846 		    &sa_obj);
847 		if (error != 0)
848 			return (error);
849 
850 		error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val);
851 		if (error == 0 && val == ZFS_XATTR_SA)
852 			zfsvfs->z_xattr_sa = B_TRUE;
853 	}
854 
855 	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
856 	    &zfsvfs->z_attr_table);
857 	if (error != 0)
858 		return (error);
859 
860 	if (zfsvfs->z_version >= ZPL_VERSION_SA)
861 		sa_register_update_callback(os, zfs_sa_upgrade);
862 
863 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
864 	    &zfsvfs->z_root);
865 	if (error != 0)
866 		return (error);
867 	ASSERT3U(zfsvfs->z_root, !=, 0);
868 
869 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
870 	    &zfsvfs->z_unlinkedobj);
871 	if (error != 0)
872 		return (error);
873 
874 	error = zap_lookup(os, MASTER_NODE_OBJ,
875 	    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
876 	    8, 1, &zfsvfs->z_userquota_obj);
877 	if (error == ENOENT)
878 		zfsvfs->z_userquota_obj = 0;
879 	else if (error != 0)
880 		return (error);
881 
882 	error = zap_lookup(os, MASTER_NODE_OBJ,
883 	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
884 	    8, 1, &zfsvfs->z_groupquota_obj);
885 	if (error == ENOENT)
886 		zfsvfs->z_groupquota_obj = 0;
887 	else if (error != 0)
888 		return (error);
889 
890 	error = zap_lookup(os, MASTER_NODE_OBJ,
891 	    zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA],
892 	    8, 1, &zfsvfs->z_projectquota_obj);
893 	if (error == ENOENT)
894 		zfsvfs->z_projectquota_obj = 0;
895 	else if (error != 0)
896 		return (error);
897 
898 	error = zap_lookup(os, MASTER_NODE_OBJ,
899 	    zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA],
900 	    8, 1, &zfsvfs->z_userobjquota_obj);
901 	if (error == ENOENT)
902 		zfsvfs->z_userobjquota_obj = 0;
903 	else if (error != 0)
904 		return (error);
905 
906 	error = zap_lookup(os, MASTER_NODE_OBJ,
907 	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA],
908 	    8, 1, &zfsvfs->z_groupobjquota_obj);
909 	if (error == ENOENT)
910 		zfsvfs->z_groupobjquota_obj = 0;
911 	else if (error != 0)
912 		return (error);
913 
914 	error = zap_lookup(os, MASTER_NODE_OBJ,
915 	    zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA],
916 	    8, 1, &zfsvfs->z_projectobjquota_obj);
917 	if (error == ENOENT)
918 		zfsvfs->z_projectobjquota_obj = 0;
919 	else if (error != 0)
920 		return (error);
921 
922 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
923 	    &zfsvfs->z_fuid_obj);
924 	if (error == ENOENT)
925 		zfsvfs->z_fuid_obj = 0;
926 	else if (error != 0)
927 		return (error);
928 
929 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
930 	    &zfsvfs->z_shares_dir);
931 	if (error == ENOENT)
932 		zfsvfs->z_shares_dir = 0;
933 	else if (error != 0)
934 		return (error);
935 
936 	/*
937 	 * Only use the name cache if we are looking for a
938 	 * name on a file system that does not require normalization
939 	 * or case folding.  We can also look there if we happen to be
940 	 * on a non-normalizing, mixed sensitivity file system IF we
941 	 * are looking for the exact name (which is always the case on
942 	 * FreeBSD).
943 	 */
944 	zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
945 	    ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
946 	    !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
947 
948 	return (0);
949 }
950 
/* Shared taskqueue for deferred zfsvfs work (unlinked-set draining). */
taskq_t *zfsvfs_taskq;

/* Task wrapper: drain the unlinked set of the zfsvfs passed as context. */
static void
zfsvfs_task_unlinked_drain(void *context, int pending __unused)
{

	zfs_unlinked_drain((zfsvfs_t *)context);
}
959 
/*
 * Allocate a zfsvfs_t, take ownership of the named objset, and initialize
 * the zfsvfs from its on-disk state via zfsvfs_create_impl().  On success
 * *zfvp holds the new zfsvfs; on failure it is the caller's view that
 * nothing was allocated or owned.
 */
int
zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp)
{
	objset_t *os;
	zfsvfs_t *zfsvfs;
	int error;
	/* Snapshots ('@' in the name) are always owned read-only. */
	boolean_t ro = (readonly || (strchr(osname, '@') != NULL));

	/*
	 * XXX: Fix struct statfs so this isn't necessary!
	 *
	 * The 'osname' is used as the filesystem's special node, which means
	 * it must fit in statfs.f_mntfromname, or else it can't be
	 * enumerated, so libzfs_mnttab_find() returns NULL, which causes
	 * 'zfs unmount' to think it's not mounted when it is.
	 */
	if (strlen(osname) >= MNAMELEN)
		return (SET_ERROR(ENAMETOOLONG));

	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);

	error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs,
	    &os);
	if (error != 0) {
		kmem_free(zfsvfs, sizeof (zfsvfs_t));
		return (error);
	}

	/* On failure zfsvfs_create_impl disowns os and frees zfsvfs. */
	error = zfsvfs_create_impl(zfvp, zfsvfs, os);

	return (error);
}
992 
993 
/*
 * Initialize a freshly allocated zfsvfs for objset os: set up its locks,
 * lists, and the deferred unlinked-drain task, then load on-disk state
 * via zfsvfs_init().  On failure the objset is disowned and zfsvfs is
 * freed; *zfvp is set to NULL.  On success *zfvp points at zfsvfs.
 */
int
zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
{
	int error;

	zfsvfs->z_vfs = NULL;
	/* z_parent differs from zfsvfs only for snapshot mounts. */
	zfsvfs->z_parent = zfsvfs;

	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
	    offsetof(znode_t, z_link_node));
	TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0,
	    zfsvfs_task_unlinked_drain, zfsvfs);
	ZFS_TEARDOWN_INIT(zfsvfs);
	ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs);
	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
	for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);

	error = zfsvfs_init(zfsvfs, os);
	if (error != 0) {
		dmu_objset_disown(os, B_TRUE, zfsvfs);
		*zfvp = NULL;
		kmem_free(zfsvfs, sizeof (zfsvfs_t));
		return (error);
	}

	*zfvp = zfsvfs;
	return (0);
}
1025 
/*
 * Finish setting up a zfsvfs that already owns its objset: register
 * property callbacks, create kstats and open the ZIL (replaying it when
 * mounting read-write), drain the unlinked set, and publish the zfsvfs
 * as the objset's user pointer.
 *
 * 'mounting' is B_TRUE for an actual mount, B_FALSE when resuming after
 * a suspend (e.g. online recv) — in that case replay and unlinked-drain
 * are skipped because operations were blocked while the ZIL was closed.
 *
 * Returns 0 on success or an errno-style error.
 */
static int
zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
{
	int error;

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
	    dmu_objset_incompatible_encryption_version(zfsvfs->z_os))
		return (SET_ERROR(EROFS));

	error = zfs_register_callbacks(zfsvfs->z_vfs);
	if (error)
		return (error);

	/*
	 * If we are not mounting (ie: online recv), then we don't
	 * have to worry about replaying the log as we blocked all
	 * operations out since we closed the ZIL.
	 */
	if (mounting) {
		boolean_t readonly;

		/* A fresh mount must not already have kstats. */
		ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
		error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
		if (error)
			return (error);
		zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data,
		    &zfsvfs->z_kstat.dk_zil_sums);

		/*
		 * During replay we remove the read only flag to
		 * allow replays to succeed.
		 */
		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
		if (readonly != 0) {
			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
		} else {
			dsl_dir_t *dd;
			zap_stats_t zs;

			/* Seed the nunlinks kstat from the unlinked set. */
			if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
			    &zs) == 0) {
				dataset_kstats_update_nunlinks_kstat(
				    &zfsvfs->z_kstat, zs.zs_num_entries);
				dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
				    "num_entries in unlinked set: %llu",
				    (u_longlong_t)zs.zs_num_entries);
			}

			zfs_unlinked_drain(zfsvfs);
			dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
			dd->dd_activity_cancelled = B_FALSE;
		}

		/*
		 * Parse and replay the intent log.
		 *
		 * Because of ziltest, this must be done after
		 * zfs_unlinked_drain().  (Further note: ziltest
		 * doesn't use readonly mounts, where
		 * zfs_unlinked_drain() isn't called.)  This is because
		 * ziltest causes spa_sync() to think it's committed,
		 * but actually it is not, so the intent log contains
		 * many txg's worth of changes.
		 *
		 * In particular, if object N is in the unlinked set in
		 * the last txg to actually sync, then it could be
		 * actually freed in a later txg and then reallocated
		 * in a yet later txg.  This would write a "create
		 * object N" record to the intent log.  Normally, this
		 * would be fine because the spa_sync() would have
		 * written out the fact that object N is free, before
		 * we could write the "create object N" intent log
		 * record.
		 *
		 * But when we are in ziltest mode, we advance the "open
		 * txg" without actually spa_sync()-ing the changes to
		 * disk.  So we would see that object N is still
		 * allocated and in the unlinked set, and there is an
		 * intent log record saying to allocate it.
		 */
		if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
			if (zil_replay_disable) {
				zil_destroy(zfsvfs->z_log, B_FALSE);
			} else {
				/* Namecache is disabled during replay. */
				boolean_t use_nc = zfsvfs->z_use_namecache;
				zfsvfs->z_use_namecache = B_FALSE;
				zfsvfs->z_replay = B_TRUE;
				zil_replay(zfsvfs->z_os, zfsvfs,
				    zfs_replay_vector);
				zfsvfs->z_replay = B_FALSE;
				zfsvfs->z_use_namecache = use_nc;
			}
		}

		/* restore readonly bit */
		if (readonly != 0)
			zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
	} else {
		/* Resume path: kstats survived the suspend; reopen the ZIL. */
		ASSERT3P(zfsvfs->z_kstat.dk_kstats, !=, NULL);
		zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data,
		    &zfsvfs->z_kstat.dk_zil_sums);
	}

	/*
	 * Set the objset user_ptr to track its zfsvfs.
	 */
	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);

	return (0);
}
1142 
/*
 * Destroy all state embedded in the zfsvfs (locks, lists, kstats, FUID
 * tables) and free the structure itself.  The filesystem must already be
 * torn down: no znodes may remain on z_all_znodes (asserted below).
 */
void
zfsvfs_free(zfsvfs_t *zfsvfs)
{
	int i;

	zfs_fuid_destroy(zfsvfs);

	mutex_destroy(&zfsvfs->z_znodes_lock);
	mutex_destroy(&zfsvfs->z_lock);
	ASSERT3U(zfsvfs->z_nr_znodes, ==, 0);
	list_destroy(&zfsvfs->z_all_znodes);
	ZFS_TEARDOWN_DESTROY(zfsvfs);
	ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs);
	rw_destroy(&zfsvfs->z_fuid_lock);
	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
	dataset_kstats_destroy(&zfsvfs->z_kstat);
	kmem_free(zfsvfs, sizeof (zfsvfs_t));
}
1162 
1163 static void
1164 zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
1165 {
1166 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
1167 	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
1168 }
1169 
/*
 * Mount the dataset 'osname' onto 'vfsp': create and wire up the zfsvfs,
 * set FreeBSD mount flags and the fsid, and either configure snapshot
 * semantics (read-only, sync disabled) or run the full read-write setup
 * via zfsvfs_setup().
 *
 * On error the objset is disowned and the zfsvfs is freed; on success
 * the global active-filesystem count is bumped.
 */
static int
zfs_domount(vfs_t *vfsp, char *osname)
{
	uint64_t recordsize, fsid_guid;
	int error = 0;
	zfsvfs_t *zfsvfs;

	ASSERT3P(vfsp, !=, NULL);
	ASSERT3P(osname, !=, NULL);

	error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs);
	if (error)
		return (error);
	zfsvfs->z_vfs = vfsp;

	/* Report the dataset recordsize as the optimal I/O size. */
	if ((error = dsl_prop_get_integer(osname,
	    "recordsize", &recordsize, NULL)))
		goto out;
	zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
	zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;

	vfsp->vfs_data = zfsvfs;
	vfsp->mnt_flag |= MNT_LOCAL;
	vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
	vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
	vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
	/*
	 * This can cause a loss of coherence between ARC and page cache
	 * on ZoF - unclear if the problem is in FreeBSD or ZoF
	 */
	vfsp->mnt_kern_flag |= MNTK_NO_IOPF;	/* vn_io_fault can be used */
	vfsp->mnt_kern_flag |= MNTK_NOMSYNC;
	vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG;

#if defined(_KERNEL) && !defined(KMEM_DEBUG)
	vfsp->mnt_kern_flag |= MNTK_FPLOOKUP;
#endif
	/*
	 * The fsid is 64 bits, composed of an 8-bit fs type, which
	 * separates our fsid from any other filesystem types, and a
	 * 56-bit objset unique ID.  The objset unique ID is unique to
	 * all objsets open on this system, provided by unique_create().
	 * The 8-bit fs type must be put in the low bits of fsid[1]
	 * because that's where other Solaris filesystems put it.
	 */
	fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
	ASSERT3U((fsid_guid & ~((1ULL << 56) - 1)), ==, 0);
	vfsp->vfs_fsid.val[0] = fsid_guid;
	vfsp->vfs_fsid.val[1] = ((fsid_guid >> 32) << 8) |
	    (vfsp->mnt_vfc->vfc_typenum & 0xFF);

	/*
	 * Set features for file system.
	 */
	zfs_set_fuid_feature(zfsvfs);

	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
		uint64_t pval;

		/* Snapshots are always mounted noatime/read-only. */
		atime_changed_cb(zfsvfs, B_FALSE);
		readonly_changed_cb(zfsvfs, B_TRUE);
		if ((error = dsl_prop_get_integer(osname,
		    "xattr", &pval, NULL)))
			goto out;
		xattr_changed_cb(zfsvfs, pval);
		if ((error = dsl_prop_get_integer(osname,
		    "acltype", &pval, NULL)))
			goto out;
		acl_type_changed_cb(zfsvfs, pval);
		zfsvfs->z_issnap = B_TRUE;
		zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;

		/* Publish the zfsvfs as the objset's user pointer. */
		mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
		mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
	} else {
		if ((error = zfsvfs_setup(zfsvfs, B_TRUE)))
			goto out;
	}

	vfs_mountedfrom(vfsp, osname);

	if (!zfsvfs->z_issnap)
		zfsctl_create(zfsvfs);
out:
	if (error) {
		dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
		zfsvfs_free(zfsvfs);
	} else {
		atomic_inc_32(&zfs_active_fs_count);
	}

	return (error);
}
1264 
1265 static void
1266 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
1267 {
1268 	objset_t *os = zfsvfs->z_os;
1269 
1270 	if (!dmu_objset_is_snapshot(os))
1271 		dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
1272 }
1273 
1274 static int
1275 getpoolname(const char *osname, char *poolname)
1276 {
1277 	char *p;
1278 
1279 	p = strchr(osname, '/');
1280 	if (p == NULL) {
1281 		if (strlen(osname) >= MAXNAMELEN)
1282 			return (ENAMETOOLONG);
1283 		(void) strcpy(poolname, osname);
1284 	} else {
1285 		if (p - osname >= MAXNAMELEN)
1286 			return (ENAMETOOLONG);
1287 		(void) strlcpy(poolname, osname, p - osname + 1);
1288 	}
1289 	return (0);
1290 }
1291 
/*
 * Detect and strip the optional leading '!' (checkpoint-rewind request)
 * from the dataset name, reporting its presence via *checkpointrewind.
 */
static void
fetch_osname_options(char *name, bool *checkpointrewind)
{
	bool rewind = (name[0] == '!');

	*checkpointrewind = rewind;
	if (rewind) {
		/* Shift the rest of the name left, including the NUL. */
		memmove(name, name + 1, strlen(name));
	}
}
1303 
/*
 * VFS mount entry point.  Resolves the "from" mount option to a dataset
 * name, performs privilege/delegation/jail checks, handles remounts by
 * refreshing property callbacks, imports the root pool when mounting the
 * root filesystem, and finally calls zfs_domount() to do the real work.
 */
static int
zfs_mount(vfs_t *vfsp)
{
	kthread_t	*td = curthread;
	vnode_t		*mvp = vfsp->mnt_vnodecovered;
	cred_t		*cr = td->td_ucred;
	char		*osname;
	int		error = 0;
	int		canwrite;
	bool		checkpointrewind, isctlsnap = false;

	if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
		return (SET_ERROR(EINVAL));

	/*
	 * If full-owner-access is enabled and delegated administration is
	 * turned on, we must set nosuid.
	 */
	if (zfs_super_owner &&
	    dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
		secpolicy_fs_mount_clearopts(cr, vfsp);
	}

	fetch_osname_options(osname, &checkpointrewind);
	/* A snapshot being mounted under a .zfs control directory? */
	isctlsnap = (mvp != NULL && zfsctl_is_node(mvp) &&
	    strchr(osname, '@') != NULL);

	/*
	 * Check for mount privilege?
	 *
	 * If we don't have privilege then see if
	 * we have local permission to allow it
	 */
	error = secpolicy_fs_mount(cr, mvp, vfsp);
	if (error && isctlsnap) {
		secpolicy_fs_mount_clearopts(cr, vfsp);
	} else if (error) {
		if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
			goto out;

		if (!(vfsp->vfs_flag & MS_REMOUNT)) {
			vattr_t		vattr;

			/*
			 * Make sure user is the owner of the mount point
			 * or has sufficient privileges.
			 */

			vattr.va_mask = AT_UID;

			vn_lock(mvp, LK_SHARED | LK_RETRY);
			if (VOP_GETATTR(mvp, &vattr, cr)) {
				VOP_UNLOCK1(mvp);
				goto out;
			}

			if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
			    VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
				VOP_UNLOCK1(mvp);
				goto out;
			}
			VOP_UNLOCK1(mvp);
		}

		secpolicy_fs_mount_clearopts(cr, vfsp);
	}

	/*
	 * Refuse to mount a filesystem if we are in a local zone and the
	 * dataset is not visible.
	 */
	if (!INGLOBALZONE(curproc) &&
	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
		boolean_t mount_snapshot = B_FALSE;

		/*
		 * Snapshots may be mounted in .zfs for unjailed datasets
		 * if allowed by the jail param zfs.mount_snapshot.
		 */
		if (isctlsnap) {
			struct prison *pr;
			struct zfs_jailparam *zjp;

			pr = curthread->td_ucred->cr_prison;
			mtx_lock(&pr->pr_mtx);
			zjp = osd_jail_get(pr, zfs_jailparam_slot);
			mtx_unlock(&pr->pr_mtx);
			if (zjp && zjp->mount_snapshot)
				mount_snapshot = B_TRUE;
		}
		if (!mount_snapshot) {
			error = SET_ERROR(EPERM);
			goto out;
		}
	}

	vfsp->vfs_flag |= MNT_NFS4ACLS;

	/*
	 * When doing a remount, we simply refresh our temporary properties
	 * according to those options set in the current VFS options.
	 */
	if (vfsp->vfs_flag & MS_REMOUNT) {
		zfsvfs_t *zfsvfs = vfsp->vfs_data;

		/*
		 * Refresh mount options with z_teardown_lock blocking I/O while
		 * the filesystem is in an inconsistent state.
		 * The lock also serializes this code with filesystem
		 * manipulations between entry to zfs_suspend_fs() and return
		 * from zfs_resume_fs().
		 */
		ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
		zfs_unregister_callbacks(zfsvfs);
		error = zfs_register_callbacks(vfsp);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
		goto out;
	}

	/* Initial root mount: try hard to import the requested root pool. */
	if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
	    (vfsp->vfs_flag & MNT_UPDATE) == 0) {
		char pname[MAXNAMELEN];

		error = getpoolname(osname, pname);
		if (error == 0)
			error = spa_import_rootpool(pname, checkpointrewind);
		if (error)
			goto out;
	}
	DROP_GIANT();
	error = zfs_domount(vfsp, osname);
	PICKUP_GIANT();

out:
	return (error);
}
1441 
/*
 * VFS statfs entry point.  Fills 'statp' with space and object counts
 * derived from dmu_objset_space(), in SPA_MINBLOCKSIZE units.
 */
static int
zfs_statfs(vfs_t *vfsp, struct statfs *statp)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	uint64_t refdbytes, availbytes, usedobjs, availobjs;
	int error;

	statp->f_version = STATFS_VERSION;

	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
		return (error);

	dmu_objset_space(zfsvfs->z_os,
	    &refdbytes, &availbytes, &usedobjs, &availobjs);

	/*
	 * The underlying storage pool actually uses multiple block sizes.
	 * We report the fragsize as the smallest block size we support,
	 * and we report our blocksize as the filesystem's maximum blocksize.
	 */
	statp->f_bsize = SPA_MINBLOCKSIZE;
	statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;

	/*
	 * The following report "total" blocks of various kinds in the
	 * file system, but reported in terms of f_frsize - the
	 * "fragment" size.
	 */

	statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
	statp->f_bfree = availbytes / statp->f_bsize;
	statp->f_bavail = statp->f_bfree; /* no root reservation */

	/*
	 * statvfs() should really be called statufs(), because it assumes
	 * static metadata.  ZFS doesn't preallocate files, so the best
	 * we can do is report the max that could possibly fit in f_files,
	 * and that minus the number actually used in f_ffree.
	 * For f_ffree, report the smaller of the number of object available
	 * and the number of blocks (each object will take at least a block).
	 */
	statp->f_ffree = MIN(availobjs, statp->f_bfree);
	statp->f_files = statp->f_ffree + usedobjs;

	/*
	 * We're a zfs filesystem.
	 */
	strlcpy(statp->f_fstypename, "zfs",
	    sizeof (statp->f_fstypename));

	strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
	    sizeof (statp->f_mntfromname));
	strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
	    sizeof (statp->f_mntonname));

	statp->f_namemax = MAXNAMELEN - 1;

	zfs_exit(zfsvfs, FTAG);
	return (0);
}
1502 
/*
 * VFS root entry point: return the (locked) root vnode of the filesystem
 * in *vpp.  The vnode is locked only after zfs_exit() so the teardown
 * lock is not held across vn_lock(), which may sleep.
 */
static int
zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	znode_t *rootzp;
	int error;

	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
		return (error);

	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
	if (error == 0)
		*vpp = ZTOV(rootzp);

	zfs_exit(zfsvfs, FTAG);

	if (error == 0) {
		error = vn_lock(*vpp, flags);
		if (error != 0) {
			/* Drop the reference zfs_zget() gave us. */
			VN_RELE(*vpp);
			*vpp = NULL;
		}
	}
	return (error);
}
1528 
1529 /*
1530  * Teardown the zfsvfs::z_os.
1531  *
1532  * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
1533  * and 'z_teardown_inactive_lock' held.
1534  */
static int
zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
{
	znode_t	*zp;
	dsl_dir_t *dd;

	/*
	 * If someone has not already unmounted this file system,
	 * drain the zrele_taskq to ensure all active references to the
	 * zfsvfs_t have been handled only then can it be safely destroyed.
	 */
	if (zfsvfs->z_os) {
		/*
		 * If we're unmounting we have to wait for the list to
		 * drain completely.
		 *
		 * If we're not unmounting there's no guarantee the list
		 * will drain completely, but zreles run from the taskq
		 * may add the parents of dir-based xattrs to the taskq
		 * so we want to wait for these.
		 *
		 * We can safely read z_nr_znodes without locking because the
		 * VFS has already blocked operations which add to the
		 * z_all_znodes list and thus increment z_nr_znodes.
		 */
		int round = 0;
		while (zfsvfs->z_nr_znodes > 0) {
			taskq_wait_outstanding(dsl_pool_zrele_taskq(
			    dmu_objset_pool(zfsvfs->z_os)), 0);
			/* Non-unmount callers wait at most two passes. */
			if (++round > 1 && !unmounting)
				break;
		}
	}
	ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);

	if (!unmounting) {
		/*
		 * We purge the parent filesystem's vfsp as the parent
		 * filesystem and all of its snapshots have their vnode's
		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
		 * 'z_parent' is self referential for non-snapshots.
		 */
#ifdef FREEBSD_NAMECACHE
#if __FreeBSD_version >= 1300117
		cache_purgevfs(zfsvfs->z_parent->z_vfs);
#else
		cache_purgevfs(zfsvfs->z_parent->z_vfs, true);
#endif
#endif
	}

	/*
	 * Close the zil. NB: Can't close the zil while zfs_inactive
	 * threads are blocked as zil_close can call zfs_inactive.
	 */
	if (zfsvfs->z_log) {
		zil_close(zfsvfs->z_log);
		zfsvfs->z_log = NULL;
	}

	ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs);

	/*
	 * If we are not unmounting (ie: online recv) and someone already
	 * unmounted this file system while we were doing the switcheroo,
	 * or a reopen of z_os failed then just bail out now.
	 */
	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
		ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
		return (SET_ERROR(EIO));
	}

	/*
	 * At this point there are no vops active, and any new vops will
	 * fail with EIO since we have z_teardown_lock for writer (only
	 * relevant for forced unmount).
	 *
	 * Release all holds on dbufs.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
		if (zp->z_sa_hdl != NULL) {
			zfs_znode_dmu_fini(zp);
		}
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

	/*
	 * If we are unmounting, set the unmounted flag and let new vops
	 * unblock.  zfs_inactive will have the unmounted behavior, and all
	 * other vops will fail with EIO.
	 */
	if (unmounting) {
		zfsvfs->z_unmounted = B_TRUE;
		ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
	}

	/*
	 * z_os will be NULL if there was an error in attempting to reopen
	 * zfsvfs, so just return as the properties had already been
	 * unregistered and cached data had been evicted before.
	 */
	if (zfsvfs->z_os == NULL)
		return (0);

	/*
	 * Unregister properties.
	 */
	zfs_unregister_callbacks(zfsvfs);

	/*
	 * Evict cached data
	 */
	if (!zfs_is_readonly(zfsvfs))
		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
	dmu_objset_evict_dbufs(zfsvfs->z_os);
	dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
	dsl_dir_cancel_waiters(dd);

	return (0);
}
1659 
/*
 * VFS unmount entry point: check unmount privilege (or mount delegation),
 * unmount .zfs snapshots, flush vnodes, cancel the unlinked-drain task,
 * tear down the zfsvfs, disown the objset, and free everything.
 */
static int
zfs_umount(vfs_t *vfsp, int fflag)
{
	kthread_t *td = curthread;
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	objset_t *os;
	cred_t *cr = td->td_ucred;
	int ret;

	ret = secpolicy_fs_unmount(cr, vfsp);
	if (ret) {
		if (dsl_deleg_access((char *)vfsp->vfs_resource,
		    ZFS_DELEG_PERM_MOUNT, cr))
			return (ret);
	}

	/*
	 * Unmount any snapshots mounted under .zfs before unmounting the
	 * dataset itself.
	 */
	if (zfsvfs->z_ctldir != NULL) {
		if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
			return (ret);
	}

	if (fflag & MS_FORCE) {
		/*
		 * Mark file system as unmounted before calling
		 * vflush(FORCECLOSE). This way we ensure no future vnops
		 * will be called and risk operating on DOOMED vnodes.
		 */
		ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
		zfsvfs->z_unmounted = B_TRUE;
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
	}

	/*
	 * Flush all the files.
	 */
	ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
	if (ret != 0)
		return (ret);
	/* Cancel (or wait out) any pending unlinked-drain task. */
	while (taskqueue_cancel(zfsvfs_taskq->tq_queue,
	    &zfsvfs->z_unlinked_drain_task, NULL) != 0)
		taskqueue_drain(zfsvfs_taskq->tq_queue,
		    &zfsvfs->z_unlinked_drain_task);

	VERIFY0(zfsvfs_teardown(zfsvfs, B_TRUE));
	os = zfsvfs->z_os;

	/*
	 * z_os will be NULL if there was an error in
	 * attempting to reopen zfsvfs.
	 */
	if (os != NULL) {
		/*
		 * Unset the objset user_ptr.
		 */
		mutex_enter(&os->os_user_ptr_lock);
		dmu_objset_set_user(os, NULL);
		mutex_exit(&os->os_user_ptr_lock);

		/*
		 * Finally release the objset
		 */
		dmu_objset_disown(os, B_TRUE, zfsvfs);
	}

	/*
	 * We can now safely destroy the '.zfs' directory node.
	 */
	if (zfsvfs->z_ctldir != NULL)
		zfsctl_destroy(zfsvfs);
	zfs_freevfs(vfsp);

	return (0);
}
1737 
/*
 * VFS vget entry point: translate an inode number into a locked vnode.
 * Virtual .zfs entries are refused with EOPNOTSUPP so NFS falls back to
 * LOOKUP; unlinked znodes are rejected with EINVAL.
 */
static int
zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
{
	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
	znode_t		*zp;
	int 		err;

	/*
	 * zfs_zget() can't operate on virtual entries like .zfs/ or
	 * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
	 * This will make NFS to switch to LOOKUP instead of using VGET.
	 */
	if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
	    (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
		return (EOPNOTSUPP);

	if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
		return (err);
	err = zfs_zget(zfsvfs, ino, &zp);
	if (err == 0 && zp->z_unlinked) {
		vrele(ZTOV(zp));
		err = EINVAL;
	}
	if (err == 0)
		*vpp = ZTOV(zp);
	/* Lock the vnode only after dropping the teardown lock. */
	zfs_exit(zfsvfs, FTAG);
	if (err == 0) {
		err = vn_lock(*vpp, flags);
		if (err != 0)
			vrele(*vpp);
	}
	if (err != 0)
		*vpp = NULL;
	return (err);
}
1773 
/*
 * VFS checkexp entry point: validate an NFS export request.  The
 * signature changed in FreeBSD 1300098 (extflagsp/secflavors types),
 * hence the preprocessor-split prototype.
 */
static int
#if __FreeBSD_version >= 1300098
zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
    struct ucred **credanonp, int *numsecflavors, int *secflavors)
#else
zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
    struct ucred **credanonp, int *numsecflavors, int **secflavors)
#endif
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;

	/*
	 * If this is regular file system vfsp is the same as
	 * zfsvfs->z_parent->z_vfs, but if it is snapshot,
	 * zfsvfs->z_parent->z_vfs represents parent file system
	 * which we have to use here, because only this file system
	 * has mnt_export configured.
	 */
	return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
	    credanonp, numsecflavors, secflavors));
}
1795 
1796 _Static_assert(sizeof (struct fid) >= SHORT_FID_LEN,
1797 	"struct fid bigger than SHORT_FID_LEN");
1798 _Static_assert(sizeof (struct fid) >= LONG_FID_LEN,
1799 	"struct fid bigger than LONG_FID_LEN");
1800 
/*
 * VFS fhtovp entry point: translate an NFS file handle into a locked
 * vnode.  Long fids embed an objset id (for snapshots reached through
 * .zfs); short fids carry only object number and generation.
 */
static int
zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
{
	struct componentname cn;
	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
	znode_t		*zp;
	vnode_t		*dvp;
	uint64_t	object = 0;
	uint64_t	fid_gen = 0;
	uint64_t	setgen = 0;
	uint64_t	gen_mask;
	uint64_t	zp_gen;
	int 		i, err;

	*vpp = NULL;

	if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
		return (err);

	/*
	 * On FreeBSD we can get snapshot's mount point or its parent file
	 * system mount point depending if snapshot is already mounted or not.
	 */
	if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
		uint64_t	objsetid = 0;

		/* Decode the little-endian byte arrays from the long fid. */
		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);

		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);

		zfs_exit(zfsvfs, FTAG);

		/* Re-target 'zfsvfs' at the snapshot's own filesystem. */
		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
		if (err)
			return (SET_ERROR(EINVAL));
		if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
			return (err);
	}

	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
		zfid_short_t	*zfid = (zfid_short_t *)fidp;

		/* Note: 'i' is reused below to size gen_mask. */
		for (i = 0; i < sizeof (zfid->zf_object); i++)
			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);

		for (i = 0; i < sizeof (zfid->zf_gen); i++)
			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
	} else {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	if (fidp->fid_len == LONG_FID_LEN && setgen != 0) {
		zfs_exit(zfsvfs, FTAG);
		dprintf("snapdir fid: fid_gen (%llu) and setgen (%llu)\n",
		    (u_longlong_t)fid_gen, (u_longlong_t)setgen);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * A zero fid_gen means we are in .zfs or the .zfs/snapshot
	 * directory tree. If the object == zfsvfs->z_shares_dir, then
	 * we are in the .zfs/shares directory tree.
	 */
	if ((fid_gen == 0 &&
	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
	    (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
		zfs_exit(zfsvfs, FTAG);
		VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
		if (object == ZFSCTL_INO_SNAPDIR) {
			cn.cn_nameptr = "snapshot";
			cn.cn_namelen = strlen(cn.cn_nameptr);
			cn.cn_nameiop = LOOKUP;
			cn.cn_flags = ISLASTCN | LOCKLEAF;
			cn.cn_lkflags = flags;
			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
			vput(dvp);
		} else if (object == zfsvfs->z_shares_dir) {
			/*
			 * XXX This branch must not be taken,
			 * if it is, then the lookup below will
			 * explode.
			 */
			cn.cn_nameptr = "shares";
			cn.cn_namelen = strlen(cn.cn_nameptr);
			cn.cn_nameiop = LOOKUP;
			cn.cn_flags = ISLASTCN;
			cn.cn_lkflags = flags;
			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
			vput(dvp);
		} else {
			*vpp = dvp;
		}
		/* err is 0 here: any earlier failure returned above. */
		return (err);
	}

	/* 'i' is sizeof (zf_gen) after the decode loop above. */
	gen_mask = -1ULL >> (64 - 8 * i);

	dprintf("getting %llu [%llu mask %llx]\n", (u_longlong_t)object,
	    (u_longlong_t)fid_gen,
	    (u_longlong_t)gen_mask);
	if ((err = zfs_zget(zfsvfs, object, &zp))) {
		zfs_exit(zfsvfs, FTAG);
		return (err);
	}
	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
	    sizeof (uint64_t));
	zp_gen = zp_gen & gen_mask;
	/* Generation 0 is stored as 1 in the fid; normalize to match. */
	if (zp_gen == 0)
		zp_gen = 1;
	if (zp->z_unlinked || zp_gen != fid_gen) {
		dprintf("znode gen (%llu) != fid gen (%llu)\n",
		    (u_longlong_t)zp_gen, (u_longlong_t)fid_gen);
		vrele(ZTOV(zp));
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	*vpp = ZTOV(zp);
	zfs_exit(zfsvfs, FTAG);
	err = vn_lock(*vpp, flags);
	if (err == 0)
		vnode_create_vobject(*vpp, zp->z_size, curthread);
	else
		*vpp = NULL;
	return (err);
}
1931 
1932 /*
1933  * Block out VOPs and close zfsvfs_t::z_os
1934  *
1935  * Note, if successful, then we return with the 'z_teardown_lock' and
1936  * 'z_teardown_inactive_lock' write held.  We leave ownership of the underlying
1937  * dataset and objset intact so that they can be atomically handed off during
1938  * a subsequent rollback or recv operation and the resume thereafter.
1939  */
1940 int
1941 zfs_suspend_fs(zfsvfs_t *zfsvfs)
1942 {
1943 	int error;
1944 
1945 	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
1946 		return (error);
1947 
1948 	return (0);
1949 }
1950 
1951 /*
1952  * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
1953  * is an invariant across any of the operations that can be performed while the
1954  * filesystem was suspended.  Whether it succeeded or failed, the preconditions
1955  * are the same: the relevant objset and associated dataset are owned by
1956  * zfsvfs, held, and long held on entry.
1957  */
int
zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
{
	int err;
	znode_t *zp;

	ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
	ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));

	/*
	 * We already own this, so just update the objset_t, as the one we
	 * had before may have been evicted.
	 */
	objset_t *os;
	VERIFY3P(ds->ds_owner, ==, zfsvfs);
	VERIFY(dsl_dataset_long_held(ds));
	dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
	dsl_pool_config_enter(dp, FTAG);
	VERIFY0(dmu_objset_from_ds(ds, &os));
	dsl_pool_config_exit(dp, FTAG);

	err = zfsvfs_init(zfsvfs, os);
	if (err != 0)
		goto bail;

	ds->ds_dir->dd_activity_cancelled = B_FALSE;
	/* Not mounting: skip ZIL replay and unlinked-drain. */
	VERIFY0(zfsvfs_setup(zfsvfs, B_FALSE));

	zfs_set_fuid_feature(zfsvfs);

	/*
	 * Attempt to re-establish all the active znodes with
	 * their dbufs.  If a zfs_rezget() fails, then we'll let
	 * any potential callers discover that via zfs_enter_verify_zp
	 * when they try to use their znode.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp;
	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
		(void) zfs_rezget(zp);
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

bail:
	/* release the VOPs */
	ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
	ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);

	if (err) {
		/*
		 * Since we couldn't setup the sa framework, try to force
		 * unmount this file system.
		 */
		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
			vfs_ref(zfsvfs->z_vfs);
			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
		}
	}
	return (err);
}
2018 
/*
 * Final VFS cleanup: free the zfsvfs hung off vfs_data and drop the
 * global active-filesystem count bumped in zfs_domount().
 */
static void
zfs_freevfs(vfs_t *vfsp)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;

	zfsvfs_free(zfsvfs);

	atomic_dec_32(&zfs_active_fs_count);
}
2028 
2029 #ifdef __i386__
2030 static int desiredvnodes_backup;
2031 #include <sys/vmmeter.h>
2032 
2033 
2034 #include <vm/vm_page.h>
2035 #include <vm/vm_object.h>
2036 #include <vm/vm_kern.h>
2037 #include <vm/vm_map.h>
2038 #endif
2039 
/*
 * On i386 only: shrink the global desiredvnodes (unless the administrator
 * tuned it) to account for ZFS's larger per-vnode memory footprint.  The
 * original value is saved for zfs_vnodes_adjust_back().  No-op elsewhere.
 */
static void
zfs_vnodes_adjust(void)
{
#ifdef __i386__
	int newdesiredvnodes;

	desiredvnodes_backup = desiredvnodes;

	/*
	 * We calculate newdesiredvnodes the same way it is done in
	 * vntblinit(). If it is equal to desiredvnodes, it means that
	 * it wasn't tuned by the administrator and we can tune it down.
	 */
	newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 *
	    vm_kmem_size / (5 * (sizeof (struct vm_object) +
	    sizeof (struct vnode))));
	if (newdesiredvnodes == desiredvnodes)
		desiredvnodes = (3 * newdesiredvnodes) / 4;
#endif
}
2060 
/* Restore desiredvnodes to the value saved by zfs_vnodes_adjust(). */
static void
zfs_vnodes_adjust_back(void)
{

#ifdef __i386__
	desiredvnodes = desiredvnodes_backup;
#endif
}
2069 
/*
 * One-time ZPL initialization, run at module load.  The teardown
 * counterpart is zfs_fini().
 */
void
zfs_init(void)
{

	printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");

	/*
	 * Initialize .zfs directory structures
	 */
	zfsctl_init();

	/*
	 * Initialize znode cache, vnode ops, etc...
	 */
	zfs_znode_init();

	/*
	 * Reduce number of vnodes. Originally number of vnodes is calculated
	 * with UFS inode in mind. We reduce it here, because it's too big for
	 * ZFS/i386.
	 */
	zfs_vnodes_adjust();

	/* Register the callback that interprets ZPL object info for the DMU. */
	dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info);

	/* Create the single-threaded zfsvfs taskq. */
	zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0);
}
2097 
/* Tear down ZPL module state; undoes the work of zfs_init(). */
void
zfs_fini(void)
{
	taskq_destroy(zfsvfs_taskq);
	zfsctl_fini();
	zfs_znode_fini();
	/* Put desiredvnodes back the way zfs_vnodes_adjust() found it. */
	zfs_vnodes_adjust_back();
}
2106 
2107 int
2108 zfs_busy(void)
2109 {
2110 	return (zfs_active_fs_count != 0);
2111 }
2112 
2113 /*
2114  * Release VOPs and unmount a suspended filesystem.
2115  */
2116 int
2117 zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
2118 {
2119 	ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
2120 	ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));
2121 
2122 	/*
2123 	 * We already own this, so just hold and rele it to update the
2124 	 * objset_t, as the one we had before may have been evicted.
2125 	 */
2126 	objset_t *os;
2127 	VERIFY3P(ds->ds_owner, ==, zfsvfs);
2128 	VERIFY(dsl_dataset_long_held(ds));
2129 	dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
2130 	dsl_pool_config_enter(dp, FTAG);
2131 	VERIFY0(dmu_objset_from_ds(ds, &os));
2132 	dsl_pool_config_exit(dp, FTAG);
2133 	zfsvfs->z_os = os;
2134 
2135 	/* release the VOPs */
2136 	ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
2137 	ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
2138 
2139 	/*
2140 	 * Try to force unmount this file system.
2141 	 */
2142 	(void) zfs_umount(zfsvfs->z_vfs, 0);
2143 	zfsvfs->z_unmounted = B_TRUE;
2144 	return (0);
2145 }
2146 
2147 int
2148 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
2149 {
2150 	int error;
2151 	objset_t *os = zfsvfs->z_os;
2152 	dmu_tx_t *tx;
2153 
2154 	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
2155 		return (SET_ERROR(EINVAL));
2156 
2157 	if (newvers < zfsvfs->z_version)
2158 		return (SET_ERROR(EINVAL));
2159 
2160 	if (zfs_spa_version_map(newvers) >
2161 	    spa_version(dmu_objset_spa(zfsvfs->z_os)))
2162 		return (SET_ERROR(ENOTSUP));
2163 
2164 	tx = dmu_tx_create(os);
2165 	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
2166 	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2167 		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
2168 		    ZFS_SA_ATTRS);
2169 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2170 	}
2171 	error = dmu_tx_assign(tx, TXG_WAIT);
2172 	if (error) {
2173 		dmu_tx_abort(tx);
2174 		return (error);
2175 	}
2176 
2177 	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
2178 	    8, 1, &newvers, tx);
2179 
2180 	if (error) {
2181 		dmu_tx_commit(tx);
2182 		return (error);
2183 	}
2184 
2185 	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2186 		uint64_t sa_obj;
2187 
2188 		ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
2189 		    SPA_VERSION_SA);
2190 		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
2191 		    DMU_OT_NONE, 0, tx);
2192 
2193 		error = zap_add(os, MASTER_NODE_OBJ,
2194 		    ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
2195 		ASSERT0(error);
2196 
2197 		VERIFY0(sa_set_sa_object(os, sa_obj));
2198 		sa_register_update_callback(os, zfs_sa_upgrade);
2199 	}
2200 
2201 	spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
2202 	    "from %ju to %ju", (uintmax_t)zfsvfs->z_version,
2203 	    (uintmax_t)newvers);
2204 	dmu_tx_commit(tx);
2205 
2206 	zfsvfs->z_version = newvers;
2207 	os->os_version = newvers;
2208 
2209 	zfs_set_fuid_feature(zfsvfs);
2210 
2211 	return (0);
2212 }
2213 
2214 /*
2215  * Read a property stored within the master node.
2216  */
2217 int
2218 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
2219 {
2220 	uint64_t *cached_copy = NULL;
2221 
2222 	/*
2223 	 * Figure out where in the objset_t the cached copy would live, if it
2224 	 * is available for the requested property.
2225 	 */
2226 	if (os != NULL) {
2227 		switch (prop) {
2228 		case ZFS_PROP_VERSION:
2229 			cached_copy = &os->os_version;
2230 			break;
2231 		case ZFS_PROP_NORMALIZE:
2232 			cached_copy = &os->os_normalization;
2233 			break;
2234 		case ZFS_PROP_UTF8ONLY:
2235 			cached_copy = &os->os_utf8only;
2236 			break;
2237 		case ZFS_PROP_CASE:
2238 			cached_copy = &os->os_casesensitivity;
2239 			break;
2240 		default:
2241 			break;
2242 		}
2243 	}
2244 	if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
2245 		*value = *cached_copy;
2246 		return (0);
2247 	}
2248 
2249 	/*
2250 	 * If the property wasn't cached, look up the file system's value for
2251 	 * the property. For the version property, we look up a slightly
2252 	 * different string.
2253 	 */
2254 	const char *pname;
2255 	int error = ENOENT;
2256 	if (prop == ZFS_PROP_VERSION) {
2257 		pname = ZPL_VERSION_STR;
2258 	} else {
2259 		pname = zfs_prop_to_name(prop);
2260 	}
2261 
2262 	if (os != NULL) {
2263 		ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
2264 		error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
2265 	}
2266 
2267 	if (error == ENOENT) {
2268 		/* No value set, use the default value */
2269 		switch (prop) {
2270 		case ZFS_PROP_VERSION:
2271 			*value = ZPL_VERSION;
2272 			break;
2273 		case ZFS_PROP_NORMALIZE:
2274 		case ZFS_PROP_UTF8ONLY:
2275 			*value = 0;
2276 			break;
2277 		case ZFS_PROP_CASE:
2278 			*value = ZFS_CASE_SENSITIVE;
2279 			break;
2280 		case ZFS_PROP_ACLTYPE:
2281 			*value = ZFS_ACLTYPE_NFSV4;
2282 			break;
2283 		default:
2284 			return (error);
2285 		}
2286 		error = 0;
2287 	}
2288 
2289 	/*
2290 	 * If one of the methods for getting the property value above worked,
2291 	 * copy it into the objset_t's cache.
2292 	 */
2293 	if (error == 0 && cached_copy != NULL) {
2294 		*cached_copy = *value;
2295 	}
2296 
2297 	return (error);
2298 }
2299 
2300 /*
2301  * Return true if the corresponding vfs's unmounted flag is set.
2302  * Otherwise return false.
2303  * If this function returns true we know VFS unmount has been initiated.
2304  */
2305 boolean_t
2306 zfs_get_vfs_flag_unmounted(objset_t *os)
2307 {
2308 	zfsvfs_t *zfvp;
2309 	boolean_t unmounted = B_FALSE;
2310 
2311 	ASSERT3U(dmu_objset_type(os), ==, DMU_OST_ZFS);
2312 
2313 	mutex_enter(&os->os_user_ptr_lock);
2314 	zfvp = dmu_objset_get_user(os);
2315 	if (zfvp != NULL && zfvp->z_vfs != NULL &&
2316 	    (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT))
2317 		unmounted = B_TRUE;
2318 	mutex_exit(&os->os_user_ptr_lock);
2319 
2320 	return (unmounted);
2321 }
2322 
#ifdef _KERNEL
/*
 * Rewrite the f_mntfromname of every mounted filesystem whose "from" name
 * is exactly oldname, or begins with oldname followed by '/' or '@',
 * substituting newname for the oldname prefix.  The whole scan runs under
 * mountlist_mtx.
 */
void
zfsvfs_update_fromname(const char *oldname, const char *newname)
{
	char tmpbuf[MAXPATHLEN];
	struct mount *mp;
	char *fromname;
	size_t oldlen;

	oldlen = strlen(oldname);

	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		fromname = mp->mnt_stat.f_mntfromname;
		/* Exact match: replace the entire name. */
		if (strcmp(fromname, oldname) == 0) {
			(void) strlcpy(fromname, newname,
			    sizeof (mp->mnt_stat.f_mntfromname));
			continue;
		}
		/*
		 * Prefix match on a component boundary ('/' child or '@'
		 * snapshot): splice newname in front of the remainder.
		 */
		if (strncmp(fromname, oldname, oldlen) == 0 &&
		    (fromname[oldlen] == '/' || fromname[oldlen] == '@')) {
			(void) snprintf(tmpbuf, sizeof (tmpbuf), "%s%s",
			    newname, fromname + oldlen);
			(void) strlcpy(fromname, tmpbuf,
			    sizeof (mp->mnt_stat.f_mntfromname));
			continue;
		}
	}
	mtx_unlock(&mountlist_mtx);
}
#endif
2354 
2355 /*
2356  * Find a prison with ZFS info.
2357  * Return the ZFS info and the (locked) prison.
2358  */
2359 static struct zfs_jailparam *
2360 zfs_jailparam_find(struct prison *spr, struct prison **prp)
2361 {
2362 	struct prison *pr;
2363 	struct zfs_jailparam *zjp;
2364 
2365 	for (pr = spr; ; pr = pr->pr_parent) {
2366 		mtx_lock(&pr->pr_mtx);
2367 		if (pr == &prison0) {
2368 			zjp = &zfs_jailparam0;
2369 			break;
2370 		}
2371 		zjp = osd_jail_get(pr, zfs_jailparam_slot);
2372 		if (zjp != NULL)
2373 			break;
2374 		mtx_unlock(&pr->pr_mtx);
2375 	}
2376 	*prp = pr;
2377 
2378 	return (zjp);
2379 }
2380 
2381 /*
2382  * Ensure a prison has its own ZFS info.  If zjpp is non-null, point it to the
2383  * ZFS info and lock the prison.
2384  */
2385 static void
2386 zfs_jailparam_alloc(struct prison *pr, struct zfs_jailparam **zjpp)
2387 {
2388 	struct prison *ppr;
2389 	struct zfs_jailparam *zjp, *nzjp;
2390 	void **rsv;
2391 
2392 	/* If this prison already has ZFS info, return that. */
2393 	zjp = zfs_jailparam_find(pr, &ppr);
2394 	if (ppr == pr)
2395 		goto done;
2396 
2397 	/*
2398 	 * Allocate a new info record.  Then check again, in case something
2399 	 * changed during the allocation.
2400 	 */
2401 	mtx_unlock(&ppr->pr_mtx);
2402 	nzjp = malloc(sizeof (struct zfs_jailparam), M_PRISON, M_WAITOK);
2403 	rsv = osd_reserve(zfs_jailparam_slot);
2404 	zjp = zfs_jailparam_find(pr, &ppr);
2405 	if (ppr == pr) {
2406 		free(nzjp, M_PRISON);
2407 		osd_free_reserved(rsv);
2408 		goto done;
2409 	}
2410 	/* Inherit the initial values from the ancestor. */
2411 	mtx_lock(&pr->pr_mtx);
2412 	(void) osd_jail_set_reserved(pr, zfs_jailparam_slot, rsv, nzjp);
2413 	(void) memcpy(nzjp, zjp, sizeof (*zjp));
2414 	zjp = nzjp;
2415 	mtx_unlock(&ppr->pr_mtx);
2416 done:
2417 	if (zjpp != NULL)
2418 		*zjpp = zjp;
2419 	else
2420 		mtx_unlock(&pr->pr_mtx);
2421 }
2422 
2423 /*
2424  * Jail OSD methods for ZFS VFS info.
2425  */
2426 static int
2427 zfs_jailparam_create(void *obj, void *data)
2428 {
2429 	struct prison *pr = obj;
2430 	struct vfsoptlist *opts = data;
2431 	int jsys;
2432 
2433 	if (vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys)) == 0 &&
2434 	    jsys == JAIL_SYS_INHERIT)
2435 		return (0);
2436 	/*
2437 	 * Inherit a prison's initial values from its parent
2438 	 * (different from JAIL_SYS_INHERIT which also inherits changes).
2439 	 */
2440 	zfs_jailparam_alloc(pr, NULL);
2441 	return (0);
2442 }
2443 
/*
 * OSD get method: report the prison's ZFS parameters for jail_get(2).
 * "zfs" reads as "new" when the prison has its own info, "inherit" when
 * it tracks an ancestor's.
 */
static int
zfs_jailparam_get(void *obj, void *data)
{
	struct prison *ppr, *pr = obj;
	struct vfsoptlist *opts = data;
	struct zfs_jailparam *zjp;
	int jsys, error;

	/* zfs_jailparam_find() returns with ppr's mutex held. */
	zjp = zfs_jailparam_find(pr, &ppr);
	jsys = (ppr == pr) ? JAIL_SYS_NEW : JAIL_SYS_INHERIT;
	error = vfs_setopt(opts, "zfs", &jsys, sizeof (jsys));
	if (error != 0 && error != ENOENT)
		goto done;
	if (jsys == JAIL_SYS_NEW) {
		error = vfs_setopt(opts, "zfs.mount_snapshot",
		    &zjp->mount_snapshot, sizeof (zjp->mount_snapshot));
		if (error != 0 && error != ENOENT)
			goto done;
	} else {
		/*
		 * If this prison is inheriting its ZFS info, report
		 * empty/zero parameters.
		 */
		static int mount_snapshot = 0;

		error = vfs_setopt(opts, "zfs.mount_snapshot",
		    &mount_snapshot, sizeof (mount_snapshot));
		if (error != 0 && error != ENOENT)
			goto done;
	}
	/* ENOENT from vfs_setopt (option not requested) is not an error. */
	error = 0;
done:
	mtx_unlock(&ppr->pr_mtx);
	return (error);
}
2479 
/*
 * OSD set method: apply "zfs" / "zfs.mount_snapshot" from jail_set(2).
 * Parameter values were already validated by zfs_jailparam_check().
 */
static int
zfs_jailparam_set(void *obj, void *data)
{
	struct prison *pr = obj;
	struct prison *ppr;
	struct vfsoptlist *opts = data;
	int error, jsys, mount_snapshot;

	/* Set the parameters, which should be correct. */
	error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys));
	if (error == ENOENT)
		jsys = -1;
	error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot,
	    sizeof (mount_snapshot));
	if (error == ENOENT)
		mount_snapshot = -1;
	else
		/* Setting any "zfs.*" parameter implies "zfs=new". */
		jsys = JAIL_SYS_NEW;
	switch (jsys) {
	case JAIL_SYS_NEW:
	{
		/* "zfs=new" or "zfs.*": the prison gets its own ZFS info. */
		struct zfs_jailparam *zjp;

		/*
		 * A child jail cannot have more permissions than its parent
		 */
		if (pr->pr_parent != &prison0) {
			zjp = zfs_jailparam_find(pr->pr_parent, &ppr);
			mtx_unlock(&ppr->pr_mtx);
			if (zjp->mount_snapshot < mount_snapshot) {
				return (EPERM);
			}
		}
		/* zfs_jailparam_alloc() returns with pr's mutex held. */
		zfs_jailparam_alloc(pr, &zjp);
		if (mount_snapshot != -1)
			zjp->mount_snapshot = mount_snapshot;
		mtx_unlock(&pr->pr_mtx);
		break;
	}
	case JAIL_SYS_INHERIT:
		/* "zfs=inherit": inherit the parent's ZFS info. */
		mtx_lock(&pr->pr_mtx);
		osd_jail_del(pr, zfs_jailparam_slot);
		mtx_unlock(&pr->pr_mtx);
		break;
	case -1:
		/*
		 * If the setting being changed is not ZFS related
		 * then do nothing.
		 */
		break;
	}

	return (0);
}
2536 
2537 static int
2538 zfs_jailparam_check(void *obj __unused, void *data)
2539 {
2540 	struct vfsoptlist *opts = data;
2541 	int error, jsys, mount_snapshot;
2542 
2543 	/* Check that the parameters are correct. */
2544 	error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys));
2545 	if (error != ENOENT) {
2546 		if (error != 0)
2547 			return (error);
2548 		if (jsys != JAIL_SYS_NEW && jsys != JAIL_SYS_INHERIT)
2549 			return (EINVAL);
2550 	}
2551 	error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot,
2552 	    sizeof (mount_snapshot));
2553 	if (error != ENOENT) {
2554 		if (error != 0)
2555 			return (error);
2556 		if (mount_snapshot != 0 && mount_snapshot != 1)
2557 			return (EINVAL);
2558 	}
2559 	return (0);
2560 }
2561 
/* OSD destructor: free a prison's ZFS info record. */
static void
zfs_jailparam_destroy(void *data)
{

	free(data, M_PRISON);
}
2568 
/*
 * Register the "zfs" jail OSD slot with its method table, then seed every
 * pre-existing prison with its own copy of the default parameters.
 */
static void
zfs_jailparam_sysinit(void *arg __unused)
{
	struct prison *pr;
	osd_method_t  methods[PR_MAXMETHOD] = {
		[PR_METHOD_CREATE] = zfs_jailparam_create,
		[PR_METHOD_GET] = zfs_jailparam_get,
		[PR_METHOD_SET] = zfs_jailparam_set,
		[PR_METHOD_CHECK] = zfs_jailparam_check,
	};

	zfs_jailparam_slot = osd_jail_register(zfs_jailparam_destroy, methods);
	/* Copy the defaults to any existing prisons. */
	sx_slock(&allprison_lock);
	TAILQ_FOREACH(pr, &allprison, pr_list)
		zfs_jailparam_alloc(pr, NULL);
	sx_sunlock(&allprison_lock);
}
2587 
/* Undo zfs_jailparam_sysinit(): deregister the jail OSD slot. */
static void
zfs_jailparam_sysuninit(void *arg __unused)
{

	osd_jail_deregister(zfs_jailparam_slot);
}
2594 
/* Hook jail-parameter setup/teardown into kernel init and module unload. */
SYSINIT(zfs_jailparam_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY,
	zfs_jailparam_sysinit, NULL);
SYSUNINIT(zfs_jailparam_sysuninit, SI_SUB_DRIVERS, SI_ORDER_ANY,
	zfs_jailparam_sysuninit, NULL);
2599