xref: /freebsd-src/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c (revision 0a7e5f1f02aad2ff5fff1c60f44c6975fd07e1d9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
24  * All rights reserved.
25  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
26  * Copyright (c) 2014 Integros [integros.com]
27  * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
28  */
29 
30 /* Portions Copyright 2010 Robert Milkowski */
31 
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/kernel.h>
36 #include <sys/sysmacros.h>
37 #include <sys/kmem.h>
38 #include <sys/acl.h>
39 #include <sys/vnode.h>
40 #include <sys/vfs.h>
41 #include <sys/mntent.h>
42 #include <sys/mount.h>
43 #include <sys/cmn_err.h>
44 #include <sys/zfs_znode.h>
45 #include <sys/zfs_vnops.h>
46 #include <sys/zfs_dir.h>
47 #include <sys/zil.h>
48 #include <sys/fs/zfs.h>
49 #include <sys/dmu.h>
50 #include <sys/dsl_prop.h>
51 #include <sys/dsl_dataset.h>
52 #include <sys/dsl_deleg.h>
53 #include <sys/spa.h>
54 #include <sys/zap.h>
55 #include <sys/sa.h>
56 #include <sys/sa_impl.h>
57 #include <sys/policy.h>
58 #include <sys/atomic.h>
59 #include <sys/zfs_ioctl.h>
60 #include <sys/zfs_ctldir.h>
61 #include <sys/zfs_fuid.h>
62 #include <sys/sunddi.h>
63 #include <sys/dmu_objset.h>
64 #include <sys/dsl_dir.h>
65 #include <sys/jail.h>
66 #include <sys/osd.h>
67 #include <ufs/ufs/quota.h>
68 #include <sys/zfs_quota.h>
69 
70 #include "zfs_comutil.h"
71 
72 #ifndef	MNTK_VMSETSIZE_BUG
73 #define	MNTK_VMSETSIZE_BUG	0
74 #endif
75 #ifndef	MNTK_NOMSYNC
76 #define	MNTK_NOMSYNC	8
77 #endif
78 
/*
 * Mutex initialized through MTX_SYSINIT as a default (MTX_DEF) mutex
 * named "zfs_debug".
 * NOTE(review): what it protects is not visible in this part of the file.
 */
79 struct mtx zfs_debug_mtx;
80 MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
81 
/* Root of the vfs.zfs sysctl tree; the knobs below hang off this node. */
82 SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
83 
/* vfs.zfs.super_owner: read-write integer knob, initially 0. */
84 int zfs_super_owner;
85 SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
86 	"File system owners can perform privileged operation on file systems");
87 
/*
 * vfs.zfs.debug: read-write integer debug level; CTLFLAG_RWTUN also makes
 * it settable as a tunable.
 */
88 int zfs_debug_level;
89 SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
90 	"Debug level");
91 
/* Per-jail ZFS settings; currently a single integer flag. */
92 struct zfs_jailparam {
93 	int mount_snapshot;
94 };
95 
/* Default parameter set: snapshot mounting is disabled (0). */
96 static struct zfs_jailparam zfs_jailparam0 = {
97 	.mount_snapshot = 0,
98 };
99 
/*
 * NOTE(review): presumably the per-jail OSD slot under which a
 * struct zfs_jailparam is stored; it is assigned outside this part of
 * the file -- confirm.
 */
100 static int zfs_jailparam_slot;
101 
/* Declares the "zfs" jail parameter node and its "mount_snapshot" leaf. */
102 SYSCTL_JAIL_PARAM_SYS_NODE(zfs, CTLFLAG_RW, "Jail ZFS parameters");
103 SYSCTL_JAIL_PARAM(_zfs, mount_snapshot, CTLTYPE_INT | CTLFLAG_RW, "I",
104 	"Allow mounting snapshots in the .zfs directory for unjailed datasets");
105 
/*
 * vfs.zfs.version.*: read-only sysctls exporting the compile-time
 * ZFS_ACL_VERSION, SPA_VERSION and ZPL_VERSION constants.  Each constant
 * is copied into a static int because SYSCTL_INT needs an address.
 */
106 SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
107 static int zfs_version_acl = ZFS_ACL_VERSION;
108 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
109 	"ZFS_ACL_VERSION");
110 static int zfs_version_spa = SPA_VERSION;
111 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
112 	"SPA_VERSION");
113 static int zfs_version_zpl = ZPL_VERSION;
114 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
115 	"ZPL_VERSION");
116 
117 #if __FreeBSD_version >= 1400018
118 static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg,
119     bool *mp_busy);
120 #else
121 static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg);
122 #endif
123 static int zfs_mount(vfs_t *vfsp);
124 static int zfs_umount(vfs_t *vfsp, int fflag);
125 static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
126 static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
127 static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
128 static int zfs_sync(vfs_t *vfsp, int waitfor);
129 static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
130     struct ucred **credanonp, int *numsecflavors, int *secflavors);
131 static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
132 static void zfs_freevfs(vfs_t *vfsp);
133 
/*
 * VFS operations vector for ZFS.  The root vnode lookup goes through
 * vfs_cache_root, with zfs_root installed as the vfs_cachedroot handler.
 */
134 struct vfsops zfs_vfsops = {
135 	.vfs_mount =		zfs_mount,
136 	.vfs_unmount =		zfs_umount,
137 	.vfs_root =		vfs_cache_root,
138 	.vfs_cachedroot =	zfs_root,
139 	.vfs_statfs =		zfs_statfs,
140 	.vfs_vget =		zfs_vget,
141 	.vfs_sync =		zfs_sync,
142 	.vfs_checkexp =		zfs_checkexp,
143 	.vfs_fhtovp =		zfs_fhtovp,
144 	.vfs_quotactl =		zfs_quotactl,
145 };
146 
/*
 * Register the "zfs" filesystem type.  VFCF_CROSS_COPY_FILE_RANGE is added
 * to the flags only when the kernel headers define it.
 */
147 #ifdef VFCF_CROSS_COPY_FILE_RANGE
148 VFS_SET(zfs_vfsops, zfs,
149     VFCF_DELEGADMIN | VFCF_JAIL | VFCF_CROSS_COPY_FILE_RANGE);
150 #else
151 VFS_SET(zfs_vfsops, zfs, VFCF_DELEGADMIN | VFCF_JAIL);
152 #endif
153 
154 /*
155  * We need to keep a count of active fs's.
156  * This is necessary to prevent our module
157  * from being unloaded after a umount -f
158  */
159 static uint32_t	zfs_active_fs_count = 0;
160 
161 int
162 zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
163     char *setpoint)
164 {
165 	int error;
166 	zfsvfs_t *zfvp;
167 	vfs_t *vfsp;
168 	objset_t *os;
169 	uint64_t tmp = *val;
170 
171 	error = dmu_objset_from_ds(ds, &os);
172 	if (error != 0)
173 		return (error);
174 
175 	error = getzfsvfs_impl(os, &zfvp);
176 	if (error != 0)
177 		return (error);
178 	if (zfvp == NULL)
179 		return (ENOENT);
180 	vfsp = zfvp->z_vfs;
181 	switch (zfs_prop) {
182 	case ZFS_PROP_ATIME:
183 		if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
184 			tmp = 0;
185 		if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
186 			tmp = 1;
187 		break;
188 	case ZFS_PROP_DEVICES:
189 		if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
190 			tmp = 0;
191 		if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
192 			tmp = 1;
193 		break;
194 	case ZFS_PROP_EXEC:
195 		if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
196 			tmp = 0;
197 		if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
198 			tmp = 1;
199 		break;
200 	case ZFS_PROP_SETUID:
201 		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
202 			tmp = 0;
203 		if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
204 			tmp = 1;
205 		break;
206 	case ZFS_PROP_READONLY:
207 		if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
208 			tmp = 0;
209 		if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
210 			tmp = 1;
211 		break;
212 	case ZFS_PROP_XATTR:
213 		if (zfvp->z_flags & ZSB_XATTR)
214 			tmp = zfvp->z_xattr;
215 		break;
216 	case ZFS_PROP_NBMAND:
217 		if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
218 			tmp = 0;
219 		if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
220 			tmp = 1;
221 		break;
222 	default:
223 		vfs_unbusy(vfsp);
224 		return (ENOENT);
225 	}
226 
227 	vfs_unbusy(vfsp);
228 	if (tmp != *val) {
229 		if (setpoint)
230 			(void) strcpy(setpoint, "temporary");
231 		*val = tmp;
232 	}
233 	return (0);
234 }
235 
236 static int
237 zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp)
238 {
239 	int error = 0;
240 	char buf[32];
241 	uint64_t usedobj, quotaobj;
242 	uint64_t quota, used = 0;
243 	timespec_t now;
244 
245 	usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
246 	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
247 
248 	if (quotaobj == 0 || zfsvfs->z_replay) {
249 		error = ENOENT;
250 		goto done;
251 	}
252 	(void) sprintf(buf, "%llx", (longlong_t)id);
253 	if ((error = zap_lookup(zfsvfs->z_os, quotaobj,
254 	    buf, sizeof (quota), 1, &quota)) != 0) {
255 		dprintf("%s(%d): quotaobj lookup failed\n",
256 		    __FUNCTION__, __LINE__);
257 		goto done;
258 	}
259 	/*
260 	 * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit".
261 	 * So we set them to be the same.
262 	 */
263 	dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota);
264 	error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used);
265 	if (error && error != ENOENT) {
266 		dprintf("%s(%d):  usedobj failed; %d\n",
267 		    __FUNCTION__, __LINE__, error);
268 		goto done;
269 	}
270 	dqp->dqb_curblocks = btodb(used);
271 	dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0;
272 	vfs_timestamp(&now);
273 	/*
274 	 * Setting this to 0 causes FreeBSD quota(8) to print
275 	 * the number of days since the epoch, which isn't
276 	 * particularly useful.
277 	 */
278 	dqp->dqb_btime = dqp->dqb_itime = now.tv_sec;
279 done:
280 	return (error);
281 }
282 
283 static int
284 #if __FreeBSD_version >= 1400018
285 zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg, bool *mp_busy)
286 #else
287 zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg)
288 #endif
289 {
290 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
291 	struct thread *td;
292 	int cmd, type, error = 0;
293 	int bitsize;
294 	zfs_userquota_prop_t quota_type;
295 	struct dqblk64 dqblk = { 0 };
296 
297 	td = curthread;
298 	cmd = cmds >> SUBCMDSHIFT;
299 	type = cmds & SUBCMDMASK;
300 
301 	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
302 		return (error);
303 	if (id == -1) {
304 		switch (type) {
305 		case USRQUOTA:
306 			id = td->td_ucred->cr_ruid;
307 			break;
308 		case GRPQUOTA:
309 			id = td->td_ucred->cr_rgid;
310 			break;
311 		default:
312 			error = EINVAL;
313 #if __FreeBSD_version < 1400018
314 			if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF)
315 				vfs_unbusy(vfsp);
316 #endif
317 			goto done;
318 		}
319 	}
320 	/*
321 	 * Map BSD type to:
322 	 * ZFS_PROP_USERUSED,
323 	 * ZFS_PROP_USERQUOTA,
324 	 * ZFS_PROP_GROUPUSED,
325 	 * ZFS_PROP_GROUPQUOTA
326 	 */
327 	switch (cmd) {
328 	case Q_SETQUOTA:
329 	case Q_SETQUOTA32:
330 		if (type == USRQUOTA)
331 			quota_type = ZFS_PROP_USERQUOTA;
332 		else if (type == GRPQUOTA)
333 			quota_type = ZFS_PROP_GROUPQUOTA;
334 		else
335 			error = EINVAL;
336 		break;
337 	case Q_GETQUOTA:
338 	case Q_GETQUOTA32:
339 		if (type == USRQUOTA)
340 			quota_type = ZFS_PROP_USERUSED;
341 		else if (type == GRPQUOTA)
342 			quota_type = ZFS_PROP_GROUPUSED;
343 		else
344 			error = EINVAL;
345 		break;
346 	}
347 
348 	/*
349 	 * Depending on the cmd, we may need to get
350 	 * the ruid and domain (see fuidstr_to_sid?),
351 	 * the fuid (how?), or other information.
352 	 * Create fuid using zfs_fuid_create(zfsvfs, id,
353 	 * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)?
354 	 * I think I can use just the id?
355 	 *
356 	 * Look at zfs_id_overquota() to look up a quota.
357 	 * zap_lookup(something, quotaobj, fuidstring,
358 	 *     sizeof (long long), 1, &quota)
359 	 *
360 	 * See zfs_set_userquota() to set a quota.
361 	 */
362 	if ((uint32_t)type >= MAXQUOTAS) {
363 		error = EINVAL;
364 		goto done;
365 	}
366 
367 	switch (cmd) {
368 	case Q_GETQUOTASIZE:
369 		bitsize = 64;
370 		error = copyout(&bitsize, arg, sizeof (int));
371 		break;
372 	case Q_QUOTAON:
373 		// As far as I can tell, you can't turn quotas on or off on zfs
374 		error = 0;
375 #if __FreeBSD_version < 1400018
376 		vfs_unbusy(vfsp);
377 #endif
378 		break;
379 	case Q_QUOTAOFF:
380 		error = ENOTSUP;
381 #if __FreeBSD_version < 1400018
382 		vfs_unbusy(vfsp);
383 #endif
384 		break;
385 	case Q_SETQUOTA:
386 		error = copyin(arg, &dqblk, sizeof (dqblk));
387 		if (error == 0)
388 			error = zfs_set_userquota(zfsvfs, quota_type,
389 			    "", id, dbtob(dqblk.dqb_bhardlimit));
390 		break;
391 	case Q_GETQUOTA:
392 		error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk);
393 		if (error == 0)
394 			error = copyout(&dqblk, arg, sizeof (dqblk));
395 		break;
396 	default:
397 		error = EINVAL;
398 		break;
399 	}
400 done:
401 	zfs_exit(zfsvfs, FTAG);
402 	return (error);
403 }
404 
405 
406 boolean_t
407 zfs_is_readonly(zfsvfs_t *zfsvfs)
408 {
409 	return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY));
410 }
411 
412 static int
413 zfs_sync(vfs_t *vfsp, int waitfor)
414 {
415 
416 	/*
417 	 * Data integrity is job one.  We don't want a compromised kernel
418 	 * writing to the storage pool, so we never sync during panic.
419 	 */
420 	if (panicstr)
421 		return (0);
422 
423 	/*
424 	 * Ignore the system syncher.  ZFS already commits async data
425 	 * at zfs_txg_timeout intervals.
426 	 */
427 	if (waitfor == MNT_LAZY)
428 		return (0);
429 
430 	if (vfsp != NULL) {
431 		/*
432 		 * Sync a specific filesystem.
433 		 */
434 		zfsvfs_t *zfsvfs = vfsp->vfs_data;
435 		dsl_pool_t *dp;
436 		int error;
437 
438 		if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
439 			return (error);
440 		dp = dmu_objset_pool(zfsvfs->z_os);
441 
442 		/*
443 		 * If the system is shutting down, then skip any
444 		 * filesystems which may exist on a suspended pool.
445 		 */
446 		if (rebooting && spa_suspended(dp->dp_spa)) {
447 			zfs_exit(zfsvfs, FTAG);
448 			return (0);
449 		}
450 
451 		if (zfsvfs->z_log != NULL)
452 			zil_commit(zfsvfs->z_log, 0);
453 
454 		zfs_exit(zfsvfs, FTAG);
455 	} else {
456 		/*
457 		 * Sync all ZFS filesystems.  This is what happens when you
458 		 * run sync(8).  Unlike other filesystems, ZFS honors the
459 		 * request by waiting for all pools to commit all dirty data.
460 		 */
461 		spa_sync_allpools();
462 	}
463 
464 	return (0);
465 }
466 
467 static void
468 atime_changed_cb(void *arg, uint64_t newval)
469 {
470 	zfsvfs_t *zfsvfs = arg;
471 
472 	if (newval == TRUE) {
473 		zfsvfs->z_atime = TRUE;
474 		zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
475 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
476 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
477 	} else {
478 		zfsvfs->z_atime = FALSE;
479 		zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
480 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
481 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
482 	}
483 }
484 
485 static void
486 xattr_changed_cb(void *arg, uint64_t newval)
487 {
488 	zfsvfs_t *zfsvfs = arg;
489 
490 	if (newval == ZFS_XATTR_OFF) {
491 		zfsvfs->z_flags &= ~ZSB_XATTR;
492 	} else {
493 		zfsvfs->z_flags |= ZSB_XATTR;
494 
495 		if (newval == ZFS_XATTR_SA)
496 			zfsvfs->z_xattr_sa = B_TRUE;
497 		else
498 			zfsvfs->z_xattr_sa = B_FALSE;
499 	}
500 }
501 
502 static void
503 blksz_changed_cb(void *arg, uint64_t newval)
504 {
505 	zfsvfs_t *zfsvfs = arg;
506 	ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
507 	ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
508 	ASSERT(ISP2(newval));
509 
510 	zfsvfs->z_max_blksz = newval;
511 	zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
512 }
513 
514 static void
515 readonly_changed_cb(void *arg, uint64_t newval)
516 {
517 	zfsvfs_t *zfsvfs = arg;
518 
519 	if (newval) {
520 		/* XXX locking on vfs_flag? */
521 		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
522 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
523 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
524 	} else {
525 		/* XXX locking on vfs_flag? */
526 		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
527 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
528 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
529 	}
530 }
531 
532 static void
533 setuid_changed_cb(void *arg, uint64_t newval)
534 {
535 	zfsvfs_t *zfsvfs = arg;
536 
537 	if (newval == FALSE) {
538 		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
539 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
540 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
541 	} else {
542 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
543 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
544 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
545 	}
546 }
547 
548 static void
549 exec_changed_cb(void *arg, uint64_t newval)
550 {
551 	zfsvfs_t *zfsvfs = arg;
552 
553 	if (newval == FALSE) {
554 		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
555 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
556 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
557 	} else {
558 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
559 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
560 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
561 	}
562 }
563 
564 /*
565  * The nbmand mount option can be changed at mount time.
566  * We can't allow it to be toggled on live file systems or incorrect
567  * behavior may be seen from cifs clients
568  *
569  * This property isn't registered via dsl_prop_register(), but this callback
570  * will be called when a file system is first mounted
571  */
572 static void
573 nbmand_changed_cb(void *arg, uint64_t newval)
574 {
575 	zfsvfs_t *zfsvfs = arg;
576 	if (newval == FALSE) {
577 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
578 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
579 	} else {
580 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
581 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
582 	}
583 }
584 
585 static void
586 snapdir_changed_cb(void *arg, uint64_t newval)
587 {
588 	zfsvfs_t *zfsvfs = arg;
589 
590 	zfsvfs->z_show_ctldir = newval;
591 }
592 
593 static void
594 acl_mode_changed_cb(void *arg, uint64_t newval)
595 {
596 	zfsvfs_t *zfsvfs = arg;
597 
598 	zfsvfs->z_acl_mode = newval;
599 }
600 
601 static void
602 acl_inherit_changed_cb(void *arg, uint64_t newval)
603 {
604 	zfsvfs_t *zfsvfs = arg;
605 
606 	zfsvfs->z_acl_inherit = newval;
607 }
608 
609 static void
610 acl_type_changed_cb(void *arg, uint64_t newval)
611 {
612 	zfsvfs_t *zfsvfs = arg;
613 
614 	zfsvfs->z_acl_type = newval;
615 }
616 
617 static int
618 zfs_register_callbacks(vfs_t *vfsp)
619 {
620 	struct dsl_dataset *ds = NULL;
621 	objset_t *os = NULL;
622 	zfsvfs_t *zfsvfs = NULL;
623 	uint64_t nbmand;
624 	boolean_t readonly = B_FALSE;
625 	boolean_t do_readonly = B_FALSE;
626 	boolean_t setuid = B_FALSE;
627 	boolean_t do_setuid = B_FALSE;
628 	boolean_t exec = B_FALSE;
629 	boolean_t do_exec = B_FALSE;
630 	boolean_t xattr = B_FALSE;
631 	boolean_t atime = B_FALSE;
632 	boolean_t do_atime = B_FALSE;
633 	boolean_t do_xattr = B_FALSE;
634 	int error = 0;
635 
636 	ASSERT3P(vfsp, !=, NULL);
637 	zfsvfs = vfsp->vfs_data;
638 	ASSERT3P(zfsvfs, !=, NULL);
639 	os = zfsvfs->z_os;
640 
641 	/*
642 	 * This function can be called for a snapshot when we update snapshot's
643 	 * mount point, which isn't really supported.
644 	 */
645 	if (dmu_objset_is_snapshot(os))
646 		return (EOPNOTSUPP);
647 
648 	/*
649 	 * The act of registering our callbacks will destroy any mount
650 	 * options we may have.  In order to enable temporary overrides
651 	 * of mount options, we stash away the current values and
652 	 * restore them after we register the callbacks.
653 	 */
654 	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
655 	    !spa_writeable(dmu_objset_spa(os))) {
656 		readonly = B_TRUE;
657 		do_readonly = B_TRUE;
658 	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
659 		readonly = B_FALSE;
660 		do_readonly = B_TRUE;
661 	}
662 	if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
663 		setuid = B_FALSE;
664 		do_setuid = B_TRUE;
665 	} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
666 		setuid = B_TRUE;
667 		do_setuid = B_TRUE;
668 	}
669 	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
670 		exec = B_FALSE;
671 		do_exec = B_TRUE;
672 	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
673 		exec = B_TRUE;
674 		do_exec = B_TRUE;
675 	}
676 	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
677 		zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF;
678 		do_xattr = B_TRUE;
679 	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
680 		zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
681 		do_xattr = B_TRUE;
682 	} else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) {
683 		zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
684 		do_xattr = B_TRUE;
685 	} else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) {
686 		zfsvfs->z_xattr = xattr = ZFS_XATTR_SA;
687 		do_xattr = B_TRUE;
688 	}
689 	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
690 		atime = B_FALSE;
691 		do_atime = B_TRUE;
692 	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
693 		atime = B_TRUE;
694 		do_atime = B_TRUE;
695 	}
696 
697 	/*
698 	 * We need to enter pool configuration here, so that we can use
699 	 * dsl_prop_get_int_ds() to handle the special nbmand property below.
700 	 * dsl_prop_get_integer() can not be used, because it has to acquire
701 	 * spa_namespace_lock and we can not do that because we already hold
702 	 * z_teardown_lock.  The problem is that spa_write_cachefile() is called
703 	 * with spa_namespace_lock held and the function calls ZFS vnode
704 	 * operations to write the cache file and thus z_teardown_lock is
705 	 * acquired after spa_namespace_lock.
706 	 */
707 	ds = dmu_objset_ds(os);
708 	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
709 
710 	/*
711 	 * nbmand is a special property.  It can only be changed at
712 	 * mount time.
713 	 *
714 	 * This is weird, but it is documented to only be changeable
715 	 * at mount time.
716 	 */
717 	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
718 		nbmand = B_FALSE;
719 	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
720 		nbmand = B_TRUE;
721 	} else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand)) != 0) {
722 		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
723 		return (error);
724 	}
725 
726 	/*
727 	 * Register property callbacks.
728 	 *
729 	 * It would probably be fine to just check for i/o error from
730 	 * the first prop_register(), but I guess I like to go
731 	 * overboard...
732 	 */
733 	error = dsl_prop_register(ds,
734 	    zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
735 	error = error ? error : dsl_prop_register(ds,
736 	    zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
737 	error = error ? error : dsl_prop_register(ds,
738 	    zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
739 	error = error ? error : dsl_prop_register(ds,
740 	    zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
741 	error = error ? error : dsl_prop_register(ds,
742 	    zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
743 	error = error ? error : dsl_prop_register(ds,
744 	    zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
745 	error = error ? error : dsl_prop_register(ds,
746 	    zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
747 	error = error ? error : dsl_prop_register(ds,
748 	    zfs_prop_to_name(ZFS_PROP_ACLTYPE), acl_type_changed_cb, zfsvfs);
749 	error = error ? error : dsl_prop_register(ds,
750 	    zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
751 	error = error ? error : dsl_prop_register(ds,
752 	    zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
753 	    zfsvfs);
754 	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
755 	if (error)
756 		goto unregister;
757 
758 	/*
759 	 * Invoke our callbacks to restore temporary mount options.
760 	 */
761 	if (do_readonly)
762 		readonly_changed_cb(zfsvfs, readonly);
763 	if (do_setuid)
764 		setuid_changed_cb(zfsvfs, setuid);
765 	if (do_exec)
766 		exec_changed_cb(zfsvfs, exec);
767 	if (do_xattr)
768 		xattr_changed_cb(zfsvfs, xattr);
769 	if (do_atime)
770 		atime_changed_cb(zfsvfs, atime);
771 
772 	nbmand_changed_cb(zfsvfs, nbmand);
773 
774 	return (0);
775 
776 unregister:
777 	dsl_prop_unregister_all(ds, zfsvfs);
778 	return (error);
779 }
780 
781 /*
782  * Associate this zfsvfs with the given objset, which must be owned.
783  * This will cache a bunch of on-disk state from the objset in the
784  * zfsvfs.
785  */
786 static int
787 zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
788 {
789 	int error;
790 	uint64_t val;
791 
792 	zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
793 	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
794 	zfsvfs->z_os = os;
795 
796 	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
797 	if (error != 0)
798 		return (error);
799 	if (zfsvfs->z_version >
800 	    zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
801 		(void) printf("Can't mount a version %lld file system "
802 		    "on a version %lld pool\n. Pool must be upgraded to mount "
803 		    "this file system.", (u_longlong_t)zfsvfs->z_version,
804 		    (u_longlong_t)spa_version(dmu_objset_spa(os)));
805 		return (SET_ERROR(ENOTSUP));
806 	}
807 	error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
808 	if (error != 0)
809 		return (error);
810 	zfsvfs->z_norm = (int)val;
811 
812 	error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
813 	if (error != 0)
814 		return (error);
815 	zfsvfs->z_utf8 = (val != 0);
816 
817 	error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
818 	if (error != 0)
819 		return (error);
820 	zfsvfs->z_case = (uint_t)val;
821 
822 	error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val);
823 	if (error != 0)
824 		return (error);
825 	zfsvfs->z_acl_type = (uint_t)val;
826 
827 	/*
828 	 * Fold case on file systems that are always or sometimes case
829 	 * insensitive.
830 	 */
831 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
832 	    zfsvfs->z_case == ZFS_CASE_MIXED)
833 		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
834 
835 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
836 	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
837 
838 	uint64_t sa_obj = 0;
839 	if (zfsvfs->z_use_sa) {
840 		/* should either have both of these objects or none */
841 		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
842 		    &sa_obj);
843 		if (error != 0)
844 			return (error);
845 
846 		error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val);
847 		if (error == 0 && val == ZFS_XATTR_SA)
848 			zfsvfs->z_xattr_sa = B_TRUE;
849 	}
850 
851 	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
852 	    &zfsvfs->z_attr_table);
853 	if (error != 0)
854 		return (error);
855 
856 	if (zfsvfs->z_version >= ZPL_VERSION_SA)
857 		sa_register_update_callback(os, zfs_sa_upgrade);
858 
859 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
860 	    &zfsvfs->z_root);
861 	if (error != 0)
862 		return (error);
863 	ASSERT3U(zfsvfs->z_root, !=, 0);
864 
865 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
866 	    &zfsvfs->z_unlinkedobj);
867 	if (error != 0)
868 		return (error);
869 
870 	error = zap_lookup(os, MASTER_NODE_OBJ,
871 	    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
872 	    8, 1, &zfsvfs->z_userquota_obj);
873 	if (error == ENOENT)
874 		zfsvfs->z_userquota_obj = 0;
875 	else if (error != 0)
876 		return (error);
877 
878 	error = zap_lookup(os, MASTER_NODE_OBJ,
879 	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
880 	    8, 1, &zfsvfs->z_groupquota_obj);
881 	if (error == ENOENT)
882 		zfsvfs->z_groupquota_obj = 0;
883 	else if (error != 0)
884 		return (error);
885 
886 	error = zap_lookup(os, MASTER_NODE_OBJ,
887 	    zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA],
888 	    8, 1, &zfsvfs->z_projectquota_obj);
889 	if (error == ENOENT)
890 		zfsvfs->z_projectquota_obj = 0;
891 	else if (error != 0)
892 		return (error);
893 
894 	error = zap_lookup(os, MASTER_NODE_OBJ,
895 	    zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA],
896 	    8, 1, &zfsvfs->z_userobjquota_obj);
897 	if (error == ENOENT)
898 		zfsvfs->z_userobjquota_obj = 0;
899 	else if (error != 0)
900 		return (error);
901 
902 	error = zap_lookup(os, MASTER_NODE_OBJ,
903 	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA],
904 	    8, 1, &zfsvfs->z_groupobjquota_obj);
905 	if (error == ENOENT)
906 		zfsvfs->z_groupobjquota_obj = 0;
907 	else if (error != 0)
908 		return (error);
909 
910 	error = zap_lookup(os, MASTER_NODE_OBJ,
911 	    zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA],
912 	    8, 1, &zfsvfs->z_projectobjquota_obj);
913 	if (error == ENOENT)
914 		zfsvfs->z_projectobjquota_obj = 0;
915 	else if (error != 0)
916 		return (error);
917 
918 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
919 	    &zfsvfs->z_fuid_obj);
920 	if (error == ENOENT)
921 		zfsvfs->z_fuid_obj = 0;
922 	else if (error != 0)
923 		return (error);
924 
925 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
926 	    &zfsvfs->z_shares_dir);
927 	if (error == ENOENT)
928 		zfsvfs->z_shares_dir = 0;
929 	else if (error != 0)
930 		return (error);
931 
932 	/*
933 	 * Only use the name cache if we are looking for a
934 	 * name on a file system that does not require normalization
935 	 * or case folding.  We can also look there if we happen to be
936 	 * on a non-normalizing, mixed sensitivity file system IF we
937 	 * are looking for the exact name (which is always the case on
938 	 * FreeBSD).
939 	 */
940 	zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
941 	    ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
942 	    !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
943 
944 	return (0);
945 }
946 
947 taskq_t *zfsvfs_taskq;
948 
949 static void
950 zfsvfs_task_unlinked_drain(void *context, int pending __unused)
951 {
952 
953 	zfs_unlinked_drain((zfsvfs_t *)context);
954 }
955 
956 int
957 zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp)
958 {
959 	objset_t *os;
960 	zfsvfs_t *zfsvfs;
961 	int error;
962 	boolean_t ro = (readonly || (strchr(osname, '@') != NULL));
963 
964 	/*
965 	 * XXX: Fix struct statfs so this isn't necessary!
966 	 *
967 	 * The 'osname' is used as the filesystem's special node, which means
968 	 * it must fit in statfs.f_mntfromname, or else it can't be
969 	 * enumerated, so libzfs_mnttab_find() returns NULL, which causes
970 	 * 'zfs unmount' to think it's not mounted when it is.
971 	 */
972 	if (strlen(osname) >= MNAMELEN)
973 		return (SET_ERROR(ENAMETOOLONG));
974 
975 	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
976 
977 	error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs,
978 	    &os);
979 	if (error != 0) {
980 		kmem_free(zfsvfs, sizeof (zfsvfs_t));
981 		return (error);
982 	}
983 
984 	error = zfsvfs_create_impl(zfvp, zfsvfs, os);
985 
986 	return (error);
987 }
988 
989 
990 int
991 zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
992 {
993 	int error;
994 
995 	zfsvfs->z_vfs = NULL;
996 	zfsvfs->z_parent = zfsvfs;
997 
998 	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
999 	mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
1000 	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
1001 	    offsetof(znode_t, z_link_node));
1002 	TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0,
1003 	    zfsvfs_task_unlinked_drain, zfsvfs);
1004 	ZFS_TEARDOWN_INIT(zfsvfs);
1005 	ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs);
1006 	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
1007 	for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1008 		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
1009 
1010 	error = zfsvfs_init(zfsvfs, os);
1011 	if (error != 0) {
1012 		dmu_objset_disown(os, B_TRUE, zfsvfs);
1013 		*zfvp = NULL;
1014 		kmem_free(zfsvfs, sizeof (zfsvfs_t));
1015 		return (error);
1016 	}
1017 
1018 	*zfvp = zfsvfs;
1019 	return (0);
1020 }
1021 
/*
 * Bring a zfsvfs into service: register the property callbacks, open the
 * ZIL and publish the zfsvfs as the objset's user pointer.
 *
 * When 'mounting' is B_TRUE the dataset kstats are created as well, the
 * unlinked set is drained (only when the mount is not read-only) and the
 * intent log is replayed, or destroyed if zil_replay_disable is set.
 * When 'mounting' is B_FALSE only the ZIL is reopened; the kstats are
 * asserted to exist already.
 *
 * Returns 0 on success, EROFS for a writable mount of an objset with an
 * incompatible encryption version, or the error reported by
 * zfs_register_callbacks() / dataset_kstats_create().
 */
static int
zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
{
	int error;

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
	    dmu_objset_incompatible_encryption_version(zfsvfs->z_os))
		return (SET_ERROR(EROFS));

	error = zfs_register_callbacks(zfsvfs->z_vfs);
	if (error)
		return (error);

	/*
	 * If we are not mounting (ie: online recv), then we don't
	 * have to worry about replaying the log as we blocked all
	 * operations out since we closed the ZIL.
	 */
	if (mounting) {
		boolean_t readonly;

		ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
		error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
		/*
		 * NOTE(review): the callbacks registered above are not
		 * unregistered on this error return - confirm the callers
		 * clean them up.
		 */
		if (error)
			return (error);
		zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data,
		    &zfsvfs->z_kstat.dk_zil_sums);

		/*
		 * During replay we remove the read only flag to
		 * allow replays to succeed.
		 */
		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
		if (readonly != 0) {
			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
		} else {
			dsl_dir_t *dd;
			zap_stats_t zs;

			/* Seed the nunlinks kstat from the unlinked set. */
			if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
			    &zs) == 0) {
				dataset_kstats_update_nunlinks_kstat(
				    &zfsvfs->z_kstat, zs.zs_num_entries);
				dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
				    "num_entries in unlinked set: %llu",
				    (u_longlong_t)zs.zs_num_entries);
			}

			zfs_unlinked_drain(zfsvfs);
			dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
			dd->dd_activity_cancelled = B_FALSE;
		}

		/*
		 * Parse and replay the intent log.
		 *
		 * Because of ziltest, this must be done after
		 * zfs_unlinked_drain().  (Further note: ziltest
		 * doesn't use readonly mounts, where
		 * zfs_unlinked_drain() isn't called.)  This is because
		 * ziltest causes spa_sync() to think it's committed,
		 * but actually it is not, so the intent log contains
		 * many txg's worth of changes.
		 *
		 * In particular, if object N is in the unlinked set in
		 * the last txg to actually sync, then it could be
		 * actually freed in a later txg and then reallocated
		 * in a yet later txg.  This would write a "create
		 * object N" record to the intent log.  Normally, this
		 * would be fine because the spa_sync() would have
		 * written out the fact that object N is free, before
		 * we could write the "create object N" intent log
		 * record.
		 *
		 * But when we are in ziltest mode, we advance the "open
		 * txg" without actually spa_sync()-ing the changes to
		 * disk.  So we would see that object N is still
		 * allocated and in the unlinked set, and there is an
		 * intent log record saying to allocate it.
		 */
		if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
			if (zil_replay_disable) {
				zil_destroy(zfsvfs->z_log, B_FALSE);
			} else {
				/*
				 * The name cache is switched off for the
				 * duration of the replay and the previous
				 * setting is restored afterwards.
				 */
				boolean_t use_nc = zfsvfs->z_use_namecache;
				zfsvfs->z_use_namecache = B_FALSE;
				zfsvfs->z_replay = B_TRUE;
				zil_replay(zfsvfs->z_os, zfsvfs,
				    zfs_replay_vector);
				zfsvfs->z_replay = B_FALSE;
				zfsvfs->z_use_namecache = use_nc;
			}
		}

		/* restore readonly bit */
		if (readonly != 0)
			zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
	} else {
		ASSERT3P(zfsvfs->z_kstat.dk_kstats, !=, NULL);
		zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data,
		    &zfsvfs->z_kstat.dk_zil_sums);
	}

	/*
	 * Set the objset user_ptr to track its zfsvfs.
	 */
	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);

	return (0);
}
1138 
1139 void
1140 zfsvfs_free(zfsvfs_t *zfsvfs)
1141 {
1142 	int i;
1143 
1144 	zfs_fuid_destroy(zfsvfs);
1145 
1146 	mutex_destroy(&zfsvfs->z_znodes_lock);
1147 	mutex_destroy(&zfsvfs->z_lock);
1148 	list_destroy(&zfsvfs->z_all_znodes);
1149 	ZFS_TEARDOWN_DESTROY(zfsvfs);
1150 	ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs);
1151 	rw_destroy(&zfsvfs->z_fuid_lock);
1152 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1153 		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1154 	dataset_kstats_destroy(&zfsvfs->z_kstat);
1155 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
1156 }
1157 
1158 static void
1159 zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
1160 {
1161 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
1162 	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
1163 }
1164 
1165 static int
1166 zfs_domount(vfs_t *vfsp, char *osname)
1167 {
1168 	uint64_t recordsize, fsid_guid;
1169 	int error = 0;
1170 	zfsvfs_t *zfsvfs;
1171 
1172 	ASSERT3P(vfsp, !=, NULL);
1173 	ASSERT3P(osname, !=, NULL);
1174 
1175 	error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs);
1176 	if (error)
1177 		return (error);
1178 	zfsvfs->z_vfs = vfsp;
1179 
1180 	if ((error = dsl_prop_get_integer(osname,
1181 	    "recordsize", &recordsize, NULL)))
1182 		goto out;
1183 	zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
1184 	zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;
1185 
1186 	vfsp->vfs_data = zfsvfs;
1187 	vfsp->mnt_flag |= MNT_LOCAL;
1188 	vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
1189 	vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
1190 	vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
1191 	/*
1192 	 * This can cause a loss of coherence between ARC and page cache
1193 	 * on ZoF - unclear if the problem is in FreeBSD or ZoF
1194 	 */
1195 	vfsp->mnt_kern_flag |= MNTK_NO_IOPF;	/* vn_io_fault can be used */
1196 	vfsp->mnt_kern_flag |= MNTK_NOMSYNC;
1197 	vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG;
1198 
1199 #if defined(_KERNEL) && !defined(KMEM_DEBUG)
1200 	vfsp->mnt_kern_flag |= MNTK_FPLOOKUP;
1201 #endif
1202 	/*
1203 	 * The fsid is 64 bits, composed of an 8-bit fs type, which
1204 	 * separates our fsid from any other filesystem types, and a
1205 	 * 56-bit objset unique ID.  The objset unique ID is unique to
1206 	 * all objsets open on this system, provided by unique_create().
1207 	 * The 8-bit fs type must be put in the low bits of fsid[1]
1208 	 * because that's where other Solaris filesystems put it.
1209 	 */
1210 	fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
1211 	ASSERT3U((fsid_guid & ~((1ULL << 56) - 1)), ==, 0);
1212 	vfsp->vfs_fsid.val[0] = fsid_guid;
1213 	vfsp->vfs_fsid.val[1] = ((fsid_guid >> 32) << 8) |
1214 	    (vfsp->mnt_vfc->vfc_typenum & 0xFF);
1215 
1216 	/*
1217 	 * Set features for file system.
1218 	 */
1219 	zfs_set_fuid_feature(zfsvfs);
1220 
1221 	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
1222 		uint64_t pval;
1223 
1224 		atime_changed_cb(zfsvfs, B_FALSE);
1225 		readonly_changed_cb(zfsvfs, B_TRUE);
1226 		if ((error = dsl_prop_get_integer(osname,
1227 		    "xattr", &pval, NULL)))
1228 			goto out;
1229 		xattr_changed_cb(zfsvfs, pval);
1230 		if ((error = dsl_prop_get_integer(osname,
1231 		    "acltype", &pval, NULL)))
1232 			goto out;
1233 		acl_type_changed_cb(zfsvfs, pval);
1234 		zfsvfs->z_issnap = B_TRUE;
1235 		zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
1236 
1237 		mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1238 		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1239 		mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1240 	} else {
1241 		if ((error = zfsvfs_setup(zfsvfs, B_TRUE)))
1242 			goto out;
1243 	}
1244 
1245 	vfs_mountedfrom(vfsp, osname);
1246 
1247 	if (!zfsvfs->z_issnap)
1248 		zfsctl_create(zfsvfs);
1249 out:
1250 	if (error) {
1251 		dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
1252 		zfsvfs_free(zfsvfs);
1253 	} else {
1254 		atomic_inc_32(&zfs_active_fs_count);
1255 	}
1256 
1257 	return (error);
1258 }
1259 
1260 static void
1261 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
1262 {
1263 	objset_t *os = zfsvfs->z_os;
1264 
1265 	if (!dmu_objset_is_snapshot(os))
1266 		dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
1267 }
1268 
/*
 * Copy the pool component of the dataset name 'osname' (everything before
 * the first '/', or the whole name if there is no '/') into 'poolname' as a
 * NUL-terminated string.
 *
 * Returns 0 on success, or ENAMETOOLONG if the pool component is
 * MAXNAMELEN characters or longer; 'poolname' is not written in that case.
 */
static int
getpoolname(const char *osname, char *poolname)
{
	const char *slash = strchr(osname, '/');
	size_t poollen;

	if (slash != NULL)
		poollen = (size_t)(slash - osname);
	else
		poollen = strlen(osname);

	if (poollen >= MAXNAMELEN)
		return (ENAMETOOLONG);

	memcpy(poolname, osname, poollen);
	poolname[poollen] = '\0';
	return (0);
}
1286 
/*
 * Detect and strip the '!' prefix of a dataset name.
 *
 * *checkpointrewind is set to true when 'name' starts with '!', false
 * otherwise.  When the prefix is present it is removed in place, so 'name'
 * must be writable.
 */
static void
fetch_osname_options(char *name, bool *checkpointrewind)
{
	bool has_prefix = (name[0] == '!');

	*checkpointrewind = has_prefix;
	if (!has_prefix)
		return;

	/*
	 * strlen(name) bytes starting at name + 1 cover the rest of the
	 * string including its terminating NUL.
	 */
	memmove(name, name + 1, strlen(name));
}
1298 
/*
 * Mount entry point: mount, or remount, the dataset named by the "from"
 * mount option on 'vfsp'.
 *
 * The function checks mount privilege (falling back to delegated "mount"
 * permission and ownership of the mount point), refuses datasets that are
 * not visible/writable from the caller's zone, handles remounts by
 * re-registering the property callbacks, imports the root pool for an
 * initial root mount and finally calls zfs_domount().
 *
 * Returns 0 on success or an errno value.
 */
static int
zfs_mount(vfs_t *vfsp)
{
	kthread_t	*td = curthread;
	vnode_t		*mvp = vfsp->mnt_vnodecovered;
	cred_t		*cr = td->td_ucred;
	char		*osname;
	int		error = 0;
	int		canwrite;
	bool		checkpointrewind, isctlsnap = false;

	if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
		return (SET_ERROR(EINVAL));

	/*
	 * If full-owner-access is enabled and delegated administration is
	 * turned on, we must set nosuid.
	 *
	 * NOTE(review): osname has not been passed through
	 * fetch_osname_options() yet, so a leading '!' is still part of the
	 * name handed to dsl_deleg_access() here - confirm this is intended.
	 */
	if (zfs_super_owner &&
	    dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
		secpolicy_fs_mount_clearopts(cr, vfsp);
	}

	fetch_osname_options(osname, &checkpointrewind);
	/* A snapshot ('@' in the name) being mounted on a .zfs control node. */
	isctlsnap = (mvp != NULL && zfsctl_is_node(mvp) &&
	    strchr(osname, '@') != NULL);

	/*
	 * Check for mount privilege?
	 *
	 * If we don't have privilege then see if
	 * we have local permission to allow it
	 */
	error = secpolicy_fs_mount(cr, mvp, vfsp);
	if (error && isctlsnap) {
		secpolicy_fs_mount_clearopts(cr, vfsp);
	} else if (error) {
		/*
		 * Every "goto out" in this branch returns the error from
		 * secpolicy_fs_mount() above.
		 */
		if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
			goto out;

		if (!(vfsp->vfs_flag & MS_REMOUNT)) {
			vattr_t		vattr;

			/*
			 * Make sure user is the owner of the mount point
			 * or has sufficient privileges.
			 */

			vattr.va_mask = AT_UID;

			vn_lock(mvp, LK_SHARED | LK_RETRY);
			if (VOP_GETATTR(mvp, &vattr, cr)) {
				VOP_UNLOCK(mvp);
				goto out;
			}

			if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
			    VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
				VOP_UNLOCK(mvp);
				goto out;
			}
			VOP_UNLOCK(mvp);
		}

		secpolicy_fs_mount_clearopts(cr, vfsp);
	}

	/*
	 * Refuse to mount a filesystem if we are in a local zone and the
	 * dataset is not visible.
	 */
	if (!INGLOBALZONE(curproc) &&
	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
		boolean_t mount_snapshot = B_FALSE;

		/*
		 * Snapshots may be mounted in .zfs for unjailed datasets
		 * if allowed by the jail param zfs.mount_snapshot.
		 */
		if (isctlsnap) {
			struct prison *pr;
			struct zfs_jailparam *zjp;

			pr = curthread->td_ucred->cr_prison;
			mtx_lock(&pr->pr_mtx);
			zjp = osd_jail_get(pr, zfs_jailparam_slot);
			mtx_unlock(&pr->pr_mtx);
			if (zjp && zjp->mount_snapshot)
				mount_snapshot = B_TRUE;
		}
		if (!mount_snapshot) {
			error = SET_ERROR(EPERM);
			goto out;
		}
	}

	vfsp->vfs_flag |= MNT_NFS4ACLS;

	/*
	 * When doing a remount, we simply refresh our temporary properties
	 * according to those options set in the current VFS options.
	 */
	if (vfsp->vfs_flag & MS_REMOUNT) {
		zfsvfs_t *zfsvfs = vfsp->vfs_data;

		/*
		 * Refresh mount options with z_teardown_lock blocking I/O while
		 * the filesystem is in an inconsistent state.
		 * The lock also serializes this code with filesystem
		 * manipulations between entry to zfs_suspend_fs() and return
		 * from zfs_resume_fs().
		 */
		ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
		zfs_unregister_callbacks(zfsvfs);
		error = zfs_register_callbacks(vfsp);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
		goto out;
	}

	/* Initial root mount: try hard to import the requested root pool. */
	if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
	    (vfsp->vfs_flag & MNT_UPDATE) == 0) {
		char pname[MAXNAMELEN];

		error = getpoolname(osname, pname);
		if (error == 0)
			error = spa_import_rootpool(pname, checkpointrewind);
		if (error)
			goto out;
	}
	/* zfs_domount() runs with Giant dropped. */
	DROP_GIANT();
	error = zfs_domount(vfsp, osname);
	PICKUP_GIANT();

out:
	return (error);
}
1436 
1437 static int
1438 zfs_statfs(vfs_t *vfsp, struct statfs *statp)
1439 {
1440 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1441 	uint64_t refdbytes, availbytes, usedobjs, availobjs;
1442 	int error;
1443 
1444 	statp->f_version = STATFS_VERSION;
1445 
1446 	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
1447 		return (error);
1448 
1449 	dmu_objset_space(zfsvfs->z_os,
1450 	    &refdbytes, &availbytes, &usedobjs, &availobjs);
1451 
1452 	/*
1453 	 * The underlying storage pool actually uses multiple block sizes.
1454 	 * We report the fragsize as the smallest block size we support,
1455 	 * and we report our blocksize as the filesystem's maximum blocksize.
1456 	 */
1457 	statp->f_bsize = SPA_MINBLOCKSIZE;
1458 	statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
1459 
1460 	/*
1461 	 * The following report "total" blocks of various kinds in the
1462 	 * file system, but reported in terms of f_frsize - the
1463 	 * "fragment" size.
1464 	 */
1465 
1466 	statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
1467 	statp->f_bfree = availbytes / statp->f_bsize;
1468 	statp->f_bavail = statp->f_bfree; /* no root reservation */
1469 
1470 	/*
1471 	 * statvfs() should really be called statufs(), because it assumes
1472 	 * static metadata.  ZFS doesn't preallocate files, so the best
1473 	 * we can do is report the max that could possibly fit in f_files,
1474 	 * and that minus the number actually used in f_ffree.
1475 	 * For f_ffree, report the smaller of the number of object available
1476 	 * and the number of blocks (each object will take at least a block).
1477 	 */
1478 	statp->f_ffree = MIN(availobjs, statp->f_bfree);
1479 	statp->f_files = statp->f_ffree + usedobjs;
1480 
1481 	/*
1482 	 * We're a zfs filesystem.
1483 	 */
1484 	strlcpy(statp->f_fstypename, "zfs",
1485 	    sizeof (statp->f_fstypename));
1486 
1487 	strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
1488 	    sizeof (statp->f_mntfromname));
1489 	strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
1490 	    sizeof (statp->f_mntonname));
1491 
1492 	statp->f_namemax = MAXNAMELEN - 1;
1493 
1494 	zfs_exit(zfsvfs, FTAG);
1495 	return (0);
1496 }
1497 
1498 static int
1499 zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
1500 {
1501 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1502 	znode_t *rootzp;
1503 	int error;
1504 
1505 	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
1506 		return (error);
1507 
1508 	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
1509 	if (error == 0)
1510 		*vpp = ZTOV(rootzp);
1511 
1512 	zfs_exit(zfsvfs, FTAG);
1513 
1514 	if (error == 0) {
1515 		error = vn_lock(*vpp, flags);
1516 		if (error != 0) {
1517 			VN_RELE(*vpp);
1518 			*vpp = NULL;
1519 		}
1520 	}
1521 	return (error);
1522 }
1523 
/*
 * Teardown the zfsvfs::z_os.
 *
 * Waits for outstanding zrele tasks, closes the ZIL, releases the dbuf
 * holds of all znodes, unregisters the property callbacks and evicts the
 * cached data of the objset.
 *
 * Note, if 'unmounting' is FALSE and 0 is returned, we return with the
 * 'z_teardown_lock' and 'z_teardown_inactive_lock' held.  If 'unmounting'
 * is FALSE and the filesystem was already unmounted or has no objset, both
 * locks are dropped again and EIO is returned.  If 'unmounting' is TRUE
 * both locks are dropped before returning and z_unmounted is set.
 */
static int
zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
{
	znode_t	*zp;
	dsl_dir_t *dd;

	/*
	 * If someone has not already unmounted this file system,
	 * drain the zrele_taskq to ensure all active references to the
	 * zfsvfs_t have been handled; only then can it be safely destroyed.
	 */
	if (zfsvfs->z_os) {
		/*
		 * If we're unmounting we have to wait for the list to
		 * drain completely.
		 *
		 * If we're not unmounting there's no guarantee the list
		 * will drain completely, but zreles run from the taskq
		 * may add the parents of dir-based xattrs to the taskq
		 * so we want to wait for these.
		 *
		 * We can safely check z_all_znodes for being empty because the
		 * VFS has already blocked operations which add to it.
		 */
		int round = 0;
		while (!list_is_empty(&zfsvfs->z_all_znodes)) {
			taskq_wait_outstanding(dsl_pool_zrele_taskq(
			    dmu_objset_pool(zfsvfs->z_os)), 0);
			/* When not unmounting, give up after two rounds. */
			if (++round > 1 && !unmounting)
				break;
		}
	}
	ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);

	if (!unmounting) {
		/*
		 * We purge the parent filesystem's vfsp as the parent
		 * filesystem and all of its snapshots have their vnode's
		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
		 * 'z_parent' is self referential for non-snapshots.
		 */
#ifdef FREEBSD_NAMECACHE
		cache_purgevfs(zfsvfs->z_parent->z_vfs);
#endif
	}

	/*
	 * Close the zil. NB: Can't close the zil while zfs_inactive
	 * threads are blocked as zil_close can call zfs_inactive.
	 */
	if (zfsvfs->z_log) {
		zil_close(zfsvfs->z_log);
		zfsvfs->z_log = NULL;
	}

	ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs);

	/*
	 * If we are not unmounting (ie: online recv) and someone already
	 * unmounted this file system while we were doing the switcheroo,
	 * or a reopen of z_os failed then just bail out now.
	 */
	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
		ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
		return (SET_ERROR(EIO));
	}

	/*
	 * At this point there are no vops active, and any new vops will
	 * fail with EIO since we have z_teardown_lock for writer (only
	 * relevant for forced unmount).
	 *
	 * Release all holds on dbufs.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
		if (zp->z_sa_hdl != NULL) {
			zfs_znode_dmu_fini(zp);
		}
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

	/*
	 * If we are unmounting, set the unmounted flag and let new vops
	 * unblock.  zfs_inactive will have the unmounted behavior, and all
	 * other vops will fail with EIO.
	 */
	if (unmounting) {
		zfsvfs->z_unmounted = B_TRUE;
		ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
	}

	/*
	 * z_os will be NULL if there was an error in attempting to reopen
	 * zfsvfs, so just return as the properties had already been
	 * unregistered and cached data had been evicted before.
	 */
	if (zfsvfs->z_os == NULL)
		return (0);

	/*
	 * Unregister properties.
	 */
	zfs_unregister_callbacks(zfsvfs);

	/*
	 * Evict cached data. We must write out any dirty data before
	 * disowning the dataset.
	 */
	objset_t *os = zfsvfs->z_os;
	boolean_t os_dirty = B_FALSE;
	/* The objset counts as dirty if it is dirty in any of the txgs. */
	for (int t = 0; t < TXG_SIZE; t++) {
		if (dmu_objset_is_dirty(os, t)) {
			os_dirty = B_TRUE;
			break;
		}
	}
	if (!zfs_is_readonly(zfsvfs) && os_dirty)
		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
	dmu_objset_evict_dbufs(zfsvfs->z_os);
	dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
	dsl_dir_cancel_waiters(dd);

	return (0);
}
1658 
/*
 * Unmount entry point.
 *
 * Checks unmount privilege (falling back to delegated "mount" permission),
 * unmounts snapshots mounted under .zfs, flushes the vnodes, cancels or
 * drains the unlinked-drain task, tears the filesystem down, disowns the
 * objset and frees the zfsvfs.
 *
 * 'fflag' may contain MS_FORCE for a forced unmount.  Returns 0 on success
 * or the error from the privilege check, zfsctl_umount_snapshots() or
 * vflush().
 */
static int
zfs_umount(vfs_t *vfsp, int fflag)
{
	kthread_t *td = curthread;
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	objset_t *os;
	cred_t *cr = td->td_ucred;
	int ret;

	ret = secpolicy_fs_unmount(cr, vfsp);
	if (ret) {
		/* No privilege: delegated permission may still allow it. */
		if (dsl_deleg_access((char *)vfsp->vfs_resource,
		    ZFS_DELEG_PERM_MOUNT, cr))
			return (ret);
	}

	/*
	 * Unmount any snapshots mounted under .zfs before unmounting the
	 * dataset itself.
	 */
	if (zfsvfs->z_ctldir != NULL) {
		if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
			return (ret);
	}

	if (fflag & MS_FORCE) {
		/*
		 * Mark file system as unmounted before calling
		 * vflush(FORCECLOSE). This way we ensure no future vnops
		 * will be called and risk operating on DOOMED vnodes.
		 */
		ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
		zfsvfs->z_unmounted = B_TRUE;
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
	}

	/*
	 * Flush all the files.
	 */
	ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
	if (ret != 0)
		return (ret);
	/*
	 * Cancel the unlinked-drain task; whenever the cancel reports
	 * failure, drain the task and try again.
	 */
	while (taskqueue_cancel(zfsvfs_taskq->tq_queue,
	    &zfsvfs->z_unlinked_drain_task, NULL) != 0)
		taskqueue_drain(zfsvfs_taskq->tq_queue,
		    &zfsvfs->z_unlinked_drain_task);

	VERIFY0(zfsvfs_teardown(zfsvfs, B_TRUE));
	os = zfsvfs->z_os;

	/*
	 * z_os will be NULL if there was an error in
	 * attempting to reopen zfsvfs.
	 */
	if (os != NULL) {
		/*
		 * Unset the objset user_ptr.
		 */
		mutex_enter(&os->os_user_ptr_lock);
		dmu_objset_set_user(os, NULL);
		mutex_exit(&os->os_user_ptr_lock);

		/*
		 * Finally release the objset
		 */
		dmu_objset_disown(os, B_TRUE, zfsvfs);
	}

	/*
	 * We can now safely destroy the '.zfs' directory node.
	 */
	if (zfsvfs->z_ctldir != NULL)
		zfsctl_destroy(zfsvfs);
	zfs_freevfs(vfsp);

	return (0);
}
1736 
1737 static int
1738 zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
1739 {
1740 	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
1741 	znode_t		*zp;
1742 	int 		err;
1743 
1744 	/*
1745 	 * zfs_zget() can't operate on virtual entries like .zfs/ or
1746 	 * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
1747 	 * This will make NFS to switch to LOOKUP instead of using VGET.
1748 	 */
1749 	if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
1750 	    (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
1751 		return (EOPNOTSUPP);
1752 
1753 	if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
1754 		return (err);
1755 	err = zfs_zget(zfsvfs, ino, &zp);
1756 	if (err == 0 && zp->z_unlinked) {
1757 		vrele(ZTOV(zp));
1758 		err = EINVAL;
1759 	}
1760 	if (err == 0)
1761 		*vpp = ZTOV(zp);
1762 	zfs_exit(zfsvfs, FTAG);
1763 	if (err == 0) {
1764 		err = vn_lock(*vpp, flags);
1765 		if (err != 0)
1766 			vrele(*vpp);
1767 	}
1768 	if (err != 0)
1769 		*vpp = NULL;
1770 	return (err);
1771 }
1772 
1773 static int
1774 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
1775     struct ucred **credanonp, int *numsecflavors, int *secflavors)
1776 {
1777 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1778 
1779 	/*
1780 	 * If this is regular file system vfsp is the same as
1781 	 * zfsvfs->z_parent->z_vfs, but if it is snapshot,
1782 	 * zfsvfs->z_parent->z_vfs represents parent file system
1783 	 * which we have to use here, because only this file system
1784 	 * has mnt_export configured.
1785 	 */
1786 	return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
1787 	    credanonp, numsecflavors, secflavors));
1788 }
1789 
1790 _Static_assert(sizeof (struct fid) >= SHORT_FID_LEN,
1791 	"struct fid bigger than SHORT_FID_LEN");
1792 _Static_assert(sizeof (struct fid) >= LONG_FID_LEN,
1793 	"struct fid bigger than LONG_FID_LEN");
1794 
1795 static int
1796 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
1797 {
1798 	struct componentname cn;
1799 	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
1800 	znode_t		*zp;
1801 	vnode_t		*dvp;
1802 	uint64_t	object = 0;
1803 	uint64_t	fid_gen = 0;
1804 	uint64_t	setgen = 0;
1805 	uint64_t	gen_mask;
1806 	uint64_t	zp_gen;
1807 	int 		i, err;
1808 
1809 	*vpp = NULL;
1810 
1811 	if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
1812 		return (err);
1813 
1814 	/*
1815 	 * On FreeBSD we can get snapshot's mount point or its parent file
1816 	 * system mount point depending if snapshot is already mounted or not.
1817 	 */
1818 	if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
1819 		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
1820 		uint64_t	objsetid = 0;
1821 
1822 		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
1823 			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
1824 
1825 		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
1826 			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
1827 
1828 		zfs_exit(zfsvfs, FTAG);
1829 
1830 		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
1831 		if (err)
1832 			return (SET_ERROR(EINVAL));
1833 		if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
1834 			return (err);
1835 	}
1836 
1837 	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
1838 		zfid_short_t	*zfid = (zfid_short_t *)fidp;
1839 
1840 		for (i = 0; i < sizeof (zfid->zf_object); i++)
1841 			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
1842 
1843 		for (i = 0; i < sizeof (zfid->zf_gen); i++)
1844 			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
1845 	} else {
1846 		zfs_exit(zfsvfs, FTAG);
1847 		return (SET_ERROR(EINVAL));
1848 	}
1849 
1850 	if (fidp->fid_len == LONG_FID_LEN && setgen != 0) {
1851 		zfs_exit(zfsvfs, FTAG);
1852 		dprintf("snapdir fid: fid_gen (%llu) and setgen (%llu)\n",
1853 		    (u_longlong_t)fid_gen, (u_longlong_t)setgen);
1854 		return (SET_ERROR(EINVAL));
1855 	}
1856 
1857 	/*
1858 	 * A zero fid_gen means we are in .zfs or the .zfs/snapshot
1859 	 * directory tree. If the object == zfsvfs->z_shares_dir, then
1860 	 * we are in the .zfs/shares directory tree.
1861 	 */
1862 	if ((fid_gen == 0 &&
1863 	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
1864 	    (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
1865 		zfs_exit(zfsvfs, FTAG);
1866 		VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
1867 		if (object == ZFSCTL_INO_SNAPDIR) {
1868 			cn.cn_nameptr = "snapshot";
1869 			cn.cn_namelen = strlen(cn.cn_nameptr);
1870 			cn.cn_nameiop = LOOKUP;
1871 			cn.cn_flags = ISLASTCN | LOCKLEAF;
1872 			cn.cn_lkflags = flags;
1873 			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
1874 			vput(dvp);
1875 		} else if (object == zfsvfs->z_shares_dir) {
1876 			/*
1877 			 * XXX This branch must not be taken,
1878 			 * if it is, then the lookup below will
1879 			 * explode.
1880 			 */
1881 			cn.cn_nameptr = "shares";
1882 			cn.cn_namelen = strlen(cn.cn_nameptr);
1883 			cn.cn_nameiop = LOOKUP;
1884 			cn.cn_flags = ISLASTCN;
1885 			cn.cn_lkflags = flags;
1886 			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
1887 			vput(dvp);
1888 		} else {
1889 			*vpp = dvp;
1890 		}
1891 		return (err);
1892 	}
1893 
1894 	gen_mask = -1ULL >> (64 - 8 * i);
1895 
1896 	dprintf("getting %llu [%llu mask %llx]\n", (u_longlong_t)object,
1897 	    (u_longlong_t)fid_gen,
1898 	    (u_longlong_t)gen_mask);
1899 	if ((err = zfs_zget(zfsvfs, object, &zp))) {
1900 		zfs_exit(zfsvfs, FTAG);
1901 		return (err);
1902 	}
1903 	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
1904 	    sizeof (uint64_t));
1905 	zp_gen = zp_gen & gen_mask;
1906 	if (zp_gen == 0)
1907 		zp_gen = 1;
1908 	if (zp->z_unlinked || zp_gen != fid_gen) {
1909 		dprintf("znode gen (%llu) != fid gen (%llu)\n",
1910 		    (u_longlong_t)zp_gen, (u_longlong_t)fid_gen);
1911 		vrele(ZTOV(zp));
1912 		zfs_exit(zfsvfs, FTAG);
1913 		return (SET_ERROR(EINVAL));
1914 	}
1915 
1916 	*vpp = ZTOV(zp);
1917 	zfs_exit(zfsvfs, FTAG);
1918 	err = vn_lock(*vpp, flags);
1919 	if (err == 0)
1920 		vnode_create_vobject(*vpp, zp->z_size, curthread);
1921 	else
1922 		*vpp = NULL;
1923 	return (err);
1924 }
1925 
1926 /*
1927  * Block out VOPs and close zfsvfs_t::z_os
1928  *
1929  * Note, if successful, then we return with the 'z_teardown_lock' and
1930  * 'z_teardown_inactive_lock' write held.  We leave ownership of the underlying
1931  * dataset and objset intact so that they can be atomically handed off during
1932  * a subsequent rollback or recv operation and the resume thereafter.
1933  */
1934 int
1935 zfs_suspend_fs(zfsvfs_t *zfsvfs)
1936 {
1937 	int error;
1938 
1939 	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
1940 		return (error);
1941 
1942 	return (0);
1943 }
1944 
/*
 * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
 * is an invariant across any of the operations that can be performed while the
 * filesystem was suspended.  Whether it succeeded or failed, the preconditions
 * are the same: the relevant objset and associated dataset are owned by
 * zfsvfs, held, and long held on entry.
 *
 * Must be called with both teardown locks held for writing; both are
 * released before returning.  Returns 0 on success.  If zfsvfs_init()
 * fails, a forced unmount of the filesystem is attempted and the error is
 * returned.
 */
int
zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
{
	int err;
	znode_t *zp;

	ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
	ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));

	/*
	 * We already own this, so just update the objset_t, as the one we
	 * had before may have been evicted.
	 */
	objset_t *os;
	VERIFY3P(ds->ds_owner, ==, zfsvfs);
	VERIFY(dsl_dataset_long_held(ds));
	dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
	dsl_pool_config_enter(dp, FTAG);
	VERIFY0(dmu_objset_from_ds(ds, &os));
	dsl_pool_config_exit(dp, FTAG);

	err = zfsvfs_init(zfsvfs, os);
	if (err != 0)
		goto bail;

	ds->ds_dir->dd_activity_cancelled = B_FALSE;
	/* B_FALSE: not mounting, so the intent log is not replayed. */
	VERIFY0(zfsvfs_setup(zfsvfs, B_FALSE));

	zfs_set_fuid_feature(zfsvfs);

	/*
	 * Attempt to re-establish all the active znodes with
	 * their dbufs.  If a zfs_rezget() fails, then we'll let
	 * any potential callers discover that via zfs_enter_verify_zp
	 * when they try to use their znode.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp;
	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
		(void) zfs_rezget(zp);
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

bail:
	/* release the VOPs */
	ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
	ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);

	if (err) {
		/*
		 * Since we couldn't setup the sa framework, try to force
		 * unmount this file system.
		 */
		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
			vfs_ref(zfsvfs->z_vfs);
			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
		}
	}
	return (err);
}
2012 
2013 static void
2014 zfs_freevfs(vfs_t *vfsp)
2015 {
2016 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
2017 
2018 	zfsvfs_free(zfsvfs);
2019 
2020 	atomic_dec_32(&zfs_active_fs_count);
2021 }
2022 
#ifdef __i386__
/*
 * Value of desiredvnodes saved by zfs_vnodes_adjust() and restored by
 * zfs_vnodes_adjust_back().
 */
static int desiredvnodes_backup;
#include <sys/vmmeter.h>


#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#endif
2033 
/*
 * On i386, remember the current desiredvnodes and reduce it to three
 * quarters of its value if it still has the computed default.  Does nothing
 * on other architectures.
 */
static void
zfs_vnodes_adjust(void)
{
#ifdef __i386__
	int newdesiredvnodes;

	desiredvnodes_backup = desiredvnodes;

	/*
	 * Recompute the default the way vntblinit() does.  If the result
	 * differs from desiredvnodes, the administrator has tuned the value
	 * and it is left alone.
	 */
	newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 *
	    vm_kmem_size / (5 * (sizeof (struct vm_object) +
	    sizeof (struct vnode))));
	if (newdesiredvnodes != desiredvnodes)
		return;

	desiredvnodes = (3 * newdesiredvnodes) / 4;
#endif
}
2054 
/*
 * On i386, restore desiredvnodes to the value saved in
 * desiredvnodes_backup.  Does nothing on other architectures.
 */
static void
zfs_vnodes_adjust_back(void)
{
#ifdef __i386__
	desiredvnodes = desiredvnodes_backup;
#endif
}
2063 
/* Held exclusively by zfs_prune_task() around vnlru_free_vfsops(). */
static struct sx zfs_vnlru_lock;
/* Marker vnode from vnlru_alloc_marker(), passed to vnlru_free_vfsops(). */
static struct vnode *zfs_vnlru_marker;
/* Handle returned by arc_add_prune_callback(); removed again in zfs_fini(). */
static arc_prune_t *zfs_prune;
2067 
2068 static void
2069 zfs_prune_task(uint64_t nr_to_scan, void *arg __unused)
2070 {
2071 	if (nr_to_scan > INT_MAX)
2072 		nr_to_scan = INT_MAX;
2073 	sx_xlock(&zfs_vnlru_lock);
2074 	vnlru_free_vfsops(nr_to_scan, &zfs_vfsops, zfs_vnlru_marker);
2075 	sx_xunlock(&zfs_vnlru_lock);
2076 }
2077 
/*
 * One-time setup of the ZFS POSIX layer: announces the ZPL version,
 * initializes the .zfs control and znode subsystems, adjusts the vnode
 * limit (i386 only), registers the ZPL objset type, creates the zfsvfs
 * taskq and installs the ARC prune callback.  zfs_fini() undoes this.
 */
void
zfs_init(void)
{

	printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");

	/*
	 * Initialize .zfs directory structures
	 */
	zfsctl_init();

	/*
	 * Initialize znode cache, vnode ops, etc...
	 */
	zfs_znode_init();

	/*
	 * Reduce number of vnodes. Originally number of vnodes is calculated
	 * with UFS inode in mind. We reduce it here, because it's too big for
	 * ZFS/i386.
	 */
	zfs_vnodes_adjust();

	/* Register zpl_get_file_info for objsets of type DMU_OST_ZFS. */
	dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info);

	zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0);

	/*
	 * Set up the marker vnode and lock used by zfs_prune_task() before
	 * the callback itself is registered with the ARC.
	 */
	zfs_vnlru_marker = vnlru_alloc_marker();
	sx_init(&zfs_vnlru_lock, "zfs vnlru lock");
	zfs_prune = arc_add_prune_callback(zfs_prune_task, NULL);
}
2109 
/*
 * Counterpart of zfs_init(): removes the ARC prune callback, releases the
 * vnlru marker and its lock, destroys the zfsvfs taskq, calls the .zfs
 * control and znode fini routines, and restores the vnode limit that
 * zfs_vnodes_adjust() may have changed.
 */
void
zfs_fini(void)
{
	/* Unregister the callback before freeing the state it uses. */
	arc_remove_prune_callback(zfs_prune);
	vnlru_free_marker(zfs_vnlru_marker);
	sx_destroy(&zfs_vnlru_lock);

	taskq_destroy(zfsvfs_taskq);
	zfsctl_fini();
	zfs_znode_fini();
	zfs_vnodes_adjust_back();
}
2122 
2123 int
2124 zfs_busy(void)
2125 {
2126 	return (zfs_active_fs_count != 0);
2127 }
2128 
2129 /*
2130  * Release VOPs and unmount a suspended filesystem.
2131  */
2132 int
2133 zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
2134 {
2135 	ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
2136 	ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));
2137 
2138 	/*
2139 	 * We already own this, so just hold and rele it to update the
2140 	 * objset_t, as the one we had before may have been evicted.
2141 	 */
2142 	objset_t *os;
2143 	VERIFY3P(ds->ds_owner, ==, zfsvfs);
2144 	VERIFY(dsl_dataset_long_held(ds));
2145 	dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
2146 	dsl_pool_config_enter(dp, FTAG);
2147 	VERIFY0(dmu_objset_from_ds(ds, &os));
2148 	dsl_pool_config_exit(dp, FTAG);
2149 	zfsvfs->z_os = os;
2150 
2151 	/* release the VOPs */
2152 	ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
2153 	ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
2154 
2155 	/*
2156 	 * Try to force unmount this file system.
2157 	 */
2158 	(void) zfs_umount(zfsvfs->z_vfs, 0);
2159 	zfsvfs->z_unmounted = B_TRUE;
2160 	return (0);
2161 }
2162 
/*
 * Raise the ZPL version of a filesystem to `newvers'.
 *
 * Returns 0 on success, or:
 *   EINVAL   newvers is outside [ZPL_VERSION_INITIAL, ZPL_VERSION] or is
 *            lower than the filesystem's current version;
 *   ENOTSUP  the pool's SPA version is lower than
 *            zfs_spa_version_map(newvers);
 *   other    whatever dmu_tx_assign() or zap_update() returned.
 *
 * When the new version is at least ZPL_VERSION_SA and the filesystem is
 * not yet using SA, an SA master node is created in the same transaction.
 */
int
zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
{
	int error;
	objset_t *os = zfsvfs->z_os;
	dmu_tx_t *tx;

	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
		return (SET_ERROR(EINVAL));

	/* Downgrades are rejected. */
	if (newvers < zfsvfs->z_version)
		return (SET_ERROR(EINVAL));

	if (zfs_spa_version_map(newvers) >
	    spa_version(dmu_objset_spa(zfsvfs->z_os)))
		return (SET_ERROR(ENOTSUP));

	/*
	 * Declare everything the transaction will touch: the version
	 * entry in the master node and, if SA is being enabled, the
	 * ZFS_SA_ATTRS entry plus a newly created ZAP object.
	 */
	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
		    ZFS_SA_ATTRS);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	}
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		/* Never assigned, so abort rather than commit. */
		dmu_tx_abort(tx);
		return (error);
	}

	/* Store the new version number in the master node. */
	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
	    8, 1, &newvers, tx);

	if (error) {
		/* The tx was assigned, so it has to be committed. */
		dmu_tx_commit(tx);
		return (error);
	}

	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
		uint64_t sa_obj;

		ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
		    SPA_VERSION_SA);
		/* Create the SA master node and link it from the master node. */
		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
		    DMU_OT_NONE, 0, tx);

		error = zap_add(os, MASTER_NODE_OBJ,
		    ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
		/* A zap_add() failure is only caught via this assertion. */
		ASSERT0(error);

		VERIFY0(sa_set_sa_object(os, sa_obj));
		sa_register_update_callback(os, zfs_sa_upgrade);
	}

	/* Log the old version; z_version is only updated further down. */
	spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
	    "from %ju to %ju", (uintmax_t)zfsvfs->z_version,
	    (uintmax_t)newvers);
	dmu_tx_commit(tx);

	/* Publish the new version in both in-core structures. */
	zfsvfs->z_version = newvers;
	os->os_version = newvers;

	zfs_set_fuid_feature(zfsvfs);

	return (0);
}
2229 
2230 /*
2231  * Return true if the corresponding vfs's unmounted flag is set.
2232  * Otherwise return false.
2233  * If this function returns true we know VFS unmount has been initiated.
2234  */
2235 boolean_t
2236 zfs_get_vfs_flag_unmounted(objset_t *os)
2237 {
2238 	zfsvfs_t *zfvp;
2239 	boolean_t unmounted = B_FALSE;
2240 
2241 	ASSERT3U(dmu_objset_type(os), ==, DMU_OST_ZFS);
2242 
2243 	mutex_enter(&os->os_user_ptr_lock);
2244 	zfvp = dmu_objset_get_user(os);
2245 	if (zfvp != NULL && zfvp->z_vfs != NULL &&
2246 	    (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT))
2247 		unmounted = B_TRUE;
2248 	mutex_exit(&os->os_user_ptr_lock);
2249 
2250 	return (unmounted);
2251 }
2252 
#ifdef _KERNEL
/*
 * Rewrite the f_mntfromname of every entry on the mount list after a
 * rename from `oldname' to `newname'.
 *
 * An entry is updated when its source name equals oldname, or when it
 * starts with oldname immediately followed by '/' or '@'; in the latter
 * case the remainder of the name is kept and appended to newname.
 * The walk is done with mountlist_mtx held.
 */
void
zfsvfs_update_fromname(const char *oldname, const char *newname)
{
	char renamed[MAXPATHLEN];
	struct mount *mp;
	size_t prefixlen = strlen(oldname);

	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		char *from = mp->mnt_stat.f_mntfromname;

		if (strcmp(from, oldname) == 0) {
			/* Exact match: the renamed dataset itself. */
			(void) strlcpy(from, newname,
			    sizeof (mp->mnt_stat.f_mntfromname));
		} else if (strncmp(from, oldname, prefixlen) == 0 &&
		    (from[prefixlen] == '/' || from[prefixlen] == '@')) {
			/*
			 * Prefix match: build the new name in a scratch
			 * buffer first, since the tail being copied lives in
			 * the destination.
			 */
			(void) snprintf(renamed, sizeof (renamed), "%s%s",
			    newname, from + prefixlen);
			(void) strlcpy(from, renamed,
			    sizeof (mp->mnt_stat.f_mntfromname));
		}
	}
	mtx_unlock(&mountlist_mtx);
}
#endif
2284 
2285 /*
2286  * Find a prison with ZFS info.
2287  * Return the ZFS info and the (locked) prison.
2288  */
2289 static struct zfs_jailparam *
2290 zfs_jailparam_find(struct prison *spr, struct prison **prp)
2291 {
2292 	struct prison *pr;
2293 	struct zfs_jailparam *zjp;
2294 
2295 	for (pr = spr; ; pr = pr->pr_parent) {
2296 		mtx_lock(&pr->pr_mtx);
2297 		if (pr == &prison0) {
2298 			zjp = &zfs_jailparam0;
2299 			break;
2300 		}
2301 		zjp = osd_jail_get(pr, zfs_jailparam_slot);
2302 		if (zjp != NULL)
2303 			break;
2304 		mtx_unlock(&pr->pr_mtx);
2305 	}
2306 	*prp = pr;
2307 
2308 	return (zjp);
2309 }
2310 
/*
 * Ensure a prison has its own ZFS info.  If zjpp is non-null, point it to the
 * ZFS info and lock the prison.
 *
 * When zjpp is NULL the prison is unlocked again before returning.  A newly
 * created record starts out as a copy of the ZFS info found on the nearest
 * ancestor (or of zfs_jailparam0).
 */
static void
zfs_jailparam_alloc(struct prison *pr, struct zfs_jailparam **zjpp)
{
	struct prison *ppr;
	struct zfs_jailparam *zjp, *nzjp;
	void **rsv;

	/* If this prison already has ZFS info, return that. */
	zjp = zfs_jailparam_find(pr, &ppr);
	/* zfs_jailparam_find() returns with ppr locked; here ppr is pr. */
	if (ppr == pr)
		goto done;

	/*
	 * Allocate a new info record.  Then check again, in case something
	 * changed during the allocation.
	 */
	/* The ancestor's lock is dropped before the M_WAITOK allocation. */
	mtx_unlock(&ppr->pr_mtx);
	nzjp = malloc(sizeof (struct zfs_jailparam), M_PRISON, M_WAITOK);
	rsv = osd_reserve(zfs_jailparam_slot);
	zjp = zfs_jailparam_find(pr, &ppr);
	if (ppr == pr) {
		/* The prison has its own info by now; discard ours. */
		free(nzjp, M_PRISON);
		osd_free_reserved(rsv);
		goto done;
	}
	/* Inherit the initial values from the ancestor. */
	/*
	 * ppr (the ancestor) is still locked from the lookup above.  Lock pr
	 * as well, attach the new record, copy the ancestor's values into it,
	 * then release only the ancestor so that pr stays locked for "done".
	 */
	mtx_lock(&pr->pr_mtx);
	(void) osd_jail_set_reserved(pr, zfs_jailparam_slot, rsv, nzjp);
	(void) memcpy(nzjp, zjp, sizeof (*zjp));
	zjp = nzjp;
	mtx_unlock(&ppr->pr_mtx);
done:
	/* pr is locked on every path that reaches this label. */
	if (zjpp != NULL)
		*zjpp = zjp;
	else
		mtx_unlock(&pr->pr_mtx);
}
2352 
2353 /*
2354  * Jail OSD methods for ZFS VFS info.
2355  */
2356 static int
2357 zfs_jailparam_create(void *obj, void *data)
2358 {
2359 	struct prison *pr = obj;
2360 	struct vfsoptlist *opts = data;
2361 	int jsys;
2362 
2363 	if (vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys)) == 0 &&
2364 	    jsys == JAIL_SYS_INHERIT)
2365 		return (0);
2366 	/*
2367 	 * Inherit a prison's initial values from its parent
2368 	 * (different from JAIL_SYS_INHERIT which also inherits changes).
2369 	 */
2370 	zfs_jailparam_alloc(pr, NULL);
2371 	return (0);
2372 }
2373 
2374 static int
2375 zfs_jailparam_get(void *obj, void *data)
2376 {
2377 	struct prison *ppr, *pr = obj;
2378 	struct vfsoptlist *opts = data;
2379 	struct zfs_jailparam *zjp;
2380 	int jsys, error;
2381 
2382 	zjp = zfs_jailparam_find(pr, &ppr);
2383 	jsys = (ppr == pr) ? JAIL_SYS_NEW : JAIL_SYS_INHERIT;
2384 	error = vfs_setopt(opts, "zfs", &jsys, sizeof (jsys));
2385 	if (error != 0 && error != ENOENT)
2386 		goto done;
2387 	if (jsys == JAIL_SYS_NEW) {
2388 		error = vfs_setopt(opts, "zfs.mount_snapshot",
2389 		    &zjp->mount_snapshot, sizeof (zjp->mount_snapshot));
2390 		if (error != 0 && error != ENOENT)
2391 			goto done;
2392 	} else {
2393 		/*
2394 		 * If this prison is inheriting its ZFS info, report
2395 		 * empty/zero parameters.
2396 		 */
2397 		static int mount_snapshot = 0;
2398 
2399 		error = vfs_setopt(opts, "zfs.mount_snapshot",
2400 		    &mount_snapshot, sizeof (mount_snapshot));
2401 		if (error != 0 && error != ENOENT)
2402 			goto done;
2403 	}
2404 	error = 0;
2405 done:
2406 	mtx_unlock(&ppr->pr_mtx);
2407 	return (error);
2408 }
2409 
2410 static int
2411 zfs_jailparam_set(void *obj, void *data)
2412 {
2413 	struct prison *pr = obj;
2414 	struct prison *ppr;
2415 	struct vfsoptlist *opts = data;
2416 	int error, jsys, mount_snapshot;
2417 
2418 	/* Set the parameters, which should be correct. */
2419 	error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys));
2420 	if (error == ENOENT)
2421 		jsys = -1;
2422 	error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot,
2423 	    sizeof (mount_snapshot));
2424 	if (error == ENOENT)
2425 		mount_snapshot = -1;
2426 	else
2427 		jsys = JAIL_SYS_NEW;
2428 	switch (jsys) {
2429 	case JAIL_SYS_NEW:
2430 	{
2431 		/* "zfs=new" or "zfs.*": the prison gets its own ZFS info. */
2432 		struct zfs_jailparam *zjp;
2433 
2434 		/*
2435 		 * A child jail cannot have more permissions than its parent
2436 		 */
2437 		if (pr->pr_parent != &prison0) {
2438 			zjp = zfs_jailparam_find(pr->pr_parent, &ppr);
2439 			mtx_unlock(&ppr->pr_mtx);
2440 			if (zjp->mount_snapshot < mount_snapshot) {
2441 				return (EPERM);
2442 			}
2443 		}
2444 		zfs_jailparam_alloc(pr, &zjp);
2445 		if (mount_snapshot != -1)
2446 			zjp->mount_snapshot = mount_snapshot;
2447 		mtx_unlock(&pr->pr_mtx);
2448 		break;
2449 	}
2450 	case JAIL_SYS_INHERIT:
2451 		/* "zfs=inherit": inherit the parent's ZFS info. */
2452 		mtx_lock(&pr->pr_mtx);
2453 		osd_jail_del(pr, zfs_jailparam_slot);
2454 		mtx_unlock(&pr->pr_mtx);
2455 		break;
2456 	case -1:
2457 		/*
2458 		 * If the setting being changed is not ZFS related
2459 		 * then do nothing.
2460 		 */
2461 		break;
2462 	}
2463 
2464 	return (0);
2465 }
2466 
2467 static int
2468 zfs_jailparam_check(void *obj __unused, void *data)
2469 {
2470 	struct vfsoptlist *opts = data;
2471 	int error, jsys, mount_snapshot;
2472 
2473 	/* Check that the parameters are correct. */
2474 	error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys));
2475 	if (error != ENOENT) {
2476 		if (error != 0)
2477 			return (error);
2478 		if (jsys != JAIL_SYS_NEW && jsys != JAIL_SYS_INHERIT)
2479 			return (EINVAL);
2480 	}
2481 	error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot,
2482 	    sizeof (mount_snapshot));
2483 	if (error != ENOENT) {
2484 		if (error != 0)
2485 			return (error);
2486 		if (mount_snapshot != 0 && mount_snapshot != 1)
2487 			return (EINVAL);
2488 	}
2489 	return (0);
2490 }
2491 
2492 static void
2493 zfs_jailparam_destroy(void *data)
2494 {
2495 
2496 	free(data, M_PRISON);
2497 }
2498 
2499 static void
2500 zfs_jailparam_sysinit(void *arg __unused)
2501 {
2502 	struct prison *pr;
2503 	osd_method_t  methods[PR_MAXMETHOD] = {
2504 		[PR_METHOD_CREATE] = zfs_jailparam_create,
2505 		[PR_METHOD_GET] = zfs_jailparam_get,
2506 		[PR_METHOD_SET] = zfs_jailparam_set,
2507 		[PR_METHOD_CHECK] = zfs_jailparam_check,
2508 	};
2509 
2510 	zfs_jailparam_slot = osd_jail_register(zfs_jailparam_destroy, methods);
2511 	/* Copy the defaults to any existing prisons. */
2512 	sx_slock(&allprison_lock);
2513 	TAILQ_FOREACH(pr, &allprison, pr_list)
2514 		zfs_jailparam_alloc(pr, NULL);
2515 	sx_sunlock(&allprison_lock);
2516 }
2517 
2518 static void
2519 zfs_jailparam_sysuninit(void *arg __unused)
2520 {
2521 
2522 	osd_jail_deregister(zfs_jailparam_slot);
2523 }
2524 
/*
 * Hook the jail-parameter setup and teardown routines into the kernel's
 * SYSINIT/SYSUNINIT mechanism at SI_SUB_DRIVERS, SI_ORDER_ANY.
 */
SYSINIT(zfs_jailparam_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY,
	zfs_jailparam_sysinit, NULL);
SYSUNINIT(zfs_jailparam_sysuninit, SI_SUB_DRIVERS, SI_ORDER_ANY,
	zfs_jailparam_sysuninit, NULL);
2529