xref: /netbsd-src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vfsops.c (revision bdc22b2e01993381dcefeff2bc9b56ca75a4235c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
24  * All rights reserved.
25  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
26  * Copyright (c) 2014 Integros [integros.com]
27  */
28 
29 /* Portions Copyright 2010 Robert Milkowski */
30 
31 #include <sys/types.h>
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/kernel.h>
35 #include <sys/sysmacros.h>
36 #include <sys/kmem.h>
37 #include <sys/acl.h>
38 #include <sys/vnode.h>
39 #include <sys/vfs.h>
40 #include <sys/mntent.h>
41 #include <sys/mount.h>
42 #include <sys/cmn_err.h>
43 #include <sys/zfs_znode.h>
44 #include <sys/zfs_dir.h>
45 #include <sys/zil.h>
46 #include <sys/fs/zfs.h>
47 #include <sys/dmu.h>
48 #include <sys/dsl_prop.h>
49 #include <sys/dsl_dataset.h>
50 #include <sys/dsl_deleg.h>
51 #include <sys/spa.h>
52 #include <sys/zap.h>
53 #include <sys/sa.h>
54 #include <sys/sa_impl.h>
55 #include <sys/varargs.h>
56 #include <sys/policy.h>
57 #include <sys/atomic.h>
58 #include <sys/zfs_ioctl.h>
59 #include <sys/zfs_ctldir.h>
60 #include <sys/zfs_fuid.h>
61 #include <sys/sunddi.h>
62 #include <sys/dnlc.h>
63 #include <sys/dmu_objset.h>
64 #include <sys/spa_boot.h>
65 #include "zfs_comutil.h"
66 
67 #ifdef __FreeBSD_kernel__
68 
69 #include <sys/jail.h>
70 
/* Mutex registered under the name "zfs_debug"; set up by MTX_SYSINIT. */
struct mtx zfs_debug_mtx;
MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);

/* Root node of the vfs.zfs sysctl tree. */
SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");

/* vfs.zfs.super_owner: read-write knob backed by zfs_super_owner. */
int zfs_super_owner;
SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
    "File system owner can perform privileged operation on his file systems");

/* vfs.zfs.debug: read-write tunable backed by zfs_debug_level. */
int zfs_debug_level;
SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
    "Debug level");

/*
 * vfs.zfs.version.{acl,spa,zpl}: read-only nodes exporting the
 * compiled-in ZFS_ACL_VERSION, SPA_VERSION and ZPL_VERSION values.
 */
SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
static int zfs_version_acl = ZFS_ACL_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
    "ZFS_ACL_VERSION");
static int zfs_version_spa = SPA_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
    "SPA_VERSION");
static int zfs_version_zpl = ZPL_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
    "ZPL_VERSION");
94 
/*
 * Forward declarations of the file-local functions used by the FreeBSD
 * build: the VFS entry points referenced from zfs_vfsops, plus
 * zfs_objset_close() and zfs_freevfs().
 */
static int zfs_mount(vfs_t *vfsp);
static int zfs_umount(vfs_t *vfsp, int fflag);
static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
static int zfs_sync(vfs_t *vfsp, int waitfor);
static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
    struct ucred **credanonp, int *numsecflavors, int **secflavors);
static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
static void zfs_objset_close(zfsvfs_t *zfsvfs);
static void zfs_freevfs(vfs_t *vfsp);
106 
/*
 * FreeBSD VFS operations vector for ZFS.  Each slot points at the
 * file-local implementation declared above.
 */
struct vfsops zfs_vfsops = {
	.vfs_mount =		zfs_mount,
	.vfs_unmount =		zfs_umount,
	.vfs_root =		zfs_root,
	.vfs_statfs =		zfs_statfs,
	.vfs_vget =		zfs_vget,
	.vfs_sync =		zfs_sync,
	.vfs_checkexp =		zfs_checkexp,
	.vfs_fhtovp =		zfs_fhtovp,
};

/* Register the vector under the name "zfs" with the jail/delegated-admin flags. */
VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);
119 
120 #endif /* __FreeBSD_kernel__ */
121 
122 #ifdef __NetBSD__
123 
124 #include <sys/mkdev.h>
125 #include <miscfs/genfs/genfs.h>
126 
/* NetBSD definitions of the debug level and debug mutex. */
int zfs_debug_level;
kmutex_t zfs_debug_mtx;

/*
 * Compatibility shims for code shared with FreeBSD: the Giant lock
 * macros expand to nothing and vfs_stdsync() always evaluates to 0.
 */
#define	DROP_GIANT()	/* nothing */
#define PICKUP_GIANT()	/* nothing */
#define vfs_stdsync(a, b) 0
133 
/*
 * Forward declarations of the file-local functions used by the NetBSD
 * build.  Note the signatures differ from the FreeBSD variants
 * (e.g. zfs_mount takes the path and mount data explicitly).
 */
static int zfs_mount(vfs_t *vfsp, const char *path, void *data, size_t *data_len);
static int zfs_umount(vfs_t *vfsp, int fflag);
static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
static int zfs_netbsd_root(vfs_t *vfsp, vnode_t **vpp);
static int zfs_statvfs(vfs_t *vfsp, struct statvfs *statp);
static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp);
static int zfs_vget(vfs_t *vfsp, ino_t ino, vnode_t **vpp);
static int zfs_sync(vfs_t *vfsp, int waitfor);
static int zfs_netbsd_sync(vfs_t *vfsp, int waitfor, cred_t *cr);
static void zfs_freevfs(vfs_t *vfsp);

/* Module-wide init/fini hooks; installed as vfs_init/vfs_done below. */
void zfs_init(void);
void zfs_fini(void);

/* Vnode operations description, defined outside this file. */
extern const struct vnodeopv_desc zfs_vnodeop_opv_desc;

/* NULL-terminated list of vnode op descriptions handed to the VFS layer. */
static const struct vnodeopv_desc * const zfs_vnodeop_descs[] = {
	&zfs_vnodeop_opv_desc,
	NULL,
};
154 
155 struct vfsops zfs_vfsops = {
156 	.vfs_name = MOUNT_ZFS,
157 	.vfs_min_mount_data = sizeof(struct zfs_args),
158 	.vfs_opv_descs = zfs_vnodeop_descs,
159 	.vfs_mount = zfs_mount,
160 	.vfs_unmount = zfs_umount,
161 	.vfs_root = zfs_netbsd_root,
162 	.vfs_statvfs = zfs_statvfs,
163 	.vfs_sync = zfs_netbsd_sync,
164 	.vfs_vget = zfs_vget,
165 	.vfs_loadvnode = zfs_loadvnode,
166 	.vfs_fhtovp = zfs_fhtovp,
167 	.vfs_init = zfs_init,
168 	.vfs_done = zfs_fini,
169 	.vfs_start = (void *)nullop,
170 	.vfs_renamelock_enter = genfs_renamelock_enter,
171 	.vfs_renamelock_exit = genfs_renamelock_exit,
172 	.vfs_reinit = (void *)nullop,
173 	.vfs_vptofh = (void *)eopnotsupp,
174 	.vfs_fhtovp = (void *)eopnotsupp,
175 	.vfs_quotactl = (void *)eopnotsupp,
176 	.vfs_extattrctl = (void *)eopnotsupp,
177 	.vfs_suspendctl = genfs_suspendctl,
178 	.vfs_snapshot = (void *)eopnotsupp,
179 	.vfs_fsync = (void *)eopnotsupp,
180 };
181 
182 static bool
183 zfs_sync_selector(void *cl, struct vnode *vp)
184 {
185 	znode_t *zp;
186 
187 	/*
188 	 * Skip the vnode/inode if inaccessible, or if the
189 	 * atime is clean.
190 	 */
191 	zp = VTOZ(vp);
192 	return zp != NULL && vp->v_type != VNON && zp->z_atime_dirty != 0
193 	    && !zp->z_unlinked;
194 }
195 
196 static int
197 zfs_netbsd_sync(vfs_t *vfsp, int waitfor, cred_t *cr)
198 {
199 	struct vnode_iterator *marker;
200 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
201 	vnode_t *vp;
202 
203 	/*
204 	 * On NetBSD, we need to push out atime updates.  Solaris does
205 	 * this during VOP_INACTIVE, but that does not work well with the
206 	 * BSD VFS, so we do it in batch here.
207 	 */
208 	vfs_vnode_iterator_init(vfsp, &marker);
209 	while ((vp = vfs_vnode_iterator_next(marker, zfs_sync_selector, NULL)))
210 	{
211 		znode_t *zp;
212 		dmu_buf_t *dbp;
213 		dmu_tx_t *tx;
214 		int error;
215 
216 		error = vn_lock(vp, LK_EXCLUSIVE);
217 		if (error) {
218 			VN_RELE(vp);
219 			continue;
220 		}
221 		ZFS_ENTER(zfsvfs);
222 		zp = VTOZ(vp);
223 		tx = dmu_tx_create(zfsvfs->z_os);
224 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
225 		zfs_sa_upgrade_txholds(tx, zp);
226 		error = dmu_tx_assign(tx, TXG_WAIT);
227 		if (error) {
228 			dmu_tx_abort(tx);
229 		} else {
230 			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
231 			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
232 			zp->z_atime_dirty = 0;
233 			dmu_tx_commit(tx);
234 		}
235 		ZFS_EXIT(zfsvfs);
236 		vput(vp);
237 	}
238 	vfs_vnode_iterator_destroy(marker);
239 
240 	/*
241 	 * Then do the regular ZFS stuff.
242 	 */
243 	return zfs_sync(vfsp, waitfor);
244 }
245 
246 static int
247 zfs_netbsd_root(vfs_t *vfsp, vnode_t **vpp)
248 {
249 
250 	return zfs_root(vfsp, LK_EXCLUSIVE | LK_RETRY, vpp);
251 }
252 
253 #endif /* __NetBSD__ */
254 
/*
 * Count of active ZFS file systems.
 * It is needed to prevent the module from being unloaded
 * after a forced unmount (umount -f).
 */
static uint32_t	zfs_active_fs_count = 0;
261 
262 /*ARGSUSED*/
263 static int
264 zfs_sync(vfs_t *vfsp, int waitfor)
265 {
266         /*
267 	 * Data integrity is job one.  We don't want a compromised kernel
268 	 * writing to the storage pool, so we never sync during panic.
269 	 */
270 	if (panicstr)
271 		return (0);
272 
273 	/*
274 	 * Ignore the system syncher.  ZFS already commits async data
275 	 * at zfs_txg_timeout intervals.
276 	 */
277 	if (waitfor == MNT_LAZY)
278 		return (0);
279 
280 	if (vfsp != NULL) {
281 		/*
282 		 * Sync a specific filesystem.
283 		 */
284 		zfsvfs_t *zfsvfs = vfsp->vfs_data;
285 		dsl_pool_t *dp;
286 		int error;
287 
288 		error = vfs_stdsync(vfsp, waitfor);
289 		if (error != 0)
290 			return (error);
291 
292 		ZFS_ENTER(zfsvfs);
293 		dp = dmu_objset_pool(zfsvfs->z_os);
294 
295 		/*
296 		 * If the system is shutting down, then skip any
297 		 * filesystems which may exist on a suspended pool.
298 		 */
299 		if (sys_shutdown && spa_suspended(dp->dp_spa)) {
300 			ZFS_EXIT(zfsvfs);
301 			return (0);
302 		}
303 
304 		if (zfsvfs->z_log != NULL)
305 			zil_commit(zfsvfs->z_log, 0);
306 
307 		ZFS_EXIT(zfsvfs);
308 	} else {
309 		/*
310 		 * Sync all ZFS filesystems.  This is what happens when you
311 		 * run sync(1M).  Unlike other filesystems, ZFS honors the
312 		 * request by waiting for all pools to commit all dirty data.
313 		 */
314 		spa_sync_allpools();
315 	}
316 
317 	return (0);
318 }
319 
#ifdef illumos
/*
 * Pick a (major, minor) device number that is not currently mounted and
 * store it in *dev.
 *
 * The inner loop advances zfs_minor (wrapping at MAXMIN32) until an
 * unmounted device number is found or the search comes back to where
 * it started.  In the latter case a fresh major number is obtained
 * with getudev() and the search restarts.  Updates to zfs_major and
 * zfs_minor are made with zfs_dev_mtx held.
 *
 * Returns 0 on success, -1 if no new major number could be obtained.
 */
static int
zfs_create_unique_device(dev_t *dev)
{
	major_t new_major;

	do {
		ASSERT3U(zfs_minor, <=, MAXMIN32);
		/* Remember where this pass started to detect a full wrap. */
		minor_t start = zfs_minor;
		do {
			mutex_enter(&zfs_dev_mtx);
			if (zfs_minor >= MAXMIN32) {
				/*
				 * If we're still using the real major
				 * keep out of /dev/zfs and /dev/zvol minor
				 * number space.  If we're using a getudev()'ed
				 * major number, we can use all of its minors.
				 */
				if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
					zfs_minor = ZFS_MIN_MINOR;
				else
					zfs_minor = 0;
			} else {
				zfs_minor++;
			}
			*dev = makedevice(zfs_major, zfs_minor);
			mutex_exit(&zfs_dev_mtx);
		} while (vfs_devismounted(*dev) && zfs_minor != start);
		/*
		 * This inner guard is redundant: the whole function is
		 * already compiled only when illumos is defined.
		 */
#ifdef illumos
		if (zfs_minor == start) {
			/*
			 * We are using all ~262,000 minor numbers for the
			 * current major number.  Create a new major number.
			 */
			if ((new_major = getudev()) == (major_t)-1) {
				cmn_err(CE_WARN,
				    "zfs_mount: Can't get unique major "
				    "device number.");
				return (-1);
			}
			mutex_enter(&zfs_dev_mtx);
			zfs_major = new_major;
			zfs_minor = 0;

			mutex_exit(&zfs_dev_mtx);
		} else {
			/* Found an unmounted device number. */
			break;
		}
		/* CONSTANTCONDITION */
#endif
	} while (1);

	return (0);
}
#endif	/* illumos */
375 
376 
377 static void
378 atime_changed_cb(void *arg, uint64_t newval)
379 {
380 	zfsvfs_t *zfsvfs = arg;
381 
382 	if (newval == TRUE) {
383 		zfsvfs->z_atime = TRUE;
384 		zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
385 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
386 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
387 	} else {
388 		zfsvfs->z_atime = FALSE;
389 		zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
390 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
391 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
392 	}
393 }
394 
395 static void
396 xattr_changed_cb(void *arg, uint64_t newval)
397 {
398 	zfsvfs_t *zfsvfs = arg;
399 
400 	if (newval == TRUE) {
401 		/* XXX locking on vfs_flag? */
402 #ifdef TODO
403 		zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
404 #endif
405 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
406 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
407 	} else {
408 		/* XXX locking on vfs_flag? */
409 #ifdef TODO
410 		zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
411 #endif
412 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
413 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
414 	}
415 }
416 
417 static void
418 blksz_changed_cb(void *arg, uint64_t newval)
419 {
420 	zfsvfs_t *zfsvfs = arg;
421 	ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
422 	ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
423 	ASSERT(ISP2(newval));
424 
425 	zfsvfs->z_max_blksz = newval;
426 	zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
427 }
428 
429 static void
430 readonly_changed_cb(void *arg, uint64_t newval)
431 {
432 	zfsvfs_t *zfsvfs = arg;
433 
434 	if (newval) {
435 		/* XXX locking on vfs_flag? */
436 		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
437 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
438 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
439 	} else {
440 		/* XXX locking on vfs_flag? */
441 		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
442 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
443 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
444 	}
445 }
446 
447 static void
448 setuid_changed_cb(void *arg, uint64_t newval)
449 {
450 	zfsvfs_t *zfsvfs = arg;
451 
452 	if (newval == FALSE) {
453 		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
454 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
455 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
456 	} else {
457 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
458 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
459 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
460 	}
461 }
462 
463 static void
464 exec_changed_cb(void *arg, uint64_t newval)
465 {
466 	zfsvfs_t *zfsvfs = arg;
467 
468 	if (newval == FALSE) {
469 		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
470 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
471 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
472 	} else {
473 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
474 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
475 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
476 	}
477 }
478 
479 /*
480  * The nbmand mount option can be changed at mount time.
481  * We can't allow it to be toggled on live file systems or incorrect
482  * behavior may be seen from cifs clients
483  *
484  * This property isn't registered via dsl_prop_register(), but this callback
485  * will be called when a file system is first mounted
486  */
487 static void
488 nbmand_changed_cb(void *arg, uint64_t newval)
489 {
490 	zfsvfs_t *zfsvfs = arg;
491 	if (newval == FALSE) {
492 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
493 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
494 	} else {
495 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
496 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
497 	}
498 }
499 
500 static void
501 snapdir_changed_cb(void *arg, uint64_t newval)
502 {
503 	zfsvfs_t *zfsvfs = arg;
504 
505 	zfsvfs->z_show_ctldir = newval;
506 }
507 
508 static void
509 vscan_changed_cb(void *arg, uint64_t newval)
510 {
511 	zfsvfs_t *zfsvfs = arg;
512 
513 	zfsvfs->z_vscan = newval;
514 }
515 
516 static void
517 acl_mode_changed_cb(void *arg, uint64_t newval)
518 {
519 	zfsvfs_t *zfsvfs = arg;
520 
521 	zfsvfs->z_acl_mode = newval;
522 }
523 
524 static void
525 acl_inherit_changed_cb(void *arg, uint64_t newval)
526 {
527 	zfsvfs_t *zfsvfs = arg;
528 
529 	zfsvfs->z_acl_inherit = newval;
530 }
531 
532 static int
533 zfs_register_callbacks(vfs_t *vfsp)
534 {
535 	struct dsl_dataset *ds = NULL;
536 	objset_t *os = NULL;
537 	zfsvfs_t *zfsvfs = NULL;
538 	uint64_t nbmand;
539 	boolean_t readonly = B_FALSE;
540 	boolean_t do_readonly = B_FALSE;
541 	boolean_t setuid = B_FALSE;
542 	boolean_t do_setuid = B_FALSE;
543 	boolean_t exec = B_FALSE;
544 	boolean_t do_exec = B_FALSE;
545 #ifdef illumos
546 	boolean_t devices = B_FALSE;
547 	boolean_t do_devices = B_FALSE;
548 #endif
549 	boolean_t xattr = B_FALSE;
550 	boolean_t do_xattr = B_FALSE;
551 	boolean_t atime = B_FALSE;
552 	boolean_t do_atime = B_FALSE;
553 	int error = 0;
554 
555 	ASSERT(vfsp);
556 	zfsvfs = vfsp->vfs_data;
557 	ASSERT(zfsvfs);
558 	os = zfsvfs->z_os;
559 
560 	/*
561 	 * This function can be called for a snapshot when we update snapshot's
562 	 * mount point, which isn't really supported.
563 	 */
564 	if (dmu_objset_is_snapshot(os))
565 		return (EOPNOTSUPP);
566 
567 	/*
568 	 * The act of registering our callbacks will destroy any mount
569 	 * options we may have.  In order to enable temporary overrides
570 	 * of mount options, we stash away the current values and
571 	 * restore them after we register the callbacks.
572 	 */
573 	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
574 	    !spa_writeable(dmu_objset_spa(os))) {
575 		readonly = B_TRUE;
576 		do_readonly = B_TRUE;
577 	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
578 		readonly = B_FALSE;
579 		do_readonly = B_TRUE;
580 	}
581 	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
582 		setuid = B_FALSE;
583 		do_setuid = B_TRUE;
584 	} else {
585 		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
586 			setuid = B_FALSE;
587 			do_setuid = B_TRUE;
588 		} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
589 			setuid = B_TRUE;
590 			do_setuid = B_TRUE;
591 		}
592 	}
593 	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
594 		exec = B_FALSE;
595 		do_exec = B_TRUE;
596 	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
597 		exec = B_TRUE;
598 		do_exec = B_TRUE;
599 	}
600 	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
601 		xattr = B_FALSE;
602 		do_xattr = B_TRUE;
603 	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
604 		xattr = B_TRUE;
605 		do_xattr = B_TRUE;
606 	}
607 	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
608 		atime = B_FALSE;
609 		do_atime = B_TRUE;
610 	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
611 		atime = B_TRUE;
612 		do_atime = B_TRUE;
613 	}
614 
615 	/*
616 	 * We need to enter pool configuration here, so that we can use
617 	 * dsl_prop_get_int_ds() to handle the special nbmand property below.
618 	 * dsl_prop_get_integer() can not be used, because it has to acquire
619 	 * spa_namespace_lock and we can not do that because we already hold
620 	 * z_teardown_lock.  The problem is that spa_config_sync() is called
621 	 * with spa_namespace_lock held and the function calls ZFS vnode
622 	 * operations to write the cache file and thus z_teardown_lock is
623 	 * acquired after spa_namespace_lock.
624 	 */
625 	ds = dmu_objset_ds(os);
626 	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
627 
628 	/*
629 	 * nbmand is a special property.  It can only be changed at
630 	 * mount time.
631 	 *
632 	 * This is weird, but it is documented to only be changeable
633 	 * at mount time.
634 	 */
635 	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
636 		nbmand = B_FALSE;
637 	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
638 		nbmand = B_TRUE;
639 	} else if (error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0) {
640 		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
641 		return (error);
642 	}
643 
644 	/*
645 	 * Register property callbacks.
646 	 *
647 	 * It would probably be fine to just check for i/o error from
648 	 * the first prop_register(), but I guess I like to go
649 	 * overboard...
650 	 */
651 	error = dsl_prop_register(ds,
652 	    zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
653 	error = error ? error : dsl_prop_register(ds,
654 	    zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
655 	error = error ? error : dsl_prop_register(ds,
656 	    zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
657 	error = error ? error : dsl_prop_register(ds,
658 	    zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
659 #ifdef illumos
660 	error = error ? error : dsl_prop_register(ds,
661 	    zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs);
662 #endif
663 	error = error ? error : dsl_prop_register(ds,
664 	    zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
665 	error = error ? error : dsl_prop_register(ds,
666 	    zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
667 	error = error ? error : dsl_prop_register(ds,
668 	    zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
669 	error = error ? error : dsl_prop_register(ds,
670 	    zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
671 	error = error ? error : dsl_prop_register(ds,
672 	    zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
673 	    zfsvfs);
674 	error = error ? error : dsl_prop_register(ds,
675 	    zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
676 	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
677 	if (error)
678 		goto unregister;
679 
680 	/*
681 	 * Invoke our callbacks to restore temporary mount options.
682 	 */
683 	if (do_readonly)
684 		readonly_changed_cb(zfsvfs, readonly);
685 	if (do_setuid)
686 		setuid_changed_cb(zfsvfs, setuid);
687 	if (do_exec)
688 		exec_changed_cb(zfsvfs, exec);
689 	if (do_xattr)
690 		xattr_changed_cb(zfsvfs, xattr);
691 	if (do_atime)
692 		atime_changed_cb(zfsvfs, atime);
693 
694 	nbmand_changed_cb(zfsvfs, nbmand);
695 
696 	return (0);
697 
698 unregister:
699 	dsl_prop_unregister_all(ds, zfsvfs);
700 	return (error);
701 }
702 
703 static int
704 zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
705     uint64_t *userp, uint64_t *groupp)
706 {
707 	/*
708 	 * Is it a valid type of object to track?
709 	 */
710 	if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
711 		return (SET_ERROR(ENOENT));
712 
713 	/*
714 	 * If we have a NULL data pointer
715 	 * then assume the id's aren't changing and
716 	 * return EEXIST to the dmu to let it know to
717 	 * use the same ids
718 	 */
719 	if (data == NULL)
720 		return (SET_ERROR(EEXIST));
721 
722 	if (bonustype == DMU_OT_ZNODE) {
723 		znode_phys_t *znp = data;
724 		*userp = znp->zp_uid;
725 		*groupp = znp->zp_gid;
726 	} else {
727 		int hdrsize;
728 		sa_hdr_phys_t *sap = data;
729 		sa_hdr_phys_t sa = *sap;
730 		boolean_t swap = B_FALSE;
731 
732 		ASSERT(bonustype == DMU_OT_SA);
733 
734 		if (sa.sa_magic == 0) {
735 			/*
736 			 * This should only happen for newly created
737 			 * files that haven't had the znode data filled
738 			 * in yet.
739 			 */
740 			*userp = 0;
741 			*groupp = 0;
742 			return (0);
743 		}
744 		if (sa.sa_magic == BSWAP_32(SA_MAGIC)) {
745 			sa.sa_magic = SA_MAGIC;
746 			sa.sa_layout_info = BSWAP_16(sa.sa_layout_info);
747 			swap = B_TRUE;
748 		} else {
749 			VERIFY3U(sa.sa_magic, ==, SA_MAGIC);
750 		}
751 
752 		hdrsize = sa_hdrsize(&sa);
753 		VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t));
754 		*userp = *((uint64_t *)((uintptr_t)data + hdrsize +
755 		    SA_UID_OFFSET));
756 		*groupp = *((uint64_t *)((uintptr_t)data + hdrsize +
757 		    SA_GID_OFFSET));
758 		if (swap) {
759 			*userp = BSWAP_64(*userp);
760 			*groupp = BSWAP_64(*groupp);
761 		}
762 	}
763 	return (0);
764 }
765 
766 static void
767 fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
768     char *domainbuf, int buflen, uid_t *ridp)
769 {
770 	uint64_t fuid;
771 	const char *domain;
772 
773 	fuid = strtonum(fuidstr, NULL);
774 
775 	domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
776 	if (domain)
777 		(void) strlcpy(domainbuf, domain, buflen);
778 	else
779 		domainbuf[0] = '\0';
780 	*ridp = FUID_RID(fuid);
781 }
782 
783 static uint64_t
784 zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
785 {
786 	switch (type) {
787 	case ZFS_PROP_USERUSED:
788 		return (DMU_USERUSED_OBJECT);
789 	case ZFS_PROP_GROUPUSED:
790 		return (DMU_GROUPUSED_OBJECT);
791 	case ZFS_PROP_USERQUOTA:
792 		return (zfsvfs->z_userquota_obj);
793 	case ZFS_PROP_GROUPQUOTA:
794 		return (zfsvfs->z_groupquota_obj);
795 	}
796 	return (0);
797 }
798 
799 int
800 zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
801     uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
802 {
803 	int error;
804 	zap_cursor_t zc;
805 	zap_attribute_t za;
806 	zfs_useracct_t *buf = vbuf;
807 	uint64_t obj;
808 
809 	if (!dmu_objset_userspace_present(zfsvfs->z_os))
810 		return (SET_ERROR(ENOTSUP));
811 
812 	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
813 	if (obj == 0) {
814 		*bufsizep = 0;
815 		return (0);
816 	}
817 
818 	for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
819 	    (error = zap_cursor_retrieve(&zc, &za)) == 0;
820 	    zap_cursor_advance(&zc)) {
821 		if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) >
822 		    *bufsizep)
823 			break;
824 
825 		fuidstr_to_sid(zfsvfs, za.za_name,
826 		    buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);
827 
828 		buf->zu_space = za.za_first_integer;
829 		buf++;
830 	}
831 	if (error == ENOENT)
832 		error = 0;
833 
834 	ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
835 	*bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
836 	*cookiep = zap_cursor_serialize(&zc);
837 	zap_cursor_fini(&zc);
838 	return (error);
839 }
840 
841 /*
842  * buf must be big enough (eg, 32 bytes)
843  */
844 static int
845 id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
846     char *buf, boolean_t addok)
847 {
848 	uint64_t fuid;
849 	int domainid = 0;
850 
851 	if (domain && domain[0]) {
852 		domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
853 		if (domainid == -1)
854 			return (SET_ERROR(ENOENT));
855 	}
856 	fuid = FUID_ENCODE(domainid, rid);
857 	(void) sprintf(buf, "%llx", (longlong_t)fuid);
858 	return (0);
859 }
860 
861 int
862 zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
863     const char *domain, uint64_t rid, uint64_t *valp)
864 {
865 	char buf[32];
866 	int err;
867 	uint64_t obj;
868 
869 	*valp = 0;
870 
871 	if (!dmu_objset_userspace_present(zfsvfs->z_os))
872 		return (SET_ERROR(ENOTSUP));
873 
874 	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
875 	if (obj == 0)
876 		return (0);
877 
878 	err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE);
879 	if (err)
880 		return (err);
881 
882 	err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
883 	if (err == ENOENT)
884 		err = 0;
885 	return (err);
886 }
887 
888 int
889 zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
890     const char *domain, uint64_t rid, uint64_t quota)
891 {
892 	char buf[32];
893 	int err;
894 	dmu_tx_t *tx;
895 	uint64_t *objp;
896 	boolean_t fuid_dirtied;
897 
898 	if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA)
899 		return (SET_ERROR(EINVAL));
900 
901 	if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
902 		return (SET_ERROR(ENOTSUP));
903 
904 	objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj :
905 	    &zfsvfs->z_groupquota_obj;
906 
907 	err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
908 	if (err)
909 		return (err);
910 	fuid_dirtied = zfsvfs->z_fuid_dirty;
911 
912 	tx = dmu_tx_create(zfsvfs->z_os);
913 	dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
914 	if (*objp == 0) {
915 		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
916 		    zfs_userquota_prop_prefixes[type]);
917 	}
918 	if (fuid_dirtied)
919 		zfs_fuid_txhold(zfsvfs, tx);
920 	err = dmu_tx_assign(tx, TXG_WAIT);
921 	if (err) {
922 		dmu_tx_abort(tx);
923 		return (err);
924 	}
925 
926 	mutex_enter(&zfsvfs->z_lock);
927 	if (*objp == 0) {
928 		*objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
929 		    DMU_OT_NONE, 0, tx);
930 		VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
931 		    zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
932 	}
933 	mutex_exit(&zfsvfs->z_lock);
934 
935 	if (quota == 0) {
936 		err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
937 		if (err == ENOENT)
938 			err = 0;
939 	} else {
940 		err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
941 	}
942 	ASSERT(err == 0);
943 	if (fuid_dirtied)
944 		zfs_fuid_sync(zfsvfs, tx);
945 	dmu_tx_commit(tx);
946 	return (err);
947 }
948 
949 boolean_t
950 zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
951 {
952 	char buf[32];
953 	uint64_t used, quota, usedobj, quotaobj;
954 	int err;
955 
956 	usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
957 	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
958 
959 	if (quotaobj == 0 || zfsvfs->z_replay)
960 		return (B_FALSE);
961 
962 	(void) sprintf(buf, "%llx", (longlong_t)fuid);
963 	err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
964 	if (err != 0)
965 		return (B_FALSE);
966 
967 	err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
968 	if (err != 0)
969 		return (B_FALSE);
970 	return (used >= quota);
971 }
972 
973 boolean_t
974 zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup)
975 {
976 	uint64_t fuid;
977 	uint64_t quotaobj;
978 
979 	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
980 
981 	fuid = isgroup ? zp->z_gid : zp->z_uid;
982 
983 	if (quotaobj == 0 || zfsvfs->z_replay)
984 		return (B_FALSE);
985 
986 	return (zfs_fuid_overquota(zfsvfs, isgroup, fuid));
987 }
988 
989 /*
990  * Associate this zfsvfs with the given objset, which must be owned.
991  * This will cache a bunch of on-disk state from the objset in the
992  * zfsvfs.
993  */
994 static int
995 zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
996 {
997 	int error;
998 	uint64_t val;
999 
1000 	zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
1001 	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
1002 	zfsvfs->z_os = os;
1003 
1004 	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
1005 	if (error != 0)
1006 		return (error);
1007 	if (zfsvfs->z_version >
1008 	    zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
1009 		(void) printf("Can't mount a version %lld file system "
1010 		    "on a version %lld pool\n. Pool must be upgraded to mount "
1011 		    "this file system.", (u_longlong_t)zfsvfs->z_version,
1012 		    (u_longlong_t)spa_version(dmu_objset_spa(os)));
1013 		return (SET_ERROR(ENOTSUP));
1014 	}
1015 	error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
1016 	if (error != 0)
1017 		return (error);
1018 	zfsvfs->z_norm = (int)val;
1019 
1020 	error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
1021 	if (error != 0)
1022 		return (error);
1023 	zfsvfs->z_utf8 = (val != 0);
1024 
1025 	error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
1026 	if (error != 0)
1027 		return (error);
1028 	zfsvfs->z_case = (uint_t)val;
1029 
1030 	/*
1031 	 * Fold case on file systems that are always or sometimes case
1032 	 * insensitive.
1033 	 */
1034 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
1035 	    zfsvfs->z_case == ZFS_CASE_MIXED)
1036 		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
1037 
1038 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
1039 	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
1040 
1041 	uint64_t sa_obj = 0;
1042 	if (zfsvfs->z_use_sa) {
1043 		/* should either have both of these objects or none */
1044 		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
1045 		    &sa_obj);
1046 		if (error != 0)
1047 			return (error);
1048 	}
1049 
1050 	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
1051 	    &zfsvfs->z_attr_table);
1052 	if (error != 0)
1053 		return (error);
1054 
1055 	if (zfsvfs->z_version >= ZPL_VERSION_SA)
1056 		sa_register_update_callback(os, zfs_sa_upgrade);
1057 
1058 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
1059 	    &zfsvfs->z_root);
1060 	if (error != 0)
1061 		return (error);
1062 	ASSERT(zfsvfs->z_root != 0);
1063 
1064 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
1065 	    &zfsvfs->z_unlinkedobj);
1066 	if (error != 0)
1067 		return (error);
1068 
1069 	error = zap_lookup(os, MASTER_NODE_OBJ,
1070 	    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
1071 	    8, 1, &zfsvfs->z_userquota_obj);
1072 	if (error == ENOENT)
1073 		zfsvfs->z_userquota_obj = 0;
1074 	else if (error != 0)
1075 		return (error);
1076 
1077 	error = zap_lookup(os, MASTER_NODE_OBJ,
1078 	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
1079 	    8, 1, &zfsvfs->z_groupquota_obj);
1080 	if (error == ENOENT)
1081 		zfsvfs->z_groupquota_obj = 0;
1082 	else if (error != 0)
1083 		return (error);
1084 
1085 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
1086 	    &zfsvfs->z_fuid_obj);
1087 	if (error == ENOENT)
1088 		zfsvfs->z_fuid_obj = 0;
1089 	else if (error != 0)
1090 		return (error);
1091 
1092 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
1093 	    &zfsvfs->z_shares_dir);
1094 	if (error == ENOENT)
1095 		zfsvfs->z_shares_dir = 0;
1096 	else if (error != 0)
1097 		return (error);
1098 
1099 	/*
1100 	 * Only use the name cache if we are looking for a
1101 	 * name on a file system that does not require normalization
1102 	 * or case folding.  We can also look there if we happen to be
1103 	 * on a non-normalizing, mixed sensitivity file system IF we
1104 	 * are looking for the exact name (which is always the case on
1105 	 * FreeBSD).
1106 	 */
1107 	zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
1108 	    ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
1109 	    !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
1110 
1111 	return (0);
1112 }
1113 
1114 int
1115 zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
1116 {
1117 	objset_t *os;
1118 	zfsvfs_t *zfsvfs;
1119 	int error;
1120 
1121 	/*
1122 	 * XXX: Fix struct statfs so this isn't necessary!
1123 	 *
1124 	 * The 'osname' is used as the filesystem's special node, which means
1125 	 * it must fit in statfs.f_mntfromname, or else it can't be
1126 	 * enumerated, so libzfs_mnttab_find() returns NULL, which causes
1127 	 * 'zfs unmount' to think it's not mounted when it is.
1128 	 */
1129 	if (strlen(osname) >= MNAMELEN)
1130 		return (SET_ERROR(ENAMETOOLONG));
1131 
1132 	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
1133 
1134 	/*
1135 	 * We claim to always be readonly so we can open snapshots;
1136 	 * other ZPL code will prevent us from writing to snapshots.
1137 	 */
1138 	error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
1139 	if (error) {
1140 		kmem_free(zfsvfs, sizeof (zfsvfs_t));
1141 		return (error);
1142 	}
1143 
1144 	zfsvfs->z_vfs = NULL;
1145 	zfsvfs->z_parent = zfsvfs;
1146 
1147 	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1148 	mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
1149 	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
1150 	    offsetof(znode_t, z_link_node));
1151 #ifdef DIAGNOSTIC
1152 	rrm_init(&zfsvfs->z_teardown_lock, B_TRUE);
1153 #else
1154 	rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
1155 #endif
1156 	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
1157 	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
1158 	for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1159 		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
1160 
1161 	error = zfsvfs_init(zfsvfs, os);
1162 	if (error != 0) {
1163 		dmu_objset_disown(os, zfsvfs);
1164 		*zfvp = NULL;
1165 		kmem_free(zfsvfs, sizeof (zfsvfs_t));
1166 		return (error);
1167 	}
1168 
1169 	*zfvp = zfsvfs;
1170 	return (0);
1171 }
1172 
1173 static int
1174 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
1175 {
1176 	int error;
1177 
1178 	error = zfs_register_callbacks(zfsvfs->z_vfs);
1179 	if (error)
1180 		return (error);
1181 
1182 	zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
1183 
1184 	/*
1185 	 * If we are not mounting (ie: online recv), then we don't
1186 	 * have to worry about replaying the log as we blocked all
1187 	 * operations out since we closed the ZIL.
1188 	 */
1189 	if (mounting) {
1190 		boolean_t readonly;
1191 
1192 		/*
1193 		 * During replay we remove the read only flag to
1194 		 * allow replays to succeed.
1195 		 */
1196 		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
1197 		if (readonly != 0)
1198 			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
1199 		else
1200 			zfs_unlinked_drain(zfsvfs);
1201 
1202 		/*
1203 		 * Parse and replay the intent log.
1204 		 *
1205 		 * Because of ziltest, this must be done after
1206 		 * zfs_unlinked_drain().  (Further note: ziltest
1207 		 * doesn't use readonly mounts, where
1208 		 * zfs_unlinked_drain() isn't called.)  This is because
1209 		 * ziltest causes spa_sync() to think it's committed,
1210 		 * but actually it is not, so the intent log contains
1211 		 * many txg's worth of changes.
1212 		 *
1213 		 * In particular, if object N is in the unlinked set in
1214 		 * the last txg to actually sync, then it could be
1215 		 * actually freed in a later txg and then reallocated
1216 		 * in a yet later txg.  This would write a "create
1217 		 * object N" record to the intent log.  Normally, this
1218 		 * would be fine because the spa_sync() would have
1219 		 * written out the fact that object N is free, before
1220 		 * we could write the "create object N" intent log
1221 		 * record.
1222 		 *
1223 		 * But when we are in ziltest mode, we advance the "open
1224 		 * txg" without actually spa_sync()-ing the changes to
1225 		 * disk.  So we would see that object N is still
1226 		 * allocated and in the unlinked set, and there is an
1227 		 * intent log record saying to allocate it.
1228 		 */
1229 		if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
1230 			if (zil_replay_disable) {
1231 				zil_destroy(zfsvfs->z_log, B_FALSE);
1232 			} else {
1233 				zfsvfs->z_replay = B_TRUE;
1234 				zil_replay(zfsvfs->z_os, zfsvfs,
1235 				    zfs_replay_vector);
1236 				zfsvfs->z_replay = B_FALSE;
1237 			}
1238 		}
1239 		zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
1240 	}
1241 
1242 	/*
1243 	 * Set the objset user_ptr to track its zfsvfs.
1244 	 */
1245 	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1246 	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1247 	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1248 
1249 	return (0);
1250 }
1251 
1252 extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */
1253 
1254 void
1255 zfsvfs_free(zfsvfs_t *zfsvfs)
1256 {
1257 	int i;
1258 
1259 	/*
1260 	 * This is a barrier to prevent the filesystem from going away in
1261 	 * zfs_znode_move() until we can safely ensure that the filesystem is
1262 	 * not unmounted. We consider the filesystem valid before the barrier
1263 	 * and invalid after the barrier.
1264 	 */
1265 	rw_enter(&zfsvfs_lock, RW_READER);
1266 	rw_exit(&zfsvfs_lock);
1267 
1268 	zfs_fuid_destroy(zfsvfs);
1269 
1270 	mutex_destroy(&zfsvfs->z_znodes_lock);
1271 	mutex_destroy(&zfsvfs->z_lock);
1272 	list_destroy(&zfsvfs->z_all_znodes);
1273 	rrm_destroy(&zfsvfs->z_teardown_lock);
1274 	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
1275 	rw_destroy(&zfsvfs->z_fuid_lock);
1276 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1277 		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1278 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
1279 }
1280 
1281 static void
1282 zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
1283 {
1284 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
1285 	if (zfsvfs->z_vfs) {
1286 		if (zfsvfs->z_use_fuids) {
1287 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1288 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1289 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1290 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1291 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1292 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1293 		} else {
1294 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1295 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1296 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1297 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1298 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1299 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1300 		}
1301 	}
1302 	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
1303 }
1304 
1305 static int
1306 zfs_domount(vfs_t *vfsp, char *osname)
1307 {
1308 	uint64_t recordsize, fsid_guid;
1309 	int error = 0;
1310 	zfsvfs_t *zfsvfs;
1311 	vnode_t *vp;
1312 
1313 	ASSERT(vfsp);
1314 	ASSERT(osname);
1315 
1316 	error = zfsvfs_create(osname, &zfsvfs);
1317 	if (error)
1318 		return (error);
1319 	zfsvfs->z_vfs = vfsp;
1320 
1321 #ifdef illumos
1322 	/* Initialize the generic filesystem structure. */
1323 	vfsp->vfs_bcount = 0;
1324 	vfsp->vfs_data = NULL;
1325 
1326 	if (zfs_create_unique_device(&mount_dev) == -1) {
1327 		error = SET_ERROR(ENODEV);
1328 		goto out;
1329 	}
1330 	ASSERT(vfs_devismounted(mount_dev) == 0);
1331 #endif
1332 
1333 	if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
1334 	    NULL))
1335 		goto out;
1336 	zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
1337 	zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;
1338 
1339 	vfsp->vfs_data = zfsvfs;
1340 #ifdef __FreeBSD_kernel__
1341 	vfsp->mnt_flag |= MNT_LOCAL;
1342 	vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
1343 	vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
1344 	vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
1345 	vfsp->mnt_kern_flag |= MNTK_NO_IOPF;	/* vn_io_fault can be used */
1346 #endif
1347 #ifdef __NetBSD__
1348 	vfsp->mnt_flag |= MNT_LOCAL;
1349 	vfsp->mnt_iflag |= IMNT_MPSAFE;
1350 #endif
1351 
1352 	/*
1353 	 * The fsid is 64 bits, composed of an 8-bit fs type, which
1354 	 * separates our fsid from any other filesystem types, and a
1355 	 * 56-bit objset unique ID.  The objset unique ID is unique to
1356 	 * all objsets open on this system, provided by unique_create().
1357 	 * The 8-bit fs type must be put in the low bits of fsid[1]
1358 	 * because that's where other Solaris filesystems put it.
1359 	 */
1360 	fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
1361 	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
1362 #ifdef __FreeBSD_kernel__
1363 	vfsp->vfs_fsid.val[0] = fsid_guid;
1364 	vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
1365 	    vfsp->mnt_vfc->vfc_typenum & 0xFF;
1366 #endif
1367 #ifdef __NetBSD__
1368 	vfsp->mnt_stat.f_fsidx.__fsid_val[0] = fsid_guid;
1369 	vfsp->mnt_stat.f_fsidx.__fsid_val[1] = fsid_guid >> 32;
1370 #endif
1371 
1372 	/*
1373 	 * Set features for file system.
1374 	 */
1375 	zfs_set_fuid_feature(zfsvfs);
1376 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
1377 		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
1378 		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
1379 		vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
1380 	} else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
1381 		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
1382 		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
1383 	}
1384 	vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
1385 
1386 	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
1387 		uint64_t pval;
1388 
1389 		atime_changed_cb(zfsvfs, B_FALSE);
1390 		readonly_changed_cb(zfsvfs, B_TRUE);
1391 		if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
1392 			goto out;
1393 		xattr_changed_cb(zfsvfs, pval);
1394 		zfsvfs->z_issnap = B_TRUE;
1395 		zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
1396 
1397 		mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1398 		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1399 		mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1400 	} else {
1401 		error = zfsvfs_setup(zfsvfs, B_TRUE);
1402 	}
1403 
1404 #ifdef __FreeBSD_kernel__
1405 	vfs_mountedfrom(vfsp, osname);
1406 #endif
1407 #ifdef __NetBSD__
1408 	set_statvfs_info("on-name", UIO_SYSSPACE, osname, UIO_SYSSPACE, "zfs", vfsp, curlwp);
1409 #endif
1410 
1411 	if (!zfsvfs->z_issnap)
1412 		zfsctl_create(zfsvfs);
1413 out:
1414 	if (error) {
1415 		dmu_objset_disown(zfsvfs->z_os, zfsvfs);
1416 		zfsvfs_free(zfsvfs);
1417 	} else {
1418 		atomic_inc_32(&zfs_active_fs_count);
1419 	}
1420 
1421 	return (error);
1422 }
1423 
1424 void
1425 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
1426 {
1427 	objset_t *os = zfsvfs->z_os;
1428 
1429 	if (!dmu_objset_is_snapshot(os))
1430 		dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
1431 }
1432 
1433 #ifdef SECLABEL
/*
 * Convert a decimal digit string to a uint64_t integer.
 *
 * Returns 0 and stores the value in *objnum on success.  Returns EINVAL if
 * the string contains a character that is not a decimal digit, and
 * EOVERFLOW if the value does not fit in 64 bits; *objnum is left untouched
 * in both error cases.  An empty string converts to 0.
 */
static int
str_to_uint64(char *str, uint64_t *objnum)
{
	const uint64_t max = ~(uint64_t)0;
	uint64_t num = 0;

	for (; *str != '\0'; str++) {
		uint64_t digit;

		if (*str < '0' || *str > '9')
			return (SET_ERROR(EINVAL));
		digit = (uint64_t)(*str - '0');

		/*
		 * num * 10 + digit must not exceed the largest uint64_t;
		 * without this check the result would silently wrap around.
		 */
		if (num > (max - digit) / 10)
			return (SET_ERROR(EOVERFLOW));

		num = num * 10 + digit;
	}

	*objnum = num;
	return (0);
}
1452 
1453 /*
1454  * The boot path passed from the boot loader is in the form of
1455  * "rootpool-name/root-filesystem-object-number'. Convert this
1456  * string to a dataset name: "rootpool-name/root-filesystem-name".
1457  */
1458 static int
1459 zfs_parse_bootfs(char *bpath, char *outpath)
1460 {
1461 	char *slashp;
1462 	uint64_t objnum;
1463 	int error;
1464 
1465 	if (*bpath == 0 || *bpath == '/')
1466 		return (SET_ERROR(EINVAL));
1467 
1468 	(void) strcpy(outpath, bpath);
1469 
1470 	slashp = strchr(bpath, '/');
1471 
1472 	/* if no '/', just return the pool name */
1473 	if (slashp == NULL) {
1474 		return (0);
1475 	}
1476 
1477 	/* if not a number, just return the root dataset name */
1478 	if (str_to_uint64(slashp+1, &objnum)) {
1479 		return (0);
1480 	}
1481 
1482 	*slashp = '\0';
1483 	error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
1484 	*slashp = '/';
1485 
1486 	return (error);
1487 }
1488 
1489 /*
1490  * Check that the hex label string is appropriate for the dataset being
1491  * mounted into the global_zone proper.
1492  *
1493  * Return an error if the hex label string is not default or
1494  * admin_low/admin_high.  For admin_low labels, the corresponding
1495  * dataset must be readonly.
1496  */
1497 int
1498 zfs_check_global_label(const char *dsname, const char *hexsl)
1499 {
1500 	if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
1501 		return (0);
1502 	if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
1503 		return (0);
1504 	if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
1505 		/* must be readonly */
1506 		uint64_t rdonly;
1507 
1508 		if (dsl_prop_get_integer(dsname,
1509 		    zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
1510 			return (SET_ERROR(EACCES));
1511 		return (rdonly ? 0 : EACCES);
1512 	}
1513 	return (SET_ERROR(EACCES));
1514 }
1515 
1516 /*
1517  * Determine whether the mount is allowed according to MAC check.
1518  * by comparing (where appropriate) label of the dataset against
1519  * the label of the zone being mounted into.  If the dataset has
1520  * no label, create one.
1521  *
1522  * Returns 0 if access allowed, error otherwise (e.g. EACCES)
1523  */
1524 static int
1525 zfs_mount_label_policy(vfs_t *vfsp, char *osname)
1526 {
1527 	int		error, retv;
1528 	zone_t		*mntzone = NULL;
1529 	ts_label_t	*mnt_tsl;
1530 	bslabel_t	*mnt_sl;
1531 	bslabel_t	ds_sl;
1532 	char		ds_hexsl[MAXNAMELEN];
1533 
1534 	retv = EACCES;				/* assume the worst */
1535 
1536 	/*
1537 	 * Start by getting the dataset label if it exists.
1538 	 */
1539 	error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
1540 	    1, sizeof (ds_hexsl), &ds_hexsl, NULL);
1541 	if (error)
1542 		return (SET_ERROR(EACCES));
1543 
1544 	/*
1545 	 * If labeling is NOT enabled, then disallow the mount of datasets
1546 	 * which have a non-default label already.  No other label checks
1547 	 * are needed.
1548 	 */
1549 	if (!is_system_labeled()) {
1550 		if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
1551 			return (0);
1552 		return (SET_ERROR(EACCES));
1553 	}
1554 
1555 	/*
1556 	 * Get the label of the mountpoint.  If mounting into the global
1557 	 * zone (i.e. mountpoint is not within an active zone and the
1558 	 * zoned property is off), the label must be default or
1559 	 * admin_low/admin_high only; no other checks are needed.
1560 	 */
1561 	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
1562 	if (mntzone->zone_id == GLOBAL_ZONEID) {
1563 		uint64_t zoned;
1564 
1565 		zone_rele(mntzone);
1566 
1567 		if (dsl_prop_get_integer(osname,
1568 		    zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
1569 			return (SET_ERROR(EACCES));
1570 		if (!zoned)
1571 			return (zfs_check_global_label(osname, ds_hexsl));
1572 		else
1573 			/*
1574 			 * This is the case of a zone dataset being mounted
1575 			 * initially, before the zone has been fully created;
1576 			 * allow this mount into global zone.
1577 			 */
1578 			return (0);
1579 	}
1580 
1581 	mnt_tsl = mntzone->zone_slabel;
1582 	ASSERT(mnt_tsl != NULL);
1583 	label_hold(mnt_tsl);
1584 	mnt_sl = label2bslabel(mnt_tsl);
1585 
1586 	if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) {
1587 		/*
1588 		 * The dataset doesn't have a real label, so fabricate one.
1589 		 */
1590 		char *str = NULL;
1591 
1592 		if (l_to_str_internal(mnt_sl, &str) == 0 &&
1593 		    dsl_prop_set_string(osname,
1594 		    zfs_prop_to_name(ZFS_PROP_MLSLABEL),
1595 		    ZPROP_SRC_LOCAL, str) == 0)
1596 			retv = 0;
1597 		if (str != NULL)
1598 			kmem_free(str, strlen(str) + 1);
1599 	} else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) {
1600 		/*
1601 		 * Now compare labels to complete the MAC check.  If the
1602 		 * labels are equal then allow access.  If the mountpoint
1603 		 * label dominates the dataset label, allow readonly access.
1604 		 * Otherwise, access is denied.
1605 		 */
1606 		if (blequal(mnt_sl, &ds_sl))
1607 			retv = 0;
1608 		else if (bldominates(mnt_sl, &ds_sl)) {
1609 			vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
1610 			retv = 0;
1611 		}
1612 	}
1613 
1614 	label_rele(mnt_tsl);
1615 	zone_rele(mntzone);
1616 	return (retv);
1617 }
1618 #endif	/* SECLABEL */
1619 
1620 #ifdef OPENSOLARIS_MOUNTROOT
1621 static int
1622 zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
1623 {
1624 	int error = 0;
1625 	static int zfsrootdone = 0;
1626 	zfsvfs_t *zfsvfs = NULL;
1627 	znode_t *zp = NULL;
1628 	vnode_t *vp = NULL;
1629 	char *zfs_bootfs;
1630 	char *zfs_devid;
1631 
1632 	ASSERT(vfsp);
1633 
1634 	/*
1635 	 * The filesystem that we mount as root is defined in the
1636 	 * boot property "zfs-bootfs" with a format of
1637 	 * "poolname/root-dataset-objnum".
1638 	 */
1639 	if (why == ROOT_INIT) {
1640 		if (zfsrootdone++)
1641 			return (SET_ERROR(EBUSY));
1642 		/*
1643 		 * the process of doing a spa_load will require the
1644 		 * clock to be set before we could (for example) do
1645 		 * something better by looking at the timestamp on
1646 		 * an uberblock, so just set it to -1.
1647 		 */
1648 		clkset(-1);
1649 
1650 		if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
1651 			cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
1652 			    "bootfs name");
1653 			return (SET_ERROR(EINVAL));
1654 		}
1655 		zfs_devid = spa_get_bootprop("diskdevid");
1656 		error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
1657 		if (zfs_devid)
1658 			spa_free_bootprop(zfs_devid);
1659 		if (error) {
1660 			spa_free_bootprop(zfs_bootfs);
1661 			cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
1662 			    error);
1663 			return (error);
1664 		}
1665 		if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
1666 			spa_free_bootprop(zfs_bootfs);
1667 			cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
1668 			    error);
1669 			return (error);
1670 		}
1671 
1672 		spa_free_bootprop(zfs_bootfs);
1673 
1674 		if (error = vfs_lock(vfsp))
1675 			return (error);
1676 
1677 		if (error = zfs_domount(vfsp, rootfs.bo_name)) {
1678 			cmn_err(CE_NOTE, "zfs_domount: error %d", error);
1679 			goto out;
1680 		}
1681 
1682 		zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
1683 		ASSERT(zfsvfs);
1684 		if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
1685 			cmn_err(CE_NOTE, "zfs_zget: error %d", error);
1686 			goto out;
1687 		}
1688 
1689 		vp = ZTOV(zp);
1690 		mutex_enter(&vp->v_lock);
1691 		vp->v_flag |= VROOT;
1692 		mutex_exit(&vp->v_lock);
1693 		rootvp = vp;
1694 
1695 		/*
1696 		 * Leave rootvp held.  The root file system is never unmounted.
1697 		 */
1698 
1699 		vfs_add((struct vnode *)0, vfsp,
1700 		    (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
1701 out:
1702 		vfs_unlock(vfsp);
1703 		return (error);
1704 	} else if (why == ROOT_REMOUNT) {
1705 		readonly_changed_cb(vfsp->vfs_data, B_FALSE);
1706 		vfsp->vfs_flag |= VFS_REMOUNT;
1707 
1708 		/* refresh mount options */
1709 		zfs_unregister_callbacks(vfsp->vfs_data);
1710 		return (zfs_register_callbacks(vfsp));
1711 
1712 	} else if (why == ROOT_UNMOUNT) {
1713 		zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
1714 		(void) zfs_sync(vfsp, 0, 0);
1715 		return (0);
1716 	}
1717 
1718 	/*
1719 	 * if "why" is equal to anything else other than ROOT_INIT,
1720 	 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
1721 	 */
1722 	return (SET_ERROR(ENOTSUP));
1723 }
1724 #endif	/* OPENSOLARIS_MOUNTROOT */
1725 
/*
 * Copy the pool component of dataset name 'osname' (everything up to the
 * first '/', or the whole name when there is none) into 'poolname' as a
 * NUL-terminated string.
 *
 * Returns 0 on success, or ENAMETOOLONG when the pool name would need
 * MAXNAMELEN bytes or more excluding the terminator; 'poolname' is not
 * written in that case.
 */
static int
getpoolname(const char *osname, char *poolname)
{
	const char *slash = strchr(osname, '/');
	size_t len;

	len = (slash != NULL) ? (size_t)(slash - osname) : strlen(osname);
	if (len >= MAXNAMELEN)
		return (ENAMETOOLONG);

	(void) memcpy(poolname, osname, len);
	poolname[len] = '\0';
	return (0);
}
1744 
1745 /*ARGSUSED*/
1746 #ifdef illumos
1747 static int
1748 zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
1749 #endif
1750 #ifdef __FreeBSD_kernel__
1751 static int
1752 zfs_mount(vfs_t *vfsp)
1753 #endif
1754 #ifdef __NetBSD__
1755 static int
1756 zfs_mount(vfs_t *vfsp, const char *path, void *data, size_t *data_len)
1757 #endif
1758 {
1759 	vnode_t		*mvp = vfsp->mnt_vnodecovered;
1760 	char		*osname;
1761 	int		error = 0;
1762 	int		canwrite;
1763 
1764 #ifdef illumos
1765 	if (mvp->v_type != VDIR)
1766 		return (SET_ERROR(ENOTDIR));
1767 
1768 	mutex_enter(&mvp->v_lock);
1769 	if ((uap->flags & MS_REMOUNT) == 0 &&
1770 	    (uap->flags & MS_OVERLAY) == 0 &&
1771 	    (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
1772 		mutex_exit(&mvp->v_lock);
1773 		return (SET_ERROR(EBUSY));
1774 	}
1775 	mutex_exit(&mvp->v_lock);
1776 
1777 	/*
1778 	 * ZFS does not support passing unparsed data in via MS_DATA.
1779 	 * Users should use the MS_OPTIONSTR interface; this means
1780 	 * that all option parsing is already done and the options struct
1781 	 * can be interrogated.
1782 	 */
1783 	if ((uap->flags & MS_DATA) && uap->datalen > 0)
1784 		return (SET_ERROR(EINVAL));
1785 #endif /* illumos */
1786 
1787 #ifdef __FreeBSD_kernel__
1788 	kthread_t	*td = curthread;
1789 	cred_t		*cr = td->td_ucred;
1790 
1791 	if (!prison_allow(td->td_ucred, PR_ALLOW_MOUNT_ZFS))
1792 		return (SET_ERROR(EPERM));
1793 
1794 	if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
1795 		return (SET_ERROR(EINVAL));
1796 
1797 	/*
1798 	 * If full-owner-access is enabled and delegated administration is
1799 	 * turned on, we must set nosuid.
1800 	 */
1801 	if (zfs_super_owner &&
1802 	    dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
1803 		secpolicy_fs_mount_clearopts(cr, vfsp);
1804 	}
1805 
1806 #endif /* __FreeBSD_kernel__ */
1807 
1808 #ifdef __NetBSD__
1809 	cred_t		*cr = CRED();
1810 	struct mounta	*uap = data;
1811 
1812 	if (uap == NULL)
1813 		return (SET_ERROR(EINVAL));
1814 
1815 	if (mvp->v_type != VDIR)
1816 		return (SET_ERROR(ENOTDIR));
1817 
1818 	mutex_enter(mvp->v_interlock);
1819 	if ((uap->flags & MS_REMOUNT) == 0 &&
1820 	    (uap->flags & MS_OVERLAY) == 0 &&
1821 	    (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
1822 		mutex_exit(mvp->v_interlock);
1823 		return (SET_ERROR(EBUSY));
1824 	}
1825 	mutex_exit(mvp->v_interlock);
1826 
1827 	osname = PNBUF_GET();
1828 	strlcpy(osname, uap->fspec, strlen(uap->fspec) + 1);
1829 #endif /* __NetBSD__ */
1830 
1831 	/*
1832 	 * Check for mount privilege?
1833 	 *
1834 	 * If we don't have privilege then see if
1835 	 * we have local permission to allow it
1836 	 */
1837 	error = secpolicy_fs_mount(cr, mvp, vfsp);
1838 	if (error) {
1839 		if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
1840 			goto out;
1841 
1842 		if (!(vfsp->vfs_flag & MS_REMOUNT)) {
1843 			vattr_t		vattr;
1844 
1845 			/*
1846 			 * Make sure user is the owner of the mount point
1847 			 * or has sufficient privileges.
1848 			 */
1849 
1850 			vattr.va_mask = AT_UID;
1851 
1852 #ifdef __FreeBSD_kernel__
1853 			vn_lock(mvp, LK_SHARED | LK_RETRY);
1854 			if (VOP_GETATTR(mvp, &vattr, cr)) {
1855 				VOP_UNLOCK(mvp, 0);
1856 				goto out;
1857 			}
1858 
1859 			if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
1860 			    VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
1861 				VOP_UNLOCK(mvp, 0);
1862 				goto out;
1863 			}
1864 			VOP_UNLOCK(mvp, 0);
1865 #endif
1866 #ifdef __NetBSD__
1867 			vn_lock(mvp, LK_SHARED | LK_RETRY);
1868 			if (VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) {
1869 				VOP_UNLOCK(mvp, 0);
1870 				goto out;
1871 			}
1872 
1873 			if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
1874 			    VOP_ACCESS(mvp, VWRITE, cr) != 0) {
1875 				VOP_UNLOCK(mvp, 0);
1876 				goto out;
1877 			}
1878 			VOP_UNLOCK(mvp, 0);
1879 #endif
1880 		}
1881 
1882 		secpolicy_fs_mount_clearopts(cr, vfsp);
1883 	}
1884 
1885 	/*
1886 	 * Refuse to mount a filesystem if we are in a local zone and the
1887 	 * dataset is not visible.
1888 	 */
1889 	if (!INGLOBALZONE(curthread) &&
1890 	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
1891 		error = SET_ERROR(EPERM);
1892 		goto out;
1893 	}
1894 
1895 #ifdef SECLABEL
1896 	error = zfs_mount_label_policy(vfsp, osname);
1897 	if (error)
1898 		goto out;
1899 #endif
1900 
1901 #ifdef __FreeBSD_kernel__
1902 	vfsp->vfs_flag |= MNT_NFS4ACLS;
1903 #endif
1904 #ifdef __NetBSD__
1905 	vfsp->mnt_iflag |= IMNT_MPSAFE;
1906 #endif
1907 
1908 	/*
1909 	 * When doing a remount, we simply refresh our temporary properties
1910 	 * according to those options set in the current VFS options.
1911 	 */
1912 	if (vfsp->vfs_flag & MS_REMOUNT) {
1913 		zfsvfs_t *zfsvfs = vfsp->vfs_data;
1914 
1915 		/*
1916 		 * Refresh mount options with z_teardown_lock blocking I/O while
1917 		 * the filesystem is in an inconsistent state.
1918 		 * The lock also serializes this code with filesystem
1919 		 * manipulations between entry to zfs_suspend_fs() and return
1920 		 * from zfs_resume_fs().
1921 		 */
1922 		rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
1923 		zfs_unregister_callbacks(zfsvfs);
1924 		error = zfs_register_callbacks(vfsp);
1925 		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
1926 		goto out;
1927 	}
1928 
1929 #ifdef __FreeBSD_kernel__
1930 	/* Initial root mount: try hard to import the requested root pool. */
1931 	if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
1932 	    (vfsp->vfs_flag & MNT_UPDATE) == 0) {
1933 		char pname[MAXNAMELEN];
1934 
1935 		error = getpoolname(osname, pname);
1936 		if (error == 0)
1937 			error = spa_import_rootpool(pname);
1938 		if (error)
1939 			goto out;
1940 	}
1941 #endif
1942 
1943 	DROP_GIANT();
1944 	error = zfs_domount(vfsp, osname);
1945 	PICKUP_GIANT();
1946 
1947 #ifdef illumos
1948 	/*
1949 	 * Add an extra VFS_HOLD on our parent vfs so that it can't
1950 	 * disappear due to a forced unmount.
1951 	 */
1952 	if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
1953 		VFS_HOLD(mvp->v_vfsp);
1954 #endif
1955 
1956 #ifdef __NetBSD__
1957 	vfs_getnewfsid(vfsp);
1958 
1959 	/* setup zfs mount info */
1960 	strlcpy(vfsp->mnt_stat.f_mntfromname, osname,
1961 	    sizeof(vfsp->mnt_stat.f_mntfromname));
1962 	set_statvfs_info(path, UIO_USERSPACE, vfsp->mnt_stat.f_mntfromname,
1963 	    UIO_SYSSPACE, vfsp->mnt_op->vfs_name, vfsp, curlwp);
1964 #endif
1965 
1966 out:
1967 	return (error);
1968 }
1969 
1970 #ifdef __FreeBSD_kernel__
1971 static int
1972 zfs_statfs(vfs_t *vfsp, struct statfs *statp)
1973 #endif
1974 #ifdef __NetBSD__
1975 static int
1976 zfs_statvfs(vfs_t *vfsp, struct statvfs *statp)
1977 #endif
1978 {
1979 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1980 	uint64_t refdbytes, availbytes, usedobjs, availobjs;
1981 
1982 #ifdef __FreeBSD_kernel__
1983 	statp->f_version = STATFS_VERSION;
1984 #endif
1985 
1986 	ZFS_ENTER(zfsvfs);
1987 
1988 	dmu_objset_space(zfsvfs->z_os,
1989 	    &refdbytes, &availbytes, &usedobjs, &availobjs);
1990 
1991 	/*
1992 	 * The underlying storage pool actually uses multiple block sizes.
1993 	 * We report the fragsize as the smallest block size we support,
1994 	 * and we report our blocksize as the filesystem's maximum blocksize.
1995 	 */
1996 	statp->f_bsize = SPA_MINBLOCKSIZE;
1997 #ifdef __NetBSD__
1998 	statp->f_frsize = SPA_MINBLOCKSIZE;
1999 #endif
2000 	statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
2001 
2002 	/*
2003 	 * The following report "total" blocks of various kinds in the
2004 	 * file system, but reported in terms of f_frsize - the
2005 	 * "fragment" size.
2006 	 */
2007 
2008 	statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
2009 	statp->f_bfree = availbytes / statp->f_bsize;
2010 	statp->f_bavail = statp->f_bfree; /* no root reservation */
2011 
2012 	/*
2013 	 * statvfs() should really be called statufs(), because it assumes
2014 	 * static metadata.  ZFS doesn't preallocate files, so the best
2015 	 * we can do is report the max that could possibly fit in f_files,
2016 	 * and that minus the number actually used in f_ffree.
2017 	 * For f_ffree, report the smaller of the number of object available
2018 	 * and the number of blocks (each object will take at least a block).
2019 	 */
2020 	statp->f_ffree = MIN(availobjs, statp->f_bfree);
2021 #ifndef __FreeBSD__
2022 	statp->f_favail = statp->f_ffree;	/* no "root reservation" */
2023 #endif
2024 	statp->f_files = statp->f_ffree + usedobjs;
2025 
2026 #ifdef __FreeBSD__
2027 	(void) cmpldev(&d32, vfsp->vfs_dev);
2028 	statp->f_fsid = d32;
2029 #endif
2030 #ifdef __NetBSD__
2031 	statp->f_fsid = vfsp->mnt_stat.f_fsidx.__fsid_val[0];
2032 #endif
2033 
2034 	/*
2035 	 * We're a zfs filesystem.
2036 	 */
2037 	(void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename));
2038 
2039 	strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
2040 	    sizeof(statp->f_mntfromname));
2041 	strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
2042 	    sizeof(statp->f_mntonname));
2043 
2044 #ifdef __FreeBSD_kernel__
2045 	statp->f_namemax = MAXNAMELEN - 1;
2046 #endif
2047 #ifdef __NetBSD__
2048 	statp->f_namemax = ZFS_MAXNAMELEN;
2049 #endif
2050 
2051 	ZFS_EXIT(zfsvfs);
2052 	return (0);
2053 }
2054 
2055 static int
2056 zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
2057 {
2058 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
2059 	znode_t *rootzp;
2060 	int error;
2061 
2062 	ZFS_ENTER(zfsvfs);
2063 
2064 	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
2065 	if (error == 0)
2066 		*vpp = ZTOV(rootzp);
2067 
2068 	ZFS_EXIT(zfsvfs);
2069 
2070 	if (error == 0) {
2071 		error = vn_lock(*vpp, flags);
2072 		if (error != 0) {
2073 			VN_RELE(*vpp);
2074 			*vpp = NULL;
2075 		}
2076 	}
2077 	return (error);
2078 }
2079 
2080 /*
2081  * Teardown the zfsvfs::z_os.
2082  *
2083  * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
2084  * and 'z_teardown_inactive_lock' held.
2085  */
2086 static int
2087 zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
2088 {
2089 	znode_t	*zp;
2090 
2091 	rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
2092 
2093 	if (!unmounting) {
2094 		/*
2095 		 * We purge the parent filesystem's vfsp as the parent
2096 		 * filesystem and all of its snapshots have their vnode's
2097 		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
2098 		 * 'z_parent' is self referential for non-snapshots.
2099 		 */
2100 		(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
2101 #ifdef FREEBSD_NAMECACHE
2102 		cache_purgevfs(zfsvfs->z_parent->z_vfs, true);
2103 #endif
2104 	}
2105 
2106 	/*
2107 	 * Close the zil. NB: Can't close the zil while zfs_inactive
2108 	 * threads are blocked as zil_close can call zfs_inactive.
2109 	 */
2110 	if (zfsvfs->z_log) {
2111 		zil_close(zfsvfs->z_log);
2112 		zfsvfs->z_log = NULL;
2113 	}
2114 
2115 	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
2116 
2117 	/*
2118 	 * If we are not unmounting (ie: online recv) and someone already
2119 	 * unmounted this file system while we were doing the switcheroo,
2120 	 * or a reopen of z_os failed then just bail out now.
2121 	 */
2122 	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
2123 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
2124 		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
2125 		return (SET_ERROR(EIO));
2126 	}
2127 
2128 	/*
2129 	 * At this point there are no vops active, and any new vops will
2130 	 * fail with EIO since we have z_teardown_lock for writer (only
2131 	 * relavent for forced unmount).
2132 	 *
2133 	 * Release all holds on dbufs.
2134 	 */
2135 	mutex_enter(&zfsvfs->z_znodes_lock);
2136 	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
2137 	    zp = list_next(&zfsvfs->z_all_znodes, zp))
2138 		if (zp->z_sa_hdl) {
2139 			ASSERT(ZTOV(zp)->v_count >= 0);
2140 			zfs_znode_dmu_fini(zp);
2141 		}
2142 	mutex_exit(&zfsvfs->z_znodes_lock);
2143 
2144 	/*
2145 	 * If we are unmounting, set the unmounted flag and let new vops
2146 	 * unblock.  zfs_inactive will have the unmounted behavior, and all
2147 	 * other vops will fail with EIO.
2148 	 */
2149 	if (unmounting) {
2150 		zfsvfs->z_unmounted = B_TRUE;
2151 		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
2152 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
2153 	}
2154 
2155 	/*
2156 	 * z_os will be NULL if there was an error in attempting to reopen
2157 	 * zfsvfs, so just return as the properties had already been
2158 	 * unregistered and cached data had been evicted before.
2159 	 */
2160 	if (zfsvfs->z_os == NULL)
2161 		return (0);
2162 
2163 	/*
2164 	 * Unregister properties.
2165 	 */
2166 	zfs_unregister_callbacks(zfsvfs);
2167 
2168 	/*
2169 	 * Evict cached data
2170 	 */
2171 	if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) &&
2172 	    !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
2173 		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
2174 	dmu_objset_evict_dbufs(zfsvfs->z_os);
2175 
2176 	return (0);
2177 }
2178 
2179 /*ARGSUSED*/
2180 static int
2181 zfs_umount(vfs_t *vfsp, int fflag)
2182 {
2183 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
2184 	objset_t *os;
2185 	int ret;
2186 #ifdef __FreeBSD_kernel__
2187 	kthread_t *td = curthread;
2188 	cred_t *cr = td->td_ucred;
2189 #endif
2190 #ifdef __NetBSD__
2191 	cred_t *cr = CRED();
2192 #endif
2193 
2194 	ret = secpolicy_fs_unmount(cr, vfsp);
2195 	if (ret) {
2196 		if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
2197 		    ZFS_DELEG_PERM_MOUNT, cr))
2198 			return (ret);
2199 	}
2200 
2201 	/*
2202 	 * We purge the parent filesystem's vfsp as the parent filesystem
2203 	 * and all of its snapshots have their vnode's v_vfsp set to the
2204 	 * parent's filesystem's vfsp.  Note, 'z_parent' is self
2205 	 * referential for non-snapshots.
2206 	 */
2207 	(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
2208 
2209 	/*
2210 	 * Unmount any snapshots mounted under .zfs before unmounting the
2211 	 * dataset itself.
2212 	 */
2213 	if (zfsvfs->z_ctldir != NULL) {
2214 		if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
2215 			return (ret);
2216 	}
2217 
2218 	if (fflag & MS_FORCE) {
2219 		/*
2220 		 * Mark file system as unmounted before calling
2221 		 * vflush(FORCECLOSE). This way we ensure no future vnops
2222 		 * will be called and risk operating on DOOMED vnodes.
2223 		 */
2224 		rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
2225 		zfsvfs->z_unmounted = B_TRUE;
2226 		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
2227 	}
2228 
2229 	/*
2230 	 * Flush all the files.
2231 	 */
2232 #ifdef __FreeBSD_kernel__
2233 	ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
2234 #endif
2235 #ifdef __NetBSD__
2236 	ret = vflush(vfsp, NULL, (fflag & MS_FORCE) ? FORCECLOSE : 0);
2237 #endif
2238 	if (ret != 0)
2239 		return (ret);
2240 
2241 #ifdef illumos
2242 	if (!(fflag & MS_FORCE)) {
2243 		/*
2244 		 * Check the number of active vnodes in the file system.
2245 		 * Our count is maintained in the vfs structure, but the
2246 		 * number is off by 1 to indicate a hold on the vfs
2247 		 * structure itself.
2248 		 *
2249 		 * The '.zfs' directory maintains a reference of its
2250 		 * own, and any active references underneath are
2251 		 * reflected in the vnode count.
2252 		 */
2253 		if (zfsvfs->z_ctldir == NULL) {
2254 			if (vfsp->vfs_count > 1)
2255 				return (SET_ERROR(EBUSY));
2256 		} else {
2257 			if (vfsp->vfs_count > 2 ||
2258 			    zfsvfs->z_ctldir->v_count > 1)
2259 				return (SET_ERROR(EBUSY));
2260 		}
2261 	}
2262 #endif
2263 
2264 	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
2265 	os = zfsvfs->z_os;
2266 
2267 	/*
2268 	 * z_os will be NULL if there was an error in
2269 	 * attempting to reopen zfsvfs.
2270 	 */
2271 	if (os != NULL) {
2272 		/*
2273 		 * Unset the objset user_ptr.
2274 		 */
2275 		mutex_enter(&os->os_user_ptr_lock);
2276 		dmu_objset_set_user(os, NULL);
2277 		mutex_exit(&os->os_user_ptr_lock);
2278 
2279 		/*
2280 		 * Finally release the objset
2281 		 */
2282 		dmu_objset_disown(os, zfsvfs);
2283 	}
2284 
2285 	/*
2286 	 * We can now safely destroy the '.zfs' directory node.
2287 	 */
2288 	if (zfsvfs->z_ctldir != NULL)
2289 		zfsctl_destroy(zfsvfs);
2290 	zfs_freevfs(vfsp);
2291 
2292 	return (0);
2293 }
2294 
2295 #ifdef __FreeBSD_kernel__
2296 static int
2297 zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
2298 #endif
2299 #ifdef __NetBSD__
2300 static int
2301 zfs_vget(vfs_t *vfsp, ino_t ino, vnode_t **vpp)
2302 #endif
2303 {
2304 	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
2305 	znode_t		*zp;
2306 	int 		err;
2307 #ifdef __NetBSD__
2308 	int		flags = LK_EXCLUSIVE;
2309 #endif
2310 
2311 	/*
2312 	 * zfs_zget() can't operate on virtual entries like .zfs/ or
2313 	 * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
2314 	 * This will make NFS to switch to LOOKUP instead of using VGET.
2315 	 */
2316 	if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
2317 	    (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
2318 		return (EOPNOTSUPP);
2319 
2320 	ZFS_ENTER(zfsvfs);
2321 	err = zfs_zget(zfsvfs, ino, &zp);
2322 	if (err == 0 && zp->z_unlinked) {
2323 		VN_RELE(ZTOV(zp));
2324 		err = EINVAL;
2325 	}
2326 	if (err == 0)
2327 		*vpp = ZTOV(zp);
2328 	ZFS_EXIT(zfsvfs);
2329 	if (err == 0)
2330 		err = vn_lock(*vpp, flags);
2331 	if (err != 0)
2332 		*vpp = NULL;
2333 
2334 	return (err);
2335 }
2336 
2337 #ifdef __FreeBSD_kernel__
2338 static int
2339 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
2340     struct ucred **credanonp, int *numsecflavors, int **secflavors)
2341 {
2342 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
2343 
2344 	/*
2345 	 * If this is regular file system vfsp is the same as
2346 	 * zfsvfs->z_parent->z_vfs, but if it is snapshot,
2347 	 * zfsvfs->z_parent->z_vfs represents parent file system
2348 	 * which we have to use here, because only this file system
2349 	 * has mnt_export configured.
2350 	 */
2351 	return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
2352 	    credanonp, numsecflavors, secflavors));
2353 }
2354 
2355 CTASSERT(SHORT_FID_LEN <= sizeof(struct fid));
2356 CTASSERT(LONG_FID_LEN <= sizeof(struct fid));
2357 #endif
2358 
2359 #ifdef __FreeBSD_kernel__
2360 static int
2361 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
2362 #endif
2363 #ifdef __NetBSD__
2364 static int
2365 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp)
2366 #endif
2367 {
2368 	struct componentname cn;
2369 	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
2370 	znode_t		*zp;
2371 	vnode_t		*dvp;
2372 	uint64_t	object = 0;
2373 	uint64_t	fid_gen = 0;
2374 	uint64_t	gen_mask;
2375 	uint64_t	zp_gen;
2376 	int 		i, err;
2377 
2378 	*vpp = NULL;
2379 
2380 #ifdef __NetBSD__
2381 	return (SET_ERROR(ENOTSUP));
2382 #endif
2383 
2384 	ZFS_ENTER(zfsvfs);
2385 
2386 #ifdef __FreeBSD_kernel__
2387 	/*
2388 	 * On FreeBSD we can get snapshot's mount point or its parent file
2389 	 * system mount point depending if snapshot is already mounted or not.
2390 	 */
2391 	if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
2392 		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
2393 		uint64_t	objsetid = 0;
2394 		uint64_t	setgen = 0;
2395 
2396 		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
2397 			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
2398 
2399 		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
2400 			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
2401 
2402 		ZFS_EXIT(zfsvfs);
2403 
2404 		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
2405 		if (err)
2406 			return (SET_ERROR(EINVAL));
2407 		ZFS_ENTER(zfsvfs);
2408 	}
2409 
2410 	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
2411 		zfid_short_t	*zfid = (zfid_short_t *)fidp;
2412 
2413 		for (i = 0; i < sizeof (zfid->zf_object); i++)
2414 			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
2415 
2416 		for (i = 0; i < sizeof (zfid->zf_gen); i++)
2417 			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
2418 	} else {
2419 		ZFS_EXIT(zfsvfs);
2420 		return (SET_ERROR(EINVAL));
2421 	}
2422 
2423 	/*
2424 	 * A zero fid_gen means we are in .zfs or the .zfs/snapshot
2425 	 * directory tree. If the object == zfsvfs->z_shares_dir, then
2426 	 * we are in the .zfs/shares directory tree.
2427 	 */
2428 	if ((fid_gen == 0 &&
2429 	     (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
2430 	    (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
2431 		ZFS_EXIT(zfsvfs);
2432 		VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
2433 		if (object == ZFSCTL_INO_SNAPDIR) {
2434 			cn.cn_nameptr = "snapshot";
2435 			cn.cn_namelen = strlen(cn.cn_nameptr);
2436 			cn.cn_nameiop = LOOKUP;
2437 			cn.cn_flags = ISLASTCN | LOCKLEAF;
2438 			cn.cn_lkflags = flags;
2439 			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
2440 			vput(dvp);
2441 		} else if (object == zfsvfs->z_shares_dir) {
2442 			/*
2443 			 * XXX This branch must not be taken,
2444 			 * if it is, then the lookup below will
2445 			 * explode.
2446 			 */
2447 			cn.cn_nameptr = "shares";
2448 			cn.cn_namelen = strlen(cn.cn_nameptr);
2449 			cn.cn_nameiop = LOOKUP;
2450 			cn.cn_flags = ISLASTCN;
2451 			cn.cn_lkflags = flags;
2452 			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
2453 			vput(dvp);
2454 		} else {
2455 			*vpp = dvp;
2456 		}
2457 		return (err);
2458 	}
2459 #endif
2460 	gen_mask = -1ULL >> (64 - 8 * i);
2461 
2462 	dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
2463 	if (err = zfs_zget(zfsvfs, object, &zp)) {
2464 		ZFS_EXIT(zfsvfs);
2465 		return (err);
2466 	}
2467 	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
2468 	    sizeof (uint64_t));
2469 	zp_gen = zp_gen & gen_mask;
2470 	if (zp_gen == 0)
2471 		zp_gen = 1;
2472 	if (zp->z_unlinked || zp_gen != fid_gen) {
2473 		dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
2474 		VN_RELE(ZTOV(zp));
2475 		ZFS_EXIT(zfsvfs);
2476 		return (SET_ERROR(EINVAL));
2477 	}
2478 
2479 	*vpp = ZTOV(zp);
2480 	ZFS_EXIT(zfsvfs);
2481 #ifdef __FreeBSD_kernel__
2482 	err = vn_lock(*vpp, flags);
2483 	if (err == 0)
2484 		vnode_create_vobject(*vpp, zp->z_size, curthread);
2485 	else
2486 		*vpp = NULL;
2487 #endif
2488 	return (err);
2489 }
2490 
2491 /*
2492  * Block out VOPs and close zfsvfs_t::z_os
2493  *
2494  * Note, if successful, then we return with the 'z_teardown_lock' and
2495  * 'z_teardown_inactive_lock' write held.  We leave ownership of the underlying
2496  * dataset and objset intact so that they can be atomically handed off during
2497  * a subsequent rollback or recv operation and the resume thereafter.
2498  */
2499 int
2500 zfs_suspend_fs(zfsvfs_t *zfsvfs)
2501 {
2502 	int error;
2503 
2504 	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
2505 		return (error);
2506 
2507 	return (0);
2508 }
2509 
2510 /*
2511  * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
2512  * is an invariant across any of the operations that can be performed while the
2513  * filesystem was suspended.  Whether it succeeded or failed, the preconditions
2514  * are the same: the relevant objset and associated dataset are owned by
2515  * zfsvfs, held, and long held on entry.
2516  */
2517 int
2518 zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
2519 {
2520 	int err;
2521 	znode_t *zp;
2522 
2523 	ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
2524 	ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
2525 
2526 	/*
2527 	 * We already own this, so just update the objset_t, as the one we
2528 	 * had before may have been evicted.
2529 	 */
2530 	objset_t *os;
2531 	VERIFY3P(ds->ds_owner, ==, zfsvfs);
2532 	VERIFY(dsl_dataset_long_held(ds));
2533 	VERIFY0(dmu_objset_from_ds(ds, &os));
2534 
2535 	err = zfsvfs_init(zfsvfs, os);
2536 	if (err != 0)
2537 		goto bail;
2538 
2539 	VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
2540 
2541 	zfs_set_fuid_feature(zfsvfs);
2542 
2543 	/*
2544 	 * Attempt to re-establish all the active znodes with
2545 	 * their dbufs.  If a zfs_rezget() fails, then we'll let
2546 	 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
2547 	 * when they try to use their znode.
2548 	 */
2549 	mutex_enter(&zfsvfs->z_znodes_lock);
2550 	for (zp = list_head(&zfsvfs->z_all_znodes); zp;
2551 	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
2552 		(void) zfs_rezget(zp);
2553 	}
2554 	mutex_exit(&zfsvfs->z_znodes_lock);
2555 
2556 bail:
2557 	/* release the VOPs */
2558 	rw_exit(&zfsvfs->z_teardown_inactive_lock);
2559 	rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
2560 
2561 	if (err) {
2562 		/*
2563 		 * Since we couldn't setup the sa framework, try to force
2564 		 * unmount this file system.
2565 		 */
2566 		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
2567 			vfs_ref(zfsvfs->z_vfs);
2568 			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
2569 		}
2570 	}
2571 	return (err);
2572 }
2573 
2574 static void
2575 zfs_freevfs(vfs_t *vfsp)
2576 {
2577 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
2578 
2579 #ifdef illumos
2580 	/*
2581 	 * If this is a snapshot, we have an extra VFS_HOLD on our parent
2582 	 * from zfs_mount().  Release it here.  If we came through
2583 	 * zfs_mountroot() instead, we didn't grab an extra hold, so
2584 	 * skip the VFS_RELE for rootvfs.
2585 	 */
2586 	if (zfsvfs->z_issnap && (vfsp != rootvfs))
2587 		VFS_RELE(zfsvfs->z_parent->z_vfs);
2588 #endif
2589 
2590 	zfsvfs_free(zfsvfs);
2591 
2592 	atomic_dec_32(&zfs_active_fs_count);
2593 }
2594 
#ifdef __FreeBSD_kernel__
#ifdef __i386__
/* Value of desiredvnodes saved by zfs_vnodes_adjust() for later restore. */
static int desiredvnodes_backup;
#endif

/*
 * On i386, remember the current desiredvnodes and, if it still has its
 * computed default value, lower it to three quarters of that value.
 * A no-op on other architectures.
 */
static void
zfs_vnodes_adjust(void)
{
#ifdef __i386__
	int dflt_vnodes;

	desiredvnodes_backup = desiredvnodes;

	/*
	 * Compute the default the same way it is done in vntblinit().
	 * If it equals desiredvnodes, the administrator did not tune the
	 * value and we are free to tune it down.
	 */
	dflt_vnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 *
	    vm_kmem_size / (5 * (sizeof(struct vm_object) +
	    sizeof(struct vnode))));
	if (dflt_vnodes == desiredvnodes)
		desiredvnodes = (3 * dflt_vnodes) / 4;
#endif
}

/*
 * On i386, restore desiredvnodes to the value saved by
 * zfs_vnodes_adjust().  A no-op on other architectures.
 */
static void
zfs_vnodes_adjust_back(void)
{
#ifdef __i386__
	desiredvnodes = desiredvnodes_backup;
#endif
}
#endif /* __FreeBSD_kernel__ */
2630 
2631 #ifdef __NetBSD__
/* NetBSD variant: no vnode limit tuning is performed. */
static void
zfs_vnodes_adjust(void)
{
	/* Intentionally empty. */
}
2636 
/* NetBSD variant: nothing was adjusted, so there is nothing to restore. */
static void
zfs_vnodes_adjust_back(void)
{
	/* Intentionally empty. */
}
2641 #endif
2642 
2643 void
2644 zfs_init(void)
2645 {
2646 
2647 	printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");
2648 
2649 	/*
2650 	 * Initialize .zfs directory structures
2651 	 */
2652 	zfsctl_init();
2653 
2654 	/*
2655 	 * Initialize znode cache, vnode ops, etc...
2656 	 */
2657 	zfs_znode_init();
2658 
2659 	/*
2660 	 * Reduce number of vnodes. Originally number of vnodes is calculated
2661 	 * with UFS inode in mind. We reduce it here, because it's too big for
2662 	 * ZFS/i386.
2663 	 */
2664 	zfs_vnodes_adjust();
2665 
2666 	dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
2667 }
2668 
/*
 * Undo zfs_init(): shut down the .zfs control directory and znode
 * subsystems and restore the vnode limit.
 */
void
zfs_fini(void)
{
	zfsctl_fini();

	zfs_znode_fini();

	zfs_vnodes_adjust_back();
}
2676 
2677 int
2678 zfs_busy(void)
2679 {
2680 	return (zfs_active_fs_count != 0);
2681 }
2682 
2683 int
2684 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
2685 {
2686 	int error;
2687 	objset_t *os = zfsvfs->z_os;
2688 	dmu_tx_t *tx;
2689 
2690 	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
2691 		return (SET_ERROR(EINVAL));
2692 
2693 	if (newvers < zfsvfs->z_version)
2694 		return (SET_ERROR(EINVAL));
2695 
2696 	if (zfs_spa_version_map(newvers) >
2697 	    spa_version(dmu_objset_spa(zfsvfs->z_os)))
2698 		return (SET_ERROR(ENOTSUP));
2699 
2700 	tx = dmu_tx_create(os);
2701 	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
2702 	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2703 		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
2704 		    ZFS_SA_ATTRS);
2705 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2706 	}
2707 	error = dmu_tx_assign(tx, TXG_WAIT);
2708 	if (error) {
2709 		dmu_tx_abort(tx);
2710 		return (error);
2711 	}
2712 
2713 	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
2714 	    8, 1, &newvers, tx);
2715 
2716 	if (error) {
2717 		dmu_tx_commit(tx);
2718 		return (error);
2719 	}
2720 
2721 	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2722 		uint64_t sa_obj;
2723 
2724 		ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
2725 		    SPA_VERSION_SA);
2726 		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
2727 		    DMU_OT_NONE, 0, tx);
2728 
2729 		error = zap_add(os, MASTER_NODE_OBJ,
2730 		    ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
2731 		ASSERT0(error);
2732 
2733 		VERIFY(0 == sa_set_sa_object(os, sa_obj));
2734 		sa_register_update_callback(os, zfs_sa_upgrade);
2735 	}
2736 
2737 	spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
2738 	    "from %llu to %llu", zfsvfs->z_version, newvers);
2739 
2740 	dmu_tx_commit(tx);
2741 
2742 	zfsvfs->z_version = newvers;
2743 
2744 	zfs_set_fuid_feature(zfsvfs);
2745 
2746 	return (0);
2747 }
2748 
2749 /*
2750  * Read a property stored within the master node.
2751  */
2752 int
2753 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
2754 {
2755 	const char *pname;
2756 	int error = ENOENT;
2757 
2758 	/*
2759 	 * Look up the file system's value for the property.  For the
2760 	 * version property, we look up a slightly different string.
2761 	 */
2762 	if (prop == ZFS_PROP_VERSION)
2763 		pname = ZPL_VERSION_STR;
2764 	else
2765 		pname = zfs_prop_to_name(prop);
2766 
2767 	if (os != NULL)
2768 		error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
2769 
2770 	if (error == ENOENT) {
2771 		/* No value set, use the default value */
2772 		switch (prop) {
2773 		case ZFS_PROP_VERSION:
2774 			*value = ZPL_VERSION;
2775 			break;
2776 		case ZFS_PROP_NORMALIZE:
2777 		case ZFS_PROP_UTF8ONLY:
2778 			*value = 0;
2779 			break;
2780 		case ZFS_PROP_CASE:
2781 			*value = ZFS_CASE_SENSITIVE;
2782 			break;
2783 		default:
2784 			return (error);
2785 		}
2786 		error = 0;
2787 	}
2788 	return (error);
2789 }
2790 
#ifdef __FreeBSD_kernel__
#ifdef _KERNEL
/*
 * Rewrite f_mntfromname of mounted file systems after a rename.
 *
 * Walks the mount list under mountlist_mtx.  An entry whose source name
 * equals 'oldname' is replaced by 'newname'; an entry that starts with
 * 'oldname' followed by '/' or '@' has that prefix replaced by 'newname'.
 */
void
zfsvfs_update_fromname(const char *oldname, const char *newname)
{
	char renamed[MAXPATHLEN];
	struct mount *mp;
	char *from;
	size_t prefixlen;

	prefixlen = strlen(oldname);

	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		from = mp->mnt_stat.f_mntfromname;
		if (strcmp(from, oldname) == 0) {
			/* Exact match: the renamed dataset itself. */
			(void)strlcpy(from, newname,
			    sizeof(mp->mnt_stat.f_mntfromname));
		} else if (strncmp(from, oldname, prefixlen) == 0 &&
		    (from[prefixlen] == '/' || from[prefixlen] == '@')) {
			/*
			 * Prefix match: build the new name in a scratch
			 * buffer first, since the suffix is read from the
			 * very buffer that is about to be overwritten.
			 */
			(void)snprintf(renamed, sizeof(renamed), "%s%s",
			    newname, from + prefixlen);
			(void)strlcpy(from, renamed,
			    sizeof(mp->mnt_stat.f_mntfromname));
		}
	}
	mtx_unlock(&mountlist_mtx);
}
#endif
#endif
2824