xref: /netbsd-src/sys/kern/vfs_mount.c (revision 53b02e147d4ed531c0d2a5ca9b3e8026ba3e99b5)
1 /*	$NetBSD: vfs_mount.c,v 1.86 2021/02/16 09:56:32 hannken Exp $	*/
2 
3 /*-
4  * Copyright (c) 1997-2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9  * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 /*
34  * Copyright (c) 1989, 1993
35  *	The Regents of the University of California.  All rights reserved.
36  * (c) UNIX System Laboratories, Inc.
37  * All or some portions of this file are derived from material licensed
38  * to the University of California by American Telephone and Telegraph
39  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
40  * the permission of UNIX System Laboratories, Inc.
41  *
42  * Redistribution and use in source and binary forms, with or without
43  * modification, are permitted provided that the following conditions
44  * are met:
45  * 1. Redistributions of source code must retain the above copyright
46  *    notice, this list of conditions and the following disclaimer.
47  * 2. Redistributions in binary form must reproduce the above copyright
48  *    notice, this list of conditions and the following disclaimer in the
49  *    documentation and/or other materials provided with the distribution.
50  * 3. Neither the name of the University nor the names of its contributors
51  *    may be used to endorse or promote products derived from this software
52  *    without specific prior written permission.
53  *
54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64  * SUCH DAMAGE.
65  *
66  *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
67  */
68 
69 #include <sys/cdefs.h>
70 __KERNEL_RCSID(0, "$NetBSD: vfs_mount.c,v 1.86 2021/02/16 09:56:32 hannken Exp $");
71 
72 #include <sys/param.h>
73 #include <sys/kernel.h>
74 
75 #include <sys/atomic.h>
76 #include <sys/buf.h>
77 #include <sys/conf.h>
78 #include <sys/fcntl.h>
79 #include <sys/filedesc.h>
80 #include <sys/device.h>
81 #include <sys/kauth.h>
82 #include <sys/kmem.h>
83 #include <sys/module.h>
84 #include <sys/mount.h>
85 #include <sys/fstrans.h>
86 #include <sys/namei.h>
87 #include <sys/extattr.h>
88 #include <sys/syscallargs.h>
89 #include <sys/sysctl.h>
90 #include <sys/systm.h>
91 #include <sys/vfs_syscalls.h>
92 #include <sys/vnode_impl.h>
93 
94 #include <miscfs/genfs/genfs.h>
95 #include <miscfs/specfs/specdev.h>
96 
97 #include <uvm/uvm_swap.h>
98 
/* Discriminates entries on the global mount list. */
enum mountlist_type {
	ME_MOUNT,	/* entry represents a real mounted file system */
	ME_MARKER	/* entry is an iterator's place-holder marker */
};
/* One node on the global, TAILQ-linked mount list. */
struct mountlist_entry {
	TAILQ_ENTRY(mountlist_entry) me_list;	/* Mount list. */
	struct mount *me_mount;			/* Actual mount if ME_MOUNT,
						   current mount else. */
	enum mountlist_type me_type;		/* Mount or marker. */
};
/* Opaque iterator handle: just a marker entry embedded in a struct. */
struct mount_iterator {
	struct mountlist_entry mi_entry;
};
112 
static struct vnode *vfs_vnode_iterator_next1(struct vnode_iterator *,
    bool (*)(void *, struct vnode *), void *, bool);

/* Root filesystem. */
vnode_t *			rootvnode;

/* Mounted filesystem list, protected by mountlist_lock. */
static TAILQ_HEAD(mountlist, mountlist_entry) mountlist;
static kmutex_t			mountlist_lock __cacheline_aligned;
int vnode_offset_next_by_lru	/* XXX: ugly hack for pstat.c */
    = offsetof(vnode_impl_t, vi_lrulist.tqe_next);

/* Protects the global list of file system types (vfs_list). */
kmutex_t			vfs_list_lock __cacheline_aligned;

/* Domain for per-mount specificdata; mntid_lock serializes fsid assignment. */
static specificdata_domain_t	mount_specificdata_domain;
static kmutex_t			mntid_lock;

/* Monotonic generation counter handed to each new mount (mountgen_lock). */
static kmutex_t			mountgen_lock __cacheline_aligned;
static uint64_t			mountgen;
132 
/*
 * One-time initialization of the mount subsystem: the global mount
 * list and its lock, the vfs_list lock, the per-mount specificdata
 * domain, and the fsid / mount-generation locks and counter.
 */
void
vfs_mount_sysinit(void)
{

	TAILQ_INIT(&mountlist);
	mutex_init(&mountlist_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&vfs_list_lock, MUTEX_DEFAULT, IPL_NONE);

	mount_specificdata_domain = specificdata_domain_create();
	mutex_init(&mntid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&mountgen_lock, MUTEX_DEFAULT, IPL_NONE);
	mountgen = 0;
}
146 
/*
 * Allocate and minimally initialize a new mount structure.
 *
 * => "vfsops" is the file system's operations vector.
 * => "vp" is the covered vnode; it may be NULL (root mounts).
 * => Returns the mount with one reference, donated to the caller.
 */
struct mount *
vfs_mountalloc(struct vfsops *vfsops, vnode_t *vp)
{
	struct mount *mp;
	int error __diagused;

	mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
	mp->mnt_op = vfsops;
	mp->mnt_refcnt = 1;	/* caller's reference */
	TAILQ_INIT(&mp->mnt_vnodelist);
	mp->mnt_renamelock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
	mp->mnt_vnodelock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
	mp->mnt_updating = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
	mp->mnt_vnodecovered = vp;
	mount_initspecific(mp);

	/* Attach fstrans state; asserted not to fail for a fresh mount. */
	error = fstrans_mount(mp);
	KASSERT(error == 0);

	/* Hand out a unique, monotonically increasing generation number. */
	mutex_enter(&mountgen_lock);
	mp->mnt_gen = mountgen++;
	mutex_exit(&mountgen_lock);

	return mp;
}
172 
/*
 * vfs_rootmountalloc: lookup a filesystem type, and if found allocate and
 * initialize a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 *
 * => Returns 0 with *mpp set to a busied, read-only mount on success;
 *    ENODEV if "fstypename" is unknown, ENOMEM on allocation failure.
 */
int
vfs_rootmountalloc(const char *fstypename, const char *devname,
    struct mount **mpp)
{
	struct vfsops *vfsp = NULL;
	struct mount *mp;
	int error __diagused;

	/* Find the file system type and take a reference to it. */
	mutex_enter(&vfs_list_lock);
	LIST_FOREACH(vfsp, &vfs_list, vfs_list)
		if (!strncmp(vfsp->vfs_name, fstypename,
		    sizeof(mp->mnt_stat.f_fstypename)))
			break;
	if (vfsp == NULL) {
		mutex_exit(&vfs_list_lock);
		return (ENODEV);
	}
	vfsp->vfs_refcount++;
	mutex_exit(&vfs_list_lock);

	if ((mp = vfs_mountalloc(vfsp, NULL)) == NULL)
		return ENOMEM;
	/* A fresh mount cannot be unmounting, so vfs_busy() must succeed. */
	error = vfs_busy(mp);
	KASSERT(error == 0);
	mp->mnt_flag = MNT_RDONLY;
	(void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name,
	    sizeof(mp->mnt_stat.f_fstypename));
	/* Root mounts always cover "/". */
	mp->mnt_stat.f_mntonname[0] = '/';
	mp->mnt_stat.f_mntonname[1] = '\0';
	mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] =
	    '\0';
	(void)copystr(devname, mp->mnt_stat.f_mntfromname,
	    sizeof(mp->mnt_stat.f_mntfromname) - 1, 0);
	*mpp = mp;
	return 0;
}
215 
/*
 * vfs_getnewfsid: get a new unique fsid.
 *
 * Builds a candidate fsid from the file system type and a rolling
 * minor number, then probes with vfs_getvfs() until an unused value
 * is found.  Serialized by mntid_lock.
 */
void
vfs_getnewfsid(struct mount *mp)
{
	static u_short xxxfs_mntid;	/* rolling per-type minor number */
	fsid_t tfsid;
	int mtype;

	mutex_enter(&mntid_lock);
	mtype = makefstype(mp->mnt_op->vfs_name);
	mp->mnt_stat.f_fsidx.__fsid_val[0] = makedev(mtype, 0);
	mp->mnt_stat.f_fsidx.__fsid_val[1] = mtype;
	mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
	if (xxxfs_mntid == 0)
		++xxxfs_mntid;	/* minor 0 is reserved for the type itself */
	tfsid.__fsid_val[0] = makedev(mtype & 0xff, xxxfs_mntid);
	tfsid.__fsid_val[1] = mtype;
	/* Linear-probe until no existing mount uses the candidate fsid. */
	while (vfs_getvfs(&tfsid)) {
		tfsid.__fsid_val[0]++;
		xxxfs_mntid++;
	}
	mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0];
	mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
	mutex_exit(&mntid_lock);
}
243 
244 /*
245  * Lookup a mount point by filesystem identifier.
246  *
247  * XXX Needs to add a reference to the mount point.
248  */
249 struct mount *
250 vfs_getvfs(fsid_t *fsid)
251 {
252 	mount_iterator_t *iter;
253 	struct mount *mp;
254 
255 	mountlist_iterator_init(&iter);
256 	while ((mp = mountlist_iterator_next(iter)) != NULL) {
257 		if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] &&
258 		    mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) {
259 			mountlist_iterator_destroy(iter);
260 			return mp;
261 		}
262 	}
263 	mountlist_iterator_destroy(iter);
264 	return NULL;
265 }
266 
/*
 * Take a reference to a mount structure.
 *
 * The caller must already hold a reference, or must hold
 * mountlist_lock while taking the first one.
 */
void
vfs_ref(struct mount *mp)
{

	KASSERT(mp->mnt_refcnt > 0 || mutex_owned(&mountlist_lock));

	atomic_inc_uint(&mp->mnt_refcnt);
}
278 
/*
 * Drop a reference to a mount structure, freeing if the last reference.
 */
void
vfs_rele(struct mount *mp)
{

	/* Fast path: not the last reference. */
	if (__predict_true((int)atomic_dec_uint_nv(&mp->mnt_refcnt) > 0)) {
		return;
	}

	/*
	 * Nothing else has visibility of the mount: we can now
	 * free the data structures.
	 */
	KASSERT(mp->mnt_refcnt == 0);
	specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
	mutex_obj_free(mp->mnt_updating);
	mutex_obj_free(mp->mnt_renamelock);
	mutex_obj_free(mp->mnt_vnodelock);
	/* mnt_op may be NULL; vfs_mountalloc() can fail before it is set. */
	if (mp->mnt_op != NULL) {
		vfs_delref(mp->mnt_op);
	}
	/* Must be last: fstrans owns the final free of mp (see below). */
	fstrans_unmount(mp);
	/*
	 * Final free of mp gets done from fstrans_mount_dtor().
	 *
	 * Prevents this memory to be reused as a mount before
	 * fstrans releases all references to it.
	 */
}
310 
/*
 * Mark a mount point as busy, and gain a new reference to it.  Used to
 * prevent the file system from being unmounted during critical sections.
 *
 * vfs_busy can be called multiple times and by multiple threads
 * and must be accompanied by the same number of vfs_unbusy calls.
 *
 * => The caller must hold a pre-existing reference to the mount.
 * => Will fail if the file system is being unmounted, or is unmounted.
 * => "wait" selects blocking (fstrans_start) vs. non-blocking
 *    (fstrans_start_nowait, EBUSY on contention) acquisition.
 */
static inline int
_vfs_busy(struct mount *mp, bool wait)
{

	KASSERT(mp->mnt_refcnt > 0);

	if (wait) {
		fstrans_start(mp);
	} else {
		if (fstrans_start_nowait(mp))
			return EBUSY;
	}
	/* Re-check after acquiring fstrans: the mount may be gone now. */
	if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) {
		fstrans_done(mp);
		return ENOENT;
	}
	vfs_ref(mp);
	return 0;
}
340 
/* Blocking variant of _vfs_busy(); waits for fstrans availability. */
int
vfs_busy(struct mount *mp)
{

	return _vfs_busy(mp, true);
}
347 
/* Non-blocking variant of _vfs_busy(); returns EBUSY instead of waiting. */
int
vfs_trybusy(struct mount *mp)
{

	return _vfs_busy(mp, false);
}
354 
/*
 * Unbusy a busy filesystem.
 *
 * Every successful vfs_busy() call must be undone by a vfs_unbusy() call.
 * Releases the fstrans hold and the reference taken by vfs_busy().
 */
void
vfs_unbusy(struct mount *mp)
{

	KASSERT(mp->mnt_refcnt > 0);

	fstrans_done(mp);
	vfs_rele(mp);
}
369 
/* Per-mount vnode iterator: a marker vnode threaded on mnt_vnodelist. */
struct vnode_iterator {
	vnode_impl_t vi_vnode;
};
373 
/*
 * Begin iterating over the vnodes of mount "mp": allocate a marker
 * vnode and insert it at the head of the mount's vnode list.
 */
void
vfs_vnode_iterator_init(struct mount *mp, struct vnode_iterator **vnip)
{
	vnode_t *vp;
	vnode_impl_t *vip;

	vp = vnalloc_marker(mp);
	vip = VNODE_TO_VIMPL(vp);

	mutex_enter(mp->mnt_vnodelock);
	TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vip, vi_mntvnodes);
	/* Non-zero usecount flags the marker as currently on the list. */
	vp->v_usecount = 1;
	mutex_exit(mp->mnt_vnodelock);

	*vnip = (struct vnode_iterator *)vip;
}
390 
/*
 * Finish an iteration: unlink the marker from the vnode list (if it
 * is still linked, indicated by a non-zero usecount) and free it.
 */
void
vfs_vnode_iterator_destroy(struct vnode_iterator *vni)
{
	vnode_impl_t *mvip = &vni->vi_vnode;
	vnode_t *mvp = VIMPL_TO_VNODE(mvip);
	kmutex_t *lock;

	KASSERT(vnis_marker(mvp));
	if (vrefcnt(mvp) != 0) {
		lock = mvp->v_mount->mnt_vnodelock;
		mutex_enter(lock);
		TAILQ_REMOVE(&mvp->v_mount->mnt_vnodelist, mvip, vi_mntvnodes);
		mvp->v_usecount = 0;
		mutex_exit(lock);
	}
	vnfree_marker(mvp);
}
408 
/*
 * Core of the vnode iterator: advance the marker past the next live
 * vnode on the mount's vnode list that passes the optional filter
 * "f(cl, vp)", and return that vnode referenced via vcache_vget().
 * Returns NULL at the end of the list.
 *
 * With do_wait true, vdead_check() may wait for dying vnodes instead
 * of skipping them (VDEAD_NOWAIT) - used by vflush().
 */
static struct vnode *
vfs_vnode_iterator_next1(struct vnode_iterator *vni,
    bool (*f)(void *, struct vnode *), void *cl, bool do_wait)
{
	vnode_impl_t *mvip = &vni->vi_vnode;
	struct mount *mp = VIMPL_TO_VNODE(mvip)->v_mount;
	vnode_t *vp;
	vnode_impl_t *vip;
	kmutex_t *lock;
	int error;

	KASSERT(vnis_marker(VIMPL_TO_VNODE(mvip)));

	lock = mp->mnt_vnodelock;
	do {
		/* Unlink the marker, remembering our successor. */
		mutex_enter(lock);
		vip = TAILQ_NEXT(mvip, vi_mntvnodes);
		TAILQ_REMOVE(&mp->mnt_vnodelist, mvip, vi_mntvnodes);
		VIMPL_TO_VNODE(mvip)->v_usecount = 0;
again:
		if (vip == NULL) {
			/* End of list; the marker stays unlinked. */
			mutex_exit(lock);
			return NULL;
		}
		vp = VIMPL_TO_VNODE(vip);
		KASSERT(vp != NULL);
		mutex_enter(vp->v_interlock);
		/* Skip other markers, dead/dying vnodes and filter misses. */
		if (vnis_marker(vp) ||
		    vdead_check(vp, (do_wait ? 0 : VDEAD_NOWAIT)) ||
		    (f && !(*f)(cl, vp))) {
			mutex_exit(vp->v_interlock);
			vip = TAILQ_NEXT(vip, vi_mntvnodes);
			goto again;
		}

		/* Park the marker after the chosen vnode, then reference it. */
		TAILQ_INSERT_AFTER(&mp->mnt_vnodelist, vip, mvip, vi_mntvnodes);
		VIMPL_TO_VNODE(mvip)->v_usecount = 1;
		mutex_exit(lock);
		/* ENOENT means vp was reclaimed meanwhile: restart from marker. */
		error = vcache_vget(vp);
		KASSERT(error == 0 || error == ENOENT);
	} while (error != 0);

	return vp;
}
453 
/*
 * Public iterator step: as vfs_vnode_iterator_next1(), never waiting
 * for dying vnodes (they are skipped).
 */
struct vnode *
vfs_vnode_iterator_next(struct vnode_iterator *vni,
    bool (*f)(void *, struct vnode *), void *cl)
{

	return vfs_vnode_iterator_next1(vni, f, cl, false);
}
461 
/*
 * Move a vnode from one mount queue to another.
 *
 * => "mp" may be NULL, which only removes the vnode from its old queue.
 * => The caller must donate a reference on the new mount; the reference
 *    on the old mount is released here.
 */
void
vfs_insmntque(vnode_t *vp, struct mount *mp)
{
	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
	struct mount *omp;
	kmutex_t *lock;

	KASSERT(mp == NULL || (mp->mnt_iflag & IMNT_UNMOUNT) == 0 ||
	    vp->v_tag == VT_VFS);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if ((omp = vp->v_mount) != NULL) {
		lock = omp->mnt_vnodelock;
		mutex_enter(lock);
		TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vip, vi_mntvnodes);
		mutex_exit(lock);
	}

	/*
	 * Insert into list of vnodes for the new mount point, if
	 * available.  The caller must take a reference on the mount
	 * structure and donate to the vnode.
	 */
	if ((vp->v_mount = mp) != NULL) {
		lock = mp->mnt_vnodelock;
		mutex_enter(lock);
		TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vip, vi_mntvnodes);
		mutex_exit(lock);
	}

	if (omp != NULL) {
		/* Release reference to old mount. */
		vfs_rele(omp);
	}
}
502 
503 /*
504  * Remove any vnodes in the vnode table belonging to mount point mp.
505  *
506  * If FORCECLOSE is not specified, there should not be any active ones,
507  * return error if any are found (nb: this is a user error, not a
508  * system error). If FORCECLOSE is specified, detach any active vnodes
509  * that are found.
510  *
511  * If WRITECLOSE is set, only flush out regular file vnodes open for
512  * writing.
513  *
514  * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
515  */
#ifdef DEBUG
int busyprt = 0;	/* print out busy vnodes */
struct ctldebug debug1 = { "busyprt", &busyprt };	/* sysctl hook */
#endif
520 
521 static vnode_t *
522 vflushnext(struct vnode_iterator *marker, int *when)
523 {
524 	if (getticks() > *when) {
525 		yield();
526 		*when = getticks() + hz / 10;
527 	}
528 	return vfs_vnode_iterator_next1(marker, NULL, NULL, true);
529 }
530 
/*
 * Flush one vnode.  Referenced on entry, unreferenced on return.
 *
 * Returns 0 if the vnode was skipped, recycled or forcibly closed,
 * EBUSY if it could not be flushed, or an I/O error from fsync.
 */
static int
vflush_one(vnode_t *vp, vnode_t *skipvp, int flags)
{
	int error;
	struct vattr vattr;

	/* Honour the skip vnode and SKIPSYSTEM. */
	if (vp == skipvp ||
	    ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM))) {
		vrele(vp);
		return 0;
	}
	/*
	 * If WRITECLOSE is set, only flush out regular file
	 * vnodes open for writing or open and unlinked.
	 */
	if ((flags & WRITECLOSE)) {
		if (vp->v_type != VREG) {
			vrele(vp);
			return 0;
		}
		error = vn_lock(vp, LK_EXCLUSIVE);
		if (error) {
			/* Vnode died while we waited for the lock. */
			KASSERT(error == ENOENT);
			vrele(vp);
			return 0;
		}
		error = VOP_FSYNC(vp, curlwp->l_cred, FSYNC_WAIT, 0, 0);
		if (error == 0)
			error = VOP_GETATTR(vp, &vattr, curlwp->l_cred);
		VOP_UNLOCK(vp);
		if (error) {
			vrele(vp);
			return error;
		}
		/* Neither open for write nor unlinked: leave it alone. */
		if (vp->v_writecount == 0 && vattr.va_nlink > 0) {
			vrele(vp);
			return 0;
		}
	}
	/*
	 * First try to recycle the vnode.
	 */
	if (vrecycle(vp))
		return 0;
	/*
	 * If FORCECLOSE is set, forcibly close the vnode.
	 * For block or character devices, revert to an
	 * anonymous device.  For all other files, just
	 * kill them.
	 */
	if (flags & FORCECLOSE) {
		if (vrefcnt(vp) > 1 &&
		    (vp->v_type == VBLK || vp->v_type == VCHR))
			vcache_make_anon(vp);
		else
			vgone(vp);
		return 0;
	}
	vrele(vp);
	return EBUSY;
}
595 
596 int
597 vflush(struct mount *mp, vnode_t *skipvp, int flags)
598 {
599 	vnode_t *vp;
600 	struct vnode_iterator *marker;
601 	int busy, error, when, retries = 2;
602 
603 	do {
604 		busy = error = when = 0;
605 
606 		/*
607 		 * First, flush out any vnode references from the
608 		 * deferred vrele list.
609 		 */
610 		vrele_flush(mp);
611 
612 		vfs_vnode_iterator_init(mp, &marker);
613 
614 		while ((vp = vflushnext(marker, &when)) != NULL) {
615 			error = vflush_one(vp, skipvp, flags);
616 			if (error == EBUSY) {
617 				error = 0;
618 				busy++;
619 #ifdef DEBUG
620 				if (busyprt && retries == 0)
621 					vprint("vflush: busy vnode", vp);
622 #endif
623 			} else if (error != 0) {
624 				break;
625 			}
626 		}
627 
628 		vfs_vnode_iterator_destroy(marker);
629 	} while (error == 0 && busy > 0 && retries-- > 0);
630 
631 	if (error)
632 		return error;
633 	if (busy)
634 		return EBUSY;
635 	return 0;
636 }
637 
/*
 * Mount a file system.
 */

/*
 * Scan all active processes to see if any of them have a current or root
 * directory onto which the new filesystem has just been  mounted. If so,
 * replace them with the new mount point.
 */
static void
mount_checkdirs(vnode_t *olddp)
{
	vnode_t *newdp, *rele1, *rele2;
	struct cwdinfo *cwdi;
	struct proc *p;
	bool retry;

	/* Only our own caller references olddp: nothing to fix up. */
	if (vrefcnt(olddp) == 1) {
		return;
	}
	if (VFS_ROOT(olddp->v_mountedhere, LK_EXCLUSIVE, &newdp))
		panic("mount: lost mount");

	/*
	 * Restart the process scan each time we had to drop proc_lock
	 * to take a cwdi write lock, until a full pass finds nothing.
	 */
	do {
		retry = false;
		mutex_enter(&proc_lock);
		PROCLIST_FOREACH(p, &allproc) {
			if ((cwdi = p->p_cwdi) == NULL)
				continue;
			/*
			 * Cannot change to the old directory any more,
			 * so even if we see a stale value it is not a
			 * problem.
			 */
			if (cwdi->cwdi_cdir != olddp &&
			    cwdi->cwdi_rdir != olddp)
				continue;
			retry = true;
			rele1 = NULL;
			rele2 = NULL;
			/* Hold the cwdi across the proc_lock drop. */
			atomic_inc_uint(&cwdi->cwdi_refcnt);
			mutex_exit(&proc_lock);
			rw_enter(&cwdi->cwdi_lock, RW_WRITER);
			if (cwdi->cwdi_cdir == olddp) {
				rele1 = cwdi->cwdi_cdir;
				vref(newdp);
				cwdi->cwdi_cdir = newdp;
			}
			if (cwdi->cwdi_rdir == olddp) {
				rele2 = cwdi->cwdi_rdir;
				vref(newdp);
				cwdi->cwdi_rdir = newdp;
			}
			rw_exit(&cwdi->cwdi_lock);
			cwdfree(cwdi);
			/* Release replaced vnodes outside the locks. */
			if (rele1 != NULL)
				vrele(rele1);
			if (rele2 != NULL)
				vrele(rele2);
			mutex_enter(&proc_lock);
			break;
		}
		mutex_exit(&proc_lock);
	} while (retry);

	/* The system root moves along with the covered directory. */
	if (rootvnode == olddp) {
		vrele(rootvnode);
		vref(newdp);
		rootvnode = newdp;
	}
	vput(newdp);
}
710 
711 /*
712  * Start extended attributes
713  */
714 static int
715 start_extattr(struct mount *mp)
716 {
717 	int error;
718 
719 	error = VFS_EXTATTRCTL(mp, EXTATTR_CMD_START, NULL, 0, NULL);
720 	if (error)
721 		printf("%s: failed to start extattr: error = %d\n",
722 		       mp->mnt_stat.f_mntonname, error);
723 
724 	return error;
725 }
726 
/*
 * Mount a file system of type "vfsops" onto the directory vnode *vpp.
 *
 * => The caller donates a reference to "vfsops"; it is released via
 *    vfs_delref() on early failure or by vfs_rele() of the mount.
 * => On success *vpp is consumed and set to NULL.
 * => On failure after VFS_MOUNT() succeeded, the fresh file system is
 *    forcibly unmounted again (err_mounted path).
 */
int
mount_domount(struct lwp *l, vnode_t **vpp, struct vfsops *vfsops,
    const char *path, int flags, void *data, size_t *data_len)
{
	vnode_t *vp = *vpp;
	struct mount *mp;
	struct pathbuf *pb;
	struct nameidata nd;
	int error, error2;

	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
	    KAUTH_REQ_SYSTEM_MOUNT_NEW, vp, KAUTH_ARG(flags), data);
	if (error) {
		vfs_delref(vfsops);
		return error;
	}

	/* Cannot make a non-dir a mount-point (from here anyway). */
	if (vp->v_type != VDIR) {
		vfs_delref(vfsops);
		return ENOTDIR;
	}

	if (flags & MNT_EXPORTED) {
		vfs_delref(vfsops);
		return EINVAL;
	}

	if ((mp = vfs_mountalloc(vfsops, vp)) == NULL) {
		vfs_delref(vfsops);
		return ENOMEM;
	}

	mp->mnt_stat.f_owner = kauth_cred_geteuid(l->l_cred);

	/*
	 * The underlying file system may refuse the mount for
	 * various reasons.  Allow the user to force it to happen.
	 *
	 * Set the mount level flags.
	 */
	mp->mnt_flag = flags & (MNT_BASIC_FLAGS | MNT_FORCE | MNT_IGNORE);

	mutex_enter(mp->mnt_updating);
	error = VFS_MOUNT(mp, path, data, data_len);
	mp->mnt_flag &= ~MNT_OP_FLAGS;

	if (error != 0)
		goto err_unmounted;

	/*
	 * Validate and prepare the mount point.
	 */
	error = pathbuf_copyin(path, &pb);
	if (error != 0) {
		goto err_mounted;
	}
	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
	error = namei(&nd);
	pathbuf_destroy(pb);
	if (error != 0) {
		goto err_mounted;
	}
	/* The path must still resolve to the vnode we were given. */
	if (nd.ni_vp != vp) {
		vput(nd.ni_vp);
		error = EINVAL;
		goto err_mounted;
	}
	/* Somebody mounted here while we were busy: lose the race. */
	if (vp->v_mountedhere != NULL) {
		vput(nd.ni_vp);
		error = EBUSY;
		goto err_mounted;
	}
	error = vinvalbuf(vp, V_SAVE, l->l_cred, l, 0, 0);
	if (error != 0) {
		vput(nd.ni_vp);
		goto err_mounted;
	}

	/*
	 * Put the new filesystem on the mount list after root.
	 */
	cache_purge(vp);
	mp->mnt_iflag &= ~IMNT_WANTRDWR;

	mountlist_append(mp);
	if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
		vfs_syncer_add_to_worklist(mp);
	vp->v_mountedhere = mp;
	vput(nd.ni_vp);

	/* Redirect processes whose cwd/root was the covered directory. */
	mount_checkdirs(vp);
	mutex_exit(mp->mnt_updating);

	/* Hold an additional reference to the mount across VFS_START(). */
	vfs_ref(mp);
	(void) VFS_STATVFS(mp, &mp->mnt_stat);
	error = VFS_START(mp, 0);
	if (error) {
		vrele(vp);
	} else if (flags & MNT_EXTATTR) {
		if (start_extattr(mp) != 0)
			mp->mnt_flag &= ~MNT_EXTATTR;
	}
	/* Drop reference held for VFS_START(). */
	vfs_rele(mp);
	*vpp = NULL;
	return error;

err_mounted:
	/* Suspend (unless unsupported) and forcibly undo the fresh mount. */
	do {
		error2 = vfs_suspend(mp, 0);
	} while (error2 == EINTR || error2 == ERESTART);
	KASSERT(error2 == 0 || error2 == EOPNOTSUPP);

	if (VFS_UNMOUNT(mp, MNT_FORCE) != 0)
		panic("Unmounting fresh file system failed");

	if (error2 == 0)
		vfs_resume(mp);

err_unmounted:
	vp->v_mountedhere = NULL;
	mutex_exit(mp->mnt_updating);
	vfs_rele(mp);

	return error;
}
855 
/*
 * Do the actual file system unmount.  File system is assumed to have
 * been locked by the caller.
 *
 * => Caller hold reference to the mount, explicitly for dounmount().
 * => On success the covered vnode's v_mountedhere is cleared, the
 *    mount is removed from the mount list and the reference released.
 * => On failure the mount is restored to its previous state.
 */
int
dounmount(struct mount *mp, int flags, struct lwp *l)
{
	vnode_t *coveredvp;
	int error, async, used_syncer, used_extattr;
	const bool was_suspended = fstrans_is_owner(mp);

#if NVERIEXEC > 0
	error = veriexec_unmountchk(mp);
	if (error)
		return (error);
#endif /* NVERIEXEC > 0 */

	/* Suspend the file system unless the caller already did. */
	if (!was_suspended) {
		error = vfs_suspend(mp, 0);
		if (error) {
			return error;
		}
	}

	KASSERT((mp->mnt_iflag & IMNT_GONE) == 0);

	/* Remember state to restore should the unmount fail. */
	used_syncer = (mp->mnt_iflag & IMNT_ONWORKLIST) != 0;
	used_extattr = mp->mnt_flag & MNT_EXTATTR;

	mp->mnt_iflag |= IMNT_UNMOUNT;
	mutex_enter(mp->mnt_updating);
	async = mp->mnt_flag & MNT_ASYNC;
	mp->mnt_flag &= ~MNT_ASYNC;
	cache_purgevfs(mp);	/* remove cache entries for this file sys */
	if (used_syncer)
		vfs_syncer_remove_from_worklist(mp);
	error = 0;
	/* Sync first unless read-only or being forced. */
	if (((mp->mnt_flag & MNT_RDONLY) == 0) && ((flags & MNT_FORCE) == 0)) {
		error = VFS_SYNC(mp, MNT_WAIT, l->l_cred);
	}
	if (error == 0 || (flags & MNT_FORCE)) {
		error = VFS_UNMOUNT(mp, flags);
	}
	if (error) {
		/* Unmount failed: undo everything we changed above. */
		mp->mnt_iflag &= ~IMNT_UNMOUNT;
		if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
			vfs_syncer_add_to_worklist(mp);
		mp->mnt_flag |= async;
		mutex_exit(mp->mnt_updating);
		if (!was_suspended)
			vfs_resume(mp);
		if (used_extattr) {
			if (start_extattr(mp) != 0)
				mp->mnt_flag &= ~MNT_EXTATTR;
			else
				mp->mnt_flag |= MNT_EXTATTR;
		}
		return (error);
	}
	mutex_exit(mp->mnt_updating);

	/*
	 * mark filesystem as gone to prevent further umounts
	 * after mnt_umounting lock is gone, this also prevents
	 * vfs_busy() from succeeding.
	 */
	mp->mnt_iflag |= IMNT_GONE;
	if (!was_suspended)
		vfs_resume(mp);

	/* Detach from the covered vnode. */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		vn_lock(coveredvp, LK_EXCLUSIVE | LK_RETRY);
		coveredvp->v_mountedhere = NULL;
		VOP_UNLOCK(coveredvp);
	}
	mountlist_remove(mp);
	if (TAILQ_FIRST(&mp->mnt_vnodelist) != NULL)
		panic("unmount: dangling vnode");
	vfs_hooks_unmount(mp);

	vfs_rele(mp);	/* reference from mount() */
	if (coveredvp != NULLVP) {
		vrele(coveredvp);
	}
	return (0);
}
944 
/*
 * Unmount all file systems.
 * We traverse the list in reverse order under the assumption that doing so
 * will avoid needing to worry about dependencies.
 *
 * Returns true if any unmount made progress (see vfs_unmountall1()).
 */
bool
vfs_unmountall(struct lwp *l)
{

	printf("unmounting file systems...\n");
	return vfs_unmountall1(l, true, true);
}
957 
/* Log one successful unmount, with "pfx" prefixed to the message. */
static void
vfs_unmount_print(struct mount *mp, const char *pfx)
{

	aprint_verbose("%sunmounted %s on %s type %s\n", pfx,
	    mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname,
	    mp->mnt_stat.f_fstypename);
}
966 
967 /*
968  * Return the mount with the highest generation less than "gen".
969  */
970 static struct mount *
971 vfs_unmount_next(uint64_t gen)
972 {
973 	mount_iterator_t *iter;
974 	struct mount *mp, *nmp;
975 
976 	nmp = NULL;
977 
978 	mountlist_iterator_init(&iter);
979 	while ((mp = mountlist_iterator_next(iter)) != NULL) {
980 		if ((nmp == NULL || mp->mnt_gen > nmp->mnt_gen) &&
981 		    mp->mnt_gen < gen) {
982 			if (nmp != NULL)
983 				vfs_rele(nmp);
984 			nmp = mp;
985 			vfs_ref(nmp);
986 		}
987 	}
988 	mountlist_iterator_destroy(iter);
989 
990 	return nmp;
991 }
992 
/*
 * Forcibly unmount the newest mount on the list.  Returns true on
 * success, false if there was nothing to unmount or it failed.
 */
bool
vfs_unmount_forceone(struct lwp *l)
{
	struct mount *mp;
	int error;

	mp = vfs_unmount_next(mountgen);
	if (mp == NULL) {
		return false;
	}

#ifdef DEBUG
	printf("forcefully unmounting %s (%s)...\n",
	    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname);
#endif
	if ((error = dounmount(mp, MNT_FORCE, l)) == 0) {
		vfs_unmount_print(mp, "forcefully ");
		return true;
	} else {
		/* Drop the reference taken by vfs_unmount_next(). */
		vfs_rele(mp);
	}

#ifdef DEBUG
	printf("forceful unmount of %s failed with error %d\n",
	    mp->mnt_stat.f_mntonname, error);
#endif

	return false;
}
1022 
/*
 * Unmount all mounted file systems, newest first ("gen" walks down
 * through the generation numbers).  "force" adds MNT_FORCE; "verbose"
 * enables progress/failure messages.  Returns true if at least one
 * unmount succeeded.  When the list ends up empty, swap is shut down.
 */
bool
vfs_unmountall1(struct lwp *l, bool force, bool verbose)
{
	struct mount *mp;
	mount_iterator_t *iter;
	bool any_error = false, progress = false;
	uint64_t gen;
	int error;

	gen = mountgen;
	for (;;) {
		mp = vfs_unmount_next(gen);
		if (mp == NULL)
			break;
		gen = mp->mnt_gen;

#ifdef DEBUG
		printf("unmounting %p %s (%s)...\n",
		    (void *)mp, mp->mnt_stat.f_mntonname,
		    mp->mnt_stat.f_mntfromname);
#endif
		if ((error = dounmount(mp, force ? MNT_FORCE : 0, l)) == 0) {
			vfs_unmount_print(mp, "");
			progress = true;
		} else {
			/* Drop the reference from vfs_unmount_next(). */
			vfs_rele(mp);
			if (verbose) {
				printf("unmount of %s failed with error %d\n",
				    mp->mnt_stat.f_mntonname, error);
			}
			any_error = true;
		}
	}
	if (verbose) {
		printf("unmounting done\n");
	}
	if (any_error && verbose) {
		printf("WARNING: some file systems would not unmount\n");
	}
	/* If the mountlist is empty it is time to remove swap. */
	mountlist_iterator_init(&iter);
	if (mountlist_iterator_next(iter) == NULL) {
		uvm_swap_shutdown(l);
	}
	mountlist_iterator_destroy(iter);

	return progress;
}
1071 
/*
 * Sync all disks before shutdown: stop user processes, mark the
 * shutdown in progress, issue a global sync and wait for it.
 */
void
vfs_sync_all(struct lwp *l)
{
	printf("syncing disks... ");

	/* remove user processes from run queue */
	suspendsched();
	(void)spl0();

	/* avoid coming back this way again if we panic. */
	doing_shutdown = 1;

	do_sys_sync(l);

	/* Wait for sync to finish. */
	if (vfs_syncwait() != 0) {
#if defined(DDB) && defined(DEBUG_HALT_BUSY)
		Debugger();
#endif
		printf("giving up\n");
		return;
	} else
		printf("done\n");
}
1096 
1097 /*
1098  * Sync and unmount file systems before shutting down.
1099  */
1100 void
1101 vfs_shutdown(void)
1102 {
1103 	lwp_t *l = curlwp;
1104 
1105 	vfs_sync_all(l);
1106 
1107 	/*
1108 	 * If we have paniced - do not make the situation potentially
1109 	 * worse by unmounting the file systems.
1110 	 */
1111 	if (panicstr != NULL) {
1112 		return;
1113 	}
1114 
1115 	/* Unmount file systems. */
1116 	vfs_unmountall(l);
1117 }
1118 
1119 /*
1120  * Print a list of supported file system types (used by vfs_mountroot)
1121  */
1122 static void
1123 vfs_print_fstypes(void)
1124 {
1125 	struct vfsops *v;
1126 	int cnt = 0;
1127 
1128 	mutex_enter(&vfs_list_lock);
1129 	LIST_FOREACH(v, &vfs_list, vfs_list)
1130 		++cnt;
1131 	mutex_exit(&vfs_list_lock);
1132 
1133 	if (cnt == 0) {
1134 		printf("WARNING: No file system modules have been loaded.\n");
1135 		return;
1136 	}
1137 
1138 	printf("Supported file systems:");
1139 	mutex_enter(&vfs_list_lock);
1140 	LIST_FOREACH(v, &vfs_list, vfs_list) {
1141 		printf(" %s", v->vfs_name);
1142 	}
1143 	mutex_exit(&vfs_list_lock);
1144 	printf("\n");
1145 }
1146 
1147 /*
1148  * Mount the root file system.  If the operator didn't specify a
1149  * file system to use, try all possible file systems until one
1150  * succeeds.
1151  */
1152 int
1153 vfs_mountroot(void)
1154 {
1155 	struct vfsops *v;
1156 	int error = ENODEV;
1157 
1158 	if (root_device == NULL)
1159 		panic("vfs_mountroot: root device unknown");
1160 
1161 	switch (device_class(root_device)) {
1162 	case DV_IFNET:
1163 		if (rootdev != NODEV)
1164 			panic("vfs_mountroot: rootdev set for DV_IFNET "
1165 			    "(0x%llx -> %llu,%llu)",
1166 			    (unsigned long long)rootdev,
1167 			    (unsigned long long)major(rootdev),
1168 			    (unsigned long long)minor(rootdev));
1169 		break;
1170 
1171 	case DV_DISK:
1172 		if (rootdev == NODEV)
1173 			panic("vfs_mountroot: rootdev not set for DV_DISK");
1174 	        if (bdevvp(rootdev, &rootvp))
1175 	                panic("vfs_mountroot: can't get vnode for rootdev");
1176 		error = VOP_OPEN(rootvp, FREAD, FSCRED);
1177 		if (error) {
1178 			printf("vfs_mountroot: can't open root device\n");
1179 			return (error);
1180 		}
1181 		break;
1182 
1183 	case DV_VIRTUAL:
1184 		break;
1185 
1186 	default:
1187 		printf("%s: inappropriate for root file system\n",
1188 		    device_xname(root_device));
1189 		return (ENODEV);
1190 	}
1191 
1192 	/*
1193 	 * If user specified a root fs type, use it.  Make sure the
1194 	 * specified type exists and has a mount_root()
1195 	 */
1196 	if (strcmp(rootfstype, ROOT_FSTYPE_ANY) != 0) {
1197 		v = vfs_getopsbyname(rootfstype);
1198 		error = EFTYPE;
1199 		if (v != NULL) {
1200 			if (v->vfs_mountroot != NULL) {
1201 				error = (v->vfs_mountroot)();
1202 			}
1203 			v->vfs_refcount--;
1204 		}
1205 		goto done;
1206 	}
1207 
1208 	/*
1209 	 * Try each file system currently configured into the kernel.
1210 	 */
1211 	mutex_enter(&vfs_list_lock);
1212 	LIST_FOREACH(v, &vfs_list, vfs_list) {
1213 		if (v->vfs_mountroot == NULL)
1214 			continue;
1215 #ifdef DEBUG
1216 		aprint_normal("mountroot: trying %s...\n", v->vfs_name);
1217 #endif
1218 		v->vfs_refcount++;
1219 		mutex_exit(&vfs_list_lock);
1220 		error = (*v->vfs_mountroot)();
1221 		mutex_enter(&vfs_list_lock);
1222 		v->vfs_refcount--;
1223 		if (!error) {
1224 			aprint_normal("root file system type: %s\n",
1225 			    v->vfs_name);
1226 			break;
1227 		}
1228 	}
1229 	mutex_exit(&vfs_list_lock);
1230 
1231 	if (v == NULL) {
1232 		vfs_print_fstypes();
1233 		printf("no file system for %s", device_xname(root_device));
1234 		if (device_class(root_device) == DV_DISK)
1235 			printf(" (dev 0x%llx)", (unsigned long long)rootdev);
1236 		printf("\n");
1237 		error = EFTYPE;
1238 	}
1239 
1240 done:
1241 	if (error && device_class(root_device) == DV_DISK) {
1242 		VOP_CLOSE(rootvp, FREAD, FSCRED);
1243 		vrele(rootvp);
1244 	}
1245 	if (error == 0) {
1246 		mount_iterator_t *iter;
1247 		struct mount *mp;
1248 		extern struct cwdinfo cwdi0;
1249 
1250 		mountlist_iterator_init(&iter);
1251 		mp = mountlist_iterator_next(iter);
1252 		KASSERT(mp != NULL);
1253 		mountlist_iterator_destroy(iter);
1254 
1255 		mp->mnt_flag |= MNT_ROOTFS;
1256 		mp->mnt_op->vfs_refcount++;
1257 
1258 		/*
1259 		 * Get the vnode for '/'.  Set cwdi0.cwdi_cdir to
1260 		 * reference it, and donate it the reference grabbed
1261 		 * with VFS_ROOT().
1262 		 */
1263 		error = VFS_ROOT(mp, LK_NONE, &rootvnode);
1264 		if (error)
1265 			panic("cannot find root vnode, error=%d", error);
1266 		cwdi0.cwdi_cdir = rootvnode;
1267 		cwdi0.cwdi_rdir = NULL;
1268 
1269 		/*
1270 		 * Now that root is mounted, we can fixup initproc's CWD
1271 		 * info.  All other processes are kthreads, which merely
1272 		 * share proc0's CWD info.
1273 		 */
1274 		initproc->p_cwdi->cwdi_cdir = rootvnode;
1275 		vref(initproc->p_cwdi->cwdi_cdir);
1276 		initproc->p_cwdi->cwdi_rdir = NULL;
1277 		/*
1278 		 * Enable loading of modules from the filesystem
1279 		 */
1280 		module_load_vfs_init();
1281 
1282 	}
1283 	return (error);
1284 }
1285 
1286 /*
1287  * mount_specific_key_create --
1288  *	Create a key for subsystem mount-specific data.
1289  */
1290 int
1291 mount_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor)
1292 {
1293 
1294 	return specificdata_key_create(mount_specificdata_domain, keyp, dtor);
1295 }
1296 
1297 /*
1298  * mount_specific_key_delete --
1299  *	Delete a key for subsystem mount-specific data.
1300  */
1301 void
1302 mount_specific_key_delete(specificdata_key_t key)
1303 {
1304 
1305 	specificdata_key_delete(mount_specificdata_domain, key);
1306 }
1307 
1308 /*
1309  * mount_initspecific --
1310  *	Initialize a mount's specificdata container.
1311  */
1312 void
1313 mount_initspecific(struct mount *mp)
1314 {
1315 	int error __diagused;
1316 
1317 	error = specificdata_init(mount_specificdata_domain,
1318 				  &mp->mnt_specdataref);
1319 	KASSERT(error == 0);
1320 }
1321 
1322 /*
1323  * mount_finispecific --
1324  *	Finalize a mount's specificdata container.
1325  */
1326 void
1327 mount_finispecific(struct mount *mp)
1328 {
1329 
1330 	specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
1331 }
1332 
1333 /*
1334  * mount_getspecific --
1335  *	Return mount-specific data corresponding to the specified key.
1336  */
1337 void *
1338 mount_getspecific(struct mount *mp, specificdata_key_t key)
1339 {
1340 
1341 	return specificdata_getspecific(mount_specificdata_domain,
1342 					 &mp->mnt_specdataref, key);
1343 }
1344 
1345 /*
1346  * mount_setspecific --
1347  *	Set mount-specific data corresponding to the specified key.
1348  */
1349 void
1350 mount_setspecific(struct mount *mp, specificdata_key_t key, void *data)
1351 {
1352 
1353 	specificdata_setspecific(mount_specificdata_domain,
1354 				 &mp->mnt_specdataref, key, data);
1355 }
1356 
1357 /*
1358  * Check to see if a filesystem is mounted on a block device.
1359  */
1360 int
1361 vfs_mountedon(vnode_t *vp)
1362 {
1363 	vnode_t *vq;
1364 	int error = 0;
1365 
1366 	if (vp->v_type != VBLK)
1367 		return ENOTBLK;
1368 	if (spec_node_getmountedfs(vp) != NULL)
1369 		return EBUSY;
1370 	if (spec_node_lookup_by_dev(vp->v_type, vp->v_rdev, &vq) == 0) {
1371 		if (spec_node_getmountedfs(vq) != NULL)
1372 			error = EBUSY;
1373 		vrele(vq);
1374 	}
1375 
1376 	return error;
1377 }
1378 
1379 /*
1380  * Check if a device pointed to by vp is mounted.
1381  *
1382  * Returns:
1383  *   EINVAL	if it's not a disk
1384  *   EBUSY	if it's a disk and mounted
1385  *   0		if it's a disk and not mounted
1386  */
1387 int
1388 rawdev_mounted(vnode_t *vp, vnode_t **bvpp)
1389 {
1390 	vnode_t *bvp;
1391 	dev_t dev;
1392 	int d_type;
1393 
1394 	bvp = NULL;
1395 	d_type = D_OTHER;
1396 
1397 	if (iskmemvp(vp))
1398 		return EINVAL;
1399 
1400 	switch (vp->v_type) {
1401 	case VCHR: {
1402 		const struct cdevsw *cdev;
1403 
1404 		dev = vp->v_rdev;
1405 		cdev = cdevsw_lookup(dev);
1406 		if (cdev != NULL) {
1407 			dev_t blkdev;
1408 
1409 			blkdev = devsw_chr2blk(dev);
1410 			if (blkdev != NODEV) {
1411 				if (vfinddev(blkdev, VBLK, &bvp) != 0) {
1412 					d_type = (cdev->d_flag & D_TYPEMASK);
1413 					/* XXX: what if bvp disappears? */
1414 					vrele(bvp);
1415 				}
1416 			}
1417 		}
1418 
1419 		break;
1420 		}
1421 
1422 	case VBLK: {
1423 		const struct bdevsw *bdev;
1424 
1425 		dev = vp->v_rdev;
1426 		bdev = bdevsw_lookup(dev);
1427 		if (bdev != NULL)
1428 			d_type = (bdev->d_flag & D_TYPEMASK);
1429 
1430 		bvp = vp;
1431 
1432 		break;
1433 		}
1434 
1435 	default:
1436 		break;
1437 	}
1438 
1439 	if (d_type != D_DISK)
1440 		return EINVAL;
1441 
1442 	if (bvpp != NULL)
1443 		*bvpp = bvp;
1444 
1445 	/*
1446 	 * XXX: This is bogus. We should be failing the request
1447 	 * XXX: not only if this specific slice is mounted, but
1448 	 * XXX: if it's on a disk with any other mounted slice.
1449 	 */
1450 	if (vfs_mountedon(bvp))
1451 		return EBUSY;
1452 
1453 	return 0;
1454 }
1455 
1456 /*
1457  * Make a 'unique' number from a mount type name.
1458  */
1459 long
1460 makefstype(const char *type)
1461 {
1462 	long rv;
1463 
1464 	for (rv = 0; *type; type++) {
1465 		rv <<= 2;
1466 		rv ^= *type;
1467 	}
1468 	return rv;
1469 }
1470 
1471 static struct mountlist_entry *
1472 mountlist_alloc(enum mountlist_type type, struct mount *mp)
1473 {
1474 	struct mountlist_entry *me;
1475 
1476 	me = kmem_zalloc(sizeof(*me), KM_SLEEP);
1477 	me->me_mount = mp;
1478 	me->me_type = type;
1479 
1480 	return me;
1481 }
1482 
1483 static void
1484 mountlist_free(struct mountlist_entry *me)
1485 {
1486 
1487 	kmem_free(me, sizeof(*me));
1488 }
1489 
1490 void
1491 mountlist_iterator_init(mount_iterator_t **mip)
1492 {
1493 	struct mountlist_entry *me;
1494 
1495 	me = mountlist_alloc(ME_MARKER, NULL);
1496 	mutex_enter(&mountlist_lock);
1497 	TAILQ_INSERT_HEAD(&mountlist, me, me_list);
1498 	mutex_exit(&mountlist_lock);
1499 	*mip = (mount_iterator_t *)me;
1500 }
1501 
1502 void
1503 mountlist_iterator_destroy(mount_iterator_t *mi)
1504 {
1505 	struct mountlist_entry *marker = &mi->mi_entry;
1506 
1507 	if (marker->me_mount != NULL)
1508 		vfs_unbusy(marker->me_mount);
1509 
1510 	mutex_enter(&mountlist_lock);
1511 	TAILQ_REMOVE(&mountlist, marker, me_list);
1512 	mutex_exit(&mountlist_lock);
1513 
1514 	mountlist_free(marker);
1515 
1516 }
1517 
1518 /*
1519  * Return the next mount or NULL for this iterator.
1520  * Mark it busy on success.
1521  */
1522 static inline struct mount *
1523 _mountlist_iterator_next(mount_iterator_t *mi, bool wait)
1524 {
1525 	struct mountlist_entry *me, *marker = &mi->mi_entry;
1526 	struct mount *mp;
1527 	int error;
1528 
1529 	if (marker->me_mount != NULL) {
1530 		vfs_unbusy(marker->me_mount);
1531 		marker->me_mount = NULL;
1532 	}
1533 
1534 	mutex_enter(&mountlist_lock);
1535 	for (;;) {
1536 		KASSERT(marker->me_type == ME_MARKER);
1537 
1538 		me = TAILQ_NEXT(marker, me_list);
1539 		if (me == NULL) {
1540 			/* End of list: keep marker and return. */
1541 			mutex_exit(&mountlist_lock);
1542 			return NULL;
1543 		}
1544 		TAILQ_REMOVE(&mountlist, marker, me_list);
1545 		TAILQ_INSERT_AFTER(&mountlist, me, marker, me_list);
1546 
1547 		/* Skip other markers. */
1548 		if (me->me_type != ME_MOUNT)
1549 			continue;
1550 
1551 		/* Take an initial reference for vfs_busy() below. */
1552 		mp = me->me_mount;
1553 		KASSERT(mp != NULL);
1554 		vfs_ref(mp);
1555 		mutex_exit(&mountlist_lock);
1556 
1557 		/* Try to mark this mount busy and return on success. */
1558 		if (wait)
1559 			error = vfs_busy(mp);
1560 		else
1561 			error = vfs_trybusy(mp);
1562 		if (error == 0) {
1563 			vfs_rele(mp);
1564 			marker->me_mount = mp;
1565 			return mp;
1566 		}
1567 		vfs_rele(mp);
1568 		mutex_enter(&mountlist_lock);
1569 	}
1570 }
1571 
/*
 * Return the next mount for this iterator, marked busy, or NULL at
 * the end of the list.  Uses the blocking vfs_busy() (wait == true).
 */
struct mount *
mountlist_iterator_next(mount_iterator_t *mi)
{

	return _mountlist_iterator_next(mi, true);
}
1578 
/*
 * Like mountlist_iterator_next(), but uses the non-blocking
 * vfs_trybusy() (wait == false), so busy mounts are skipped.
 */
struct mount *
mountlist_iterator_trynext(mount_iterator_t *mi)
{

	return _mountlist_iterator_next(mi, false);
}
1585 
1586 /*
1587  * Attach new mount to the end of the mount list.
1588  */
1589 void
1590 mountlist_append(struct mount *mp)
1591 {
1592 	struct mountlist_entry *me;
1593 
1594 	me = mountlist_alloc(ME_MOUNT, mp);
1595 	mutex_enter(&mountlist_lock);
1596 	TAILQ_INSERT_TAIL(&mountlist, me, me_list);
1597 	mutex_exit(&mountlist_lock);
1598 }
1599 
1600 /*
1601  * Remove mount from mount list.
1602  */void
1603 mountlist_remove(struct mount *mp)
1604 {
1605 	struct mountlist_entry *me;
1606 
1607 	mutex_enter(&mountlist_lock);
1608 	TAILQ_FOREACH(me, &mountlist, me_list)
1609 		if (me->me_type == ME_MOUNT && me->me_mount == mp)
1610 			break;
1611 	KASSERT(me != NULL);
1612 	TAILQ_REMOVE(&mountlist, me, me_list);
1613 	mutex_exit(&mountlist_lock);
1614 	mountlist_free(me);
1615 }
1616 
1617 /*
1618  * Unlocked variant to traverse the mountlist.
1619  * To be used from DDB only.
1620  */
1621 struct mount *
1622 _mountlist_next(struct mount *mp)
1623 {
1624 	struct mountlist_entry *me;
1625 
1626 	if (mp == NULL) {
1627 		me = TAILQ_FIRST(&mountlist);
1628 	} else {
1629 		TAILQ_FOREACH(me, &mountlist, me_list)
1630 			if (me->me_type == ME_MOUNT && me->me_mount == mp)
1631 				break;
1632 		if (me != NULL)
1633 			me = TAILQ_NEXT(me, me_list);
1634 	}
1635 
1636 	while (me != NULL && me->me_type != ME_MOUNT)
1637 		me = TAILQ_NEXT(me, me_list);
1638 
1639 	return (me ? me->me_mount : NULL);
1640 }
1641