xref: /netbsd-src/sys/kern/vfs_mount.c (revision 9fd8799cb5ceb66c69f2eb1a6d26a1d587ba1f1e)
1 /*	$NetBSD: vfs_mount.c,v 1.85 2020/11/19 10:47:47 hannken Exp $	*/
2 
3 /*-
4  * Copyright (c) 1997-2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9  * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 /*
34  * Copyright (c) 1989, 1993
35  *	The Regents of the University of California.  All rights reserved.
36  * (c) UNIX System Laboratories, Inc.
37  * All or some portions of this file are derived from material licensed
38  * to the University of California by American Telephone and Telegraph
39  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
40  * the permission of UNIX System Laboratories, Inc.
41  *
42  * Redistribution and use in source and binary forms, with or without
43  * modification, are permitted provided that the following conditions
44  * are met:
45  * 1. Redistributions of source code must retain the above copyright
46  *    notice, this list of conditions and the following disclaimer.
47  * 2. Redistributions in binary form must reproduce the above copyright
48  *    notice, this list of conditions and the following disclaimer in the
49  *    documentation and/or other materials provided with the distribution.
50  * 3. Neither the name of the University nor the names of its contributors
51  *    may be used to endorse or promote products derived from this software
52  *    without specific prior written permission.
53  *
54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64  * SUCH DAMAGE.
65  *
66  *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
67  */
68 
69 #include <sys/cdefs.h>
70 __KERNEL_RCSID(0, "$NetBSD: vfs_mount.c,v 1.85 2020/11/19 10:47:47 hannken Exp $");
71 
72 #include <sys/param.h>
73 #include <sys/kernel.h>
74 
75 #include <sys/atomic.h>
76 #include <sys/buf.h>
77 #include <sys/conf.h>
78 #include <sys/fcntl.h>
79 #include <sys/filedesc.h>
80 #include <sys/device.h>
81 #include <sys/kauth.h>
82 #include <sys/kmem.h>
83 #include <sys/module.h>
84 #include <sys/mount.h>
85 #include <sys/fstrans.h>
86 #include <sys/namei.h>
87 #include <sys/extattr.h>
88 #include <sys/syscallargs.h>
89 #include <sys/sysctl.h>
90 #include <sys/systm.h>
91 #include <sys/vfs_syscalls.h>
92 #include <sys/vnode_impl.h>
93 
94 #include <miscfs/genfs/genfs.h>
95 #include <miscfs/specfs/specdev.h>
96 
/*
 * Kind of an entry on the mount list: either a real mount or a
 * placeholder marker.
 */
enum mountlist_type {
	ME_MOUNT = 0,
	ME_MARKER = 1
};
101 struct mountlist_entry {
102 	TAILQ_ENTRY(mountlist_entry) me_list;	/* Mount list. */
103 	struct mount *me_mount;			/* Actual mount if ME_MOUNT,
104 						   current mount else. */
105 	enum mountlist_type me_type;		/* Mount or marker. */
106 };
107 struct mount_iterator {
108 	struct mountlist_entry mi_entry;
109 };
110 
/* Common worker behind vfs_vnode_iterator_next() and vflushnext(). */
static struct vnode *vfs_vnode_iterator_next1(struct vnode_iterator *vni,
    bool (*f)(void *cl, struct vnode *vp), void *cl, bool do_wait);
113 
114 /* Root filesystem. */
115 vnode_t *			rootvnode;
116 
117 /* Mounted filesystem list. */
118 static TAILQ_HEAD(mountlist, mountlist_entry) mountlist;
119 static kmutex_t			mountlist_lock __cacheline_aligned;
120 int vnode_offset_next_by_lru	/* XXX: ugly hack for pstat.c */
121     = offsetof(vnode_impl_t, vi_lrulist.tqe_next);
122 
123 kmutex_t			vfs_list_lock __cacheline_aligned;
124 
125 static specificdata_domain_t	mount_specificdata_domain;
126 static kmutex_t			mntid_lock;
127 
128 static kmutex_t			mountgen_lock __cacheline_aligned;
129 static uint64_t			mountgen;
130 
131 void
132 vfs_mount_sysinit(void)
133 {
134 
135 	TAILQ_INIT(&mountlist);
136 	mutex_init(&mountlist_lock, MUTEX_DEFAULT, IPL_NONE);
137 	mutex_init(&vfs_list_lock, MUTEX_DEFAULT, IPL_NONE);
138 
139 	mount_specificdata_domain = specificdata_domain_create();
140 	mutex_init(&mntid_lock, MUTEX_DEFAULT, IPL_NONE);
141 	mutex_init(&mountgen_lock, MUTEX_DEFAULT, IPL_NONE);
142 	mountgen = 0;
143 }
144 
145 struct mount *
146 vfs_mountalloc(struct vfsops *vfsops, vnode_t *vp)
147 {
148 	struct mount *mp;
149 	int error __diagused;
150 
151 	mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
152 	mp->mnt_op = vfsops;
153 	mp->mnt_refcnt = 1;
154 	TAILQ_INIT(&mp->mnt_vnodelist);
155 	mp->mnt_renamelock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
156 	mp->mnt_vnodelock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
157 	mp->mnt_updating = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
158 	mp->mnt_vnodecovered = vp;
159 	mount_initspecific(mp);
160 
161 	error = fstrans_mount(mp);
162 	KASSERT(error == 0);
163 
164 	mutex_enter(&mountgen_lock);
165 	mp->mnt_gen = mountgen++;
166 	mutex_exit(&mountgen_lock);
167 
168 	return mp;
169 }
170 
171 /*
172  * vfs_rootmountalloc: lookup a filesystem type, and if found allocate and
173  * initialize a mount structure for it.
174  *
175  * Devname is usually updated by mount(8) after booting.
176  */
177 int
178 vfs_rootmountalloc(const char *fstypename, const char *devname,
179     struct mount **mpp)
180 {
181 	struct vfsops *vfsp = NULL;
182 	struct mount *mp;
183 	int error __diagused;
184 
185 	mutex_enter(&vfs_list_lock);
186 	LIST_FOREACH(vfsp, &vfs_list, vfs_list)
187 		if (!strncmp(vfsp->vfs_name, fstypename,
188 		    sizeof(mp->mnt_stat.f_fstypename)))
189 			break;
190 	if (vfsp == NULL) {
191 		mutex_exit(&vfs_list_lock);
192 		return (ENODEV);
193 	}
194 	vfsp->vfs_refcount++;
195 	mutex_exit(&vfs_list_lock);
196 
197 	if ((mp = vfs_mountalloc(vfsp, NULL)) == NULL)
198 		return ENOMEM;
199 	error = vfs_busy(mp);
200 	KASSERT(error == 0);
201 	mp->mnt_flag = MNT_RDONLY;
202 	(void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name,
203 	    sizeof(mp->mnt_stat.f_fstypename));
204 	mp->mnt_stat.f_mntonname[0] = '/';
205 	mp->mnt_stat.f_mntonname[1] = '\0';
206 	mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] =
207 	    '\0';
208 	(void)copystr(devname, mp->mnt_stat.f_mntfromname,
209 	    sizeof(mp->mnt_stat.f_mntfromname) - 1, 0);
210 	*mpp = mp;
211 	return 0;
212 }
213 
214 /*
215  * vfs_getnewfsid: get a new unique fsid.
216  */
217 void
218 vfs_getnewfsid(struct mount *mp)
219 {
220 	static u_short xxxfs_mntid;
221 	fsid_t tfsid;
222 	int mtype;
223 
224 	mutex_enter(&mntid_lock);
225 	mtype = makefstype(mp->mnt_op->vfs_name);
226 	mp->mnt_stat.f_fsidx.__fsid_val[0] = makedev(mtype, 0);
227 	mp->mnt_stat.f_fsidx.__fsid_val[1] = mtype;
228 	mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
229 	if (xxxfs_mntid == 0)
230 		++xxxfs_mntid;
231 	tfsid.__fsid_val[0] = makedev(mtype & 0xff, xxxfs_mntid);
232 	tfsid.__fsid_val[1] = mtype;
233 	while (vfs_getvfs(&tfsid)) {
234 		tfsid.__fsid_val[0]++;
235 		xxxfs_mntid++;
236 	}
237 	mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0];
238 	mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
239 	mutex_exit(&mntid_lock);
240 }
241 
242 /*
243  * Lookup a mount point by filesystem identifier.
244  *
245  * XXX Needs to add a reference to the mount point.
246  */
247 struct mount *
248 vfs_getvfs(fsid_t *fsid)
249 {
250 	mount_iterator_t *iter;
251 	struct mount *mp;
252 
253 	mountlist_iterator_init(&iter);
254 	while ((mp = mountlist_iterator_next(iter)) != NULL) {
255 		if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] &&
256 		    mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) {
257 			mountlist_iterator_destroy(iter);
258 			return mp;
259 		}
260 	}
261 	mountlist_iterator_destroy(iter);
262 	return NULL;
263 }
264 
265 /*
266  * Take a reference to a mount structure.
267  */
268 void
269 vfs_ref(struct mount *mp)
270 {
271 
272 	KASSERT(mp->mnt_refcnt > 0 || mutex_owned(&mountlist_lock));
273 
274 	atomic_inc_uint(&mp->mnt_refcnt);
275 }
276 
277 /*
278  * Drop a reference to a mount structure, freeing if the last reference.
279  */
280 void
281 vfs_rele(struct mount *mp)
282 {
283 
284 	if (__predict_true((int)atomic_dec_uint_nv(&mp->mnt_refcnt) > 0)) {
285 		return;
286 	}
287 
288 	/*
289 	 * Nothing else has visibility of the mount: we can now
290 	 * free the data structures.
291 	 */
292 	KASSERT(mp->mnt_refcnt == 0);
293 	specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
294 	mutex_obj_free(mp->mnt_updating);
295 	mutex_obj_free(mp->mnt_renamelock);
296 	mutex_obj_free(mp->mnt_vnodelock);
297 	if (mp->mnt_op != NULL) {
298 		vfs_delref(mp->mnt_op);
299 	}
300 	fstrans_unmount(mp);
301 	/*
302 	 * Final free of mp gets done from fstrans_mount_dtor().
303 	 *
304 	 * Prevents this memory to be reused as a mount before
305 	 * fstrans releases all references to it.
306 	 */
307 }
308 
309 /*
310  * Mark a mount point as busy, and gain a new reference to it.  Used to
311  * prevent the file system from being unmounted during critical sections.
312  *
313  * vfs_busy can be called multiple times and by multiple threads
314  * and must be accompanied by the same number of vfs_unbusy calls.
315  *
316  * => The caller must hold a pre-existing reference to the mount.
317  * => Will fail if the file system is being unmounted, or is unmounted.
318  */
319 static inline int
320 _vfs_busy(struct mount *mp, bool wait)
321 {
322 
323 	KASSERT(mp->mnt_refcnt > 0);
324 
325 	if (wait) {
326 		fstrans_start(mp);
327 	} else {
328 		if (fstrans_start_nowait(mp))
329 			return EBUSY;
330 	}
331 	if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) {
332 		fstrans_done(mp);
333 		return ENOENT;
334 	}
335 	vfs_ref(mp);
336 	return 0;
337 }
338 
/*
 * vfs_busy: busy a mount using the waiting variant of _vfs_busy().
 */
int
vfs_busy(struct mount *mp)
{
	const bool wait = true;

	return _vfs_busy(mp, wait);
}
345 
/*
 * vfs_trybusy: busy a mount using the non-waiting variant of
 * _vfs_busy(), which may fail with EBUSY.
 */
int
vfs_trybusy(struct mount *mp)
{
	const bool wait = false;

	return _vfs_busy(mp, wait);
}
352 
353 /*
354  * Unbusy a busy filesystem.
355  *
356  * Every successful vfs_busy() call must be undone by a vfs_unbusy() call.
357  */
358 void
359 vfs_unbusy(struct mount *mp)
360 {
361 
362 	KASSERT(mp->mnt_refcnt > 0);
363 
364 	fstrans_done(mp);
365 	vfs_rele(mp);
366 }
367 
368 struct vnode_iterator {
369 	vnode_impl_t vi_vnode;
370 };
371 
372 void
373 vfs_vnode_iterator_init(struct mount *mp, struct vnode_iterator **vnip)
374 {
375 	vnode_t *vp;
376 	vnode_impl_t *vip;
377 
378 	vp = vnalloc_marker(mp);
379 	vip = VNODE_TO_VIMPL(vp);
380 
381 	mutex_enter(mp->mnt_vnodelock);
382 	TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vip, vi_mntvnodes);
383 	vp->v_usecount = 1;
384 	mutex_exit(mp->mnt_vnodelock);
385 
386 	*vnip = (struct vnode_iterator *)vip;
387 }
388 
389 void
390 vfs_vnode_iterator_destroy(struct vnode_iterator *vni)
391 {
392 	vnode_impl_t *mvip = &vni->vi_vnode;
393 	vnode_t *mvp = VIMPL_TO_VNODE(mvip);
394 	kmutex_t *lock;
395 
396 	KASSERT(vnis_marker(mvp));
397 	if (vrefcnt(mvp) != 0) {
398 		lock = mvp->v_mount->mnt_vnodelock;
399 		mutex_enter(lock);
400 		TAILQ_REMOVE(&mvp->v_mount->mnt_vnodelist, mvip, vi_mntvnodes);
401 		mvp->v_usecount = 0;
402 		mutex_exit(lock);
403 	}
404 	vnfree_marker(mvp);
405 }
406 
/*
 * vfs_vnode_iterator_next1: advance a vnode iterator and return the
 * next acceptable vnode of the mount.
 *
 * => vni: iterator; its marker vnode records the current position.
 * => f, cl: optional selector.  If f is not NULL a vnode is only
 *    returned when (*f)(cl, vp) is true.  The selector is called with
 *    both the vnode list lock and the vnode interlock held.
 * => do_wait: if false VDEAD_NOWAIT is passed to vdead_check().
 * => Returns the vnode after vcache_vget() succeeded on it, or NULL
 *    when the end of the list was reached.  In the latter case the
 *    marker is left off the list with a use count of 0.
 *
 * Markers of other iterators, vnodes for which vdead_check() returns
 * non-zero and vnodes rejected by the selector are skipped.
 */
407 static struct vnode *
408 vfs_vnode_iterator_next1(struct vnode_iterator *vni,
409     bool (*f)(void *, struct vnode *), void *cl, bool do_wait)
410 {
411 	vnode_impl_t *mvip = &vni->vi_vnode;
412 	struct mount *mp = VIMPL_TO_VNODE(mvip)->v_mount;
413 	vnode_t *vp;
414 	vnode_impl_t *vip;
415 	kmutex_t *lock;
416 	int error;
417 
418 	KASSERT(vnis_marker(VIMPL_TO_VNODE(mvip)));
419 
420 	lock = mp->mnt_vnodelock;
421 	do {
422 		mutex_enter(lock);
		/* Remember the successor, then take the marker off the list. */
423 		vip = TAILQ_NEXT(mvip, vi_mntvnodes);
424 		TAILQ_REMOVE(&mp->mnt_vnodelist, mvip, vi_mntvnodes);
425 		VIMPL_TO_VNODE(mvip)->v_usecount = 0;
426 again:
427 		if (vip == NULL) {
			/* End of list; the marker stays unlinked. */
428 			mutex_exit(lock);
429 	       		return NULL;
430 		}
431 		vp = VIMPL_TO_VNODE(vip);
432 		KASSERT(vp != NULL);
433 		mutex_enter(vp->v_interlock);
434 		if (vnis_marker(vp) ||
435 		    vdead_check(vp, (do_wait ? 0 : VDEAD_NOWAIT)) ||
436 		    (f && !(*f)(cl, vp))) {
437 			mutex_exit(vp->v_interlock);
438 			vip = TAILQ_NEXT(vip, vi_mntvnodes);
439 			goto again;
440 		}
441 
		/* Park the marker right behind the vnode that was picked. */
442 		TAILQ_INSERT_AFTER(&mp->mnt_vnodelist, vip, mvip, vi_mntvnodes);
443 		VIMPL_TO_VNODE(mvip)->v_usecount = 1;
444 		mutex_exit(lock);
		/*
		 * The interlock of vp is still held at this point.
		 * NOTE(review): presumably vcache_vget() expects that and
		 * releases it -- confirm against its definition.
		 */
445 		error = vcache_vget(vp);
446 		KASSERT(error == 0 || error == ENOENT);
		/* On failure start over from the marker position. */
447 	} while (error != 0);
448 
449 	return vp;
450 }
451 
/*
 * vfs_vnode_iterator_next: return the next vnode of the iteration that
 * the selector f accepts (every vnode if f is NULL), or NULL at the
 * end.  This is the variant that passes VDEAD_NOWAIT to vdead_check().
 */
struct vnode *
vfs_vnode_iterator_next(struct vnode_iterator *vni,
    bool (*f)(void *, struct vnode *), void *cl)
{
	const bool do_wait = false;

	return vfs_vnode_iterator_next1(vni, f, cl, do_wait);
}
459 
460 /*
461  * Move a vnode from one mount queue to another.
462  */
463 void
464 vfs_insmntque(vnode_t *vp, struct mount *mp)
465 {
466 	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
467 	struct mount *omp;
468 	kmutex_t *lock;
469 
470 	KASSERT(mp == NULL || (mp->mnt_iflag & IMNT_UNMOUNT) == 0 ||
471 	    vp->v_tag == VT_VFS);
472 
473 	/*
474 	 * Delete from old mount point vnode list, if on one.
475 	 */
476 	if ((omp = vp->v_mount) != NULL) {
477 		lock = omp->mnt_vnodelock;
478 		mutex_enter(lock);
479 		TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vip, vi_mntvnodes);
480 		mutex_exit(lock);
481 	}
482 
483 	/*
484 	 * Insert into list of vnodes for the new mount point, if
485 	 * available.  The caller must take a reference on the mount
486 	 * structure and donate to the vnode.
487 	 */
488 	if ((vp->v_mount = mp) != NULL) {
489 		lock = mp->mnt_vnodelock;
490 		mutex_enter(lock);
491 		TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vip, vi_mntvnodes);
492 		mutex_exit(lock);
493 	}
494 
495 	if (omp != NULL) {
496 		/* Release reference to old mount. */
497 		vfs_rele(omp);
498 	}
499 }
500 
501 /*
502  * Remove any vnodes in the vnode table belonging to mount point mp.
503  *
504  * If FORCECLOSE is not specified, there should not be any active ones,
505  * return error if any are found (nb: this is a user error, not a
506  * system error). If FORCECLOSE is specified, detach any active vnodes
507  * that are found.
508  *
509  * If WRITECLOSE is set, only flush out regular file vnodes open for
510  * writing.
511  *
512  * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
513  */
#ifdef DEBUG
/* If non-zero vflush() prints the vnodes still busy on its last pass. */
int busyprt = 0;
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif
518 
519 static vnode_t *
520 vflushnext(struct vnode_iterator *marker, int *when)
521 {
522 	if (getticks() > *when) {
523 		yield();
524 		*when = getticks() + hz / 10;
525 	}
526 	return vfs_vnode_iterator_next1(marker, NULL, NULL, true);
527 }
528 
529 /*
530  * Flush one vnode.  Referenced on entry, unreferenced on return.
531  */
532 static int
533 vflush_one(vnode_t *vp, vnode_t *skipvp, int flags)
534 {
535 	int error;
536 	struct vattr vattr;
537 
538 	if (vp == skipvp ||
539 	    ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM))) {
540 		vrele(vp);
541 		return 0;
542 	}
543 	/*
544 	 * If WRITECLOSE is set, only flush out regular file
545 	 * vnodes open for writing or open and unlinked.
546 	 */
547 	if ((flags & WRITECLOSE)) {
548 		if (vp->v_type != VREG) {
549 			vrele(vp);
550 			return 0;
551 		}
552 		error = vn_lock(vp, LK_EXCLUSIVE);
553 		if (error) {
554 			KASSERT(error == ENOENT);
555 			vrele(vp);
556 			return 0;
557 		}
558 		error = VOP_FSYNC(vp, curlwp->l_cred, FSYNC_WAIT, 0, 0);
559 		if (error == 0)
560 			error = VOP_GETATTR(vp, &vattr, curlwp->l_cred);
561 		VOP_UNLOCK(vp);
562 		if (error) {
563 			vrele(vp);
564 			return error;
565 		}
566 		if (vp->v_writecount == 0 && vattr.va_nlink > 0) {
567 			vrele(vp);
568 			return 0;
569 		}
570 	}
571 	/*
572 	 * First try to recycle the vnode.
573 	 */
574 	if (vrecycle(vp))
575 		return 0;
576 	/*
577 	 * If FORCECLOSE is set, forcibly close the vnode.
578 	 * For block or character devices, revert to an
579 	 * anonymous device.  For all other files, just
580 	 * kill them.
581 	 */
582 	if (flags & FORCECLOSE) {
583 		if (vrefcnt(vp) > 1 &&
584 		    (vp->v_type == VBLK || vp->v_type == VCHR))
585 			vcache_make_anon(vp);
586 		else
587 			vgone(vp);
588 		return 0;
589 	}
590 	vrele(vp);
591 	return EBUSY;
592 }
593 
594 int
595 vflush(struct mount *mp, vnode_t *skipvp, int flags)
596 {
597 	vnode_t *vp;
598 	struct vnode_iterator *marker;
599 	int busy, error, when, retries = 2;
600 
601 	do {
602 		busy = error = when = 0;
603 
604 		/*
605 		 * First, flush out any vnode references from the
606 		 * deferred vrele list.
607 		 */
608 		vrele_flush(mp);
609 
610 		vfs_vnode_iterator_init(mp, &marker);
611 
612 		while ((vp = vflushnext(marker, &when)) != NULL) {
613 			error = vflush_one(vp, skipvp, flags);
614 			if (error == EBUSY) {
615 				error = 0;
616 				busy++;
617 #ifdef DEBUG
618 				if (busyprt && retries == 0)
619 					vprint("vflush: busy vnode", vp);
620 #endif
621 			} else if (error != 0) {
622 				break;
623 			}
624 		}
625 
626 		vfs_vnode_iterator_destroy(marker);
627 	} while (error == 0 && busy > 0 && retries-- > 0);
628 
629 	if (error)
630 		return error;
631 	if (busy)
632 		return EBUSY;
633 	return 0;
634 }
635 
636 /*
637  * Mount a file system.
638  */
639 
640 /*
641  * Scan all active processes to see if any of them have a current or root
642  * directory onto which the new filesystem has just been  mounted. If so,
643  * replace them with the new mount point.
644  */
/*
 * => olddp: the vnode that has just been covered; the root vnode of the
 *    file system mounted on it is obtained with VFS_ROOT().
 * => Nothing is done if olddp has a reference count of exactly 1.
 * => rootvnode is switched over as well if it equals olddp.
 */
645 static void
646 mount_checkdirs(vnode_t *olddp)
647 {
648 	vnode_t *newdp, *rele1, *rele2;
649 	struct cwdinfo *cwdi;
650 	struct proc *p;
651 	bool retry;
652 
653 	if (vrefcnt(olddp) == 1) {
654 		return;
655 	}
656 	if (VFS_ROOT(olddp->v_mountedhere, LK_EXCLUSIVE, &newdp))
657 		panic("mount: lost mount");
658 
	/*
	 * One process is fixed up per scan of allproc: proc_lock has to
	 * be dropped for the fix-up, so the scan is abandoned afterwards
	 * and started over until a complete scan finds nothing to do.
	 */
659 	do {
660 		retry = false;
661 		mutex_enter(&proc_lock);
662 		PROCLIST_FOREACH(p, &allproc) {
663 			if ((cwdi = p->p_cwdi) == NULL)
664 				continue;
665 			/*
666 			 * Cannot change to the old directory any more,
667 			 * so even if we see a stale value it is not a
668 			 * problem.
669 			 */
670 			if (cwdi->cwdi_cdir != olddp &&
671 			    cwdi->cwdi_rdir != olddp)
672 				continue;
673 			retry = true;
674 			rele1 = NULL;
675 			rele2 = NULL;
			/* Reference the cwdinfo before proc_lock is dropped. */
676 			atomic_inc_uint(&cwdi->cwdi_refcnt);
677 			mutex_exit(&proc_lock);
678 			rw_enter(&cwdi->cwdi_lock, RW_WRITER);
			/* Checked again now that the cwdinfo is write locked. */
679 			if (cwdi->cwdi_cdir == olddp) {
680 				rele1 = cwdi->cwdi_cdir;
681 				vref(newdp);
682 				cwdi->cwdi_cdir = newdp;
683 			}
684 			if (cwdi->cwdi_rdir == olddp) {
685 				rele2 = cwdi->cwdi_rdir;
686 				vref(newdp);
687 				cwdi->cwdi_rdir = newdp;
688 			}
689 			rw_exit(&cwdi->cwdi_lock);
690 			cwdfree(cwdi);
			/* The old vnodes are released only after the unlock. */
691 			if (rele1 != NULL)
692 				vrele(rele1);
693 			if (rele2 != NULL)
694 				vrele(rele2);
695 			mutex_enter(&proc_lock);
696 			break;
697 		}
698 		mutex_exit(&proc_lock);
699 	} while (retry);
700 
701 	if (rootvnode == olddp) {
702 		vrele(rootvnode);
703 		vref(newdp);
704 		rootvnode = newdp;
705 	}
706 	vput(newdp);
707 }
708 
709 /*
710  * Start extended attributes
711  */
712 static int
713 start_extattr(struct mount *mp)
714 {
715 	int error;
716 
717 	error = VFS_EXTATTRCTL(mp, EXTATTR_CMD_START, NULL, 0, NULL);
718 	if (error)
719 		printf("%s: failed to start extattr: error = %d\n",
720 		       mp->mnt_stat.f_mntonname, error);
721 
722 	return error;
723 }
724 
/*
 * mount_domount: mount a new file system of type vfsops on the
 * directory *vpp.
 *
 * => l: calling lwp; its credentials are used for the authorization
 *    check and become the owner of the mount.
 * => vpp: vnode to mount on.  *vpp is set to NULL once the mount has
 *    been put on the mount list, including the case where VFS_START()
 *    fails afterwards; on the earlier failures *vpp is left alone.
 * => vfsops: file system type.  On the failures before the mount
 *    structure exists vfs_delref() is called on it; afterwards the
 *    mount structure carries it in mnt_op.
 * => path, data, data_len: handed through to VFS_MOUNT(); path is also
 *    looked up again and must resolve to *vpp.
 * => flags: mount flags; MNT_EXPORTED is rejected.
 * => Returns 0 on success, otherwise an error number: ENOTDIR if *vpp
 *    is not a directory, EINVAL for MNT_EXPORTED or if path does not
 *    lead to *vpp, EBUSY if something is mounted on *vpp already, or
 *    whatever the called functions returned.
 */
725 int
726 mount_domount(struct lwp *l, vnode_t **vpp, struct vfsops *vfsops,
727     const char *path, int flags, void *data, size_t *data_len)
728 {
729 	vnode_t *vp = *vpp;
730 	struct mount *mp;
731 	struct pathbuf *pb;
732 	struct nameidata nd;
733 	int error, error2;
734 
735 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
736 	    KAUTH_REQ_SYSTEM_MOUNT_NEW, vp, KAUTH_ARG(flags), data);
737 	if (error) {
738 		vfs_delref(vfsops);
739 		return error;
740 	}
741 
742 	/* Cannot make a non-dir a mount-point (from here anyway). */
743 	if (vp->v_type != VDIR) {
744 		vfs_delref(vfsops);
745 		return ENOTDIR;
746 	}
747 
748 	if (flags & MNT_EXPORTED) {
749 		vfs_delref(vfsops);
750 		return EINVAL;
751 	}
752 
753 	if ((mp = vfs_mountalloc(vfsops, vp)) == NULL) {
754 		vfs_delref(vfsops);
755 		return ENOMEM;
756 	}
757 
758 	mp->mnt_stat.f_owner = kauth_cred_geteuid(l->l_cred);
759 
760 	/*
761 	 * The underlying file system may refuse the mount for
762 	 * various reasons.  Allow the user to force it to happen.
763 	 *
764 	 * Set the mount level flags.
765 	 */
766 	mp->mnt_flag = flags & (MNT_BASIC_FLAGS | MNT_FORCE | MNT_IGNORE);
767 
	/* mnt_updating stays held until the mount is completely set up. */
768 	mutex_enter(mp->mnt_updating);
769 	error = VFS_MOUNT(mp, path, data, data_len);
770 	mp->mnt_flag &= ~MNT_OP_FLAGS;
771 
772 	if (error != 0)
773 		goto err_unmounted;
774 
775 	/*
776 	 * Validate and prepare the mount point.
777 	 */
778 	error = pathbuf_copyin(path, &pb);
779 	if (error != 0) {
780 		goto err_mounted;
781 	}
782 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
783 	error = namei(&nd);
784 	pathbuf_destroy(pb);
785 	if (error != 0) {
786 		goto err_mounted;
787 	}
	/* The path must still lead to the vnode the caller handed in. */
788 	if (nd.ni_vp != vp) {
789 		vput(nd.ni_vp);
790 		error = EINVAL;
791 		goto err_mounted;
792 	}
793 	if (vp->v_mountedhere != NULL) {
794 		vput(nd.ni_vp);
795 		error = EBUSY;
796 		goto err_mounted;
797 	}
798 	error = vinvalbuf(vp, V_SAVE, l->l_cred, l, 0, 0);
799 	if (error != 0) {
800 		vput(nd.ni_vp);
801 		goto err_mounted;
802 	}
803 
804 	/*
805 	 * Put the new filesystem on the mount list after root.
806 	 */
807 	cache_purge(vp);
808 	mp->mnt_iflag &= ~IMNT_WANTRDWR;
809 
810 	mountlist_append(mp);
811 	if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
812 		vfs_syncer_add_to_worklist(mp);
813 	vp->v_mountedhere = mp;
814 	vput(nd.ni_vp);
815 
816 	mount_checkdirs(vp);
817 	mutex_exit(mp->mnt_updating);
818 
819 	/* Hold an additional reference to the mount across VFS_START(). */
820 	vfs_ref(mp);
821 	(void) VFS_STATVFS(mp, &mp->mnt_stat);
822 	error = VFS_START(mp, 0);
823 	if (error) {
824 		vrele(vp);
825 	} else if (flags & MNT_EXTATTR) {
		/* The flag is cleared again if extattr could not be started. */
826 		if (start_extattr(mp) != 0)
827 			mp->mnt_flag &= ~MNT_EXTATTR;
828 	}
829 	/* Drop reference held for VFS_START(). */
830 	vfs_rele(mp);
831 	*vpp = NULL;
832 	return error;
833 
/*
 * VFS_MOUNT() succeeded but the mount point turned out to be unusable:
 * unmount the fresh file system again, suspended if it supports that.
 */
834 err_mounted:
835 	do {
836 		error2 = vfs_suspend(mp, 0);
837 	} while (error2 == EINTR || error2 == ERESTART);
838 	KASSERT(error2 == 0 || error2 == EOPNOTSUPP);
839 
840 	if (VFS_UNMOUNT(mp, MNT_FORCE) != 0)
841 		panic("Unmounting fresh file system failed");
842 
843 	if (error2 == 0)
844 		vfs_resume(mp);
845 
/* Nothing is mounted (any more): release the mount structure. */
846 err_unmounted:
847 	vp->v_mountedhere = NULL;
848 	mutex_exit(mp->mnt_updating);
849 	vfs_rele(mp);
850 
851 	return error;
852 }
853 
854 /*
855  * Do the actual file system unmount.  File system is assumed to have
856  * been locked by the caller.
857  *
858  * => Caller hold reference to the mount, explicitly for dounmount().
859  */
/*
 * => mp: the mount to unmount.  Unless the calling lwp is the fstrans
 *    owner of it already, it is suspended here for the duration of
 *    the unmount.
 * => flags: passed to VFS_UNMOUNT(); with MNT_FORCE the sync is skipped.
 * => l: calling lwp, its credentials are used for VFS_SYNC().
 * => Returns 0 once the mount has been marked IMNT_GONE and removed
 *    from the mount list.  Returns the error of the suspend, sync or
 *    unmount otherwise; after a failed sync or unmount the worklist
 *    membership, MNT_ASYNC and extended attributes are restored.
 */
860 int
861 dounmount(struct mount *mp, int flags, struct lwp *l)
862 {
863 	vnode_t *coveredvp;
864 	int error, async, used_syncer, used_extattr;
865 	const bool was_suspended = fstrans_is_owner(mp);
866 
867 #if NVERIEXEC > 0
868 	error = veriexec_unmountchk(mp);
869 	if (error)
870 		return (error);
871 #endif /* NVERIEXEC > 0 */
872 
873 	if (!was_suspended) {
874 		error = vfs_suspend(mp, 0);
875 		if (error) {
876 			return error;
877 		}
878 	}
879 
880 	KASSERT((mp->mnt_iflag & IMNT_GONE) == 0);
881 
	/* Recorded up front so that a failed unmount can be rolled back. */
882 	used_syncer = (mp->mnt_iflag & IMNT_ONWORKLIST) != 0;
883 	used_extattr = mp->mnt_flag & MNT_EXTATTR;
884 
885 	mp->mnt_iflag |= IMNT_UNMOUNT;
886 	mutex_enter(mp->mnt_updating);
887 	async = mp->mnt_flag & MNT_ASYNC;
888 	mp->mnt_flag &= ~MNT_ASYNC;
889 	cache_purgevfs(mp);	/* remove cache entries for this file sys */
890 	if (used_syncer)
891 		vfs_syncer_remove_from_worklist(mp);
892 	error = 0;
	/* Only a writable file system that is not forced gets synced. */
893 	if (((mp->mnt_flag & MNT_RDONLY) == 0) && ((flags & MNT_FORCE) == 0)) {
894 		error = VFS_SYNC(mp, MNT_WAIT, l->l_cred);
895 	}
896 	if (error == 0 || (flags & MNT_FORCE)) {
897 		error = VFS_UNMOUNT(mp, flags);
898 	}
899 	if (error) {
		/* Unmount failed: put the mount back into working order. */
900 		mp->mnt_iflag &= ~IMNT_UNMOUNT;
901 		if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
902 			vfs_syncer_add_to_worklist(mp);
903 		mp->mnt_flag |= async;
904 		mutex_exit(mp->mnt_updating);
905 		if (!was_suspended)
906 			vfs_resume(mp);
907 		if (used_extattr) {
908 			if (start_extattr(mp) != 0)
909 				mp->mnt_flag &= ~MNT_EXTATTR;
910 			else
911 				mp->mnt_flag |= MNT_EXTATTR;
912 		}
913 		return (error);
914 	}
915 	mutex_exit(mp->mnt_updating);
916 
917 	/*
918 	 * mark filesystem as gone to prevent further umounts
919 	 * after mnt_umounting lock is gone, this also prevents
920 	 * vfs_busy() from succeeding.
921 	 */
922 	mp->mnt_iflag |= IMNT_GONE;
923 	if (!was_suspended)
924 		vfs_resume(mp);
925 
	/* Detach from the covered vnode, if the mount covers one. */
926 	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
927 		vn_lock(coveredvp, LK_EXCLUSIVE | LK_RETRY);
928 		coveredvp->v_mountedhere = NULL;
929 		VOP_UNLOCK(coveredvp);
930 	}
931 	mountlist_remove(mp);
932 	if (TAILQ_FIRST(&mp->mnt_vnodelist) != NULL)
933 		panic("unmount: dangling vnode");
934 	vfs_hooks_unmount(mp);
935 
936 	vfs_rele(mp);	/* reference from mount() */
937 	if (coveredvp != NULLVP) {
938 		vrele(coveredvp);
939 	}
940 	return (0);
941 }
942 
/*
 * vfs_unmountall: unmount all file systems, forcibly and verbosely.
 *
 * The list is traversed in reverse order under the assumption that
 * doing so will avoid needing to worry about dependencies.
 *
 * => Returns the result of vfs_unmountall1().
 */
bool
vfs_unmountall(struct lwp *l)
{
	const bool force = true;
	const bool verbose = true;

	printf("unmounting file systems...\n");
	return vfs_unmountall1(l, force, verbose);
}
955 
956 static void
957 vfs_unmount_print(struct mount *mp, const char *pfx)
958 {
959 
960 	aprint_verbose("%sunmounted %s on %s type %s\n", pfx,
961 	    mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname,
962 	    mp->mnt_stat.f_fstypename);
963 }
964 
965 /*
966  * Return the mount with the highest generation less than "gen".
967  */
968 static struct mount *
969 vfs_unmount_next(uint64_t gen)
970 {
971 	mount_iterator_t *iter;
972 	struct mount *mp, *nmp;
973 
974 	nmp = NULL;
975 
976 	mountlist_iterator_init(&iter);
977 	while ((mp = mountlist_iterator_next(iter)) != NULL) {
978 		if ((nmp == NULL || mp->mnt_gen > nmp->mnt_gen) &&
979 		    mp->mnt_gen < gen) {
980 			if (nmp != NULL)
981 				vfs_rele(nmp);
982 			nmp = mp;
983 			vfs_ref(nmp);
984 		}
985 	}
986 	mountlist_iterator_destroy(iter);
987 
988 	return nmp;
989 }
990 
991 bool
992 vfs_unmount_forceone(struct lwp *l)
993 {
994 	struct mount *mp;
995 	int error;
996 
997 	mp = vfs_unmount_next(mountgen);
998 	if (mp == NULL) {
999 		return false;
1000 	}
1001 
1002 #ifdef DEBUG
1003 	printf("forcefully unmounting %s (%s)...\n",
1004 	    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname);
1005 #endif
1006 	if ((error = dounmount(mp, MNT_FORCE, l)) == 0) {
1007 		vfs_unmount_print(mp, "forcefully ");
1008 		return true;
1009 	} else {
1010 		vfs_rele(mp);
1011 	}
1012 
1013 #ifdef DEBUG
1014 	printf("forceful unmount of %s failed with error %d\n",
1015 	    mp->mnt_stat.f_mntonname, error);
1016 #endif
1017 
1018 	return false;
1019 }
1020 
1021 bool
1022 vfs_unmountall1(struct lwp *l, bool force, bool verbose)
1023 {
1024 	struct mount *mp;
1025 	bool any_error = false, progress = false;
1026 	uint64_t gen;
1027 	int error;
1028 
1029 	gen = mountgen;
1030 	for (;;) {
1031 		mp = vfs_unmount_next(gen);
1032 		if (mp == NULL)
1033 			break;
1034 		gen = mp->mnt_gen;
1035 
1036 #ifdef DEBUG
1037 		printf("unmounting %p %s (%s)...\n",
1038 		    (void *)mp, mp->mnt_stat.f_mntonname,
1039 		    mp->mnt_stat.f_mntfromname);
1040 #endif
1041 		if ((error = dounmount(mp, force ? MNT_FORCE : 0, l)) == 0) {
1042 			vfs_unmount_print(mp, "");
1043 			progress = true;
1044 		} else {
1045 			vfs_rele(mp);
1046 			if (verbose) {
1047 				printf("unmount of %s failed with error %d\n",
1048 				    mp->mnt_stat.f_mntonname, error);
1049 			}
1050 			any_error = true;
1051 		}
1052 	}
1053 	if (verbose) {
1054 		printf("unmounting done\n");
1055 	}
1056 	if (any_error && verbose) {
1057 		printf("WARNING: some file systems would not unmount\n");
1058 	}
1059 	return progress;
1060 }
1061 
1062 void
1063 vfs_sync_all(struct lwp *l)
1064 {
1065 	printf("syncing disks... ");
1066 
1067 	/* remove user processes from run queue */
1068 	suspendsched();
1069 	(void)spl0();
1070 
1071 	/* avoid coming back this way again if we panic. */
1072 	doing_shutdown = 1;
1073 
1074 	do_sys_sync(l);
1075 
1076 	/* Wait for sync to finish. */
1077 	if (vfs_syncwait() != 0) {
1078 #if defined(DDB) && defined(DEBUG_HALT_BUSY)
1079 		Debugger();
1080 #endif
1081 		printf("giving up\n");
1082 		return;
1083 	} else
1084 		printf("done\n");
1085 }
1086 
1087 /*
1088  * Sync and unmount file systems before shutting down.
1089  */
1090 void
1091 vfs_shutdown(void)
1092 {
1093 	lwp_t *l = curlwp;
1094 
1095 	vfs_sync_all(l);
1096 
1097 	/*
1098 	 * If we have paniced - do not make the situation potentially
1099 	 * worse by unmounting the file systems.
1100 	 */
1101 	if (panicstr != NULL) {
1102 		return;
1103 	}
1104 
1105 	/* Unmount file systems. */
1106 	vfs_unmountall(l);
1107 }
1108 
1109 /*
1110  * Print a list of supported file system types (used by vfs_mountroot)
1111  */
1112 static void
1113 vfs_print_fstypes(void)
1114 {
1115 	struct vfsops *v;
1116 	int cnt = 0;
1117 
1118 	mutex_enter(&vfs_list_lock);
1119 	LIST_FOREACH(v, &vfs_list, vfs_list)
1120 		++cnt;
1121 	mutex_exit(&vfs_list_lock);
1122 
1123 	if (cnt == 0) {
1124 		printf("WARNING: No file system modules have been loaded.\n");
1125 		return;
1126 	}
1127 
1128 	printf("Supported file systems:");
1129 	mutex_enter(&vfs_list_lock);
1130 	LIST_FOREACH(v, &vfs_list, vfs_list) {
1131 		printf(" %s", v->vfs_name);
1132 	}
1133 	mutex_exit(&vfs_list_lock);
1134 	printf("\n");
1135 }
1136 
1137 /*
1138  * Mount the root file system.  If the operator didn't specify a
1139  * file system to use, try all possible file systems until one
1140  * succeeds.
1141  */
1142 int
1143 vfs_mountroot(void)
1144 {
1145 	struct vfsops *v;
1146 	int error = ENODEV;
1147 
1148 	if (root_device == NULL)
1149 		panic("vfs_mountroot: root device unknown");
1150 
1151 	switch (device_class(root_device)) {
1152 	case DV_IFNET:
1153 		if (rootdev != NODEV)
1154 			panic("vfs_mountroot: rootdev set for DV_IFNET "
1155 			    "(0x%llx -> %llu,%llu)",
1156 			    (unsigned long long)rootdev,
1157 			    (unsigned long long)major(rootdev),
1158 			    (unsigned long long)minor(rootdev));
1159 		break;
1160 
1161 	case DV_DISK:
1162 		if (rootdev == NODEV)
1163 			panic("vfs_mountroot: rootdev not set for DV_DISK");
1164 	        if (bdevvp(rootdev, &rootvp))
1165 	                panic("vfs_mountroot: can't get vnode for rootdev");
1166 		error = VOP_OPEN(rootvp, FREAD, FSCRED);
1167 		if (error) {
1168 			printf("vfs_mountroot: can't open root device\n");
1169 			return (error);
1170 		}
1171 		break;
1172 
1173 	case DV_VIRTUAL:
1174 		break;
1175 
1176 	default:
1177 		printf("%s: inappropriate for root file system\n",
1178 		    device_xname(root_device));
1179 		return (ENODEV);
1180 	}
1181 
1182 	/*
1183 	 * If user specified a root fs type, use it.  Make sure the
1184 	 * specified type exists and has a mount_root()
1185 	 */
1186 	if (strcmp(rootfstype, ROOT_FSTYPE_ANY) != 0) {
1187 		v = vfs_getopsbyname(rootfstype);
1188 		error = EFTYPE;
1189 		if (v != NULL) {
1190 			if (v->vfs_mountroot != NULL) {
1191 				error = (v->vfs_mountroot)();
1192 			}
1193 			v->vfs_refcount--;
1194 		}
1195 		goto done;
1196 	}
1197 
1198 	/*
1199 	 * Try each file system currently configured into the kernel.
1200 	 */
1201 	mutex_enter(&vfs_list_lock);
1202 	LIST_FOREACH(v, &vfs_list, vfs_list) {
1203 		if (v->vfs_mountroot == NULL)
1204 			continue;
1205 #ifdef DEBUG
1206 		aprint_normal("mountroot: trying %s...\n", v->vfs_name);
1207 #endif
1208 		v->vfs_refcount++;
1209 		mutex_exit(&vfs_list_lock);
1210 		error = (*v->vfs_mountroot)();
1211 		mutex_enter(&vfs_list_lock);
1212 		v->vfs_refcount--;
1213 		if (!error) {
1214 			aprint_normal("root file system type: %s\n",
1215 			    v->vfs_name);
1216 			break;
1217 		}
1218 	}
1219 	mutex_exit(&vfs_list_lock);
1220 
1221 	if (v == NULL) {
1222 		vfs_print_fstypes();
1223 		printf("no file system for %s", device_xname(root_device));
1224 		if (device_class(root_device) == DV_DISK)
1225 			printf(" (dev 0x%llx)", (unsigned long long)rootdev);
1226 		printf("\n");
1227 		error = EFTYPE;
1228 	}
1229 
1230 done:
1231 	if (error && device_class(root_device) == DV_DISK) {
1232 		VOP_CLOSE(rootvp, FREAD, FSCRED);
1233 		vrele(rootvp);
1234 	}
1235 	if (error == 0) {
1236 		mount_iterator_t *iter;
1237 		struct mount *mp;
1238 		extern struct cwdinfo cwdi0;
1239 
1240 		mountlist_iterator_init(&iter);
1241 		mp = mountlist_iterator_next(iter);
1242 		KASSERT(mp != NULL);
1243 		mountlist_iterator_destroy(iter);
1244 
1245 		mp->mnt_flag |= MNT_ROOTFS;
1246 		mp->mnt_op->vfs_refcount++;
1247 
1248 		/*
1249 		 * Get the vnode for '/'.  Set cwdi0.cwdi_cdir to
1250 		 * reference it, and donate it the reference grabbed
1251 		 * with VFS_ROOT().
1252 		 */
1253 		error = VFS_ROOT(mp, LK_NONE, &rootvnode);
1254 		if (error)
1255 			panic("cannot find root vnode, error=%d", error);
1256 		cwdi0.cwdi_cdir = rootvnode;
1257 		cwdi0.cwdi_rdir = NULL;
1258 
1259 		/*
1260 		 * Now that root is mounted, we can fixup initproc's CWD
1261 		 * info.  All other processes are kthreads, which merely
1262 		 * share proc0's CWD info.
1263 		 */
1264 		initproc->p_cwdi->cwdi_cdir = rootvnode;
1265 		vref(initproc->p_cwdi->cwdi_cdir);
1266 		initproc->p_cwdi->cwdi_rdir = NULL;
1267 		/*
1268 		 * Enable loading of modules from the filesystem
1269 		 */
1270 		module_load_vfs_init();
1271 
1272 	}
1273 	return (error);
1274 }
1275 
1276 /*
1277  * mount_specific_key_create --
1278  *	Create a key for subsystem mount-specific data.
1279  */
1280 int
1281 mount_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor)
1282 {
1283 
1284 	return specificdata_key_create(mount_specificdata_domain, keyp, dtor);
1285 }
1286 
1287 /*
1288  * mount_specific_key_delete --
1289  *	Delete a key for subsystem mount-specific data.
1290  */
1291 void
1292 mount_specific_key_delete(specificdata_key_t key)
1293 {
1294 
1295 	specificdata_key_delete(mount_specificdata_domain, key);
1296 }
1297 
1298 /*
1299  * mount_initspecific --
1300  *	Initialize a mount's specificdata container.
1301  */
1302 void
1303 mount_initspecific(struct mount *mp)
1304 {
1305 	int error __diagused;
1306 
1307 	error = specificdata_init(mount_specificdata_domain,
1308 				  &mp->mnt_specdataref);
1309 	KASSERT(error == 0);
1310 }
1311 
1312 /*
1313  * mount_finispecific --
1314  *	Finalize a mount's specificdata container.
1315  */
1316 void
1317 mount_finispecific(struct mount *mp)
1318 {
1319 
1320 	specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
1321 }
1322 
1323 /*
1324  * mount_getspecific --
1325  *	Return mount-specific data corresponding to the specified key.
1326  */
1327 void *
1328 mount_getspecific(struct mount *mp, specificdata_key_t key)
1329 {
1330 
1331 	return specificdata_getspecific(mount_specificdata_domain,
1332 					 &mp->mnt_specdataref, key);
1333 }
1334 
1335 /*
1336  * mount_setspecific --
1337  *	Set mount-specific data corresponding to the specified key.
1338  */
1339 void
1340 mount_setspecific(struct mount *mp, specificdata_key_t key, void *data)
1341 {
1342 
1343 	specificdata_setspecific(mount_specificdata_domain,
1344 				 &mp->mnt_specdataref, key, data);
1345 }
1346 
1347 /*
1348  * Check to see if a filesystem is mounted on a block device.
1349  */
1350 int
1351 vfs_mountedon(vnode_t *vp)
1352 {
1353 	vnode_t *vq;
1354 	int error = 0;
1355 
1356 	if (vp->v_type != VBLK)
1357 		return ENOTBLK;
1358 	if (spec_node_getmountedfs(vp) != NULL)
1359 		return EBUSY;
1360 	if (spec_node_lookup_by_dev(vp->v_type, vp->v_rdev, &vq) == 0) {
1361 		if (spec_node_getmountedfs(vq) != NULL)
1362 			error = EBUSY;
1363 		vrele(vq);
1364 	}
1365 
1366 	return error;
1367 }
1368 
1369 /*
1370  * Check if a device pointed to by vp is mounted.
1371  *
1372  * Returns:
1373  *   EINVAL	if it's not a disk
1374  *   EBUSY	if it's a disk and mounted
1375  *   0		if it's a disk and not mounted
1376  */
1377 int
1378 rawdev_mounted(vnode_t *vp, vnode_t **bvpp)
1379 {
1380 	vnode_t *bvp;
1381 	dev_t dev;
1382 	int d_type;
1383 
1384 	bvp = NULL;
1385 	d_type = D_OTHER;
1386 
1387 	if (iskmemvp(vp))
1388 		return EINVAL;
1389 
1390 	switch (vp->v_type) {
1391 	case VCHR: {
1392 		const struct cdevsw *cdev;
1393 
1394 		dev = vp->v_rdev;
1395 		cdev = cdevsw_lookup(dev);
1396 		if (cdev != NULL) {
1397 			dev_t blkdev;
1398 
1399 			blkdev = devsw_chr2blk(dev);
1400 			if (blkdev != NODEV) {
1401 				if (vfinddev(blkdev, VBLK, &bvp) != 0) {
1402 					d_type = (cdev->d_flag & D_TYPEMASK);
1403 					/* XXX: what if bvp disappears? */
1404 					vrele(bvp);
1405 				}
1406 			}
1407 		}
1408 
1409 		break;
1410 		}
1411 
1412 	case VBLK: {
1413 		const struct bdevsw *bdev;
1414 
1415 		dev = vp->v_rdev;
1416 		bdev = bdevsw_lookup(dev);
1417 		if (bdev != NULL)
1418 			d_type = (bdev->d_flag & D_TYPEMASK);
1419 
1420 		bvp = vp;
1421 
1422 		break;
1423 		}
1424 
1425 	default:
1426 		break;
1427 	}
1428 
1429 	if (d_type != D_DISK)
1430 		return EINVAL;
1431 
1432 	if (bvpp != NULL)
1433 		*bvpp = bvp;
1434 
1435 	/*
1436 	 * XXX: This is bogus. We should be failing the request
1437 	 * XXX: not only if this specific slice is mounted, but
1438 	 * XXX: if it's on a disk with any other mounted slice.
1439 	 */
1440 	if (vfs_mountedon(bvp))
1441 		return EBUSY;
1442 
1443 	return 0;
1444 }
1445 
/*
 * Make a 'unique' number from a mount type name.
 *
 * Each character of the NUL-terminated name is folded into the result
 * by shifting the accumulator left two bits and XOR-ing the character
 * in.  An empty name yields 0.  The value is a simple hash, so two
 * different names may produce the same number.
 *
 * The accumulation is done in an unsigned type: with a signed long,
 * a name long enough to push bits into or past the sign bit would be
 * undefined behaviour.  Unsigned arithmetic lets those bits fall off
 * the top instead.
 */
long
makefstype(const char *type)
{
	unsigned long hash = 0;

	for (; *type != '\0'; type++) {
		hash <<= 2;
		/* Go through long so a negative char folds in sign-extended. */
		hash ^= (unsigned long)(long)*type;
	}
	return (long)hash;
}
1460 
1461 static struct mountlist_entry *
1462 mountlist_alloc(enum mountlist_type type, struct mount *mp)
1463 {
1464 	struct mountlist_entry *me;
1465 
1466 	me = kmem_zalloc(sizeof(*me), KM_SLEEP);
1467 	me->me_mount = mp;
1468 	me->me_type = type;
1469 
1470 	return me;
1471 }
1472 
1473 static void
1474 mountlist_free(struct mountlist_entry *me)
1475 {
1476 
1477 	kmem_free(me, sizeof(*me));
1478 }
1479 
1480 void
1481 mountlist_iterator_init(mount_iterator_t **mip)
1482 {
1483 	struct mountlist_entry *me;
1484 
1485 	me = mountlist_alloc(ME_MARKER, NULL);
1486 	mutex_enter(&mountlist_lock);
1487 	TAILQ_INSERT_HEAD(&mountlist, me, me_list);
1488 	mutex_exit(&mountlist_lock);
1489 	*mip = (mount_iterator_t *)me;
1490 }
1491 
1492 void
1493 mountlist_iterator_destroy(mount_iterator_t *mi)
1494 {
1495 	struct mountlist_entry *marker = &mi->mi_entry;
1496 
1497 	if (marker->me_mount != NULL)
1498 		vfs_unbusy(marker->me_mount);
1499 
1500 	mutex_enter(&mountlist_lock);
1501 	TAILQ_REMOVE(&mountlist, marker, me_list);
1502 	mutex_exit(&mountlist_lock);
1503 
1504 	mountlist_free(marker);
1505 
1506 }
1507 
1508 /*
1509  * Return the next mount or NULL for this iterator.
1510  * Mark it busy on success.
1511  */
1512 static inline struct mount *
1513 _mountlist_iterator_next(mount_iterator_t *mi, bool wait)
1514 {
1515 	struct mountlist_entry *me, *marker = &mi->mi_entry;
1516 	struct mount *mp;
1517 	int error;
1518 
1519 	if (marker->me_mount != NULL) {
1520 		vfs_unbusy(marker->me_mount);
1521 		marker->me_mount = NULL;
1522 	}
1523 
1524 	mutex_enter(&mountlist_lock);
1525 	for (;;) {
1526 		KASSERT(marker->me_type == ME_MARKER);
1527 
1528 		me = TAILQ_NEXT(marker, me_list);
1529 		if (me == NULL) {
1530 			/* End of list: keep marker and return. */
1531 			mutex_exit(&mountlist_lock);
1532 			return NULL;
1533 		}
1534 		TAILQ_REMOVE(&mountlist, marker, me_list);
1535 		TAILQ_INSERT_AFTER(&mountlist, me, marker, me_list);
1536 
1537 		/* Skip other markers. */
1538 		if (me->me_type != ME_MOUNT)
1539 			continue;
1540 
1541 		/* Take an initial reference for vfs_busy() below. */
1542 		mp = me->me_mount;
1543 		KASSERT(mp != NULL);
1544 		vfs_ref(mp);
1545 		mutex_exit(&mountlist_lock);
1546 
1547 		/* Try to mark this mount busy and return on success. */
1548 		if (wait)
1549 			error = vfs_busy(mp);
1550 		else
1551 			error = vfs_trybusy(mp);
1552 		if (error == 0) {
1553 			vfs_rele(mp);
1554 			marker->me_mount = mp;
1555 			return mp;
1556 		}
1557 		vfs_rele(mp);
1558 		mutex_enter(&mountlist_lock);
1559 	}
1560 }
1561 
1562 struct mount *
1563 mountlist_iterator_next(mount_iterator_t *mi)
1564 {
1565 
1566 	return _mountlist_iterator_next(mi, true);
1567 }
1568 
1569 struct mount *
1570 mountlist_iterator_trynext(mount_iterator_t *mi)
1571 {
1572 
1573 	return _mountlist_iterator_next(mi, false);
1574 }
1575 
1576 /*
1577  * Attach new mount to the end of the mount list.
1578  */
1579 void
1580 mountlist_append(struct mount *mp)
1581 {
1582 	struct mountlist_entry *me;
1583 
1584 	me = mountlist_alloc(ME_MOUNT, mp);
1585 	mutex_enter(&mountlist_lock);
1586 	TAILQ_INSERT_TAIL(&mountlist, me, me_list);
1587 	mutex_exit(&mountlist_lock);
1588 }
1589 
1590 /*
1591  * Remove mount from mount list.
1592  */void
1593 mountlist_remove(struct mount *mp)
1594 {
1595 	struct mountlist_entry *me;
1596 
1597 	mutex_enter(&mountlist_lock);
1598 	TAILQ_FOREACH(me, &mountlist, me_list)
1599 		if (me->me_type == ME_MOUNT && me->me_mount == mp)
1600 			break;
1601 	KASSERT(me != NULL);
1602 	TAILQ_REMOVE(&mountlist, me, me_list);
1603 	mutex_exit(&mountlist_lock);
1604 	mountlist_free(me);
1605 }
1606 
1607 /*
1608  * Unlocked variant to traverse the mountlist.
1609  * To be used from DDB only.
1610  */
1611 struct mount *
1612 _mountlist_next(struct mount *mp)
1613 {
1614 	struct mountlist_entry *me;
1615 
1616 	if (mp == NULL) {
1617 		me = TAILQ_FIRST(&mountlist);
1618 	} else {
1619 		TAILQ_FOREACH(me, &mountlist, me_list)
1620 			if (me->me_type == ME_MOUNT && me->me_mount == mp)
1621 				break;
1622 		if (me != NULL)
1623 			me = TAILQ_NEXT(me, me_list);
1624 	}
1625 
1626 	while (me != NULL && me->me_type != ME_MOUNT)
1627 		me = TAILQ_NEXT(me, me_list);
1628 
1629 	return (me ? me->me_mount : NULL);
1630 }
1631