xref: /netbsd-src/sys/kern/vfs_subr.c (revision abb0f93cd77b67f080613360c65701f85e5f5cfe)
1 /*	$NetBSD: vfs_subr.c,v 1.392 2009/11/28 10:10:17 bouyer Exp $	*/
2 
3 /*-
4  * Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9  * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 /*
34  * Copyright (c) 1989, 1993
35  *	The Regents of the University of California.  All rights reserved.
36  * (c) UNIX System Laboratories, Inc.
37  * All or some portions of this file are derived from material licensed
38  * to the University of California by American Telephone and Telegraph
39  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
40  * the permission of UNIX System Laboratories, Inc.
41  *
42  * Redistribution and use in source and binary forms, with or without
43  * modification, are permitted provided that the following conditions
44  * are met:
45  * 1. Redistributions of source code must retain the above copyright
46  *    notice, this list of conditions and the following disclaimer.
47  * 2. Redistributions in binary form must reproduce the above copyright
48  *    notice, this list of conditions and the following disclaimer in the
49  *    documentation and/or other materials provided with the distribution.
50  * 3. Neither the name of the University nor the names of its contributors
51  *    may be used to endorse or promote products derived from this software
52  *    without specific prior written permission.
53  *
54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64  * SUCH DAMAGE.
65  *
66  *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
67  */
68 
69 /*
70  * Note on v_usecount and locking:
71  *
72  * At nearly all points where it is known that v_usecount could be
73  * zero, the vnode interlock will be held.
74  *
75  * To change v_usecount away from zero, the interlock must be held.  To
76  * change from a non-zero value to zero, again the interlock must be
77  * held.
78  *
79  * There's a flag bit, VC_XLOCK, embedded in v_usecount.
80  * To raise v_usecount when the VC_XLOCK bit is set in it, the
81  * interlock must be held.
82  * To modify the VC_XLOCK bit, the interlock must be held.
83  * We always keep the usecount (v_usecount & VC_MASK) non-zero while the
84  * VC_XLOCK bit is set.
85  *
86  * Unless the VC_XLOCK bit is set, changing the usecount from one
87  * non-zero value to another can safely be done using atomic operations,
88  * without the interlock held.
89  * Even if the VC_XLOCK bit is set, decreasing the usecount to a non-zero
90  * value can be done using atomic operations, without the interlock held.
91  */
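
/*
 * A minimal sketch of the rule above, mirroring the logic of vtryget()
 * further down: a lock-free reference grab may only proceed while
 * VC_XLOCK is clear, otherwise the caller must fall back to taking
 * v_interlock.  The helper name is illustrative only and the block is
 * not compiled.
 */
#if 0
static bool
example_ref_if_active(vnode_t *vp)
{
	u_int use, next;

	for (use = vp->v_usecount;; use = next) {
		if (use == 0 || (use & VC_XLOCK) != 0) {
			/* First reference, or VC_XLOCK set: need v_interlock. */
			return false;
		}
		next = atomic_cas_uint(&vp->v_usecount, use, use + 1);
		if (next == use)
			return true;
	}
}
#endif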
92 
93 #include <sys/cdefs.h>
94 __KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.392 2009/11/28 10:10:17 bouyer Exp $");
95 
96 #include "opt_ddb.h"
97 #include "opt_compat_netbsd.h"
98 #include "opt_compat_43.h"
99 
100 #include <sys/param.h>
101 #include <sys/systm.h>
102 #include <sys/conf.h>
103 #include <sys/proc.h>
104 #include <sys/kernel.h>
105 #include <sys/mount.h>
106 #include <sys/fcntl.h>
107 #include <sys/vnode.h>
108 #include <sys/stat.h>
109 #include <sys/namei.h>
110 #include <sys/ucred.h>
111 #include <sys/buf.h>
112 #include <sys/errno.h>
113 #include <sys/kmem.h>
114 #include <sys/syscallargs.h>
115 #include <sys/device.h>
116 #include <sys/filedesc.h>
117 #include <sys/kauth.h>
118 #include <sys/atomic.h>
119 #include <sys/kthread.h>
120 #include <sys/wapbl.h>
121 
122 #include <miscfs/genfs/genfs.h>
123 #include <miscfs/specfs/specdev.h>
124 #include <miscfs/syncfs/syncfs.h>
125 
126 #include <uvm/uvm.h>
127 #include <uvm/uvm_readahead.h>
128 #include <uvm/uvm_ddb.h>
129 
130 #include <sys/sysctl.h>
131 
132 const enum vtype iftovt_tab[16] = {
133 	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
134 	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
135 };
136 const int	vttoif_tab[9] = {
137 	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
138 	S_IFSOCK, S_IFIFO, S_IFMT,
139 };
140 
141 /*
142  * Insq/Remq for the vnode usage lists.
143  */
144 #define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
145 #define	bufremvn(bp) {							\
146 	LIST_REMOVE(bp, b_vnbufs);					\
147 	(bp)->b_vnbufs.le_next = NOLIST;				\
148 }
149 
150 int doforce = 1;		/* 1 => permit forcible unmounting */
151 int prtactive = 0;		/* 1 => print out reclaim of active vnodes */
152 
153 static vnodelst_t vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list);
154 static vnodelst_t vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list);
155 static vnodelst_t vrele_list = TAILQ_HEAD_INITIALIZER(vrele_list);
156 
157 struct mntlist mountlist =			/* mounted filesystem list */
158     CIRCLEQ_HEAD_INITIALIZER(mountlist);
159 
160 u_int numvnodes;
161 static specificdata_domain_t mount_specificdata_domain;
162 
163 static int vrele_pending;
164 static int vrele_gen;
165 static kmutex_t	vrele_lock;
166 static kcondvar_t vrele_cv;
167 static lwp_t *vrele_lwp;
168 
169 static uint64_t mountgen = 0;
170 static kmutex_t mountgen_lock;
171 
172 kmutex_t mountlist_lock;
173 kmutex_t mntid_lock;
174 kmutex_t mntvnode_lock;
175 kmutex_t vnode_free_list_lock;
176 kmutex_t vfs_list_lock;
177 
178 static pool_cache_t vnode_cache;
179 
180 /*
181  * These define the root filesystem and device.
182  */
183 struct vnode *rootvnode;
184 struct device *root_device;			/* root device */
185 
186 /*
187  * Local declarations.
188  */
189 
190 static void vrele_thread(void *);
191 static void insmntque(vnode_t *, struct mount *);
192 static int getdevvp(dev_t, vnode_t **, enum vtype);
193 static vnode_t *getcleanvnode(void);
194 void vpanic(vnode_t *, const char *);
195 static void vfs_shutdown1(struct lwp *);
196 
197 #ifdef DEBUG
198 void printlockedvnodes(void);
199 #endif
200 
201 #ifdef DIAGNOSTIC
202 void
203 vpanic(vnode_t *vp, const char *msg)
204 {
205 
206 	vprint(NULL, vp);
207 	panic("%s\n", msg);
208 }
209 #else
210 #define	vpanic(vp, msg)	/* nothing */
211 #endif
212 
213 void
214 vn_init1(void)
215 {
216 
217 	vnode_cache = pool_cache_init(sizeof(struct vnode), 0, 0, 0, "vnodepl",
218 	    NULL, IPL_NONE, NULL, NULL, NULL);
219 	KASSERT(vnode_cache != NULL);
220 
221 	/* Create deferred release thread. */
222 	mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
223 	cv_init(&vrele_cv, "vrele");
224 	if (kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
225 	    NULL, &vrele_lwp, "vrele"))
226 		panic("fork vrele");
227 }
228 
229 /*
230  * Initialize the vnode management data structures.
231  */
232 void
233 vntblinit(void)
234 {
235 
236 	mutex_init(&mountgen_lock, MUTEX_DEFAULT, IPL_NONE);
237 	mutex_init(&mountlist_lock, MUTEX_DEFAULT, IPL_NONE);
238 	mutex_init(&mntid_lock, MUTEX_DEFAULT, IPL_NONE);
239 	mutex_init(&mntvnode_lock, MUTEX_DEFAULT, IPL_NONE);
240 	mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE);
241 	mutex_init(&vfs_list_lock, MUTEX_DEFAULT, IPL_NONE);
242 
243 	mount_specificdata_domain = specificdata_domain_create();
244 
245 	/* Initialize the filesystem syncer. */
246 	vn_initialize_syncerd();
247 	vn_init1();
248 }
249 
250 int
251 vfs_drainvnodes(long target, struct lwp *l)
252 {
253 
254 	while (numvnodes > target) {
255 		vnode_t *vp;
256 
257 		mutex_enter(&vnode_free_list_lock);
258 		vp = getcleanvnode();
259 		if (vp == NULL)
260 			return EBUSY; /* give up */
261 		ungetnewvnode(vp);
262 	}
263 
264 	return 0;
265 }
266 
267 /*
268  * Lookup a mount point by filesystem identifier.
269  *
270  * XXX Needs to add a reference to the mount point.
271  */
272 struct mount *
273 vfs_getvfs(fsid_t *fsid)
274 {
275 	struct mount *mp;
276 
277 	mutex_enter(&mountlist_lock);
278 	CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) {
279 		if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] &&
280 		    mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) {
281 			mutex_exit(&mountlist_lock);
282 			return (mp);
283 		}
284 	}
285 	mutex_exit(&mountlist_lock);
286 	return ((struct mount *)0);
287 }
288 
289 /*
290  * Drop a reference to a mount structure, freeing if the last reference.
291  */
292 void
293 vfs_destroy(struct mount *mp)
294 {
295 
296 	if (__predict_true((int)atomic_dec_uint_nv(&mp->mnt_refcnt) > 0)) {
297 		return;
298 	}
299 
300 	/*
301 	 * Nothing else has visibility of the mount: we can now
302 	 * free the data structures.
303 	 */
304 	KASSERT(mp->mnt_refcnt == 0);
305 	specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
306 	rw_destroy(&mp->mnt_unmounting);
307 	mutex_destroy(&mp->mnt_updating);
308 	mutex_destroy(&mp->mnt_renamelock);
309 	if (mp->mnt_op != NULL) {
310 		vfs_delref(mp->mnt_op);
311 	}
312 	kmem_free(mp, sizeof(*mp));
313 }
314 
315 /*
316  * grab a vnode from freelist and clean it.
317  */
318 vnode_t *
319 getcleanvnode(void)
320 {
321 	vnode_t *vp;
322 	vnodelst_t *listhd;
323 
324 	KASSERT(mutex_owned(&vnode_free_list_lock));
325 
326 retry:
327 	listhd = &vnode_free_list;
328 try_nextlist:
329 	TAILQ_FOREACH(vp, listhd, v_freelist) {
330 		/*
331 		 * It's safe to test v_usecount and v_iflag
332 		 * without holding the interlock here, since
333 		 * referenced or VI_CLEAN vnodes should never
334 		 * appear on these lists.
335 		 */
336 		if (vp->v_usecount != 0) {
337 			vpanic(vp, "free vnode isn't");
338 		}
339 		if ((vp->v_iflag & VI_CLEAN) != 0) {
340 			vpanic(vp, "clean vnode on freelist");
341 		}
342 		if (vp->v_freelisthd != listhd) {
343 			printf("vnode sez %p, listhd %p\n", vp->v_freelisthd, listhd);
344 			vpanic(vp, "list head mismatch");
345 		}
346 		if (!mutex_tryenter(&vp->v_interlock))
347 			continue;
348 		/*
349 		 * Our lwp might hold the underlying vnode
350 		 * locked, so don't try to reclaim a VI_LAYER
351 		 * node if it's locked.
352 		 */
353 		if ((vp->v_iflag & VI_XLOCK) == 0 &&
354 		    ((vp->v_iflag & VI_LAYER) == 0 || VOP_ISLOCKED(vp) == 0)) {
355 			break;
356 		}
357 		mutex_exit(&vp->v_interlock);
358 	}
359 
360 	if (vp == NULL) {
361 		if (listhd == &vnode_free_list) {
362 			listhd = &vnode_hold_list;
363 			goto try_nextlist;
364 		}
365 		mutex_exit(&vnode_free_list_lock);
366 		return NULL;
367 	}
368 
369 	/* Remove it from the freelist. */
370 	TAILQ_REMOVE(listhd, vp, v_freelist);
371 	vp->v_freelisthd = NULL;
372 	mutex_exit(&vnode_free_list_lock);
373 
374 	if (vp->v_usecount != 0) {
375 		/*
376 		 * It was referenced again before we got the interlock.
377 		 * Don't return it to the freelist - the holder of the last
378 		 * reference will destroy it.
379 		 */
380 		mutex_exit(&vp->v_interlock);
381 		mutex_enter(&vnode_free_list_lock);
382 		goto retry;
383 	}
384 
385 	/*
386 	 * The vnode is still associated with a file system, so we must
387 	 * clean it out before reusing it.  We need to add a reference
388 	 * before doing this.  If the vnode gains another reference while
389 	 * being cleaned out then we lose - retry.
390 	 */
391 	atomic_add_int(&vp->v_usecount, 1 + VC_XLOCK);
392 	vclean(vp, DOCLOSE);
393 	KASSERT(vp->v_usecount >= 1 + VC_XLOCK);
394 	atomic_add_int(&vp->v_usecount, -VC_XLOCK);
395 	if (vp->v_usecount == 1) {
396 		/* We're about to dirty it. */
397 		vp->v_iflag &= ~VI_CLEAN;
398 		mutex_exit(&vp->v_interlock);
399 		if (vp->v_type == VBLK || vp->v_type == VCHR) {
400 			spec_node_destroy(vp);
401 		}
402 		vp->v_type = VNON;
403 	} else {
404 		/*
405 		 * Don't return to freelist - the holder of the last
406 		 * reference will destroy it.
407 		 */
408 		vrelel(vp, 0); /* releases vp->v_interlock */
409 		mutex_enter(&vnode_free_list_lock);
410 		goto retry;
411 	}
412 
413 	if (vp->v_data != NULL || vp->v_uobj.uo_npages != 0 ||
414 	    !TAILQ_EMPTY(&vp->v_uobj.memq)) {
415 		vpanic(vp, "cleaned vnode isn't");
416 	}
417 	if (vp->v_numoutput != 0) {
418 		vpanic(vp, "clean vnode has pending I/O's");
419 	}
420 	if ((vp->v_iflag & VI_ONWORKLST) != 0) {
421 		vpanic(vp, "clean vnode on syncer list");
422 	}
423 
424 	return vp;
425 }
426 
427 /*
428  * Mark a mount point as busy, and gain a new reference to it.  Used to
429  * prevent the file system from being unmounted during critical sections.
430  *
431  * => The caller must hold a pre-existing reference to the mount.
432  * => Will fail if the file system is being unmounted, or is unmounted.
433  */
434 int
435 vfs_busy(struct mount *mp, struct mount **nextp)
436 {
437 
438 	KASSERT(mp->mnt_refcnt > 0);
439 
440 	if (__predict_false(!rw_tryenter(&mp->mnt_unmounting, RW_READER))) {
441 		if (nextp != NULL) {
442 			KASSERT(mutex_owned(&mountlist_lock));
443 			*nextp = CIRCLEQ_NEXT(mp, mnt_list);
444 		}
445 		return EBUSY;
446 	}
447 	if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) {
448 		rw_exit(&mp->mnt_unmounting);
449 		if (nextp != NULL) {
450 			KASSERT(mutex_owned(&mountlist_lock));
451 			*nextp = CIRCLEQ_NEXT(mp, mnt_list);
452 		}
453 		return ENOENT;
454 	}
455 	if (nextp != NULL) {
456 		mutex_exit(&mountlist_lock);
457 	}
458 	atomic_inc_uint(&mp->mnt_refcnt);
459 	return 0;
460 }
461 
462 /*
463  * Unbusy a busy filesystem.
464  *
465  * => If keepref is true, preserve reference added by vfs_busy().
466  * => If nextp != NULL, acquire mountlist_lock.
467  */
468 void
469 vfs_unbusy(struct mount *mp, bool keepref, struct mount **nextp)
470 {
471 
472 	KASSERT(mp->mnt_refcnt > 0);
473 
474 	if (nextp != NULL) {
475 		mutex_enter(&mountlist_lock);
476 	}
477 	rw_exit(&mp->mnt_unmounting);
478 	if (!keepref) {
479 		vfs_destroy(mp);
480 	}
481 	if (nextp != NULL) {
482 		KASSERT(mutex_owned(&mountlist_lock));
483 		*nextp = CIRCLEQ_NEXT(mp, mnt_list);
484 	}
485 }
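
/*
 * A sketch of the intended vfs_busy()/vfs_unbusy() pairing when walking
 * the mount list: the nextp argument hands back the next mount while
 * mountlist_lock is held, so the traversal stays consistent even though
 * the lock is dropped for the busied file system.  The loop body is a
 * placeholder and the block is not compiled.
 */
#if 0
static void
example_walk_mounts(void)
{
	struct mount *mp, *nmp;

	mutex_enter(&mountlist_lock);
	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
	    mp = nmp) {
		if (vfs_busy(mp, &nmp) != 0)
			continue;
		/* ... operate on the busied file system here ... */
		vfs_unbusy(mp, false, &nmp);
	}
	mutex_exit(&mountlist_lock);
}
#endif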
486 
487 struct mount *
488 vfs_mountalloc(struct vfsops *vfsops, struct vnode *vp)
489 {
490 	int error;
491 	struct mount *mp;
492 
493 	mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
494 	if (mp == NULL)
495 		return NULL;
496 
497 	mp->mnt_op = vfsops;
498 	mp->mnt_refcnt = 1;
499 	TAILQ_INIT(&mp->mnt_vnodelist);
500 	rw_init(&mp->mnt_unmounting);
501 	mutex_init(&mp->mnt_renamelock, MUTEX_DEFAULT, IPL_NONE);
502 	mutex_init(&mp->mnt_updating, MUTEX_DEFAULT, IPL_NONE);
503 	error = vfs_busy(mp, NULL);
504 	KASSERT(error == 0);
505 	mp->mnt_vnodecovered = vp;
506 	mount_initspecific(mp);
507 
508 	mutex_enter(&mountgen_lock);
509 	mp->mnt_gen = mountgen++;
510 	mutex_exit(&mountgen_lock);
511 
512 	return mp;
513 }
514 
515 /*
516  * Lookup a filesystem type, and if found allocate and initialize
517  * a mount structure for it.
518  *
519  * Devname is usually updated by mount(8) after booting.
520  */
521 int
522 vfs_rootmountalloc(const char *fstypename, const char *devname,
523     struct mount **mpp)
524 {
525 	struct vfsops *vfsp = NULL;
526 	struct mount *mp;
527 
528 	mutex_enter(&vfs_list_lock);
529 	LIST_FOREACH(vfsp, &vfs_list, vfs_list)
530 		if (!strncmp(vfsp->vfs_name, fstypename,
531 		    sizeof(mp->mnt_stat.f_fstypename)))
532 			break;
533 	if (vfsp == NULL) {
534 		mutex_exit(&vfs_list_lock);
535 		return (ENODEV);
536 	}
537 	vfsp->vfs_refcount++;
538 	mutex_exit(&vfs_list_lock);
539 
540 	if ((mp = vfs_mountalloc(vfsp, NULL)) == NULL)
541 		return ENOMEM;
542 	mp->mnt_flag = MNT_RDONLY;
543 	(void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name,
544 	    sizeof(mp->mnt_stat.f_fstypename));
545 	mp->mnt_stat.f_mntonname[0] = '/';
546 	mp->mnt_stat.f_mntonname[1] = '\0';
547 	mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] =
548 	    '\0';
549 	(void)copystr(devname, mp->mnt_stat.f_mntfromname,
550 	    sizeof(mp->mnt_stat.f_mntfromname) - 1, 0);
551 	*mpp = mp;
552 	return (0);
553 }
554 
555 /*
556  * Routines having to do with the management of the vnode table.
557  */
558 extern int (**dead_vnodeop_p)(void *);
559 
560 /*
561  * Allocate a new vnode, or recycle one from the free list.
562  */
563 int
564 getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
565 	    vnode_t **vpp)
566 {
567 	struct uvm_object *uobj;
568 	static int toggle;
569 	vnode_t *vp;
570 	int error = 0, tryalloc;
571 
572  try_again:
573 	if (mp != NULL) {
574 		/*
575 		 * Mark filesystem busy while we're creating a
576 		 * vnode.  If unmount is in progress, this will
577 		 * fail.
578 		 */
579 		error = vfs_busy(mp, NULL);
580 		if (error)
581 			return error;
582 	}
583 
584 	/*
585 	 * We must choose whether to allocate a new vnode or recycle an
586 	 * existing one. The criterion for allocating a new one is that
587 	 * the total number of vnodes is less than the number desired or
588 	 * there are no vnodes on either free list. Generally we only
589 	 * want to recycle vnodes that have no buffers associated with
590 	 * them, so we look first on the vnode_free_list. If it is empty,
591 	 * we next consider vnodes with referencing buffers on the
592 	 * vnode_hold_list. The toggle ensures that half the time we
593  * will recycle a vnode from the vnode_hold_list, and half the time
594  * we will allocate a new one unless the list has grown to twice
595  * the desired size. We are reluctant to recycle vnodes from the
596  * vnode_hold_list because we will lose the identity of all their
597 	 * referencing buffers.
598 	 */
599 
600 	vp = NULL;
601 
602 	mutex_enter(&vnode_free_list_lock);
603 
604 	toggle ^= 1;
605 	if (numvnodes > 2 * desiredvnodes)
606 		toggle = 0;
607 
608 	tryalloc = numvnodes < desiredvnodes ||
609 	    (TAILQ_FIRST(&vnode_free_list) == NULL &&
610 	     (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle));
611 
612 	if (tryalloc) {
613 		numvnodes++;
614 		mutex_exit(&vnode_free_list_lock);
615 		if ((vp = vnalloc(NULL)) == NULL) {
616 			mutex_enter(&vnode_free_list_lock);
617 			numvnodes--;
618 		} else
619 			vp->v_usecount = 1;
620 	}
621 
622 	if (vp == NULL) {
623 		vp = getcleanvnode();
624 		if (vp == NULL) {
625 			if (mp != NULL) {
626 				vfs_unbusy(mp, false, NULL);
627 			}
628 			if (tryalloc) {
629 				printf("WARNING: unable to allocate new "
630 				    "vnode, retrying...\n");
631 				kpause("newvn", false, hz, NULL);
632 				goto try_again;
633 			}
634 			tablefull("vnode", "increase kern.maxvnodes or NVNODE");
635 			*vpp = 0;
636 			return (ENFILE);
637 		}
638 		vp->v_iflag = 0;
639 		vp->v_vflag = 0;
640 		vp->v_uflag = 0;
641 		vp->v_socket = NULL;
642 	}
643 
644 	KASSERT(vp->v_usecount == 1);
645 	KASSERT(vp->v_freelisthd == NULL);
646 	KASSERT(LIST_EMPTY(&vp->v_nclist));
647 	KASSERT(LIST_EMPTY(&vp->v_dnclist));
648 
649 	vp->v_type = VNON;
650 	vp->v_vnlock = &vp->v_lock;
651 	vp->v_tag = tag;
652 	vp->v_op = vops;
653 	insmntque(vp, mp);
654 	*vpp = vp;
655 	vp->v_data = 0;
656 
657 	/*
658 	 * initialize uvm_object within vnode.
659 	 */
660 
661 	uobj = &vp->v_uobj;
662 	KASSERT(uobj->pgops == &uvm_vnodeops);
663 	KASSERT(uobj->uo_npages == 0);
664 	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
665 	vp->v_size = vp->v_writesize = VSIZENOTSET;
666 
667 	if (mp != NULL) {
668 		if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
669 			vp->v_vflag |= VV_MPSAFE;
670 		vfs_unbusy(mp, true, NULL);
671 	}
672 
673 	return (0);
674 }
675 
676 /*
677  * This is really just the reverse of getnewvnode(). Needed for
678  * VFS_VGET functions that may need to push back a vnode in case
679  * of a locking race.
680  */
681 void
682 ungetnewvnode(vnode_t *vp)
683 {
684 
685 	KASSERT(vp->v_usecount == 1);
686 	KASSERT(vp->v_data == NULL);
687 	KASSERT(vp->v_freelisthd == NULL);
688 
689 	mutex_enter(&vp->v_interlock);
690 	vp->v_iflag |= VI_CLEAN;
691 	vrelel(vp, 0);
692 }
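
/*
 * A sketch of the getnewvnode()/ungetnewvnode() pairing described above,
 * roughly as a file system's VFS_VGET might use it.  The ops vector
 * "example_vnodeop_p" and the hash-insertion helper are hypothetical
 * placeholders; the block is not compiled.
 */
#if 0
static int
example_fs_vget(struct mount *mp, vnode_t **vpp)
{
	vnode_t *vp;
	int error;

	error = getnewvnode(VT_NON, mp, example_vnodeop_p, &vp);
	if (error != 0)
		return error;
	/*
	 * If another thread attached a vnode for the same object while
	 * we were allocating, push ours back and let the caller retry
	 * with the existing vnode.
	 */
	if (example_hash_insert(mp, vp) != 0) {
		ungetnewvnode(vp);
		return EEXIST;
	}
	*vpp = vp;
	return 0;
}
#endif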
693 
694 /*
695  * Allocate a new, uninitialized vnode.  If 'mp' is non-NULL, this is a
696  * marker vnode and we are prepared to wait for the allocation.
697  */
698 vnode_t *
699 vnalloc(struct mount *mp)
700 {
701 	vnode_t *vp;
702 
703 	vp = pool_cache_get(vnode_cache, (mp != NULL ? PR_WAITOK : PR_NOWAIT));
704 	if (vp == NULL) {
705 		return NULL;
706 	}
707 
708 	memset(vp, 0, sizeof(*vp));
709 	UVM_OBJ_INIT(&vp->v_uobj, &uvm_vnodeops, 0);
710 	cv_init(&vp->v_cv, "vnode");
711 	/*
712 	 * done by memset() above.
713 	 *	LIST_INIT(&vp->v_nclist);
714 	 *	LIST_INIT(&vp->v_dnclist);
715 	 */
716 
717 	if (mp != NULL) {
718 		vp->v_mount = mp;
719 		vp->v_type = VBAD;
720 		vp->v_iflag = VI_MARKER;
721 	} else {
722 		rw_init(&vp->v_lock.vl_lock);
723 	}
724 
725 	return vp;
726 }
727 
728 /*
729  * Free an unused, unreferenced vnode.
730  */
731 void
732 vnfree(vnode_t *vp)
733 {
734 
735 	KASSERT(vp->v_usecount == 0);
736 
737 	if ((vp->v_iflag & VI_MARKER) == 0) {
738 		rw_destroy(&vp->v_lock.vl_lock);
739 		mutex_enter(&vnode_free_list_lock);
740 		numvnodes--;
741 		mutex_exit(&vnode_free_list_lock);
742 	}
743 
744 	UVM_OBJ_DESTROY(&vp->v_uobj);
745 	cv_destroy(&vp->v_cv);
746 	pool_cache_put(vnode_cache, vp);
747 }
748 
749 /*
750  * Remove a vnode from its freelist.
751  */
752 static inline void
753 vremfree(vnode_t *vp)
754 {
755 
756 	KASSERT(mutex_owned(&vp->v_interlock));
757 	KASSERT(vp->v_usecount == 0);
758 
759 	/*
760 	 * Note that the reference count must not change until
761 	 * the vnode is removed.
762 	 */
763 	mutex_enter(&vnode_free_list_lock);
764 	if (vp->v_holdcnt > 0) {
765 		KASSERT(vp->v_freelisthd == &vnode_hold_list);
766 	} else {
767 		KASSERT(vp->v_freelisthd == &vnode_free_list);
768 	}
769 	TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
770 	vp->v_freelisthd = NULL;
771 	mutex_exit(&vnode_free_list_lock);
772 }
773 
774 /*
775  * Move a vnode from one mount queue to another.
776  */
777 static void
778 insmntque(vnode_t *vp, struct mount *mp)
779 {
780 	struct mount *omp;
781 
782 #ifdef DIAGNOSTIC
783 	if ((mp != NULL) &&
784 	    (mp->mnt_iflag & IMNT_UNMOUNT) &&
785 	    vp->v_tag != VT_VFS) {
786 		panic("insmntque into dying filesystem");
787 	}
788 #endif
789 
790 	mutex_enter(&mntvnode_lock);
791 	/*
792 	 * Delete from old mount point vnode list, if on one.
793 	 */
794 	if ((omp = vp->v_mount) != NULL)
795 		TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes);
796 	/*
797 	 * Insert into list of vnodes for the new mount point, if
798 	 * available.  The caller must take a reference on the mount
799 	 * structure and donate it to the vnode.
800 	 */
801 	if ((vp->v_mount = mp) != NULL)
802 		TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
803 	mutex_exit(&mntvnode_lock);
804 
805 	if (omp != NULL) {
806 		/* Release reference to old mount. */
807 		vfs_destroy(omp);
808 	}
809 }
810 
811 /*
812  * Wait for a vnode (typically with VI_XLOCK set) to be cleaned or
813  * recycled.
814  */
815 void
816 vwait(vnode_t *vp, int flags)
817 {
818 
819 	KASSERT(mutex_owned(&vp->v_interlock));
820 	KASSERT(vp->v_usecount != 0);
821 
822 	while ((vp->v_iflag & flags) != 0)
823 		cv_wait(&vp->v_cv, &vp->v_interlock);
824 }
825 
826 /*
827  * Insert a marker vnode into a mount's vnode list, after the
828  * specified vnode.  mntvnode_lock must be held.
829  */
830 void
831 vmark(vnode_t *mvp, vnode_t *vp)
832 {
833 	struct mount *mp;
834 
835 	mp = mvp->v_mount;
836 
837 	KASSERT(mutex_owned(&mntvnode_lock));
838 	KASSERT((mvp->v_iflag & VI_MARKER) != 0);
839 	KASSERT(vp->v_mount == mp);
840 
841 	TAILQ_INSERT_AFTER(&mp->mnt_vnodelist, vp, mvp, v_mntvnodes);
842 }
843 
844 /*
845  * Remove a marker vnode from a mount's vnode list, and return
846  * a pointer to the next vnode in the list.  mntvnode_lock must
847  * be held.
848  */
849 vnode_t *
850 vunmark(vnode_t *mvp)
851 {
852 	vnode_t *vp;
853 	struct mount *mp;
854 
855 	mp = mvp->v_mount;
856 
857 	KASSERT(mutex_owned(&mntvnode_lock));
858 	KASSERT((mvp->v_iflag & VI_MARKER) != 0);
859 
860 	vp = TAILQ_NEXT(mvp, v_mntvnodes);
861 	TAILQ_REMOVE(&mp->mnt_vnodelist, mvp, v_mntvnodes);
862 
863 	KASSERT(vp == NULL || vp->v_mount == mp);
864 
865 	return vp;
866 }
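
/*
 * A sketch of the marker-based traversal that vmark()/vunmark() enable;
 * vflush() below uses the same pattern.  The marker keeps our place in
 * mnt_vnodelist while mntvnode_lock may be dropped to work on a vnode.
 * The per-vnode work is a placeholder and the block is not compiled.
 */
#if 0
static void
example_scan_mount_vnodes(struct mount *mp)
{
	vnode_t *vp, *mvp;

	if ((mvp = vnalloc(mp)) == NULL)
		return;
	mutex_enter(&mntvnode_lock);
	for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp != NULL;
	    vp = vunmark(mvp)) {
		vmark(mvp, vp);
		if (vp->v_mount != mp || vismarker(vp))
			continue;
		/* ... may drop mntvnode_lock here to examine vp ... */
	}
	mutex_exit(&mntvnode_lock);
	vnfree(mvp);
}
#endif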
867 
868 /*
869  * Update outstanding I/O count and do wakeup if requested.
870  */
871 void
872 vwakeup(struct buf *bp)
873 {
874 	struct vnode *vp;
875 
876 	if ((vp = bp->b_vp) == NULL)
877 		return;
878 
879 	KASSERT(bp->b_objlock == &vp->v_interlock);
880 	KASSERT(mutex_owned(bp->b_objlock));
881 
882 	if (--vp->v_numoutput < 0)
883 		panic("vwakeup: neg numoutput, vp %p", vp);
884 	if (vp->v_numoutput == 0)
885 		cv_broadcast(&vp->v_cv);
886 }
887 
888 /*
889  * Flush out and invalidate all buffers associated with a vnode.
890  * Called with the underlying vnode locked, which should prevent new dirty
891  * buffers from being queued.
892  */
893 int
894 vinvalbuf(struct vnode *vp, int flags, kauth_cred_t cred, struct lwp *l,
895 	  bool catch, int slptimeo)
896 {
897 	struct buf *bp, *nbp;
898 	int error;
899 	int flushflags = PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO |
900 	    (flags & V_SAVE ? PGO_CLEANIT | PGO_RECLAIM : 0);
901 
902 	/* XXXUBC this doesn't look at flags or slp* */
903 	mutex_enter(&vp->v_interlock);
904 	error = VOP_PUTPAGES(vp, 0, 0, flushflags);
905 	if (error) {
906 		return error;
907 	}
908 
909 	if (flags & V_SAVE) {
910 		error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0);
911 		if (error)
912 		        return (error);
913 		KASSERT(LIST_EMPTY(&vp->v_dirtyblkhd));
914 	}
915 
916 	mutex_enter(&bufcache_lock);
917 restart:
918 	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
919 		nbp = LIST_NEXT(bp, b_vnbufs);
920 		error = bbusy(bp, catch, slptimeo, NULL);
921 		if (error != 0) {
922 			if (error == EPASSTHROUGH)
923 				goto restart;
924 			mutex_exit(&bufcache_lock);
925 			return (error);
926 		}
927 		brelsel(bp, BC_INVAL | BC_VFLUSH);
928 	}
929 
930 	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
931 		nbp = LIST_NEXT(bp, b_vnbufs);
932 		error = bbusy(bp, catch, slptimeo, NULL);
933 		if (error != 0) {
934 			if (error == EPASSTHROUGH)
935 				goto restart;
936 			mutex_exit(&bufcache_lock);
937 			return (error);
938 		}
939 		/*
940 		 * XXX Since there are no node locks for NFS, I believe
941 		 * there is a slight chance that a delayed write will
942 		 * occur while sleeping just above, so check for it.
943 		 */
944 		if ((bp->b_oflags & BO_DELWRI) && (flags & V_SAVE)) {
945 #ifdef DEBUG
946 			printf("buffer still DELWRI\n");
947 #endif
948 			bp->b_cflags |= BC_BUSY | BC_VFLUSH;
949 			mutex_exit(&bufcache_lock);
950 			VOP_BWRITE(bp);
951 			mutex_enter(&bufcache_lock);
952 			goto restart;
953 		}
954 		brelsel(bp, BC_INVAL | BC_VFLUSH);
955 	}
956 
957 #ifdef DIAGNOSTIC
958 	if (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd))
959 		panic("vinvalbuf: flush failed, vp %p", vp);
960 #endif
961 
962 	mutex_exit(&bufcache_lock);
963 
964 	return (0);
965 }
966 
967 /*
968  * Destroy any in core blocks past the truncation length.
969  * Called with the underlying vnode locked, which should prevent new dirty
970  * buffers from being queued.
971  */
972 int
973 vtruncbuf(struct vnode *vp, daddr_t lbn, bool catch, int slptimeo)
974 {
975 	struct buf *bp, *nbp;
976 	int error;
977 	voff_t off;
978 
979 	off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift);
980 	mutex_enter(&vp->v_interlock);
981 	error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO);
982 	if (error) {
983 		return error;
984 	}
985 
986 	mutex_enter(&bufcache_lock);
987 restart:
988 	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
989 		nbp = LIST_NEXT(bp, b_vnbufs);
990 		if (bp->b_lblkno < lbn)
991 			continue;
992 		error = bbusy(bp, catch, slptimeo, NULL);
993 		if (error != 0) {
994 			if (error == EPASSTHROUGH)
995 				goto restart;
996 			mutex_exit(&bufcache_lock);
997 			return (error);
998 		}
999 		brelsel(bp, BC_INVAL | BC_VFLUSH);
1000 	}
1001 
1002 	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
1003 		nbp = LIST_NEXT(bp, b_vnbufs);
1004 		if (bp->b_lblkno < lbn)
1005 			continue;
1006 		error = bbusy(bp, catch, slptimeo, NULL);
1007 		if (error != 0) {
1008 			if (error == EPASSTHROUGH)
1009 				goto restart;
1010 			mutex_exit(&bufcache_lock);
1011 			return (error);
1012 		}
1013 		brelsel(bp, BC_INVAL | BC_VFLUSH);
1014 	}
1015 	mutex_exit(&bufcache_lock);
1016 
1017 	return (0);
1018 }
1019 
1020 /*
1021  * Flush all dirty buffers from a vnode.
1022  * Called with the underlying vnode locked, which should prevent new dirty
1023  * buffers from being queued.
1024  */
1025 void
1026 vflushbuf(struct vnode *vp, int sync)
1027 {
1028 	struct buf *bp, *nbp;
1029 	int flags = PGO_CLEANIT | PGO_ALLPAGES | (sync ? PGO_SYNCIO : 0);
1030 	bool dirty;
1031 
1032 	mutex_enter(&vp->v_interlock);
1033 	(void) VOP_PUTPAGES(vp, 0, 0, flags);
1034 
1035 loop:
1036 	mutex_enter(&bufcache_lock);
1037 	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
1038 		nbp = LIST_NEXT(bp, b_vnbufs);
1039 		if ((bp->b_cflags & BC_BUSY))
1040 			continue;
1041 		if ((bp->b_oflags & BO_DELWRI) == 0)
1042 			panic("vflushbuf: not dirty, bp %p", bp);
1043 		bp->b_cflags |= BC_BUSY | BC_VFLUSH;
1044 		mutex_exit(&bufcache_lock);
1045 		/*
1046 		 * Wait for I/O associated with indirect blocks to complete,
1047 		 * since there is no way to quickly wait for them below.
1048 		 */
1049 		if (bp->b_vp == vp || sync == 0)
1050 			(void) bawrite(bp);
1051 		else
1052 			(void) bwrite(bp);
1053 		goto loop;
1054 	}
1055 	mutex_exit(&bufcache_lock);
1056 
1057 	if (sync == 0)
1058 		return;
1059 
1060 	mutex_enter(&vp->v_interlock);
1061 	while (vp->v_numoutput != 0)
1062 		cv_wait(&vp->v_cv, &vp->v_interlock);
1063 	dirty = !LIST_EMPTY(&vp->v_dirtyblkhd);
1064 	mutex_exit(&vp->v_interlock);
1065 
1066 	if (dirty) {
1067 		vprint("vflushbuf: dirty", vp);
1068 		goto loop;
1069 	}
1070 }
1071 
1072 /*
1073  * Create a vnode for a block device.
1074  * Used for root filesystem and swap areas.
1075  * Also used for memory file system special devices.
1076  */
1077 int
1078 bdevvp(dev_t dev, vnode_t **vpp)
1079 {
1080 
1081 	return (getdevvp(dev, vpp, VBLK));
1082 }
1083 
1084 /*
1085  * Create a vnode for a character device.
1086  * Used for kernfs and some console handling.
1087  */
1088 int
1089 cdevvp(dev_t dev, vnode_t **vpp)
1090 {
1091 
1092 	return (getdevvp(dev, vpp, VCHR));
1093 }
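
/*
 * A sketch of typical bdevvp() use: obtaining a vnode for the root
 * block device before mounting or swapping on it.  Error handling is
 * abbreviated and the block is not compiled.
 */
#if 0
static void
example_get_root_vnode(void)
{
	vnode_t *rootvp;

	if (bdevvp(rootdev, &rootvp) != 0)
		panic("cannot obtain vnode for root device");
	/* ... hand rootvp to the root file system's mount code ... */
	vrele(rootvp);
}
#endif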
1094 
1095 /*
1096  * Associate a buffer with a vnode.  There must already be a hold on
1097  * the vnode.
1098  */
1099 void
1100 bgetvp(struct vnode *vp, struct buf *bp)
1101 {
1102 
1103 	KASSERT(bp->b_vp == NULL);
1104 	KASSERT(bp->b_objlock == &buffer_lock);
1105 	KASSERT(mutex_owned(&vp->v_interlock));
1106 	KASSERT(mutex_owned(&bufcache_lock));
1107 	KASSERT((bp->b_cflags & BC_BUSY) != 0);
1108 	KASSERT(!cv_has_waiters(&bp->b_done));
1109 
1110 	vholdl(vp);
1111 	bp->b_vp = vp;
1112 	if (vp->v_type == VBLK || vp->v_type == VCHR)
1113 		bp->b_dev = vp->v_rdev;
1114 	else
1115 		bp->b_dev = NODEV;
1116 
1117 	/*
1118 	 * Insert onto list for new vnode.
1119 	 */
1120 	bufinsvn(bp, &vp->v_cleanblkhd);
1121 	bp->b_objlock = &vp->v_interlock;
1122 }
1123 
1124 /*
1125  * Disassociate a buffer from a vnode.
1126  */
1127 void
1128 brelvp(struct buf *bp)
1129 {
1130 	struct vnode *vp = bp->b_vp;
1131 
1132 	KASSERT(vp != NULL);
1133 	KASSERT(bp->b_objlock == &vp->v_interlock);
1134 	KASSERT(mutex_owned(&vp->v_interlock));
1135 	KASSERT(mutex_owned(&bufcache_lock));
1136 	KASSERT((bp->b_cflags & BC_BUSY) != 0);
1137 	KASSERT(!cv_has_waiters(&bp->b_done));
1138 
1139 	/*
1140 	 * Delete from old vnode list, if on one.
1141 	 */
1142 	if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
1143 		bufremvn(bp);
1144 
1145 	if (TAILQ_EMPTY(&vp->v_uobj.memq) && (vp->v_iflag & VI_ONWORKLST) &&
1146 	    LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
1147 		vp->v_iflag &= ~VI_WRMAPDIRTY;
1148 		vn_syncer_remove_from_worklist(vp);
1149 	}
1150 
1151 	bp->b_objlock = &buffer_lock;
1152 	bp->b_vp = NULL;
1153 	holdrelel(vp);
1154 }
1155 
1156 /*
1157  * Reassign a buffer from one vnode list to another.
1158  * The list reassignment must be within the same vnode.
1159  * Used to assign file specific control information
1160  * (indirect blocks) to the list to which they belong.
1161  */
1162 void
1163 reassignbuf(struct buf *bp, struct vnode *vp)
1164 {
1165 	struct buflists *listheadp;
1166 	int delayx;
1167 
1168 	KASSERT(mutex_owned(&bufcache_lock));
1169 	KASSERT(bp->b_objlock == &vp->v_interlock);
1170 	KASSERT(mutex_owned(&vp->v_interlock));
1171 	KASSERT((bp->b_cflags & BC_BUSY) != 0);
1172 
1173 	/*
1174 	 * Delete from old vnode list, if on one.
1175 	 */
1176 	if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
1177 		bufremvn(bp);
1178 
1179 	/*
1180 	 * If dirty, put on list of dirty buffers;
1181 	 * otherwise insert onto list of clean buffers.
1182 	 */
1183 	if ((bp->b_oflags & BO_DELWRI) == 0) {
1184 		listheadp = &vp->v_cleanblkhd;
1185 		if (TAILQ_EMPTY(&vp->v_uobj.memq) &&
1186 		    (vp->v_iflag & VI_ONWORKLST) &&
1187 		    LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
1188 			vp->v_iflag &= ~VI_WRMAPDIRTY;
1189 			vn_syncer_remove_from_worklist(vp);
1190 		}
1191 	} else {
1192 		listheadp = &vp->v_dirtyblkhd;
1193 		if ((vp->v_iflag & VI_ONWORKLST) == 0) {
1194 			switch (vp->v_type) {
1195 			case VDIR:
1196 				delayx = dirdelay;
1197 				break;
1198 			case VBLK:
1199 				if (vp->v_specmountpoint != NULL) {
1200 					delayx = metadelay;
1201 					break;
1202 				}
1203 				/* fall through */
1204 			default:
1205 				delayx = filedelay;
1206 				break;
1207 			}
1208 			if (!vp->v_mount ||
1209 			    (vp->v_mount->mnt_flag & MNT_ASYNC) == 0)
1210 				vn_syncer_add_to_worklist(vp, delayx);
1211 		}
1212 	}
1213 	bufinsvn(bp, listheadp);
1214 }
1215 
1216 /*
1217  * Create a vnode for a device.
1218  * Used by bdevvp (block device) for root file system etc.,
1219  * and by cdevvp (character device) for console and kernfs.
1220  */
1221 static int
1222 getdevvp(dev_t dev, vnode_t **vpp, enum vtype type)
1223 {
1224 	vnode_t *vp;
1225 	vnode_t *nvp;
1226 	int error;
1227 
1228 	if (dev == NODEV) {
1229 		*vpp = NULL;
1230 		return (0);
1231 	}
1232 	error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp);
1233 	if (error) {
1234 		*vpp = NULL;
1235 		return (error);
1236 	}
1237 	vp = nvp;
1238 	vp->v_type = type;
1239 	vp->v_vflag |= VV_MPSAFE;
1240 	uvm_vnp_setsize(vp, 0);
1241 	spec_node_init(vp, dev);
1242 	*vpp = vp;
1243 	return (0);
1244 }
1245 
1246 /*
1247  * Try to gain a reference to a vnode, without acquiring its interlock.
1248  * The caller must hold a lock that will prevent the vnode from being
1249  * recycled or freed.
1250  */
1251 bool
1252 vtryget(vnode_t *vp)
1253 {
1254 	u_int use, next;
1255 
1256 	/*
1257 	 * If the vnode is being freed, don't make life any harder
1258 	 * for vclean() by adding another reference without waiting.
1259 	 * This is not strictly necessary, but we'll do it anyway.
1260 	 */
1261 	if (__predict_false((vp->v_iflag & (VI_XLOCK | VI_FREEING)) != 0)) {
1262 		return false;
1263 	}
1264 	for (use = vp->v_usecount;; use = next) {
1265 		if (use == 0 || __predict_false((use & VC_XLOCK) != 0)) {
1266 			/* Need interlock held if first reference. */
1267 			return false;
1268 		}
1269 		next = atomic_cas_uint(&vp->v_usecount, use, use + 1);
1270 		if (__predict_true(next == use)) {
1271 			return true;
1272 		}
1273 	}
1274 }
1275 
1276 /*
1277  * Grab a particular vnode from the free list, increment its
1278  * reference count and lock it. If the vnode lock bit is set the
1279  * vnode is being eliminated in vgone. In that case, we can not
1280  * grab the vnode, so the process is awakened when the transition is
1281  * completed, and an error returned to indicate that the vnode is no
1282  * longer usable (possibly having been changed to a new file system type).
1283  */
1284 int
1285 vget(vnode_t *vp, int flags)
1286 {
1287 	int error;
1288 
1289 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
1290 
1291 	if ((flags & LK_INTERLOCK) == 0)
1292 		mutex_enter(&vp->v_interlock);
1293 
1294 	/*
1295 	 * Before adding a reference, we must remove the vnode
1296 	 * from its freelist.
1297 	 */
1298 	if (vp->v_usecount == 0) {
1299 		vremfree(vp);
1300 		vp->v_usecount = 1;
1301 	} else {
1302 		atomic_inc_uint(&vp->v_usecount);
1303 	}
1304 
1305 	/*
1306 	 * If the vnode is in the process of being cleaned out for
1307 	 * another use, we wait for the cleaning to finish and then
1308 	 * return failure.  Cleaning is determined by checking if
1309 	 * the VI_XLOCK or VI_FREEING flags are set.
1310 	 */
1311 	if ((vp->v_iflag & (VI_XLOCK | VI_FREEING)) != 0) {
1312 		if ((flags & LK_NOWAIT) != 0) {
1313 			vrelel(vp, 0);
1314 			return EBUSY;
1315 		}
1316 		vwait(vp, VI_XLOCK | VI_FREEING);
1317 		vrelel(vp, 0);
1318 		return ENOENT;
1319 	}
1320 
1321 	if ((vp->v_iflag & VI_INACTNOW) != 0) {
1322 		/*
1323 		 * If it's being deactivated, wait for it to complete.
1324 		 * Make sure to not return a clean vnode.
1325 		 */
1326 		 if ((flags & LK_NOWAIT) != 0) {
1327 			vrelel(vp, 0);
1328 			return EBUSY;
1329 		}
1330 		vwait(vp, VI_INACTNOW);
1331 		if ((vp->v_iflag & VI_CLEAN) != 0) {
1332 			vrelel(vp, 0);
1333 			return ENOENT;
1334 		}
1335 	}
1336 	if (flags & LK_TYPE_MASK) {
1337 		error = vn_lock(vp, flags | LK_INTERLOCK);
1338 		if (error != 0) {
1339 			vrele(vp);
1340 		}
1341 		return error;
1342 	}
1343 	mutex_exit(&vp->v_interlock);
1344 	return 0;
1345 }
1346 
1347 /*
1348  * vput(), just unlock and vrele()
1349  */
1350 void
1351 vput(vnode_t *vp)
1352 {
1353 
1354 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
1355 
1356 	VOP_UNLOCK(vp, 0);
1357 	vrele(vp);
1358 }
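
/*
 * A sketch of the usual vget()/vput() pairing: take a reference and the
 * vnode lock, perform the operation, then unlock and release in one
 * call.  The VOP shown is only an example; the block is not compiled.
 */
#if 0
static int
example_use_vnode(vnode_t *vp, kauth_cred_t cred)
{
	struct vattr va;
	int error;

	error = vget(vp, LK_EXCLUSIVE);
	if (error != 0)
		return error;
	error = VOP_GETATTR(vp, &va, cred);
	vput(vp);
	return error;
}
#endif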
1359 
1360 /*
1361  * Try to drop a reference on a vnode.  Abort if we are releasing the
1362  * last reference.  Note: this _must_ succeed if not the last reference.
1363  */
1364 static inline bool
1365 vtryrele(vnode_t *vp)
1366 {
1367 	u_int use, next;
1368 
1369 	for (use = vp->v_usecount;; use = next) {
1370 		if (use == 1) {
1371 			return false;
1372 		}
1373 		KASSERT((use & VC_MASK) > 1);
1374 		next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
1375 		if (__predict_true(next == use)) {
1376 			return true;
1377 		}
1378 	}
1379 }
1380 
1381 /*
1382  * Vnode release.  If reference count drops to zero, call inactive
1383  * routine and either return to freelist or free to the pool.
1384  */
1385 void
1386 vrelel(vnode_t *vp, int flags)
1387 {
1388 	bool recycle, defer;
1389 	int error;
1390 
1391 	KASSERT(mutex_owned(&vp->v_interlock));
1392 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
1393 	KASSERT(vp->v_freelisthd == NULL);
1394 
1395 	if (__predict_false(vp->v_op == dead_vnodeop_p &&
1396 	    (vp->v_iflag & (VI_CLEAN|VI_XLOCK)) == 0)) {
1397 		vpanic(vp, "dead but not clean");
1398 	}
1399 
1400 	/*
1401 	 * If not the last reference, just drop the reference count
1402 	 * and unlock.
1403 	 */
1404 	if (vtryrele(vp)) {
1405 		vp->v_iflag |= VI_INACTREDO;
1406 		mutex_exit(&vp->v_interlock);
1407 		return;
1408 	}
1409 	if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
1410 		vpanic(vp, "vrelel: bad ref count");
1411 	}
1412 
1413 	KASSERT((vp->v_iflag & VI_XLOCK) == 0);
1414 
1415 	/*
1416 	 * If not clean, deactivate the vnode, but preserve
1417 	 * our reference across the call to VOP_INACTIVE().
1418 	 */
1419  retry:
1420 	if ((vp->v_iflag & VI_CLEAN) == 0) {
1421 		recycle = false;
1422 		vp->v_iflag |= VI_INACTNOW;
1423 
1424 		/*
1425 		 * XXX This ugly block can be largely eliminated if
1426 		 * locking is pushed down into the file systems.
1427 		 */
1428 		if (curlwp == uvm.pagedaemon_lwp) {
1429 			/* The pagedaemon can't wait around; defer. */
1430 			defer = true;
1431 		} else if (curlwp == vrele_lwp) {
1432 			/*
1433 			 * We have to try harder. But we can't sleep
1434 			 * with VI_INACTNOW as vget() may be waiting on it.
1435 			 */
1436 			vp->v_iflag &= ~(VI_INACTREDO|VI_INACTNOW);
1437 			cv_broadcast(&vp->v_cv);
1438 			error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK |
1439 			    LK_RETRY);
1440 			if (error != 0) {
1441 				/* XXX */
1442 				vpanic(vp, "vrele: unable to lock %p");
1443 			}
1444 			mutex_enter(&vp->v_interlock);
1445 			/*
1446 			 * If we did get another reference while
1447 			 * sleeping, don't try to inactivate it yet.
1448 			 */
1449 			if (__predict_false(vtryrele(vp))) {
1450 				VOP_UNLOCK(vp, 0);
1451 				mutex_exit(&vp->v_interlock);
1452 				return;
1453 			}
1454 			vp->v_iflag |= VI_INACTNOW;
1455 			mutex_exit(&vp->v_interlock);
1456 			defer = false;
1457 		} else if ((vp->v_iflag & VI_LAYER) != 0) {
1458 			/*
1459 			 * Acquiring the stack's lock in vclean() even
1460 			 * for an honest vput/vrele is dangerous because
1461 			 * our caller may hold other vnode locks; defer.
1462 			 */
1463 			defer = true;
1464 		} else {
1465 			/* If we can't acquire the lock, then defer. */
1466 			vp->v_iflag &= ~VI_INACTREDO;
1467 			error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK |
1468 			    LK_NOWAIT);
1469 			if (error != 0) {
1470 				defer = true;
1471 				mutex_enter(&vp->v_interlock);
1472 			} else {
1473 				defer = false;
1474 			}
1475 		}
1476 
1477 		if (defer) {
1478 			/*
1479 			 * Defer reclaim to the kthread; it's not safe to
1480 			 * clean it here.  We donate it our last reference.
1481 			 */
1482 			KASSERT(mutex_owned(&vp->v_interlock));
1483 			KASSERT((vp->v_iflag & VI_INACTPEND) == 0);
1484 			vp->v_iflag &= ~VI_INACTNOW;
1485 			vp->v_iflag |= VI_INACTPEND;
1486 			mutex_enter(&vrele_lock);
1487 			TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist);
1488 			if (++vrele_pending > (desiredvnodes >> 8))
1489 				cv_signal(&vrele_cv);
1490 			mutex_exit(&vrele_lock);
1491 			cv_broadcast(&vp->v_cv);
1492 			mutex_exit(&vp->v_interlock);
1493 			return;
1494 		}
1495 
1496 #ifdef DIAGNOSTIC
1497 		if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
1498 		    vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
1499 			vprint("vrelel: missing VOP_CLOSE()", vp);
1500 		}
1501 #endif
1502 
1503 		/*
1504 		 * The vnode can gain another reference while being
1505 		 * deactivated.  If VOP_INACTIVE() indicates that
1506 		 * the described file has been deleted, then recycle
1507 		 * the vnode irrespective of additional references.
1508 		 * Another thread may be waiting to re-use the on-disk
1509 		 * inode.
1510 		 *
1511 		 * Note that VOP_INACTIVE() will drop the vnode lock.
1512 		 */
1513 		VOP_INACTIVE(vp, &recycle);
1514 		mutex_enter(&vp->v_interlock);
1515 		vp->v_iflag &= ~VI_INACTNOW;
1516 		cv_broadcast(&vp->v_cv);
1517 		if (!recycle) {
1518 			if (vtryrele(vp)) {
1519 				mutex_exit(&vp->v_interlock);
1520 				return;
1521 			}
1522 
1523 			/*
1524 			 * If we grew another reference while
1525 			 * VOP_INACTIVE() was underway, retry.
1526 			 */
1527 			if ((vp->v_iflag & VI_INACTREDO) != 0) {
1528 				goto retry;
1529 			}
1530 		}
1531 
1532 		/* Take care of space accounting. */
1533 		if (vp->v_iflag & VI_EXECMAP) {
1534 			atomic_add_int(&uvmexp.execpages,
1535 			    -vp->v_uobj.uo_npages);
1536 			atomic_add_int(&uvmexp.filepages,
1537 			    vp->v_uobj.uo_npages);
1538 		}
1539 		vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
1540 		vp->v_vflag &= ~VV_MAPPED;
1541 
1542 		/*
1543 		 * Recycle the vnode if the file is now unused (unlinked),
1544 		 * otherwise just free it.
1545 		 */
1546 		if (recycle) {
1547 			vclean(vp, DOCLOSE);
1548 		}
1549 		KASSERT(vp->v_usecount > 0);
1550 	}
1551 
1552 	if (atomic_dec_uint_nv(&vp->v_usecount) != 0) {
1553 		/* Gained another reference while being reclaimed. */
1554 		mutex_exit(&vp->v_interlock);
1555 		return;
1556 	}
1557 
1558 	if ((vp->v_iflag & VI_CLEAN) != 0) {
1559 		/*
1560 		 * It's clean so destroy it.  It isn't referenced
1561 		 * anywhere since it has been reclaimed.
1562 		 */
1563 		KASSERT(vp->v_holdcnt == 0);
1564 		KASSERT(vp->v_writecount == 0);
1565 		mutex_exit(&vp->v_interlock);
1566 		insmntque(vp, NULL);
1567 		if (vp->v_type == VBLK || vp->v_type == VCHR) {
1568 			spec_node_destroy(vp);
1569 		}
1570 		vnfree(vp);
1571 	} else {
1572 		/*
1573 		 * Otherwise, put it back onto the freelist.  It
1574 		 * can't be destroyed while still associated with
1575 		 * a file system.
1576 		 */
1577 		mutex_enter(&vnode_free_list_lock);
1578 		if (vp->v_holdcnt > 0) {
1579 			vp->v_freelisthd = &vnode_hold_list;
1580 		} else {
1581 			vp->v_freelisthd = &vnode_free_list;
1582 		}
1583 		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
1584 		mutex_exit(&vnode_free_list_lock);
1585 		mutex_exit(&vp->v_interlock);
1586 	}
1587 }
1588 
1589 void
1590 vrele(vnode_t *vp)
1591 {
1592 
1593 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
1594 
1595 	if ((vp->v_iflag & VI_INACTNOW) == 0 && vtryrele(vp)) {
1596 		return;
1597 	}
1598 	mutex_enter(&vp->v_interlock);
1599 	vrelel(vp, 0);
1600 }
1601 
1602 static void
1603 vrele_thread(void *cookie)
1604 {
1605 	vnode_t *vp;
1606 
1607 	for (;;) {
1608 		mutex_enter(&vrele_lock);
1609 		while (TAILQ_EMPTY(&vrele_list)) {
1610 			vrele_gen++;
1611 			cv_broadcast(&vrele_cv);
1612 			cv_timedwait(&vrele_cv, &vrele_lock, hz);
1613 		}
1614 		vp = TAILQ_FIRST(&vrele_list);
1615 		TAILQ_REMOVE(&vrele_list, vp, v_freelist);
1616 		vrele_pending--;
1617 		mutex_exit(&vrele_lock);
1618 
1619 		/*
1620 		 * If not the last reference, then ignore the vnode
1621 		 * and look for more work.
1622 		 */
1623 		mutex_enter(&vp->v_interlock);
1624 		KASSERT((vp->v_iflag & VI_INACTPEND) != 0);
1625 		vp->v_iflag &= ~VI_INACTPEND;
1626 		vrelel(vp, 0);
1627 	}
1628 }
1629 
1630 /*
1631  * Page or buffer structure gets a reference.
1632  * Called with v_interlock held.
1633  */
1634 void
1635 vholdl(vnode_t *vp)
1636 {
1637 
1638 	KASSERT(mutex_owned(&vp->v_interlock));
1639 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
1640 
1641 	if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) {
1642 		mutex_enter(&vnode_free_list_lock);
1643 		KASSERT(vp->v_freelisthd == &vnode_free_list);
1644 		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
1645 		vp->v_freelisthd = &vnode_hold_list;
1646 		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
1647 		mutex_exit(&vnode_free_list_lock);
1648 	}
1649 }
1650 
1651 /*
1652  * Page or buffer structure frees a reference.
1653  * Called with v_interlock held.
1654  */
1655 void
1656 holdrelel(vnode_t *vp)
1657 {
1658 
1659 	KASSERT(mutex_owned(&vp->v_interlock));
1660 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
1661 
1662 	if (vp->v_holdcnt <= 0) {
1663 		vpanic(vp, "holdrelel: holdcnt vp %p");
1664 	}
1665 
1666 	vp->v_holdcnt--;
1667 	if (vp->v_holdcnt == 0 && vp->v_usecount == 0) {
1668 		mutex_enter(&vnode_free_list_lock);
1669 		KASSERT(vp->v_freelisthd == &vnode_hold_list);
1670 		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
1671 		vp->v_freelisthd = &vnode_free_list;
1672 		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
1673 		mutex_exit(&vnode_free_list_lock);
1674 	}
1675 }
1676 
1677 /*
1678  * Vnode reference, where a reference is already held by some other
1679  * object (for example, a file structure).
1680  */
1681 void
1682 vref(vnode_t *vp)
1683 {
1684 
1685 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
1686 	KASSERT(vp->v_usecount != 0);
1687 
1688 	atomic_inc_uint(&vp->v_usecount);
1689 }
1690 
1691 /*
1692  * Remove any vnodes in the vnode table belonging to mount point mp.
1693  *
1694  * If FORCECLOSE is not specified, there should not be any active vnodes;
1695  * return an error if any are found (nb: this is a user error, not a
1696  * system error). If FORCECLOSE is specified, detach any active vnodes
1697  * that are found.
1698  *
1699  * If WRITECLOSE is set, only flush out regular file vnodes open for
1700  * writing.
1701  *
1702  * SKIPSYSTEM causes any vnodes marked V_SYSTEM to be skipped.
1703  */
1704 #ifdef DEBUG
1705 int busyprt = 0;	/* print out busy vnodes */
1706 struct ctldebug debug1 = { "busyprt", &busyprt };
1707 #endif
1708 
1709 static vnode_t *
1710 vflushnext(vnode_t *mvp, int *when)
1711 {
1712 
1713 	if (hardclock_ticks > *when) {
1714 		mutex_exit(&mntvnode_lock);
1715 		yield();
1716 		mutex_enter(&mntvnode_lock);
1717 		*when = hardclock_ticks + hz / 10;
1718 	}
1719 
1720 	return vunmark(mvp);
1721 }
1722 
1723 int
1724 vflush(struct mount *mp, vnode_t *skipvp, int flags)
1725 {
1726 	vnode_t *vp, *mvp;
1727 	int busy = 0, when = 0, gen;
1728 
1729 	/*
1730 	 * First, flush out any vnode references from vrele_list.
1731 	 */
1732 	mutex_enter(&vrele_lock);
1733 	gen = vrele_gen;
1734 	while (vrele_pending && gen == vrele_gen) {
1735 		cv_broadcast(&vrele_cv);
1736 		cv_wait(&vrele_cv, &vrele_lock);
1737 	}
1738 	mutex_exit(&vrele_lock);
1739 
1740 	/* Allocate a marker vnode. */
1741 	if ((mvp = vnalloc(mp)) == NULL)
1742 		return (ENOMEM);
1743 
1744 	/*
1745 	 * NOTE: not using TAILQ_FOREACH here since vgone() and vclean()
1746 	 * are called within this loop.
1747 	 */
1748 	mutex_enter(&mntvnode_lock);
1749 	for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp != NULL;
1750 	    vp = vflushnext(mvp, &when)) {
1751 		vmark(mvp, vp);
1752 		if (vp->v_mount != mp || vismarker(vp))
1753 			continue;
1754 		/*
1755 		 * Skip over a selected vnode.
1756 		 */
1757 		if (vp == skipvp)
1758 			continue;
1759 		mutex_enter(&vp->v_interlock);
1760 		/*
1761 		 * Ignore clean but still referenced vnodes.
1762 		 */
1763 		if ((vp->v_iflag & VI_CLEAN) != 0) {
1764 			mutex_exit(&vp->v_interlock);
1765 			continue;
1766 		}
1767 		/*
1768 		 * Skip over vnodes marked VV_SYSTEM.
1769 		 */
1770 		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
1771 			mutex_exit(&vp->v_interlock);
1772 			continue;
1773 		}
1774 		/*
1775 		 * If WRITECLOSE is set, only flush out regular file
1776 		 * vnodes open for writing.
1777 		 */
1778 		if ((flags & WRITECLOSE) &&
1779 		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
1780 			mutex_exit(&vp->v_interlock);
1781 			continue;
1782 		}
1783 		/*
1784 		 * With v_usecount == 0, all we need to do is clear
1785 		 * out the vnode data structures and we are done.
1786 		 */
1787 		if (vp->v_usecount == 0) {
1788 			mutex_exit(&mntvnode_lock);
1789 			vremfree(vp);
1790 			vp->v_usecount = 1;
1791 			vclean(vp, DOCLOSE);
1792 			vrelel(vp, 0);
1793 			mutex_enter(&mntvnode_lock);
1794 			continue;
1795 		}
1796 		/*
1797 		 * If FORCECLOSE is set, forcibly close the vnode.
1798 		 * For block or character devices, revert to an
1799 		 * anonymous device.  For all other files, just
1800 		 * kill them.
1801 		 */
1802 		if (flags & FORCECLOSE) {
1803 			mutex_exit(&mntvnode_lock);
1804 			atomic_inc_uint(&vp->v_usecount);
1805 			if (vp->v_type != VBLK && vp->v_type != VCHR) {
1806 				vclean(vp, DOCLOSE);
1807 				vrelel(vp, 0);
1808 			} else {
1809 				vclean(vp, 0);
1810 				vp->v_op = spec_vnodeop_p; /* XXXSMP */
1811 				mutex_exit(&vp->v_interlock);
1812 				/*
1813 				 * The vnode isn't clean, but still resides
1814 				 * on the mount list.  Remove it. XXX This
1815 				 * is a bit dodgy.
1816 				 */
1817 				insmntque(vp, NULL);
1818 				vrele(vp);
1819 			}
1820 			mutex_enter(&mntvnode_lock);
1821 			continue;
1822 		}
1823 #ifdef DEBUG
1824 		if (busyprt)
1825 			vprint("vflush: busy vnode", vp);
1826 #endif
1827 		mutex_exit(&vp->v_interlock);
1828 		busy++;
1829 	}
1830 	mutex_exit(&mntvnode_lock);
1831 	vnfree(mvp);
1832 	if (busy)
1833 		return (EBUSY);
1834 	return (0);
1835 }
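
/*
 * A sketch of how an unmount path typically drives vflush(): skip the
 * file system's own root vnode and pass FORCECLOSE only for a forced
 * unmount.  "fs_rootvp" is a hypothetical stand-in; the block is not
 * compiled.
 */
#if 0
static int
example_fs_flushfiles(struct mount *mp, int mntflags, vnode_t *fs_rootvp)
{
	int flags = (mntflags & MNT_FORCE) ? FORCECLOSE : 0;

	return vflush(mp, fs_rootvp, SKIPSYSTEM | flags);
}
#endif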
1836 
1837 /*
1838  * Disassociate the underlying file system from a vnode.
1839  *
1840  * Must be called with the interlock held, and will return with it held.
1841  */
1842 void
1843 vclean(vnode_t *vp, int flags)
1844 {
1845 	lwp_t *l = curlwp;
1846 	bool recycle, active;
1847 	int error;
1848 
1849 	KASSERT(mutex_owned(&vp->v_interlock));
1850 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
1851 	KASSERT(vp->v_usecount != 0);
1852 
1853 	/* If cleaning is already in progress wait until done and return. */
1854 	if (vp->v_iflag & VI_XLOCK) {
1855 		vwait(vp, VI_XLOCK);
1856 		return;
1857 	}
1858 
1859 	/* If already clean, nothing to do. */
1860 	if ((vp->v_iflag & VI_CLEAN) != 0) {
1861 		return;
1862 	}
1863 
1864 	/*
1865 	 * Prevent the vnode from being recycled or brought into use
1866 	 * while we clean it out.
1867 	 */
1868 	vp->v_iflag |= VI_XLOCK;
1869 	if (vp->v_iflag & VI_EXECMAP) {
1870 		atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
1871 		atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
1872 	}
1873 	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
1874 	active = (vp->v_usecount > 1);
1875 
1876 	/* XXXAD should not lock vnode under layer */
1877 	VOP_LOCK(vp, LK_EXCLUSIVE | LK_INTERLOCK);
1878 
1879 	/*
1880 	 * Clean out any cached data associated with the vnode.
1881 	 * If purging an active vnode, it must be closed and
1882 	 * deactivated before being reclaimed. Note that the
1883 	 * VOP_INACTIVE will unlock the vnode.
1884 	 */
1885 	if (flags & DOCLOSE) {
1886 		error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
1887 		if (error != 0) {
1888 			/* XXX, fix vn_start_write's grab of mp and use that. */
1889 
1890 			if (wapbl_vphaswapbl(vp))
1891 				WAPBL_DISCARD(wapbl_vptomp(vp));
1892 			error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
1893 		}
1894 		KASSERT(error == 0);
1895 		KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1896 		if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
1897 			 spec_node_revoke(vp);
1898 		}
1899 	}
1900 	if (active) {
1901 		VOP_INACTIVE(vp, &recycle);
1902 	} else {
1903 		/*
1904 		 * Any other processes trying to obtain this lock must first
1905 		 * wait for VI_XLOCK to clear, then call the new lock operation.
1906 		 */
1907 		VOP_UNLOCK(vp, 0);
1908 	}
1909 
1910 	/* Disassociate the underlying file system from the vnode. */
1911 	if (VOP_RECLAIM(vp)) {
1912 		vpanic(vp, "vclean: cannot reclaim");
1913 	}
1914 
1915 	KASSERT(vp->v_uobj.uo_npages == 0);
1916 	if (vp->v_type == VREG && vp->v_ractx != NULL) {
1917 		uvm_ra_freectx(vp->v_ractx);
1918 		vp->v_ractx = NULL;
1919 	}
1920 	cache_purge(vp);
1921 
1922 	/* Done with purge, notify sleepers of the grim news. */
1923 	mutex_enter(&vp->v_interlock);
1924 	vp->v_op = dead_vnodeop_p;
1925 	vp->v_tag = VT_NON;
1926 	vp->v_vnlock = &vp->v_lock;
1927 	KNOTE(&vp->v_klist, NOTE_REVOKE);
1928 	vp->v_iflag &= ~(VI_XLOCK | VI_FREEING);
1929 	vp->v_vflag &= ~VV_LOCKSWORK;
1930 	if ((flags & DOCLOSE) != 0) {
1931 		vp->v_iflag |= VI_CLEAN;
1932 	}
1933 	cv_broadcast(&vp->v_cv);
1934 
1935 	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1936 }
1937 
1938 /*
1939  * Recycle an unused vnode to the front of the free list.
1940  * Release the passed interlock if the vnode will be recycled.
1941  */
1942 int
1943 vrecycle(vnode_t *vp, kmutex_t *inter_lkp, struct lwp *l)
1944 {
1945 
1946 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
1947 
1948 	mutex_enter(&vp->v_interlock);
1949 	if (vp->v_usecount != 0) {
1950 		mutex_exit(&vp->v_interlock);
1951 		return (0);
1952 	}
1953 	if (inter_lkp)
1954 		mutex_exit(inter_lkp);
1955 	vremfree(vp);
1956 	vp->v_usecount = 1;
1957 	vclean(vp, DOCLOSE);
1958 	vrelel(vp, 0);
1959 	return (1);
1960 }
1961 
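/*
 * Editor's sketch: a caller that holds no list lock of its own can try
 * to reclaim an unused vnode directly; a non-zero return means the
 * vnode was cleaned and placed back at the front of the free list:
 *
 *	if (vrecycle(vp, NULL, curlwp))
 *		return 0;
 */
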
1962 /*
1963  * Eliminate all activity associated with a vnode in preparation for
1964  * reuse.  Drops a reference from the vnode.
1965  */
1966 void
1967 vgone(vnode_t *vp)
1968 {
1969 
1970 	mutex_enter(&vp->v_interlock);
1971 	vclean(vp, DOCLOSE);
1972 	vrelel(vp, 0);
1973 }
1974 
1975 /*
1976  * Lookup a vnode by device number.
1977  */
1978 int
1979 vfinddev(dev_t dev, enum vtype type, vnode_t **vpp)
1980 {
1981 	vnode_t *vp;
1982 	int rc = 0;
1983 
1984 	mutex_enter(&device_lock);
1985 	for (vp = specfs_hash[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
1986 		if (dev != vp->v_rdev || type != vp->v_type)
1987 			continue;
1988 		*vpp = vp;
1989 		rc = 1;
1990 		break;
1991 	}
1992 	mutex_exit(&device_lock);
1993 	return (rc);
1994 }
1995 
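/*
 * Editor's sketch: rawdev_mounted() later in this file uses vfinddev()
 * to map a character device back onto its block alias.  Note that the
 * vnode is returned unreferenced:
 *
 *	vnode_t *bvp = NULL;
 *	if (vfinddev(makedev(maj, mn), VBLK, &bvp))
 *		(bvp is the hashed special vnode for that device)
 */
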
1996 /*
1997  * Revoke all the vnodes corresponding to the specified minor number
1998  * range (endpoints inclusive) of the specified major.
1999  */
2000 void
2001 vdevgone(int maj, int minl, int minh, enum vtype type)
2002 {
2003 	vnode_t *vp, **vpp;
2004 	dev_t dev;
2005 	int mn;
2006 
2007 	vp = NULL;	/* XXX gcc */
2008 
2009 	mutex_enter(&device_lock);
2010 	for (mn = minl; mn <= minh; mn++) {
2011 		dev = makedev(maj, mn);
2012 		vpp = &specfs_hash[SPECHASH(dev)];
2013 		for (vp = *vpp; vp != NULL;) {
2014 			mutex_enter(&vp->v_interlock);
2015 			if ((vp->v_iflag & VI_CLEAN) != 0 ||
2016 			    dev != vp->v_rdev || type != vp->v_type) {
2017 				mutex_exit(&vp->v_interlock);
2018 				vp = vp->v_specnext;
2019 				continue;
2020 			}
2021 			mutex_exit(&device_lock);
2022 			if (vget(vp, LK_INTERLOCK) == 0) {
2023 				VOP_REVOKE(vp, REVOKEALL);
2024 				vrele(vp);
2025 			}
2026 			mutex_enter(&device_lock);
2027 			vp = *vpp;
2028 		}
2029 	}
2030 	mutex_exit(&device_lock);
2031 }
2032 
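/*
 * Editor's sketch (hedged): device detach code typically revokes any
 * open vnodes for its unit by calling vdevgone() once per device
 * class; "bmaj", "cmaj" and "unit" below are illustrative names only:
 *
 *	vdevgone(bmaj, unit, unit, VBLK);
 *	vdevgone(cmaj, unit, unit, VCHR);
 */
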
2033 /*
2034  * Calculate the total number of references to a special device.
2035  */
2036 int
2037 vcount(vnode_t *vp)
2038 {
2039 	int count;
2040 
2041 	mutex_enter(&device_lock);
2042 	mutex_enter(&vp->v_interlock);
2043 	if (vp->v_specnode == NULL) {
2044 		count = vp->v_usecount - ((vp->v_iflag & VI_INACTPEND) != 0);
2045 		mutex_exit(&vp->v_interlock);
2046 		mutex_exit(&device_lock);
2047 		return (count);
2048 	}
2049 	mutex_exit(&vp->v_interlock);
2050 	count = vp->v_specnode->sn_dev->sd_opencnt;
2051 	mutex_exit(&device_lock);
2052 	return (count);
2053 }
2054 
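/*
 * Editor's sketch (hedged): a device close routine might consult
 * vcount() to decide whether this is the last close of the device
 * across all of its aliases:
 *
 *	if (vcount(vp) > 1)
 *		return 0;	(other opens remain; nothing to tear down)
 */
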
2055 /*
2056  * Eliminate all activity associated with the requested vnode
2057  * and with all vnodes aliased to the requested vnode.
2058  */
2059 void
2060 vrevoke(vnode_t *vp)
2061 {
2062 	vnode_t *vq, **vpp;
2063 	enum vtype type;
2064 	dev_t dev;
2065 
2066 	KASSERT(vp->v_usecount > 0);
2067 
2068 	mutex_enter(&vp->v_interlock);
2069 	if ((vp->v_iflag & VI_CLEAN) != 0) {
2070 		mutex_exit(&vp->v_interlock);
2071 		return;
2072 	} else if (vp->v_type != VBLK && vp->v_type != VCHR) {
2073 		atomic_inc_uint(&vp->v_usecount);
2074 		vclean(vp, DOCLOSE);
2075 		vrelel(vp, 0);
2076 		return;
2077 	} else {
2078 		dev = vp->v_rdev;
2079 		type = vp->v_type;
2080 		mutex_exit(&vp->v_interlock);
2081 	}
2082 
2083 	vpp = &specfs_hash[SPECHASH(dev)];
2084 	mutex_enter(&device_lock);
2085 	for (vq = *vpp; vq != NULL;) {
2086 		/* If clean or being cleaned, then ignore it. */
2087 		mutex_enter(&vq->v_interlock);
2088 		if ((vq->v_iflag & (VI_CLEAN | VI_XLOCK)) != 0 ||
2089 		    vq->v_rdev != dev || vq->v_type != type) {
2090 			mutex_exit(&vq->v_interlock);
2091 			vq = vq->v_specnext;
2092 			continue;
2093 		}
2094 		mutex_exit(&device_lock);
2095 		if (vq->v_usecount == 0) {
2096 			vremfree(vq);
2097 			vq->v_usecount = 1;
2098 		} else {
2099 			atomic_inc_uint(&vq->v_usecount);
2100 		}
2101 		vclean(vq, DOCLOSE);
2102 		vrelel(vq, 0);
2103 		mutex_enter(&device_lock);
2104 		vq = *vpp;
2105 	}
2106 	mutex_exit(&device_lock);
2107 }
2108 
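/*
 * Editor's note: vrevoke() does the heavy lifting behind revoke(2)-style
 * teardown; a typical caller (for example a generic VOP_REVOKE
 * implementation) holds a reference and simply calls it (sketch only):
 *
 *	KASSERT(vp->v_usecount > 0);
 *	vrevoke(vp);
 */
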
2109 /*
2110  * sysctl helper routine to return the list of supported fstypes
2111  */
2112 int
2113 sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS)
2114 {
2115 	char bf[sizeof(((struct statvfs *)NULL)->f_fstypename)];
2116 	char *where = oldp;
2117 	struct vfsops *v;
2118 	size_t needed, left, slen;
2119 	int error, first;
2120 
2121 	if (newp != NULL)
2122 		return (EPERM);
2123 	if (namelen != 0)
2124 		return (EINVAL);
2125 
2126 	first = 1;
2127 	error = 0;
2128 	needed = 0;
2129 	left = *oldlenp;
2130 
2131 	sysctl_unlock();
2132 	mutex_enter(&vfs_list_lock);
2133 	LIST_FOREACH(v, &vfs_list, vfs_list) {
2134 		if (where == NULL)
2135 			needed += strlen(v->vfs_name) + 1;
2136 		else {
2137 			memset(bf, 0, sizeof(bf));
2138 			if (first) {
2139 				strncpy(bf, v->vfs_name, sizeof(bf));
2140 				first = 0;
2141 			} else {
2142 				bf[0] = ' ';
2143 				strncpy(bf + 1, v->vfs_name, sizeof(bf) - 1);
2144 			}
2145 			bf[sizeof(bf)-1] = '\0';
2146 			slen = strlen(bf);
2147 			if (left < slen + 1)
2148 				break;
2149 			v->vfs_refcount++;
2150 			mutex_exit(&vfs_list_lock);
2151 			/* +1 to copy out the trailing NUL byte */
2152 			error = copyout(bf, where, slen + 1);
2153 			mutex_enter(&vfs_list_lock);
2154 			v->vfs_refcount--;
2155 			if (error)
2156 				break;
2157 			where += slen;
2158 			needed += slen;
2159 			left -= slen;
2160 		}
2161 	}
2162 	mutex_exit(&vfs_list_lock);
2163 	sysctl_relock();
2164 	*oldlenp = needed;
2165 	return (error);
2166 }
2167 
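/*
 * Editor's sketch: from userland the handler above can be queried with
 * sysctlbyname(3), assuming it is attached as "vfs.generic.fstypes"
 * (as the function name suggests); the result is a space-separated,
 * NUL-terminated list of file system names:
 *
 *	char buf[256];
 *	size_t len = sizeof(buf);
 *	if (sysctlbyname("vfs.generic.fstypes", buf, &len, NULL, 0) == 0)
 *		printf("%s\n", buf);
 */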
2168 
2169 int kinfo_vdebug = 1;
2170 int kinfo_vgetfailed;
2171 #define KINFO_VNODESLOP	10
2172 /*
2173  * Dump vnode list (via sysctl).
2174  * Copy out the address of each vnode followed by the vnode itself.
2175  */
2176 /* ARGSUSED */
2177 int
2178 sysctl_kern_vnode(SYSCTLFN_ARGS)
2179 {
2180 	char *where = oldp;
2181 	size_t *sizep = oldlenp;
2182 	struct mount *mp, *nmp;
2183 	vnode_t *vp, *mvp, vbuf;
2184 	char *bp = where;
2185 	char *ewhere;
2186 	int error;
2187 
2188 	if (namelen != 0)
2189 		return (EOPNOTSUPP);
2190 	if (newp != NULL)
2191 		return (EPERM);
2192 
2193 #define VPTRSZ	sizeof(vnode_t *)
2194 #define VNODESZ	sizeof(vnode_t)
2195 	if (where == NULL) {
2196 		*sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
2197 		return (0);
2198 	}
2199 	ewhere = where + *sizep;
2200 
2201 	sysctl_unlock();
2202 	mutex_enter(&mountlist_lock);
2203 	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
2204 	    mp = nmp) {
2205 		if (vfs_busy(mp, &nmp)) {
2206 			continue;
2207 		}
2208 		/* Allocate a marker vnode. */
2209 		mvp = vnalloc(mp);
2210 		/* Should never fail for mp != NULL */
2211 		KASSERT(mvp != NULL);
2212 		mutex_enter(&mntvnode_lock);
2213 		for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp;
2214 		    vp = vunmark(mvp)) {
2215 			vmark(mvp, vp);
2216 			/*
2217 			 * Check that the vp is still associated with
2218 			 * this filesystem.  RACE: could have been
2219 			 * recycled onto the same filesystem.
2220 			 */
2221 			if (vp->v_mount != mp || vismarker(vp))
2222 				continue;
2223 			if (bp + VPTRSZ + VNODESZ > ewhere) {
2224 				(void)vunmark(mvp);
2225 				mutex_exit(&mntvnode_lock);
2226 				vnfree(mvp);
2227 				vfs_unbusy(mp, false, NULL);
2228 				sysctl_relock();
2229 				*sizep = bp - where;
2230 				return (ENOMEM);
2231 			}
2232 			memcpy(&vbuf, vp, VNODESZ);
2233 			mutex_exit(&mntvnode_lock);
2234 			if ((error = copyout(&vp, bp, VPTRSZ)) ||
2235 			    (error = copyout(&vbuf, bp + VPTRSZ, VNODESZ))) {
2236 				mutex_enter(&mntvnode_lock);
2237 				(void)vunmark(mvp);
2238 				mutex_exit(&mntvnode_lock);
2239 				vnfree(mvp);
2240 				vfs_unbusy(mp, false, NULL);
2241 				sysctl_relock();
2242 				return (error);
2243 			}
2244 			bp += VPTRSZ + VNODESZ;
2245 			mutex_enter(&mntvnode_lock);
2246 		}
2247 		mutex_exit(&mntvnode_lock);
2248 		vnfree(mvp);
2249 		vfs_unbusy(mp, false, &nmp);
2250 	}
2251 	mutex_exit(&mountlist_lock);
2252 	sysctl_relock();
2253 
2254 	*sizep = bp - where;
2255 	return (0);
2256 }
2257 
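/*
 * Editor's sketch (hedged): the usual userland consumer of this handler
 * (e.g. pstat -v) performs a two-step query, first with a NULL buffer
 * to size the allocation, then again to fetch the (pointer, vnode)
 * pairs.  Assuming the traditional CTL_KERN/KERN_VNODE MIB name:
 *
 *	int mib[2] = { CTL_KERN, KERN_VNODE };
 *	size_t len;
 *	if (sysctl(mib, 2, NULL, &len, NULL, 0) == 0) {
 *		void *buf = malloc(len);
 *		(void)sysctl(mib, 2, buf, &len, NULL, 0);
 *	}
 */
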
2258 /*
2259  * Remove clean vnodes from a mountpoint's vnode list.
2260  */
2261 void
2262 vfs_scrubvnlist(struct mount *mp)
2263 {
2264 	vnode_t *vp, *nvp;
2265 
2266  retry:
2267 	mutex_enter(&mntvnode_lock);
2268 	for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
2269 		nvp = TAILQ_NEXT(vp, v_mntvnodes);
2270 		mutex_enter(&vp->v_interlock);
2271 		if ((vp->v_iflag & VI_CLEAN) != 0) {
2272 			TAILQ_REMOVE(&mp->mnt_vnodelist, vp, v_mntvnodes);
2273 			vp->v_mount = NULL;
2274 			mutex_exit(&mntvnode_lock);
2275 			mutex_exit(&vp->v_interlock);
2276 			vfs_destroy(mp);
2277 			goto retry;
2278 		}
2279 		mutex_exit(&vp->v_interlock);
2280 	}
2281 	mutex_exit(&mntvnode_lock);
2282 }
2283 
2284 /*
2285  * Check to see if a filesystem is mounted on a block device.
2286  */
2287 int
2288 vfs_mountedon(vnode_t *vp)
2289 {
2290 	vnode_t *vq;
2291 	int error = 0;
2292 
2293 	if (vp->v_type != VBLK)
2294 		return ENOTBLK;
2295 	if (vp->v_specmountpoint != NULL)
2296 		return (EBUSY);
2297 	mutex_enter(&device_lock);
2298 	for (vq = specfs_hash[SPECHASH(vp->v_rdev)]; vq != NULL;
2299 	    vq = vq->v_specnext) {
2300 		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
2301 			continue;
2302 		if (vq->v_specmountpoint != NULL) {
2303 			error = EBUSY;
2304 			break;
2305 		}
2306 	}
2307 	mutex_exit(&device_lock);
2308 	return (error);
2309 }
2310 
2311 /*
2312  * Unmount all file systems.
2313  * We traverse the list in reverse order under the assumption that doing so
2314  * will avoid needing to worry about dependencies.
2315  */
2316 bool
2317 vfs_unmountall(struct lwp *l)
2318 {
2319 	printf("unmounting file systems...");
2320 	return vfs_unmountall1(l, true, true);
2321 }
2322 
2323 static void
2324 vfs_unmount_print(struct mount *mp, const char *pfx)
2325 {
2326 	printf("%sunmounted %s on %s type %s\n", pfx,
2327 	    mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname,
2328 	    mp->mnt_stat.f_fstypename);
2329 }
2330 
2331 bool
2332 vfs_unmount_forceone(struct lwp *l)
2333 {
2334 	struct mount *mp, *nmp = NULL;
2335 	int error;
2336 
2337 	CIRCLEQ_FOREACH_REVERSE(mp, &mountlist, mnt_list) {
2338 		if (nmp == NULL || mp->mnt_gen > nmp->mnt_gen)
2339 			nmp = mp;
2340 	}
2341 
2342 	if (nmp == NULL)
2343 		return false;
2344 
2345 #ifdef DEBUG
2346 	printf("\nforcefully unmounting %s (%s)...",
2347 	    nmp->mnt_stat.f_mntonname, nmp->mnt_stat.f_mntfromname);
2348 #endif
2349 	atomic_inc_uint(&nmp->mnt_refcnt);
2350 	if ((error = dounmount(nmp, MNT_FORCE, l)) == 0) {
2351 		vfs_unmount_print(nmp, "forcefully ");
2352 		return true;
2353 	} else
2354 		atomic_dec_uint(&nmp->mnt_refcnt);
2355 
2356 #ifdef DEBUG
2357 	printf("forceful unmount of %s failed with error %d\n",
2358 	    nmp->mnt_stat.f_mntonname, error);
2359 #endif
2360 
2361 	return false;
2362 }
2363 
2364 bool
2365 vfs_unmountall1(struct lwp *l, bool force, bool verbose)
2366 {
2367 	struct mount *mp, *nmp;
2368 	bool any_error = false, progress = false;
2369 	int error;
2370 
2371 	for (mp = CIRCLEQ_LAST(&mountlist);
2372 	     mp != (void *)&mountlist;
2373 	     mp = nmp) {
2374 		nmp = CIRCLEQ_PREV(mp, mnt_list);
2375 #ifdef DEBUG
2376 		printf("\nunmounting %p %s (%s)...",
2377 		    (void *)mp, mp->mnt_stat.f_mntonname,
2378 		    mp->mnt_stat.f_mntfromname);
2379 #endif
2380 		atomic_inc_uint(&mp->mnt_refcnt);
2381 		if ((error = dounmount(mp, force ? MNT_FORCE : 0, l)) == 0) {
2382 			vfs_unmount_print(mp, "");
2383 			progress = true;
2384 		} else {
2385 			atomic_dec_uint(&mp->mnt_refcnt);
2386 			if (verbose) {
2387 				printf("unmount of %s failed with error %d\n",
2388 				    mp->mnt_stat.f_mntonname, error);
2389 			}
2390 			any_error = true;
2391 		}
2392 	}
2393 	if (verbose)
2394 		printf(" done\n");
2395 	if (any_error && verbose)
2396 		printf("WARNING: some file systems would not unmount\n");
2397 	return progress;
2398 }
2399 
2400 /*
2401  * Sync and unmount file systems before shutting down.
2402  */
2403 void
2404 vfs_shutdown(void)
2405 {
2406 	struct lwp *l;
2407 
2408 	/* XXX we're certainly not running in lwp0's context! */
2409 	l = (curlwp == NULL) ? &lwp0 : curlwp;
2410 
2411 	vfs_shutdown1(l);
2412 }
2413 
2414 void
2415 vfs_sync_all(struct lwp *l)
2416 {
2417 	printf("syncing disks... ");
2418 
2419 	/* remove user processes from run queue */
2420 	suspendsched();
2421 	(void) spl0();
2422 
2423 	/* avoid coming back this way again if we panic. */
2424 	doing_shutdown = 1;
2425 
2426 	sys_sync(l, NULL, NULL);
2427 
2428 	/* Wait for sync to finish. */
2429 	if (buf_syncwait() != 0) {
2430 #if defined(DDB) && defined(DEBUG_HALT_BUSY)
2431 		Debugger();
2432 #endif
2433 		printf("giving up\n");
2434 		return;
2435 	} else
2436 		printf("done\n");
2437 }
2438 
2439 static void
2440 vfs_shutdown1(struct lwp *l)
2441 {
2442 
2443 	vfs_sync_all(l);
2444 
2445 	/*
2446 	 * If we've panic'd, don't make the situation potentially
2447 	 * worse by unmounting the file systems.
2448 	 */
2449 	if (panicstr != NULL)
2450 		return;
2451 
2452 	/* Release inodes held by texts before update. */
2453 #ifdef notdef
2454 	vnshutdown();
2455 #endif
2456 	/* Unmount file systems. */
2457 	vfs_unmountall(l);
2458 }
2459 
2460 /*
2461  * Print a list of supported file system types (used by vfs_mountroot)
2462  */
2463 static void
2464 vfs_print_fstypes(void)
2465 {
2466 	struct vfsops *v;
2467 	int cnt = 0;
2468 
2469 	mutex_enter(&vfs_list_lock);
2470 	LIST_FOREACH(v, &vfs_list, vfs_list)
2471 		++cnt;
2472 	mutex_exit(&vfs_list_lock);
2473 
2474 	if (cnt == 0) {
2475 		printf("WARNING: No file system modules have been loaded.\n");
2476 		return;
2477 	}
2478 
2479 	printf("Supported file systems:");
2480 	mutex_enter(&vfs_list_lock);
2481 	LIST_FOREACH(v, &vfs_list, vfs_list) {
2482 		printf(" %s", v->vfs_name);
2483 	}
2484 	mutex_exit(&vfs_list_lock);
2485 	printf("\n");
2486 }
2487 
2488 /*
2489  * Mount the root file system.  If the operator didn't specify a
2490  * file system to use, try all possible file systems until one
2491  * succeeds.
2492  */
2493 int
2494 vfs_mountroot(void)
2495 {
2496 	struct vfsops *v;
2497 	int error = ENODEV;
2498 
2499 	if (root_device == NULL)
2500 		panic("vfs_mountroot: root device unknown");
2501 
2502 	switch (device_class(root_device)) {
2503 	case DV_IFNET:
2504 		if (rootdev != NODEV)
2505 			panic("vfs_mountroot: rootdev set for DV_IFNET "
2506 			    "(0x%llx -> %llu,%llu)",
2507 			    (unsigned long long)rootdev,
2508 			    (unsigned long long)major(rootdev),
2509 			    (unsigned long long)minor(rootdev));
2510 		break;
2511 
2512 	case DV_DISK:
2513 		if (rootdev == NODEV)
2514 			panic("vfs_mountroot: rootdev not set for DV_DISK");
2515 		if (bdevvp(rootdev, &rootvp))
2516 			panic("vfs_mountroot: can't get vnode for rootdev");
2517 		error = VOP_OPEN(rootvp, FREAD, FSCRED);
2518 		if (error) {
2519 			printf("vfs_mountroot: can't open root device\n");
2520 			return (error);
2521 		}
2522 		break;
2523 
2524 	case DV_VIRTUAL:
2525 		break;
2526 
2527 	default:
2528 		printf("%s: inappropriate for root file system\n",
2529 		    device_xname(root_device));
2530 		return (ENODEV);
2531 	}
2532 
2533 	/*
2534 	 * If the user specified a root fs type, use it.  Make sure the
2535 	 * specified type exists and provides a vfs_mountroot() entry.
2536 	 */
2537 	if (strcmp(rootfstype, ROOT_FSTYPE_ANY) != 0) {
2538 		v = vfs_getopsbyname(rootfstype);
2539 		error = EFTYPE;
2540 		if (v != NULL) {
2541 			if (v->vfs_mountroot != NULL) {
2542 				error = (v->vfs_mountroot)();
2543 			}
2544 			v->vfs_refcount--;
2545 		}
2546 		goto done;
2547 	}
2548 
2549 	/*
2550 	 * Try each file system currently configured into the kernel.
2551 	 */
2552 	mutex_enter(&vfs_list_lock);
2553 	LIST_FOREACH(v, &vfs_list, vfs_list) {
2554 		if (v->vfs_mountroot == NULL)
2555 			continue;
2556 #ifdef DEBUG
2557 		aprint_normal("mountroot: trying %s...\n", v->vfs_name);
2558 #endif
2559 		v->vfs_refcount++;
2560 		mutex_exit(&vfs_list_lock);
2561 		error = (*v->vfs_mountroot)();
2562 		mutex_enter(&vfs_list_lock);
2563 		v->vfs_refcount--;
2564 		if (!error) {
2565 			aprint_normal("root file system type: %s\n",
2566 			    v->vfs_name);
2567 			break;
2568 		}
2569 	}
2570 	mutex_exit(&vfs_list_lock);
2571 
2572 	if (v == NULL) {
2573 		vfs_print_fstypes();
2574 		printf("no file system for %s", device_xname(root_device));
2575 		if (device_class(root_device) == DV_DISK)
2576 			printf(" (dev 0x%llx)", (unsigned long long)rootdev);
2577 		printf("\n");
2578 		error = EFTYPE;
2579 	}
2580 
2581 done:
2582 	if (error && device_class(root_device) == DV_DISK) {
2583 		VOP_CLOSE(rootvp, FREAD, FSCRED);
2584 		vrele(rootvp);
2585 	}
2586 	if (error == 0) {
2587 		extern struct cwdinfo cwdi0;
2588 
2589 		CIRCLEQ_FIRST(&mountlist)->mnt_flag |= MNT_ROOTFS;
2590 		CIRCLEQ_FIRST(&mountlist)->mnt_op->vfs_refcount++;
2591 
2592 		/*
2593 		 * Get the vnode for '/'.  Set cwdi0.cwdi_cdir to
2594 		 * reference it.
2595 		 */
2596 		error = VFS_ROOT(CIRCLEQ_FIRST(&mountlist), &rootvnode);
2597 		if (error)
2598 			panic("cannot find root vnode, error=%d", error);
2599 		cwdi0.cwdi_cdir = rootvnode;
2600 		VREF(cwdi0.cwdi_cdir);
2601 		VOP_UNLOCK(rootvnode, 0);
2602 		cwdi0.cwdi_rdir = NULL;
2603 
2604 		/*
2605 		 * Now that root is mounted, we can fix up initproc's CWD
2606 		 * info.  All other processes are kthreads, which merely
2607 		 * share proc0's CWD info.
2608 		 */
2609 		initproc->p_cwdi->cwdi_cdir = rootvnode;
2610 		VREF(initproc->p_cwdi->cwdi_cdir);
2611 		initproc->p_cwdi->cwdi_rdir = NULL;
2612 	}
2613 	return (error);
2614 }
2615 
2616 /*
2617  * Get a new unique fsid
2618  */
2619 void
2620 vfs_getnewfsid(struct mount *mp)
2621 {
2622 	static u_short xxxfs_mntid;
2623 	fsid_t tfsid;
2624 	int mtype;
2625 
2626 	mutex_enter(&mntid_lock);
2627 	mtype = makefstype(mp->mnt_op->vfs_name);
2628 	mp->mnt_stat.f_fsidx.__fsid_val[0] = makedev(mtype, 0);
2629 	mp->mnt_stat.f_fsidx.__fsid_val[1] = mtype;
2630 	mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
2631 	if (xxxfs_mntid == 0)
2632 		++xxxfs_mntid;
2633 	tfsid.__fsid_val[0] = makedev(mtype & 0xff, xxxfs_mntid);
2634 	tfsid.__fsid_val[1] = mtype;
2635 	if (!CIRCLEQ_EMPTY(&mountlist)) {
2636 		while (vfs_getvfs(&tfsid)) {
2637 			tfsid.__fsid_val[0]++;
2638 			xxxfs_mntid++;
2639 		}
2640 	}
2641 	mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0];
2642 	mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
2643 	mutex_exit(&mntid_lock);
2644 }
2645 
2646 /*
2647  * Make a 'unique' number from a mount type name.
2648  */
2649 long
2650 makefstype(const char *type)
2651 {
2652 	long rv;
2653 
2654 	for (rv = 0; *type; type++) {
2655 		rv <<= 2;
2656 		rv ^= *type;
2657 	}
2658 	return rv;
2659 }
2660 
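/*
 * Editor's note: a small worked example of the hash above.  For
 * type = "ffs" ('f' == 0x66, 's' == 0x73):
 *
 *	rv = 0
 *	rv = (0     << 2) ^ 0x66 = 0x066
 *	rv = (0x066 << 2) ^ 0x66 = 0x1fe
 *	rv = (0x1fe << 2) ^ 0x73 = 0x78b
 *
 * so makefstype("ffs") == 0x78b.  The result is only "unique" in the
 * weak sense the quotes above imply: distinct names can collide.
 */
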
2661 /*
2662  * Set vnode attributes to VNOVAL
2663  */
2664 void
2665 vattr_null(struct vattr *vap)
2666 {
2667 
2668 	vap->va_type = VNON;
2669 
2670 	/*
2671 	 * Assign each member individually so this remains correct even
2672 	 * if the size and signedness of the members differ.
2673 	 */
2674 	vap->va_mode = VNOVAL;
2675 	vap->va_nlink = VNOVAL;
2676 	vap->va_uid = VNOVAL;
2677 	vap->va_gid = VNOVAL;
2678 	vap->va_fsid = VNOVAL;
2679 	vap->va_fileid = VNOVAL;
2680 	vap->va_size = VNOVAL;
2681 	vap->va_blocksize = VNOVAL;
2682 	vap->va_atime.tv_sec =
2683 	    vap->va_mtime.tv_sec =
2684 	    vap->va_ctime.tv_sec =
2685 	    vap->va_birthtime.tv_sec = VNOVAL;
2686 	vap->va_atime.tv_nsec =
2687 	    vap->va_mtime.tv_nsec =
2688 	    vap->va_ctime.tv_nsec =
2689 	    vap->va_birthtime.tv_nsec = VNOVAL;
2690 	vap->va_gen = VNOVAL;
2691 	vap->va_flags = VNOVAL;
2692 	vap->va_rdev = VNOVAL;
2693 	vap->va_bytes = VNOVAL;
2694 	vap->va_vaflags = 0;
2695 }
2696 
2697 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
2698 #define ARRAY_PRINT(idx, arr) \
2699     ((unsigned int)(idx) < ARRAY_SIZE(arr) ? (arr)[(idx)] : "UNKNOWN")
2700 
2701 const char * const vnode_tags[] = { VNODE_TAGS };
2702 const char * const vnode_types[] = { VNODE_TYPES };
2703 const char vnode_flagbits[] = VNODE_FLAGBITS;
2704 
2705 /*
2706  * Print out a description of a vnode.
2707  */
2708 void
2709 vprint(const char *label, struct vnode *vp)
2710 {
2711 	struct vnlock *vl;
2712 	char bf[96];
2713 	int flag;
2714 
2715 	vl = (vp->v_vnlock != NULL ? vp->v_vnlock : &vp->v_lock);
2716 	flag = vp->v_iflag | vp->v_vflag | vp->v_uflag;
2717 	snprintb(bf, sizeof(bf), vnode_flagbits, flag);
2718 
2719 	if (label != NULL)
2720 		printf("%s: ", label);
2721 	printf("vnode @ %p, flags (%s)\n\ttag %s(%d), type %s(%d), "
2722 	    "usecount %d, writecount %d, holdcount %d\n"
2723 	    "\tfreelisthd %p, mount %p, data %p lock %p recursecnt %d\n",
2724 	    vp, bf, ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag,
2725 	    ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type,
2726 	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt,
2727 	    vp->v_freelisthd, vp->v_mount, vp->v_data, vl, vl->vl_recursecnt);
2728 	if (vp->v_data != NULL) {
2729 		printf("\t");
2730 		VOP_PRINT(vp);
2731 	}
2732 }
2733 
2734 #ifdef DEBUG
2735 /*
2736  * List all of the locked vnodes in the system.
2737  * Called when debugging the kernel.
2738  */
2739 void
2740 printlockedvnodes(void)
2741 {
2742 	struct mount *mp, *nmp;
2743 	struct vnode *vp;
2744 
2745 	printf("Locked vnodes\n");
2746 	mutex_enter(&mountlist_lock);
2747 	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
2748 	     mp = nmp) {
2749 		if (vfs_busy(mp, &nmp)) {
2750 			continue;
2751 		}
2752 		TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
2753 			if (VOP_ISLOCKED(vp))
2754 				vprint(NULL, vp);
2755 		}
2756 		mutex_enter(&mountlist_lock);
2757 		vfs_unbusy(mp, false, &nmp);
2758 	}
2759 	mutex_exit(&mountlist_lock);
2760 }
2761 #endif
2762 
2763 /* Deprecated. Kept for KPI compatibility. */
2764 int
2765 vaccess(enum vtype type, mode_t file_mode, uid_t uid, gid_t gid,
2766     mode_t acc_mode, kauth_cred_t cred)
2767 {
2768 
2769 #ifdef DIAGNOSTIC
2770 	printf("vaccess: deprecated interface used.\n");
2771 #endif /* DIAGNOSTIC */
2772 
2773 	return genfs_can_access(type, file_mode, uid, gid, acc_mode, cred);
2774 }
2775 
2776 /*
2777  * Given a file system name, look up the vfsops for that
2778  * file system, or return NULL if file system isn't present
2779  * in the kernel.
2780  */
2781 struct vfsops *
2782 vfs_getopsbyname(const char *name)
2783 {
2784 	struct vfsops *v;
2785 
2786 	mutex_enter(&vfs_list_lock);
2787 	LIST_FOREACH(v, &vfs_list, vfs_list) {
2788 		if (strcmp(v->vfs_name, name) == 0)
2789 			break;
2790 	}
2791 	if (v != NULL)
2792 		v->vfs_refcount++;
2793 	mutex_exit(&vfs_list_lock);
2794 
2795 	return (v);
2796 }
2797 
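/*
 * Editor's note: vfs_getopsbyname() returns the vfsops with its
 * reference count already bumped, so the caller must drop it when
 * done, as vfs_mountroot() above does:
 *
 *	struct vfsops *v = vfs_getopsbyname("ffs");
 *	if (v != NULL) {
 *		(use v)
 *		v->vfs_refcount--;
 *	}
 */
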
2798 void
2799 copy_statvfs_info(struct statvfs *sbp, const struct mount *mp)
2800 {
2801 	const struct statvfs *mbp;
2802 
2803 	if (sbp == (mbp = &mp->mnt_stat))
2804 		return;
2805 
2806 	(void)memcpy(&sbp->f_fsidx, &mbp->f_fsidx, sizeof(sbp->f_fsidx));
2807 	sbp->f_fsid = mbp->f_fsid;
2808 	sbp->f_owner = mbp->f_owner;
2809 	sbp->f_flag = mbp->f_flag;
2810 	sbp->f_syncwrites = mbp->f_syncwrites;
2811 	sbp->f_asyncwrites = mbp->f_asyncwrites;
2812 	sbp->f_syncreads = mbp->f_syncreads;
2813 	sbp->f_asyncreads = mbp->f_asyncreads;
2814 	(void)memcpy(sbp->f_spare, mbp->f_spare, sizeof(mbp->f_spare));
2815 	(void)memcpy(sbp->f_fstypename, mbp->f_fstypename,
2816 	    sizeof(sbp->f_fstypename));
2817 	(void)memcpy(sbp->f_mntonname, mbp->f_mntonname,
2818 	    sizeof(sbp->f_mntonname));
2819 	(void)memcpy(sbp->f_mntfromname, mp->mnt_stat.f_mntfromname,
2820 	    sizeof(sbp->f_mntfromname));
2821 	sbp->f_namemax = mbp->f_namemax;
2822 }
2823 
2824 int
2825 set_statvfs_info(const char *onp, int ukon, const char *fromp, int ukfrom,
2826     const char *vfsname, struct mount *mp, struct lwp *l)
2827 {
2828 	int error;
2829 	size_t size;
2830 	struct statvfs *sfs = &mp->mnt_stat;
2831 	int (*fun)(const void *, void *, size_t, size_t *);
2832 
2833 	(void)strlcpy(mp->mnt_stat.f_fstypename, vfsname,
2834 	    sizeof(mp->mnt_stat.f_fstypename));
2835 
2836 	if (onp) {
2837 		struct cwdinfo *cwdi = l->l_proc->p_cwdi;
2838 		fun = (ukon == UIO_SYSSPACE) ? copystr : copyinstr;
2839 		if (cwdi->cwdi_rdir != NULL) {
2840 			size_t len;
2841 			char *bp;
2842 			char *path = PNBUF_GET();
2843 
2844 			bp = path + MAXPATHLEN;
2845 			*--bp = '\0';
2846 			rw_enter(&cwdi->cwdi_lock, RW_READER);
2847 			error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp,
2848 			    path, MAXPATHLEN / 2, 0, l);
2849 			rw_exit(&cwdi->cwdi_lock);
2850 			if (error) {
2851 				PNBUF_PUT(path);
2852 				return error;
2853 			}
2854 
2855 			len = strlen(bp);
2856 			if (len > sizeof(sfs->f_mntonname) - 1)
2857 				len = sizeof(sfs->f_mntonname) - 1;
2858 			(void)strncpy(sfs->f_mntonname, bp, len);
2859 			PNBUF_PUT(path);
2860 
2861 			if (len < sizeof(sfs->f_mntonname) - 1) {
2862 				error = (*fun)(onp, &sfs->f_mntonname[len],
2863 				    sizeof(sfs->f_mntonname) - len - 1, &size);
2864 				if (error)
2865 					return error;
2866 				size += len;
2867 			} else {
2868 				size = len;
2869 			}
2870 		} else {
2871 			error = (*fun)(onp, &sfs->f_mntonname,
2872 			    sizeof(sfs->f_mntonname) - 1, &size);
2873 			if (error)
2874 				return error;
2875 		}
2876 		(void)memset(sfs->f_mntonname + size, 0,
2877 		    sizeof(sfs->f_mntonname) - size);
2878 	}
2879 
2880 	if (fromp) {
2881 		fun = (ukfrom == UIO_SYSSPACE) ? copystr : copyinstr;
2882 		error = (*fun)(fromp, sfs->f_mntfromname,
2883 		    sizeof(sfs->f_mntfromname) - 1, &size);
2884 		if (error)
2885 			return error;
2886 		(void)memset(sfs->f_mntfromname + size, 0,
2887 		    sizeof(sfs->f_mntfromname) - size);
2888 	}
2889 	return 0;
2890 }
2891 
2892 void
2893 vfs_timestamp(struct timespec *ts)
2894 {
2895 
2896 	nanotime(ts);
2897 }
2898 
2899 time_t	rootfstime;			/* recorded root fs time, if known */
2900 void
2901 setrootfstime(time_t t)
2902 {
2903 	rootfstime = t;
2904 }
2905 
2906 /*
2907  * Sham lock manager for vnodes.  This is a temporary measure.
2908  */
2909 int
2910 vlockmgr(struct vnlock *vl, int flags)
2911 {
2912 
2913 	KASSERT((flags & ~(LK_CANRECURSE | LK_NOWAIT | LK_TYPE_MASK)) == 0);
2914 
2915 	switch (flags & LK_TYPE_MASK) {
2916 	case LK_SHARED:
2917 		if (rw_tryenter(&vl->vl_lock, RW_READER)) {
2918 			return 0;
2919 		}
2920 		if ((flags & LK_NOWAIT) != 0) {
2921 			return EBUSY;
2922 		}
2923 		rw_enter(&vl->vl_lock, RW_READER);
2924 		return 0;
2925 
2926 	case LK_EXCLUSIVE:
2927 		if (rw_tryenter(&vl->vl_lock, RW_WRITER)) {
2928 			return 0;
2929 		}
2930 		if ((vl->vl_canrecurse || (flags & LK_CANRECURSE) != 0) &&
2931 		    rw_write_held(&vl->vl_lock)) {
2932 			vl->vl_recursecnt++;
2933 			return 0;
2934 		}
2935 		if ((flags & LK_NOWAIT) != 0) {
2936 			return EBUSY;
2937 		}
2938 		rw_enter(&vl->vl_lock, RW_WRITER);
2939 		return 0;
2940 
2941 	case LK_RELEASE:
2942 		if (vl->vl_recursecnt != 0) {
2943 			KASSERT(rw_write_held(&vl->vl_lock));
2944 			vl->vl_recursecnt--;
2945 			return 0;
2946 		}
2947 		rw_exit(&vl->vl_lock);
2948 		return 0;
2949 
2950 	default:
2951 		panic("vlockmgr: flags %x", flags);
2952 	}
2953 }
2954 
2955 int
2956 vlockstatus(struct vnlock *vl)
2957 {
2958 
2959 	if (rw_write_held(&vl->vl_lock)) {
2960 		return LK_EXCLUSIVE;
2961 	}
2962 	if (rw_read_held(&vl->vl_lock)) {
2963 		return LK_SHARED;
2964 	}
2965 	return 0;
2966 }
2967 
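/*
 * Editor's sketch: the sham lock manager is driven through a vnode's
 * v_vnlock, e.g. to take and release an exclusive hold (mirroring what
 * a VOP_LOCK/VOP_UNLOCK pair ultimately does):
 *
 *	(void)vlockmgr(vp->v_vnlock, LK_EXCLUSIVE);
 *	KASSERT(vlockstatus(vp->v_vnlock) == LK_EXCLUSIVE);
 *	(void)vlockmgr(vp->v_vnlock, LK_RELEASE);
 */
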
2968 /*
2969  * mount_specific_key_create --
2970  *	Create a key for subsystem mount-specific data.
2971  */
2972 int
2973 mount_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor)
2974 {
2975 
2976 	return (specificdata_key_create(mount_specificdata_domain, keyp, dtor));
2977 }
2978 
2979 /*
2980  * mount_specific_key_delete --
2981  *	Delete a key for subsystem mount-specific data.
2982  */
2983 void
2984 mount_specific_key_delete(specificdata_key_t key)
2985 {
2986 
2987 	specificdata_key_delete(mount_specificdata_domain, key);
2988 }
2989 
2990 /*
2991  * mount_initspecific --
2992  *	Initialize a mount's specificdata container.
2993  */
2994 void
2995 mount_initspecific(struct mount *mp)
2996 {
2997 	int error;
2998 
2999 	error = specificdata_init(mount_specificdata_domain,
3000 				  &mp->mnt_specdataref);
3001 	KASSERT(error == 0);
3002 }
3003 
3004 /*
3005  * mount_finispecific --
3006  *	Finalize a mount's specificdata container.
3007  */
3008 void
3009 mount_finispecific(struct mount *mp)
3010 {
3011 
3012 	specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
3013 }
3014 
3015 /*
3016  * mount_getspecific --
3017  *	Return mount-specific data corresponding to the specified key.
3018  */
3019 void *
3020 mount_getspecific(struct mount *mp, specificdata_key_t key)
3021 {
3022 
3023 	return (specificdata_getspecific(mount_specificdata_domain,
3024 					 &mp->mnt_specdataref, key));
3025 }
3026 
3027 /*
3028  * mount_setspecific --
3029  *	Set mount-specific data corresponding to the specified key.
3030  */
3031 void
3032 mount_setspecific(struct mount *mp, specificdata_key_t key, void *data)
3033 {
3034 
3035 	specificdata_setspecific(mount_specificdata_domain,
3036 				 &mp->mnt_specdataref, key, data);
3037 }
3038 
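/*
 * Editor's sketch: a subsystem attaches private per-mount data by
 * creating a key once, then storing and fetching against it; the key,
 * destructor and data names below are illustrative only:
 *
 *	static specificdata_key_t example_key;
 *
 *	mount_specific_key_create(&example_key, example_dtor);
 *	mount_setspecific(mp, example_key, data);
 *	data = mount_getspecific(mp, example_key);
 *	mount_specific_key_delete(example_key);
 */
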
3039 int
3040 VFS_MOUNT(struct mount *mp, const char *a, void *b, size_t *c)
3041 {
3042 	int error;
3043 
3044 	KERNEL_LOCK(1, NULL);
3045 	error = (*(mp->mnt_op->vfs_mount))(mp, a, b, c);
3046 	KERNEL_UNLOCK_ONE(NULL);
3047 
3048 	return error;
3049 }
3050 
3051 int
3052 VFS_START(struct mount *mp, int a)
3053 {
3054 	int error;
3055 
3056 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3057 		KERNEL_LOCK(1, NULL);
3058 	}
3059 	error = (*(mp->mnt_op->vfs_start))(mp, a);
3060 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3061 		KERNEL_UNLOCK_ONE(NULL);
3062 	}
3063 
3064 	return error;
3065 }
3066 
3067 int
3068 VFS_UNMOUNT(struct mount *mp, int a)
3069 {
3070 	int error;
3071 
3072 	KERNEL_LOCK(1, NULL);
3073 	error = (*(mp->mnt_op->vfs_unmount))(mp, a);
3074 	KERNEL_UNLOCK_ONE(NULL);
3075 
3076 	return error;
3077 }
3078 
3079 int
3080 VFS_ROOT(struct mount *mp, struct vnode **a)
3081 {
3082 	int error;
3083 
3084 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3085 		KERNEL_LOCK(1, NULL);
3086 	}
3087 	error = (*(mp->mnt_op->vfs_root))(mp, a);
3088 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3089 		KERNEL_UNLOCK_ONE(NULL);
3090 	}
3091 
3092 	return error;
3093 }
3094 
3095 int
3096 VFS_QUOTACTL(struct mount *mp, int a, uid_t b, void *c)
3097 {
3098 	int error;
3099 
3100 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3101 		KERNEL_LOCK(1, NULL);
3102 	}
3103 	error = (*(mp->mnt_op->vfs_quotactl))(mp, a, b, c);
3104 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3105 		KERNEL_UNLOCK_ONE(NULL);
3106 	}
3107 
3108 	return error;
3109 }
3110 
3111 int
3112 VFS_STATVFS(struct mount *mp, struct statvfs *a)
3113 {
3114 	int error;
3115 
3116 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3117 		KERNEL_LOCK(1, NULL);
3118 	}
3119 	error = (*(mp->mnt_op->vfs_statvfs))(mp, a);
3120 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3121 		KERNEL_UNLOCK_ONE(NULL);
3122 	}
3123 
3124 	return error;
3125 }
3126 
3127 int
3128 VFS_SYNC(struct mount *mp, int a, struct kauth_cred *b)
3129 {
3130 	int error;
3131 
3132 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3133 		KERNEL_LOCK(1, NULL);
3134 	}
3135 	error = (*(mp->mnt_op->vfs_sync))(mp, a, b);
3136 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3137 		KERNEL_UNLOCK_ONE(NULL);
3138 	}
3139 
3140 	return error;
3141 }
3142 
3143 int
3144 VFS_FHTOVP(struct mount *mp, struct fid *a, struct vnode **b)
3145 {
3146 	int error;
3147 
3148 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3149 		KERNEL_LOCK(1, NULL);
3150 	}
3151 	error = (*(mp->mnt_op->vfs_fhtovp))(mp, a, b);
3152 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3153 		KERNEL_UNLOCK_ONE(NULL);
3154 	}
3155 
3156 	return error;
3157 }
3158 
3159 int
3160 VFS_VPTOFH(struct vnode *vp, struct fid *a, size_t *b)
3161 {
3162 	int error;
3163 
3164 	if ((vp->v_vflag & VV_MPSAFE) == 0) {
3165 		KERNEL_LOCK(1, NULL);
3166 	}
3167 	error = (*(vp->v_mount->mnt_op->vfs_vptofh))(vp, a, b);
3168 	if ((vp->v_vflag & VV_MPSAFE) == 0) {
3169 		KERNEL_UNLOCK_ONE(NULL);
3170 	}
3171 
3172 	return error;
3173 }
3174 
3175 int
3176 VFS_SNAPSHOT(struct mount *mp, struct vnode *a, struct timespec *b)
3177 {
3178 	int error;
3179 
3180 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3181 		KERNEL_LOCK(1, NULL);
3182 	}
3183 	error = (*(mp->mnt_op->vfs_snapshot))(mp, a, b);
3184 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3185 		KERNEL_UNLOCK_ONE(NULL);
3186 	}
3187 
3188 	return error;
3189 }
3190 
3191 int
3192 VFS_EXTATTRCTL(struct mount *mp, int a, struct vnode *b, int c, const char *d)
3193 {
3194 	int error;
3195 
3196 	KERNEL_LOCK(1, NULL);		/* XXXSMP check ffs */
3197 	error = (*(mp->mnt_op->vfs_extattrctl))(mp, a, b, c, d);
3198 	KERNEL_UNLOCK_ONE(NULL);	/* XXX */
3199 
3200 	return error;
3201 }
3202 
3203 int
3204 VFS_SUSPENDCTL(struct mount *mp, int a)
3205 {
3206 	int error;
3207 
3208 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3209 		KERNEL_LOCK(1, NULL);
3210 	}
3211 	error = (*(mp->mnt_op->vfs_suspendctl))(mp, a);
3212 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3213 		KERNEL_UNLOCK_ONE(NULL);
3214 	}
3215 
3216 	return error;
3217 }
3218 
3219 #if defined(DDB) || defined(DEBUGPRINT)
3220 static const char buf_flagbits[] = BUF_FLAGBITS;
3221 
3222 void
3223 vfs_buf_print(struct buf *bp, int full, void (*pr)(const char *, ...))
3224 {
3225 	char bf[1024];
3226 
3227 	(*pr)("  vp %p lblkno 0x%"PRIx64" blkno 0x%"PRIx64" rawblkno 0x%"
3228 	    PRIx64 " dev 0x%x\n",
3229 	    bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_rawblkno, bp->b_dev);
3230 
3231 	snprintb(bf, sizeof(bf),
3232 	    buf_flagbits, bp->b_flags | bp->b_oflags | bp->b_cflags);
3233 	(*pr)("  error %d flags 0x%s\n", bp->b_error, bf);
3234 
3235 	(*pr)("  bufsize 0x%lx bcount 0x%lx resid 0x%lx\n",
3236 		  bp->b_bufsize, bp->b_bcount, bp->b_resid);
3237 	(*pr)("  data %p saveaddr %p\n",
3238 		  bp->b_data, bp->b_saveaddr);
3239 	(*pr)("  iodone %p objlock %p\n", bp->b_iodone, bp->b_objlock);
3240 }
3241 
3242 
3243 void
3244 vfs_vnode_print(struct vnode *vp, int full, void (*pr)(const char *, ...))
3245 {
3246 	char bf[256];
3247 
3248 	uvm_object_printit(&vp->v_uobj, full, pr);
3249 	snprintb(bf, sizeof(bf),
3250 	    vnode_flagbits, vp->v_iflag | vp->v_vflag | vp->v_uflag);
3251 	(*pr)("\nVNODE flags %s\n", bf);
3252 	(*pr)("mp %p numoutput %d size 0x%llx writesize 0x%llx\n",
3253 	      vp->v_mount, vp->v_numoutput, vp->v_size, vp->v_writesize);
3254 
3255 	(*pr)("data %p writecount %ld holdcnt %ld\n",
3256 	      vp->v_data, vp->v_writecount, vp->v_holdcnt);
3257 
3258 	(*pr)("tag %s(%d) type %s(%d) mount %p typedata %p\n",
3259 	      ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag,
3260 	      ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type,
3261 	      vp->v_mount, vp->v_mountedhere);
3262 
3263 	(*pr)("v_lock %p v_vnlock %p\n", &vp->v_lock, vp->v_vnlock);
3264 
3265 	if (full) {
3266 		struct buf *bp;
3267 
3268 		(*pr)("clean bufs:\n");
3269 		LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) {
3270 			(*pr)(" bp %p\n", bp);
3271 			vfs_buf_print(bp, full, pr);
3272 		}
3273 
3274 		(*pr)("dirty bufs:\n");
3275 		LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
3276 			(*pr)(" bp %p\n", bp);
3277 			vfs_buf_print(bp, full, pr);
3278 		}
3279 	}
3280 }
3281 
3282 void
3283 vfs_mount_print(struct mount *mp, int full, void (*pr)(const char *, ...))
3284 {
3285 	char sbuf[256];
3286 
3287 	(*pr)("vnodecovered = %p syncer = %p data = %p\n",
3288 			mp->mnt_vnodecovered,mp->mnt_syncer,mp->mnt_data);
3289 
3290 	(*pr)("fs_bshift %d dev_bshift = %d\n",
3291 			mp->mnt_fs_bshift,mp->mnt_dev_bshift);
3292 
3293 	snprintb(sbuf, sizeof(sbuf), __MNT_FLAG_BITS, mp->mnt_flag);
3294 	(*pr)("flag = %s\n", sbuf);
3295 
3296 	snprintb(sbuf, sizeof(sbuf), __IMNT_FLAG_BITS, mp->mnt_iflag);
3297 	(*pr)("iflag = %s\n", sbuf);
3298 
3299 	(*pr)("refcnt = %d unmounting @ %p updating @ %p\n", mp->mnt_refcnt,
3300 	    &mp->mnt_unmounting, &mp->mnt_updating);
3301 
3302 	(*pr)("statvfs cache:\n");
3303 	(*pr)("\tbsize = %lu\n",mp->mnt_stat.f_bsize);
3304 	(*pr)("\tfrsize = %lu\n",mp->mnt_stat.f_frsize);
3305 	(*pr)("\tiosize = %lu\n",mp->mnt_stat.f_iosize);
3306 
3307 	(*pr)("\tblocks = %"PRIu64"\n",mp->mnt_stat.f_blocks);
3308 	(*pr)("\tbfree = %"PRIu64"\n",mp->mnt_stat.f_bfree);
3309 	(*pr)("\tbavail = %"PRIu64"\n",mp->mnt_stat.f_bavail);
3310 	(*pr)("\tbresvd = %"PRIu64"\n",mp->mnt_stat.f_bresvd);
3311 
3312 	(*pr)("\tfiles = %"PRIu64"\n",mp->mnt_stat.f_files);
3313 	(*pr)("\tffree = %"PRIu64"\n",mp->mnt_stat.f_ffree);
3314 	(*pr)("\tfavail = %"PRIu64"\n",mp->mnt_stat.f_favail);
3315 	(*pr)("\tfresvd = %"PRIu64"\n",mp->mnt_stat.f_fresvd);
3316 
3317 	(*pr)("\tf_fsidx = { 0x%"PRIx32", 0x%"PRIx32" }\n",
3318 			mp->mnt_stat.f_fsidx.__fsid_val[0],
3319 			mp->mnt_stat.f_fsidx.__fsid_val[1]);
3320 
3321 	(*pr)("\towner = %"PRIu32"\n",mp->mnt_stat.f_owner);
3322 	(*pr)("\tnamemax = %lu\n",mp->mnt_stat.f_namemax);
3323 
3324 	snprintb(sbuf, sizeof(sbuf), __MNT_FLAG_BITS, mp->mnt_stat.f_flag);
3325 
3326 	(*pr)("\tflag = %s\n",sbuf);
3327 	(*pr)("\tsyncwrites = %" PRIu64 "\n",mp->mnt_stat.f_syncwrites);
3328 	(*pr)("\tasyncwrites = %" PRIu64 "\n",mp->mnt_stat.f_asyncwrites);
3329 	(*pr)("\tsyncreads = %" PRIu64 "\n",mp->mnt_stat.f_syncreads);
3330 	(*pr)("\tasyncreads = %" PRIu64 "\n",mp->mnt_stat.f_asyncreads);
3331 	(*pr)("\tfstypename = %s\n",mp->mnt_stat.f_fstypename);
3332 	(*pr)("\tmntonname = %s\n",mp->mnt_stat.f_mntonname);
3333 	(*pr)("\tmntfromname = %s\n",mp->mnt_stat.f_mntfromname);
3334 
3335 	{
3336 		int cnt = 0;
3337 		struct vnode *vp;
3338 		(*pr)("locked vnodes =");
3339 		TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
3340 			if (VOP_ISLOCKED(vp)) {
3341 				if ((++cnt % 6) == 0) {
3342 					(*pr)(" %p,\n\t", vp);
3343 				} else {
3344 					(*pr)(" %p,", vp);
3345 				}
3346 			}
3347 		}
3348 		(*pr)("\n");
3349 	}
3350 
3351 	if (full) {
3352 		int cnt = 0;
3353 		struct vnode *vp;
3354 		(*pr)("all vnodes =");
3355 		TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
3356 			if (!TAILQ_NEXT(vp, v_mntvnodes)) {
3357 				(*pr)(" %p", vp);
3358 			} else if ((++cnt % 6) == 0) {
3359 				(*pr)(" %p,\n\t", vp);
3360 			} else {
3361 				(*pr)(" %p,", vp);
3362 			}
3363 		}
3364 		(*pr)("\n");
3365 	}
3366 }
3367 #endif /* DDB || DEBUGPRINT */
3368 
3369 /*
3370  * Check if a device pointed to by vp is mounted.
3371  *
3372  * Returns:
3373  *   EINVAL	if it's not a disk
3374  *   EBUSY	if it's a disk and mounted
3375  *   0		if it's a disk and not mounted
3376  */
3377 int
3378 rawdev_mounted(struct vnode *vp, struct vnode **bvpp)
3379 {
3380 	struct vnode *bvp;
3381 	dev_t dev;
3382 	int d_type;
3383 
3384 	bvp = NULL;
3385 	dev = vp->v_rdev;
3386 	d_type = D_OTHER;
3387 
3388 	if (iskmemvp(vp))
3389 		return EINVAL;
3390 
3391 	switch (vp->v_type) {
3392 	case VCHR: {
3393 		const struct cdevsw *cdev;
3394 
3395 		cdev = cdevsw_lookup(dev);
3396 		if (cdev != NULL) {
3397 			dev_t blkdev;
3398 
3399 			blkdev = devsw_chr2blk(dev);
3400 			if (blkdev != NODEV) {
3401 				vfinddev(blkdev, VBLK, &bvp);
3402 				if (bvp != NULL)
3403 					d_type = (cdev->d_flag & D_TYPEMASK);
3404 			}
3405 		}
3406 
3407 		break;
3408 		}
3409 
3410 	case VBLK: {
3411 		const struct bdevsw *bdev;
3412 
3413 		bdev = bdevsw_lookup(dev);
3414 		if (bdev != NULL)
3415 			d_type = (bdev->d_flag & D_TYPEMASK);
3416 
3417 		bvp = vp;
3418 
3419 		break;
3420 		}
3421 
3422 	default:
3423 		break;
3424 	}
3425 
3426 	if (d_type != D_DISK)
3427 		return EINVAL;
3428 
3429 	if (bvpp != NULL)
3430 		*bvpp = bvp;
3431 
3432 	/*
3433 	 * XXX: This is bogus. We should be failing the request
3434 	 * XXX: not only if this specific slice is mounted, but
3435 	 * XXX: if it's on a disk with any other mounted slice.
3436 	 */
3437 	if (vfs_mountedon(bvp))
3438 		return EBUSY;
3439 
3440 	return 0;
3441 }
3442