1 /*	$NetBSD: vfs_subr.c,v 1.370 2009/03/30 16:38:05 yamt Exp $	*/
2 
3 /*-
4  * Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9  * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 /*
34  * Copyright (c) 1989, 1993
35  *	The Regents of the University of California.  All rights reserved.
36  * (c) UNIX System Laboratories, Inc.
37  * All or some portions of this file are derived from material licensed
38  * to the University of California by American Telephone and Telegraph
39  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
40  * the permission of UNIX System Laboratories, Inc.
41  *
42  * Redistribution and use in source and binary forms, with or without
43  * modification, are permitted provided that the following conditions
44  * are met:
45  * 1. Redistributions of source code must retain the above copyright
46  *    notice, this list of conditions and the following disclaimer.
47  * 2. Redistributions in binary form must reproduce the above copyright
48  *    notice, this list of conditions and the following disclaimer in the
49  *    documentation and/or other materials provided with the distribution.
50  * 3. Neither the name of the University nor the names of its contributors
51  *    may be used to endorse or promote products derived from this software
52  *    without specific prior written permission.
53  *
54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64  * SUCH DAMAGE.
65  *
66  *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
67  */
68 
69 /*
70  * Note on v_usecount and locking:
71  *
72  * At nearly all points where it is known that v_usecount could be zero,
73  * the vnode interlock will be held.
74  *
75  * To change v_usecount away from zero, the interlock must be held.  To
76  * change from a non-zero value to zero, again the interlock must be
77  * held.
78  *
79  * Changing the usecount from a non-zero value to a non-zero value can
80  * safely be done using atomic operations, without the interlock held.
81  */
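
/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): how the rules above play out in this file.  The first
 * reference must be taken with the interlock held, because the vnode
 * also has to be pulled off its freelist; bumping an already non-zero
 * count may be done with a bare atomic operation.
 *
 *	mutex_enter(&vp->v_interlock);
 *	if (vp->v_usecount == 0) {
 *		vremfree(vp);			/* zero -> one: interlock needed *|
 *		vp->v_usecount = 1;
 *	} else {
 *		atomic_inc_uint(&vp->v_usecount); /* non-zero -> non-zero *|
 *	}
 *	mutex_exit(&vp->v_interlock);
 *
 * See vget() and vtryget() below for the real implementations.
 */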
82 
83 #include <sys/cdefs.h>
84 __KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.370 2009/03/30 16:38:05 yamt Exp $");
85 
86 #include "opt_ddb.h"
87 #include "opt_compat_netbsd.h"
88 #include "opt_compat_43.h"
89 
90 #include <sys/param.h>
91 #include <sys/systm.h>
92 #include <sys/conf.h>
93 #include <sys/proc.h>
94 #include <sys/kernel.h>
95 #include <sys/mount.h>
96 #include <sys/fcntl.h>
97 #include <sys/vnode.h>
98 #include <sys/stat.h>
99 #include <sys/namei.h>
100 #include <sys/ucred.h>
101 #include <sys/buf.h>
102 #include <sys/errno.h>
103 #include <sys/kmem.h>
104 #include <sys/syscallargs.h>
105 #include <sys/device.h>
106 #include <sys/filedesc.h>
107 #include <sys/kauth.h>
108 #include <sys/atomic.h>
109 #include <sys/kthread.h>
110 #include <sys/wapbl.h>
111 
112 #include <miscfs/specfs/specdev.h>
113 #include <miscfs/syncfs/syncfs.h>
114 
115 #include <uvm/uvm.h>
116 #include <uvm/uvm_readahead.h>
117 #include <uvm/uvm_ddb.h>
118 
119 #include <sys/sysctl.h>
120 
121 const enum vtype iftovt_tab[16] = {
122 	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
123 	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
124 };
125 const int	vttoif_tab[9] = {
126 	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
127 	S_IFSOCK, S_IFIFO, S_IFMT,
128 };
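
/*
 * Example (editorial sketch): these tables are normally consumed via
 * the IFTOVT() and VTTOIF() macros from <sys/vnode.h>, which translate
 * between the S_IFMT bits of a file mode and the vnode type, e.g.
 * (ip->i_mode is a hypothetical inode field):
 *
 *	enum vtype vt = IFTOVT(ip->i_mode);	/* S_IFREG -> VREG, ... *|
 *	mode_t fmt = VTTOIF(vp->v_type);	/* VDIR -> S_IFDIR, ... *|
 *
 * The two arrays therefore have to stay in sync with each other and
 * with the S_IF* definitions in <sys/stat.h>.
 */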
129 
130 /*
131  * Insq/Remq for the vnode usage lists.
132  */
133 #define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
134 #define	bufremvn(bp) {							\
135 	LIST_REMOVE(bp, b_vnbufs);					\
136 	(bp)->b_vnbufs.le_next = NOLIST;				\
137 }
138 
139 int doforce = 1;		/* 1 => permit forcible unmounting */
140 int prtactive = 0;		/* 1 => print out reclaim of active vnodes */
141 
142 static vnodelst_t vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list);
143 static vnodelst_t vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list);
144 static vnodelst_t vrele_list = TAILQ_HEAD_INITIALIZER(vrele_list);
145 
146 struct mntlist mountlist =			/* mounted filesystem list */
147     CIRCLEQ_HEAD_INITIALIZER(mountlist);
148 
149 u_int numvnodes;
150 static specificdata_domain_t mount_specificdata_domain;
151 
152 static int vrele_pending;
153 static int vrele_gen;
154 static kmutex_t	vrele_lock;
155 static kcondvar_t vrele_cv;
156 static lwp_t *vrele_lwp;
157 
158 kmutex_t mountlist_lock;
159 kmutex_t mntid_lock;
160 kmutex_t mntvnode_lock;
161 kmutex_t vnode_free_list_lock;
162 kmutex_t vfs_list_lock;
163 
164 static pool_cache_t vnode_cache;
165 
166 /*
167  * These define the root filesystem and device.
168  */
169 struct vnode *rootvnode;
170 struct device *root_device;			/* root device */
171 
172 /*
173  * Local declarations.
174  */
175 
176 static void vrele_thread(void *);
177 static void insmntque(vnode_t *, struct mount *);
178 static int getdevvp(dev_t, vnode_t **, enum vtype);
179 static vnode_t *getcleanvnode(void);
180 void vpanic(vnode_t *, const char *);
181 
182 #ifdef DEBUG
183 void printlockedvnodes(void);
184 #endif
185 
186 #ifdef DIAGNOSTIC
187 void
188 vpanic(vnode_t *vp, const char *msg)
189 {
190 
191 	vprint(NULL, vp);
192 	panic("%s\n", msg);
193 }
194 #else
195 #define	vpanic(vp, msg)	/* nothing */
196 #endif
197 
198 void
199 vn_init1(void)
200 {
201 
202 	vnode_cache = pool_cache_init(sizeof(struct vnode), 0, 0, 0, "vnodepl",
203 	    NULL, IPL_NONE, NULL, NULL, NULL);
204 	KASSERT(vnode_cache != NULL);
205 
206 	/* Create deferred release thread. */
207 	mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
208 	cv_init(&vrele_cv, "vrele");
209 	if (kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
210 	    NULL, &vrele_lwp, "vrele"))
211 		panic("fork vrele");
212 }
213 
214 /*
215  * Initialize the vnode management data structures.
216  */
217 void
218 vntblinit(void)
219 {
220 
221 	mutex_init(&mountlist_lock, MUTEX_DEFAULT, IPL_NONE);
222 	mutex_init(&mntid_lock, MUTEX_DEFAULT, IPL_NONE);
223 	mutex_init(&mntvnode_lock, MUTEX_DEFAULT, IPL_NONE);
224 	mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE);
225 	mutex_init(&vfs_list_lock, MUTEX_DEFAULT, IPL_NONE);
226 
227 	mount_specificdata_domain = specificdata_domain_create();
228 
229 	/* Initialize the filesystem syncer. */
230 	vn_initialize_syncerd();
231 	vn_init1();
232 }
233 
234 int
235 vfs_drainvnodes(long target, struct lwp *l)
236 {
237 
238 	while (numvnodes > target) {
239 		vnode_t *vp;
240 
241 		mutex_enter(&vnode_free_list_lock);
242 		vp = getcleanvnode();
243 		if (vp == NULL)
244 			return EBUSY; /* give up */
245 		ungetnewvnode(vp);
246 	}
247 
248 	return 0;
249 }
250 
251 /*
252  * Lookup a mount point by filesystem identifier.
253  *
254  * XXX Needs to add a reference to the mount point.
255  */
256 struct mount *
257 vfs_getvfs(fsid_t *fsid)
258 {
259 	struct mount *mp;
260 
261 	mutex_enter(&mountlist_lock);
262 	CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) {
263 		if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] &&
264 		    mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) {
265 			mutex_exit(&mountlist_lock);
266 			return (mp);
267 		}
268 	}
269 	mutex_exit(&mountlist_lock);
270 	return ((struct mount *)0);
271 }
272 
273 /*
274  * Drop a reference to a mount structure, freeing if the last reference.
275  */
276 void
277 vfs_destroy(struct mount *mp)
278 {
279 
280 	if (__predict_true((int)atomic_dec_uint_nv(&mp->mnt_refcnt) > 0)) {
281 		return;
282 	}
283 
284 	/*
285 	 * Nothing else has visibility of the mount: we can now
286 	 * free the data structures.
287 	 */
288 	KASSERT(mp->mnt_refcnt == 0);
289 	specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
290 	rw_destroy(&mp->mnt_unmounting);
291 	mutex_destroy(&mp->mnt_updating);
292 	mutex_destroy(&mp->mnt_renamelock);
293 	if (mp->mnt_op != NULL) {
294 		vfs_delref(mp->mnt_op);
295 	}
296 	kmem_free(mp, sizeof(*mp));
297 }
298 
299 /*
300  * Grab a vnode from the freelist and clean it.
301  */
302 vnode_t *
303 getcleanvnode(void)
304 {
305 	vnode_t *vp;
306 	vnodelst_t *listhd;
307 
308 	KASSERT(mutex_owned(&vnode_free_list_lock));
309 
310 retry:
311 	listhd = &vnode_free_list;
312 try_nextlist:
313 	TAILQ_FOREACH(vp, listhd, v_freelist) {
314 		/*
315 		 * It's safe to test v_usecount and v_iflag
316 		 * without holding the interlock here: vnodes
317 		 * with a non-zero use count or VI_CLEAN set
318 		 * should never appear on these lists anyway.
319 		 */
320 		if (vp->v_usecount != 0) {
321 			vpanic(vp, "free vnode isn't");
322 		}
323 		if ((vp->v_iflag & VI_CLEAN) != 0) {
324 			vpanic(vp, "clean vnode on freelist");
325 		}
326 		if (vp->v_freelisthd != listhd) {
327 			printf("vnode sez %p, listhd %p\n", vp->v_freelisthd, listhd);
328 			vpanic(vp, "list head mismatch");
329 		}
330 		if (!mutex_tryenter(&vp->v_interlock))
331 			continue;
332 		/*
333 		 * Our lwp might hold the underlying vnode
334 		 * locked, so don't try to reclaim a VI_LAYER
335 		 * node if it's locked.
336 		 */
337 		if ((vp->v_iflag & VI_XLOCK) == 0 &&
338 		    ((vp->v_iflag & VI_LAYER) == 0 || VOP_ISLOCKED(vp) == 0)) {
339 			break;
340 		}
341 		mutex_exit(&vp->v_interlock);
342 	}
343 
344 	if (vp == NULL) {
345 		if (listhd == &vnode_free_list) {
346 			listhd = &vnode_hold_list;
347 			goto try_nextlist;
348 		}
349 		mutex_exit(&vnode_free_list_lock);
350 		return NULL;
351 	}
352 
353 	/* Remove it from the freelist. */
354 	TAILQ_REMOVE(listhd, vp, v_freelist);
355 	vp->v_freelisthd = NULL;
356 	mutex_exit(&vnode_free_list_lock);
357 
358 	/*
359 	 * The vnode is still associated with a file system, so we must
360 	 * clean it out before reusing it.  We need to add a reference
361 	 * before doing this.  If the vnode gains another reference while
362 	 * being cleaned out then we lose - retry.
363 	 */
364 	atomic_inc_uint(&vp->v_usecount);
365 	vclean(vp, DOCLOSE);
366 	if (vp->v_usecount == 1) {
367 		/* We're about to dirty it. */
368 		vp->v_iflag &= ~VI_CLEAN;
369 		mutex_exit(&vp->v_interlock);
370 		if (vp->v_type == VBLK || vp->v_type == VCHR) {
371 			spec_node_destroy(vp);
372 		}
373 		vp->v_type = VNON;
374 	} else {
375 		/*
376 		 * Don't return to freelist - the holder of the last
377 		 * reference will destroy it.
378 		 */
379 		vrelel(vp, 0); /* releases vp->v_interlock */
380 		mutex_enter(&vnode_free_list_lock);
381 		goto retry;
382 	}
383 
384 	if (vp->v_data != NULL || vp->v_uobj.uo_npages != 0 ||
385 	    !TAILQ_EMPTY(&vp->v_uobj.memq)) {
386 		vpanic(vp, "cleaned vnode isn't");
387 	}
388 	if (vp->v_numoutput != 0) {
389 		vpanic(vp, "clean vnode has pending I/O's");
390 	}
391 	if ((vp->v_iflag & VI_ONWORKLST) != 0) {
392 		vpanic(vp, "clean vnode on syncer list");
393 	}
394 
395 	return vp;
396 }
397 
398 /*
399  * Mark a mount point as busy, and gain a new reference to it.  Used to
400  * prevent the file system from being unmounted during critical sections.
401  *
402  * => The caller must hold a pre-existing reference to the mount.
403  * => Will fail if the file system is being unmounted or has been unmounted.
404  */
405 int
406 vfs_busy(struct mount *mp, struct mount **nextp)
407 {
408 
409 	KASSERT(mp->mnt_refcnt > 0);
410 
411 	if (__predict_false(!rw_tryenter(&mp->mnt_unmounting, RW_READER))) {
412 		if (nextp != NULL) {
413 			KASSERT(mutex_owned(&mountlist_lock));
414 			*nextp = CIRCLEQ_NEXT(mp, mnt_list);
415 		}
416 		return EBUSY;
417 	}
418 	if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) {
419 		rw_exit(&mp->mnt_unmounting);
420 		if (nextp != NULL) {
421 			KASSERT(mutex_owned(&mountlist_lock));
422 			*nextp = CIRCLEQ_NEXT(mp, mnt_list);
423 		}
424 		return ENOENT;
425 	}
426 	if (nextp != NULL) {
427 		mutex_exit(&mountlist_lock);
428 	}
429 	atomic_inc_uint(&mp->mnt_refcnt);
430 	return 0;
431 }
432 
433 /*
434  * Unbusy a busy filesystem.
435  *
436  * => If keepref is true, preserve reference added by vfs_busy().
437  * => If nextp != NULL, acquire mountlist_lock.
438  */
439 void
440 vfs_unbusy(struct mount *mp, bool keepref, struct mount **nextp)
441 {
442 
443 	KASSERT(mp->mnt_refcnt > 0);
444 
445 	if (nextp != NULL) {
446 		mutex_enter(&mountlist_lock);
447 	}
448 	rw_exit(&mp->mnt_unmounting);
449 	if (!keepref) {
450 		vfs_destroy(mp);
451 	}
452 	if (nextp != NULL) {
453 		KASSERT(mutex_owned(&mountlist_lock));
454 		*nextp = CIRCLEQ_NEXT(mp, mnt_list);
455 	}
456 }
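
/*
 * Usage sketch (editorial, not from the original source): a mountlist
 * walker passes nextp so that mountlist_lock can be dropped while the
 * mount is busy, then re-acquired, with the next entry picked up, when
 * the mount is unbusied:
 *
 *	mutex_enter(&mountlist_lock);
 *	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
 *	    mp = nmp) {
 *		if (vfs_busy(mp, &nmp) != 0)
 *			continue;	/* nmp already set, lock still held *|
 *		/* ... work on mp with mountlist_lock released ... *|
 *		vfs_unbusy(mp, false, &nmp);
 *	}
 *	mutex_exit(&mountlist_lock);
 */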
457 
458 /*
459  * Lookup a filesystem type, and if found allocate and initialize
460  * a mount structure for it.
461  *
462  * Devname is usually updated by mount(8) after booting.
463  */
464 int
465 vfs_rootmountalloc(const char *fstypename, const char *devname,
466     struct mount **mpp)
467 {
468 	struct vfsops *vfsp = NULL;
469 	struct mount *mp;
470 
471 	mutex_enter(&vfs_list_lock);
472 	LIST_FOREACH(vfsp, &vfs_list, vfs_list)
473 		if (!strncmp(vfsp->vfs_name, fstypename,
474 		    sizeof(mp->mnt_stat.f_fstypename)))
475 			break;
476 	if (vfsp == NULL) {
477 		mutex_exit(&vfs_list_lock);
478 		return (ENODEV);
479 	}
480 	vfsp->vfs_refcount++;
481 	mutex_exit(&vfs_list_lock);
482 
483 	mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
484 	if (mp == NULL)
485 		return ENOMEM;
486 	mp->mnt_refcnt = 1;
487 	rw_init(&mp->mnt_unmounting);
488 	mutex_init(&mp->mnt_updating, MUTEX_DEFAULT, IPL_NONE);
489 	mutex_init(&mp->mnt_renamelock, MUTEX_DEFAULT, IPL_NONE);
490 	(void)vfs_busy(mp, NULL);
491 	TAILQ_INIT(&mp->mnt_vnodelist);
492 	mp->mnt_op = vfsp;
493 	mp->mnt_flag = MNT_RDONLY;
494 	mp->mnt_vnodecovered = NULL;
495 	(void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name,
496 	    sizeof(mp->mnt_stat.f_fstypename));
497 	mp->mnt_stat.f_mntonname[0] = '/';
498 	mp->mnt_stat.f_mntonname[1] = '\0';
499 	mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] =
500 	    '\0';
501 	(void)copystr(devname, mp->mnt_stat.f_mntfromname,
502 	    sizeof(mp->mnt_stat.f_mntfromname) - 1, 0);
503 	mount_initspecific(mp);
504 	*mpp = mp;
505 	return (0);
506 }
507 
508 /*
509  * Routines having to do with the management of the vnode table.
510  */
511 extern int (**dead_vnodeop_p)(void *);
512 
513 /*
514  * Return the next vnode from the free list.
515  */
516 int
517 getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
518 	    vnode_t **vpp)
519 {
520 	struct uvm_object *uobj;
521 	static int toggle;
522 	vnode_t *vp;
523 	int error = 0, tryalloc;
524 
525  try_again:
526 	if (mp != NULL) {
527 		/*
528 		 * Mark filesystem busy while we're creating a
529 		 * vnode.  If unmount is in progress, this will
530 		 * fail.
531 		 */
532 		error = vfs_busy(mp, NULL);
533 		if (error)
534 			return error;
535 	}
536 
537 	/*
538 	 * We must choose whether to allocate a new vnode or recycle an
539 	 * existing one. The criterion for allocating a new one is that
540 	 * the total number of vnodes is less than the number desired or
541 	 * there are no vnodes on either free list. Generally we only
542 	 * want to recycle vnodes that have no buffers associated with
543 	 * them, so we look first on the vnode_free_list. If it is empty,
544 	 * we next consider vnodes with referencing buffers on the
545 	 * vnode_hold_list. The toggle ensures that half the time we
546  * will use a vnode from the vnode_hold_list, and half the time
547 	 * we will allocate a new one unless the list has grown to twice
548  * the desired size. We are reluctant to recycle vnodes from the
549 	 * vnode_hold_list because we will lose the identity of all its
550 	 * referencing buffers.
551 	 */
552 
553 	vp = NULL;
554 
555 	mutex_enter(&vnode_free_list_lock);
556 
557 	toggle ^= 1;
558 	if (numvnodes > 2 * desiredvnodes)
559 		toggle = 0;
560 
561 	tryalloc = numvnodes < desiredvnodes ||
562 	    (TAILQ_FIRST(&vnode_free_list) == NULL &&
563 	     (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle));
564 
565 	if (tryalloc) {
566 		numvnodes++;
567 		mutex_exit(&vnode_free_list_lock);
568 		if ((vp = vnalloc(NULL)) == NULL) {
569 			mutex_enter(&vnode_free_list_lock);
570 			numvnodes--;
571 		} else
572 			vp->v_usecount = 1;
573 	}
574 
575 	if (vp == NULL) {
576 		vp = getcleanvnode();
577 		if (vp == NULL) {
578 			if (mp != NULL) {
579 				vfs_unbusy(mp, false, NULL);
580 			}
581 			if (tryalloc) {
582 				printf("WARNING: unable to allocate new "
583 				    "vnode, retrying...\n");
584 				kpause("newvn", false, hz, NULL);
585 				goto try_again;
586 			}
587 			tablefull("vnode", "increase kern.maxvnodes or NVNODE");
588 			*vpp = 0;
589 			return (ENFILE);
590 		}
591 		vp->v_iflag = 0;
592 		vp->v_vflag = 0;
593 		vp->v_uflag = 0;
594 		vp->v_socket = NULL;
595 	}
596 
597 	KASSERT(vp->v_usecount == 1);
598 	KASSERT(vp->v_freelisthd == NULL);
599 	KASSERT(LIST_EMPTY(&vp->v_nclist));
600 	KASSERT(LIST_EMPTY(&vp->v_dnclist));
601 
602 	vp->v_type = VNON;
603 	vp->v_vnlock = &vp->v_lock;
604 	vp->v_tag = tag;
605 	vp->v_op = vops;
606 	insmntque(vp, mp);
607 	*vpp = vp;
608 	vp->v_data = 0;
609 
610 	/*
611 	 * initialize uvm_object within vnode.
612 	 */
613 
614 	uobj = &vp->v_uobj;
615 	KASSERT(uobj->pgops == &uvm_vnodeops);
616 	KASSERT(uobj->uo_npages == 0);
617 	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
618 	vp->v_size = vp->v_writesize = VSIZENOTSET;
619 
620 	if (mp != NULL) {
621 		if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
622 			vp->v_vflag |= VV_MPSAFE;
623 		vfs_unbusy(mp, true, NULL);
624 	}
625 
626 	return (0);
627 }
628 
629 /*
630  * This is really just the reverse of getnewvnode(). Needed for
631  * VFS_VGET functions that may need to push back a vnode in case
632  * of a locking race.
633  */
634 void
635 ungetnewvnode(vnode_t *vp)
636 {
637 
638 	KASSERT(vp->v_usecount == 1);
639 	KASSERT(vp->v_data == NULL);
640 	KASSERT(vp->v_freelisthd == NULL);
641 
642 	mutex_enter(&vp->v_interlock);
643 	vp->v_iflag |= VI_CLEAN;
644 	vrelel(vp, 0);
645 }
646 
647 /*
648  * Allocate a new, uninitialized vnode.  If 'mp' is non-NULL, this is a
649  * marker vnode and we are prepared to wait for the allocation.
650  */
651 vnode_t *
652 vnalloc(struct mount *mp)
653 {
654 	vnode_t *vp;
655 
656 	vp = pool_cache_get(vnode_cache, (mp != NULL ? PR_WAITOK : PR_NOWAIT));
657 	if (vp == NULL) {
658 		return NULL;
659 	}
660 
661 	memset(vp, 0, sizeof(*vp));
662 	UVM_OBJ_INIT(&vp->v_uobj, &uvm_vnodeops, 0);
663 	cv_init(&vp->v_cv, "vnode");
664 	/*
665 	 * done by memset() above.
666 	 *	LIST_INIT(&vp->v_nclist);
667 	 *	LIST_INIT(&vp->v_dnclist);
668 	 */
669 
670 	if (mp != NULL) {
671 		vp->v_mount = mp;
672 		vp->v_type = VBAD;
673 		vp->v_iflag = VI_MARKER;
674 	} else {
675 		rw_init(&vp->v_lock.vl_lock);
676 	}
677 
678 	return vp;
679 }
680 
681 /*
682  * Free an unused, unreferenced vnode.
683  */
684 void
685 vnfree(vnode_t *vp)
686 {
687 
688 	KASSERT(vp->v_usecount == 0);
689 
690 	if ((vp->v_iflag & VI_MARKER) == 0) {
691 		rw_destroy(&vp->v_lock.vl_lock);
692 		mutex_enter(&vnode_free_list_lock);
693 		numvnodes--;
694 		mutex_exit(&vnode_free_list_lock);
695 	}
696 
697 	UVM_OBJ_DESTROY(&vp->v_uobj);
698 	cv_destroy(&vp->v_cv);
699 	pool_cache_put(vnode_cache, vp);
700 }
701 
702 /*
703  * Remove a vnode from its freelist.
704  */
705 static inline void
706 vremfree(vnode_t *vp)
707 {
708 
709 	KASSERT(mutex_owned(&vp->v_interlock));
710 	KASSERT(vp->v_usecount == 0);
711 
712 	/*
713 	 * Note that the reference count must not change until
714 	 * the vnode is removed.
715 	 */
716 	mutex_enter(&vnode_free_list_lock);
717 	if (vp->v_holdcnt > 0) {
718 		KASSERT(vp->v_freelisthd == &vnode_hold_list);
719 	} else {
720 		KASSERT(vp->v_freelisthd == &vnode_free_list);
721 	}
722 	TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
723 	vp->v_freelisthd = NULL;
724 	mutex_exit(&vnode_free_list_lock);
725 }
726 
727 /*
728  * Move a vnode from one mount queue to another.
729  */
730 static void
731 insmntque(vnode_t *vp, struct mount *mp)
732 {
733 	struct mount *omp;
734 
735 #ifdef DIAGNOSTIC
736 	if ((mp != NULL) &&
737 	    (mp->mnt_iflag & IMNT_UNMOUNT) &&
738 	    vp->v_tag != VT_VFS) {
739 		panic("insmntque into dying filesystem");
740 	}
741 #endif
742 
743 	mutex_enter(&mntvnode_lock);
744 	/*
745 	 * Delete from old mount point vnode list, if on one.
746 	 */
747 	if ((omp = vp->v_mount) != NULL)
748 		TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes);
749 	/*
750 	 * Insert into list of vnodes for the new mount point, if
751 	 * available.  The caller must take a reference on the mount
752 	 * structure and donate to the vnode.
753 	 */
754 	if ((vp->v_mount = mp) != NULL)
755 		TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
756 	mutex_exit(&mntvnode_lock);
757 
758 	if (omp != NULL) {
759 		/* Release reference to old mount. */
760 		vfs_destroy(omp);
761 	}
762 }
763 
764 /*
765  * Wait for a vnode (typically with VI_XLOCK set) to be cleaned or
766  * recycled.
767  */
768 void
769 vwait(vnode_t *vp, int flags)
770 {
771 
772 	KASSERT(mutex_owned(&vp->v_interlock));
773 	KASSERT(vp->v_usecount != 0);
774 
775 	while ((vp->v_iflag & flags) != 0)
776 		cv_wait(&vp->v_cv, &vp->v_interlock);
777 }
778 
779 /*
780  * Insert a marker vnode into a mount's vnode list, after the
781  * specified vnode.  mntvnode_lock must be held.
782  */
783 void
784 vmark(vnode_t *mvp, vnode_t *vp)
785 {
786 	struct mount *mp;
787 
788 	mp = mvp->v_mount;
789 
790 	KASSERT(mutex_owned(&mntvnode_lock));
791 	KASSERT((mvp->v_iflag & VI_MARKER) != 0);
792 	KASSERT(vp->v_mount == mp);
793 
794 	TAILQ_INSERT_AFTER(&mp->mnt_vnodelist, vp, mvp, v_mntvnodes);
795 }
796 
797 /*
798  * Remove a marker vnode from a mount's vnode list, and return
799  * a pointer to the next vnode in the list.  mntvnode_lock must
800  * be held.
801  */
802 vnode_t *
803 vunmark(vnode_t *mvp)
804 {
805 	vnode_t *vp;
806 	struct mount *mp;
807 
808 	mp = mvp->v_mount;
809 
810 	KASSERT(mutex_owned(&mntvnode_lock));
811 	KASSERT((mvp->v_iflag & VI_MARKER) != 0);
812 
813 	vp = TAILQ_NEXT(mvp, v_mntvnodes);
814 	TAILQ_REMOVE(&mp->mnt_vnodelist, mvp, v_mntvnodes);
815 
816 	KASSERT(vp == NULL || vp->v_mount == mp);
817 
818 	return vp;
819 }
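
/*
 * Illustrative sketch (editorial): vmark()/vunmark() let a caller walk
 * a mount's vnode list while dropping mntvnode_lock around per-vnode
 * work; the marker keeps its place even if neighbours are removed.
 * vflush() below uses exactly this pattern:
 *
 *	mvp = vnalloc(mp);			/* allocate a marker vnode *|
 *	mutex_enter(&mntvnode_lock);
 *	for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp != NULL;
 *	    vp = vunmark(mvp)) {
 *		vmark(mvp, vp);
 *		if (vp->v_mount != mp || vismarker(vp))
 *			continue;
 *		/* ... may drop and re-take mntvnode_lock around work on vp ... *|
 *	}
 *	mutex_exit(&mntvnode_lock);
 *	vnfree(mvp);
 */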
820 
821 /*
822  * Update outstanding I/O count and do wakeup if requested.
823  */
824 void
825 vwakeup(struct buf *bp)
826 {
827 	struct vnode *vp;
828 
829 	if ((vp = bp->b_vp) == NULL)
830 		return;
831 
832 	KASSERT(bp->b_objlock == &vp->v_interlock);
833 	KASSERT(mutex_owned(bp->b_objlock));
834 
835 	if (--vp->v_numoutput < 0)
836 		panic("vwakeup: neg numoutput, vp %p", vp);
837 	if (vp->v_numoutput == 0)
838 		cv_broadcast(&vp->v_cv);
839 }
840 
841 /*
842  * Flush out and invalidate all buffers associated with a vnode.
843  * Called with the underlying vnode locked, which should prevent new dirty
844  * buffers from being queued.
845  */
846 int
847 vinvalbuf(struct vnode *vp, int flags, kauth_cred_t cred, struct lwp *l,
848 	  bool catch, int slptimeo)
849 {
850 	struct buf *bp, *nbp;
851 	int error;
852 	int flushflags = PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO |
853 	    (flags & V_SAVE ? PGO_CLEANIT | PGO_RECLAIM : 0);
854 
855 	/* XXXUBC this doesn't look at flags or slp* */
856 	mutex_enter(&vp->v_interlock);
857 	error = VOP_PUTPAGES(vp, 0, 0, flushflags);
858 	if (error) {
859 		return error;
860 	}
861 
862 	if (flags & V_SAVE) {
863 		error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0);
864 		if (error)
865 		        return (error);
866 		KASSERT(LIST_EMPTY(&vp->v_dirtyblkhd));
867 	}
868 
869 	mutex_enter(&bufcache_lock);
870 restart:
871 	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
872 		nbp = LIST_NEXT(bp, b_vnbufs);
873 		error = bbusy(bp, catch, slptimeo, NULL);
874 		if (error != 0) {
875 			if (error == EPASSTHROUGH)
876 				goto restart;
877 			mutex_exit(&bufcache_lock);
878 			return (error);
879 		}
880 		brelsel(bp, BC_INVAL | BC_VFLUSH);
881 	}
882 
883 	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
884 		nbp = LIST_NEXT(bp, b_vnbufs);
885 		error = bbusy(bp, catch, slptimeo, NULL);
886 		if (error != 0) {
887 			if (error == EPASSTHROUGH)
888 				goto restart;
889 			mutex_exit(&bufcache_lock);
890 			return (error);
891 		}
892 		/*
893 		 * XXX Since there are no node locks for NFS, I believe
894 		 * there is a slight chance that a delayed write will
895 		 * occur while sleeping just above, so check for it.
896 		 */
897 		if ((bp->b_oflags & BO_DELWRI) && (flags & V_SAVE)) {
898 #ifdef DEBUG
899 			printf("buffer still DELWRI\n");
900 #endif
901 			bp->b_cflags |= BC_BUSY | BC_VFLUSH;
902 			mutex_exit(&bufcache_lock);
903 			VOP_BWRITE(bp);
904 			mutex_enter(&bufcache_lock);
905 			goto restart;
906 		}
907 		brelsel(bp, BC_INVAL | BC_VFLUSH);
908 	}
909 
910 #ifdef DIAGNOSTIC
911 	if (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd))
912 		panic("vinvalbuf: flush failed, vp %p", vp);
913 #endif
914 
915 	mutex_exit(&bufcache_lock);
916 
917 	return (0);
918 }
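
/*
 * Usage sketch (editorial; the call sites are hypothetical): V_SAVE asks
 * for dirty data to be written back before the buffers are invalidated,
 * whereas a zero flags argument simply discards them:
 *
 *	error = vinvalbuf(vp, V_SAVE, cred, curlwp, false, 0);	/* sync, then toss *|
 *	...
 *	error = vinvalbuf(vp, 0, cred, curlwp, false, 0);	/* just toss *|
 */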
919 
920 /*
921  * Destroy any in core blocks past the truncation length.
922  * Called with the underlying vnode locked, which should prevent new dirty
923  * buffers from being queued.
924  */
925 int
926 vtruncbuf(struct vnode *vp, daddr_t lbn, bool catch, int slptimeo)
927 {
928 	struct buf *bp, *nbp;
929 	int error;
930 	voff_t off;
931 
932 	off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift);
933 	mutex_enter(&vp->v_interlock);
934 	error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO);
935 	if (error) {
936 		return error;
937 	}
938 
939 	mutex_enter(&bufcache_lock);
940 restart:
941 	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
942 		nbp = LIST_NEXT(bp, b_vnbufs);
943 		if (bp->b_lblkno < lbn)
944 			continue;
945 		error = bbusy(bp, catch, slptimeo, NULL);
946 		if (error != 0) {
947 			if (error == EPASSTHROUGH)
948 				goto restart;
949 			mutex_exit(&bufcache_lock);
950 			return (error);
951 		}
952 		brelsel(bp, BC_INVAL | BC_VFLUSH);
953 	}
954 
955 	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
956 		nbp = LIST_NEXT(bp, b_vnbufs);
957 		if (bp->b_lblkno < lbn)
958 			continue;
959 		error = bbusy(bp, catch, slptimeo, NULL);
960 		if (error != 0) {
961 			if (error == EPASSTHROUGH)
962 				goto restart;
963 			mutex_exit(&bufcache_lock);
964 			return (error);
965 		}
966 		brelsel(bp, BC_INVAL | BC_VFLUSH);
967 	}
968 	mutex_exit(&bufcache_lock);
969 
970 	return (0);
971 }
972 
973 /*
974  * Flush all dirty buffers from a vnode.
975  * Called with the underlying vnode locked, which should prevent new dirty
976  * buffers from being queued.
977  */
978 void
979 vflushbuf(struct vnode *vp, int sync)
980 {
981 	struct buf *bp, *nbp;
982 	int flags = PGO_CLEANIT | PGO_ALLPAGES | (sync ? PGO_SYNCIO : 0);
983 	bool dirty;
984 
985 	mutex_enter(&vp->v_interlock);
986 	(void) VOP_PUTPAGES(vp, 0, 0, flags);
987 
988 loop:
989 	mutex_enter(&bufcache_lock);
990 	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
991 		nbp = LIST_NEXT(bp, b_vnbufs);
992 		if ((bp->b_cflags & BC_BUSY))
993 			continue;
994 		if ((bp->b_oflags & BO_DELWRI) == 0)
995 			panic("vflushbuf: not dirty, bp %p", bp);
996 		bp->b_cflags |= BC_BUSY | BC_VFLUSH;
997 		mutex_exit(&bufcache_lock);
998 		/*
999 		 * Wait for I/O associated with indirect blocks to complete,
1000 		 * since there is no way to quickly wait for them below.
1001 		 */
1002 		if (bp->b_vp == vp || sync == 0)
1003 			(void) bawrite(bp);
1004 		else
1005 			(void) bwrite(bp);
1006 		goto loop;
1007 	}
1008 	mutex_exit(&bufcache_lock);
1009 
1010 	if (sync == 0)
1011 		return;
1012 
1013 	mutex_enter(&vp->v_interlock);
1014 	while (vp->v_numoutput != 0)
1015 		cv_wait(&vp->v_cv, &vp->v_interlock);
1016 	dirty = !LIST_EMPTY(&vp->v_dirtyblkhd);
1017 	mutex_exit(&vp->v_interlock);
1018 
1019 	if (dirty) {
1020 		vprint("vflushbuf: dirty", vp);
1021 		goto loop;
1022 	}
1023 }
1024 
1025 /*
1026  * Create a vnode for a block device.
1027  * Used for root filesystem and swap areas.
1028  * Also used for memory file system special devices.
1029  */
1030 int
1031 bdevvp(dev_t dev, vnode_t **vpp)
1032 {
1033 
1034 	return (getdevvp(dev, vpp, VBLK));
1035 }
1036 
1037 /*
1038  * Create a vnode for a character device.
1039  * Used for kernfs and some console handling.
1040  */
1041 int
1042 cdevvp(dev_t dev, vnode_t **vpp)
1043 {
1044 
1045 	return (getdevvp(dev, vpp, VCHR));
1046 }
1047 
1048 /*
1049  * Associate a buffer with a vnode.  There must already be a hold on
1050  * the vnode.
1051  */
1052 void
1053 bgetvp(struct vnode *vp, struct buf *bp)
1054 {
1055 
1056 	KASSERT(bp->b_vp == NULL);
1057 	KASSERT(bp->b_objlock == &buffer_lock);
1058 	KASSERT(mutex_owned(&vp->v_interlock));
1059 	KASSERT(mutex_owned(&bufcache_lock));
1060 	KASSERT((bp->b_cflags & BC_BUSY) != 0);
1061 	KASSERT(!cv_has_waiters(&bp->b_done));
1062 
1063 	vholdl(vp);
1064 	bp->b_vp = vp;
1065 	if (vp->v_type == VBLK || vp->v_type == VCHR)
1066 		bp->b_dev = vp->v_rdev;
1067 	else
1068 		bp->b_dev = NODEV;
1069 
1070 	/*
1071 	 * Insert onto list for new vnode.
1072 	 */
1073 	bufinsvn(bp, &vp->v_cleanblkhd);
1074 	bp->b_objlock = &vp->v_interlock;
1075 }
1076 
1077 /*
1078  * Disassociate a buffer from a vnode.
1079  */
1080 void
1081 brelvp(struct buf *bp)
1082 {
1083 	struct vnode *vp = bp->b_vp;
1084 
1085 	KASSERT(vp != NULL);
1086 	KASSERT(bp->b_objlock == &vp->v_interlock);
1087 	KASSERT(mutex_owned(&vp->v_interlock));
1088 	KASSERT(mutex_owned(&bufcache_lock));
1089 	KASSERT((bp->b_cflags & BC_BUSY) != 0);
1090 	KASSERT(!cv_has_waiters(&bp->b_done));
1091 
1092 	/*
1093 	 * Delete from old vnode list, if on one.
1094 	 */
1095 	if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
1096 		bufremvn(bp);
1097 
1098 	if (TAILQ_EMPTY(&vp->v_uobj.memq) && (vp->v_iflag & VI_ONWORKLST) &&
1099 	    LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
1100 		vp->v_iflag &= ~VI_WRMAPDIRTY;
1101 		vn_syncer_remove_from_worklist(vp);
1102 	}
1103 
1104 	bp->b_objlock = &buffer_lock;
1105 	bp->b_vp = NULL;
1106 	holdrelel(vp);
1107 }
1108 
1109 /*
1110  * Reassign a buffer from one vnode list to another.
1111  * The list reassignment must be within the same vnode.
1112  * Used to assign file specific control information
1113  * (indirect blocks) to the list to which they belong.
1114  */
1115 void
1116 reassignbuf(struct buf *bp, struct vnode *vp)
1117 {
1118 	struct buflists *listheadp;
1119 	int delayx;
1120 
1121 	KASSERT(mutex_owned(&bufcache_lock));
1122 	KASSERT(bp->b_objlock == &vp->v_interlock);
1123 	KASSERT(mutex_owned(&vp->v_interlock));
1124 	KASSERT((bp->b_cflags & BC_BUSY) != 0);
1125 
1126 	/*
1127 	 * Delete from old vnode list, if on one.
1128 	 */
1129 	if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
1130 		bufremvn(bp);
1131 
1132 	/*
1133 	 * If dirty, put on list of dirty buffers;
1134 	 * otherwise insert onto list of clean buffers.
1135 	 */
1136 	if ((bp->b_oflags & BO_DELWRI) == 0) {
1137 		listheadp = &vp->v_cleanblkhd;
1138 		if (TAILQ_EMPTY(&vp->v_uobj.memq) &&
1139 		    (vp->v_iflag & VI_ONWORKLST) &&
1140 		    LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
1141 			vp->v_iflag &= ~VI_WRMAPDIRTY;
1142 			vn_syncer_remove_from_worklist(vp);
1143 		}
1144 	} else {
1145 		listheadp = &vp->v_dirtyblkhd;
1146 		if ((vp->v_iflag & VI_ONWORKLST) == 0) {
1147 			switch (vp->v_type) {
1148 			case VDIR:
1149 				delayx = dirdelay;
1150 				break;
1151 			case VBLK:
1152 				if (vp->v_specmountpoint != NULL) {
1153 					delayx = metadelay;
1154 					break;
1155 				}
1156 				/* fall through */
1157 			default:
1158 				delayx = filedelay;
1159 				break;
1160 			}
1161 			if (!vp->v_mount ||
1162 			    (vp->v_mount->mnt_flag & MNT_ASYNC) == 0)
1163 				vn_syncer_add_to_worklist(vp, delayx);
1164 		}
1165 	}
1166 	bufinsvn(bp, listheadp);
1167 }
1168 
1169 /*
1170  * Create a vnode for a device.
1171  * Used by bdevvp (block device) for root file system etc.,
1172  * and by cdevvp (character device) for console and kernfs.
1173  */
1174 static int
1175 getdevvp(dev_t dev, vnode_t **vpp, enum vtype type)
1176 {
1177 	vnode_t *vp;
1178 	vnode_t *nvp;
1179 	int error;
1180 
1181 	if (dev == NODEV) {
1182 		*vpp = NULL;
1183 		return (0);
1184 	}
1185 	error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp);
1186 	if (error) {
1187 		*vpp = NULL;
1188 		return (error);
1189 	}
1190 	vp = nvp;
1191 	vp->v_type = type;
1192 	vp->v_vflag |= VV_MPSAFE;
1193 	uvm_vnp_setsize(vp, 0);
1194 	spec_node_init(vp, dev);
1195 	*vpp = vp;
1196 	return (0);
1197 }
1198 
1199 /*
1200  * Try to gain a reference to a vnode, without acquiring its interlock.
1201  * The caller must hold a lock that will prevent the vnode from being
1202  * recycled or freed.
1203  */
1204 bool
1205 vtryget(vnode_t *vp)
1206 {
1207 	u_int use, next;
1208 
1209 	/*
1210 	 * If the vnode is being freed, don't make life any harder
1211 	 * for vclean() by adding another reference without waiting.
1212 	 * This is not strictly necessary, but we'll do it anyway.
1213 	 */
1214 	if (__predict_false((vp->v_iflag & (VI_XLOCK | VI_FREEING)) != 0)) {
1215 		return false;
1216 	}
1217 	for (use = vp->v_usecount;; use = next) {
1218 		if (use == 0) {
1219 			/* Need interlock held if first reference. */
1220 			return false;
1221 		}
1222 		next = atomic_cas_uint(&vp->v_usecount, use, use + 1);
1223 		if (__predict_true(next == use)) {
1224 			return true;
1225 		}
1226 	}
1227 }
1228 
1229 /*
1230  * Grab a particular vnode from the free list, increment its
1231  * reference count and lock it. If the vnode lock bit is set the
1232  * vnode is being eliminated in vgone. In that case, we cannot
1233  * grab the vnode, so the process is awakened when the transition is
1234  * completed, and an error is returned to indicate that the vnode is no
1235  * longer usable (possibly having been changed to a new file system type).
1236  */
1237 int
1238 vget(vnode_t *vp, int flags)
1239 {
1240 	int error;
1241 
1242 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
1243 
1244 	if ((flags & LK_INTERLOCK) == 0)
1245 		mutex_enter(&vp->v_interlock);
1246 
1247 	/*
1248 	 * Before adding a reference, we must remove the vnode
1249 	 * from its freelist.
1250 	 */
1251 	if (vp->v_usecount == 0) {
1252 		vremfree(vp);
1253 		vp->v_usecount = 1;
1254 	} else {
1255 		atomic_inc_uint(&vp->v_usecount);
1256 	}
1257 
1258 	/*
1259 	 * If the vnode is in the process of being cleaned out for
1260 	 * another use, we wait for the cleaning to finish and then
1261 	 * return failure.  Cleaning is determined by checking if
1262 	 * the VI_XLOCK or VI_FREEING flags are set.
1263 	 */
1264 	if ((vp->v_iflag & (VI_XLOCK | VI_FREEING)) != 0) {
1265 		if ((flags & LK_NOWAIT) != 0) {
1266 			vrelel(vp, 0);
1267 			return EBUSY;
1268 		}
1269 		vwait(vp, VI_XLOCK | VI_FREEING);
1270 		vrelel(vp, 0);
1271 		return ENOENT;
1272 	}
1273 	if (flags & LK_TYPE_MASK) {
1274 		error = vn_lock(vp, flags | LK_INTERLOCK);
1275 		if (error != 0) {
1276 			vrele(vp);
1277 		}
1278 		return error;
1279 	}
1280 	mutex_exit(&vp->v_interlock);
1281 	return 0;
1282 }
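
/*
 * Usage sketch (editorial, not part of the original source): a caller
 * that already holds something preventing the vnode from being recycled
 * typically gains a reference and the vnode lock in one step, and drops
 * both again with vput():
 *
 *	if ((error = vget(vp, LK_EXCLUSIVE)) != 0)
 *		return error;		/* vnode was cleaned out under us *|
 *	/* ... use the referenced, locked vnode ... *|
 *	vput(vp);			/* VOP_UNLOCK() followed by vrele() *|
 */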
1283 
1284 /*
1285  * vput(), just unlock and vrele()
1286  */
1287 void
1288 vput(vnode_t *vp)
1289 {
1290 
1291 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
1292 
1293 	VOP_UNLOCK(vp, 0);
1294 	vrele(vp);
1295 }
1296 
1297 /*
1298  * Try to drop a reference on a vnode.  Abort if we are releasing the
1299  * last reference.  Note: this _must_ succeed if not the last reference.
1300  */
1301 static inline bool
1302 vtryrele(vnode_t *vp)
1303 {
1304 	u_int use, next;
1305 
1306 	for (use = vp->v_usecount;; use = next) {
1307 		if (use == 1) {
1308 			return false;
1309 		}
1310 		next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
1311 		if (__predict_true(next == use)) {
1312 			return true;
1313 		}
1314 	}
1315 }
1316 
1317 /*
1318  * Vnode release.  If reference count drops to zero, call inactive
1319  * routine and either return to freelist or free to the pool.
1320  */
1321 void
1322 vrelel(vnode_t *vp, int flags)
1323 {
1324 	bool recycle, defer;
1325 	int error;
1326 
1327 	KASSERT(mutex_owned(&vp->v_interlock));
1328 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
1329 	KASSERT(vp->v_freelisthd == NULL);
1330 
1331 	if (__predict_false(vp->v_op == dead_vnodeop_p &&
1332 	    (vp->v_iflag & (VI_CLEAN|VI_XLOCK)) == 0)) {
1333 		vpanic(vp, "dead but not clean");
1334 	}
1335 
1336 	/*
1337 	 * If not the last reference, just drop the reference count
1338 	 * and unlock.
1339 	 */
1340 	if (vtryrele(vp)) {
1341 		vp->v_iflag |= VI_INACTREDO;
1342 		mutex_exit(&vp->v_interlock);
1343 		return;
1344 	}
1345 	if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
1346 		vpanic(vp, "vrelel: bad ref count");
1347 	}
1348 
1349 	KASSERT((vp->v_iflag & VI_XLOCK) == 0);
1350 
1351 	/*
1352 	 * If not clean, deactivate the vnode, but preserve
1353 	 * our reference across the call to VOP_INACTIVE().
1354 	 */
1355  retry:
1356 	if ((vp->v_iflag & VI_CLEAN) == 0) {
1357 		recycle = false;
1358 		vp->v_iflag |= VI_INACTNOW;
1359 
1360 		/*
1361 		 * XXX This ugly block can be largely eliminated if
1362 		 * locking is pushed down into the file systems.
1363 		 */
1364 		if (curlwp == uvm.pagedaemon_lwp) {
1365 			/* The pagedaemon can't wait around; defer. */
1366 			defer = true;
1367 		} else if (curlwp == vrele_lwp) {
1368 			/* We have to try harder. */
1369 			vp->v_iflag &= ~VI_INACTREDO;
1370 			error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK |
1371 			    LK_RETRY);
1372 			if (error != 0) {
1373 				/* XXX */
1374 				vpanic(vp, "vrele: unable to lock vnode");
1375 			}
1376 			defer = false;
1377 		} else if ((vp->v_iflag & VI_LAYER) != 0) {
1378 			/*
1379 			 * Acquiring the stack's lock in vclean() even
1380 			 * for an honest vput/vrele is dangerous because
1381 			 * our caller may hold other vnode locks; defer.
1382 			 */
1383 			defer = true;
1384 		} else {
1385 			/* If we can't acquire the lock, then defer. */
1386 			vp->v_iflag &= ~VI_INACTREDO;
1387 			error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK |
1388 			    LK_NOWAIT);
1389 			if (error != 0) {
1390 				defer = true;
1391 				mutex_enter(&vp->v_interlock);
1392 			} else {
1393 				defer = false;
1394 			}
1395 		}
1396 
1397 		if (defer) {
1398 			/*
1399 			 * Defer reclaim to the kthread; it's not safe to
1400 			 * clean it here.  We donate it our last reference.
1401 			 */
1402 			KASSERT(mutex_owned(&vp->v_interlock));
1403 			KASSERT((vp->v_iflag & VI_INACTPEND) == 0);
1404 			vp->v_iflag &= ~VI_INACTNOW;
1405 			vp->v_iflag |= VI_INACTPEND;
1406 			mutex_enter(&vrele_lock);
1407 			TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist);
1408 			if (++vrele_pending > (desiredvnodes >> 8))
1409 				cv_signal(&vrele_cv);
1410 			mutex_exit(&vrele_lock);
1411 			mutex_exit(&vp->v_interlock);
1412 			return;
1413 		}
1414 
1415 #ifdef DIAGNOSTIC
1416 		if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
1417 		    vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
1418 			vprint("vrelel: missing VOP_CLOSE()", vp);
1419 		}
1420 #endif
1421 
1422 		/*
1423 		 * The vnode can gain another reference while being
1424 		 * deactivated.  If VOP_INACTIVE() indicates that
1425 		 * the described file has been deleted, then recycle
1426 		 * the vnode irrespective of additional references.
1427 		 * Another thread may be waiting to re-use the on-disk
1428 		 * inode.
1429 		 *
1430 		 * Note that VOP_INACTIVE() will drop the vnode lock.
1431 		 */
1432 		VOP_INACTIVE(vp, &recycle);
1433 		mutex_enter(&vp->v_interlock);
1434 		vp->v_iflag &= ~VI_INACTNOW;
1435 		if (!recycle) {
1436 			if (vtryrele(vp)) {
1437 				mutex_exit(&vp->v_interlock);
1438 				return;
1439 			}
1440 
1441 			/*
1442 			 * If we grew another reference while
1443 			 * VOP_INACTIVE() was underway, retry.
1444 			 */
1445 			if ((vp->v_iflag & VI_INACTREDO) != 0) {
1446 				goto retry;
1447 			}
1448 		}
1449 
1450 		/* Take care of space accounting. */
1451 		if (vp->v_iflag & VI_EXECMAP) {
1452 			atomic_add_int(&uvmexp.execpages,
1453 			    -vp->v_uobj.uo_npages);
1454 			atomic_add_int(&uvmexp.filepages,
1455 			    vp->v_uobj.uo_npages);
1456 		}
1457 		vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
1458 		vp->v_vflag &= ~VV_MAPPED;
1459 
1460 		/*
1461 		 * Recycle the vnode if the file is now unused (unlinked),
1462 		 * otherwise just free it.
1463 		 */
1464 		if (recycle) {
1465 			vclean(vp, DOCLOSE);
1466 		}
1467 		KASSERT(vp->v_usecount > 0);
1468 	}
1469 
1470 	if (atomic_dec_uint_nv(&vp->v_usecount) != 0) {
1471 		/* Gained another reference while being reclaimed. */
1472 		mutex_exit(&vp->v_interlock);
1473 		return;
1474 	}
1475 
1476 	if ((vp->v_iflag & VI_CLEAN) != 0) {
1477 		/*
1478 		 * It's clean so destroy it.  It isn't referenced
1479 		 * anywhere since it has been reclaimed.
1480 		 */
1481 		KASSERT(vp->v_holdcnt == 0);
1482 		KASSERT(vp->v_writecount == 0);
1483 		mutex_exit(&vp->v_interlock);
1484 		insmntque(vp, NULL);
1485 		if (vp->v_type == VBLK || vp->v_type == VCHR) {
1486 			spec_node_destroy(vp);
1487 		}
1488 		vnfree(vp);
1489 	} else {
1490 		/*
1491 		 * Otherwise, put it back onto the freelist.  It
1492 		 * can't be destroyed while still associated with
1493 		 * a file system.
1494 		 */
1495 		mutex_enter(&vnode_free_list_lock);
1496 		if (vp->v_holdcnt > 0) {
1497 			vp->v_freelisthd = &vnode_hold_list;
1498 		} else {
1499 			vp->v_freelisthd = &vnode_free_list;
1500 		}
1501 		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
1502 		mutex_exit(&vnode_free_list_lock);
1503 		mutex_exit(&vp->v_interlock);
1504 	}
1505 }
1506 
1507 void
1508 vrele(vnode_t *vp)
1509 {
1510 
1511 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
1512 
1513 	if ((vp->v_iflag & VI_INACTNOW) == 0 && vtryrele(vp)) {
1514 		return;
1515 	}
1516 	mutex_enter(&vp->v_interlock);
1517 	vrelel(vp, 0);
1518 }
1519 
1520 static void
1521 vrele_thread(void *cookie)
1522 {
1523 	vnode_t *vp;
1524 
1525 	for (;;) {
1526 		mutex_enter(&vrele_lock);
1527 		while (TAILQ_EMPTY(&vrele_list)) {
1528 			vrele_gen++;
1529 			cv_broadcast(&vrele_cv);
1530 			cv_timedwait(&vrele_cv, &vrele_lock, hz);
1531 		}
1532 		vp = TAILQ_FIRST(&vrele_list);
1533 		TAILQ_REMOVE(&vrele_list, vp, v_freelist);
1534 		vrele_pending--;
1535 		mutex_exit(&vrele_lock);
1536 
1537 		/*
1538 		 * If not the last reference, then ignore the vnode
1539 		 * and look for more work.
1540 		 */
1541 		mutex_enter(&vp->v_interlock);
1542 		KASSERT((vp->v_iflag & VI_INACTPEND) != 0);
1543 		vp->v_iflag &= ~VI_INACTPEND;
1544 		vrelel(vp, 0);
1545 	}
1546 }
1547 
1548 /*
1549  * Page or buffer structure gets a reference.
1550  * Called with v_interlock held.
1551  */
1552 void
1553 vholdl(vnode_t *vp)
1554 {
1555 
1556 	KASSERT(mutex_owned(&vp->v_interlock));
1557 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
1558 
1559 	if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) {
1560 		mutex_enter(&vnode_free_list_lock);
1561 		KASSERT(vp->v_freelisthd == &vnode_free_list);
1562 		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
1563 		vp->v_freelisthd = &vnode_hold_list;
1564 		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
1565 		mutex_exit(&vnode_free_list_lock);
1566 	}
1567 }
1568 
1569 /*
1570  * Page or buffer structure frees a reference.
1571  * Called with v_interlock held.
1572  */
1573 void
1574 holdrelel(vnode_t *vp)
1575 {
1576 
1577 	KASSERT(mutex_owned(&vp->v_interlock));
1578 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
1579 
1580 	if (vp->v_holdcnt <= 0) {
1581 		vpanic(vp, "holdrelel: bad holdcnt");
1582 	}
1583 
1584 	vp->v_holdcnt--;
1585 	if (vp->v_holdcnt == 0 && vp->v_usecount == 0) {
1586 		mutex_enter(&vnode_free_list_lock);
1587 		KASSERT(vp->v_freelisthd == &vnode_hold_list);
1588 		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
1589 		vp->v_freelisthd = &vnode_free_list;
1590 		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
1591 		mutex_exit(&vnode_free_list_lock);
1592 	}
1593 }
1594 
1595 /*
1596  * Vnode reference, where a reference is already held by some other
1597  * object (for example, a file structure).
1598  */
1599 void
1600 vref(vnode_t *vp)
1601 {
1602 
1603 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
1604 	KASSERT(vp->v_usecount != 0);
1605 
1606 	atomic_inc_uint(&vp->v_usecount);
1607 }
1608 
1609 /*
1610  * Remove any vnodes in the vnode table belonging to mount point mp.
1611  *
1612  * If FORCECLOSE is not specified, there should not be any active vnodes;
1613  * return an error if any are found (nb: this is a user error, not a
1614  * system error). If FORCECLOSE is specified, detach any active vnodes
1615  * that are found.
1616  *
1617  * If WRITECLOSE is set, only flush out regular file vnodes open for
1618  * writing.
1619  *
1620  * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
1621  */
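
/*
 * Example (editorial sketch): a file system's unmount path typically
 * skips its own special vnodes and only forces the issue when MNT_FORCE
 * was given, e.g.
 *
 *	flags = SKIPSYSTEM | ((mntflags & MNT_FORCE) ? FORCECLOSE : 0);
 *	if ((error = vflush(mp, NULLVP, flags)) != 0)
 *		return error;
 */
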
1622 #ifdef DEBUG
1623 int busyprt = 0;	/* print out busy vnodes */
1624 struct ctldebug debug1 = { "busyprt", &busyprt };
1625 #endif
1626 
1627 static vnode_t *
1628 vflushnext(vnode_t *mvp, int *when)
1629 {
1630 
1631 	if (hardclock_ticks > *when) {
1632 		mutex_exit(&mntvnode_lock);
1633 		yield();
1634 		mutex_enter(&mntvnode_lock);
1635 		*when = hardclock_ticks + hz / 10;
1636 	}
1637 
1638 	return vunmark(mvp);
1639 }
1640 
1641 int
1642 vflush(struct mount *mp, vnode_t *skipvp, int flags)
1643 {
1644 	vnode_t *vp, *mvp;
1645 	int busy = 0, when = 0, gen;
1646 
1647 	/*
1648 	 * First, flush out any vnode references from vrele_list.
1649 	 */
1650 	mutex_enter(&vrele_lock);
1651 	gen = vrele_gen;
1652 	while (vrele_pending && gen == vrele_gen) {
1653 		cv_broadcast(&vrele_cv);
1654 		cv_wait(&vrele_cv, &vrele_lock);
1655 	}
1656 	mutex_exit(&vrele_lock);
1657 
1658 	/* Allocate a marker vnode. */
1659 	if ((mvp = vnalloc(mp)) == NULL)
1660 		return (ENOMEM);
1661 
1662 	/*
1663 	 * NOTE: not using TAILQ_FOREACH here, since vgone() and vclean()
1664 	 * are called within this loop and may modify the vnode list.
1665 	 */
1666 	mutex_enter(&mntvnode_lock);
1667 	for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp != NULL;
1668 	    vp = vflushnext(mvp, &when)) {
1669 		vmark(mvp, vp);
1670 		if (vp->v_mount != mp || vismarker(vp))
1671 			continue;
1672 		/*
1673 		 * Skip over a selected vnode.
1674 		 */
1675 		if (vp == skipvp)
1676 			continue;
1677 		mutex_enter(&vp->v_interlock);
1678 		/*
1679 		 * Ignore clean but still referenced vnodes.
1680 		 */
1681 		if ((vp->v_iflag & VI_CLEAN) != 0) {
1682 			mutex_exit(&vp->v_interlock);
1683 			continue;
1684 		}
1685 		/*
1686 		 * Skip over vnodes marked VV_SYSTEM.
1687 		 */
1688 		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
1689 			mutex_exit(&vp->v_interlock);
1690 			continue;
1691 		}
1692 		/*
1693 		 * If WRITECLOSE is set, only flush out regular file
1694 		 * vnodes open for writing.
1695 		 */
1696 		if ((flags & WRITECLOSE) &&
1697 		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
1698 			mutex_exit(&vp->v_interlock);
1699 			continue;
1700 		}
1701 		/*
1702 		 * With v_usecount == 0, all we need to do is clear
1703 		 * out the vnode data structures and we are done.
1704 		 */
1705 		if (vp->v_usecount == 0) {
1706 			mutex_exit(&mntvnode_lock);
1707 			vremfree(vp);
1708 			vp->v_usecount = 1;
1709 			vclean(vp, DOCLOSE);
1710 			vrelel(vp, 0);
1711 			mutex_enter(&mntvnode_lock);
1712 			continue;
1713 		}
1714 		/*
1715 		 * If FORCECLOSE is set, forcibly close the vnode.
1716 		 * For block or character devices, revert to an
1717 		 * anonymous device.  For all other files, just
1718 		 * kill them.
1719 		 */
1720 		if (flags & FORCECLOSE) {
1721 			mutex_exit(&mntvnode_lock);
1722 			atomic_inc_uint(&vp->v_usecount);
1723 			if (vp->v_type != VBLK && vp->v_type != VCHR) {
1724 				vclean(vp, DOCLOSE);
1725 				vrelel(vp, 0);
1726 			} else {
1727 				vclean(vp, 0);
1728 				vp->v_op = spec_vnodeop_p; /* XXXSMP */
1729 				mutex_exit(&vp->v_interlock);
1730 				/*
1731 				 * The vnode isn't clean, but still resides
1732 				 * on the mount list.  Remove it. XXX This
1733 				 * is a bit dodgy.
1734 				 */
1735 				insmntque(vp, NULL);
1736 				vrele(vp);
1737 			}
1738 			mutex_enter(&mntvnode_lock);
1739 			continue;
1740 		}
1741 #ifdef DEBUG
1742 		if (busyprt)
1743 			vprint("vflush: busy vnode", vp);
1744 #endif
1745 		mutex_exit(&vp->v_interlock);
1746 		busy++;
1747 	}
1748 	mutex_exit(&mntvnode_lock);
1749 	vnfree(mvp);
1750 	if (busy)
1751 		return (EBUSY);
1752 	return (0);
1753 }
1754 
1755 /*
1756  * Disassociate the underlying file system from a vnode.
1757  *
1758  * Must be called with the interlock held, and will return with it held.
1759  */
1760 void
1761 vclean(vnode_t *vp, int flags)
1762 {
1763 	lwp_t *l = curlwp;
1764 	bool recycle, active;
1765 	int error;
1766 
1767 	KASSERT(mutex_owned(&vp->v_interlock));
1768 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
1769 	KASSERT(vp->v_usecount != 0);
1770 
1771 	/* If cleaning is already in progress wait until done and return. */
1772 	if (vp->v_iflag & VI_XLOCK) {
1773 		vwait(vp, VI_XLOCK);
1774 		return;
1775 	}
1776 
1777 	/* If already clean, nothing to do. */
1778 	if ((vp->v_iflag & VI_CLEAN) != 0) {
1779 		return;
1780 	}
1781 
1782 	/*
1783 	 * Prevent the vnode from being recycled or brought into use
1784 	 * while we clean it out.
1785 	 */
1786 	vp->v_iflag |= VI_XLOCK;
1787 	if (vp->v_iflag & VI_EXECMAP) {
1788 		atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
1789 		atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
1790 	}
1791 	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
1792 	active = (vp->v_usecount > 1);
1793 
1794 	/* XXXAD should not lock vnode under layer */
1795 	VOP_LOCK(vp, LK_EXCLUSIVE | LK_INTERLOCK);
1796 
1797 	/*
1798 	 * Clean out any cached data associated with the vnode.
1799 	 * If purging an active vnode, it must be closed and
1800 	 * deactivated before being reclaimed. Note that the
1801 	 * VOP_INACTIVE will unlock the vnode.
1802 	 */
1803 	if (flags & DOCLOSE) {
1804 		error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
1805 		if (error != 0) {
1806 			/* XXX, fix vn_start_write's grab of mp and use that. */
1807 
1808 			if (wapbl_vphaswapbl(vp))
1809 				WAPBL_DISCARD(wapbl_vptomp(vp));
1810 			error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
1811 		}
1812 		KASSERT(error == 0);
1813 		KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1814 		if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
1815 			 spec_node_revoke(vp);
1816 		}
1817 	}
1818 	if (active) {
1819 		VOP_INACTIVE(vp, &recycle);
1820 	} else {
1821 		/*
1822 		 * Any other processes trying to obtain this lock must first
1823 		 * wait for VI_XLOCK to clear, then call the new lock operation.
1824 		 */
1825 		VOP_UNLOCK(vp, 0);
1826 	}
1827 
1828 	/* Disassociate the underlying file system from the vnode. */
1829 	if (VOP_RECLAIM(vp)) {
1830 		vpanic(vp, "vclean: cannot reclaim");
1831 	}
1832 
1833 	KASSERT(vp->v_uobj.uo_npages == 0);
1834 	if (vp->v_type == VREG && vp->v_ractx != NULL) {
1835 		uvm_ra_freectx(vp->v_ractx);
1836 		vp->v_ractx = NULL;
1837 	}
1838 	cache_purge(vp);
1839 
1840 	/* Done with purge, notify sleepers of the grim news. */
1841 	mutex_enter(&vp->v_interlock);
1842 	vp->v_op = dead_vnodeop_p;
1843 	vp->v_tag = VT_NON;
1844 	vp->v_vnlock = &vp->v_lock;
1845 	KNOTE(&vp->v_klist, NOTE_REVOKE);
1846 	vp->v_iflag &= ~(VI_XLOCK | VI_FREEING);
1847 	vp->v_vflag &= ~VV_LOCKSWORK;
1848 	if ((flags & DOCLOSE) != 0) {
1849 		vp->v_iflag |= VI_CLEAN;
1850 	}
1851 	cv_broadcast(&vp->v_cv);
1852 
1853 	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1854 }
1855 
1856 /*
1857  * Recycle an unused vnode to the front of the free list.
1858  * Release the passed interlock if the vnode will be recycled.
1859  */
1860 int
1861 vrecycle(vnode_t *vp, kmutex_t *inter_lkp, struct lwp *l)
1862 {
1863 
1864 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
1865 
1866 	mutex_enter(&vp->v_interlock);
1867 	if (vp->v_usecount != 0) {
1868 		mutex_exit(&vp->v_interlock);
1869 		return (0);
1870 	}
1871 	if (inter_lkp)
1872 		mutex_exit(inter_lkp);
1873 	vremfree(vp);
1874 	vp->v_usecount = 1;
1875 	vclean(vp, DOCLOSE);
1876 	vrelel(vp, 0);
1877 	return (1);
1878 }
1879 
1880 /*
1881  * Eliminate all activity associated with a vnode in preparation for
1882  * reuse.  Drops a reference from the vnode.
1883  */
1884 void
1885 vgone(vnode_t *vp)
1886 {
1887 
1888 	mutex_enter(&vp->v_interlock);
1889 	vclean(vp, DOCLOSE);
1890 	vrelel(vp, 0);
1891 }
1892 
1893 /*
1894  * Lookup a vnode by device number.
1895  */
1896 int
1897 vfinddev(dev_t dev, enum vtype type, vnode_t **vpp)
1898 {
1899 	vnode_t *vp;
1900 	int rc = 0;
1901 
1902 	mutex_enter(&device_lock);
1903 	for (vp = specfs_hash[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
1904 		if (dev != vp->v_rdev || type != vp->v_type)
1905 			continue;
1906 		*vpp = vp;
1907 		rc = 1;
1908 		break;
1909 	}
1910 	mutex_exit(&device_lock);
1911 	return (rc);
1912 }
1913 
1914 /*
1915  * Revoke all the vnodes corresponding to the specified minor number
1916  * range (endpoints inclusive) of the specified major.
1917  */
1918 void
1919 vdevgone(int maj, int minl, int minh, enum vtype type)
1920 {
1921 	vnode_t *vp, **vpp;
1922 	dev_t dev;
1923 	int mn;
1924 
1925 	vp = NULL;	/* XXX gcc */
1926 
1927 	mutex_enter(&device_lock);
1928 	for (mn = minl; mn <= minh; mn++) {
1929 		dev = makedev(maj, mn);
1930 		vpp = &specfs_hash[SPECHASH(dev)];
1931 		for (vp = *vpp; vp != NULL;) {
1932 			mutex_enter(&vp->v_interlock);
1933 			if ((vp->v_iflag & VI_CLEAN) != 0 ||
1934 			    dev != vp->v_rdev || type != vp->v_type) {
1935 				mutex_exit(&vp->v_interlock);
1936 				vp = vp->v_specnext;
1937 				continue;
1938 			}
1939 			mutex_exit(&device_lock);
1940 			if (vget(vp, LK_INTERLOCK) == 0) {
1941 				VOP_REVOKE(vp, REVOKEALL);
1942 				vrele(vp);
1943 			}
1944 			mutex_enter(&device_lock);
1945 			vp = *vpp;
1946 		}
1947 	}
1948 	mutex_exit(&device_lock);
1949 }
1950 
1951 /*
1952  * Calculate the total number of references to a special device.
1953  */
1954 int
1955 vcount(vnode_t *vp)
1956 {
1957 	int count;
1958 
1959 	mutex_enter(&device_lock);
1960 	mutex_enter(&vp->v_interlock);
1961 	if (vp->v_specnode == NULL) {
1962 		count = vp->v_usecount - ((vp->v_iflag & VI_INACTPEND) != 0);
1963 		mutex_exit(&vp->v_interlock);
1964 		mutex_exit(&device_lock);
1965 		return (count);
1966 	}
1967 	mutex_exit(&vp->v_interlock);
1968 	count = vp->v_specnode->sn_dev->sd_opencnt;
1969 	mutex_exit(&device_lock);
1970 	return (count);
1971 }
1972 
1973 /*
1974  * Eliminate all activity associated with the requested vnode
1975  * and with all vnodes aliased to the requested vnode.
1976  */
1977 void
1978 vrevoke(vnode_t *vp)
1979 {
1980 	vnode_t *vq, **vpp;
1981 	enum vtype type;
1982 	dev_t dev;
1983 
1984 	KASSERT(vp->v_usecount > 0);
1985 
1986 	mutex_enter(&vp->v_interlock);
1987 	if ((vp->v_iflag & VI_CLEAN) != 0) {
1988 		mutex_exit(&vp->v_interlock);
1989 		return;
1990 	} else if (vp->v_type != VBLK && vp->v_type != VCHR) {
1991 		atomic_inc_uint(&vp->v_usecount);
1992 		vclean(vp, DOCLOSE);
1993 		vrelel(vp, 0);
1994 		return;
1995 	} else {
1996 		dev = vp->v_rdev;
1997 		type = vp->v_type;
1998 		mutex_exit(&vp->v_interlock);
1999 	}
2000 
2001 	vpp = &specfs_hash[SPECHASH(dev)];
2002 	mutex_enter(&device_lock);
2003 	for (vq = *vpp; vq != NULL;) {
2004 		/* If clean or being cleaned, then ignore it. */
2005 		mutex_enter(&vq->v_interlock);
2006 		if ((vq->v_iflag & (VI_CLEAN | VI_XLOCK)) != 0 ||
2007 		    vq->v_rdev != dev || vq->v_type != type) {
2008 			mutex_exit(&vq->v_interlock);
2009 			vq = vq->v_specnext;
2010 			continue;
2011 		}
2012 		mutex_exit(&device_lock);
2013 		if (vq->v_usecount == 0) {
2014 			vremfree(vq);
2015 			vq->v_usecount = 1;
2016 		} else {
2017 			atomic_inc_uint(&vq->v_usecount);
2018 		}
2019 		vclean(vq, DOCLOSE);
2020 		vrelel(vq, 0);
2021 		mutex_enter(&device_lock);
2022 		vq = *vpp;
2023 	}
2024 	mutex_exit(&device_lock);
2025 }
2026 
2027 /*
2028  * sysctl helper routine to return list of supported fstypes
2029  */
2030 int
2031 sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS)
2032 {
2033 	char bf[sizeof(((struct statvfs *)NULL)->f_fstypename)];
2034 	char *where = oldp;
2035 	struct vfsops *v;
2036 	size_t needed, left, slen;
2037 	int error, first;
2038 
2039 	if (newp != NULL)
2040 		return (EPERM);
2041 	if (namelen != 0)
2042 		return (EINVAL);
2043 
2044 	first = 1;
2045 	error = 0;
2046 	needed = 0;
2047 	left = *oldlenp;
2048 
2049 	sysctl_unlock();
2050 	mutex_enter(&vfs_list_lock);
2051 	LIST_FOREACH(v, &vfs_list, vfs_list) {
2052 		if (where == NULL)
2053 			needed += strlen(v->vfs_name) + 1;
2054 		else {
2055 			memset(bf, 0, sizeof(bf));
2056 			if (first) {
2057 				strncpy(bf, v->vfs_name, sizeof(bf));
2058 				first = 0;
2059 			} else {
2060 				bf[0] = ' ';
2061 				strncpy(bf + 1, v->vfs_name, sizeof(bf) - 1);
2062 			}
2063 			bf[sizeof(bf)-1] = '\0';
2064 			slen = strlen(bf);
2065 			if (left < slen + 1)
2066 				break;
2067 			v->vfs_refcount++;
2068 			mutex_exit(&vfs_list_lock);
2069 			/* +1 to copy out the trailing NUL byte */
2070 			error = copyout(bf, where, slen + 1);
2071 			mutex_enter(&vfs_list_lock);
2072 			v->vfs_refcount--;
2073 			if (error)
2074 				break;
2075 			where += slen;
2076 			needed += slen;
2077 			left -= slen;
2078 		}
2079 	}
2080 	mutex_exit(&vfs_list_lock);
2081 	sysctl_relock();
2082 	*oldlenp = needed;
2083 	return (error);
2084 }
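
/*
 * From userland the list produced above can be fetched roughly as
 * follows.  This is a sketch only: the MIB name "vfs.generic.fstypes"
 * is assumed here and error handling is abbreviated.
 */
#if 0	/* userland example, not compiled into the kernel */
#include <sys/sysctl.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	size_t len;
	char *buf;

	/* First call sizes the buffer, second call fills it. */
	if (sysctlbyname("vfs.generic.fstypes", NULL, &len, NULL, 0) == -1)
		return 1;
	if ((buf = malloc(len)) == NULL ||
	    sysctlbyname("vfs.generic.fstypes", buf, &len, NULL, 0) == -1)
		return 1;
	printf("%s\n", buf);	/* space-separated file system names */
	free(buf);
	return 0;
}
#endif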
2085 
2086 
2087 int kinfo_vdebug = 1;
2088 int kinfo_vgetfailed;
2089 #define KINFO_VNODESLOP	10
2090 /*
2091  * Dump vnode list (via sysctl).
2092  * Copyout address of vnode followed by vnode.
2093  */
2094 /* ARGSUSED */
2095 int
2096 sysctl_kern_vnode(SYSCTLFN_ARGS)
2097 {
2098 	char *where = oldp;
2099 	size_t *sizep = oldlenp;
2100 	struct mount *mp, *nmp;
2101 	vnode_t *vp, *mvp, vbuf;
2102 	char *bp = where, *savebp;
2103 	char *ewhere;
2104 	int error;
2105 
2106 	if (namelen != 0)
2107 		return (EOPNOTSUPP);
2108 	if (newp != NULL)
2109 		return (EPERM);
2110 
2111 #define VPTRSZ	sizeof(vnode_t *)
2112 #define VNODESZ	sizeof(vnode_t)
2113 	if (where == NULL) {
2114 		*sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
2115 		return (0);
2116 	}
2117 	ewhere = where + *sizep;
2118 
2119 	sysctl_unlock();
2120 	mutex_enter(&mountlist_lock);
2121 	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
2122 	     mp = nmp) {
2123 		if (vfs_busy(mp, &nmp)) {
2124 			continue;
2125 		}
2126 		savebp = bp;
2127 		/* Allocate a marker vnode. */
2128 		if ((mvp = vnalloc(mp)) == NULL) {
2129 			sysctl_relock();
2130 			return (ENOMEM);
2131 		}
2132 		mutex_enter(&mntvnode_lock);
2133 		for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) {
2134 			vmark(mvp, vp);
2135 			/*
2136 			 * Check that the vp is still associated with
2137 			 * this filesystem.  RACE: could have been
2138 			 * recycled onto the same filesystem.
2139 			 */
2140 			if (vp->v_mount != mp || vismarker(vp))
2141 				continue;
2142 			if (bp + VPTRSZ + VNODESZ > ewhere) {
2143 				(void)vunmark(mvp);
2144 				mutex_exit(&mntvnode_lock);
2145 				vnfree(mvp);
2146 				sysctl_relock();
2147 				*sizep = bp - where;
2148 				return (ENOMEM);
2149 			}
2150 			memcpy(&vbuf, vp, VNODESZ);
2151 			mutex_exit(&mntvnode_lock);
2152 			if ((error = copyout(&vp, bp, VPTRSZ)) ||
2153 			   (error = copyout(&vbuf, bp + VPTRSZ, VNODESZ))) {
2154 			   	mutex_enter(&mntvnode_lock);
2155 				(void)vunmark(mvp);
2156 				mutex_exit(&mntvnode_lock);
2157 				vnfree(mvp);
2158 				sysctl_relock();
2159 				return (error);
2160 			}
2161 			bp += VPTRSZ + VNODESZ;
2162 			mutex_enter(&mntvnode_lock);
2163 		}
2164 		mutex_exit(&mntvnode_lock);
2165 		vnfree(mvp);
2166 		vfs_unbusy(mp, false, &nmp);
2167 	}
2168 	mutex_exit(&mountlist_lock);
2169 	sysctl_relock();
2170 
2171 	*sizep = bp - where;
2172 	return (0);
2173 }
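
/*
 * Userland sketch (illustrative only, error handling abbreviated): the
 * data returned by the node above is a sequence of (vnode pointer,
 * vnode contents) pairs, so a consumer first sizes a buffer and then
 * walks it in VPTRSZ + VNODESZ steps.
 */
#if 0	/* userland example, not compiled into the kernel */
#include <sys/param.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	int mib[2] = { CTL_KERN, KERN_VNODE };
	size_t len;
	char *buf;

	if (sysctl(mib, 2, NULL, &len, NULL, 0) == -1)
		return 1;
	if ((buf = malloc(len)) == NULL ||
	    sysctl(mib, 2, buf, &len, NULL, 0) == -1)
		return 1;
	printf("%zu bytes of vnode data\n", len);
	free(buf);
	return 0;
}
#endif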
2174 
2175 /*
2176  * Remove clean vnodes from a mountpoint's vnode list.
2177  */
2178 void
2179 vfs_scrubvnlist(struct mount *mp)
2180 {
2181 	vnode_t *vp, *nvp;
2182 
2183  retry:
2184 	mutex_enter(&mntvnode_lock);
2185 	for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
2186 		nvp = TAILQ_NEXT(vp, v_mntvnodes);
2187 		mutex_enter(&vp->v_interlock);
2188 		if ((vp->v_iflag & VI_CLEAN) != 0) {
2189 			TAILQ_REMOVE(&mp->mnt_vnodelist, vp, v_mntvnodes);
2190 			vp->v_mount = NULL;
2191 			mutex_exit(&mntvnode_lock);
2192 			mutex_exit(&vp->v_interlock);
2193 			vfs_destroy(mp);
2194 			goto retry;
2195 		}
2196 		mutex_exit(&vp->v_interlock);
2197 	}
2198 	mutex_exit(&mntvnode_lock);
2199 }
2200 
2201 /*
2202  * Check to see if a filesystem is mounted on a block device.
2203  */
2204 int
2205 vfs_mountedon(vnode_t *vp)
2206 {
2207 	vnode_t *vq;
2208 	int error = 0;
2209 
2210 	if (vp->v_type != VBLK)
2211 		return ENOTBLK;
2212 	if (vp->v_specmountpoint != NULL)
2213 		return (EBUSY);
2214 	mutex_enter(&device_lock);
2215 	for (vq = specfs_hash[SPECHASH(vp->v_rdev)]; vq != NULL;
2216 	    vq = vq->v_specnext) {
2217 		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
2218 			continue;
2219 		if (vq->v_specmountpoint != NULL) {
2220 			error = EBUSY;
2221 			break;
2222 		}
2223 	}
2224 	mutex_exit(&device_lock);
2225 	return (error);
2226 }
2227 
2228 /*
2229  * Unmount all file systems.
2230  * We traverse the list in reverse order under the assumption that doing so
2231  * will avoid needing to worry about dependencies.
2232  */
2233 void
2234 vfs_unmountall(struct lwp *l)
2235 {
2236 	struct mount *mp, *nmp;
2237 	int allerror, error;
2238 
2239 	printf("unmounting file systems...");
2240 	for (allerror = 0, mp = CIRCLEQ_LAST(&mountlist);
2241 	     !CIRCLEQ_EMPTY(&mountlist);
2242 	     mp = nmp) {
2243 		nmp = CIRCLEQ_PREV(mp, mnt_list);
2244 #ifdef DEBUG
2245 		printf("\nunmounting %s (%s)...",
2246 		    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname);
2247 #endif
2248 		atomic_inc_uint(&mp->mnt_refcnt);
2249 		if ((error = dounmount(mp, MNT_FORCE, l)) != 0) {
2250 			printf("unmount of %s failed with error %d\n",
2251 			    mp->mnt_stat.f_mntonname, error);
2252 			allerror = 1;
2253 		}
2254 	}
2255 	printf(" done\n");
2256 	if (allerror)
2257 		printf("WARNING: some file systems would not unmount\n");
2258 }
2259 
2260 /*
2261  * Sync and unmount file systems before shutting down.
2262  */
2263 void
2264 vfs_shutdown(void)
2265 {
2266 	struct lwp *l;
2267 
2268 	/* XXX we're certainly not running in lwp0's context! */
2269 	l = curlwp;
2270 	if (l == NULL)
2271 		l = &lwp0;
2272 
2273 	printf("syncing disks... ");
2274 
2275 	/* remove user processes from run queue */
2276 	suspendsched();
2277 	(void) spl0();
2278 
2279 	/* avoid coming back this way again if we panic. */
2280 	doing_shutdown = 1;
2281 
2282 	sys_sync(l, NULL, NULL);
2283 
2284 	/* Wait for sync to finish. */
2285 	if (buf_syncwait() != 0) {
2286 #if defined(DDB) && defined(DEBUG_HALT_BUSY)
2287 		Debugger();
2288 #endif
2289 		printf("giving up\n");
2290 		return;
2291 	} else
2292 		printf("done\n");
2293 
2294 	/*
2295 	 * If we've panic'd, don't make the situation potentially
2296 	 * worse by unmounting the file systems.
2297 	 */
2298 	if (panicstr != NULL)
2299 		return;
2300 
2301 	/* Release inodes held by texts before update. */
2302 #ifdef notdef
2303 	vnshutdown();
2304 #endif
2305 	/* Unmount file systems. */
2306 	vfs_unmountall(l);
2307 }
2308 
2309 /*
2310  * Mount the root file system.  If the operator didn't specify a
2311  * file system to use, try all possible file systems until one
2312  * succeeds.
2313  */
2314 int
2315 vfs_mountroot(void)
2316 {
2317 	struct vfsops *v;
2318 	int error = ENODEV;
2319 
2320 	if (root_device == NULL)
2321 		panic("vfs_mountroot: root device unknown");
2322 
2323 	switch (device_class(root_device)) {
2324 	case DV_IFNET:
2325 		if (rootdev != NODEV)
2326 			panic("vfs_mountroot: rootdev set for DV_IFNET "
2327 			    "(0x%llx -> %llu,%llu)",
2328 			    (unsigned long long)rootdev,
2329 			    (unsigned long long)major(rootdev),
2330 			    (unsigned long long)minor(rootdev));
2331 		break;
2332 
2333 	case DV_DISK:
2334 		if (rootdev == NODEV)
2335 			panic("vfs_mountroot: rootdev not set for DV_DISK");
2336 		if (bdevvp(rootdev, &rootvp))
2337 			panic("vfs_mountroot: can't get vnode for rootdev");
2338 		error = VOP_OPEN(rootvp, FREAD, FSCRED);
2339 		if (error) {
2340 			printf("vfs_mountroot: can't open root device\n");
2341 			return (error);
2342 		}
2343 		break;
2344 
2345 	default:
2346 		printf("%s: inappropriate for root file system\n",
2347 		    device_xname(root_device));
2348 		return (ENODEV);
2349 	}
2350 
2351 	/*
2352 	 * If the user specified a root fs type, use it.  Make sure the
2353 	 * specified type exists and has a mount_root() routine.
2354 	 */
2355 	if (strcmp(rootfstype, ROOT_FSTYPE_ANY) != 0) {
2356 		v = vfs_getopsbyname(rootfstype);
2357 		error = EFTYPE;
2358 		if (v != NULL) {
2359 			if (v->vfs_mountroot != NULL) {
2360 				error = (v->vfs_mountroot)();
2361 			}
2362 			v->vfs_refcount--;
2363 		}
2364 		goto done;
2365 	}
2366 
2367 	/*
2368 	 * Try each file system currently configured into the kernel.
2369 	 */
2370 	mutex_enter(&vfs_list_lock);
2371 	LIST_FOREACH(v, &vfs_list, vfs_list) {
2372 		if (v->vfs_mountroot == NULL)
2373 			continue;
2374 #ifdef DEBUG
2375 		aprint_normal("mountroot: trying %s...\n", v->vfs_name);
2376 #endif
2377 		v->vfs_refcount++;
2378 		mutex_exit(&vfs_list_lock);
2379 		error = (*v->vfs_mountroot)();
2380 		mutex_enter(&vfs_list_lock);
2381 		v->vfs_refcount--;
2382 		if (!error) {
2383 			aprint_normal("root file system type: %s\n",
2384 			    v->vfs_name);
2385 			break;
2386 		}
2387 	}
2388 	mutex_exit(&vfs_list_lock);
2389 
2390 	if (v == NULL) {
2391 		printf("no file system for %s", device_xname(root_device));
2392 		if (device_class(root_device) == DV_DISK)
2393 			printf(" (dev 0x%llx)", (unsigned long long)rootdev);
2394 		printf("\n");
2395 		error = EFTYPE;
2396 	}
2397 
2398 done:
2399 	if (error && device_class(root_device) == DV_DISK) {
2400 		VOP_CLOSE(rootvp, FREAD, FSCRED);
2401 		vrele(rootvp);
2402 	}
2403 	return (error);
2404 }
2405 
2406 /*
2407  * Get a new unique fsid
2408  */
2409 void
2410 vfs_getnewfsid(struct mount *mp)
2411 {
2412 	static u_short xxxfs_mntid;
2413 	fsid_t tfsid;
2414 	int mtype;
2415 
2416 	mutex_enter(&mntid_lock);
2417 	mtype = makefstype(mp->mnt_op->vfs_name);
2418 	mp->mnt_stat.f_fsidx.__fsid_val[0] = makedev(mtype, 0);
2419 	mp->mnt_stat.f_fsidx.__fsid_val[1] = mtype;
2420 	mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
2421 	if (xxxfs_mntid == 0)
2422 		++xxxfs_mntid;
2423 	tfsid.__fsid_val[0] = makedev(mtype & 0xff, xxxfs_mntid);
2424 	tfsid.__fsid_val[1] = mtype;
2425 	if (!CIRCLEQ_EMPTY(&mountlist)) {
2426 		while (vfs_getvfs(&tfsid)) {
2427 			tfsid.__fsid_val[0]++;
2428 			xxxfs_mntid++;
2429 		}
2430 	}
2431 	mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0];
2432 	mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
2433 	mutex_exit(&mntid_lock);
2434 }
2435 
2436 /*
2437  * Make a 'unique' number from a mount type name.
2438  */
2439 long
2440 makefstype(const char *type)
2441 {
2442 	long rv;
2443 
2444 	for (rv = 0; *type; type++) {
2445 		rv <<= 2;
2446 		rv ^= *type;
2447 	}
2448 	return rv;
2449 }
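
/*
 * Worked example (for illustration): each character shifts the
 * accumulator left by two bits and is then XORed in, so for the name
 * "ffs" the value is built up as
 *
 *	rv = (0     << 2) ^ 'f'	= 0x066
 *	rv = (0x066 << 2) ^ 'f'	= 0x1fe
 *	rv = (0x1fe << 2) ^ 's'	= 0x78b
 *
 * i.e. makefstype("ffs") == 0x78b.  The result is "unique" only in the
 * loose sense noted above; distinct names can in principle collide.
 */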
2450 
2451 /*
2452  * Set vnode attributes to VNOVAL
2453  */
2454 void
2455 vattr_null(struct vattr *vap)
2456 {
2457 
2458 	vap->va_type = VNON;
2459 
2460 	/*
2461 	 * Assign individually so that it is safe even if size and
2462 	 * Assign each member individually so that this remains correct
2463 	 * even if the size and signedness of the members vary.
2464 	vap->va_mode = VNOVAL;
2465 	vap->va_nlink = VNOVAL;
2466 	vap->va_uid = VNOVAL;
2467 	vap->va_gid = VNOVAL;
2468 	vap->va_fsid = VNOVAL;
2469 	vap->va_fileid = VNOVAL;
2470 	vap->va_size = VNOVAL;
2471 	vap->va_blocksize = VNOVAL;
2472 	vap->va_atime.tv_sec =
2473 	    vap->va_mtime.tv_sec =
2474 	    vap->va_ctime.tv_sec =
2475 	    vap->va_birthtime.tv_sec = VNOVAL;
2476 	vap->va_atime.tv_nsec =
2477 	    vap->va_mtime.tv_nsec =
2478 	    vap->va_ctime.tv_nsec =
2479 	    vap->va_birthtime.tv_nsec = VNOVAL;
2480 	vap->va_gen = VNOVAL;
2481 	vap->va_flags = VNOVAL;
2482 	vap->va_rdev = VNOVAL;
2483 	vap->va_bytes = VNOVAL;
2484 	vap->va_vaflags = 0;
2485 }
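
/*
 * Usage sketch (illustrative only): callers that want to change a single
 * attribute initialize a struct vattr with vattr_null() and then set
 * just the fields of interest, leaving the rest as VNOVAL so the file
 * system knows not to touch them.  The function name below is
 * hypothetical; the vnode and credential are assumed to come from the
 * caller.
 */
#if 0	/* example only, not compiled */
static int
example_truncate(vnode_t *vp, off_t length, kauth_cred_t cred)
{
	struct vattr va;

	/* vp is assumed to be locked by the caller. */
	vattr_null(&va);
	va.va_size = length;
	return VOP_SETATTR(vp, &va, cred);
}
#endif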
2486 
2487 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
2488 #define ARRAY_PRINT(idx, arr) \
2489     ((unsigned int)(idx) < ARRAY_SIZE(arr) ? (arr)[(idx)] : "UNKNOWN")
2490 
2491 const char * const vnode_tags[] = { VNODE_TAGS };
2492 const char * const vnode_types[] = { VNODE_TYPES };
2493 const char vnode_flagbits[] = VNODE_FLAGBITS;
2494 
2495 /*
2496  * Print out a description of a vnode.
2497  */
2498 void
2499 vprint(const char *label, struct vnode *vp)
2500 {
2501 	struct vnlock *vl;
2502 	char bf[96];
2503 	int flag;
2504 
2505 	vl = (vp->v_vnlock != NULL ? vp->v_vnlock : &vp->v_lock);
2506 	flag = vp->v_iflag | vp->v_vflag | vp->v_uflag;
2507 	snprintb(bf, sizeof(bf), vnode_flagbits, flag);
2508 
2509 	if (label != NULL)
2510 		printf("%s: ", label);
2511 	printf("vnode @ %p, flags (%s)\n\ttag %s(%d), type %s(%d), "
2512 	    "usecount %d, writecount %d, holdcount %d\n"
2513 	    "\tfreelisthd %p, mount %p, data %p lock %p recursecnt %d\n",
2514 	    vp, bf, ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag,
2515 	    ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type,
2516 	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt,
2517 	    vp->v_freelisthd, vp->v_mount, vp->v_data, vl, vl->vl_recursecnt);
2518 	if (vp->v_data != NULL) {
2519 		printf("\t");
2520 		VOP_PRINT(vp);
2521 	}
2522 }
2523 
2524 #ifdef DEBUG
2525 /*
2526  * List all of the locked vnodes in the system.
2527  * Called when debugging the kernel.
2528  */
2529 void
2530 printlockedvnodes(void)
2531 {
2532 	struct mount *mp, *nmp;
2533 	struct vnode *vp;
2534 
2535 	printf("Locked vnodes\n");
2536 	mutex_enter(&mountlist_lock);
2537 	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
2538 	     mp = nmp) {
2539 		if (vfs_busy(mp, &nmp)) {
2540 			continue;
2541 		}
2542 		TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
2543 			if (VOP_ISLOCKED(vp))
2544 				vprint(NULL, vp);
2545 		}
2546 		mutex_enter(&mountlist_lock);
2547 		vfs_unbusy(mp, false, &nmp);
2548 	}
2549 	mutex_exit(&mountlist_lock);
2550 }
2551 #endif
2552 
2553 /*
2554  * Do the usual access checking.
2555  * file_mode, uid and gid are from the vnode in question,
2556  * while acc_mode and cred are from the VOP_ACCESS parameter list
2557  * while acc_mode and cred are from the VOP_ACCESS parameter list.
2558 int
2559 vaccess(enum vtype type, mode_t file_mode, uid_t uid, gid_t gid,
2560     mode_t acc_mode, kauth_cred_t cred)
2561 {
2562 	mode_t mask;
2563 	int error, ismember;
2564 
2565 	/*
2566 	 * Super-user always gets read/write access, but execute access depends
2567 	 * on at least one execute bit being set.
2568 	 */
2569 	if (kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, NULL) == 0) {
2570 		if ((acc_mode & VEXEC) && type != VDIR &&
2571 		    (file_mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)
2572 			return (EACCES);
2573 		return (0);
2574 	}
2575 
2576 	mask = 0;
2577 
2578 	/* Otherwise, check the owner. */
2579 	if (kauth_cred_geteuid(cred) == uid) {
2580 		if (acc_mode & VEXEC)
2581 			mask |= S_IXUSR;
2582 		if (acc_mode & VREAD)
2583 			mask |= S_IRUSR;
2584 		if (acc_mode & VWRITE)
2585 			mask |= S_IWUSR;
2586 		return ((file_mode & mask) == mask ? 0 : EACCES);
2587 	}
2588 
2589 	/* Otherwise, check the groups. */
2590 	error = kauth_cred_ismember_gid(cred, gid, &ismember);
2591 	if (error)
2592 		return (error);
2593 	if (kauth_cred_getegid(cred) == gid || ismember) {
2594 		if (acc_mode & VEXEC)
2595 			mask |= S_IXGRP;
2596 		if (acc_mode & VREAD)
2597 			mask |= S_IRGRP;
2598 		if (acc_mode & VWRITE)
2599 			mask |= S_IWGRP;
2600 		return ((file_mode & mask) == mask ? 0 : EACCES);
2601 	}
2602 
2603 	/* Otherwise, check everyone else. */
2604 	if (acc_mode & VEXEC)
2605 		mask |= S_IXOTH;
2606 	if (acc_mode & VREAD)
2607 		mask |= S_IROTH;
2608 	if (acc_mode & VWRITE)
2609 		mask |= S_IWOTH;
2610 	return ((file_mode & mask) == mask ? 0 : EACCES);
2611 }
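
/*
 * Worked example (for illustration): for a file with mode 0640 owned by
 * uid 100 / gid 200, a caller whose effective uid is not 100 but whose
 * credential includes gid 200 requesting VREAD|VWRITE builds
 * mask = S_IRGRP|S_IWGRP = 0060; since (0640 & 0060) == 0040 != 0060,
 * vaccess() returns EACCES.  The same caller requesting only VREAD
 * would succeed.
 */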
2612 
2613 /*
2614  * Given a file system name, look up the vfsops for that
2615  * file system, or return NULL if file system isn't present
2616  * in the kernel.
2617  */
2618 struct vfsops *
2619 vfs_getopsbyname(const char *name)
2620 {
2621 	struct vfsops *v;
2622 
2623 	mutex_enter(&vfs_list_lock);
2624 	LIST_FOREACH(v, &vfs_list, vfs_list) {
2625 		if (strcmp(v->vfs_name, name) == 0)
2626 			break;
2627 	}
2628 	if (v != NULL)
2629 		v->vfs_refcount++;
2630 	mutex_exit(&vfs_list_lock);
2631 
2632 	return (v);
2633 }
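
/*
 * Usage sketch (illustrative only): the returned vfsops has had its
 * reference count bumped, so the caller must drop that reference when
 * finished, as vfs_mountroot() above does.
 */
#if 0	/* example only, not compiled */
	struct vfsops *v;

	if ((v = vfs_getopsbyname("ffs")) != NULL) {
		/* ... use v ... */
		v->vfs_refcount--;	/* drop the reference when done */
	}
#endif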
2634 
2635 void
2636 copy_statvfs_info(struct statvfs *sbp, const struct mount *mp)
2637 {
2638 	const struct statvfs *mbp;
2639 
2640 	if (sbp == (mbp = &mp->mnt_stat))
2641 		return;
2642 
2643 	(void)memcpy(&sbp->f_fsidx, &mbp->f_fsidx, sizeof(sbp->f_fsidx));
2644 	sbp->f_fsid = mbp->f_fsid;
2645 	sbp->f_owner = mbp->f_owner;
2646 	sbp->f_flag = mbp->f_flag;
2647 	sbp->f_syncwrites = mbp->f_syncwrites;
2648 	sbp->f_asyncwrites = mbp->f_asyncwrites;
2649 	sbp->f_syncreads = mbp->f_syncreads;
2650 	sbp->f_asyncreads = mbp->f_asyncreads;
2651 	(void)memcpy(sbp->f_spare, mbp->f_spare, sizeof(mbp->f_spare));
2652 	(void)memcpy(sbp->f_fstypename, mbp->f_fstypename,
2653 	    sizeof(sbp->f_fstypename));
2654 	(void)memcpy(sbp->f_mntonname, mbp->f_mntonname,
2655 	    sizeof(sbp->f_mntonname));
2656 	(void)memcpy(sbp->f_mntfromname, mp->mnt_stat.f_mntfromname,
2657 	    sizeof(sbp->f_mntfromname));
2658 	sbp->f_namemax = mbp->f_namemax;
2659 }
2660 
2661 int
2662 set_statvfs_info(const char *onp, int ukon, const char *fromp, int ukfrom,
2663     const char *vfsname, struct mount *mp, struct lwp *l)
2664 {
2665 	int error;
2666 	size_t size;
2667 	struct statvfs *sfs = &mp->mnt_stat;
2668 	int (*fun)(const void *, void *, size_t, size_t *);
2669 
2670 	(void)strlcpy(mp->mnt_stat.f_fstypename, vfsname,
2671 	    sizeof(mp->mnt_stat.f_fstypename));
2672 
2673 	if (onp) {
2674 		struct cwdinfo *cwdi = l->l_proc->p_cwdi;
2675 		fun = (ukon == UIO_SYSSPACE) ? copystr : copyinstr;
2676 		if (cwdi->cwdi_rdir != NULL) {
2677 			size_t len;
2678 			char *bp;
2679 			char *path = PNBUF_GET();
2680 
2681 			bp = path + MAXPATHLEN;
2682 			*--bp = '\0';
2683 			rw_enter(&cwdi->cwdi_lock, RW_READER);
2684 			error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp,
2685 			    path, MAXPATHLEN / 2, 0, l);
2686 			rw_exit(&cwdi->cwdi_lock);
2687 			if (error) {
2688 				PNBUF_PUT(path);
2689 				return error;
2690 			}
2691 
2692 			len = strlen(bp);
2693 			if (len > sizeof(sfs->f_mntonname) - 1)
2694 				len = sizeof(sfs->f_mntonname) - 1;
2695 			(void)strncpy(sfs->f_mntonname, bp, len);
2696 			PNBUF_PUT(path);
2697 
2698 			if (len < sizeof(sfs->f_mntonname) - 1) {
2699 				error = (*fun)(onp, &sfs->f_mntonname[len],
2700 				    sizeof(sfs->f_mntonname) - len - 1, &size);
2701 				if (error)
2702 					return error;
2703 				size += len;
2704 			} else {
2705 				size = len;
2706 			}
2707 		} else {
2708 			error = (*fun)(onp, &sfs->f_mntonname,
2709 			    sizeof(sfs->f_mntonname) - 1, &size);
2710 			if (error)
2711 				return error;
2712 		}
2713 		(void)memset(sfs->f_mntonname + size, 0,
2714 		    sizeof(sfs->f_mntonname) - size);
2715 	}
2716 
2717 	if (fromp) {
2718 		fun = (ukfrom == UIO_SYSSPACE) ? copystr : copyinstr;
2719 		error = (*fun)(fromp, sfs->f_mntfromname,
2720 		    sizeof(sfs->f_mntfromname) - 1, &size);
2721 		if (error)
2722 			return error;
2723 		(void)memset(sfs->f_mntfromname + size, 0,
2724 		    sizeof(sfs->f_mntfromname) - size);
2725 	}
2726 	return 0;
2727 }
2728 
2729 void
2730 vfs_timestamp(struct timespec *ts)
2731 {
2732 
2733 	nanotime(ts);
2734 }
2735 
2736 time_t	rootfstime;			/* recorded root fs time, if known */
2737 void
2738 setrootfstime(time_t t)
2739 {
2740 	rootfstime = t;
2741 }
2742 
2743 /*
2744  * Sham lock manager for vnodes.  This is a temporary measure.
2745  */
2746 int
2747 vlockmgr(struct vnlock *vl, int flags)
2748 {
2749 
2750 	KASSERT((flags & ~(LK_CANRECURSE | LK_NOWAIT | LK_TYPE_MASK)) == 0);
2751 
2752 	switch (flags & LK_TYPE_MASK) {
2753 	case LK_SHARED:
2754 		if (rw_tryenter(&vl->vl_lock, RW_READER)) {
2755 			return 0;
2756 		}
2757 		if ((flags & LK_NOWAIT) != 0) {
2758 			return EBUSY;
2759 		}
2760 		rw_enter(&vl->vl_lock, RW_READER);
2761 		return 0;
2762 
2763 	case LK_EXCLUSIVE:
2764 		if (rw_tryenter(&vl->vl_lock, RW_WRITER)) {
2765 			return 0;
2766 		}
2767 		if ((vl->vl_canrecurse || (flags & LK_CANRECURSE) != 0) &&
2768 		    rw_write_held(&vl->vl_lock)) {
2769 			vl->vl_recursecnt++;
2770 			return 0;
2771 		}
2772 		if ((flags & LK_NOWAIT) != 0) {
2773 			return EBUSY;
2774 		}
2775 		rw_enter(&vl->vl_lock, RW_WRITER);
2776 		return 0;
2777 
2778 	case LK_RELEASE:
2779 		if (vl->vl_recursecnt != 0) {
2780 			KASSERT(rw_write_held(&vl->vl_lock));
2781 			vl->vl_recursecnt--;
2782 			return 0;
2783 		}
2784 		rw_exit(&vl->vl_lock);
2785 		return 0;
2786 
2787 	default:
2788 		panic("vlockmgr: flags %x", flags);
2789 	}
2790 }
2791 
2792 int
2793 vlockstatus(struct vnlock *vl)
2794 {
2795 
2796 	if (rw_write_held(&vl->vl_lock)) {
2797 		return LK_EXCLUSIVE;
2798 	}
2799 	if (rw_read_held(&vl->vl_lock)) {
2800 		return LK_SHARED;
2801 	}
2802 	return 0;
2803 }
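
/*
 * Usage sketch (illustrative only): used directly, the sham lock
 * manager follows the usual acquire/release pairing; vlockstatus() can
 * be used to assert the state in between.
 */
#if 0	/* example only, not compiled */
	struct vnlock *vl = vp->v_vnlock;

	(void)vlockmgr(vl, LK_EXCLUSIVE);
	KASSERT(vlockstatus(vl) == LK_EXCLUSIVE);
	/* ... operate on the vnode ... */
	(void)vlockmgr(vl, LK_RELEASE);
#endif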
2804 
2805 /*
2806  * mount_specific_key_create --
2807  *	Create a key for subsystem mount-specific data.
2808  */
2809 int
2810 mount_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor)
2811 {
2812 
2813 	return (specificdata_key_create(mount_specificdata_domain, keyp, dtor));
2814 }
2815 
2816 /*
2817  * mount_specific_key_delete --
2818  *	Delete a key for subsystem mount-specific data.
2819  */
2820 void
2821 mount_specific_key_delete(specificdata_key_t key)
2822 {
2823 
2824 	specificdata_key_delete(mount_specificdata_domain, key);
2825 }
2826 
2827 /*
2828  * mount_initspecific --
2829  *	Initialize a mount's specificdata container.
2830  */
2831 void
2832 mount_initspecific(struct mount *mp)
2833 {
2834 	int error;
2835 
2836 	error = specificdata_init(mount_specificdata_domain,
2837 				  &mp->mnt_specdataref);
2838 	KASSERT(error == 0);
2839 }
2840 
2841 /*
2842  * mount_finispecific --
2843  *	Finalize a mount's specificdata container.
2844  */
2845 void
2846 mount_finispecific(struct mount *mp)
2847 {
2848 
2849 	specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
2850 }
2851 
2852 /*
2853  * mount_getspecific --
2854  *	Return mount-specific data corresponding to the specified key.
2855  */
2856 void *
2857 mount_getspecific(struct mount *mp, specificdata_key_t key)
2858 {
2859 
2860 	return (specificdata_getspecific(mount_specificdata_domain,
2861 					 &mp->mnt_specdataref, key));
2862 }
2863 
2864 /*
2865  * mount_setspecific --
2866  *	Set mount-specific data corresponding to the specified key.
2867  */
2868 void
2869 mount_setspecific(struct mount *mp, specificdata_key_t key, void *data)
2870 {
2871 
2872 	specificdata_setspecific(mount_specificdata_domain,
2873 				 &mp->mnt_specdataref, key, data);
2874 }
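
/*
 * Usage sketch (illustrative only): a subsystem that wants to hang
 * private per-mount data off a struct mount creates a key once at
 * initialization time and then uses it with the get/set routines above.
 * All names below are hypothetical.
 */
#if 0	/* example only, not compiled */
static specificdata_key_t example_mount_key;

static void
example_init(void)
{

	if (mount_specific_key_create(&example_mount_key, NULL) != 0)
		panic("example_init: cannot create mount key");
}

static void
example_attach(struct mount *mp, void *data)
{

	mount_setspecific(mp, example_mount_key, data);
}

static void *
example_lookup(struct mount *mp)
{

	return mount_getspecific(mp, example_mount_key);
}
#endif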
2875 
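/*
 * The VFS_* wrappers below dispatch to the per-file-system vfsops.
 * Where the operation is not known to be MPSAFE (IMNT_MPSAFE clear),
 * they take the big kernel lock around the call; VFS_MOUNT, VFS_UNMOUNT
 * and VFS_EXTATTRCTL take it unconditionally.
 */
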
2876 int
2877 VFS_MOUNT(struct mount *mp, const char *a, void *b, size_t *c)
2878 {
2879 	int error;
2880 
2881 	KERNEL_LOCK(1, NULL);
2882 	error = (*(mp->mnt_op->vfs_mount))(mp, a, b, c);
2883 	KERNEL_UNLOCK_ONE(NULL);
2884 
2885 	return error;
2886 }
2887 
2888 int
2889 VFS_START(struct mount *mp, int a)
2890 {
2891 	int error;
2892 
2893 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
2894 		KERNEL_LOCK(1, NULL);
2895 	}
2896 	error = (*(mp->mnt_op->vfs_start))(mp, a);
2897 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
2898 		KERNEL_UNLOCK_ONE(NULL);
2899 	}
2900 
2901 	return error;
2902 }
2903 
2904 int
2905 VFS_UNMOUNT(struct mount *mp, int a)
2906 {
2907 	int error;
2908 
2909 	KERNEL_LOCK(1, NULL);
2910 	error = (*(mp->mnt_op->vfs_unmount))(mp, a);
2911 	KERNEL_UNLOCK_ONE(NULL);
2912 
2913 	return error;
2914 }
2915 
2916 int
2917 VFS_ROOT(struct mount *mp, struct vnode **a)
2918 {
2919 	int error;
2920 
2921 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
2922 		KERNEL_LOCK(1, NULL);
2923 	}
2924 	error = (*(mp->mnt_op->vfs_root))(mp, a);
2925 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
2926 		KERNEL_UNLOCK_ONE(NULL);
2927 	}
2928 
2929 	return error;
2930 }
2931 
2932 int
2933 VFS_QUOTACTL(struct mount *mp, int a, uid_t b, void *c)
2934 {
2935 	int error;
2936 
2937 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
2938 		KERNEL_LOCK(1, NULL);
2939 	}
2940 	error = (*(mp->mnt_op->vfs_quotactl))(mp, a, b, c);
2941 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
2942 		KERNEL_UNLOCK_ONE(NULL);
2943 	}
2944 
2945 	return error;
2946 }
2947 
2948 int
2949 VFS_STATVFS(struct mount *mp, struct statvfs *a)
2950 {
2951 	int error;
2952 
2953 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
2954 		KERNEL_LOCK(1, NULL);
2955 	}
2956 	error = (*(mp->mnt_op->vfs_statvfs))(mp, a);
2957 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
2958 		KERNEL_UNLOCK_ONE(NULL);
2959 	}
2960 
2961 	return error;
2962 }
2963 
2964 int
2965 VFS_SYNC(struct mount *mp, int a, struct kauth_cred *b)
2966 {
2967 	int error;
2968 
2969 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
2970 		KERNEL_LOCK(1, NULL);
2971 	}
2972 	error = (*(mp->mnt_op->vfs_sync))(mp, a, b);
2973 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
2974 		KERNEL_UNLOCK_ONE(NULL);
2975 	}
2976 
2977 	return error;
2978 }
2979 
2980 int
2981 VFS_FHTOVP(struct mount *mp, struct fid *a, struct vnode **b)
2982 {
2983 	int error;
2984 
2985 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
2986 		KERNEL_LOCK(1, NULL);
2987 	}
2988 	error = (*(mp->mnt_op->vfs_fhtovp))(mp, a, b);
2989 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
2990 		KERNEL_UNLOCK_ONE(NULL);
2991 	}
2992 
2993 	return error;
2994 }
2995 
2996 int
2997 VFS_VPTOFH(struct vnode *vp, struct fid *a, size_t *b)
2998 {
2999 	int error;
3000 
3001 	if ((vp->v_vflag & VV_MPSAFE) == 0) {
3002 		KERNEL_LOCK(1, NULL);
3003 	}
3004 	error = (*(vp->v_mount->mnt_op->vfs_vptofh))(vp, a, b);
3005 	if ((vp->v_vflag & VV_MPSAFE) == 0) {
3006 		KERNEL_UNLOCK_ONE(NULL);
3007 	}
3008 
3009 	return error;
3010 }
3011 
3012 int
3013 VFS_SNAPSHOT(struct mount *mp, struct vnode *a, struct timespec *b)
3014 {
3015 	int error;
3016 
3017 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3018 		KERNEL_LOCK(1, NULL);
3019 	}
3020 	error = (*(mp->mnt_op->vfs_snapshot))(mp, a, b);
3021 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3022 		KERNEL_UNLOCK_ONE(NULL);
3023 	}
3024 
3025 	return error;
3026 }
3027 
3028 int
3029 VFS_EXTATTRCTL(struct mount *mp, int a, struct vnode *b, int c, const char *d)
3030 {
3031 	int error;
3032 
3033 	KERNEL_LOCK(1, NULL);		/* XXXSMP check ffs */
3034 	error = (*(mp->mnt_op->vfs_extattrctl))(mp, a, b, c, d);
3035 	KERNEL_UNLOCK_ONE(NULL);	/* XXX */
3036 
3037 	return error;
3038 }
3039 
3040 int
3041 VFS_SUSPENDCTL(struct mount *mp, int a)
3042 {
3043 	int error;
3044 
3045 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3046 		KERNEL_LOCK(1, NULL);
3047 	}
3048 	error = (*(mp->mnt_op->vfs_suspendctl))(mp, a);
3049 	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
3050 		KERNEL_UNLOCK_ONE(NULL);
3051 	}
3052 
3053 	return error;
3054 }
3055 
3056 #ifdef DDB
3057 static const char buf_flagbits[] = BUF_FLAGBITS;
3058 
3059 void
3060 vfs_buf_print(struct buf *bp, int full, void (*pr)(const char *, ...))
3061 {
3062 	char bf[1024];
3063 
3064 	(*pr)("  vp %p lblkno 0x%"PRIx64" blkno 0x%"PRIx64" rawblkno 0x%"
3065 	    PRIx64 " dev 0x%x\n",
3066 	    bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_rawblkno, bp->b_dev);
3067 
3068 	snprintb(bf, sizeof(bf),
3069 	    buf_flagbits, bp->b_flags | bp->b_oflags | bp->b_cflags);
3070 	(*pr)("  error %d flags 0x%s\n", bp->b_error, bf);
3071 
3072 	(*pr)("  bufsize 0x%lx bcount 0x%lx resid 0x%lx\n",
3073 		  bp->b_bufsize, bp->b_bcount, bp->b_resid);
3074 	(*pr)("  data %p saveaddr %p\n",
3075 		  bp->b_data, bp->b_saveaddr);
3076 	(*pr)("  iodone %p objlock %p\n", bp->b_iodone, bp->b_objlock);
3077 }
3078 
3079 
3080 void
3081 vfs_vnode_print(struct vnode *vp, int full, void (*pr)(const char *, ...))
3082 {
3083 	char bf[256];
3084 
3085 	uvm_object_printit(&vp->v_uobj, full, pr);
3086 	snprintb(bf, sizeof(bf),
3087 	    vnode_flagbits, vp->v_iflag | vp->v_vflag | vp->v_uflag);
3088 	(*pr)("\nVNODE flags %s\n", bf);
3089 	(*pr)("mp %p numoutput %d size 0x%llx writesize 0x%llx\n",
3090 	      vp->v_mount, vp->v_numoutput, vp->v_size, vp->v_writesize);
3091 
3092 	(*pr)("data %p writecount %ld holdcnt %ld\n",
3093 	      vp->v_data, vp->v_writecount, vp->v_holdcnt);
3094 
3095 	(*pr)("tag %s(%d) type %s(%d) mount %p typedata %p\n",
3096 	      ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag,
3097 	      ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type,
3098 	      vp->v_mount, vp->v_mountedhere);
3099 
3100 	(*pr)("v_lock %p v_vnlock %p\n", &vp->v_lock, vp->v_vnlock);
3101 
3102 	if (full) {
3103 		struct buf *bp;
3104 
3105 		(*pr)("clean bufs:\n");
3106 		LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) {
3107 			(*pr)(" bp %p\n", bp);
3108 			vfs_buf_print(bp, full, pr);
3109 		}
3110 
3111 		(*pr)("dirty bufs:\n");
3112 		LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
3113 			(*pr)(" bp %p\n", bp);
3114 			vfs_buf_print(bp, full, pr);
3115 		}
3116 	}
3117 }
3118 
3119 void
3120 vfs_mount_print(struct mount *mp, int full, void (*pr)(const char *, ...))
3121 {
3122 	char sbuf[256];
3123 
3124 	(*pr)("vnodecovered = %p syncer = %p data = %p\n",
3125 			mp->mnt_vnodecovered,mp->mnt_syncer,mp->mnt_data);
3126 
3127 	(*pr)("fs_bshift %d dev_bshift = %d\n",
3128 			mp->mnt_fs_bshift,mp->mnt_dev_bshift);
3129 
3130 	snprintb(sbuf, sizeof(sbuf), __MNT_FLAG_BITS, mp->mnt_flag);
3131 	(*pr)("flag = %s\n", sbuf);
3132 
3133 	snprintb(sbuf, sizeof(sbuf), __IMNT_FLAG_BITS, mp->mnt_iflag);
3134 	(*pr)("iflag = %s\n", sbuf);
3135 
3136 	(*pr)("refcnt = %d unmounting @ %p updating @ %p\n", mp->mnt_refcnt,
3137 	    &mp->mnt_unmounting, &mp->mnt_updating);
3138 
3139 	(*pr)("statvfs cache:\n");
3140 	(*pr)("\tbsize = %lu\n",mp->mnt_stat.f_bsize);
3141 	(*pr)("\tfrsize = %lu\n",mp->mnt_stat.f_frsize);
3142 	(*pr)("\tiosize = %lu\n",mp->mnt_stat.f_iosize);
3143 
3144 	(*pr)("\tblocks = %"PRIu64"\n",mp->mnt_stat.f_blocks);
3145 	(*pr)("\tbfree = %"PRIu64"\n",mp->mnt_stat.f_bfree);
3146 	(*pr)("\tbavail = %"PRIu64"\n",mp->mnt_stat.f_bavail);
3147 	(*pr)("\tbresvd = %"PRIu64"\n",mp->mnt_stat.f_bresvd);
3148 
3149 	(*pr)("\tfiles = %"PRIu64"\n",mp->mnt_stat.f_files);
3150 	(*pr)("\tffree = %"PRIu64"\n",mp->mnt_stat.f_ffree);
3151 	(*pr)("\tfavail = %"PRIu64"\n",mp->mnt_stat.f_favail);
3152 	(*pr)("\tfresvd = %"PRIu64"\n",mp->mnt_stat.f_fresvd);
3153 
3154 	(*pr)("\tf_fsidx = { 0x%"PRIx32", 0x%"PRIx32" }\n",
3155 			mp->mnt_stat.f_fsidx.__fsid_val[0],
3156 			mp->mnt_stat.f_fsidx.__fsid_val[1]);
3157 
3158 	(*pr)("\towner = %"PRIu32"\n",mp->mnt_stat.f_owner);
3159 	(*pr)("\tnamemax = %lu\n",mp->mnt_stat.f_namemax);
3160 
3161 	snprintb(sbuf, sizeof(sbuf), __MNT_FLAG_BITS, mp->mnt_stat.f_flag);
3162 
3163 	(*pr)("\tflag = %s\n",sbuf);
3164 	(*pr)("\tsyncwrites = %" PRIu64 "\n",mp->mnt_stat.f_syncwrites);
3165 	(*pr)("\tasyncwrites = %" PRIu64 "\n",mp->mnt_stat.f_asyncwrites);
3166 	(*pr)("\tsyncreads = %" PRIu64 "\n",mp->mnt_stat.f_syncreads);
3167 	(*pr)("\tasyncreads = %" PRIu64 "\n",mp->mnt_stat.f_asyncreads);
3168 	(*pr)("\tfstypename = %s\n",mp->mnt_stat.f_fstypename);
3169 	(*pr)("\tmntonname = %s\n",mp->mnt_stat.f_mntonname);
3170 	(*pr)("\tmntfromname = %s\n",mp->mnt_stat.f_mntfromname);
3171 
3172 	{
3173 		int cnt = 0;
3174 		struct vnode *vp;
3175 		(*pr)("locked vnodes =");
3176 		TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
3177 			if (VOP_ISLOCKED(vp)) {
3178 				if ((++cnt % 6) == 0) {
3179 					(*pr)(" %p,\n\t", vp);
3180 				} else {
3181 					(*pr)(" %p,", vp);
3182 				}
3183 			}
3184 		}
3185 		(*pr)("\n");
3186 	}
3187 
3188 	if (full) {
3189 		int cnt = 0;
3190 		struct vnode *vp;
3191 		(*pr)("all vnodes =");
3192 		TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
3193 			if (!TAILQ_NEXT(vp, v_mntvnodes)) {
3194 				(*pr)(" %p", vp);
3195 			} else if ((++cnt % 6) == 0) {
3196 				(*pr)(" %p,\n\t", vp);
3197 			} else {
3198 				(*pr)(" %p,", vp);
3199 			}
3200 		}
3201 		(*pr)("\n");
3202 	}
3203 }
3204 #endif /* DDB */
3205