1 /*	$NetBSD: vfs_subr.c,v 1.340 2008/05/02 17:40:30 ad Exp $	*/
2 
3 /*-
4  * Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9  * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 /*
34  * Copyright (c) 1989, 1993
35  *	The Regents of the University of California.  All rights reserved.
36  * (c) UNIX System Laboratories, Inc.
37  * All or some portions of this file are derived from material licensed
38  * to the University of California by American Telephone and Telegraph
39  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
40  * the permission of UNIX System Laboratories, Inc.
41  *
42  * Redistribution and use in source and binary forms, with or without
43  * modification, are permitted provided that the following conditions
44  * are met:
45  * 1. Redistributions of source code must retain the above copyright
46  *    notice, this list of conditions and the following disclaimer.
47  * 2. Redistributions in binary form must reproduce the above copyright
48  *    notice, this list of conditions and the following disclaimer in the
49  *    documentation and/or other materials provided with the distribution.
50  * 3. Neither the name of the University nor the names of its contributors
51  *    may be used to endorse or promote products derived from this software
52  *    without specific prior written permission.
53  *
54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64  * SUCH DAMAGE.
65  *
66  *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
67  */
68 
69 /*
70  * External virtual filesystem routines.
71  *
72  * This file contains vfs subroutines which are heavily dependent on
73  * the kernel and are not suitable for standalone use.  Examples include
74  * routines involved in vnode and mountpoint management.
75  */
76 
77 #include <sys/cdefs.h>
78 __KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.340 2008/05/02 17:40:30 ad Exp $");
79 
80 #include "opt_ddb.h"
81 #include "opt_compat_netbsd.h"
82 #include "opt_compat_43.h"
83 
84 #include <sys/param.h>
85 #include <sys/systm.h>
86 #include <sys/proc.h>
87 #include <sys/kernel.h>
88 #include <sys/mount.h>
89 #include <sys/fcntl.h>
90 #include <sys/vnode.h>
91 #include <sys/stat.h>
92 #include <sys/namei.h>
93 #include <sys/ucred.h>
94 #include <sys/buf.h>
95 #include <sys/errno.h>
96 #include <sys/malloc.h>
97 #include <sys/syscallargs.h>
98 #include <sys/device.h>
99 #include <sys/filedesc.h>
100 #include <sys/kauth.h>
101 #include <sys/atomic.h>
102 #include <sys/kthread.h>
103 
104 #include <miscfs/specfs/specdev.h>
105 #include <miscfs/syncfs/syncfs.h>
106 
107 #include <uvm/uvm.h>
108 #include <uvm/uvm_readahead.h>
109 #include <uvm/uvm_ddb.h>
110 
111 #include <sys/sysctl.h>
112 
113 extern int dovfsusermount;	/* 1 => permit any user to mount filesystems */
114 extern int vfs_magiclinks;	/* 1 => expand "magic" symlinks */
115 
116 static vnodelst_t vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list);
117 static vnodelst_t vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list);
118 static vnodelst_t vrele_list = TAILQ_HEAD_INITIALIZER(vrele_list);
119 
120 static int vrele_pending;
121 static kmutex_t	vrele_lock;
122 static kcondvar_t vrele_cv;
123 static lwp_t *vrele_lwp;
124 
125 static pool_cache_t vnode_cache;
126 
127 MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes");
128 
129 /*
130  * Local declarations.
131  */
132 
133 static void vrele_thread(void *);
134 static void insmntque(vnode_t *, struct mount *);
135 static int getdevvp(dev_t, vnode_t **, enum vtype);
136 static vnode_t *getcleanvnode(void);
137 void vpanic(vnode_t *, const char *);
138 
139 #ifdef DIAGNOSTIC
140 void
141 vpanic(vnode_t *vp, const char *msg)
142 {
143 
144 	vprint(NULL, vp);
145 	panic("%s\n", msg);
146 }
147 #else
148 #define	vpanic(vp, msg)	/* nothing */
149 #endif
150 
151 void
152 vn_init1(void)
153 {
154 
155 	vnode_cache = pool_cache_init(sizeof(struct vnode), 0, 0, 0, "vnodepl",
156 	    NULL, IPL_NONE, NULL, NULL, NULL);
157 	KASSERT(vnode_cache != NULL);
158 
159 	/* Create deferred release thread. */
160 	mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
161 	cv_init(&vrele_cv, "vrele");
162 	if (kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
163 	    NULL, &vrele_lwp, "vrele"))
164 		panic("fork vrele");
165 }
166 
167 int
168 vfs_drainvnodes(long target, struct lwp *l)
169 {
170 
171 	while (numvnodes > target) {
172 		vnode_t *vp;
173 
174 		mutex_enter(&vnode_free_list_lock);
175 		vp = getcleanvnode();
176 		if (vp == NULL)
177 			return EBUSY; /* give up */
178 		ungetnewvnode(vp);
179 	}
180 
181 	return 0;
182 }
183 
184 /*
185  * Grab a vnode from the freelist and clean it.
186  */
187 vnode_t *
188 getcleanvnode(void)
189 {
190 	vnode_t *vp;
191 	vnodelst_t *listhd;
192 
193 	KASSERT(mutex_owned(&vnode_free_list_lock));
194 
195 retry:
196 	listhd = &vnode_free_list;
197 try_nextlist:
198 	TAILQ_FOREACH(vp, listhd, v_freelist) {
199 		/*
200 		 * It's safe to test v_usecount and v_iflag
201 		 * without holding the interlock here: vnodes in
202 		 * these states should never appear on the free
203 		 * lists, so the tests below are sanity checks only.
204 		 */
205 		if (vp->v_usecount != 0) {
206 			vpanic(vp, "free vnode isn't");
207 		}
208 		if ((vp->v_iflag & VI_CLEAN) != 0) {
209 			vpanic(vp, "clean vnode on freelist");
210 		}
211 		if (vp->v_freelisthd != listhd) {
212 			printf("vnode sez %p, listhd %p\n", vp->v_freelisthd, listhd);
213 			vpanic(vp, "list head mismatch");
214 		}
215 		if (!mutex_tryenter(&vp->v_interlock))
216 			continue;
217 		/*
218 		 * Our lwp might hold the underlying vnode
219 		 * locked, so don't try to reclaim a VI_LAYER
220 		 * node if it's locked.
221 		 */
222 		if ((vp->v_iflag & VI_XLOCK) == 0 &&
223 		    ((vp->v_iflag & VI_LAYER) == 0 || VOP_ISLOCKED(vp) == 0)) {
224 			break;
225 		}
226 		mutex_exit(&vp->v_interlock);
227 	}
228 
229 	if (vp == NULL) {
230 		if (listhd == &vnode_free_list) {
231 			listhd = &vnode_hold_list;
232 			goto try_nextlist;
233 		}
234 		mutex_exit(&vnode_free_list_lock);
235 		return NULL;
236 	}
237 
238 	/* Remove it from the freelist. */
239 	TAILQ_REMOVE(listhd, vp, v_freelist);
240 	vp->v_freelisthd = NULL;
241 	mutex_exit(&vnode_free_list_lock);
242 
243 	/*
244 	 * The vnode is still associated with a file system, so we must
245 	 * clean it out before reusing it.  We need to add a reference
246 	 * before doing this.  If the vnode gains another reference while
247 	 * being cleaned out then we lose - retry.
248 	 */
249 	vp->v_usecount++;
250 	vclean(vp, DOCLOSE);
251 	if (vp->v_usecount == 1) {
252 		/* We're about to dirty it. */
253 		vp->v_iflag &= ~VI_CLEAN;
254 		mutex_exit(&vp->v_interlock);
255 		if (vp->v_type == VBLK || vp->v_type == VCHR) {
256 			spec_node_destroy(vp);
257 		}
258 		vp->v_type = VNON;
259 	} else {
260 		/*
261 		 * Don't return to freelist - the holder of the last
262 		 * reference will destroy it.
263 		 */
264 		KASSERT(vp->v_usecount > 1);
265 		vp->v_usecount--;
266 		mutex_exit(&vp->v_interlock);
267 		mutex_enter(&vnode_free_list_lock);
268 		goto retry;
269 	}
270 
271 	if (vp->v_data != NULL || vp->v_uobj.uo_npages != 0 ||
272 	    !TAILQ_EMPTY(&vp->v_uobj.memq)) {
273 		vpanic(vp, "cleaned vnode isn't");
274 	}
275 	if (vp->v_numoutput != 0) {
276 		vpanic(vp, "clean vnode has pending I/O's");
277 	}
278 	if ((vp->v_iflag & VI_ONWORKLST) != 0) {
279 		vpanic(vp, "clean vnode on syncer list");
280 	}
281 
282 	return vp;
283 }
284 
285 static inline int
286 vfs_dobusy(struct mount *mp, const krw_t op, struct mount **nextp)
287 {
288 	lwp_t *l;
289 
290 	KASSERT(mp->mnt_refcnt > 0);
291 
292 	atomic_inc_uint(&mp->mnt_refcnt);
293 	if (nextp != NULL) {
294 		mutex_exit(&mountlist_lock);
295 	}
296 	l = curlwp;
297 	if (l->l_mpbusy == mp) {
298 		if (op == RW_WRITER) {
299 			KASSERT(rw_write_held(&mp->mnt_lock));
300 		} else {
301 			KASSERT(rw_lock_held(&mp->mnt_lock));
302 		}
303 		l->l_mprecurse++;
304 	} else {
305 		rw_enter(&mp->mnt_lock, op);
306 		l->l_mpbusy = mp;
307 	}
308 	if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) {
309 		if (nextp != NULL) {
310 			mutex_enter(&mountlist_lock);
311 		}
312 		vfs_unbusy(mp, false, nextp);
313 		return ENOENT;
314 	}
315 
316 	return 0;
317 }
318 
319 /*
320  * Mark a mount point as busy, and gain a new reference to it.  Used to
321  * synchronize access and to delay unmounting.
322  *
323  * => The caller must hold a pre-existing reference to the mount.
324  */
325 int
326 vfs_busy(struct mount *mp, const krw_t op)
327 {
328 	int error;
329 
330 	for (;;) {
331 		error = vfs_dobusy(mp, op, NULL);
332 		if (error != 0) {
333 			return error;
334 		}
335 		if (__predict_true(mp->mnt_unmounter == NULL)) {
336 			return 0;
337 		}
338 		mutex_enter(&mount_lock);
339 		if (mp->mnt_unmounter != NULL) {
340 			vfs_unbusy(mp, false, NULL);
341 			cv_wait(&mount_cv, &mount_lock);
342 		}
343 		mutex_exit(&mount_lock);
344 	}
345 }
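
/*
 * Illustrative sketch, not part of the original source: a caller that
 * already holds a reference to the mount typically brackets an
 * operation with vfs_busy()/vfs_unbusy(), as getnewvnode() does below.
 *
 *	error = vfs_busy(mp, RW_READER);
 *	if (error != 0)
 *		return error;
 *	(operate on the file system)
 *	vfs_unbusy(mp, false, NULL);
 */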
346 
347 /*
348  * As vfs_busy(), but return error if the file system is being
349  * unmounted (and do not wait for the unmount).
350  *
351  * => If nextp != NULL, mountlist_lock is understood to be held.  On
352  *    failure a pointer to the next mount will be returned via nextp.
353  *    The caller need not hold a reference to the mount.
354  *
355  * => If nextp == NULL, the caller is expected to hold a reference
356  *    to the mount.
357  */
358 int
359 vfs_trybusy(struct mount *mp, krw_t op, struct mount **nextp)
360 {
361 	lwp_t *l;
362 	int error;
363 
364 	KASSERT(nextp == NULL || mutex_owned(&mountlist_lock));
365 
366 	if (nextp != NULL) {
367 		/*
368 		 * We need to prevent adding a reference to the mount
369 		 * if it is already on the way out: the reference count
370 		 * could be zero, and as a result another thread could
371 		 * be in vfs_destroy() trying to throw away the mount.
372 		 *
373 		 * mnt_iflag is protected by mnt_lock, but this check is
374 		 * safe if mountlist_lock is held.  mountlist_lock will
375 		 * be held by vfs_destroy() before removing the mount
376 		 * from mountlist.
377 		 */
378 		if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) {
379 			*nextp = CIRCLEQ_NEXT(mp, mnt_list);
380 			return ENOENT;
381 		}
382 	}
383 
384 	error = vfs_dobusy(mp, op, nextp);
385 	l = mp->mnt_unmounter;
386 	if (error == 0 && (l != NULL && l != curlwp)) {
387 		if (nextp != NULL) {
388 			mutex_enter(&mountlist_lock);
389 		}
390 		vfs_unbusy(mp, false, nextp);
391 		error = EBUSY;
392 	}
393 	return error;
394 }
395 
396 /*
397  * Unlock a busy filesystem and drop reference to it.  If 'keepref' is
398  * true, unlock but preserve the reference.
399  *
400  * => If nextp != NULL, mountlist_lock is understood to be held.  On
401  *    failure a pointer to the next mount will be returned via nextp.
402  */
403 void
404 vfs_unbusy(struct mount *mp, bool keepref, struct mount **nextp)
405 {
406 	lwp_t *l;
407 
408 	KASSERT(mp->mnt_refcnt > 0);
409 
410 	l = curlwp;
411 	if (l->l_mpbusy != NULL) {
412 		KASSERT(l->l_mpbusy == mp);
413 		KASSERT(rw_lock_held(&mp->mnt_lock));
414 		if (l->l_mprecurse != 0) {
415 			l->l_mprecurse--;
416 		} else {
417 			l->l_mpbusy = NULL;
418 			rw_exit(&mp->mnt_lock);
419 		}
420 	} else {
421 		rw_exit(&mp->mnt_lock);
422 	}
423 	if (nextp != NULL) {
424 		*nextp = CIRCLEQ_NEXT(mp, mnt_list);
425 	}
426 	if (!keepref) {
427 		vfs_destroy(mp, nextp != NULL);
428 	}
429 }
430 
431 /*
432  * Lookup a filesystem type, and if found allocate and initialize
433  * a mount structure for it.
434  *
435  * Devname is usually updated by mount(8) after booting.
436  */
437 int
438 vfs_rootmountalloc(const char *fstypename, const char *devname,
439     struct mount **mpp)
440 {
441 	struct vfsops *vfsp = NULL;
442 	struct mount *mp;
443 
444 	mutex_enter(&vfs_list_lock);
445 	LIST_FOREACH(vfsp, &vfs_list, vfs_list)
446 		if (!strncmp(vfsp->vfs_name, fstypename,
447 		    sizeof(mp->mnt_stat.f_fstypename)))
448 			break;
449 	if (vfsp == NULL) {
450 		mutex_exit(&vfs_list_lock);
451 		return (ENODEV);
452 	}
453 	vfsp->vfs_refcount++;
454 	mutex_exit(&vfs_list_lock);
455 
456 	mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
457 	if (mp == NULL)
458 		return ENOMEM;
459 	mp->mnt_refcnt = 1;
460 	rw_init(&mp->mnt_lock);
461 	mutex_init(&mp->mnt_renamelock, MUTEX_DEFAULT, IPL_NONE);
462 	(void)vfs_busy(mp, RW_WRITER);
463 	TAILQ_INIT(&mp->mnt_vnodelist);
464 	mp->mnt_op = vfsp;
465 	mp->mnt_flag = MNT_RDONLY;
466 	mp->mnt_vnodecovered = NULL;
467 	(void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name,
468 	    sizeof(mp->mnt_stat.f_fstypename));
469 	mp->mnt_stat.f_mntonname[0] = '/';
470 	mp->mnt_stat.f_mntonname[1] = '\0';
471 	mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] =
472 	    '\0';
473 	(void)copystr(devname, mp->mnt_stat.f_mntfromname,
474 	    sizeof(mp->mnt_stat.f_mntfromname) - 1, 0);
475 	mount_initspecific(mp);
476 	*mpp = mp;
477 	return (0);
478 }
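
/*
 * Illustrative sketch (assumption; the callers live in the individual
 * file systems): a file system's mountroot routine typically obtains
 * its struct mount through this helper, e.g.:
 *
 *	error = vfs_rootmountalloc("ffs", "root_device", &mp);
 *	if (error != 0)
 *		return error;
 */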
479 
480 /*
481  * Routines having to do with the management of the vnode table.
482  */
483 extern int (**dead_vnodeop_p)(void *);
484 
485 /*
486  * Return the next vnode from the free list.
487  */
488 int
489 getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
490 	    vnode_t **vpp)
491 {
492 	struct uvm_object *uobj;
493 	static int toggle;
494 	vnode_t *vp;
495 	int error = 0, tryalloc;
496 
497  try_again:
498 	if (mp != NULL) {
499 		/*
500 		 * Mark filesystem busy while we're creating a
501 		 * vnode.  If an unmount is in progress, this will
502 		 * wait; if the unmount succeeds (which can only
503 		 * happen with a forced unmount), this will return
504 		 * an error.  If the unmount fails, we keep going.
505 		 */
506 		error = vfs_busy(mp, RW_READER);
507 		if (error)
508 			return error;
509 	}
510 
511 	/*
512 	 * We must choose whether to allocate a new vnode or recycle an
513 	 * existing one. The criterion for allocating a new one is that
514 	 * the total number of vnodes is less than the number desired or
515 	 * there are no vnodes on either free list. Generally we only
516 	 * want to recycle vnodes that have no buffers associated with
517 	 * them, so we look first on the vnode_free_list. If it is empty,
518 	 * we next consider vnodes with referencing buffers on the
519 	 * vnode_hold_list. The toggle ensures that half the time we
520 	 * will use a vnode from the vnode_hold_list, and half the time
521 	 * we will allocate a new one unless the list has grown to twice
522 	 * the desired size. We are reluctant to recycle vnodes from the
523 	 * vnode_hold_list because we will lose the identity of all its
524 	 * referencing buffers.
525 	 */
526 
527 	vp = NULL;
528 
529 	mutex_enter(&vnode_free_list_lock);
530 
531 	toggle ^= 1;
532 	if (numvnodes > 2 * desiredvnodes)
533 		toggle = 0;
534 
535 	tryalloc = numvnodes < desiredvnodes ||
536 	    (TAILQ_FIRST(&vnode_free_list) == NULL &&
537 	     (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle));
538 
539 	if (tryalloc) {
540 		numvnodes++;
541 		mutex_exit(&vnode_free_list_lock);
542 		if ((vp = vnalloc(NULL)) == NULL) {
543 			mutex_enter(&vnode_free_list_lock);
544 			numvnodes--;
545 		} else
546 			vp->v_usecount = 1;
547 	}
548 
549 	if (vp == NULL) {
550 		vp = getcleanvnode();
551 		if (vp == NULL) {
552 			if (mp != NULL) {
553 				vfs_unbusy(mp, false, NULL);
554 			}
555 			if (tryalloc) {
556 				printf("WARNING: unable to allocate new "
557 				    "vnode, retrying...\n");
558 				(void) tsleep(&lbolt, PRIBIO, "newvn", hz);
559 				goto try_again;
560 			}
561 			tablefull("vnode", "increase kern.maxvnodes or NVNODE");
562 			*vpp = NULL;
563 			return (ENFILE);
564 		}
565 		vp->v_iflag = 0;
566 		vp->v_vflag = 0;
567 		vp->v_uflag = 0;
568 		vp->v_socket = NULL;
569 	}
570 
571 	KASSERT(vp->v_usecount == 1);
572 	KASSERT(vp->v_freelisthd == NULL);
573 	KASSERT(LIST_EMPTY(&vp->v_nclist));
574 	KASSERT(LIST_EMPTY(&vp->v_dnclist));
575 
576 	vp->v_type = VNON;
577 	vp->v_vnlock = &vp->v_lock;
578 	vp->v_tag = tag;
579 	vp->v_op = vops;
580 	insmntque(vp, mp);
581 	*vpp = vp;
582 	vp->v_data = NULL;
583 
584 	/*
585 	 * initialize uvm_object within vnode.
586 	 */
587 
588 	uobj = &vp->v_uobj;
589 	KASSERT(uobj->pgops == &uvm_vnodeops);
590 	KASSERT(uobj->uo_npages == 0);
591 	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
592 	vp->v_size = vp->v_writesize = VSIZENOTSET;
593 
594 	if (mp != NULL) {
595 		if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
596 			vp->v_vflag |= VV_MPSAFE;
597 		vfs_unbusy(mp, true, NULL);
598 	}
599 
600 	return (0);
601 }
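
/*
 * Illustrative sketch (assumption; names are hypothetical): a file
 * system's VFS_VGET typically allocates a vnode here, then attaches
 * its private per-file data:
 *
 *	error = getnewvnode(VT_UFS, mp, example_vnodeop_p, &vp);
 *	if (error != 0)
 *		return error;
 *	vp->v_data = ip;
 *
 * If another thread set up the same file first, the loser of the race
 * gives the vnode back with ungetnewvnode() (see below).
 */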
602 
603 /*
604  * This is really just the reverse of getnewvnode(). Needed for
605  * VFS_VGET functions that may need to push back a vnode in case
606  * of a locking race.
607  */
608 void
609 ungetnewvnode(vnode_t *vp)
610 {
611 
612 	KASSERT(vp->v_usecount == 1);
613 	KASSERT(vp->v_data == NULL);
614 	KASSERT(vp->v_freelisthd == NULL);
615 
616 	mutex_enter(&vp->v_interlock);
617 	vp->v_iflag |= VI_CLEAN;
618 	vrelel(vp, 0);
619 }
620 
621 /*
622  * Allocate a new, uninitialized vnode.  If 'mp' is non-NULL, this is a
623  * marker vnode and we are prepared to wait for the allocation.
624  */
625 vnode_t *
626 vnalloc(struct mount *mp)
627 {
628 	vnode_t *vp;
629 
630 	vp = pool_cache_get(vnode_cache, (mp != NULL ? PR_WAITOK : PR_NOWAIT));
631 	if (vp == NULL) {
632 		return NULL;
633 	}
634 
635 	memset(vp, 0, sizeof(*vp));
636 	UVM_OBJ_INIT(&vp->v_uobj, &uvm_vnodeops, 0);
637 	cv_init(&vp->v_cv, "vnode");
638 	/*
639 	 * done by memset() above.
640 	 *	LIST_INIT(&vp->v_nclist);
641 	 *	LIST_INIT(&vp->v_dnclist);
642 	 */
643 
644 	if (mp != NULL) {
645 		vp->v_mount = mp;
646 		vp->v_type = VBAD;
647 		vp->v_iflag = VI_MARKER;
648 	} else {
649 		rw_init(&vp->v_lock.vl_lock);
650 	}
651 
652 	return vp;
653 }
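
/*
 * Illustrative sketch, based on the usage in vflush() and
 * sysctl_kern_vnode() below: a marker vnode (vnalloc() with a non-NULL
 * mount) holds a stable position while walking mnt_vnodelist:
 *
 *	mvp = vnalloc(mp);
 *	mutex_enter(&mntvnode_lock);
 *	for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp != NULL;
 *	    vp = vunmark(mvp)) {
 *		vmark(mvp, vp);
 *		if (vp->v_mount != mp || vismarker(vp))
 *			continue;
 *		(examine vp)
 *	}
 *	mutex_exit(&mntvnode_lock);
 *	vnfree(mvp);
 */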
654 
655 /*
656  * Free an unused, unreferenced vnode.
657  */
658 void
659 vnfree(vnode_t *vp)
660 {
661 
662 	KASSERT(vp->v_usecount == 0);
663 
664 	if ((vp->v_iflag & VI_MARKER) == 0) {
665 		rw_destroy(&vp->v_lock.vl_lock);
666 		mutex_enter(&vnode_free_list_lock);
667 		numvnodes--;
668 		mutex_exit(&vnode_free_list_lock);
669 	}
670 
671 	UVM_OBJ_DESTROY(&vp->v_uobj);
672 	cv_destroy(&vp->v_cv);
673 	pool_cache_put(vnode_cache, vp);
674 }
675 
676 /*
677  * Remove a vnode from its freelist.
678  */
679 static inline void
680 vremfree(vnode_t *vp)
681 {
682 
683 	KASSERT(mutex_owned(&vp->v_interlock));
684 	KASSERT(vp->v_usecount == 0);
685 
686 	/*
687 	 * Note that the reference count must not change until
688 	 * the vnode is removed.
689 	 */
690 	mutex_enter(&vnode_free_list_lock);
691 	if (vp->v_holdcnt > 0) {
692 		KASSERT(vp->v_freelisthd == &vnode_hold_list);
693 	} else {
694 		KASSERT(vp->v_freelisthd == &vnode_free_list);
695 	}
696 	TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
697 	vp->v_freelisthd = NULL;
698 	mutex_exit(&vnode_free_list_lock);
699 }
700 
701 /*
702  * Move a vnode from one mount queue to another.
703  */
704 static void
705 insmntque(vnode_t *vp, struct mount *mp)
706 {
707 	struct mount *omp;
708 
709 #ifdef DIAGNOSTIC
710 	if ((mp != NULL) &&
711 	    (mp->mnt_iflag & IMNT_UNMOUNT) &&
712 	    !(mp->mnt_flag & MNT_SOFTDEP) &&
713 	    vp->v_tag != VT_VFS) {
714 		panic("insmntque into dying filesystem");
715 	}
716 #endif
717 
718 	mutex_enter(&mntvnode_lock);
719 	/*
720 	 * Delete from old mount point vnode list, if on one.
721 	 */
722 	if ((omp = vp->v_mount) != NULL)
723 		TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes);
724 	/*
725 	 * Insert into list of vnodes for the new mount point, if
726 	 * available.  The caller must take a reference on the mount
727 	 * structure and donate to the vnode.
728 	 */
729 	if ((vp->v_mount = mp) != NULL)
730 		TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
731 	mutex_exit(&mntvnode_lock);
732 
733 	if (omp != NULL) {
734 		/* Release reference to old mount. */
735 		vfs_destroy(omp, false);
736 	}
737 }
738 
739 /*
740  * Create a vnode for a block device.
741  * Used for root filesystem and swap areas.
742  * Also used for memory file system special devices.
743  */
744 int
745 bdevvp(dev_t dev, vnode_t **vpp)
746 {
747 
748 	return (getdevvp(dev, vpp, VBLK));
749 }
750 
751 /*
752  * Create a vnode for a character device.
753  * Used for kernfs and some console handling.
754  */
755 int
756 cdevvp(dev_t dev, vnode_t **vpp)
757 {
758 
759 	return (getdevvp(dev, vpp, VCHR));
760 }
761 
762 /*
763  * Create a vnode for a device.
764  * Used by bdevvp (block device) for root file system etc.,
765  * and by cdevvp (character device) for console and kernfs.
766  */
767 static int
768 getdevvp(dev_t dev, vnode_t **vpp, enum vtype type)
769 {
770 	vnode_t *vp;
771 	vnode_t *nvp;
772 	int error;
773 
774 	if (dev == NODEV) {
775 		*vpp = NULL;
776 		return (0);
777 	}
778 	error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp);
779 	if (error) {
780 		*vpp = NULL;
781 		return (error);
782 	}
783 	vp = nvp;
784 	vp->v_type = type;
785 	vp->v_vflag |= VV_MPSAFE;
786 	uvm_vnp_setsize(vp, 0);
787 	spec_node_init(vp, dev);
788 	*vpp = vp;
789 	return (0);
790 }
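
/*
 * Illustrative sketch, mirroring vfs_mountroot() below: the block
 * special vnode for the root device is obtained with bdevvp() before
 * the root file system is mounted:
 *
 *	if (bdevvp(rootdev, &rootvp))
 *		panic("can't get vnode for rootdev");
 *	error = VOP_OPEN(rootvp, FREAD, FSCRED);
 */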
791 
792 /*
793  * Grab a particular vnode from the free list, increment its
794  * reference count and lock it. If the vnode lock bit is set the
795  * vnode is being eliminated in vgone. In that case, we cannot
796  * grab the vnode, so the process is awakened when the transition is
797  * completed, and an error is returned to indicate that the vnode is no
798  * longer usable (possibly having been changed to a new file system type).
799  */
800 int
801 vget(vnode_t *vp, int flags)
802 {
803 	int error;
804 
805 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
806 
807 	if ((flags & LK_INTERLOCK) == 0)
808 		mutex_enter(&vp->v_interlock);
809 
810 	/*
811 	 * Before adding a reference, we must remove the vnode
812 	 * from its freelist.
813 	 */
814 	if (vp->v_usecount == 0) {
815 		vremfree(vp);
816 	}
817 	if (++vp->v_usecount == 0) {
818 		vpanic(vp, "vget: usecount overflow");
819 	}
820 
821 	/*
822 	 * If the vnode is in the process of being cleaned out for
823 	 * another use, we wait for the cleaning to finish and then
824 	 * return failure.  Cleaning is determined by checking if
825 	 * the VI_XLOCK or VI_FREEING flags are set.
826 	 */
827 	if ((vp->v_iflag & (VI_XLOCK | VI_FREEING)) != 0) {
828 		if ((flags & LK_NOWAIT) != 0) {
829 			vrelel(vp, 0);
830 			return EBUSY;
831 		}
832 		vwait(vp, VI_XLOCK | VI_FREEING);
833 		vrelel(vp, 0);
834 		return ENOENT;
835 	}
836 	if (flags & LK_TYPE_MASK) {
837 		error = vn_lock(vp, flags | LK_INTERLOCK);
838 		if (error != 0) {
839 			vrele(vp);
840 		}
841 		return error;
842 	}
843 	mutex_exit(&vp->v_interlock);
844 	return 0;
845 }
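
/*
 * Illustrative sketch (assumption, not taken from this file): a caller
 * that found vp in a cache usually takes a reference and the vnode
 * lock in one step, retrying the lookup if the vnode is being cleaned:
 *
 *	error = vget(vp, LK_EXCLUSIVE);
 *	if (error != 0)
 *		goto retry_lookup;
 */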
846 
847 /*
848  * vput(), just unlock and vrele()
849  */
850 void
851 vput(vnode_t *vp)
852 {
853 
854 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
855 
856 	VOP_UNLOCK(vp, 0);
857 	vrele(vp);
858 }
859 
860 /*
861  * Vnode release.  If reference count drops to zero, call inactive
862  * routine and either return to freelist or free to the pool.
863  */
864 void
865 vrelel(vnode_t *vp, int flags)
866 {
867 	bool recycle, defer;
868 	int error;
869 
870 	KASSERT(mutex_owned(&vp->v_interlock));
871 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
872 	KASSERT(vp->v_freelisthd == NULL);
873 
874 	if (vp->v_op == dead_vnodeop_p && (vp->v_iflag & VI_CLEAN) == 0) {
875 		vpanic(vp, "dead but not clean");
876 	}
877 
878 	/*
879 	 * If not the last reference, just drop the reference count
880 	 * and unlock.
881 	 */
882 	if (vp->v_usecount > 1) {
883 		vp->v_usecount--;
884 		vp->v_iflag |= VI_INACTREDO;
885 		mutex_exit(&vp->v_interlock);
886 		return;
887 	}
888 	if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
889 		vpanic(vp, "vput: bad ref count");
890 	}
891 
892 	/*
893 	 * If not clean, deactivate the vnode, but preserve
894 	 * our reference across the call to VOP_INACTIVE().
895 	 */
896  retry:
897 	if ((vp->v_iflag & VI_CLEAN) == 0) {
898 		recycle = false;
899 		/*
900 		 * XXX This ugly block can be largely eliminated if
901 		 * locking is pushed down into the file systems.
902 		 */
903 		if (curlwp == uvm.pagedaemon_lwp) {
904 			/* The pagedaemon can't wait around; defer. */
905 			defer = true;
906 		} else if (curlwp == vrele_lwp) {
907 			/* We have to try harder. */
908 			vp->v_iflag &= ~VI_INACTREDO;
909 			error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK |
910 			    LK_RETRY);
911 			if (error != 0) {
912 				/* XXX */
913 				vpanic(vp, "vrele: unable to lock vnode");
914 			}
915 			defer = false;
916 		} else if ((vp->v_iflag & VI_LAYER) != 0) {
917 			/*
918 			 * Acquiring the stack's lock in vclean() even
919 			 * for an honest vput/vrele is dangerous because
920 			 * our caller may hold other vnode locks; defer.
921 			 */
922 			defer = true;
923 		} else {
924 			/* If we can't acquire the lock, then defer. */
925 			vp->v_iflag &= ~VI_INACTREDO;
926 			error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK |
927 			    LK_NOWAIT);
928 			if (error != 0) {
929 				defer = true;
930 				mutex_enter(&vp->v_interlock);
931 			} else {
932 				defer = false;
933 			}
934 		}
935 
936 		if (defer) {
937 			/*
938 			 * Defer reclaim to the kthread; it's not safe to
939 			 * clean it here.  We donate it our last reference.
940 			 */
941 			KASSERT(mutex_owned(&vp->v_interlock));
942 			KASSERT((vp->v_iflag & VI_INACTPEND) == 0);
943 			vp->v_iflag |= VI_INACTPEND;
944 			mutex_enter(&vrele_lock);
945 			TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist);
946 			if (++vrele_pending > (desiredvnodes >> 8))
947 				cv_signal(&vrele_cv);
948 			mutex_exit(&vrele_lock);
949 			mutex_exit(&vp->v_interlock);
950 			return;
951 		}
952 
953 #ifdef DIAGNOSTIC
954 		if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
955 		    vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
956 			vprint("vrelel: missing VOP_CLOSE()", vp);
957 		}
958 #endif
959 
960 		/*
961 		 * The vnode can gain another reference while being
962 		 * deactivated.  If VOP_INACTIVE() indicates that
963 		 * the described file has been deleted, then recycle
964 		 * the vnode irrespective of additional references.
965 		 * Another thread may be waiting to re-use the on-disk
966 		 * inode.
967 		 *
968 		 * Note that VOP_INACTIVE() will drop the vnode lock.
969 		 */
970 		VOP_INACTIVE(vp, &recycle);
971 		mutex_enter(&vp->v_interlock);
972 		if (!recycle) {
973 			if (vp->v_usecount > 1) {
974 				vp->v_usecount--;
975 				mutex_exit(&vp->v_interlock);
976 				return;
977 			}
978 
979 			/*
980 			 * If we grew another reference while
981 			 * VOP_INACTIVE() was underway, retry.
982 			 */
983 			if ((vp->v_iflag & VI_INACTREDO) != 0) {
984 				goto retry;
985 			}
986 		}
987 
988 		/* Take care of space accounting. */
989 		if (vp->v_iflag & VI_EXECMAP) {
990 			atomic_add_int(&uvmexp.execpages,
991 			    -vp->v_uobj.uo_npages);
992 			atomic_add_int(&uvmexp.filepages,
993 			    vp->v_uobj.uo_npages);
994 		}
995 		vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP|VI_MAPPED);
996 		vp->v_vflag &= ~VV_MAPPED;
997 
998 		/*
999 		 * Recycle the vnode if the file is now unused (unlinked),
1000 		 * otherwise just free it.
1001 		 */
1002 		if (recycle) {
1003 			vclean(vp, DOCLOSE);
1004 		}
1005 		KASSERT(vp->v_usecount > 0);
1006 	}
1007 
1008 	if (--vp->v_usecount != 0) {
1009 		/* Gained another reference while being reclaimed. */
1010 		mutex_exit(&vp->v_interlock);
1011 		return;
1012 	}
1013 
1014 	if ((vp->v_iflag & VI_CLEAN) != 0) {
1015 		/*
1016 		 * It's clean so destroy it.  It isn't referenced
1017 		 * anywhere since it has been reclaimed.
1018 		 */
1019 		KASSERT(vp->v_holdcnt == 0);
1020 		KASSERT(vp->v_writecount == 0);
1021 		mutex_exit(&vp->v_interlock);
1022 		insmntque(vp, NULL);
1023 		if (vp->v_type == VBLK || vp->v_type == VCHR) {
1024 			spec_node_destroy(vp);
1025 		}
1026 		vnfree(vp);
1027 	} else {
1028 		/*
1029 		 * Otherwise, put it back onto the freelist.  It
1030 		 * can't be destroyed while still associated with
1031 		 * a file system.
1032 		 */
1033 		mutex_enter(&vnode_free_list_lock);
1034 		if (vp->v_holdcnt > 0) {
1035 			vp->v_freelisthd = &vnode_hold_list;
1036 		} else {
1037 			vp->v_freelisthd = &vnode_free_list;
1038 		}
1039 		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
1040 		mutex_exit(&vnode_free_list_lock);
1041 		mutex_exit(&vp->v_interlock);
1042 	}
1043 }
1044 
1045 void
1046 vrele(vnode_t *vp)
1047 {
1048 
1049 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
1050 
1051 	mutex_enter(&vp->v_interlock);
1052 	vrelel(vp, 0);
1053 }
1054 
1055 static void
1056 vrele_thread(void *cookie)
1057 {
1058 	vnode_t *vp;
1059 
1060 	for (;;) {
1061 		mutex_enter(&vrele_lock);
1062 		while (TAILQ_EMPTY(&vrele_list)) {
1063 			cv_timedwait(&vrele_cv, &vrele_lock, hz);
1064 		}
1065 		vp = TAILQ_FIRST(&vrele_list);
1066 		TAILQ_REMOVE(&vrele_list, vp, v_freelist);
1067 		vrele_pending--;
1068 		mutex_exit(&vrele_lock);
1069 
1070 		/*
1071 		 * If not the last reference, then ignore the vnode
1072 		 * and look for more work.
1073 		 */
1074 		mutex_enter(&vp->v_interlock);
1075 		KASSERT((vp->v_iflag & VI_INACTPEND) != 0);
1076 		vp->v_iflag &= ~VI_INACTPEND;
1077 		if (vp->v_usecount > 1) {
1078 			vp->v_usecount--;
1079 			mutex_exit(&vp->v_interlock);
1080 			continue;
1081 		}
1082 		vrelel(vp, 0);
1083 	}
1084 }
1085 
1086 /*
1087  * Page or buffer structure gets a reference.
1088  * Called with v_interlock held.
1089  */
1090 void
1091 vholdl(vnode_t *vp)
1092 {
1093 
1094 	KASSERT(mutex_owned(&vp->v_interlock));
1095 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
1096 
1097 	if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) {
1098 		mutex_enter(&vnode_free_list_lock);
1099 		KASSERT(vp->v_freelisthd == &vnode_free_list);
1100 		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
1101 		vp->v_freelisthd = &vnode_hold_list;
1102 		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
1103 		mutex_exit(&vnode_free_list_lock);
1104 	}
1105 }
1106 
1107 /*
1108  * Page or buffer structure frees a reference.
1109  * Called with v_interlock held.
1110  */
1111 void
1112 holdrelel(vnode_t *vp)
1113 {
1114 
1115 	KASSERT(mutex_owned(&vp->v_interlock));
1116 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
1117 
1118 	if (vp->v_holdcnt <= 0) {
1119 		vpanic(vp, "holdrelel: holdcnt");
1120 	}
1121 
1122 	vp->v_holdcnt--;
1123 	if (vp->v_holdcnt == 0 && vp->v_usecount == 0) {
1124 		mutex_enter(&vnode_free_list_lock);
1125 		KASSERT(vp->v_freelisthd == &vnode_hold_list);
1126 		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
1127 		vp->v_freelisthd = &vnode_free_list;
1128 		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
1129 		mutex_exit(&vnode_free_list_lock);
1130 	}
1131 }
1132 
1133 /*
1134  * Vnode reference, where a reference is already held by some other
1135  * object (for example, a file structure).
1136  */
1137 void
1138 vref(vnode_t *vp)
1139 {
1140 
1141 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
1142 
1143 	mutex_enter(&vp->v_interlock);
1144 	if (vp->v_usecount <= 0) {
1145 		vpanic(vp, "vref used where vget required");
1146 	}
1147 	if (++vp->v_usecount == 0) {
1148 		vpanic(vp, "vref: usecount overflow");
1149 	}
1150 	mutex_exit(&vp->v_interlock);
1151 }
1152 
1153 /*
1154  * Remove any vnodes in the vnode table belonging to mount point mp.
1155  *
1156  * If FORCECLOSE is not specified, there should not be any active vnodes;
1157  * return an error if any are found (nb: this is a user error, not a
1158  * system error). If FORCECLOSE is specified, detach any active vnodes
1159  * that are found.
1160  *
1161  * If WRITECLOSE is set, only flush out regular file vnodes open for
1162  * writing.
1163  *
1164  * SKIPSYSTEM causes any vnodes marked V_SYSTEM to be skipped.
1165  */
1166 #ifdef DEBUG
1167 int busyprt = 0;	/* print out busy vnodes */
1168 struct ctldebug debug1 = { "busyprt", &busyprt };
1169 #endif
1170 
1171 static vnode_t *
1172 vflushnext(vnode_t *mvp, int *when)
1173 {
1174 
1175 	if (hardclock_ticks > *when) {
1176 		mutex_exit(&mntvnode_lock);
1177 		yield();
1178 		mutex_enter(&mntvnode_lock);
1179 		*when = hardclock_ticks + hz / 10;
1180 	}
1181 
1182 	return vunmark(mvp);
1183 }
1184 
1185 int
1186 vflush(struct mount *mp, vnode_t *skipvp, int flags)
1187 {
1188 	vnode_t *vp, *mvp;
1189 	int busy = 0, when = 0;
1190 
1191 	/* Allocate a marker vnode. */
1192 	if ((mvp = vnalloc(mp)) == NULL)
1193 		return (ENOMEM);
1194 
1195 	mutex_enter(&mntvnode_lock);
1196 	/*
1197 	 * NOTE: not using TAILQ_FOREACH here, since vgone() and vclean()
1198 	 * are called within the loop and can modify the list.
1199 	 */
1200 	for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp != NULL;
1201 	    vp = vflushnext(mvp, &when)) {
1202 		vmark(mvp, vp);
1203 		if (vp->v_mount != mp || vismarker(vp))
1204 			continue;
1205 		/*
1206 		 * Skip over a selected vnode.
1207 		 */
1208 		if (vp == skipvp)
1209 			continue;
1210 		mutex_enter(&vp->v_interlock);
1211 		/*
1212 		 * Ignore clean but still referenced vnodes.
1213 		 */
1214 		if ((vp->v_iflag & VI_CLEAN) != 0) {
1215 			mutex_exit(&vp->v_interlock);
1216 			continue;
1217 		}
1218 		/*
1219 		 * Skip over vnodes marked VV_SYSTEM.
1220 		 */
1221 		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
1222 			mutex_exit(&vp->v_interlock);
1223 			continue;
1224 		}
1225 		/*
1226 		 * If WRITECLOSE is set, only flush out regular file
1227 		 * vnodes open for writing.
1228 		 */
1229 		if ((flags & WRITECLOSE) &&
1230 		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
1231 			mutex_exit(&vp->v_interlock);
1232 			continue;
1233 		}
1234 		/*
1235 		 * With v_usecount == 0, all we need to do is clear
1236 		 * out the vnode data structures and we are done.
1237 		 */
1238 		if (vp->v_usecount == 0) {
1239 			mutex_exit(&mntvnode_lock);
1240 			vremfree(vp);
1241 			vp->v_usecount++;
1242 			vclean(vp, DOCLOSE);
1243 			vrelel(vp, 0);
1244 			mutex_enter(&mntvnode_lock);
1245 			continue;
1246 		}
1247 		/*
1248 		 * If FORCECLOSE is set, forcibly close the vnode.
1249 		 * For block or character devices, revert to an
1250 		 * anonymous device.  For all other files, just
1251 		 * kill them.
1252 		 */
1253 		if (flags & FORCECLOSE) {
1254 			mutex_exit(&mntvnode_lock);
1255 			vp->v_usecount++;
1256 			if (vp->v_type != VBLK && vp->v_type != VCHR) {
1257 				vclean(vp, DOCLOSE);
1258 				vrelel(vp, 0);
1259 			} else {
1260 				vclean(vp, 0);
1261 				vp->v_op = spec_vnodeop_p; /* XXXSMP */
1262 				mutex_exit(&vp->v_interlock);
1263 				/*
1264 				 * The vnode isn't clean, but still resides
1265 				 * on the mount list.  Remove it. XXX This
1266 				 * is a bit dodgy.
1267 				 */
1268 				insmntque(vp, NULL);
1269 				vrele(vp);
1270 			}
1271 			mutex_enter(&mntvnode_lock);
1272 			continue;
1273 		}
1274 #ifdef DEBUG
1275 		if (busyprt)
1276 			vprint("vflush: busy vnode", vp);
1277 #endif
1278 		mutex_exit(&vp->v_interlock);
1279 		busy++;
1280 	}
1281 	mutex_exit(&mntvnode_lock);
1282 	vnfree(mvp);
1283 	if (busy)
1284 		return (EBUSY);
1285 	return (0);
1286 }
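
/*
 * Illustrative sketch (assumption; dounmount() and the per-file-system
 * unmount code are not in this file): an unmount typically flushes all
 * vnodes on the mount, forcibly if MNT_FORCE was given:
 *
 *	flags = (mntflags & MNT_FORCE) ? FORCECLOSE : 0;
 *	error = vflush(mp, NULLVP, flags);
 *	if (error != 0)
 *		return error;
 */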
1287 
1288 /*
1289  * Disassociate the underlying file system from a vnode.
1290  *
1291  * Must be called with the interlock held, and will return with it held.
1292  */
1293 void
1294 vclean(vnode_t *vp, int flags)
1295 {
1296 	lwp_t *l = curlwp;
1297 	bool recycle, active;
1298 	int error;
1299 
1300 	KASSERT(mutex_owned(&vp->v_interlock));
1301 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
1302 	KASSERT(vp->v_usecount != 0);
1303 
1304 	/* If cleaning is already in progress, wait until done and return. */
1305 	if (vp->v_iflag & VI_XLOCK) {
1306 		vwait(vp, VI_XLOCK);
1307 		return;
1308 	}
1309 
1310 	/* If already clean, nothing to do. */
1311 	if ((vp->v_iflag & VI_CLEAN) != 0) {
1312 		return;
1313 	}
1314 
1315 	/*
1316 	 * Prevent the vnode from being recycled or brought into use
1317 	 * while we clean it out.
1318 	 */
1319 	vp->v_iflag |= VI_XLOCK;
1320 	if (vp->v_iflag & VI_EXECMAP) {
1321 		atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
1322 		atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
1323 	}
1324 	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
1325 	active = (vp->v_usecount > 1);
1326 
1327 	/* XXXAD should not lock vnode under layer */
1328 	VOP_LOCK(vp, LK_EXCLUSIVE | LK_INTERLOCK);
1329 
1330 	/*
1331 	 * Clean out any cached data associated with the vnode.
1332 	 * If purging an active vnode, it must be closed and
1333 	 * deactivated before being reclaimed. Note that the
1334 	 * VOP_INACTIVE will unlock the vnode.
1335 	 */
1336 	if (flags & DOCLOSE) {
1337 		error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
1338 		if (error != 0)
1339 			error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
1340 		KASSERT(error == 0);
1341 		KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1342 		if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
1343 			spec_node_revoke(vp);
1344 		}
1345 	}
1346 	if (active) {
1347 		VOP_INACTIVE(vp, &recycle);
1348 	} else {
1349 		/*
1350 		 * Any other processes trying to obtain this lock must first
1351 		 * wait for VI_XLOCK to clear, then call the new lock operation.
1352 		 */
1353 		VOP_UNLOCK(vp, 0);
1354 	}
1355 
1356 	/* Disassociate the underlying file system from the vnode. */
1357 	if (VOP_RECLAIM(vp)) {
1358 		vpanic(vp, "vclean: cannot reclaim");
1359 	}
1360 
1361 	KASSERT(vp->v_uobj.uo_npages == 0);
1362 	if (vp->v_type == VREG && vp->v_ractx != NULL) {
1363 		uvm_ra_freectx(vp->v_ractx);
1364 		vp->v_ractx = NULL;
1365 	}
1366 	cache_purge(vp);
1367 
1368 	/* Done with purge, notify sleepers of the grim news. */
1369 	vp->v_op = dead_vnodeop_p;
1370 	vp->v_tag = VT_NON;
1371 	mutex_enter(&vp->v_interlock);
1372 	vp->v_vnlock = &vp->v_lock;
1373 	KNOTE(&vp->v_klist, NOTE_REVOKE);
1374 	vp->v_iflag &= ~(VI_XLOCK | VI_FREEING);
1375 	vp->v_vflag &= ~VV_LOCKSWORK;
1376 	if ((flags & DOCLOSE) != 0) {
1377 		vp->v_iflag |= VI_CLEAN;
1378 	}
1379 	cv_broadcast(&vp->v_cv);
1380 
1381 	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1382 }
1383 
1384 /*
1385  * Recycle an unused vnode to the front of the free list.
1386  * Release the passed interlock if the vnode will be recycled.
1387  */
1388 int
1389 vrecycle(vnode_t *vp, kmutex_t *inter_lkp, struct lwp *l)
1390 {
1391 
1392 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
1393 
1394 	mutex_enter(&vp->v_interlock);
1395 	if (vp->v_usecount != 0) {
1396 		mutex_exit(&vp->v_interlock);
1397 		return (0);
1398 	}
1399 	if (inter_lkp)
1400 		mutex_exit(inter_lkp);
1401 	vremfree(vp);
1402 	vp->v_usecount++;
1403 	vclean(vp, DOCLOSE);
1404 	vrelel(vp, 0);
1405 	return (1);
1406 }
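
/*
 * Illustrative sketch (assumption, not taken from this file): a file
 * system that notices one of its inactive vnodes is no longer wanted,
 * e.g. because the backing object went away, can reclaim it directly:
 *
 *	(void)vrecycle(vp, NULL, curlwp);
 */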
1407 
1408 /*
1409  * Eliminate all activity associated with a vnode in preparation for
1410  * reuse.  Drops a reference from the vnode.
1411  */
1412 void
1413 vgone(vnode_t *vp)
1414 {
1415 
1416 	mutex_enter(&vp->v_interlock);
1417 	vclean(vp, DOCLOSE);
1418 	vrelel(vp, 0);
1419 }
1420 
1421 /*
1422  * Lookup a vnode by device number.
1423  */
1424 int
1425 vfinddev(dev_t dev, enum vtype type, vnode_t **vpp)
1426 {
1427 	vnode_t *vp;
1428 	int rc = 0;
1429 
1430 	mutex_enter(&specfs_lock);
1431 	for (vp = specfs_hash[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
1432 		if (dev != vp->v_rdev || type != vp->v_type)
1433 			continue;
1434 		*vpp = vp;
1435 		rc = 1;
1436 		break;
1437 	}
1438 	mutex_exit(&specfs_lock);
1439 	return (rc);
1440 }
1441 
1442 /*
1443  * Revoke all the vnodes corresponding to the specified minor number
1444  * range (endpoints inclusive) of the specified major.
1445  */
1446 void
1447 vdevgone(int maj, int minl, int minh, enum vtype type)
1448 {
1449 	vnode_t *vp, **vpp;
1450 	dev_t dev;
1451 	int mn;
1452 
1453 	vp = NULL;	/* XXX gcc */
1454 
1455 	mutex_enter(&specfs_lock);
1456 	for (mn = minl; mn <= minh; mn++) {
1457 		dev = makedev(maj, mn);
1458 		vpp = &specfs_hash[SPECHASH(dev)];
1459 		for (vp = *vpp; vp != NULL;) {
1460 			mutex_enter(&vp->v_interlock);
1461 			if ((vp->v_iflag & VI_CLEAN) != 0 ||
1462 			    dev != vp->v_rdev || type != vp->v_type) {
1463 				mutex_exit(&vp->v_interlock);
1464 				vp = vp->v_specnext;
1465 				continue;
1466 			}
1467 			mutex_exit(&specfs_lock);
1468 			if (vget(vp, LK_INTERLOCK) == 0) {
1469 				VOP_REVOKE(vp, REVOKEALL);
1470 				vrele(vp);
1471 			}
1472 			mutex_enter(&specfs_lock);
1473 			vp = *vpp;
1474 		}
1475 	}
1476 	mutex_exit(&specfs_lock);
1477 }
1478 
1479 /*
1480  * Calculate the total number of references to a special device.
1481  */
1482 int
1483 vcount(vnode_t *vp)
1484 {
1485 	int count;
1486 
1487 	mutex_enter(&specfs_lock);
1488 	mutex_enter(&vp->v_interlock);
1489 	if (vp->v_specnode == NULL) {
1490 		count = vp->v_usecount - ((vp->v_iflag & VI_INACTPEND) != 0);
1491 		mutex_exit(&vp->v_interlock);
1492 		mutex_exit(&specfs_lock);
1493 		return (count);
1494 	}
1495 	mutex_exit(&vp->v_interlock);
1496 	count = vp->v_specnode->sn_dev->sd_opencnt;
1497 	mutex_exit(&specfs_lock);
1498 	return (count);
1499 }
1500 
1501 /*
1502  * Eliminate all activity associated with the requested vnode
1503  * and with all vnodes aliased to the requested vnode.
1504  */
1505 void
1506 vrevoke(vnode_t *vp)
1507 {
1508 	vnode_t *vq, **vpp;
1509 	enum vtype type;
1510 	dev_t dev;
1511 
1512 	KASSERT(vp->v_usecount > 0);
1513 
1514 	mutex_enter(&vp->v_interlock);
1515 	if ((vp->v_iflag & VI_CLEAN) != 0) {
1516 		mutex_exit(&vp->v_interlock);
1517 		return;
1518 	} else {
1519 		dev = vp->v_rdev;
1520 		type = vp->v_type;
1521 		mutex_exit(&vp->v_interlock);
1522 	}
1523 
1524 	vpp = &specfs_hash[SPECHASH(dev)];
1525 	mutex_enter(&specfs_lock);
1526 	for (vq = *vpp; vq != NULL;) {
1527 		/* If clean or being cleaned, then ignore it. */
1528 		mutex_enter(&vq->v_interlock);
1529 		if ((vq->v_iflag & (VI_CLEAN | VI_XLOCK)) != 0 ||
1530 		    vq->v_rdev != dev || vq->v_type != type) {
1531 			mutex_exit(&vq->v_interlock);
1532 			vq = vq->v_specnext;
1533 			continue;
1534 		}
1535 		mutex_exit(&specfs_lock);
1536 		if (vq->v_usecount == 0) {
1537 			vremfree(vq);
1538 		}
1539 		vq->v_usecount++;
1540 		vclean(vq, DOCLOSE);
1541 		vrelel(vq, 0);
1542 		mutex_enter(&specfs_lock);
1543 		vq = *vpp;
1544 	}
1545 	mutex_exit(&specfs_lock);
1546 }
1547 
1548 /*
1549  * sysctl helper routine to return list of supported fstypes
1550  */
1551 static int
1552 sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS)
1553 {
1554 	char bf[sizeof(((struct statvfs *)NULL)->f_fstypename)];
1555 	char *where = oldp;
1556 	struct vfsops *v;
1557 	size_t needed, left, slen;
1558 	int error, first;
1559 
1560 	if (newp != NULL)
1561 		return (EPERM);
1562 	if (namelen != 0)
1563 		return (EINVAL);
1564 
1565 	first = 1;
1566 	error = 0;
1567 	needed = 0;
1568 	left = *oldlenp;
1569 
1570 	sysctl_unlock();
1571 	mutex_enter(&vfs_list_lock);
1572 	LIST_FOREACH(v, &vfs_list, vfs_list) {
1573 		if (where == NULL)
1574 			needed += strlen(v->vfs_name) + 1;
1575 		else {
1576 			memset(bf, 0, sizeof(bf));
1577 			if (first) {
1578 				strncpy(bf, v->vfs_name, sizeof(bf));
1579 				first = 0;
1580 			} else {
1581 				bf[0] = ' ';
1582 				strncpy(bf + 1, v->vfs_name, sizeof(bf) - 1);
1583 			}
1584 			bf[sizeof(bf)-1] = '\0';
1585 			slen = strlen(bf);
1586 			if (left < slen + 1)
1587 				break;
1588 			/* +1 to copy out the trailing NUL byte */
1589 			v->vfs_refcount++;
1590 			mutex_exit(&vfs_list_lock);
1591 			error = copyout(bf, where, slen + 1);
1592 			mutex_enter(&vfs_list_lock);
1593 			v->vfs_refcount--;
1594 			if (error)
1595 				break;
1596 			where += slen;
1597 			needed += slen;
1598 			left -= slen;
1599 		}
1600 	}
1601 	mutex_exit(&vfs_list_lock);
1602 	sysctl_relock();
1603 	*oldlenp = needed;
1604 	return (error);
1605 }
1606 
1607 /*
1608  * Top level filesystem related information gathering.
1609  */
1610 SYSCTL_SETUP(sysctl_vfs_setup, "sysctl vfs subtree setup")
1611 {
1612 	sysctl_createv(clog, 0, NULL, NULL,
1613 		       CTLFLAG_PERMANENT,
1614 		       CTLTYPE_NODE, "vfs", NULL,
1615 		       NULL, 0, NULL, 0,
1616 		       CTL_VFS, CTL_EOL);
1617 	sysctl_createv(clog, 0, NULL, NULL,
1618 		       CTLFLAG_PERMANENT,
1619 		       CTLTYPE_NODE, "generic",
1620 		       SYSCTL_DESCR("Non-specific vfs related information"),
1621 		       NULL, 0, NULL, 0,
1622 		       CTL_VFS, VFS_GENERIC, CTL_EOL);
1623 	sysctl_createv(clog, 0, NULL, NULL,
1624 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1625 		       CTLTYPE_INT, "usermount",
1626 		       SYSCTL_DESCR("Whether unprivileged users may mount "
1627 				    "filesystems"),
1628 		       NULL, 0, &dovfsusermount, 0,
1629 		       CTL_VFS, VFS_GENERIC, VFS_USERMOUNT, CTL_EOL);
1630 	sysctl_createv(clog, 0, NULL, NULL,
1631 		       CTLFLAG_PERMANENT,
1632 		       CTLTYPE_STRING, "fstypes",
1633 		       SYSCTL_DESCR("List of file systems present"),
1634 		       sysctl_vfs_generic_fstypes, 0, NULL, 0,
1635 		       CTL_VFS, VFS_GENERIC, CTL_CREATE, CTL_EOL);
1636 	sysctl_createv(clog, 0, NULL, NULL,
1637 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1638 		       CTLTYPE_INT, "magiclinks",
1639 		       SYSCTL_DESCR("Whether \"magic\" symlinks are expanded"),
1640 		       NULL, 0, &vfs_magiclinks, 0,
1641 		       CTL_VFS, VFS_GENERIC, VFS_MAGICLINKS, CTL_EOL);
1642 }
1643 
1644 
1645 int kinfo_vdebug = 1;
1646 int kinfo_vgetfailed;
1647 #define KINFO_VNODESLOP	10
1648 /*
1649  * Dump vnode list (via sysctl).
1650  * Copy out the address of each vnode followed by the vnode itself.
1651  */
1652 /* ARGSUSED */
1653 int
1654 sysctl_kern_vnode(SYSCTLFN_ARGS)
1655 {
1656 	char *where = oldp;
1657 	size_t *sizep = oldlenp;
1658 	struct mount *mp, *nmp;
1659 	vnode_t *vp, *mvp, vbuf;
1660 	char *bp = where, *savebp;
1661 	char *ewhere;
1662 	int error;
1663 
1664 	if (namelen != 0)
1665 		return (EOPNOTSUPP);
1666 	if (newp != NULL)
1667 		return (EPERM);
1668 
1669 #define VPTRSZ	sizeof(vnode_t *)
1670 #define VNODESZ	sizeof(vnode_t)
1671 	if (where == NULL) {
1672 		*sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
1673 		return (0);
1674 	}
1675 	ewhere = where + *sizep;
1676 
1677 	sysctl_unlock();
1678 	mutex_enter(&mountlist_lock);
1679 	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
1680 	     mp = nmp) {
1681 		if (vfs_trybusy(mp, RW_READER, &nmp)) {
1682 			continue;
1683 		}
1684 		savebp = bp;
1685 		/* Allocate a marker vnode. */
1686 		if ((mvp = vnalloc(mp)) == NULL) {
1687 			sysctl_relock();
1688 			return (ENOMEM);
1689 		}
1690 		mutex_enter(&mntvnode_lock);
1691 		for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) {
1692 			vmark(mvp, vp);
1693 			/*
1694 			 * Check that the vp is still associated with
1695 			 * this filesystem.  RACE: could have been
1696 			 * recycled onto the same filesystem.
1697 			 */
1698 			if (vp->v_mount != mp || vismarker(vp))
1699 				continue;
1700 			if (bp + VPTRSZ + VNODESZ > ewhere) {
1701 				(void)vunmark(mvp);
1702 				mutex_exit(&mntvnode_lock);
1703 				vnfree(mvp);
1704 				sysctl_relock();
1705 				*sizep = bp - where;
1706 				return (ENOMEM);
1707 			}
1708 			memcpy(&vbuf, vp, VNODESZ);
1709 			mutex_exit(&mntvnode_lock);
1710 			if ((error = copyout(vp, bp, VPTRSZ)) ||
1711 			   (error = copyout(&vbuf, bp + VPTRSZ, VNODESZ))) {
1712 			   	mutex_enter(&mntvnode_lock);
1713 				(void)vunmark(mvp);
1714 				mutex_exit(&mntvnode_lock);
1715 				vnfree(mvp);
1716 				sysctl_relock();
1717 				return (error);
1718 			}
1719 			bp += VPTRSZ + VNODESZ;
1720 			mutex_enter(&mntvnode_lock);
1721 		}
1722 		mutex_exit(&mntvnode_lock);
1723 		mutex_enter(&mountlist_lock);
1724 		vfs_unbusy(mp, false, &nmp);
1725 		vnfree(mvp);
1726 	}
1727 	mutex_exit(&mountlist_lock);
1728 	sysctl_relock();
1729 
1730 	*sizep = bp - where;
1731 	return (0);
1732 }
1733 
1734 /*
1735  * Remove clean vnodes from a mountpoint's vnode list.
1736  */
1737 void
1738 vfs_scrubvnlist(struct mount *mp)
1739 {
1740 	vnode_t *vp, *nvp;
1741 
1742  retry:
1743 	mutex_enter(&mntvnode_lock);
1744 	for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
1745 		nvp = TAILQ_NEXT(vp, v_mntvnodes);
1746 		mutex_enter(&vp->v_interlock);
1747 		if ((vp->v_iflag & VI_CLEAN) != 0) {
1748 			TAILQ_REMOVE(&mp->mnt_vnodelist, vp, v_mntvnodes);
1749 			vp->v_mount = NULL;
1750 			mutex_exit(&mntvnode_lock);
1751 			mutex_exit(&vp->v_interlock);
1752 			vfs_destroy(mp, false);
1753 			goto retry;
1754 		}
1755 		mutex_exit(&vp->v_interlock);
1756 	}
1757 	mutex_exit(&mntvnode_lock);
1758 }
1759 
1760 /*
1761  * Check to see if a filesystem is mounted on a block device.
1762  */
1763 int
1764 vfs_mountedon(vnode_t *vp)
1765 {
1766 	vnode_t *vq;
1767 	int error = 0;
1768 
1769 	if (vp->v_type != VBLK)
1770 		return ENOTBLK;
1771 	if (vp->v_specmountpoint != NULL)
1772 		return (EBUSY);
1773 	mutex_enter(&specfs_lock);
1774 	for (vq = specfs_hash[SPECHASH(vp->v_rdev)]; vq != NULL;
1775 	    vq = vq->v_specnext) {
1776 		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
1777 			continue;
1778 		if (vq->v_specmountpoint != NULL) {
1779 			error = EBUSY;
1780 			break;
1781 		}
1782 	}
1783 	mutex_exit(&specfs_lock);
1784 	return (error);
1785 }
1786 
1787 /*
1788  * Unmount all file systems.
1789  * We traverse the list in reverse order under the assumption that doing so
1790  * will avoid needing to worry about dependencies.
1791  */
1792 void
1793 vfs_unmountall(struct lwp *l)
1794 {
1795 	struct mount *mp, *nmp;
1796 	int allerror, error;
1797 
1798 	printf("unmounting file systems...");
1799 	for (allerror = 0, mp = CIRCLEQ_LAST(&mountlist);
1800 	     !CIRCLEQ_EMPTY(&mountlist);
1801 	     mp = nmp) {
1802 		nmp = CIRCLEQ_PREV(mp, mnt_list);
1803 #ifdef DEBUG
1804 		printf("\nunmounting %s (%s)...",
1805 		    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname);
1806 #endif
1807 		/*
1808 		 * XXX Freeze syncer.  Must do this before locking the
1809 		 * mount point.  See dounmount() for details.
1810 		 */
1811 		mutex_enter(&syncer_mutex);
1812 		if (vfs_busy(mp, RW_WRITER)) {
1813 			mutex_exit(&syncer_mutex);
1814 			continue;
1815 		}
1816 		if ((error = dounmount(mp, MNT_FORCE, l)) != 0) {
1817 			printf("unmount of %s failed with error %d\n",
1818 			    mp->mnt_stat.f_mntonname, error);
1819 			allerror = 1;
1820 		}
1821 	}
1822 	printf(" done\n");
1823 	if (allerror)
1824 		printf("WARNING: some file systems would not unmount\n");
1825 }
1826 
1827 /*
1828  * Sync and unmount file systems before shutting down.
1829  */
1830 void
1831 vfs_shutdown(void)
1832 {
1833 	struct lwp *l;
1834 
1835 	/* XXX we're certainly not running in lwp0's context! */
1836 	l = curlwp;
1837 	if (l == NULL)
1838 		l = &lwp0;
1839 
1840 	printf("syncing disks... ");
1841 
1842 	/* remove user processes from run queue */
1843 	suspendsched();
1844 	(void) spl0();
1845 
1846 	/* avoid coming back this way again if we panic. */
1847 	doing_shutdown = 1;
1848 
1849 	sys_sync(l, NULL, NULL);
1850 
1851 	/* Wait for sync to finish. */
1852 	if (buf_syncwait() != 0) {
1853 #if defined(DDB) && defined(DEBUG_HALT_BUSY)
1854 		Debugger();
1855 #endif
1856 		printf("giving up\n");
1857 		return;
1858 	} else
1859 		printf("done\n");
1860 
1861 	/*
1862 	 * If we've panic'd, don't make the situation potentially
1863 	 * worse by unmounting the file systems.
1864 	 */
1865 	if (panicstr != NULL)
1866 		return;
1867 
1868 	/* Release inodes held by texts before update. */
1869 #ifdef notdef
1870 	vnshutdown();
1871 #endif
1872 	/* Unmount file systems. */
1873 	vfs_unmountall(l);
1874 }
1875 
1876 /*
1877  * Mount the root file system.  If the operator didn't specify a
1878  * file system to use, try all possible file systems until one
1879  * succeeds.
1880  */
1881 int
1882 vfs_mountroot(void)
1883 {
1884 	struct vfsops *v;
1885 	int error = ENODEV;
1886 
1887 	if (root_device == NULL)
1888 		panic("vfs_mountroot: root device unknown");
1889 
1890 	switch (device_class(root_device)) {
1891 	case DV_IFNET:
1892 		if (rootdev != NODEV)
1893 			panic("vfs_mountroot: rootdev set for DV_IFNET "
1894 			    "(0x%08x -> %d,%d)", rootdev,
1895 			    major(rootdev), minor(rootdev));
1896 		break;
1897 
1898 	case DV_DISK:
1899 		if (rootdev == NODEV)
1900 			panic("vfs_mountroot: rootdev not set for DV_DISK");
1901 		if (bdevvp(rootdev, &rootvp))
1902 			panic("vfs_mountroot: can't get vnode for rootdev");
1903 		error = VOP_OPEN(rootvp, FREAD, FSCRED);
1904 		if (error) {
1905 			printf("vfs_mountroot: can't open root device\n");
1906 			return (error);
1907 		}
1908 		break;
1909 
1910 	default:
1911 		printf("%s: inappropriate for root file system\n",
1912 		    device_xname(root_device));
1913 		return (ENODEV);
1914 	}
1915 
1916 	/*
1917 	 * If user specified a file system, use it.
1918 	 */
1919 	if (mountroot != NULL) {
1920 		error = (*mountroot)();
1921 		goto done;
1922 	}
1923 
1924 	/*
1925 	 * Try each file system currently configured into the kernel.
1926 	 */
1927 	mutex_enter(&vfs_list_lock);
1928 	LIST_FOREACH(v, &vfs_list, vfs_list) {
1929 		if (v->vfs_mountroot == NULL)
1930 			continue;
1931 #ifdef DEBUG
1932 		aprint_normal("mountroot: trying %s...\n", v->vfs_name);
1933 #endif
1934 		v->vfs_refcount++;
1935 		mutex_exit(&vfs_list_lock);
1936 		error = (*v->vfs_mountroot)();
1937 		mutex_enter(&vfs_list_lock);
1938 		v->vfs_refcount--;
1939 		if (!error) {
1940 			aprint_normal("root file system type: %s\n",
1941 			    v->vfs_name);
1942 			break;
1943 		}
1944 	}
1945 	mutex_exit(&vfs_list_lock);
1946 
1947 	if (v == NULL) {
1948 		printf("no file system for %s", device_xname(root_device));
1949 		if (device_class(root_device) == DV_DISK)
1950 			printf(" (dev 0x%x)", rootdev);
1951 		printf("\n");
1952 		error = EFTYPE;
1953 	}
1954 
1955 done:
1956 	if (error && device_class(root_device) == DV_DISK) {
1957 		VOP_CLOSE(rootvp, FREAD, FSCRED);
1958 		vrele(rootvp);
1959 	}
1960 	return (error);
1961 }
1962 
1963 /*
1964  * Sham lock manager for vnodes.  This is a temporary measure.
1965  */
1966 int
1967 vlockmgr(struct vnlock *vl, int flags)
1968 {
1969 
1970 	KASSERT((flags & ~(LK_CANRECURSE | LK_NOWAIT | LK_TYPE_MASK)) == 0);
1971 
1972 	switch (flags & LK_TYPE_MASK) {
1973 	case LK_SHARED:
1974 		if (rw_tryenter(&vl->vl_lock, RW_READER)) {
1975 			return 0;
1976 		}
1977 		if ((flags & LK_NOWAIT) != 0) {
1978 			return EBUSY;
1979 		}
1980 		rw_enter(&vl->vl_lock, RW_READER);
1981 		return 0;
1982 
1983 	case LK_EXCLUSIVE:
1984 		if (rw_tryenter(&vl->vl_lock, RW_WRITER)) {
1985 			return 0;
1986 		}
1987 		if ((vl->vl_canrecurse || (flags & LK_CANRECURSE) != 0) &&
1988 		    rw_write_held(&vl->vl_lock)) {
1989 			vl->vl_recursecnt++;
1990 			return 0;
1991 		}
1992 		if ((flags & LK_NOWAIT) != 0) {
1993 			return EBUSY;
1994 		}
1995 		rw_enter(&vl->vl_lock, RW_WRITER);
1996 		return 0;
1997 
1998 	case LK_RELEASE:
1999 		if (vl->vl_recursecnt != 0) {
2000 			KASSERT(rw_write_held(&vl->vl_lock));
2001 			vl->vl_recursecnt--;
2002 			return 0;
2003 		}
2004 		rw_exit(&vl->vl_lock);
2005 		return 0;
2006 
2007 	default:
2008 		panic("vlockmgr: flags %x", flags);
2009 	}
2010 }
2011 
2012 int
2013 vlockstatus(struct vnlock *vl)
2014 {
2015 
2016 	if (rw_write_held(&vl->vl_lock)) {
2017 		return LK_EXCLUSIVE;
2018 	}
2019 	if (rw_read_held(&vl->vl_lock)) {
2020 		return LK_SHARED;
2021 	}
2022 	return 0;
2023 }
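
/*
 * Illustrative sketch (assumption): the generic lock/unlock vnode
 * operations are expected to funnel into this sham lock manager via
 * the vnode's v_vnlock, roughly as:
 *
 *	error = vlockmgr(vp->v_vnlock, LK_EXCLUSIVE);
 *	(critical section)
 *	(void)vlockmgr(vp->v_vnlock, LK_RELEASE);
 */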
2024