xref: /netbsd-src/sys/kern/vfs_subr.c (revision 8b0f9554ff8762542c4defc4f70e1eb76fb508fa)
1 /*	$NetBSD: vfs_subr.c,v 1.308 2007/12/01 10:36:47 yamt Exp $	*/
2 
3 /*-
4  * Copyright (c) 1997, 1998, 2004, 2005, 2007 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9  * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. All advertising materials mentioning features or use of this software
20  *    must display the following acknowledgement:
21  *	This product includes software developed by the NetBSD
22  *	Foundation, Inc. and its contributors.
23  * 4. Neither the name of The NetBSD Foundation nor the names of its
24  *    contributors may be used to endorse or promote products derived
25  *    from this software without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
28  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
29  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
31  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37  * POSSIBILITY OF SUCH DAMAGE.
38  */
39 
40 /*
41  * Copyright (c) 1989, 1993
42  *	The Regents of the University of California.  All rights reserved.
43  * (c) UNIX System Laboratories, Inc.
44  * All or some portions of this file are derived from material licensed
45  * to the University of California by American Telephone and Telegraph
46  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
47  * the permission of UNIX System Laboratories, Inc.
48  *
49  * Redistribution and use in source and binary forms, with or without
50  * modification, are permitted provided that the following conditions
51  * are met:
52  * 1. Redistributions of source code must retain the above copyright
53  *    notice, this list of conditions and the following disclaimer.
54  * 2. Redistributions in binary form must reproduce the above copyright
55  *    notice, this list of conditions and the following disclaimer in the
56  *    documentation and/or other materials provided with the distribution.
57  * 3. Neither the name of the University nor the names of its contributors
58  *    may be used to endorse or promote products derived from this software
59  *    without specific prior written permission.
60  *
61  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
62  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
63  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
64  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
65  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
66  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
67  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
68  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
69  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
70  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
71  * SUCH DAMAGE.
72  *
73  *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
74  */
75 
76 /*
77  * External virtual filesystem routines.
78  *
79  * This file contains vfs subroutines which are heavily dependent on
80  * the kernel and are not suitable for standalone use.  Examples include
81  * routines involved in vnode and mountpoint management.
82  */
83 
84 #include <sys/cdefs.h>
85 __KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.308 2007/12/01 10:36:47 yamt Exp $");
86 
87 #include "opt_inet.h"
88 #include "opt_ddb.h"
89 #include "opt_compat_netbsd.h"
90 #include "opt_compat_43.h"
91 
92 #include <sys/param.h>
93 #include <sys/systm.h>
94 #include <sys/proc.h>
95 #include <sys/kernel.h>
96 #include <sys/mount.h>
97 #include <sys/fcntl.h>
98 #include <sys/vnode.h>
99 #include <sys/stat.h>
100 #include <sys/namei.h>
101 #include <sys/ucred.h>
102 #include <sys/buf.h>
103 #include <sys/errno.h>
104 #include <sys/malloc.h>
105 #include <sys/syscallargs.h>
106 #include <sys/device.h>
107 #include <sys/filedesc.h>
108 #include <sys/kauth.h>
109 #include <sys/atomic.h>
110 
111 #include <miscfs/specfs/specdev.h>
112 #include <miscfs/syncfs/syncfs.h>
113 
114 #include <uvm/uvm.h>
115 #include <uvm/uvm_readahead.h>
116 #include <uvm/uvm_ddb.h>
117 
118 #include <sys/sysctl.h>
119 
120 extern int dovfsusermount;	/* 1 => permit any user to mount filesystems */
121 extern int vfs_magiclinks;	/* 1 => expand "magic" symlinks */
122 
123 /* TAILQ_HEAD(freelst, vnode) vnode_free_list =	vnode free list (in vnode.h) */
124 struct freelst vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list);
125 struct freelst vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list);
126 
127 struct simplelock vnode_free_list_slock = SIMPLELOCK_INITIALIZER;
128 
129 POOL_INIT(vnode_pool, sizeof(struct vnode), 0, 0, 0, "vnodepl",
130     &pool_allocator_nointr, IPL_NONE);
131 
132 MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes");
133 
134 /*
135  * Local declarations.
136  */
137 
138 static void insmntque(struct vnode *, struct mount *);
139 static int getdevvp(dev_t, struct vnode **, enum vtype);
140 static void vclean(struct vnode *, int, struct lwp *);
141 static struct vnode *getcleanvnode(struct lwp *);
142 
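/*
 * Reduce the number of allocated vnodes to "target" by reclaiming
 * vnodes from the free lists and returning them to the vnode pool.
 * Returns 0 on success, or EBUSY if no more clean vnodes could be
 * obtained.
 */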
143 int
144 vfs_drainvnodes(long target, struct lwp *l)
145 {
146 
147 	simple_lock(&vnode_free_list_slock);
148 	while (numvnodes > target) {
149 		struct vnode *vp;
150 
151 		vp = getcleanvnode(l);
152 		if (vp == NULL)
153 			return EBUSY; /* give up */
154 		pool_put(&vnode_pool, vp);
155 		simple_lock(&vnode_free_list_slock);
156 		numvnodes--;
157 	}
158 	simple_unlock(&vnode_free_list_slock);
159 
160 	return 0;
161 }
162 
163 /*
164  * Grab a vnode from the freelist and clean it.
165  */
166 struct vnode *
167 getcleanvnode(struct lwp *l)
168 {
169 	struct vnode *vp;
170 	struct freelst *listhd;
171 
172 	LOCK_ASSERT(simple_lock_held(&vnode_free_list_slock));
173 
174 	listhd = &vnode_free_list;
175 try_nextlist:
176 	TAILQ_FOREACH(vp, listhd, v_freelist) {
177 		if (!simple_lock_try(&vp->v_interlock))
178 			continue;
179 		/*
180 		 * As our lwp might hold the underlying vnode locked,
181 		 * don't try to reclaim a VI_LAYER vnode if it's locked.
182 		 */
183 		if ((vp->v_iflag & VI_XLOCK) == 0 &&
184 		    ((vp->v_iflag & VI_LAYER) == 0 || VOP_ISLOCKED(vp) == 0)) {
185 			break;
186 		}
187 		simple_unlock(&vp->v_interlock);
188 	}
189 
190 	if (vp == NULLVP) {
191 		if (listhd == &vnode_free_list) {
192 			listhd = &vnode_hold_list;
193 			goto try_nextlist;
194 		}
195 		simple_unlock(&vnode_free_list_slock);
196 		return NULLVP;
197 	}
198 
199 	if (vp->v_usecount)
200 		panic("free vnode isn't, vp %p", vp);
201 	TAILQ_REMOVE(listhd, vp, v_freelist);
202 	/* see the comment in vgonel() below on why 0xdeadb is used */
203 	vp->v_freelist.tqe_prev = (struct vnode **)0xdeadb;
204 	simple_unlock(&vnode_free_list_slock);
205 
206 	if (vp->v_type != VBAD)
207 		vgonel(vp, l);
208 	else
209 		simple_unlock(&vp->v_interlock);
210 #ifdef DIAGNOSTIC
211 	if (vp->v_data || vp->v_uobj.uo_npages ||
212 	    TAILQ_FIRST(&vp->v_uobj.memq))
213 		panic("cleaned vnode isn't, vp %p", vp);
214 	if (vp->v_numoutput)
215 		panic("clean vnode has pending I/O's, vp %p", vp);
216 #endif
217 	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
218 
219 	return vp;
220 }
221 
222 /*
223  * Mark a mount point as busy. Used to synchronize access and to delay
224  * unmounting. Interlock is not released on failure.
225  */
226 int
227 vfs_busy(struct mount *mp, int flags, kmutex_t *interlkp)
228 {
229 	int lkflags;
230 
231 	while (mp->mnt_iflag & IMNT_UNMOUNT) {
232 		int gone, n;
233 
234 		if (flags & LK_NOWAIT)
235 			return (ENOENT);
236 		if ((flags & LK_RECURSEFAIL) && mp->mnt_unmounter != NULL
237 		    && mp->mnt_unmounter == curlwp)
238 			return (EDEADLK);
239 		if (interlkp)
240 			mutex_exit(interlkp);
241 		/*
242 		 * Since all busy locks are shared except the exclusive
243 		 * lock granted when unmounting, the only place that a
244 		 * wakeup needs to be done is at the release of the
245 		 * exclusive lock at the end of dounmount.
246 		 */
247 		simple_lock(&mp->mnt_slock);
248 		mp->mnt_wcnt++;
249 		ltsleep((void *)mp, PVFS, "vfs_busy", 0, &mp->mnt_slock);
250 		n = --mp->mnt_wcnt;
251 		simple_unlock(&mp->mnt_slock);
252 		gone = mp->mnt_iflag & IMNT_GONE;
253 
254 		if (n == 0)
255 			wakeup(&mp->mnt_wcnt);
256 		if (interlkp)
257 			mutex_enter(interlkp);
258 		if (gone)
259 			return (ENOENT);
260 	}
261 	lkflags = LK_SHARED;
262 	if (interlkp) {
263 		/* lkflags |= LK_INTERLOCK; XXX */
264 		mutex_exit(interlkp);	/* XXX */
265 	}
266 	if (lockmgr(&mp->mnt_lock, lkflags, NULL))
267 		panic("vfs_busy: unexpected lock failure");
268 	return (0);
269 }
270 
271 /*
272  * Free a busy filesystem.
273  */
274 void
275 vfs_unbusy(struct mount *mp)
276 {
277 
278 	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL);
279 }
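
/*
 * Illustrative use (a sketch, based on sysctl_kern_vnode() later in
 * this file): a traversal of a mount's vnode list is protected against
 * unmount by bracketing it with vfs_busy()/vfs_unbusy():
 *
 *	if (vfs_busy(mp, LK_NOWAIT, &mountlist_lock) == 0) {
 *		... walk mp->mnt_vnodelist ...
 *		vfs_unbusy(mp);
 *	}
 */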
280 
281 /*
282  * Lookup a filesystem type, and if found allocate and initialize
283  * a mount structure for it.
284  *
285  * Devname is usually updated by mount(8) after booting.
286  */
287 int
288 vfs_rootmountalloc(const char *fstypename, const char *devname,
289     struct mount **mpp)
290 {
291 	struct vfsops *vfsp = NULL;
292 	struct mount *mp;
293 
294 	LIST_FOREACH(vfsp, &vfs_list, vfs_list)
295 		if (!strncmp(vfsp->vfs_name, fstypename,
296 		    sizeof(mp->mnt_stat.f_fstypename)))
297 			break;
298 
299 	if (vfsp == NULL)
300 		return (ENODEV);
301 	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
302 	memset((char *)mp, 0, (u_long)sizeof(struct mount));
303 	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
304 	simple_lock_init(&mp->mnt_slock);
305 	(void)vfs_busy(mp, LK_NOWAIT, 0);
306 	TAILQ_INIT(&mp->mnt_vnodelist);
307 	mp->mnt_op = vfsp;
308 	mp->mnt_flag = MNT_RDONLY;
309 	mp->mnt_vnodecovered = NULLVP;
310 	vfsp->vfs_refcount++;
311 	(void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name,
312 	    sizeof(mp->mnt_stat.f_fstypename));
313 	mp->mnt_stat.f_mntonname[0] = '/';
314 	mp->mnt_stat.f_mntonname[1] = '\0';
315 	mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] =
316 	    '\0';
317 	(void)copystr(devname, mp->mnt_stat.f_mntfromname,
318 	    sizeof(mp->mnt_stat.f_mntfromname) - 1, 0);
319 	mount_initspecific(mp);
320 	*mpp = mp;
321 	return (0);
322 }
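
/*
 * A sketch of typical use from a file system's own mountroot hook
 * ("xxx" is a placeholder for the particular file system and is not a
 * name defined in this file):
 *
 *	if ((error = vfs_rootmountalloc("xxx", "root_device", &mp)) != 0)
 *		return error;
 *	if ((error = xxx_mountfs(rootvp, mp)) != 0) {
 *		mp->mnt_op->vfs_refcount--;
 *		vfs_unbusy(mp);
 *		free(mp, M_MOUNT);
 *		return error;
 *	}
 */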
323 
324 
325 /*
326  * Routines having to do with the management of the vnode table.
327  */
328 extern int (**dead_vnodeop_p)(void *);
329 
330 /*
331  * Return the next vnode from the free list.
332  */
333 int
334 getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
335     struct vnode **vpp)
336 {
337 	struct uvm_object *uobj;
338 	struct lwp *l = curlwp;		/* XXX */
339 	static int toggle;
340 	struct vnode *vp;
341 	int error = 0, tryalloc;
342 
343  try_again:
344 	if (mp) {
345 		/*
346 		 * Mark filesystem busy while we're creating a vnode.
347 		 * If unmount is in progress, this will wait; if the
348 		 * unmount succeeds (only if umount -f), this will
349 		 * return an error.  If the unmount fails, we'll keep
350 		 * going afterwards.
351 		 * (This puts the per-mount vnode list logically under
352 		 * the protection of the vfs_busy lock).
353 		 */
354 		error = vfs_busy(mp, LK_RECURSEFAIL, 0);
355 		if (error && error != EDEADLK)
356 			return error;
357 	}
358 
359 	/*
360 	 * We must choose whether to allocate a new vnode or recycle an
361 	 * existing one. The criterion for allocating a new one is that
362 	 * the total number of vnodes is less than the number desired or
363 	 * there are no vnodes on either free list. Generally we only
364 	 * want to recycle vnodes that have no buffers associated with
365 	 * them, so we look first on the vnode_free_list. If it is empty,
366 	 * we next consider vnodes with referencing buffers on the
367 	 * vnode_hold_list. The toggle ensures that half the time we
368 	 * will use a buffer from the vnode_hold_list, and half the time
369 	 * we will allocate a new one unless the list has grown to twice
370 	 * the desired size. We are reluctant to recycle vnodes from the
371 	 * vnode_hold_list because doing so loses the identity of all their
372 	 * referencing buffers.
373 	 */
374 
375 	vp = NULL;
376 
377 	simple_lock(&vnode_free_list_slock);
378 
379 	toggle ^= 1;
380 	if (numvnodes > 2 * desiredvnodes)
381 		toggle = 0;
382 
383 	tryalloc = numvnodes < desiredvnodes ||
384 	    (TAILQ_FIRST(&vnode_free_list) == NULL &&
385 	     (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle));
386 
387 	if (tryalloc &&
388 	    (vp = pool_get(&vnode_pool, PR_NOWAIT)) != NULL) {
389 		numvnodes++;
390 		simple_unlock(&vnode_free_list_slock);
391 		memset(vp, 0, sizeof(*vp));
392 		UVM_OBJ_INIT(&vp->v_uobj, &uvm_vnodeops, 1);
393 		/*
394 		 * done by memset() above.
395 		 *	LIST_INIT(&vp->v_nclist);
396 		 *	LIST_INIT(&vp->v_dnclist);
397 		 */
398 	} else {
399 		vp = getcleanvnode(l);
400 		/*
401 		 * Unless this is a bad time of the month, at most
402 		 * the first NCPUS items on the free list are
403 		 * locked, so this is close enough to being empty.
404 		 */
405 		if (vp == NULLVP) {
406 			if (mp && error != EDEADLK)
407 				vfs_unbusy(mp);
408 			if (tryalloc) {
409 				printf("WARNING: unable to allocate new "
410 				    "vnode, retrying...\n");
411 				(void) tsleep(&lbolt, PRIBIO, "newvn", hz);
412 				goto try_again;
413 			}
414 			tablefull("vnode", "increase kern.maxvnodes or NVNODE");
415 			*vpp = 0;
416 			return (ENFILE);
417 		}
418 		vp->v_usecount = 1;
419 		vp->v_iflag = 0;
420 		vp->v_vflag = 0;
421 		vp->v_uflag = 0;
422 		vp->v_socket = NULL;
423 	}
424 	vp->v_type = VNON;
425 	vp->v_vnlock = &vp->v_lock;
426 	lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
427 	KASSERT(LIST_EMPTY(&vp->v_nclist));
428 	KASSERT(LIST_EMPTY(&vp->v_dnclist));
429 	vp->v_tag = tag;
430 	vp->v_op = vops;
431 	insmntque(vp, mp);
432 	*vpp = vp;
433 	vp->v_data = 0;
434 	simple_lock_init(&vp->v_interlock);
435 
436 	/*
437 	 * initialize uvm_object within vnode.
438 	 */
439 
440 	uobj = &vp->v_uobj;
441 	KASSERT(uobj->pgops == &uvm_vnodeops);
442 	KASSERT(uobj->uo_npages == 0);
443 	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
444 	vp->v_size = vp->v_writesize = VSIZENOTSET;
445 
446 	if (mp && error != EDEADLK)
447 		vfs_unbusy(mp);
448 	return (0);
449 }
450 
451 /*
452  * This is really just the reverse of getnewvnode(). Needed for
453  * VFS_VGET functions that may need to push back a vnode in case
454  * of a locking race.
455  */
456 void
457 ungetnewvnode(struct vnode *vp)
458 {
459 #ifdef DIAGNOSTIC
460 	if (vp->v_usecount != 1)
461 		panic("ungetnewvnode: busy vnode");
462 #endif
463 	vp->v_usecount--;
464 	insmntque(vp, NULL);
465 	vp->v_type = VBAD;
466 
467 	simple_lock(&vp->v_interlock);
468 	/*
469 	 * Insert at head of LRU list
470 	 */
471 	simple_lock(&vnode_free_list_slock);
472 	if (vp->v_holdcnt > 0)
473 		TAILQ_INSERT_HEAD(&vnode_hold_list, vp, v_freelist);
474 	else
475 		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
476 	simple_unlock(&vnode_free_list_slock);
477 	simple_unlock(&vp->v_interlock);
478 }
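
/*
 * A sketch of the intended getnewvnode()/ungetnewvnode() pairing in a
 * file system's vget path (VT_MYFS and the myfs_* names are
 * hypothetical, not defined in this file):
 *
 *	error = getnewvnode(VT_MYFS, mp, myfs_vnodeop_p, &vp);
 *	if (error)
 *		return error;
 *	if (myfs_hashins(vp) == EEXIST) {
 *		ungetnewvnode(vp);	(lost a race; retry the lookup)
 *		goto retry;
 *	}
 */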
479 
480 /*
481  * Move a vnode from one mount queue to another.
482  */
483 static void
484 insmntque(struct vnode *vp, struct mount *mp)
485 {
486 
487 #ifdef DIAGNOSTIC
488 	if ((mp != NULL) &&
489 	    (mp->mnt_iflag & IMNT_UNMOUNT) &&
490 	    !(mp->mnt_flag & MNT_SOFTDEP) &&
491 	    vp->v_tag != VT_VFS) {
492 		panic("insmntque into dying filesystem");
493 	}
494 #endif
495 
496 	simple_lock(&mntvnode_slock);
497 	/*
498 	 * Delete from old mount point vnode list, if on one.
499 	 */
500 	if (vp->v_mount != NULL)
501 		TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes);
502 	/*
503 	 * Insert into list of vnodes for the new mount point, if available.
504 	 */
505 	if ((vp->v_mount = mp) != NULL)
506 		TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
507 	simple_unlock(&mntvnode_slock);
508 }
509 
510 /*
511  * Create a vnode for a block device.
512  * Used for root filesystem and swap areas.
513  * Also used for memory file system special devices.
514  */
515 int
516 bdevvp(dev_t dev, struct vnode **vpp)
517 {
518 
519 	return (getdevvp(dev, vpp, VBLK));
520 }
521 
522 /*
523  * Create a vnode for a character device.
524  * Used for kernfs and some console handling.
525  */
526 int
527 cdevvp(dev_t dev, struct vnode **vpp)
528 {
529 
530 	return (getdevvp(dev, vpp, VCHR));
531 }
532 
533 /*
534  * Create a vnode for a device.
535  * Used by bdevvp (block device) for root file system etc.,
536  * and by cdevvp (character device) for console and kernfs.
537  */
538 static int
539 getdevvp(dev_t dev, struct vnode **vpp, enum vtype type)
540 {
541 	struct vnode *vp;
542 	struct vnode *nvp;
543 	int error;
544 
545 	if (dev == NODEV) {
546 		*vpp = NULL;
547 		return (0);
548 	}
549 	error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp);
550 	if (error) {
551 		*vpp = NULL;
552 		return (error);
553 	}
554 	vp = nvp;
555 	vp->v_type = type;
556 	uvm_vnp_setsize(vp, 0);
557 	if ((nvp = checkalias(vp, dev, NULL)) != 0) {
558 		vput(vp);
559 		vp = nvp;
560 	}
561 	*vpp = vp;
562 	return (0);
563 }
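
/*
 * For example, vfs_mountroot() later in this file obtains a vnode for
 * the root device with:
 *
 *	if (bdevvp(rootdev, &rootvp))
 *		panic("vfs_mountroot: can't get vnode for rootdev");
 */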
564 
565 /*
566  * Check to see if the new vnode represents a special device
567  * for which we already have a vnode (either because of
568  * bdevvp() or because of a different vnode representing
569  * the same block device). If such an alias exists, deallocate
570  * the existing contents and return the aliased vnode. The
571  * caller is responsible for filling it with its new contents.
572  */
573 struct vnode *
574 checkalias(struct vnode *nvp, dev_t nvp_rdev, struct mount *mp)
575 {
576 	struct lwp *l = curlwp;		/* XXX */
577 	struct vnode *vp;
578 	struct vnode **vpp;
579 
580 	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
581 		return (NULLVP);
582 
583 	vpp = &speclisth[SPECHASH(nvp_rdev)];
584 loop:
585 	simple_lock(&spechash_slock);
586 	for (vp = *vpp; vp; vp = vp->v_specnext) {
587 		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
588 			continue;
589 		/*
590 		 * Alias, but not in use, so flush it out.
591 		 */
592 		simple_lock(&vp->v_interlock);
593 		simple_unlock(&spechash_slock);
594 		if (vp->v_usecount == 0) {
595 			vgonel(vp, l);
596 			goto loop;
597 		}
598 		/*
599 		 * What we want to know here is whether someone else has
600 		 * removed this vnode from the device hash list while we were
601 		 * waiting.  This can only happen if vclean() did it, and
602 		 * this requires the vnode to be locked.
603 		 */
604 		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK))
605 			goto loop;
606 		if (vp->v_specinfo == NULL) {
607 			vput(vp);
608 			goto loop;
609 		}
610 		simple_lock(&spechash_slock);
611 		break;
612 	}
613 	if (vp == NULL || vp->v_tag != VT_NON || vp->v_type != VBLK) {
614 		MALLOC(nvp->v_specinfo, struct specinfo *,
615 			sizeof(struct specinfo), M_VNODE, M_NOWAIT);
616 		/* XXX Erg. */
617 		if (nvp->v_specinfo == NULL) {
618 			simple_unlock(&spechash_slock);
619 			uvm_wait("checkalias");
620 			goto loop;
621 		}
622 
623 		nvp->v_rdev = nvp_rdev;
624 		nvp->v_hashchain = vpp;
625 		nvp->v_specnext = *vpp;
626 		nvp->v_specmountpoint = NULL;
627 		simple_unlock(&spechash_slock);
628 		nvp->v_speclockf = NULL;
629 
630 		*vpp = nvp;
631 		if (vp != NULLVP) {
632 			nvp->v_iflag |= VI_ALIASED;
633 			vp->v_iflag |= VI_ALIASED;
634 			vput(vp);
635 		}
636 		return (NULLVP);
637 	}
638 	simple_unlock(&spechash_slock);
639 	VOP_UNLOCK(vp, 0);
640 	simple_lock(&vp->v_interlock);
641 	vclean(vp, 0, l);
642 	vp->v_op = nvp->v_op;
643 	vp->v_tag = nvp->v_tag;
644 	vp->v_vnlock = &vp->v_lock;
645 	lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
646 	nvp->v_type = VNON;
647 	insmntque(vp, mp);
648 	return (vp);
649 }
650 
651 /*
652  * Grab a particular vnode from the free list, increment its
653  * reference count and lock it. If the vnode lock bit is set the
654  * vnode is being eliminated in vgone. In that case, we can not
655  * grab the vnode, so the process is awakened when the transition is
656  * completed, and an error returned to indicate that the vnode is no
657  * longer usable (possibly having been changed to a new file system type).
658  */
659 int
660 vget(struct vnode *vp, int flags)
661 {
662 	int error;
663 
664 	/*
665 	 * If the vnode is in the process of being cleaned out for
666 	 * another use, we wait for the cleaning to finish and then
667 	 * return failure. Cleaning is determined by checking that
668 	 * the VI_XLOCK flag is set.
669 	 */
670 
671 	if ((flags & LK_INTERLOCK) == 0)
672 		simple_lock(&vp->v_interlock);
673 	if ((vp->v_iflag & (VI_XLOCK | VI_FREEING)) != 0) {
674 		if (flags & LK_NOWAIT) {
675 			simple_unlock(&vp->v_interlock);
676 			return EBUSY;
677 		}
678 		vp->v_iflag |= VI_XWANT;
679 		ltsleep(vp, PINOD|PNORELOCK, "vget", 0, &vp->v_interlock);
680 		return (ENOENT);
681 	}
682 	if (vp->v_usecount == 0) {
683 		simple_lock(&vnode_free_list_slock);
684 		if (vp->v_holdcnt > 0)
685 			TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
686 		else
687 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
688 		simple_unlock(&vnode_free_list_slock);
689 	}
690 	vp->v_usecount++;
691 #ifdef DIAGNOSTIC
692 	if (vp->v_usecount == 0) {
693 		vprint("vget", vp);
694 		panic("vget: usecount overflow, vp %p", vp);
695 	}
696 #endif
697 	if (flags & LK_TYPE_MASK) {
698 		if ((error = vn_lock(vp, flags | LK_INTERLOCK))) {
699 			vrele(vp);
700 		}
701 		return (error);
702 	}
703 	simple_unlock(&vp->v_interlock);
704 	return (0);
705 }
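
/*
 * A minimal usage sketch: take a reference (and, with LK_EXCLUSIVE,
 * the vnode lock) via vget(), and drop both again with vput():
 *
 *	if (vget(vp, LK_EXCLUSIVE) == 0) {
 *		... use the referenced, locked vnode ...
 *		vput(vp);
 *	}
 */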
706 
707 /*
708  * vput(), just unlock and vrele()
709  */
710 void
711 vput(struct vnode *vp)
712 {
713 
714 #ifdef DIAGNOSTIC
715 	if (vp == NULL)
716 		panic("vput: null vp");
717 #endif
718 	simple_lock(&vp->v_interlock);
719 	vp->v_usecount--;
720 	if (vp->v_usecount > 0) {
721 		simple_unlock(&vp->v_interlock);
722 		VOP_UNLOCK(vp, 0);
723 		return;
724 	}
725 #ifdef DIAGNOSTIC
726 	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
727 		vprint("vput: bad ref count", vp);
728 		panic("vput: ref cnt");
729 	}
730 #endif
731 	/*
732 	 * Insert at tail of LRU list.
733 	 */
734 	simple_lock(&vnode_free_list_slock);
735 	if (vp->v_holdcnt > 0)
736 		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
737 	else
738 		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
739 	simple_unlock(&vnode_free_list_slock);
740 	if (vp->v_iflag & VI_EXECMAP) {
741 		atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
742 		atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
743 	}
744 	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP|VI_MAPPED);
745 	vp->v_vflag &= ~VV_MAPPED;
746 	simple_unlock(&vp->v_interlock);
747 	VOP_INACTIVE(vp);
748 }
749 
750 /*
751  * Vnode release.
752  * If count drops to zero, call inactive routine and return to freelist.
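 * The common implementation, do_vrele(), takes two extra arguments:
 * "doinactive" selects whether VOP_INACTIVE() is called once the count
 * reaches zero, and "onhead" selects insertion at the head rather than
 * the tail of the free list; vrele() and vrele2() are thin wrappers
 * around it.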
753  */
754 static void
755 do_vrele(struct vnode *vp, int doinactive, int onhead)
756 {
757 
758 #ifdef DIAGNOSTIC
759 	if (vp == NULL)
760 		panic("vrele: null vp");
761 #endif
762 	simple_lock(&vp->v_interlock);
763 	vp->v_usecount--;
764 	if (vp->v_usecount > 0) {
765 		simple_unlock(&vp->v_interlock);
766 		return;
767 	}
768 #ifdef DIAGNOSTIC
769 	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
770 		vprint("vrele: bad ref count", vp);
771 		panic("vrele: ref cnt vp %p", vp);
772 	}
773 #endif
774 	/*
775 	 * Insert at tail of LRU list.
776 	 */
777 	simple_lock(&vnode_free_list_slock);
778 	if (vp->v_holdcnt > 0) {
779 		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
780 	} else {
781 		if (onhead)
782 			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
783 		else
784 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
785 	}
786 	simple_unlock(&vnode_free_list_slock);
787 	if (vp->v_iflag & VI_EXECMAP) {
788 		atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
789 		atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
790 	}
791 	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP|VI_MAPPED);
792 	vp->v_vflag &= ~VV_MAPPED;
793 
794 	if (doinactive) {
795 		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0)
796 			VOP_INACTIVE(vp);
797 	} else {
798 		simple_unlock(&vp->v_interlock);
799 	}
800 }
801 
802 void
803 vrele(struct vnode *vp)
804 {
805 
806 	do_vrele(vp, 1, 0);
807 }
808 
809 void
810 vrele2(struct vnode *vp, int onhead)
811 {
812 
813 	do_vrele(vp, 0, onhead);
814 }
815 
816 /*
817  * Page or buffer structure gets a reference.
818  * Called with v_interlock held.
819  */
820 void
821 vholdl(struct vnode *vp)
822 {
823 
824 	/*
825 	 * If it is on the freelist and the hold count is currently
826 	 * zero, move it to the hold list. The test of the back
827 	 * pointer and the use reference count of zero is because
828 	 * it will be removed from a free list by getnewvnode,
829 	 * but will not have its reference count incremented until
830 	 * after calling vgone. If the reference count were
831 	 * incremented first, vgone would (incorrectly) try to
832 	 * close the previous instance of the underlying object.
833 	 * So, the back pointer is explicitly set to `0xdeadb' in
834 	 * getnewvnode after removing it from a freelist to ensure
835 	 * that we do not try to move it here.
836 	 */
837 	if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
838 	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
839 		simple_lock(&vnode_free_list_slock);
840 		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
841 		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
842 		simple_unlock(&vnode_free_list_slock);
843 	}
844 	vp->v_holdcnt++;
845 }
846 
847 /*
848  * Page or buffer structure frees a reference.
849  * Called with v_interlock held.
850  */
851 void
852 holdrelel(struct vnode *vp)
853 {
854 
855 	if (vp->v_holdcnt <= 0)
856 		panic("holdrelel: holdcnt vp %p", vp);
857 	vp->v_holdcnt--;
858 
859 	/*
860 	 * If it is on the holdlist and the hold count drops to
861 	 * zero, move it to the free list. The test of the back
862 	 * pointer and the use reference count of zero is because
863 	 * it will be removed from a free list by getnewvnode,
864 	 * but will not have its reference count incremented until
865 	 * after calling vgone. If the reference count were
866 	 * incremented first, vgone would (incorrectly) try to
867 	 * close the previous instance of the underlying object.
868 	 * So, the back pointer is explicitly set to `0xdeadb' in
869 	 * getnewvnode after removing it from a freelist to ensure
870 	 * that we do not try to move it here.
871 	 */
872 
873 	if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
874 	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
875 		simple_lock(&vnode_free_list_slock);
876 		TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
877 		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
878 		simple_unlock(&vnode_free_list_slock);
879 	}
880 }
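
/*
 * A minimal sketch of taking and releasing a hold reference (for
 * example, by code associating a buffer or page with a vnode); both
 * routines expect v_interlock to be held:
 *
 *	simple_lock(&vp->v_interlock);
 *	vholdl(vp);
 *	simple_unlock(&vp->v_interlock);
 *	...
 *	simple_lock(&vp->v_interlock);
 *	holdrelel(vp);
 *	simple_unlock(&vp->v_interlock);
 */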
881 
882 /*
883  * Vnode reference.
884  */
885 void
886 vref(struct vnode *vp)
887 {
888 
889 	simple_lock(&vp->v_interlock);
890 	if (vp->v_usecount <= 0)
891 		panic("vref used where vget required, vp %p", vp);
892 	vp->v_usecount++;
893 #ifdef DIAGNOSTIC
894 	if (vp->v_usecount == 0) {
895 		vprint("vref", vp);
896 		panic("vref: usecount overflow, vp %p", vp);
897 	}
898 #endif
899 	simple_unlock(&vp->v_interlock);
900 }
901 
902 /*
903  * Remove any vnodes in the vnode table belonging to mount point mp.
904  *
905  * If FORCECLOSE is not specified, there should not be any active ones,
906  * return error if any are found (nb: this is a user error, not a
907  * system error). If FORCECLOSE is specified, detach any active vnodes
908  * that are found.
909  *
910  * If WRITECLOSE is set, only flush out regular file vnodes open for
911  * writing.
912  *
913  * SKIPSYSTEM causes any vnodes marked V_SYSTEM to be skipped.
914  */
915 #ifdef DEBUG
916 int busyprt = 0;	/* print out busy vnodes */
917 struct ctldebug debug1 = { "busyprt", &busyprt };
918 #endif
919 
920 int
921 vflush(struct mount *mp, struct vnode *skipvp, int flags)
922 {
923 	struct lwp *l = curlwp;		/* XXX */
924 	struct vnode *vp, *nvp;
925 	int busy = 0;
926 
927 	simple_lock(&mntvnode_slock);
928 loop:
929 	/*
930 	 * NOTE: we do not use TAILQ_FOREACH here because vgone() and
931 	 * vclean(), called within the loop, can remove entries from the list.
932 	 */
933 	for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
934 		if (vp->v_mount != mp)
935 			goto loop;
936 		nvp = TAILQ_NEXT(vp, v_mntvnodes);
937 		/*
938 		 * Skip over a selected vnode.
939 		 */
940 		if (vp == skipvp)
941 			continue;
942 		simple_lock(&vp->v_interlock);
943 		/*
944 		 * Skip over vnodes marked VV_SYSTEM.
945 		 */
946 		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
947 			simple_unlock(&vp->v_interlock);
948 			continue;
949 		}
950 		/*
951 		 * If WRITECLOSE is set, only flush out regular file
952 		 * vnodes open for writing.
953 		 */
954 		if ((flags & WRITECLOSE) &&
955 		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
956 			simple_unlock(&vp->v_interlock);
957 			continue;
958 		}
959 		/*
960 		 * With v_usecount == 0, all we need to do is clear
961 		 * out the vnode data structures and we are done.
962 		 */
963 		if (vp->v_usecount == 0) {
964 			simple_unlock(&mntvnode_slock);
965 			vgonel(vp, l);
966 			simple_lock(&mntvnode_slock);
967 			continue;
968 		}
969 		/*
970 		 * If FORCECLOSE is set, forcibly close the vnode.
971 		 * For block or character devices, revert to an
972 		 * anonymous device. For all other files, just kill them.
973 		 */
974 		if (flags & FORCECLOSE) {
975 			simple_unlock(&mntvnode_slock);
976 			if (vp->v_type != VBLK && vp->v_type != VCHR) {
977 				vgonel(vp, l);
978 			} else {
979 				vclean(vp, 0, l);
980 				vp->v_op = spec_vnodeop_p;
981 				insmntque(vp, (struct mount *)0);
982 			}
983 			simple_lock(&mntvnode_slock);
984 			continue;
985 		}
986 #ifdef DEBUG
987 		if (busyprt)
988 			vprint("vflush: busy vnode", vp);
989 #endif
990 		simple_unlock(&vp->v_interlock);
991 		busy++;
992 	}
993 	simple_unlock(&mntvnode_slock);
994 	if (busy)
995 		return (EBUSY);
996 	return (0);
997 }
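
/*
 * A sketch of typical use from a file system's unmount routine, where
 * MNT_FORCE selects a forcible flush ("xxx" again stands for the
 * particular file system):
 *
 *	flags = (mntflags & MNT_FORCE) ? FORCECLOSE : 0;
 *	if ((error = vflush(mp, NULLVP, flags)) != 0)
 *		return error;
 *	... release xxx-specific mount state ...
 */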
998 
999 /*
1000  * Disassociate the underlying file system from a vnode.
1001  */
1002 static void
1003 vclean(struct vnode *vp, int flags, struct lwp *l)
1004 {
1005 	int active;
1006 
1007 	LOCK_ASSERT(simple_lock_held(&vp->v_interlock));
1008 
1009 	/*
1010 	 * Check to see if the vnode is in use.
1011 	 * If so we have to reference it before we clean it out
1012 	 * so that its count cannot fall to zero and generate a
1013 	 * race against ourselves to recycle it.
1014 	 */
1015 
1016 	if ((active = vp->v_usecount) != 0) {
1017 		vp->v_usecount++;
1018 #ifdef DIAGNOSTIC
1019 		if (vp->v_usecount == 0) {
1020 			vprint("vclean", vp);
1021 			panic("vclean: usecount overflow");
1022 		}
1023 #endif
1024 	}
1025 
1026 	/*
1027 	 * Prevent the vnode from being recycled or
1028 	 * brought into use while we clean it out.
1029 	 */
1030 	if (vp->v_iflag & VI_XLOCK)
1031 		panic("vclean: deadlock, vp %p", vp);
1032 	vp->v_iflag |= VI_XLOCK;
1033 	if (vp->v_iflag & VI_EXECMAP) {
1034 		atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
1035 		atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
1036 	}
1037 	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
1038 
1039 	/*
1040 	 * Even if the count is zero, the VOP_INACTIVE routine may still
1041 	 * have the object locked while it cleans it out.  For
1042 	 * active vnodes, it ensures that no other activity can
1043 	 * occur while the underlying object is being cleaned out.
1044 	 *
1045 	 * We drain the lock to make sure we are the last one trying to
1046 	 * get it and immediately resurrect the lock.  Future accesses
1047 	 * for locking this _vnode_ will be protected by VI_XLOCK.  However,
1048 	 * upper layers might be using the _lock_ in case the file system
1049 	 * exported it and might access it while the vnode lingers in
1050 	 * deadfs.
1051 	 */
1052 	VOP_LOCK(vp, LK_DRAIN | LK_RESURRECT | LK_INTERLOCK);
1053 
1054 	/*
1055 	 * Clean out any cached data associated with the vnode.
1056 	 * If it is a special device, remove it from the special device
1057 	 * alias list, if it is on one.
1058 	 */
1059 	if (flags & DOCLOSE) {
1060 		int error;
1061 		struct vnode *vq, *vx;
1062 
1063 		error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
1064 		if (error)
1065 			error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
1066 		KASSERT(error == 0);
1067 		KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1068 
1069 		if (active)
1070 			VOP_CLOSE(vp, FNONBLOCK, NOCRED);
1071 
1072 		if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
1073 		    vp->v_specinfo != 0) {
1074 			simple_lock(&spechash_slock);
1075 			if (vp->v_hashchain != NULL) {
1076 				if (*vp->v_hashchain == vp) {
1077 					*vp->v_hashchain = vp->v_specnext;
1078 				} else {
1079 					for (vq = *vp->v_hashchain; vq;
1080 					     vq = vq->v_specnext) {
1081 						if (vq->v_specnext != vp)
1082 							continue;
1083 						vq->v_specnext = vp->v_specnext;
1084 						break;
1085 					}
1086 					if (vq == NULL)
1087 						panic("missing bdev");
1088 				}
1089 				if (vp->v_iflag & VI_ALIASED) {
1090 					vx = NULL;
1091 					for (vq = *vp->v_hashchain; vq;
1092 					     vq = vq->v_specnext) {
1093 						if (vq->v_rdev != vp->v_rdev ||
1094 						    vq->v_type != vp->v_type)
1095 							continue;
1096 						if (vx)
1097 							break;
1098 						vx = vq;
1099 					}
1100 					if (vx == NULL)
1101 						panic("missing alias");
1102 					if (vq == NULL)
1103 						vx->v_iflag &= ~VI_ALIASED;
1104 					vp->v_iflag &= ~VI_ALIASED;
1105 				}
1106 			}
1107 			simple_unlock(&spechash_slock);
1108 			FREE(vp->v_specinfo, M_VNODE);
1109 			vp->v_specinfo = NULL;
1110 		}
1111 	}
1112 
1113 	/*
1114 	 * If purging an active vnode, it must be closed and
1115 	 * deactivated before being reclaimed. Note that the
1116 	 * VOP_INACTIVE will unlock the vnode.
1117 	 */
1118 	if (active) {
1119 		VOP_INACTIVE(vp);
1120 	} else {
1121 		/*
1122 		 * Any other processes trying to obtain this lock must first
1123 		 * wait for VI_XLOCK to clear, then call the new lock operation.
1124 		 */
1125 		VOP_UNLOCK(vp, 0);
1126 	}
1127 	/*
1128 	 * Reclaim the vnode.
1129 	 */
1130 	if (VOP_RECLAIM(vp))
1131 		panic("vclean: cannot reclaim, vp %p", vp);
1132 	if (active) {
1133 		/*
1134 		 * Inline copy of vrele() since VOP_INACTIVE
1135 		 * has already been called.
1136 		 */
1137 		simple_lock(&vp->v_interlock);
1138 		if (--vp->v_usecount <= 0) {
1139 #ifdef DIAGNOSTIC
1140 			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
1141 				vprint("vclean: bad ref count", vp);
1142 				panic("vclean: ref cnt");
1143 			}
1144 #endif
1145 			/*
1146 			 * Insert at tail of LRU list.
1147 			 */
1148 
1149 			simple_unlock(&vp->v_interlock);
1150 			simple_lock(&vnode_free_list_slock);
1151 #ifdef DIAGNOSTIC
1152 			if (vp->v_holdcnt > 0)
1153 				panic("vclean: not clean, vp %p", vp);
1154 #endif
1155 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
1156 			simple_unlock(&vnode_free_list_slock);
1157 		} else
1158 			simple_unlock(&vp->v_interlock);
1159 	}
1160 
1161 	KASSERT(vp->v_uobj.uo_npages == 0);
1162 	if (vp->v_type == VREG && vp->v_ractx != NULL) {
1163 		uvm_ra_freectx(vp->v_ractx);
1164 		vp->v_ractx = NULL;
1165 	}
1166 	cache_purge(vp);
1167 
1168 	/*
1169 	 * Done with purge, notify sleepers of the grim news.
1170 	 */
1171 	vp->v_op = dead_vnodeop_p;
1172 	vp->v_tag = VT_NON;
1173 	vp->v_vnlock = NULL;
1174 	simple_lock(&vp->v_interlock);
1175 	VN_KNOTE(vp, NOTE_REVOKE);	/* FreeBSD has this in vn_pollgone() */
1176 	vp->v_iflag &= ~VI_XLOCK;
1177 	vp->v_vflag &= ~VV_LOCKSWORK;
1178 	if (vp->v_iflag & VI_XWANT) {
1179 		vp->v_iflag &= ~VI_XWANT;
1180 		simple_unlock(&vp->v_interlock);
1181 		wakeup((void *)vp);
1182 	} else
1183 		simple_unlock(&vp->v_interlock);
1184 }
1185 
1186 /*
1187  * Recycle an unused vnode to the front of the free list.
1188  * Release the passed interlock if the vnode will be recycled.
1189  */
1190 int
1191 vrecycle(struct vnode *vp, struct simplelock *inter_lkp, struct lwp *l)
1192 {
1193 
1194 	simple_lock(&vp->v_interlock);
1195 	if (vp->v_usecount == 0) {
1196 		if (inter_lkp)
1197 			simple_unlock(inter_lkp);
1198 		vgonel(vp, l);
1199 		return (1);
1200 	}
1201 	simple_unlock(&vp->v_interlock);
1202 	return (0);
1203 }
1204 
1205 /*
1206  * Eliminate all activity associated with a vnode
1207  * in preparation for reuse.
1208  */
1209 void
1210 vgone(struct vnode *vp)
1211 {
1212 	struct lwp *l = curlwp;		/* XXX */
1213 
1214 	simple_lock(&vp->v_interlock);
1215 	vgonel(vp, l);
1216 }
1217 
1218 /*
1219  * vgone, with the vp interlock held.
1220  */
1221 void
1222 vgonel(struct vnode *vp, struct lwp *l)
1223 {
1224 
1225 	LOCK_ASSERT(simple_lock_held(&vp->v_interlock));
1226 
1227 	/*
1228 	 * If a vgone (or vclean) is already in progress,
1229 	 * wait until it is done and return.
1230 	 */
1231 
1232 	if (vp->v_iflag & VI_XLOCK) {
1233 		vp->v_iflag |= VI_XWANT;
1234 		ltsleep(vp, PINOD | PNORELOCK, "vgone", 0, &vp->v_interlock);
1235 		return;
1236 	}
1237 
1238 	/*
1239 	 * Clean out the filesystem specific data.
1240 	 */
1241 
1242 	vclean(vp, DOCLOSE, l);
1243 	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1244 
1245 	/*
1246 	 * Delete from old mount point vnode list, if on one.
1247 	 */
1248 
1249 	if (vp->v_mount != NULL)
1250 		insmntque(vp, (struct mount *)0);
1251 
1252 	/*
1253 	 * The test of the back pointer and the reference count of
1254 	 * zero is because it will be removed from the free list by
1255 	 * getcleanvnode, but will not have its reference count
1256 	 * incremented until after calling vgone. If the reference
1257 	 * count were incremented first, vgone would (incorrectly)
1258 	 * try to close the previous instance of the underlying object.
1259 	 * So, the back pointer is explicitly set to `0xdeadb' in
1260 	 * getnewvnode after removing it from the freelist to ensure
1261 	 * that we do not try to move it here.
1262 	 */
1263 
1264 	vp->v_type = VBAD;
1265 	if (vp->v_usecount == 0) {
1266 		bool dofree;
1267 
1268 		simple_lock(&vnode_free_list_slock);
1269 		if (vp->v_holdcnt > 0)
1270 			panic("vgonel: not clean, vp %p", vp);
1271 		/*
1272 		 * If it isn't on the freelist, we were called by getcleanvnode
1273 		 * and the vnode is being re-used.  Otherwise, we free it.
1274 		 */
1275 		dofree = vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb;
1276 		if (dofree) {
1277 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1278 			numvnodes--;
1279 		}
1280 		simple_unlock(&vnode_free_list_slock);
1281 		if (dofree)
1282 			pool_put(&vnode_pool, vp);
1283 	}
1284 }
1285 
1286 /*
1287  * Lookup a vnode by device number.
1288  */
1289 int
1290 vfinddev(dev_t dev, enum vtype type, struct vnode **vpp)
1291 {
1292 	struct vnode *vp;
1293 	int rc = 0;
1294 
1295 	simple_lock(&spechash_slock);
1296 	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
1297 		if (dev != vp->v_rdev || type != vp->v_type)
1298 			continue;
1299 		*vpp = vp;
1300 		rc = 1;
1301 		break;
1302 	}
1303 	simple_unlock(&spechash_slock);
1304 	return (rc);
1305 }
1306 
1307 /*
1308  * Revoke all the vnodes corresponding to the specified minor number
1309  * range (endpoints inclusive) of the specified major.
1310  */
1311 void
1312 vdevgone(int maj, int minl, int minh, enum vtype type)
1313 {
1314 	struct vnode *vp;
1315 	int mn;
1316 
1317 	vp = NULL;	/* XXX gcc */
1318 
1319 	for (mn = minl; mn <= minh; mn++)
1320 		if (vfinddev(makedev(maj, mn), type, &vp))
1321 			VOP_REVOKE(vp, REVOKEALL);
1322 }
1323 
1324 /*
1325  * Calculate the total number of references to a special device.
1326  */
1327 int
1328 vcount(struct vnode *vp)
1329 {
1330 	struct vnode *vq, *vnext;
1331 	int count;
1332 
1333 loop:
1334 	if ((vp->v_iflag & VI_ALIASED) == 0)
1335 		return (vp->v_usecount);
1336 	simple_lock(&spechash_slock);
1337 	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
1338 		vnext = vq->v_specnext;
1339 		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
1340 			continue;
1341 		/*
1342 		 * Alias, but not in use, so flush it out.
1343 		 */
1344 		if (vq->v_usecount == 0 && vq != vp &&
1345 		    (vq->v_iflag & VI_XLOCK) == 0) {
1346 			simple_unlock(&spechash_slock);
1347 			vgone(vq);
1348 			goto loop;
1349 		}
1350 		count += vq->v_usecount;
1351 	}
1352 	simple_unlock(&spechash_slock);
1353 	return (count);
1354 }
1355 
1356 
1357 /*
1358  * sysctl helper routine to return list of supported fstypes
1359  */
1360 static int
1361 sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS)
1362 {
1363 	char bf[sizeof(((struct statvfs *)NULL)->f_fstypename)];
1364 	char *where = oldp;
1365 	struct vfsops *v;
1366 	size_t needed, left, slen;
1367 	int error, first;
1368 
1369 	if (newp != NULL)
1370 		return (EPERM);
1371 	if (namelen != 0)
1372 		return (EINVAL);
1373 
1374 	first = 1;
1375 	error = 0;
1376 	needed = 0;
1377 	left = *oldlenp;
1378 
1379 	mutex_enter(&vfs_list_lock);
1380 	LIST_FOREACH(v, &vfs_list, vfs_list) {
1381 		if (where == NULL)
1382 			needed += strlen(v->vfs_name) + 1;
1383 		else {
1384 			memset(bf, 0, sizeof(bf));
1385 			if (first) {
1386 				strncpy(bf, v->vfs_name, sizeof(bf));
1387 				first = 0;
1388 			} else {
1389 				bf[0] = ' ';
1390 				strncpy(bf + 1, v->vfs_name, sizeof(bf) - 1);
1391 			}
1392 			bf[sizeof(bf)-1] = '\0';
1393 			slen = strlen(bf);
1394 			if (left < slen + 1)
1395 				break;
1396 			/* +1 to copy out the trailing NUL byte */
1397 			v->vfs_refcount++;
1398 			mutex_exit(&vfs_list_lock);
1399 			error = copyout(bf, where, slen + 1);
1400 			mutex_enter(&vfs_list_lock);
1401 			v->vfs_refcount--;
1402 			if (error)
1403 				break;
1404 			where += slen;
1405 			needed += slen;
1406 			left -= slen;
1407 		}
1408 	}
1409 	mutex_exit(&vfs_list_lock);
1410 	*oldlenp = needed;
1411 	return (error);
1412 }
1413 
1414 /*
1415  * Top level filesystem related information gathering.
1416  */
1417 SYSCTL_SETUP(sysctl_vfs_setup, "sysctl vfs subtree setup")
1418 {
1419 	sysctl_createv(clog, 0, NULL, NULL,
1420 		       CTLFLAG_PERMANENT,
1421 		       CTLTYPE_NODE, "vfs", NULL,
1422 		       NULL, 0, NULL, 0,
1423 		       CTL_VFS, CTL_EOL);
1424 	sysctl_createv(clog, 0, NULL, NULL,
1425 		       CTLFLAG_PERMANENT,
1426 		       CTLTYPE_NODE, "generic",
1427 		       SYSCTL_DESCR("Non-specific vfs related information"),
1428 		       NULL, 0, NULL, 0,
1429 		       CTL_VFS, VFS_GENERIC, CTL_EOL);
1430 	sysctl_createv(clog, 0, NULL, NULL,
1431 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1432 		       CTLTYPE_INT, "usermount",
1433 		       SYSCTL_DESCR("Whether unprivileged users may mount "
1434 				    "filesystems"),
1435 		       NULL, 0, &dovfsusermount, 0,
1436 		       CTL_VFS, VFS_GENERIC, VFS_USERMOUNT, CTL_EOL);
1437 	sysctl_createv(clog, 0, NULL, NULL,
1438 		       CTLFLAG_PERMANENT,
1439 		       CTLTYPE_STRING, "fstypes",
1440 		       SYSCTL_DESCR("List of file systems present"),
1441 		       sysctl_vfs_generic_fstypes, 0, NULL, 0,
1442 		       CTL_VFS, VFS_GENERIC, CTL_CREATE, CTL_EOL);
1443 	sysctl_createv(clog, 0, NULL, NULL,
1444 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1445 		       CTLTYPE_INT, "magiclinks",
1446 		       SYSCTL_DESCR("Whether \"magic\" symlinks are expanded"),
1447 		       NULL, 0, &vfs_magiclinks, 0,
1448 		       CTL_VFS, VFS_GENERIC, VFS_MAGICLINKS, CTL_EOL);
1449 }
1450 
1451 
1452 int kinfo_vdebug = 1;
1453 int kinfo_vgetfailed;
1454 #define KINFO_VNODESLOP	10
1455 /*
1456  * Dump vnode list (via sysctl).
1457  * Copyout address of vnode followed by vnode.
1458  */
1459 /* ARGSUSED */
1460 int
1461 sysctl_kern_vnode(SYSCTLFN_ARGS)
1462 {
1463 	char *where = oldp;
1464 	size_t *sizep = oldlenp;
1465 	struct mount *mp, *nmp;
1466 	struct vnode *vp;
1467 	char *bp = where, *savebp;
1468 	char *ewhere;
1469 	int error;
1470 
1471 	if (namelen != 0)
1472 		return (EOPNOTSUPP);
1473 	if (newp != NULL)
1474 		return (EPERM);
1475 
1476 #define VPTRSZ	sizeof(struct vnode *)
1477 #define VNODESZ	sizeof(struct vnode)
1478 	if (where == NULL) {
1479 		*sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
1480 		return (0);
1481 	}
1482 	ewhere = where + *sizep;
1483 
1484 	mutex_enter(&mountlist_lock);
1485 	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
1486 	     mp = nmp) {
1487 		if (vfs_busy(mp, LK_NOWAIT, &mountlist_lock)) {
1488 			nmp = CIRCLEQ_NEXT(mp, mnt_list);
1489 			continue;
1490 		}
1491 		savebp = bp;
1492 again:
1493 		simple_lock(&mntvnode_slock);
1494 		TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
1495 			/*
1496 			 * Check that the vp is still associated with
1497 			 * this filesystem.  RACE: could have been
1498 			 * recycled onto the same filesystem.
1499 			 */
1500 			if (vp->v_mount != mp) {
1501 				simple_unlock(&mntvnode_slock);
1502 				if (kinfo_vdebug)
1503 					printf("kinfo: vp changed\n");
1504 				bp = savebp;
1505 				goto again;
1506 			}
1507 			if (bp + VPTRSZ + VNODESZ > ewhere) {
1508 				simple_unlock(&mntvnode_slock);
1509 				*sizep = bp - where;
1510 				return (ENOMEM);
1511 			}
1512 			simple_unlock(&mntvnode_slock);
1513 			if ((error = copyout((void *)&vp, bp, VPTRSZ)) ||
1514 			   (error = copyout((void *)vp, bp + VPTRSZ, VNODESZ)))
1515 				return (error);
1516 			bp += VPTRSZ + VNODESZ;
1517 			simple_lock(&mntvnode_slock);
1518 		}
1519 		simple_unlock(&mntvnode_slock);
1520 		mutex_enter(&mountlist_lock);
1521 		nmp = CIRCLEQ_NEXT(mp, mnt_list);
1522 		vfs_unbusy(mp);
1523 	}
1524 	mutex_exit(&mountlist_lock);
1525 
1526 	*sizep = bp - where;
1527 	return (0);
1528 }
1529 
1530 /*
1531  * Check to see if a filesystem is mounted on a block device.
1532  */
1533 int
1534 vfs_mountedon(struct vnode *vp)
1535 {
1536 	struct vnode *vq;
1537 	int error = 0;
1538 
1539 	if (vp->v_type != VBLK)
1540 		return ENOTBLK;
1541 	if (vp->v_specmountpoint != NULL)
1542 		return (EBUSY);
1543 	if (vp->v_iflag & VI_ALIASED) {
1544 		simple_lock(&spechash_slock);
1545 		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1546 			if (vq->v_rdev != vp->v_rdev ||
1547 			    vq->v_type != vp->v_type)
1548 				continue;
1549 			if (vq->v_specmountpoint != NULL) {
1550 				error = EBUSY;
1551 				break;
1552 			}
1553 		}
1554 		simple_unlock(&spechash_slock);
1555 	}
1556 	return (error);
1557 }
1558 
1559 /*
1560  * Unmount all file systems.
1561  * We traverse the list in reverse order under the assumption that doing so
1562  * will avoid needing to worry about dependencies.
1563  */
1564 void
1565 vfs_unmountall(struct lwp *l)
1566 {
1567 	struct mount *mp, *nmp;
1568 	int allerror, error;
1569 
1570 	printf("unmounting file systems...");
1571 	for (allerror = 0,
1572 	     mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
1573 		nmp = mp->mnt_list.cqe_prev;
1574 #ifdef DEBUG
1575 		printf("\nunmounting %s (%s)...",
1576 		    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname);
1577 #endif
1578 		/*
1579 		 * XXX Freeze syncer.  Must do this before locking the
1580 		 * mount point.  See dounmount() for details.
1581 		 */
1582 		mutex_enter(&syncer_mutex);
1583 		if (vfs_busy(mp, 0, 0)) {
1584 			mutex_exit(&syncer_mutex);
1585 			continue;
1586 		}
1587 		if ((error = dounmount(mp, MNT_FORCE, l)) != 0) {
1588 			printf("unmount of %s failed with error %d\n",
1589 			    mp->mnt_stat.f_mntonname, error);
1590 			allerror = 1;
1591 		}
1592 	}
1593 	printf(" done\n");
1594 	if (allerror)
1595 		printf("WARNING: some file systems would not unmount\n");
1596 }
1597 
1598 extern struct simplelock bqueue_slock; /* XXX */
1599 
1600 /*
1601  * Sync and unmount file systems before shutting down.
1602  */
1603 void
1604 vfs_shutdown(void)
1605 {
1606 	struct lwp *l;
1607 
1608 	/* XXX we're certainly not running in lwp0's context! */
1609 	l = curlwp;
1610 	if (l == NULL)
1611 		l = &lwp0;
1612 
1613 	printf("syncing disks... ");
1614 
1615 	/* remove user processes from run queue */
1616 	suspendsched();
1617 	(void) spl0();
1618 
1619 	/* avoid coming back this way again if we panic. */
1620 	doing_shutdown = 1;
1621 
1622 	sys_sync(l, NULL, NULL);
1623 
1624 	/* Wait for sync to finish. */
1625 	if (buf_syncwait() != 0) {
1626 #if defined(DDB) && defined(DEBUG_HALT_BUSY)
1627 		Debugger();
1628 #endif
1629 		printf("giving up\n");
1630 		return;
1631 	} else
1632 		printf("done\n");
1633 
1634 	/*
1635 	 * If we've panic'd, don't make the situation potentially
1636 	 * worse by unmounting the file systems.
1637 	 */
1638 	if (panicstr != NULL)
1639 		return;
1640 
1641 	/* Release inodes held by texts before update. */
1642 #ifdef notdef
1643 	vnshutdown();
1644 #endif
1645 	/* Unmount file systems. */
1646 	vfs_unmountall(l);
1647 }
1648 
1649 /*
1650  * Mount the root file system.  If the operator didn't specify a
1651  * file system to use, try all possible file systems until one
1652  * succeeds.
1653  */
1654 int
1655 vfs_mountroot(void)
1656 {
1657 	struct vfsops *v;
1658 	int error = ENODEV;
1659 
1660 	if (root_device == NULL)
1661 		panic("vfs_mountroot: root device unknown");
1662 
1663 	switch (device_class(root_device)) {
1664 	case DV_IFNET:
1665 		if (rootdev != NODEV)
1666 			panic("vfs_mountroot: rootdev set for DV_IFNET "
1667 			    "(0x%08x -> %d,%d)", rootdev,
1668 			    major(rootdev), minor(rootdev));
1669 		break;
1670 
1671 	case DV_DISK:
1672 		if (rootdev == NODEV)
1673 			panic("vfs_mountroot: rootdev not set for DV_DISK");
1674 	        if (bdevvp(rootdev, &rootvp))
1675 	                panic("vfs_mountroot: can't get vnode for rootdev");
1676 		error = VOP_OPEN(rootvp, FREAD, FSCRED);
1677 		if (error) {
1678 			printf("vfs_mountroot: can't open root device\n");
1679 			return (error);
1680 		}
1681 		break;
1682 
1683 	default:
1684 		printf("%s: inappropriate for root file system\n",
1685 		    root_device->dv_xname);
1686 		return (ENODEV);
1687 	}
1688 
1689 	/*
1690 	 * If user specified a file system, use it.
1691 	 */
1692 	if (mountroot != NULL) {
1693 		error = (*mountroot)();
1694 		goto done;
1695 	}
1696 
1697 	/*
1698 	 * Try each file system currently configured into the kernel.
1699 	 */
1700 	mutex_enter(&vfs_list_lock);
1701 	LIST_FOREACH(v, &vfs_list, vfs_list) {
1702 		if (v->vfs_mountroot == NULL)
1703 			continue;
1704 #ifdef DEBUG
1705 		aprint_normal("mountroot: trying %s...\n", v->vfs_name);
1706 #endif
1707 		v->vfs_refcount++;
1708 		mutex_exit(&vfs_list_lock);
1709 		error = (*v->vfs_mountroot)();
1710 		mutex_enter(&vfs_list_lock);
1711 		v->vfs_refcount--;
1712 		if (!error) {
1713 			aprint_normal("root file system type: %s\n",
1714 			    v->vfs_name);
1715 			break;
1716 		}
1717 	}
1718 	mutex_exit(&vfs_list_lock);
1719 
1720 	if (v == NULL) {
1721 		printf("no file system for %s", root_device->dv_xname);
1722 		if (device_class(root_device) == DV_DISK)
1723 			printf(" (dev 0x%x)", rootdev);
1724 		printf("\n");
1725 		error = EFTYPE;
1726 	}
1727 
1728 done:
1729 	if (error && device_class(root_device) == DV_DISK) {
1730 		VOP_CLOSE(rootvp, FREAD, FSCRED);
1731 		vrele(rootvp);
1732 	}
1733 	return (error);
1734 }
1735