xref: /netbsd-src/sys/kern/vfs_subr.c (revision 7fa608457b817eca6e0977b37f758ae064f3c99c)
1 /*	$NetBSD: vfs_subr.c,v 1.305 2007/11/04 17:31:16 pooka Exp $	*/
2 
3 /*-
4  * Copyright (c) 1997, 1998, 2004, 2005, 2007 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9  * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. All advertising materials mentioning features or use of this software
20  *    must display the following acknowledgement:
21  *	This product includes software developed by the NetBSD
22  *	Foundation, Inc. and its contributors.
23  * 4. Neither the name of The NetBSD Foundation nor the names of its
24  *    contributors may be used to endorse or promote products derived
25  *    from this software without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
28  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
29  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
31  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37  * POSSIBILITY OF SUCH DAMAGE.
38  */
39 
40 /*
41  * Copyright (c) 1989, 1993
42  *	The Regents of the University of California.  All rights reserved.
43  * (c) UNIX System Laboratories, Inc.
44  * All or some portions of this file are derived from material licensed
45  * to the University of California by American Telephone and Telegraph
46  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
47  * the permission of UNIX System Laboratories, Inc.
48  *
49  * Redistribution and use in source and binary forms, with or without
50  * modification, are permitted provided that the following conditions
51  * are met:
52  * 1. Redistributions of source code must retain the above copyright
53  *    notice, this list of conditions and the following disclaimer.
54  * 2. Redistributions in binary form must reproduce the above copyright
55  *    notice, this list of conditions and the following disclaimer in the
56  *    documentation and/or other materials provided with the distribution.
57  * 3. Neither the name of the University nor the names of its contributors
58  *    may be used to endorse or promote products derived from this software
59  *    without specific prior written permission.
60  *
61  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
62  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
63  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
64  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
65  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
66  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
67  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
68  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
69  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
70  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
71  * SUCH DAMAGE.
72  *
73  *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
74  */
75 
76 /*
77  * External virtual filesystem routines.
78  *
79  * This file contains vfs subroutines which are heavily dependent on
80  * the kernel and are not suitable for standalone use.  Examples include
81  * routines involved in vnode and mountpoint management.
82  */
83 
84 #include <sys/cdefs.h>
85 __KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.305 2007/11/04 17:31:16 pooka Exp $");
86 
87 #include "opt_inet.h"
88 #include "opt_ddb.h"
89 #include "opt_compat_netbsd.h"
90 #include "opt_compat_43.h"
91 
92 #include <sys/param.h>
93 #include <sys/systm.h>
94 #include <sys/proc.h>
95 #include <sys/kernel.h>
96 #include <sys/mount.h>
97 #include <sys/fcntl.h>
98 #include <sys/vnode.h>
99 #include <sys/stat.h>
100 #include <sys/namei.h>
101 #include <sys/ucred.h>
102 #include <sys/buf.h>
103 #include <sys/errno.h>
104 #include <sys/malloc.h>
105 #include <sys/syscallargs.h>
106 #include <sys/device.h>
107 #include <sys/filedesc.h>
108 #include <sys/kauth.h>
109 
110 #include <miscfs/specfs/specdev.h>
111 #include <miscfs/syncfs/syncfs.h>
112 
113 #include <uvm/uvm.h>
114 #include <uvm/uvm_readahead.h>
115 #include <uvm/uvm_ddb.h>
116 
117 #include <sys/sysctl.h>
118 
119 extern int dovfsusermount;	/* 1 => permit any user to mount filesystems */
120 extern int vfs_magiclinks;	/* 1 => expand "magic" symlinks */
121 
122 /* TAILQ_HEAD(freelst, vnode) vnode_free_list =	vnode free list (in vnode.h) */
123 struct freelst vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list);
124 struct freelst vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list);
125 
126 struct simplelock vnode_free_list_slock = SIMPLELOCK_INITIALIZER;
127 
128 POOL_INIT(vnode_pool, sizeof(struct vnode), 0, 0, 0, "vnodepl",
129     &pool_allocator_nointr, IPL_NONE);
130 
131 MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes");
132 
133 /*
134  * Local declarations.
135  */
136 
137 static void insmntque(struct vnode *, struct mount *);
138 static int getdevvp(dev_t, struct vnode **, enum vtype);
139 static void vclean(struct vnode *, int, struct lwp *);
140 static struct vnode *getcleanvnode(struct lwp *);
141 
142 int
143 vfs_drainvnodes(long target, struct lwp *l)
144 {
145 
146 	simple_lock(&vnode_free_list_slock);
147 	while (numvnodes > target) {
148 		struct vnode *vp;
149 
150 		vp = getcleanvnode(l);
151 		if (vp == NULL)
152 			return EBUSY; /* give up */
153 		pool_put(&vnode_pool, vp);
154 		simple_lock(&vnode_free_list_slock);
155 		numvnodes--;
156 	}
157 	simple_unlock(&vnode_free_list_slock);
158 
159 	return 0;
160 }
161 
162 /*
163  * Grab a vnode from the freelist and clean it.
164  */
165 struct vnode *
166 getcleanvnode(struct lwp *l)
167 {
168 	struct vnode *vp;
169 	struct freelst *listhd;
170 
171 	LOCK_ASSERT(simple_lock_held(&vnode_free_list_slock));
172 
173 	listhd = &vnode_free_list;
174 try_nextlist:
175 	TAILQ_FOREACH(vp, listhd, v_freelist) {
176 		if (!simple_lock_try(&vp->v_interlock))
177 			continue;
178 		/*
179 		 * As our lwp might hold the underlying vnode locked,
180 		 * don't try to reclaim a VI_LAYER vnode if it is locked.
181 		 */
182 		if ((vp->v_iflag & VI_XLOCK) == 0 &&
183 		    ((vp->v_iflag & VI_LAYER) == 0 || VOP_ISLOCKED(vp) == 0)) {
184 			break;
185 		}
186 		simple_unlock(&vp->v_interlock);
187 	}
188 
189 	if (vp == NULLVP) {
190 		if (listhd == &vnode_free_list) {
191 			listhd = &vnode_hold_list;
192 			goto try_nextlist;
193 		}
194 		simple_unlock(&vnode_free_list_slock);
195 		return NULLVP;
196 	}
197 
198 	if (vp->v_usecount)
199 		panic("free vnode isn't, vp %p", vp);
200 	TAILQ_REMOVE(listhd, vp, v_freelist);
201 	/* see comment on why 0xdeadb is set at end of vgone (below) */
202 	vp->v_freelist.tqe_prev = (struct vnode **)0xdeadb;
203 	simple_unlock(&vnode_free_list_slock);
204 
205 	if (vp->v_type != VBAD)
206 		vgonel(vp, l);
207 	else
208 		simple_unlock(&vp->v_interlock);
209 #ifdef DIAGNOSTIC
210 	if (vp->v_data || vp->v_uobj.uo_npages ||
211 	    TAILQ_FIRST(&vp->v_uobj.memq))
212 		panic("cleaned vnode isn't, vp %p", vp);
213 	if (vp->v_numoutput)
214 		panic("clean vnode has pending I/O's, vp %p", vp);
215 #endif
216 	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
217 
218 	return vp;
219 }
220 
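/*
 * Illustrative sketch, excluded from the build: a stripped-down userland
 * model of the scan getcleanvnode() does above -- try the "free" list
 * first, fall back to the "hold" list, and skip entries that are busy.
 * The struct and function names below are hypothetical, and the sketch
 * ignores the interlocking the real code needs.
 */
#ifdef notdef
#include <sys/queue.h>
#include <stdbool.h>
#include <stddef.h>

struct node {
	TAILQ_ENTRY(node) entry;
	bool busy;
};
TAILQ_HEAD(nodelist, node);

static struct node *
pick_reclaimable(struct nodelist *freelst, struct nodelist *holdlst)
{
	struct nodelist *listhd = freelst;
	struct node *n;

try_nextlist:
	TAILQ_FOREACH(n, listhd, entry) {
		if (!n->busy)
			break;			/* found a candidate */
	}
	if (n == NULL) {
		if (listhd == freelst) {
			listhd = holdlst;	/* second chance */
			goto try_nextlist;
		}
		return NULL;			/* both lists exhausted */
	}
	TAILQ_REMOVE(listhd, n, entry);		/* caller now owns n */
	return n;
}
#endif /* notdef */
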
221 /*
222  * Mark a mount point as busy. Used to synchronize access and to delay
223  * unmounting. Interlock is not released on failure.
224  */
225 int
226 vfs_busy(struct mount *mp, int flags, kmutex_t *interlkp)
227 {
228 	int lkflags;
229 
230 	while (mp->mnt_iflag & IMNT_UNMOUNT) {
231 		int gone, n;
232 
233 		if (flags & LK_NOWAIT)
234 			return (ENOENT);
235 		if ((flags & LK_RECURSEFAIL) && mp->mnt_unmounter != NULL
236 		    && mp->mnt_unmounter == curlwp)
237 			return (EDEADLK);
238 		if (interlkp)
239 			mutex_exit(interlkp);
240 		/*
241 		 * Since all busy locks are shared except the exclusive
242 		 * lock granted when unmounting, the only place that a
243 		 * wakeup needs to be done is at the release of the
244 		 * exclusive lock at the end of dounmount.
245 		 */
246 		simple_lock(&mp->mnt_slock);
247 		mp->mnt_wcnt++;
248 		ltsleep((void *)mp, PVFS, "vfs_busy", 0, &mp->mnt_slock);
249 		n = --mp->mnt_wcnt;
250 		simple_unlock(&mp->mnt_slock);
251 		gone = mp->mnt_iflag & IMNT_GONE;
252 
253 		if (n == 0)
254 			wakeup(&mp->mnt_wcnt);
255 		if (interlkp)
256 			mutex_enter(interlkp);
257 		if (gone)
258 			return (ENOENT);
259 	}
260 	lkflags = LK_SHARED;
261 	if (interlkp) {
262 		/* lkflags |= LK_INTERLOCK; XXX */
263 		mutex_exit(interlkp);	/* XXX */
264 	}
265 	if (lockmgr(&mp->mnt_lock, lkflags, NULL))
266 		panic("vfs_busy: unexpected lock failure");
267 	return (0);
268 }
269 
270 /*
271  * Free a busy filesystem.
272  */
273 void
274 vfs_unbusy(struct mount *mp)
275 {
276 
277 	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL);
278 }
279 
280 /*
281  * Lookup a filesystem type, and if found allocate and initialize
282  * a mount structure for it.
283  *
284  * Devname is usually updated by mount(8) after booting.
285  */
286 int
287 vfs_rootmountalloc(const char *fstypename, const char *devname,
288     struct mount **mpp)
289 {
290 	struct vfsops *vfsp = NULL;
291 	struct mount *mp;
292 
293 	LIST_FOREACH(vfsp, &vfs_list, vfs_list)
294 		if (!strncmp(vfsp->vfs_name, fstypename,
295 		    sizeof(mp->mnt_stat.f_fstypename)))
296 			break;
297 
298 	if (vfsp == NULL)
299 		return (ENODEV);
300 	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
301 	memset((char *)mp, 0, (u_long)sizeof(struct mount));
302 	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
303 	simple_lock_init(&mp->mnt_slock);
304 	(void)vfs_busy(mp, LK_NOWAIT, 0);
305 	TAILQ_INIT(&mp->mnt_vnodelist);
306 	mp->mnt_op = vfsp;
307 	mp->mnt_flag = MNT_RDONLY;
308 	mp->mnt_vnodecovered = NULLVP;
309 	vfsp->vfs_refcount++;
310 	(void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name,
311 	    sizeof(mp->mnt_stat.f_fstypename));
312 	mp->mnt_stat.f_mntonname[0] = '/';
313 	mp->mnt_stat.f_mntonname[1] = '\0';
314 	mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] =
315 	    '\0';
316 	(void)copystr(devname, mp->mnt_stat.f_mntfromname,
317 	    sizeof(mp->mnt_stat.f_mntfromname) - 1, 0);
318 	mount_initspecific(mp);
319 	*mpp = mp;
320 	return (0);
321 }
322 
323 
324 /*
325  * Routines having to do with the management of the vnode table.
326  */
327 extern int (**dead_vnodeop_p)(void *);
328 
329 /*
330  * Return the next vnode from the free list.
331  */
332 int
333 getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
334     struct vnode **vpp)
335 {
336 	extern struct uvm_pagerops uvm_vnodeops;
337 	struct uvm_object *uobj;
338 	struct lwp *l = curlwp;		/* XXX */
339 	static int toggle;
340 	struct vnode *vp;
341 	int error = 0, tryalloc;
342 
343  try_again:
344 	if (mp) {
345 		/*
346 		 * Mark filesystem busy while we're creating a vnode.
347 		 * If unmount is in progress, this will wait; if the
348 		 * unmount succeeds (only if umount -f), this will
349 		 * return an error.  If the unmount fails, we'll keep
350 		 * going afterwards.
351 		 * (This puts the per-mount vnode list logically under
352 		 * the protection of the vfs_busy lock).
353 		 */
354 		error = vfs_busy(mp, LK_RECURSEFAIL, 0);
355 		if (error && error != EDEADLK)
356 			return error;
357 	}
358 
359 	/*
360 	 * We must choose whether to allocate a new vnode or recycle an
361 	 * existing one. The criterion for allocating a new one is that
362 	 * the total number of vnodes is less than the number desired or
363 	 * there are no vnodes on either free list. Generally we only
364 	 * want to recycle vnodes that have no buffers associated with
365 	 * them, so we look first on the vnode_free_list. If it is empty,
366 	 * we next consider vnodes with referencing buffers on the
367 	 * vnode_hold_list. The toggle ensures that half the time we
368  * will recycle a vnode from the vnode_hold_list, and half the time
369 	 * we will allocate a new one unless the list has grown to twice
370  * the desired size. We are reluctant to recycle vnodes from the
371 	 * vnode_hold_list because we will lose the identity of all its
372 	 * referencing buffers.
373 	 */
374 
375 	vp = NULL;
376 
377 	simple_lock(&vnode_free_list_slock);
378 
379 	toggle ^= 1;
380 	if (numvnodes > 2 * desiredvnodes)
381 		toggle = 0;
382 
383 	tryalloc = numvnodes < desiredvnodes ||
384 	    (TAILQ_FIRST(&vnode_free_list) == NULL &&
385 	     (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle));
386 
387 	if (tryalloc &&
388 	    (vp = pool_get(&vnode_pool, PR_NOWAIT)) != NULL) {
389 		numvnodes++;
390 		simple_unlock(&vnode_free_list_slock);
391 		memset(vp, 0, sizeof(*vp));
392 		UVM_OBJ_INIT(&vp->v_uobj, &uvm_vnodeops, 1);
393 		/*
394 		 * done by memset() above.
395 		 *	LIST_INIT(&vp->v_nclist);
396 		 *	LIST_INIT(&vp->v_dnclist);
397 		 */
398 	} else {
399 		vp = getcleanvnode(l);
400 		/*
401 		 * Unless this is a bad time of the month, at most
402 		 * the first NCPUS items on the free list are
403 		 * locked, so this is close enough to being empty.
404 		 */
405 		if (vp == NULLVP) {
406 			if (mp && error != EDEADLK)
407 				vfs_unbusy(mp);
408 			if (tryalloc) {
409 				printf("WARNING: unable to allocate new "
410 				    "vnode, retrying...\n");
411 				(void) tsleep(&lbolt, PRIBIO, "newvn", hz);
412 				goto try_again;
413 			}
414 			tablefull("vnode", "increase kern.maxvnodes or NVNODE");
415 			*vpp = 0;
416 			return (ENFILE);
417 		}
418 		vp->v_usecount = 1;
419 		vp->v_iflag = 0;
420 		vp->v_vflag = 0;
421 		vp->v_uflag = 0;
422 		vp->v_socket = NULL;
423 	}
424 	vp->v_type = VNON;
425 	vp->v_vnlock = &vp->v_lock;
426 	lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
427 	KASSERT(LIST_EMPTY(&vp->v_nclist));
428 	KASSERT(LIST_EMPTY(&vp->v_dnclist));
429 	vp->v_tag = tag;
430 	vp->v_op = vops;
431 	insmntque(vp, mp);
432 	*vpp = vp;
433 	vp->v_data = 0;
434 	simple_lock_init(&vp->v_interlock);
435 
436 	/*
437 	 * initialize uvm_object within vnode.
438 	 */
439 
440 	uobj = &vp->v_uobj;
441 	KASSERT(uobj->pgops == &uvm_vnodeops);
442 	KASSERT(uobj->uo_npages == 0);
443 	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
444 	vp->v_size = vp->v_writesize = VSIZENOTSET;
445 
446 	if (mp && error != EDEADLK)
447 		vfs_unbusy(mp);
448 	return (0);
449 }
450 
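/*
 * Illustrative sketch, excluded from the build: the allocate-vs-recycle
 * decision made near the top of getnewvnode(), restated as a pure
 * function.  The toggle alternates on every call so that, once the free
 * list is empty, roughly half the requests recycle from the hold list
 * and half allocate fresh vnodes, until numvnodes reaches twice
 * desiredvnodes.  The function name and parameters are hypothetical.
 */
#ifdef notdef
#include <stdbool.h>

static bool
should_allocate(long numvnodes, long desiredvnodes,
    bool free_list_empty, bool hold_list_empty, int *toggle)
{

	*toggle ^= 1;
	if (numvnodes > 2 * desiredvnodes)
		*toggle = 0;

	return numvnodes < desiredvnodes ||
	    (free_list_empty && (hold_list_empty || *toggle));
}
#endif /* notdef */
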
451 /*
452  * This is really just the reverse of getnewvnode(). Needed for
453  * VFS_VGET functions that may need to push back a vnode in case
454  * of a locking race.
455  */
456 void
457 ungetnewvnode(struct vnode *vp)
458 {
459 #ifdef DIAGNOSTIC
460 	if (vp->v_usecount != 1)
461 		panic("ungetnewvnode: busy vnode");
462 #endif
463 	vp->v_usecount--;
464 	insmntque(vp, NULL);
465 	vp->v_type = VBAD;
466 
467 	simple_lock(&vp->v_interlock);
468 	/*
469 	 * Insert at head of LRU list
470 	 */
471 	simple_lock(&vnode_free_list_slock);
472 	if (vp->v_holdcnt > 0)
473 		TAILQ_INSERT_HEAD(&vnode_hold_list, vp, v_freelist);
474 	else
475 		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
476 	simple_unlock(&vnode_free_list_slock);
477 	simple_unlock(&vp->v_interlock);
478 }
479 
480 /*
481  * Move a vnode from one mount queue to another.
482  */
483 static void
484 insmntque(struct vnode *vp, struct mount *mp)
485 {
486 
487 #ifdef DIAGNOSTIC
488 	if ((mp != NULL) &&
489 	    (mp->mnt_iflag & IMNT_UNMOUNT) &&
490 	    !(mp->mnt_flag & MNT_SOFTDEP) &&
491 	    vp->v_tag != VT_VFS) {
492 		panic("insmntque into dying filesystem");
493 	}
494 #endif
495 
496 	simple_lock(&mntvnode_slock);
497 	/*
498 	 * Delete from old mount point vnode list, if on one.
499 	 */
500 	if (vp->v_mount != NULL)
501 		TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes);
502 	/*
503 	 * Insert into list of vnodes for the new mount point, if available.
504 	 */
505 	if ((vp->v_mount = mp) != NULL)
506 		TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
507 	simple_unlock(&mntvnode_slock);
508 }
509 
510 /*
511  * Create a vnode for a block device.
512  * Used for root filesystem and swap areas.
513  * Also used for memory file system special devices.
514  */
515 int
516 bdevvp(dev_t dev, struct vnode **vpp)
517 {
518 
519 	return (getdevvp(dev, vpp, VBLK));
520 }
521 
522 /*
523  * Create a vnode for a character device.
524  * Used for kernfs and some console handling.
525  */
526 int
527 cdevvp(dev_t dev, struct vnode **vpp)
528 {
529 
530 	return (getdevvp(dev, vpp, VCHR));
531 }
532 
533 /*
534  * Create a vnode for a device.
535  * Used by bdevvp (block device) for root file system etc.,
536  * and by cdevvp (character device) for console and kernfs.
537  */
538 static int
539 getdevvp(dev_t dev, struct vnode **vpp, enum vtype type)
540 {
541 	struct vnode *vp;
542 	struct vnode *nvp;
543 	int error;
544 
545 	if (dev == NODEV) {
546 		*vpp = NULL;
547 		return (0);
548 	}
549 	error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp);
550 	if (error) {
551 		*vpp = NULL;
552 		return (error);
553 	}
554 	vp = nvp;
555 	vp->v_type = type;
556 	uvm_vnp_setsize(vp, 0);
557 	if ((nvp = checkalias(vp, dev, NULL)) != 0) {
558 		vput(vp);
559 		vp = nvp;
560 	}
561 	*vpp = vp;
562 	return (0);
563 }
564 
565 /*
566  * Check to see if the new vnode represents a special device
567  * for which we already have a vnode (either because of
568  * bdevvp() or because of a different vnode representing
569  * the same block device). If such an alias exists, deallocate
570  * the existing contents and return the aliased vnode. The
571  * caller is responsible for filling it with its new contents.
572  */
573 struct vnode *
574 checkalias(struct vnode *nvp, dev_t nvp_rdev, struct mount *mp)
575 {
576 	struct lwp *l = curlwp;		/* XXX */
577 	struct vnode *vp;
578 	struct vnode **vpp;
579 
580 	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
581 		return (NULLVP);
582 
583 	vpp = &speclisth[SPECHASH(nvp_rdev)];
584 loop:
585 	simple_lock(&spechash_slock);
586 	for (vp = *vpp; vp; vp = vp->v_specnext) {
587 		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
588 			continue;
589 		/*
590 		 * Alias, but not in use, so flush it out.
591 		 */
592 		simple_lock(&vp->v_interlock);
593 		simple_unlock(&spechash_slock);
594 		if (vp->v_usecount == 0) {
595 			vgonel(vp, l);
596 			goto loop;
597 		}
598 		/*
599 		 * What we want to know here is whether someone else has
600 		 * removed this vnode from the device hash list while we were
601 		 * waiting.  This can only happen if vclean() did it, and
602 		 * this requires the vnode to be locked.
603 		 */
604 		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK))
605 			goto loop;
606 		if (vp->v_specinfo == NULL) {
607 			vput(vp);
608 			goto loop;
609 		}
610 		simple_lock(&spechash_slock);
611 		break;
612 	}
613 	if (vp == NULL || vp->v_tag != VT_NON || vp->v_type != VBLK) {
614 		MALLOC(nvp->v_specinfo, struct specinfo *,
615 			sizeof(struct specinfo), M_VNODE, M_NOWAIT);
616 		/* XXX Erg. */
617 		if (nvp->v_specinfo == NULL) {
618 			simple_unlock(&spechash_slock);
619 			uvm_wait("checkalias");
620 			goto loop;
621 		}
622 
623 		nvp->v_rdev = nvp_rdev;
624 		nvp->v_hashchain = vpp;
625 		nvp->v_specnext = *vpp;
626 		nvp->v_specmountpoint = NULL;
627 		simple_unlock(&spechash_slock);
628 		nvp->v_speclockf = NULL;
629 
630 		*vpp = nvp;
631 		if (vp != NULLVP) {
632 			nvp->v_iflag |= VI_ALIASED;
633 			vp->v_iflag |= VI_ALIASED;
634 			vput(vp);
635 		}
636 		return (NULLVP);
637 	}
638 	simple_unlock(&spechash_slock);
639 	VOP_UNLOCK(vp, 0);
640 	simple_lock(&vp->v_interlock);
641 	vclean(vp, 0, l);
642 	vp->v_op = nvp->v_op;
643 	vp->v_tag = nvp->v_tag;
644 	vp->v_vnlock = &vp->v_lock;
645 	lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
646 	nvp->v_type = VNON;
647 	insmntque(vp, mp);
648 	return (vp);
649 }
650 
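/*
 * Illustrative sketch, excluded from the build: the device-alias lookup
 * that checkalias() performs above -- hash the device number into a
 * bucket and walk the singly linked chain for an existing node with the
 * same (rdev, type) pair, prepending a new node when none is found.
 * Everything below (struct devnode, DEVHASH, ...) is hypothetical and
 * ignores the spechash interlock the real code takes.
 */
#ifdef notdef
#include <stddef.h>

#define	DEVHASHSZ	64
#define	DEVHASH(rdev)	((unsigned)(rdev) % DEVHASHSZ)

struct devnode {
	unsigned long	 rdev;
	int		 type;		/* VBLK or VCHR analogue */
	struct devnode	*specnext;	/* chain within one bucket */
};

static struct devnode *devhash[DEVHASHSZ];

static struct devnode *
devnode_lookup(unsigned long rdev, int type)
{
	struct devnode *dp;

	for (dp = devhash[DEVHASH(rdev)]; dp != NULL; dp = dp->specnext)
		if (dp->rdev == rdev && dp->type == type)
			return dp;
	return NULL;
}

static void
devnode_insert(struct devnode *dp)
{
	struct devnode **head = &devhash[DEVHASH(dp->rdev)];

	dp->specnext = *head;		/* prepend, as checkalias() does */
	*head = dp;
}
#endif /* notdef */
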
651 /*
652  * Grab a particular vnode from the free list, increment its
653  * reference count and lock it. If the vnode lock bit is set, the
654  * vnode is being eliminated in vgone. In that case, we cannot
655  * grab the vnode, so the process is awakened when the transition is
656  * completed, and an error returned to indicate that the vnode is no
657  * longer usable (possibly having been changed to a new file system type).
658  */
659 int
660 vget(struct vnode *vp, int flags)
661 {
662 	int error;
663 
664 	/*
665 	 * If the vnode is in the process of being cleaned out for
666 	 * another use, we wait for the cleaning to finish and then
667 	 * return failure. Cleaning is determined by checking that
668 	 * the VI_XLOCK flag is set.
669 	 */
670 
671 	if ((flags & LK_INTERLOCK) == 0)
672 		simple_lock(&vp->v_interlock);
673 	if ((vp->v_iflag & (VI_XLOCK | VI_FREEING)) != 0) {
674 		if (flags & LK_NOWAIT) {
675 			simple_unlock(&vp->v_interlock);
676 			return EBUSY;
677 		}
678 		vp->v_iflag |= VI_XWANT;
679 		ltsleep(vp, PINOD|PNORELOCK, "vget", 0, &vp->v_interlock);
680 		return (ENOENT);
681 	}
682 	if (vp->v_usecount == 0) {
683 		simple_lock(&vnode_free_list_slock);
684 		if (vp->v_holdcnt > 0)
685 			TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
686 		else
687 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
688 		simple_unlock(&vnode_free_list_slock);
689 	}
690 	vp->v_usecount++;
691 #ifdef DIAGNOSTIC
692 	if (vp->v_usecount == 0) {
693 		vprint("vget", vp);
694 		panic("vget: usecount overflow, vp %p", vp);
695 	}
696 #endif
697 	if (flags & LK_TYPE_MASK) {
698 		if ((error = vn_lock(vp, flags | LK_INTERLOCK))) {
699 			vrele(vp);
700 		}
701 		return (error);
702 	}
703 	simple_unlock(&vp->v_interlock);
704 	return (0);
705 }
706 
707 /*
708  * vput(), just unlock and vrele()
709  */
710 void
711 vput(struct vnode *vp)
712 {
713 	struct lwp *l = curlwp;		/* XXX */
714 
715 #ifdef DIAGNOSTIC
716 	if (vp == NULL)
717 		panic("vput: null vp");
718 #endif
719 	simple_lock(&vp->v_interlock);
720 	vp->v_usecount--;
721 	if (vp->v_usecount > 0) {
722 		simple_unlock(&vp->v_interlock);
723 		VOP_UNLOCK(vp, 0);
724 		return;
725 	}
726 #ifdef DIAGNOSTIC
727 	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
728 		vprint("vput: bad ref count", vp);
729 		panic("vput: ref cnt");
730 	}
731 #endif
732 	/*
733 	 * Insert at tail of LRU list.
734 	 */
735 	simple_lock(&vnode_free_list_slock);
736 	if (vp->v_holdcnt > 0)
737 		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
738 	else
739 		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
740 	simple_unlock(&vnode_free_list_slock);
741 	if (vp->v_iflag & VI_EXECMAP) {
742 		uvmexp.execpages -= vp->v_uobj.uo_npages;
743 		uvmexp.filepages += vp->v_uobj.uo_npages;
744 	}
745 	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP|VI_MAPPED);
746 	vp->v_vflag &= ~VV_MAPPED;
747 	simple_unlock(&vp->v_interlock);
748 	VOP_INACTIVE(vp, l);
749 }
750 
751 /*
752  * Vnode release.
753  * If count drops to zero, call inactive routine and return to freelist.
754  */
755 static void
756 do_vrele(struct vnode *vp, int doinactive, int onhead)
757 {
758 	struct lwp *l = curlwp;		/* XXX */
759 
760 #ifdef DIAGNOSTIC
761 	if (vp == NULL)
762 		panic("vrele: null vp");
763 #endif
764 	simple_lock(&vp->v_interlock);
765 	vp->v_usecount--;
766 	if (vp->v_usecount > 0) {
767 		simple_unlock(&vp->v_interlock);
768 		return;
769 	}
770 #ifdef DIAGNOSTIC
771 	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
772 		vprint("vrele: bad ref count", vp);
773 		panic("vrele: ref cnt vp %p", vp);
774 	}
775 #endif
776 	/*
777 	 * Insert at tail of LRU list.
778 	 */
779 	simple_lock(&vnode_free_list_slock);
780 	if (vp->v_holdcnt > 0) {
781 		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
782 	} else {
783 		if (onhead)
784 			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
785 		else
786 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
787 	}
788 	simple_unlock(&vnode_free_list_slock);
789 	if (vp->v_iflag & VI_EXECMAP) {
790 		uvmexp.execpages -= vp->v_uobj.uo_npages;
791 		uvmexp.filepages += vp->v_uobj.uo_npages;
792 	}
793 	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP|VI_MAPPED);
794 	vp->v_vflag &= ~VV_MAPPED;
795 
796 	if (doinactive) {
797 		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0)
798 			VOP_INACTIVE(vp, l);
799 	} else {
800 		simple_unlock(&vp->v_interlock);
801 	}
802 }
803 
804 void
805 vrele(struct vnode *vp)
806 {
807 
808 	do_vrele(vp, 1, 0);
809 }
810 
811 void
812 vrele2(struct vnode *vp, int onhead)
813 {
814 
815 	do_vrele(vp, 0, onhead);
816 }
817 
818 /*
819  * Page or buffer structure gets a reference.
820  * Called with v_interlock held.
821  */
822 void
823 vholdl(struct vnode *vp)
824 {
825 
826 	/*
827 	 * If it is on the freelist and the hold count is currently
828 	 * zero, move it to the hold list. The test of the back
829 	 * pointer and the use reference count of zero is because
830 	 * it will be removed from a free list by getnewvnode,
831 	 * but will not have its reference count incremented until
832 	 * after calling vgone. If the reference count were
833 	 * incremented first, vgone would (incorrectly) try to
834 	 * close the previous instance of the underlying object.
835 	 * So, the back pointer is explicitly set to `0xdeadb' in
836 	 * getnewvnode after removing it from a freelist to ensure
837 	 * that we do not try to move it here.
838 	 */
839 	if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
840 	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
841 		simple_lock(&vnode_free_list_slock);
842 		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
843 		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
844 		simple_unlock(&vnode_free_list_slock);
845 	}
846 	vp->v_holdcnt++;
847 }
848 
849 /*
850  * Page or buffer structure frees a reference.
851  * Called with v_interlock held.
852  */
853 void
854 holdrelel(struct vnode *vp)
855 {
856 
857 	if (vp->v_holdcnt <= 0)
858 		panic("holdrelel: holdcnt vp %p", vp);
859 	vp->v_holdcnt--;
860 
861 	/*
862 	 * If it is on the holdlist and the hold count drops to
863 	 * zero, move it to the free list. The test of the back
864 	 * pointer and the use reference count of zero is because
865 	 * it will be removed from a free list by getnewvnode,
866 	 * but will not have its reference count incremented until
867 	 * after calling vgone. If the reference count were
868 	 * incremented first, vgone would (incorrectly) try to
869 	 * close the previous instance of the underlying object.
870 	 * So, the back pointer is explicitly set to `0xdeadb' in
871 	 * getnewvnode after removing it from a freelist to ensure
872 	 * that we do not try to move it here.
873 	 */
874 
875 	if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
876 	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
877 		simple_lock(&vnode_free_list_slock);
878 		TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
879 		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
880 		simple_unlock(&vnode_free_list_slock);
881 	}
882 }
883 
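/*
 * Illustrative sketch, excluded from the build: the back-pointer
 * sentinel described in the comments above.  getnewvnode() stores
 * 0xdeadb in tqe_prev after taking a vnode off a free list, so that
 * vholdl() and holdrelel() can tell "off every list" apart from "on a
 * list" without a separate flag.  The userland model below uses the
 * same idea; the names are hypothetical.
 */
#ifdef notdef
#include <sys/queue.h>
#include <stdbool.h>
#include <stddef.h>

struct item {
	TAILQ_ENTRY(item) entry;
};
TAILQ_HEAD(itemlist, item);

#define	OFFLIST_SENTINEL	((struct item **)0xdeadb)

static bool
item_on_a_list(const struct item *it)
{

	return it->entry.tqe_prev != OFFLIST_SENTINEL;
}

static struct item *
item_take(struct itemlist *lst)
{
	struct item *it = TAILQ_FIRST(lst);

	if (it != NULL) {
		TAILQ_REMOVE(lst, it, entry);
		it->entry.tqe_prev = OFFLIST_SENTINEL;	/* mark off-list */
	}
	return it;
}
#endif /* notdef */
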
884 /*
885  * Vnode reference.
886  */
887 void
888 vref(struct vnode *vp)
889 {
890 
891 	simple_lock(&vp->v_interlock);
892 	if (vp->v_usecount <= 0)
893 		panic("vref used where vget required, vp %p", vp);
894 	vp->v_usecount++;
895 #ifdef DIAGNOSTIC
896 	if (vp->v_usecount == 0) {
897 		vprint("vref", vp);
898 		panic("vref: usecount overflow, vp %p", vp);
899 	}
900 #endif
901 	simple_unlock(&vp->v_interlock);
902 }
903 
904 /*
905  * Remove any vnodes in the vnode table belonging to mount point mp.
906  *
907  * If FORCECLOSE is not specified, there should not be any active ones,
908  * return error if any are found (nb: this is a user error, not a
909  * system error). If FORCECLOSE is specified, detach any active vnodes
910  * that are found.
911  *
912  * If WRITECLOSE is set, only flush out regular file vnodes open for
913  * writing.
914  *
915  * SKIPSYSTEM causes any vnodes marked V_SYSTEM to be skipped.
916  */
917 #ifdef DEBUG
918 int busyprt = 0;	/* print out busy vnodes */
919 struct ctldebug debug1 = { "busyprt", &busyprt };
920 #endif
921 
922 int
923 vflush(struct mount *mp, struct vnode *skipvp, int flags)
924 {
925 	struct lwp *l = curlwp;		/* XXX */
926 	struct vnode *vp, *nvp;
927 	int busy = 0;
928 
929 	simple_lock(&mntvnode_slock);
930 loop:
931 	/*
932 	 * NOTE: not using TAILQ_FOREACH here since vgone() and vclean()
933 	 * are called in this loop, and they may remove vnodes from the list.
934 	 */
935 	for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
936 		if (vp->v_mount != mp)
937 			goto loop;
938 		nvp = TAILQ_NEXT(vp, v_mntvnodes);
939 		/*
940 		 * Skip over a selected vnode.
941 		 */
942 		if (vp == skipvp)
943 			continue;
944 		simple_lock(&vp->v_interlock);
945 		/*
946 		 * Skip over vnodes marked VV_SYSTEM.
947 		 */
948 		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
949 			simple_unlock(&vp->v_interlock);
950 			continue;
951 		}
952 		/*
953 		 * If WRITECLOSE is set, only flush out regular file
954 		 * vnodes open for writing.
955 		 */
956 		if ((flags & WRITECLOSE) &&
957 		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
958 			simple_unlock(&vp->v_interlock);
959 			continue;
960 		}
961 		/*
962 		 * With v_usecount == 0, all we need to do is clear
963 		 * out the vnode data structures and we are done.
964 		 */
965 		if (vp->v_usecount == 0) {
966 			simple_unlock(&mntvnode_slock);
967 			vgonel(vp, l);
968 			simple_lock(&mntvnode_slock);
969 			continue;
970 		}
971 		/*
972 		 * If FORCECLOSE is set, forcibly close the vnode.
973 		 * For block or character devices, revert to an
974 		 * anonymous device. For all other files, just kill them.
975 		 */
976 		if (flags & FORCECLOSE) {
977 			simple_unlock(&mntvnode_slock);
978 			if (vp->v_type != VBLK && vp->v_type != VCHR) {
979 				vgonel(vp, l);
980 			} else {
981 				vclean(vp, 0, l);
982 				vp->v_op = spec_vnodeop_p;
983 				insmntque(vp, (struct mount *)0);
984 			}
985 			simple_lock(&mntvnode_slock);
986 			continue;
987 		}
988 #ifdef DEBUG
989 		if (busyprt)
990 			vprint("vflush: busy vnode", vp);
991 #endif
992 		simple_unlock(&vp->v_interlock);
993 		busy++;
994 	}
995 	simple_unlock(&mntvnode_slock);
996 	if (busy)
997 		return (EBUSY);
998 	return (0);
999 }
1000 
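/*
 * Illustrative sketch, excluded from the build: how vflush()'s flags map
 * onto a per-vnode decision, restating the comment above vflush().  The
 * flag macros (SKIPSYSTEM, WRITECLOSE, FORCECLOSE) are the kernel's own,
 * visible via the headers included above; the enum and function are
 * hypothetical and leave out all the locking.
 */
#ifdef notdef
enum flush_action { FLUSH_SKIP, FLUSH_CLEAN, FLUSH_BUSY };

static enum flush_action
flush_decision(int flags, int is_system, int is_reg_open_for_write,
    int usecount)
{

	if ((flags & SKIPSYSTEM) && is_system)
		return FLUSH_SKIP;
	if ((flags & WRITECLOSE) && !is_reg_open_for_write)
		return FLUSH_SKIP;
	if (usecount == 0)
		return FLUSH_CLEAN;	/* vgonel() it right away */
	if (flags & FORCECLOSE)
		return FLUSH_CLEAN;	/* forcibly detach or vgonel() */
	return FLUSH_BUSY;		/* counted; vflush() returns EBUSY */
}
#endif /* notdef */
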
1001 /*
1002  * Disassociate the underlying file system from a vnode.
1003  */
1004 static void
1005 vclean(struct vnode *vp, int flags, struct lwp *l)
1006 {
1007 	int active;
1008 
1009 	LOCK_ASSERT(simple_lock_held(&vp->v_interlock));
1010 
1011 	/*
1012 	 * Check to see if the vnode is in use.
1013 	 * If so we have to reference it before we clean it out
1014 	 * so that its count cannot fall to zero and generate a
1015 	 * race against ourselves to recycle it.
1016 	 */
1017 
1018 	if ((active = vp->v_usecount) != 0) {
1019 		vp->v_usecount++;
1020 #ifdef DIAGNOSTIC
1021 		if (vp->v_usecount == 0) {
1022 			vprint("vclean", vp);
1023 			panic("vclean: usecount overflow");
1024 		}
1025 #endif
1026 	}
1027 
1028 	/*
1029 	 * Prevent the vnode from being recycled or
1030 	 * brought into use while we clean it out.
1031 	 */
1032 	if (vp->v_iflag & VI_XLOCK)
1033 		panic("vclean: deadlock, vp %p", vp);
1034 	vp->v_iflag |= VI_XLOCK;
1035 	if (vp->v_iflag & VI_EXECMAP) {
1036 		uvmexp.execpages -= vp->v_uobj.uo_npages;
1037 		uvmexp.filepages += vp->v_uobj.uo_npages;
1038 	}
1039 	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
1040 
1041 	/*
1042 	 * Even if the count is zero, the VOP_INACTIVE routine may still
1043 	 * have the object locked while it cleans it out.  For
1044 	 * active vnodes, it ensures that no other activity can
1045 	 * occur while the underlying object is being cleaned out.
1046 	 *
1047 	 * We drain the lock to make sure we are the last one trying to
1048 	 * get it and immediately resurrect the lock.  Future accesses
1049 	 * for locking this _vnode_ will be protected by VI_XLOCK.  However,
1050 	 * upper layers might be using the _lock_ in case the file system
1051 	 * exported it and might access it while the vnode lingers in
1052 	 * deadfs.
1053 	 */
1054 	VOP_LOCK(vp, LK_DRAIN | LK_RESURRECT | LK_INTERLOCK);
1055 
1056 	/*
1057 	 * Clean out any cached data associated with the vnode.
1058 	 * If it is a special device, remove it from the special device
1059 	 * alias list, if it is on one.
1060 	 */
1061 	if (flags & DOCLOSE) {
1062 		int error;
1063 		struct vnode *vq, *vx;
1064 
1065 		error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
1066 		if (error)
1067 			error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
1068 		KASSERT(error == 0);
1069 		KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1070 
1071 		if (active)
1072 			VOP_CLOSE(vp, FNONBLOCK, NOCRED, NULL);
1073 
1074 		if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
1075 		    vp->v_specinfo != 0) {
1076 			simple_lock(&spechash_slock);
1077 			if (vp->v_hashchain != NULL) {
1078 				if (*vp->v_hashchain == vp) {
1079 					*vp->v_hashchain = vp->v_specnext;
1080 				} else {
1081 					for (vq = *vp->v_hashchain; vq;
1082 					     vq = vq->v_specnext) {
1083 						if (vq->v_specnext != vp)
1084 							continue;
1085 						vq->v_specnext = vp->v_specnext;
1086 						break;
1087 					}
1088 					if (vq == NULL)
1089 						panic("missing bdev");
1090 				}
1091 				if (vp->v_iflag & VI_ALIASED) {
1092 					vx = NULL;
1093 					for (vq = *vp->v_hashchain; vq;
1094 					     vq = vq->v_specnext) {
1095 						if (vq->v_rdev != vp->v_rdev ||
1096 						    vq->v_type != vp->v_type)
1097 							continue;
1098 						if (vx)
1099 							break;
1100 						vx = vq;
1101 					}
1102 					if (vx == NULL)
1103 						panic("missing alias");
1104 					if (vq == NULL)
1105 						vx->v_iflag &= ~VI_ALIASED;
1106 					vp->v_iflag &= ~VI_ALIASED;
1107 				}
1108 			}
1109 			simple_unlock(&spechash_slock);
1110 			FREE(vp->v_specinfo, M_VNODE);
1111 			vp->v_specinfo = NULL;
1112 		}
1113 	}
1114 
1115 	/*
1116 	 * If purging an active vnode, it must be closed and
1117 	 * deactivated before being reclaimed. Note that the
1118 	 * VOP_INACTIVE will unlock the vnode.
1119 	 */
1120 	if (active) {
1121 		VOP_INACTIVE(vp, l);
1122 	} else {
1123 		/*
1124 		 * Any other processes trying to obtain this lock must first
1125 		 * wait for VI_XLOCK to clear, then call the new lock operation.
1126 		 */
1127 		VOP_UNLOCK(vp, 0);
1128 	}
1129 	/*
1130 	 * Reclaim the vnode.
1131 	 */
1132 	if (VOP_RECLAIM(vp, l))
1133 		panic("vclean: cannot reclaim, vp %p", vp);
1134 	if (active) {
1135 		/*
1136 		 * Inline copy of vrele() since VOP_INACTIVE
1137 		 * has already been called.
1138 		 */
1139 		simple_lock(&vp->v_interlock);
1140 		if (--vp->v_usecount <= 0) {
1141 #ifdef DIAGNOSTIC
1142 			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
1143 				vprint("vclean: bad ref count", vp);
1144 				panic("vclean: ref cnt");
1145 			}
1146 #endif
1147 			/*
1148 			 * Insert at tail of LRU list.
1149 			 */
1150 
1151 			simple_unlock(&vp->v_interlock);
1152 			simple_lock(&vnode_free_list_slock);
1153 #ifdef DIAGNOSTIC
1154 			if (vp->v_holdcnt > 0)
1155 				panic("vclean: not clean, vp %p", vp);
1156 #endif
1157 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
1158 			simple_unlock(&vnode_free_list_slock);
1159 		} else
1160 			simple_unlock(&vp->v_interlock);
1161 	}
1162 
1163 	KASSERT(vp->v_uobj.uo_npages == 0);
1164 	if (vp->v_type == VREG && vp->v_ractx != NULL) {
1165 		uvm_ra_freectx(vp->v_ractx);
1166 		vp->v_ractx = NULL;
1167 	}
1168 	cache_purge(vp);
1169 
1170 	/*
1171 	 * Done with purge, notify sleepers of the grim news.
1172 	 */
1173 	vp->v_op = dead_vnodeop_p;
1174 	vp->v_tag = VT_NON;
1175 	vp->v_vnlock = NULL;
1176 	simple_lock(&vp->v_interlock);
1177 	VN_KNOTE(vp, NOTE_REVOKE);	/* FreeBSD has this in vn_pollgone() */
1178 	vp->v_iflag &= ~VI_XLOCK;
1179 	vp->v_vflag &= ~VV_LOCKSWORK;
1180 	if (vp->v_iflag & VI_XWANT) {
1181 		vp->v_iflag &= ~VI_XWANT;
1182 		simple_unlock(&vp->v_interlock);
1183 		wakeup((void *)vp);
1184 	} else
1185 		simple_unlock(&vp->v_interlock);
1186 }
1187 
1188 /*
1189  * Recycle an unused vnode to the front of the free list.
1190  * Release the passed interlock if the vnode will be recycled.
1191  */
1192 int
1193 vrecycle(struct vnode *vp, struct simplelock *inter_lkp, struct lwp *l)
1194 {
1195 
1196 	simple_lock(&vp->v_interlock);
1197 	if (vp->v_usecount == 0) {
1198 		if (inter_lkp)
1199 			simple_unlock(inter_lkp);
1200 		vgonel(vp, l);
1201 		return (1);
1202 	}
1203 	simple_unlock(&vp->v_interlock);
1204 	return (0);
1205 }
1206 
1207 /*
1208  * Eliminate all activity associated with a vnode
1209  * in preparation for reuse.
1210  */
1211 void
1212 vgone(struct vnode *vp)
1213 {
1214 	struct lwp *l = curlwp;		/* XXX */
1215 
1216 	simple_lock(&vp->v_interlock);
1217 	vgonel(vp, l);
1218 }
1219 
1220 /*
1221  * vgone, with the vp interlock held.
1222  */
1223 void
1224 vgonel(struct vnode *vp, struct lwp *l)
1225 {
1226 
1227 	LOCK_ASSERT(simple_lock_held(&vp->v_interlock));
1228 
1229 	/*
1230 	 * If a vgone (or vclean) is already in progress,
1231 	 * wait until it is done and return.
1232 	 */
1233 
1234 	if (vp->v_iflag & VI_XLOCK) {
1235 		vp->v_iflag |= VI_XWANT;
1236 		ltsleep(vp, PINOD | PNORELOCK, "vgone", 0, &vp->v_interlock);
1237 		return;
1238 	}
1239 
1240 	/*
1241 	 * Clean out the filesystem specific data.
1242 	 */
1243 
1244 	vclean(vp, DOCLOSE, l);
1245 	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1246 
1247 	/*
1248 	 * Delete from old mount point vnode list, if on one.
1249 	 */
1250 
1251 	if (vp->v_mount != NULL)
1252 		insmntque(vp, (struct mount *)0);
1253 
1254 	/*
1255 	 * The test of the back pointer and the reference count of
1256 	 * zero is because it will be removed from the free list by
1257 	 * getcleanvnode, but will not have its reference count
1258 	 * incremented until after calling vgone. If the reference
1259 	 * count were incremented first, vgone would (incorrectly)
1260 	 * try to close the previous instance of the underlying object.
1261 	 * So, the back pointer is explicitly set to `0xdeadb' in
1262 	 * getnewvnode after removing it from the freelist to ensure
1263 	 * that we do not try to move it here.
1264 	 */
1265 
1266 	vp->v_type = VBAD;
1267 	if (vp->v_usecount == 0) {
1268 		bool dofree;
1269 
1270 		simple_lock(&vnode_free_list_slock);
1271 		if (vp->v_holdcnt > 0)
1272 			panic("vgonel: not clean, vp %p", vp);
1273 		/*
1274 		 * If it isn't on the freelist, we're called by getcleanvnode
1275 		 * and the vnode is being re-used.  Otherwise, we'll free it.
1276 		 */
1277 		dofree = vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb;
1278 		if (dofree) {
1279 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1280 			numvnodes--;
1281 		}
1282 		simple_unlock(&vnode_free_list_slock);
1283 		if (dofree)
1284 			pool_put(&vnode_pool, vp);
1285 	}
1286 }
1287 
1288 /*
1289  * Lookup a vnode by device number.
1290  */
1291 int
1292 vfinddev(dev_t dev, enum vtype type, struct vnode **vpp)
1293 {
1294 	struct vnode *vp;
1295 	int rc = 0;
1296 
1297 	simple_lock(&spechash_slock);
1298 	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
1299 		if (dev != vp->v_rdev || type != vp->v_type)
1300 			continue;
1301 		*vpp = vp;
1302 		rc = 1;
1303 		break;
1304 	}
1305 	simple_unlock(&spechash_slock);
1306 	return (rc);
1307 }
1308 
1309 /*
1310  * Revoke all the vnodes corresponding to the specified minor number
1311  * range (endpoints inclusive) of the specified major.
1312  */
1313 void
1314 vdevgone(int maj, int minl, int minh, enum vtype type)
1315 {
1316 	struct vnode *vp;
1317 	int mn;
1318 
1319 	vp = NULL;	/* XXX gcc */
1320 
1321 	for (mn = minl; mn <= minh; mn++)
1322 		if (vfinddev(makedev(maj, mn), type, &vp))
1323 			VOP_REVOKE(vp, REVOKEALL);
1324 }
1325 
1326 /*
1327  * Calculate the total number of references to a special device.
1328  */
1329 int
1330 vcount(struct vnode *vp)
1331 {
1332 	struct vnode *vq, *vnext;
1333 	int count;
1334 
1335 loop:
1336 	if ((vp->v_iflag & VI_ALIASED) == 0)
1337 		return (vp->v_usecount);
1338 	simple_lock(&spechash_slock);
1339 	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
1340 		vnext = vq->v_specnext;
1341 		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
1342 			continue;
1343 		/*
1344 		 * Alias, but not in use, so flush it out.
1345 		 */
1346 		if (vq->v_usecount == 0 && vq != vp &&
1347 		    (vq->v_iflag & VI_XLOCK) == 0) {
1348 			simple_unlock(&spechash_slock);
1349 			vgone(vq);
1350 			goto loop;
1351 		}
1352 		count += vq->v_usecount;
1353 	}
1354 	simple_unlock(&spechash_slock);
1355 	return (count);
1356 }
1357 
1358 
1359 /*
1360  * sysctl helper routine to return list of supported fstypes
1361  */
1362 static int
1363 sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS)
1364 {
1365 	char bf[sizeof(((struct statvfs *)NULL)->f_fstypename)];
1366 	char *where = oldp;
1367 	struct vfsops *v;
1368 	size_t needed, left, slen;
1369 	int error, first;
1370 
1371 	if (newp != NULL)
1372 		return (EPERM);
1373 	if (namelen != 0)
1374 		return (EINVAL);
1375 
1376 	first = 1;
1377 	error = 0;
1378 	needed = 0;
1379 	left = *oldlenp;
1380 
1381 	mutex_enter(&vfs_list_lock);
1382 	LIST_FOREACH(v, &vfs_list, vfs_list) {
1383 		if (where == NULL)
1384 			needed += strlen(v->vfs_name) + 1;
1385 		else {
1386 			memset(bf, 0, sizeof(bf));
1387 			if (first) {
1388 				strncpy(bf, v->vfs_name, sizeof(bf));
1389 				first = 0;
1390 			} else {
1391 				bf[0] = ' ';
1392 				strncpy(bf + 1, v->vfs_name, sizeof(bf) - 1);
1393 			}
1394 			bf[sizeof(bf)-1] = '\0';
1395 			slen = strlen(bf);
1396 			if (left < slen + 1)
1397 				break;
1398 			/* +1 to copy out the trailing NUL byte */
1399 			v->vfs_refcount++;
1400 			mutex_exit(&vfs_list_lock);
1401 			error = copyout(bf, where, slen + 1);
1402 			mutex_enter(&vfs_list_lock);
1403 			v->vfs_refcount--;
1404 			if (error)
1405 				break;
1406 			where += slen;
1407 			needed += slen;
1408 			left -= slen;
1409 		}
1410 	}
1411 	mutex_exit(&vfs_list_lock);
1412 	*oldlenp = needed;
1413 	return (error);
1414 }
1415 
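/*
 * Illustrative sketch, excluded from the build: the two-pass pattern the
 * sysctl handler above follows -- with no output buffer it only reports
 * how much space the space-separated list needs, otherwise it emits as
 * much as fits.  Userland model with hypothetical names; the real
 * handler also juggles vfs_list_lock and copyout().
 */
#ifdef notdef
#include <stddef.h>
#include <string.h>

/*
 * Write "name1 name2 ..." into buf (when buf is non-NULL) and return
 * the number of bytes needed, including the terminating NUL.
 */
static size_t
join_names(const char *const names[], size_t n, char *buf, size_t buflen)
{
	size_t needed = 1;		/* terminating NUL */
	size_t off = 0;
	size_t i;

	for (i = 0; i < n; i++) {
		size_t slen = strlen(names[i]) + (i > 0 ? 1 : 0);

		needed += slen;
		if (buf == NULL || off + slen + 1 > buflen)
			continue;	/* sizing pass, or out of room */
		if (i > 0)
			buf[off++] = ' ';
		memcpy(buf + off, names[i], strlen(names[i]) + 1);
		off += strlen(names[i]);
	}
	return needed;
}
#endif /* notdef */
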
1416 /*
1417  * Top level filesystem related information gathering.
1418  */
1419 SYSCTL_SETUP(sysctl_vfs_setup, "sysctl vfs subtree setup")
1420 {
1421 	sysctl_createv(clog, 0, NULL, NULL,
1422 		       CTLFLAG_PERMANENT,
1423 		       CTLTYPE_NODE, "vfs", NULL,
1424 		       NULL, 0, NULL, 0,
1425 		       CTL_VFS, CTL_EOL);
1426 	sysctl_createv(clog, 0, NULL, NULL,
1427 		       CTLFLAG_PERMANENT,
1428 		       CTLTYPE_NODE, "generic",
1429 		       SYSCTL_DESCR("Non-specific vfs related information"),
1430 		       NULL, 0, NULL, 0,
1431 		       CTL_VFS, VFS_GENERIC, CTL_EOL);
1432 	sysctl_createv(clog, 0, NULL, NULL,
1433 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1434 		       CTLTYPE_INT, "usermount",
1435 		       SYSCTL_DESCR("Whether unprivileged users may mount "
1436 				    "filesystems"),
1437 		       NULL, 0, &dovfsusermount, 0,
1438 		       CTL_VFS, VFS_GENERIC, VFS_USERMOUNT, CTL_EOL);
1439 	sysctl_createv(clog, 0, NULL, NULL,
1440 		       CTLFLAG_PERMANENT,
1441 		       CTLTYPE_STRING, "fstypes",
1442 		       SYSCTL_DESCR("List of file systems present"),
1443 		       sysctl_vfs_generic_fstypes, 0, NULL, 0,
1444 		       CTL_VFS, VFS_GENERIC, CTL_CREATE, CTL_EOL);
1445 	sysctl_createv(clog, 0, NULL, NULL,
1446 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1447 		       CTLTYPE_INT, "magiclinks",
1448 		       SYSCTL_DESCR("Whether \"magic\" symlinks are expanded"),
1449 		       NULL, 0, &vfs_magiclinks, 0,
1450 		       CTL_VFS, VFS_GENERIC, VFS_MAGICLINKS, CTL_EOL);
1451 }
1452 
1453 
1454 int kinfo_vdebug = 1;
1455 int kinfo_vgetfailed;
1456 #define KINFO_VNODESLOP	10
1457 /*
1458  * Dump vnode list (via sysctl).
1459  * Copyout address of vnode followed by vnode.
1460  */
1461 /* ARGSUSED */
1462 int
1463 sysctl_kern_vnode(SYSCTLFN_ARGS)
1464 {
1465 	char *where = oldp;
1466 	size_t *sizep = oldlenp;
1467 	struct mount *mp, *nmp;
1468 	struct vnode *vp;
1469 	char *bp = where, *savebp;
1470 	char *ewhere;
1471 	int error;
1472 
1473 	if (namelen != 0)
1474 		return (EOPNOTSUPP);
1475 	if (newp != NULL)
1476 		return (EPERM);
1477 
1478 #define VPTRSZ	sizeof(struct vnode *)
1479 #define VNODESZ	sizeof(struct vnode)
1480 	if (where == NULL) {
1481 		*sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
1482 		return (0);
1483 	}
1484 	ewhere = where + *sizep;
1485 
1486 	mutex_enter(&mountlist_lock);
1487 	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
1488 	     mp = nmp) {
1489 		if (vfs_busy(mp, LK_NOWAIT, &mountlist_lock)) {
1490 			nmp = CIRCLEQ_NEXT(mp, mnt_list);
1491 			continue;
1492 		}
1493 		savebp = bp;
1494 again:
1495 		simple_lock(&mntvnode_slock);
1496 		TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
1497 			/*
1498 			 * Check that the vp is still associated with
1499 			 * this filesystem.  RACE: could have been
1500 			 * recycled onto the same filesystem.
1501 			 */
1502 			if (vp->v_mount != mp) {
1503 				simple_unlock(&mntvnode_slock);
1504 				if (kinfo_vdebug)
1505 					printf("kinfo: vp changed\n");
1506 				bp = savebp;
1507 				goto again;
1508 			}
1509 			if (bp + VPTRSZ + VNODESZ > ewhere) {
1510 				simple_unlock(&mntvnode_slock);
1511 				*sizep = bp - where;
1512 				return (ENOMEM);
1513 			}
1514 			simple_unlock(&mntvnode_slock);
1515 			if ((error = copyout((void *)&vp, bp, VPTRSZ)) ||
1516 			   (error = copyout((void *)vp, bp + VPTRSZ, VNODESZ)))
1517 				return (error);
1518 			bp += VPTRSZ + VNODESZ;
1519 			simple_lock(&mntvnode_slock);
1520 		}
1521 		simple_unlock(&mntvnode_slock);
1522 		mutex_enter(&mountlist_lock);
1523 		nmp = CIRCLEQ_NEXT(mp, mnt_list);
1524 		vfs_unbusy(mp);
1525 	}
1526 	mutex_exit(&mountlist_lock);
1527 
1528 	*sizep = bp - where;
1529 	return (0);
1530 }
1531 
1532 /*
1533  * Check to see if a filesystem is mounted on a block device.
1534  */
1535 int
1536 vfs_mountedon(struct vnode *vp)
1537 {
1538 	struct vnode *vq;
1539 	int error = 0;
1540 
1541 	if (vp->v_type != VBLK)
1542 		return ENOTBLK;
1543 	if (vp->v_specmountpoint != NULL)
1544 		return (EBUSY);
1545 	if (vp->v_iflag & VI_ALIASED) {
1546 		simple_lock(&spechash_slock);
1547 		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1548 			if (vq->v_rdev != vp->v_rdev ||
1549 			    vq->v_type != vp->v_type)
1550 				continue;
1551 			if (vq->v_specmountpoint != NULL) {
1552 				error = EBUSY;
1553 				break;
1554 			}
1555 		}
1556 		simple_unlock(&spechash_slock);
1557 	}
1558 	return (error);
1559 }
1560 
1561 /*
1562  * Unmount all file systems.
1563  * We traverse the list in reverse order under the assumption that doing so
1564  * will avoid needing to worry about dependencies.
1565  */
1566 void
1567 vfs_unmountall(struct lwp *l)
1568 {
1569 	struct mount *mp, *nmp;
1570 	int allerror, error;
1571 
1572 	printf("unmounting file systems...");
1573 	for (allerror = 0,
1574 	     mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
1575 		nmp = mp->mnt_list.cqe_prev;
1576 #ifdef DEBUG
1577 		printf("\nunmounting %s (%s)...",
1578 		    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname);
1579 #endif
1580 		/*
1581 		 * XXX Freeze syncer.  Must do this before locking the
1582 		 * mount point.  See dounmount() for details.
1583 		 */
1584 		mutex_enter(&syncer_mutex);
1585 		if (vfs_busy(mp, 0, 0)) {
1586 			mutex_exit(&syncer_mutex);
1587 			continue;
1588 		}
1589 		if ((error = dounmount(mp, MNT_FORCE, l)) != 0) {
1590 			printf("unmount of %s failed with error %d\n",
1591 			    mp->mnt_stat.f_mntonname, error);
1592 			allerror = 1;
1593 		}
1594 	}
1595 	printf(" done\n");
1596 	if (allerror)
1597 		printf("WARNING: some file systems would not unmount\n");
1598 }
1599 
1600 extern struct simplelock bqueue_slock; /* XXX */
1601 
1602 /*
1603  * Sync and unmount file systems before shutting down.
1604  */
1605 void
1606 vfs_shutdown(void)
1607 {
1608 	struct lwp *l;
1609 
1610 	/* XXX we're certainly not running in lwp0's context! */
1611 	l = curlwp;
1612 	if (l == NULL)
1613 		l = &lwp0;
1614 
1615 	printf("syncing disks... ");
1616 
1617 	/* remove user processes from run queue */
1618 	suspendsched();
1619 	(void) spl0();
1620 
1621 	/* avoid coming back this way again if we panic. */
1622 	doing_shutdown = 1;
1623 
1624 	sys_sync(l, NULL, NULL);
1625 
1626 	/* Wait for sync to finish. */
1627 	if (buf_syncwait() != 0) {
1628 #if defined(DDB) && defined(DEBUG_HALT_BUSY)
1629 		Debugger();
1630 #endif
1631 		printf("giving up\n");
1632 		return;
1633 	} else
1634 		printf("done\n");
1635 
1636 	/*
1637 	 * If we've panic'd, don't make the situation potentially
1638 	 * worse by unmounting the file systems.
1639 	 */
1640 	if (panicstr != NULL)
1641 		return;
1642 
1643 	/* Release inodes held by texts before update. */
1644 #ifdef notdef
1645 	vnshutdown();
1646 #endif
1647 	/* Unmount file systems. */
1648 	vfs_unmountall(l);
1649 }
1650 
1651 /*
1652  * Mount the root file system.  If the operator didn't specify a
1653  * file system to use, try all possible file systems until one
1654  * succeeds.
1655  */
1656 int
1657 vfs_mountroot(void)
1658 {
1659 	struct vfsops *v;
1660 	int error = ENODEV;
1661 
1662 	if (root_device == NULL)
1663 		panic("vfs_mountroot: root device unknown");
1664 
1665 	switch (device_class(root_device)) {
1666 	case DV_IFNET:
1667 		if (rootdev != NODEV)
1668 			panic("vfs_mountroot: rootdev set for DV_IFNET "
1669 			    "(0x%08x -> %d,%d)", rootdev,
1670 			    major(rootdev), minor(rootdev));
1671 		break;
1672 
1673 	case DV_DISK:
1674 		if (rootdev == NODEV)
1675 			panic("vfs_mountroot: rootdev not set for DV_DISK");
1676 	        if (bdevvp(rootdev, &rootvp))
1677 	                panic("vfs_mountroot: can't get vnode for rootdev");
1678 		error = VOP_OPEN(rootvp, FREAD, FSCRED, curlwp);
1679 		if (error) {
1680 			printf("vfs_mountroot: can't open root device\n");
1681 			return (error);
1682 		}
1683 		break;
1684 
1685 	default:
1686 		printf("%s: inappropriate for root file system\n",
1687 		    root_device->dv_xname);
1688 		return (ENODEV);
1689 	}
1690 
1691 	/*
1692 	 * If user specified a file system, use it.
1693 	 */
1694 	if (mountroot != NULL) {
1695 		error = (*mountroot)();
1696 		goto done;
1697 	}
1698 
1699 	/*
1700 	 * Try each file system currently configured into the kernel.
1701 	 */
1702 	mutex_enter(&vfs_list_lock);
1703 	LIST_FOREACH(v, &vfs_list, vfs_list) {
1704 		if (v->vfs_mountroot == NULL)
1705 			continue;
1706 #ifdef DEBUG
1707 		aprint_normal("mountroot: trying %s...\n", v->vfs_name);
1708 #endif
1709 		v->vfs_refcount++;
1710 		mutex_exit(&vfs_list_lock);
1711 		error = (*v->vfs_mountroot)();
1712 		mutex_enter(&vfs_list_lock);
1713 		v->vfs_refcount--;
1714 		if (!error) {
1715 			aprint_normal("root file system type: %s\n",
1716 			    v->vfs_name);
1717 			break;
1718 		}
1719 	}
1720 	mutex_exit(&vfs_list_lock);
1721 
1722 	if (v == NULL) {
1723 		printf("no file system for %s", root_device->dv_xname);
1724 		if (device_class(root_device) == DV_DISK)
1725 			printf(" (dev 0x%x)", rootdev);
1726 		printf("\n");
1727 		error = EFTYPE;
1728 	}
1729 
1730 done:
1731 	if (error && device_class(root_device) == DV_DISK) {
1732 		VOP_CLOSE(rootvp, FREAD, FSCRED, curlwp);
1733 		vrele(rootvp);
1734 	}
1735 	return (error);
1736 }
1737