xref: /netbsd-src/sys/kern/vfs_subr.c (revision 001c68bd94f75ce9270b69227c4199fbf34ee396)
1 /*	$NetBSD: vfs_subr.c,v 1.201 2003/06/29 22:31:33 fvdl Exp $	*/
2 
3 /*-
4  * Copyright (c) 1997, 1998 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9  * NASA Ames Research Center.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. All advertising materials mentioning features or use of this software
20  *    must display the following acknowledgement:
21  *	This product includes software developed by the NetBSD
22  *	Foundation, Inc. and its contributors.
23  * 4. Neither the name of The NetBSD Foundation nor the names of its
24  *    contributors may be used to endorse or promote products derived
25  *    from this software without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
28  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
29  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
31  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37  * POSSIBILITY OF SUCH DAMAGE.
38  */
39 
40 /*
41  * Copyright (c) 1989, 1993
42  *	The Regents of the University of California.  All rights reserved.
43  * (c) UNIX System Laboratories, Inc.
44  * All or some portions of this file are derived from material licensed
45  * to the University of California by American Telephone and Telegraph
46  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
47  * the permission of UNIX System Laboratories, Inc.
48  *
49  * Redistribution and use in source and binary forms, with or without
50  * modification, are permitted provided that the following conditions
51  * are met:
52  * 1. Redistributions of source code must retain the above copyright
53  *    notice, this list of conditions and the following disclaimer.
54  * 2. Redistributions in binary form must reproduce the above copyright
55  *    notice, this list of conditions and the following disclaimer in the
56  *    documentation and/or other materials provided with the distribution.
57  * 3. All advertising materials mentioning features or use of this software
58  *    must display the following acknowledgement:
59  *	This product includes software developed by the University of
60  *	California, Berkeley and its contributors.
61  * 4. Neither the name of the University nor the names of its contributors
62  *    may be used to endorse or promote products derived from this software
63  *    without specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
66  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
69  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
70  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
71  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
72  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
73  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
74  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
75  * SUCH DAMAGE.
76  *
77  *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
78  */
79 
80 /*
81  * External virtual filesystem routines
82  */
83 
84 #include <sys/cdefs.h>
85 __KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.201 2003/06/29 22:31:33 fvdl Exp $");
86 
87 #include "opt_inet.h"
88 #include "opt_ddb.h"
89 #include "opt_compat_netbsd.h"
90 #include "opt_compat_43.h"
91 
92 #include <sys/param.h>
93 #include <sys/systm.h>
94 #include <sys/proc.h>
95 #include <sys/kernel.h>
96 #include <sys/mount.h>
97 #include <sys/time.h>
98 #include <sys/event.h>
99 #include <sys/fcntl.h>
100 #include <sys/vnode.h>
101 #include <sys/stat.h>
102 #include <sys/namei.h>
103 #include <sys/ucred.h>
104 #include <sys/buf.h>
105 #include <sys/errno.h>
106 #include <sys/malloc.h>
107 #include <sys/domain.h>
108 #include <sys/mbuf.h>
109 #include <sys/sa.h>
110 #include <sys/syscallargs.h>
111 #include <sys/device.h>
112 #include <sys/dirent.h>
113 #include <sys/filedesc.h>
114 
115 #include <miscfs/specfs/specdev.h>
116 #include <miscfs/genfs/genfs.h>
117 #include <miscfs/syncfs/syncfs.h>
118 
119 #include <netinet/in.h>
120 
121 #include <uvm/uvm.h>
122 #include <uvm/uvm_ddb.h>
123 
126 #include <sys/sysctl.h>
127 
128 const enum vtype iftovt_tab[16] = {
129 	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
130 	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
131 };
132 const int	vttoif_tab[9] = {
133 	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
134 	S_IFSOCK, S_IFIFO, S_IFMT,
135 };
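
/*
 * The tables above are normally consulted through the IFTOVT() and
 * VTTOIF() macros from <sys/vnode.h>.  A minimal sketch of the mapping:
 *
 *	enum vtype t = IFTOVT(S_IFDIR | 0755);	t == VDIR
 *	int ifmt = VTTOIF(VREG);		ifmt == S_IFREG
 */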
136 
137 int doforce = 1;		/* 1 => permit forcible unmounting */
138 int prtactive = 0;		/* 1 => print out reclaim of active vnodes */
139 
140 extern int dovfsusermount;	/* 1 => permit any user to mount filesystems */
141 
142 /*
143  * Insq/Remq for the vnode usage lists.
144  */
145 #define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
146 #define	bufremvn(bp) {							\
147 	LIST_REMOVE(bp, b_vnbufs);					\
148 	(bp)->b_vnbufs.le_next = NOLIST;				\
149 }
150 /* TAILQ_HEAD(freelst, vnode) vnode_free_list =	vnode free list (in vnode.h) */
151 struct freelst vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list);
152 struct freelst vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list);
153 
154 struct mntlist mountlist =			/* mounted filesystem list */
155     CIRCLEQ_HEAD_INITIALIZER(mountlist);
156 struct vfs_list_head vfs_list =			/* vfs list */
157     LIST_HEAD_INITIALIZER(vfs_list);
158 
159 struct nfs_public nfs_pub;			/* publicly exported FS */
160 
161 struct simplelock mountlist_slock = SIMPLELOCK_INITIALIZER;
162 static struct simplelock mntid_slock = SIMPLELOCK_INITIALIZER;
163 struct simplelock mntvnode_slock = SIMPLELOCK_INITIALIZER;
164 struct simplelock vnode_free_list_slock = SIMPLELOCK_INITIALIZER;
165 struct simplelock spechash_slock = SIMPLELOCK_INITIALIZER;
166 
167 /* XXX - gross; single global lock to protect v_numoutput */
168 struct simplelock global_v_numoutput_slock = SIMPLELOCK_INITIALIZER;
169 
170 /*
171  * These define the root filesystem and device.
172  */
173 struct mount *rootfs;
174 struct vnode *rootvnode;
175 struct device *root_device;			/* root device */
176 
177 struct pool vnode_pool;				/* memory pool for vnodes */
178 
179 MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes");
180 
181 /*
182  * Local declarations.
183  */
184 void insmntque __P((struct vnode *, struct mount *));
185 int getdevvp __P((dev_t, struct vnode **, enum vtype));
186 void vgoneall __P((struct vnode *));
187 
188 void vclean(struct vnode *, int, struct proc *);
189 
190 static int vfs_hang_addrlist __P((struct mount *, struct netexport *,
191 				  struct export_args *));
192 static int vfs_free_netcred __P((struct radix_node *, void *));
193 static void vfs_free_addrlist __P((struct netexport *));
194 
195 #ifdef DEBUG
196 void printlockedvnodes __P((void));
197 #endif
198 
199 /*
200  * Initialize the vnode management data structures.
201  */
202 void
203 vntblinit()
204 {
205 
206 	pool_init(&vnode_pool, sizeof(struct vnode), 0, 0, 0, "vnodepl",
207 	    &pool_allocator_nointr);
208 
209 	/*
210 	 * Initialize the filesystem syncer.
211 	 */
212 	vn_initialize_syncerd();
213 }
214 
215 /*
216  * Mark a mount point as busy. Used to synchronize access and to delay
217  * unmounting. Interlock is not released on failure.
218  */
219 int
220 vfs_busy(mp, flags, interlkp)
221 	struct mount *mp;
222 	int flags;
223 	struct simplelock *interlkp;
224 {
225 	int lkflags;
226 
227 	while (mp->mnt_flag & MNT_UNMOUNT) {
228 		int gone;
229 
230 		if (flags & LK_NOWAIT)
231 			return (ENOENT);
232 		if ((flags & LK_RECURSEFAIL) && mp->mnt_unmounter != NULL
233 		    && mp->mnt_unmounter == curproc)
234 			return (EDEADLK);
235 		if (interlkp)
236 			simple_unlock(interlkp);
237 		/*
238 		 * Since all busy locks are shared except the exclusive
239 		 * lock granted when unmounting, the only place that a
240 		 * wakeup needs to be done is at the release of the
241 		 * exclusive lock at the end of dounmount.
242 		 *
243 		 * XXX MP: add spinlock protecting mnt_wcnt here once you
244 		 * can atomically unlock-and-sleep.
245 		 */
246 		mp->mnt_wcnt++;
247 		tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
248 		mp->mnt_wcnt--;
249 		gone = mp->mnt_flag & MNT_GONE;
250 
251 		if (mp->mnt_wcnt == 0)
252 			wakeup(&mp->mnt_wcnt);
253 		if (interlkp)
254 			simple_lock(interlkp);
255 		if (gone)
256 			return (ENOENT);
257 	}
258 	lkflags = LK_SHARED;
259 	if (interlkp)
260 		lkflags |= LK_INTERLOCK;
261 	if (lockmgr(&mp->mnt_lock, lkflags, interlkp))
262 		panic("vfs_busy: unexpected lock failure");
263 	return (0);
264 }
265 
266 /*
267  * Free a busy filesystem.
268  */
269 void
270 vfs_unbusy(mp)
271 	struct mount *mp;
272 {
273 
274 	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL);
275 }
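
/*
 * The two routines above pair up as follows when walking the mount
 * list (a sketch; printlockedvnodes() below is a real in-file caller):
 *
 *	if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock) == 0) {
 *		... examine mp->mnt_vnodelist ...
 *		vfs_unbusy(mp);
 *	}
 */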
276 
277 /*
278  * Lookup a filesystem type, and if found allocate and initialize
279  * a mount structure for it.
280  *
281  * Devname is usually updated by mount(8) after booting.
282  */
283 int
284 vfs_rootmountalloc(fstypename, devname, mpp)
285 	char *fstypename;
286 	char *devname;
287 	struct mount **mpp;
288 {
289 	struct vfsops *vfsp = NULL;
290 	struct mount *mp;
291 
292 	LIST_FOREACH(vfsp, &vfs_list, vfs_list)
293 		if (!strncmp(vfsp->vfs_name, fstypename, MFSNAMELEN))
294 			break;
295 
296 	if (vfsp == NULL)
297 		return (ENODEV);
298 	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
299 	memset((char *)mp, 0, (u_long)sizeof(struct mount));
300 	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
301 	(void)vfs_busy(mp, LK_NOWAIT, 0);
302 	LIST_INIT(&mp->mnt_vnodelist);
303 	mp->mnt_op = vfsp;
304 	mp->mnt_flag = MNT_RDONLY;
305 	mp->mnt_vnodecovered = NULLVP;
306 	vfsp->vfs_refcount++;
307 	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name, MFSNAMELEN);
308 	mp->mnt_stat.f_mntonname[0] = '/';
309 	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
310 	*mpp = mp;
311 	return (0);
312 }
313 
314 /*
315  * Lookup a mount point by filesystem identifier.
316  */
317 struct mount *
318 vfs_getvfs(fsid)
319 	fsid_t *fsid;
320 {
321 	struct mount *mp;
322 
323 	simple_lock(&mountlist_slock);
324 	CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) {
325 		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
326 		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
327 			simple_unlock(&mountlist_slock);
328 			return (mp);
329 		}
330 	}
331 	simple_unlock(&mountlist_slock);
332 	return ((struct mount *)0);
333 }
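
/*
 * File handle translation is a typical caller of vfs_getvfs()
 * (a sketch, along the lines of the fhopen/fhstat syscalls):
 *
 *	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
 *		return (ESTALE);
 */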
334 
335 /*
336  * Get a new unique fsid
337  */
338 void
339 vfs_getnewfsid(mp)
340 	struct mount *mp;
341 {
342 	static u_short xxxfs_mntid;
343 	fsid_t tfsid;
344 	int mtype;
345 
346 	simple_lock(&mntid_slock);
347 	mtype = makefstype(mp->mnt_op->vfs_name);
348 	mp->mnt_stat.f_fsid.val[0] = makedev(mtype, 0);
349 	mp->mnt_stat.f_fsid.val[1] = mtype;
350 	if (xxxfs_mntid == 0)
351 		++xxxfs_mntid;
352 	tfsid.val[0] = makedev(mtype & 0xff, xxxfs_mntid);
353 	tfsid.val[1] = mtype;
354 	if (!CIRCLEQ_EMPTY(&mountlist)) {
355 		while (vfs_getvfs(&tfsid)) {
356 			tfsid.val[0]++;
357 			xxxfs_mntid++;
358 		}
359 	}
360 	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
361 	simple_unlock(&mntid_slock);
362 }
363 
364 /*
365  * Make a 'unique' number from a mount type name.
366  */
367 long
368 makefstype(type)
369 	const char *type;
370 {
371 	long rv;
372 
373 	for (rv = 0; *type; type++) {
374 		rv <<= 2;
375 		rv ^= *type;
376 	}
377 	return rv;
378 }
379 
380 
381 /*
382  * Set vnode attributes to VNOVAL
383  */
384 void
385 vattr_null(vap)
386 	struct vattr *vap;
387 {
388 
389 	vap->va_type = VNON;
390 
391 	/*
392 	 * Assign each member individually, so that this remains safe
393 	 * even if the size and signedness of the members vary.
394 	 */
395 	vap->va_mode = VNOVAL;
396 	vap->va_nlink = VNOVAL;
397 	vap->va_uid = VNOVAL;
398 	vap->va_gid = VNOVAL;
399 	vap->va_fsid = VNOVAL;
400 	vap->va_fileid = VNOVAL;
401 	vap->va_size = VNOVAL;
402 	vap->va_blocksize = VNOVAL;
403 	vap->va_atime.tv_sec =
404 	    vap->va_mtime.tv_sec =
405 	    vap->va_ctime.tv_sec =
406 	    vap->va_birthtime.tv_sec = VNOVAL;
407 	vap->va_atime.tv_nsec =
408 	    vap->va_mtime.tv_nsec =
409 	    vap->va_ctime.tv_nsec =
410 	    vap->va_birthtime.tv_nsec = VNOVAL;
411 	vap->va_gen = VNOVAL;
412 	vap->va_flags = VNOVAL;
413 	vap->va_rdev = VNOVAL;
414 	vap->va_bytes = VNOVAL;
415 	vap->va_vaflags = 0;
416 }
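
/*
 * Callers usually reset a vattr and then fill in only the fields they
 * mean to change; a sketch of a typical VOP_SETATTR() setup, using the
 * VATTR_NULL() macro from <sys/vnode.h>:
 *
 *	struct vattr va;
 *
 *	VATTR_NULL(&va);
 *	va.va_size = newsize;
 *	error = VOP_SETATTR(vp, &va, cred, p);
 */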
417 
418 /*
419  * Routines having to do with the management of the vnode table.
420  */
421 extern int (**dead_vnodeop_p) __P((void *));
422 long numvnodes;
423 
424 /*
425  * Return the next vnode from the free list.
426  */
427 int
428 getnewvnode(tag, mp, vops, vpp)
429 	enum vtagtype tag;
430 	struct mount *mp;
431 	int (**vops) __P((void *));
432 	struct vnode **vpp;
433 {
434 	extern struct uvm_pagerops uvm_vnodeops;
435 	struct uvm_object *uobj;
436 	struct proc *p = curproc;	/* XXX */
437 	struct freelst *listhd;
438 	static int toggle;
439 	struct vnode *vp;
440 	int error = 0, tryalloc;
441 
442  try_again:
443 	if (mp) {
444 		/*
445 		 * Mark filesystem busy while we're creating a vnode.
446 		 * If unmount is in progress, this will wait; if the
447 		 * unmount succeeds (only if umount -f), this will
448 		 * return an error.  If the unmount fails, we'll keep
449 		 * going afterwards.
450 		 * (This puts the per-mount vnode list logically under
451 		 * the protection of the vfs_busy lock).
452 		 */
453 		error = vfs_busy(mp, LK_RECURSEFAIL, 0);
454 		if (error && error != EDEADLK)
455 			return error;
456 	}
457 
458 	/*
459 	 * We must choose whether to allocate a new vnode or recycle an
460 	 * existing one. The criterion for allocating a new one is that
461 	 * the total number of vnodes is less than the number desired or
462 	 * there are no vnodes on either free list. Generally we only
463 	 * want to recycle vnodes that have no buffers associated with
464 	 * them, so we look first on the vnode_free_list. If it is empty,
465 	 * we next consider vnodes with referencing buffers on the
466 	 * vnode_hold_list. The toggle ensures that half the time we
467 	 * will recycle a vnode from the vnode_hold_list, and half the
468 	 * time we will allocate a new one unless the list has grown to
469 	 * twice the desired size. We are reluctant to recycle vnodes from
470 	 * the vnode_hold_list because that loses the identity of all
471 	 * their referencing buffers.
472 	 */
473 
474 	vp = NULL;
475 
476 	simple_lock(&vnode_free_list_slock);
477 
478 	toggle ^= 1;
479 	if (numvnodes > 2 * desiredvnodes)
480 		toggle = 0;
481 
482 	tryalloc = numvnodes < desiredvnodes ||
483 	    (TAILQ_FIRST(&vnode_free_list) == NULL &&
484 	     (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle));
485 
486 	if (tryalloc &&
487 	    (vp = pool_get(&vnode_pool, PR_NOWAIT)) != NULL) {
488 		simple_unlock(&vnode_free_list_slock);
489 		memset(vp, 0, sizeof(*vp));
490 		simple_lock_init(&vp->v_interlock);
491 		uobj = &vp->v_uobj;
492 		uobj->pgops = &uvm_vnodeops;
493 		uobj->uo_npages = 0;
494 		TAILQ_INIT(&uobj->memq);
495 		numvnodes++;
496 	} else {
497 		if ((vp = TAILQ_FIRST(listhd = &vnode_free_list)) == NULL)
498 			vp = TAILQ_FIRST(listhd = &vnode_hold_list);
499 		for (; vp != NULL; vp = TAILQ_NEXT(vp, v_freelist)) {
500 			if (simple_lock_try(&vp->v_interlock)) {
501 				if ((vp->v_flag & VLAYER) == 0) {
502 					break;
503 				}
504 				if (VOP_ISLOCKED(vp) == 0)
505 					break;
506 				else
507 					simple_unlock(&vp->v_interlock);
508 			}
509 		}
510 		/*
511 		 * Unless this is a bad time of the month, at most
512 		 * the first NCPUS items on the free list are
513 		 * locked, so this is close enough to being empty.
514 		 */
515 		if (vp == NULLVP) {
516 			simple_unlock(&vnode_free_list_slock);
517 			if (mp && error != EDEADLK)
518 				vfs_unbusy(mp);
519 			if (tryalloc) {
520 				printf("WARNING: unable to allocate new "
521 				    "vnode, retrying...\n");
522 				(void) tsleep(&lbolt, PRIBIO, "newvn", hz);
523 				goto try_again;
524 			}
525 			tablefull("vnode", "increase kern.maxvnodes or NVNODE");
526 			*vpp = 0;
527 			return (ENFILE);
528 		}
529 		if (vp->v_usecount)
530 			panic("free vnode isn't, vp %p", vp);
531 		TAILQ_REMOVE(listhd, vp, v_freelist);
532 		/* see comment on why 0xdeadb is set at end of vgone (below) */
533 		vp->v_freelist.tqe_prev = (struct vnode **)0xdeadb;
534 		simple_unlock(&vnode_free_list_slock);
535 		vp->v_lease = NULL;
536 
537 		if (vp->v_type != VBAD)
538 			vgonel(vp, p);
539 		else
540 			simple_unlock(&vp->v_interlock);
541 #ifdef DIAGNOSTIC
542 		if (vp->v_data || vp->v_uobj.uo_npages ||
543 		    TAILQ_FIRST(&vp->v_uobj.memq))
544 			panic("cleaned vnode isn't, vp %p", vp);
545 		if (vp->v_numoutput)
546 			panic("clean vnode has pending I/O's, vp %p", vp);
547 #endif
548 		KASSERT((vp->v_flag & VONWORKLST) == 0);
549 		vp->v_flag = 0;
550 		vp->v_socket = NULL;
551 #ifdef VERIFIED_EXEC
552 		vp->fp_status = FINGERPRINT_INVALID;
553 #endif
554 	}
555 	vp->v_type = VNON;
556 	vp->v_vnlock = &vp->v_lock;
557 	lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
558 	cache_purge(vp);
559 	vp->v_tag = tag;
560 	vp->v_op = vops;
561 	insmntque(vp, mp);
562 	*vpp = vp;
563 	vp->v_usecount = 1;
564 	vp->v_data = 0;
565 	simple_lock_init(&vp->v_uobj.vmobjlock);
566 
567 	/*
568 	 * initialize uvm_object within vnode.
569 	 */
570 
571 	uobj = &vp->v_uobj;
572 	KASSERT(uobj->pgops == &uvm_vnodeops);
573 	KASSERT(uobj->uo_npages == 0);
574 	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
575 	vp->v_size = VSIZENOTSET;
576 
577 	if (mp && error != EDEADLK)
578 		vfs_unbusy(mp);
579 	return (0);
580 }
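
/*
 * A minimal caller sketch for getnewvnode(); getdevvp() below is a
 * real in-file user of the same shape:
 *
 *	error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp);
 *	if (error)
 *		return (error);
 */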
581 
582 /*
583  * This is really just the reverse of getnewvnode(). Needed for
584  * VFS_VGET functions that may need to push back a vnode in case
585  * of a locking race.
586  */
587 void
588 ungetnewvnode(vp)
589 	struct vnode *vp;
590 {
591 #ifdef DIAGNOSTIC
592 	if (vp->v_usecount != 1)
593 		panic("ungetnewvnode: busy vnode");
594 #endif
595 	vp->v_usecount--;
596 	insmntque(vp, NULL);
597 	vp->v_type = VBAD;
598 
599 	simple_lock(&vp->v_interlock);
600 	/*
601 	 * Insert at head of LRU list
602 	 */
603 	simple_lock(&vnode_free_list_slock);
604 	if (vp->v_holdcnt > 0)
605 		TAILQ_INSERT_HEAD(&vnode_hold_list, vp, v_freelist);
606 	else
607 		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
608 	simple_unlock(&vnode_free_list_slock);
609 	simple_unlock(&vp->v_interlock);
610 }
611 
612 /*
613  * Move a vnode from one mount queue to another.
614  */
615 void
616 insmntque(vp, mp)
617 	struct vnode *vp;
618 	struct mount *mp;
619 {
620 
621 #ifdef DIAGNOSTIC
622 	if ((mp != NULL) &&
623 	    (mp->mnt_flag & MNT_UNMOUNT) &&
624 	    !(mp->mnt_flag & MNT_SOFTDEP) &&
625 	    vp->v_tag != VT_VFS) {
626 		panic("insmntque into dying filesystem");
627 	}
628 #endif
629 
630 	simple_lock(&mntvnode_slock);
631 	/*
632 	 * Delete from old mount point vnode list, if on one.
633 	 */
634 	if (vp->v_mount != NULL)
635 		LIST_REMOVE(vp, v_mntvnodes);
636 	/*
637 	 * Insert into list of vnodes for the new mount point, if available.
638 	 */
639 	if ((vp->v_mount = mp) != NULL)
640 		LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
641 	simple_unlock(&mntvnode_slock);
642 }
643 
644 /*
645  * Update outstanding I/O count and do wakeup if requested.
646  */
647 void
648 vwakeup(bp)
649 	struct buf *bp;
650 {
651 	struct vnode *vp;
652 
653 	if ((vp = bp->b_vp) != NULL) {
654 		/* XXX global lock hack
655 		 * can't use v_interlock here since this is called
656 		 * in interrupt context from biodone().
657 		 */
658 		simple_lock(&global_v_numoutput_slock);
659 		if (--vp->v_numoutput < 0)
660 			panic("vwakeup: neg numoutput, vp %p", vp);
661 		if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) {
662 			vp->v_flag &= ~VBWAIT;
663 			wakeup((caddr_t)&vp->v_numoutput);
664 		}
665 		simple_unlock(&global_v_numoutput_slock);
666 	}
667 }
668 
669 /*
670  * Flush out and invalidate all buffers associated with a vnode.
671  * Called with the underlying vnode locked, which should prevent new dirty
672  * buffers from being queued.
673  */
674 int
675 vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
676 	struct vnode *vp;
677 	int flags;
678 	struct ucred *cred;
679 	struct proc *p;
680 	int slpflag, slptimeo;
681 {
682 	struct buf *bp, *nbp;
683 	int s, error;
684 	int flushflags = PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO |
685 		(flags & V_SAVE ? PGO_CLEANIT : 0);
686 
687 	/* XXXUBC this doesn't look at flags or slp* */
688 	simple_lock(&vp->v_interlock);
689 	error = VOP_PUTPAGES(vp, 0, 0, flushflags);
690 	if (error) {
691 		return error;
692 	}
693 
694 	if (flags & V_SAVE) {
695 		error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0, p);
696 		if (error)
697 		        return (error);
698 #ifdef DIAGNOSTIC
699 		s = splbio();
700 		if (vp->v_numoutput > 0 || !LIST_EMPTY(&vp->v_dirtyblkhd))
701 		        panic("vinvalbuf: dirty bufs, vp %p", vp);
702 		splx(s);
703 #endif
704 	}
705 
706 	s = splbio();
707 
708 restart:
709 	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
710 		nbp = LIST_NEXT(bp, b_vnbufs);
711 		simple_lock(&bp->b_interlock);
712 		if (bp->b_flags & B_BUSY) {
713 			bp->b_flags |= B_WANTED;
714 			error = ltsleep((caddr_t)bp,
715 				    slpflag | (PRIBIO + 1) | PNORELOCK,
716 				    "vinvalbuf", slptimeo, &bp->b_interlock);
717 			if (error) {
718 				splx(s);
719 				return (error);
720 			}
721 			goto restart;
722 		}
723 		bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
724 		simple_unlock(&bp->b_interlock);
725 		brelse(bp);
726 	}
727 
728 	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
729 		nbp = LIST_NEXT(bp, b_vnbufs);
730 		simple_lock(&bp->b_interlock);
731 		if (bp->b_flags & B_BUSY) {
732 			bp->b_flags |= B_WANTED;
733 			error = ltsleep((caddr_t)bp,
734 				    slpflag | (PRIBIO + 1) | PNORELOCK,
735 				    "vinvalbuf", slptimeo, &bp->b_interlock);
736 			if (error) {
737 				splx(s);
738 				return (error);
739 			}
740 			goto restart;
741 		}
742 		/*
743 		 * XXX Since there are no node locks for NFS, I believe
744 		 * there is a slight chance that a delayed write will
745 		 * occur while sleeping just above, so check for it.
746 		 */
747 		if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) {
748 #ifdef DEBUG
749 			printf("buffer still DELWRI\n");
750 #endif
751 			bp->b_flags |= B_BUSY | B_VFLUSH;
752 			simple_unlock(&bp->b_interlock);
753 			VOP_BWRITE(bp);
754 			goto restart;
755 		}
756 		bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
757 		simple_unlock(&bp->b_interlock);
758 		brelse(bp);
759 	}
760 
761 #ifdef DIAGNOSTIC
762 	if (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd))
763 		panic("vinvalbuf: flush failed, vp %p", vp);
764 #endif
765 
766 	splx(s);
767 
768 	return (0);
769 }
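
/*
 * vclean() below is an in-file caller of vinvalbuf(), flushing and
 * saving any dirty buffers before a vnode is reclaimed:
 *
 *	vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
 */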
770 
771 /*
772  * Destroy any in core blocks past the truncation length.
773  * Called with the underlying vnode locked, which should prevent new dirty
774  * buffers from being queued.
775  */
776 int
777 vtruncbuf(vp, lbn, slpflag, slptimeo)
778 	struct vnode *vp;
779 	daddr_t lbn;
780 	int slpflag, slptimeo;
781 {
782 	struct buf *bp, *nbp;
783 	int s, error;
784 	voff_t off;
785 
786 	off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift);
787 	simple_lock(&vp->v_interlock);
788 	error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO);
789 	if (error) {
790 		return error;
791 	}
792 
793 	s = splbio();
794 
795 restart:
796 	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
797 		nbp = LIST_NEXT(bp, b_vnbufs);
798 		if (bp->b_lblkno < lbn)
799 			continue;
800 		simple_lock(&bp->b_interlock);
801 		if (bp->b_flags & B_BUSY) {
802 			bp->b_flags |= B_WANTED;
803 			error = ltsleep(bp, slpflag | (PRIBIO + 1) | PNORELOCK,
804 			    "vtruncbuf", slptimeo, &bp->b_interlock);
805 			if (error) {
806 				splx(s);
807 				return (error);
808 			}
809 			goto restart;
810 		}
811 		bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
812 		simple_unlock(&bp->b_interlock);
813 		brelse(bp);
814 	}
815 
816 	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
817 		nbp = LIST_NEXT(bp, b_vnbufs);
818 		if (bp->b_lblkno < lbn)
819 			continue;
820 		simple_lock(&bp->b_interlock);
821 		if (bp->b_flags & B_BUSY) {
822 			bp->b_flags |= B_WANTED;
823 			error = ltsleep(bp, slpflag | (PRIBIO + 1) | PNORELOCK,
824 			    "vtruncbuf", slptimeo, &bp->b_interlock);
825 			if (error) {
826 				splx(s);
827 				return (error);
828 			}
829 			goto restart;
830 		}
831 		bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
832 		simple_unlock(&bp->b_interlock);
833 		brelse(bp);
834 	}
835 
836 	splx(s);
837 
838 	return (0);
839 }
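
/*
 * A sketch of the shape of a vtruncbuf() call, as made from a file
 * system truncate routine (e.g. ffs_truncate()):
 *
 *	error = vtruncbuf(ovp, lastblock + 1, 0, 0);
 */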
840 
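/*
 * Write out all dirty buffers associated with a vnode; if "sync" is
 * set, wait for the I/O to complete and retry until none remain.
 */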
841 void
842 vflushbuf(vp, sync)
843 	struct vnode *vp;
844 	int sync;
845 {
846 	struct buf *bp, *nbp;
847 	int flags = PGO_CLEANIT | PGO_ALLPAGES | (sync ? PGO_SYNCIO : 0);
848 	int s;
849 
850 	simple_lock(&vp->v_interlock);
851 	(void) VOP_PUTPAGES(vp, 0, 0, flags);
852 
853 loop:
854 	s = splbio();
855 	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
856 		nbp = LIST_NEXT(bp, b_vnbufs);
857 		simple_lock(&bp->b_interlock);
858 		if ((bp->b_flags & B_BUSY)) {
859 			simple_unlock(&bp->b_interlock);
860 			continue;
861 		}
862 		if ((bp->b_flags & B_DELWRI) == 0)
863 			panic("vflushbuf: not dirty, bp %p", bp);
864 		bp->b_flags |= B_BUSY | B_VFLUSH;
865 		simple_unlock(&bp->b_interlock);
866 		splx(s);
867 		/*
868 		 * Wait for I/O associated with indirect blocks to complete,
869 		 * since there is no way to quickly wait for them below.
870 		 */
871 		if (bp->b_vp == vp || sync == 0)
872 			(void) bawrite(bp);
873 		else
874 			(void) bwrite(bp);
875 		goto loop;
876 	}
877 	if (sync == 0) {
878 		splx(s);
879 		return;
880 	}
881 	simple_lock(&global_v_numoutput_slock);
882 	while (vp->v_numoutput) {
883 		vp->v_flag |= VBWAIT;
884 		ltsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "vflushbuf", 0,
885 			&global_v_numoutput_slock);
886 	}
887 	simple_unlock(&global_v_numoutput_slock);
888 	splx(s);
889 	if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
890 		vprint("vflushbuf: dirty", vp);
891 		goto loop;
892 	}
893 }
894 
895 /*
896  * Associate a buffer with a vnode.
897  */
898 void
899 bgetvp(vp, bp)
900 	struct vnode *vp;
901 	struct buf *bp;
902 {
903 	int s;
904 
905 	if (bp->b_vp)
906 		panic("bgetvp: not free, bp %p", bp);
907 	VHOLD(vp);
908 	s = splbio();
909 	bp->b_vp = vp;
910 	if (vp->v_type == VBLK || vp->v_type == VCHR)
911 		bp->b_dev = vp->v_rdev;
912 	else
913 		bp->b_dev = NODEV;
914 	/*
915 	 * Insert onto list for new vnode.
916 	 */
917 	bufinsvn(bp, &vp->v_cleanblkhd);
918 	splx(s);
919 }
920 
921 /*
922  * Disassociate a buffer from a vnode.
923  */
924 void
925 brelvp(bp)
926 	struct buf *bp;
927 {
928 	struct vnode *vp;
929 	int s;
930 
931 	if (bp->b_vp == NULL)
932 		panic("brelvp: vp NULL, bp %p", bp);
933 
934 	s = splbio();
935 	vp = bp->b_vp;
936 	/*
937 	 * Delete from old vnode list, if on one.
938 	 */
939 	if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
940 		bufremvn(bp);
941 
942 	if (TAILQ_EMPTY(&vp->v_uobj.memq) && (vp->v_flag & VONWORKLST) &&
943 	    LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
944 		vp->v_flag &= ~VONWORKLST;
945 		LIST_REMOVE(vp, v_synclist);
946 	}
947 
948 	bp->b_vp = NULL;
949 	HOLDRELE(vp);
950 	splx(s);
951 }
952 
953 /*
954  * Reassign a buffer from one vnode to another.
955  * Used to assign file specific control information
956  * (indirect blocks) to the vnode to which they belong.
957  *
958  * This function must be called at splbio().
959  */
960 void
961 reassignbuf(bp, newvp)
962 	struct buf *bp;
963 	struct vnode *newvp;
964 {
965 	struct buflists *listheadp;
966 	int delay;
967 
968 	/*
969 	 * Delete from old vnode list, if on one.
970 	 */
971 	if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
972 		bufremvn(bp);
973 	/*
974 	 * If dirty, put on list of dirty buffers;
975 	 * otherwise insert onto list of clean buffers.
976 	 */
977 	if ((bp->b_flags & B_DELWRI) == 0) {
978 		listheadp = &newvp->v_cleanblkhd;
979 		if (TAILQ_EMPTY(&newvp->v_uobj.memq) &&
980 		    (newvp->v_flag & VONWORKLST) &&
981 		    LIST_FIRST(&newvp->v_dirtyblkhd) == NULL) {
982 			newvp->v_flag &= ~VONWORKLST;
983 			LIST_REMOVE(newvp, v_synclist);
984 		}
985 	} else {
986 		listheadp = &newvp->v_dirtyblkhd;
987 		if ((newvp->v_flag & VONWORKLST) == 0) {
988 			switch (newvp->v_type) {
989 			case VDIR:
990 				delay = dirdelay;
991 				break;
992 			case VBLK:
993 				if (newvp->v_specmountpoint != NULL) {
994 					delay = metadelay;
995 					break;
996 				}
997 				/* fall through */
998 			default:
999 				delay = filedelay;
1000 				break;
1001 			}
1002 			if (!newvp->v_mount ||
1003 			    (newvp->v_mount->mnt_flag & MNT_ASYNC) == 0)
1004 				vn_syncer_add_to_worklist(newvp, delay);
1005 		}
1006 	}
1007 	bufinsvn(bp, listheadp);
1008 }
1009 
1010 /*
1011  * Create a vnode for a block device.
1012  * Used for root filesystem and swap areas.
1013  * Also used for memory file system special devices.
1014  */
1015 int
1016 bdevvp(dev, vpp)
1017 	dev_t dev;
1018 	struct vnode **vpp;
1019 {
1020 
1021 	return (getdevvp(dev, vpp, VBLK));
1022 }
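
/*
 * Root file system setup is the classic bdevvp() caller (a sketch):
 *
 *	if (bdevvp(rootdev, &rootvp))
 *		panic("bdevvp: cannot set up root vnode");
 */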
1023 
1024 /*
1025  * Create a vnode for a character device.
1026  * Used for kernfs and some console handling.
1027  */
1028 int
1029 cdevvp(dev, vpp)
1030 	dev_t dev;
1031 	struct vnode **vpp;
1032 {
1033 
1034 	return (getdevvp(dev, vpp, VCHR));
1035 }
1036 
1037 /*
1038  * Create a vnode for a device.
1039  * Used by bdevvp (block device) for root file system etc.,
1040  * and by cdevvp (character device) for console and kernfs.
1041  */
1042 int
1043 getdevvp(dev, vpp, type)
1044 	dev_t dev;
1045 	struct vnode **vpp;
1046 	enum vtype type;
1047 {
1048 	struct vnode *vp;
1049 	struct vnode *nvp;
1050 	int error;
1051 
1052 	if (dev == NODEV) {
1053 		*vpp = NULLVP;
1054 		return (0);
1055 	}
1056 	error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp);
1057 	if (error) {
1058 		*vpp = NULLVP;
1059 		return (error);
1060 	}
1061 	vp = nvp;
1062 	vp->v_type = type;
1063 	if ((nvp = checkalias(vp, dev, NULL)) != 0) {
1064 		vput(vp);
1065 		vp = nvp;
1066 	}
1067 	*vpp = vp;
1068 	return (0);
1069 }
1070 
1071 /*
1072  * Check to see if the new vnode represents a special device
1073  * for which we already have a vnode (either because of
1074  * bdevvp() or because of a different vnode representing
1075  * the same block device). If such an alias exists, deallocate
1076  * the existing contents and return the aliased vnode. The
1077  * caller is responsible for filling it with its new contents.
1078  */
1079 struct vnode *
1080 checkalias(nvp, nvp_rdev, mp)
1081 	struct vnode *nvp;
1082 	dev_t nvp_rdev;
1083 	struct mount *mp;
1084 {
1085 	struct proc *p = curproc;       /* XXX */
1086 	struct vnode *vp;
1087 	struct vnode **vpp;
1088 
1089 	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
1090 		return (NULLVP);
1091 
1092 	vpp = &speclisth[SPECHASH(nvp_rdev)];
1093 loop:
1094 	simple_lock(&spechash_slock);
1095 	for (vp = *vpp; vp; vp = vp->v_specnext) {
1096 		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
1097 			continue;
1098 		/*
1099 		 * Alias, but not in use, so flush it out.
1100 		 */
1101 		simple_lock(&vp->v_interlock);
1102 		if (vp->v_usecount == 0) {
1103 			simple_unlock(&spechash_slock);
1104 			vgonel(vp, p);
1105 			goto loop;
1106 		}
1107 		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK)) {
1108 			simple_unlock(&spechash_slock);
1109 			goto loop;
1110 		}
1111 		break;
1112 	}
1113 	if (vp == NULL || vp->v_tag != VT_NON || vp->v_type != VBLK) {
1114 		MALLOC(nvp->v_specinfo, struct specinfo *,
1115 			sizeof(struct specinfo), M_VNODE, M_NOWAIT);
1116 		/* XXX Erg. */
1117 		if (nvp->v_specinfo == NULL) {
1118 			simple_unlock(&spechash_slock);
1119 			uvm_wait("checkalias");
1120 			goto loop;
1121 		}
1122 
1123 		nvp->v_rdev = nvp_rdev;
1124 		nvp->v_hashchain = vpp;
1125 		nvp->v_specnext = *vpp;
1126 		nvp->v_specmountpoint = NULL;
1127 		simple_unlock(&spechash_slock);
1128 		nvp->v_speclockf = NULL;
1129 		*vpp = nvp;
1130 		if (vp != NULLVP) {
1131 			nvp->v_flag |= VALIASED;
1132 			vp->v_flag |= VALIASED;
1133 			vput(vp);
1134 		}
1135 		return (NULLVP);
1136 	}
1137 	simple_unlock(&spechash_slock);
1138 	VOP_UNLOCK(vp, 0);
1139 	simple_lock(&vp->v_interlock);
1140 	vclean(vp, 0, p);
1141 	vp->v_op = nvp->v_op;
1142 	vp->v_tag = nvp->v_tag;
1143 	vp->v_vnlock = &vp->v_lock;
1144 	lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
1145 	nvp->v_type = VNON;
1146 	insmntque(vp, mp);
1147 	return (vp);
1148 }
1149 
1150 /*
1151  * Grab a particular vnode from the free list, increment its
1152  * reference count and lock it. If the vnode lock bit is set the
1153  * vnode is being eliminated in vgone. In that case, we cannot
1154  * grab the vnode, so the process is awakened when the transition is
1155  * completed, and an error returned to indicate that the vnode is no
1156  * longer usable (possibly having been changed to a new file system type).
1157  */
1158 int
1159 vget(vp, flags)
1160 	struct vnode *vp;
1161 	int flags;
1162 {
1163 	int error;
1164 
1165 	/*
1166 	 * If the vnode is in the process of being cleaned out for
1167 	 * another use, we wait for the cleaning to finish and then
1168 	 * return failure. Cleaning is determined by checking that
1169 	 * the VXLOCK flag is set.
1170 	 */
1171 
1172 	if ((flags & LK_INTERLOCK) == 0)
1173 		simple_lock(&vp->v_interlock);
1174 	if (vp->v_flag & VXLOCK) {
1175 		if (flags & LK_NOWAIT) {
1176 			simple_unlock(&vp->v_interlock);
1177 			return EBUSY;
1178 		}
1179 		vp->v_flag |= VXWANT;
1180 		ltsleep(vp, PINOD|PNORELOCK, "vget", 0, &vp->v_interlock);
1181 		return (ENOENT);
1182 	}
1183 	if (vp->v_usecount == 0) {
1184 		simple_lock(&vnode_free_list_slock);
1185 		if (vp->v_holdcnt > 0)
1186 			TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
1187 		else
1188 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1189 		simple_unlock(&vnode_free_list_slock);
1190 	}
1191 	vp->v_usecount++;
1192 #ifdef DIAGNOSTIC
1193 	if (vp->v_usecount == 0) {
1194 		vprint("vget", vp);
1195 		panic("vget: usecount overflow, vp %p", vp);
1196 	}
1197 #endif
1198 	if (flags & LK_TYPE_MASK) {
1199 		if ((error = vn_lock(vp, flags | LK_INTERLOCK))) {
1200 			/*
1201 			 * must expand vrele here because we do not want
1202 			 * to call VOP_INACTIVE if the reference count
1203 			 * drops back to zero since it was never really
1204 			 * active. We must remove it from the free list
1205 			 * before sleeping so that multiple processes do
1206 			 * not try to recycle it.
1207 			 */
1208 			simple_lock(&vp->v_interlock);
1209 			vp->v_usecount--;
1210 			if (vp->v_usecount > 0) {
1211 				simple_unlock(&vp->v_interlock);
1212 				return (error);
1213 			}
1214 			/*
1215 			 * insert at tail of LRU list
1216 			 */
1217 			simple_lock(&vnode_free_list_slock);
1218 			if (vp->v_holdcnt > 0)
1219 				TAILQ_INSERT_TAIL(&vnode_hold_list, vp,
1220 				    v_freelist);
1221 			else
1222 				TAILQ_INSERT_TAIL(&vnode_free_list, vp,
1223 				    v_freelist);
1224 			simple_unlock(&vnode_free_list_slock);
1225 			simple_unlock(&vp->v_interlock);
1226 		}
1227 		return (error);
1228 	}
1229 	simple_unlock(&vp->v_interlock);
1230 	return (0);
1231 }
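
/*
 * The usual reference-taking pattern (a sketch): acquire with vget(),
 * then drop with vput() when the vnode was locked, or vrele() when no
 * lock was requested:
 *
 *	if (vget(vp, LK_EXCLUSIVE) == 0) {
 *		... use the locked vnode ...
 *		vput(vp);
 *	}
 */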
1232 
1233 /*
1234  * vput(), just unlock and vrele()
1235  */
1236 void
1237 vput(vp)
1238 	struct vnode *vp;
1239 {
1240 	struct proc *p = curproc;	/* XXX */
1241 
1242 #ifdef DIAGNOSTIC
1243 	if (vp == NULL)
1244 		panic("vput: null vp");
1245 #endif
1246 	simple_lock(&vp->v_interlock);
1247 	vp->v_usecount--;
1248 	if (vp->v_usecount > 0) {
1249 		simple_unlock(&vp->v_interlock);
1250 		VOP_UNLOCK(vp, 0);
1251 		return;
1252 	}
1253 #ifdef DIAGNOSTIC
1254 	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
1255 		vprint("vput: bad ref count", vp);
1256 		panic("vput: ref cnt");
1257 	}
1258 #endif
1259 	/*
1260 	 * Insert at tail of LRU list.
1261 	 */
1262 	simple_lock(&vnode_free_list_slock);
1263 	if (vp->v_holdcnt > 0)
1264 		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
1265 	else
1266 		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
1267 	simple_unlock(&vnode_free_list_slock);
1268 	if (vp->v_flag & VEXECMAP) {
1269 		uvmexp.execpages -= vp->v_uobj.uo_npages;
1270 		uvmexp.filepages += vp->v_uobj.uo_npages;
1271 	}
1272 	vp->v_flag &= ~(VTEXT|VEXECMAP);
1273 	simple_unlock(&vp->v_interlock);
1274 	VOP_INACTIVE(vp, p);
1275 }
1276 
1277 /*
1278  * Vnode release.
1279  * If count drops to zero, call inactive routine and return to freelist.
1280  */
1281 void
1282 vrele(vp)
1283 	struct vnode *vp;
1284 {
1285 	struct proc *p = curproc;	/* XXX */
1286 
1287 #ifdef DIAGNOSTIC
1288 	if (vp == NULL)
1289 		panic("vrele: null vp");
1290 #endif
1291 	simple_lock(&vp->v_interlock);
1292 	vp->v_usecount--;
1293 	if (vp->v_usecount > 0) {
1294 		simple_unlock(&vp->v_interlock);
1295 		return;
1296 	}
1297 #ifdef DIAGNOSTIC
1298 	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
1299 		vprint("vrele: bad ref count", vp);
1300 		panic("vrele: ref cnt vp %p", vp);
1301 	}
1302 #endif
1303 	/*
1304 	 * Insert at tail of LRU list.
1305 	 */
1306 	simple_lock(&vnode_free_list_slock);
1307 	if (vp->v_holdcnt > 0)
1308 		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
1309 	else
1310 		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
1311 	simple_unlock(&vnode_free_list_slock);
1312 	if (vp->v_flag & VEXECMAP) {
1313 		uvmexp.execpages -= vp->v_uobj.uo_npages;
1314 		uvmexp.filepages += vp->v_uobj.uo_npages;
1315 	}
1316 	vp->v_flag &= ~(VTEXT|VEXECMAP);
1317 	if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0)
1318 		VOP_INACTIVE(vp, p);
1319 }
1320 
1321 #ifdef DIAGNOSTIC
1322 /*
1323  * Page or buffer structure gets a reference.
1324  */
1325 void
1326 vhold(vp)
1327 	struct vnode *vp;
1328 {
1329 
1330 	/*
1331 	 * If it is on the freelist and the hold count is currently
1332 	 * zero, move it to the hold list. The test of the back
1333 	 * pointer and the use reference count of zero is because
1334 	 * it will be removed from a free list by getnewvnode,
1335 	 * but will not have its reference count incremented until
1336 	 * after calling vgone. If the reference count were
1337 	 * incremented first, vgone would (incorrectly) try to
1338 	 * close the previous instance of the underlying object.
1339 	 * So, the back pointer is explicitly set to `0xdeadb' in
1340 	 * getnewvnode after removing it from a freelist to ensure
1341 	 * that we do not try to move it here.
1342 	 */
1343   	simple_lock(&vp->v_interlock);
1344 	if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
1345 	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
1346 		simple_lock(&vnode_free_list_slock);
1347 		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1348 		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
1349 		simple_unlock(&vnode_free_list_slock);
1350 	}
1351 	vp->v_holdcnt++;
1352 	simple_unlock(&vp->v_interlock);
1353 }
1354 
1355 /*
1356  * Page or buffer structure frees a reference.
1357  */
1358 void
1359 holdrele(vp)
1360 	struct vnode *vp;
1361 {
1362 
1363 	simple_lock(&vp->v_interlock);
1364 	if (vp->v_holdcnt <= 0)
1365 		panic("holdrele: holdcnt vp %p", vp);
1366 	vp->v_holdcnt--;
1367 
1368 	/*
1369 	 * If it is on the holdlist and the hold count drops to
1370 	 * zero, move it to the free list. The test of the back
1371 	 * pointer and the use reference count of zero is because
1372 	 * it will be removed from a free list by getnewvnode,
1373 	 * but will not have its reference count incremented until
1374 	 * after calling vgone. If the reference count were
1375 	 * incremented first, vgone would (incorrectly) try to
1376 	 * close the previous instance of the underlying object.
1377 	 * So, the back pointer is explicitly set to `0xdeadb' in
1378 	 * getnewvnode after removing it from a freelist to ensure
1379 	 * that we do not try to move it here.
1380 	 */
1381 
1382 	if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
1383 	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
1384 		simple_lock(&vnode_free_list_slock);
1385 		TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
1386 		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
1387 		simple_unlock(&vnode_free_list_slock);
1388 	}
1389 	simple_unlock(&vp->v_interlock);
1390 }
1391 
1392 /*
1393  * Vnode reference.
1394  */
1395 void
1396 vref(vp)
1397 	struct vnode *vp;
1398 {
1399 
1400 	simple_lock(&vp->v_interlock);
1401 	if (vp->v_usecount <= 0)
1402 		panic("vref used where vget required, vp %p", vp);
1403 	vp->v_usecount++;
1404 #ifdef DIAGNOSTIC
1405 	if (vp->v_usecount == 0) {
1406 		vprint("vref", vp);
1407 		panic("vref: usecount overflow, vp %p", vp);
1408 	}
1409 #endif
1410 	simple_unlock(&vp->v_interlock);
1411 }
1412 #endif /* DIAGNOSTIC */
1413 
1414 /*
1415  * Remove any vnodes in the vnode table belonging to mount point mp.
1416  *
1417  * If FORCECLOSE is not specified, there should not be any active ones;
1418  * return an error if any are found (nb: this is a user error, not a
1419  * system error). If FORCECLOSE is specified, detach any active vnodes
1420  * that are found.
1421  *
1422  * If WRITECLOSE is set, only flush out regular file vnodes open for
1423  * writing.
1424  *
1425  * SKIPSYSTEM causes any vnodes marked V_SYSTEM to be skipped.
1426  */
1427 #ifdef DEBUG
1428 int busyprt = 0;	/* print out busy vnodes */
1429 struct ctldebug debug1 = { "busyprt", &busyprt };
1430 #endif
1431 
1432 int
1433 vflush(mp, skipvp, flags)
1434 	struct mount *mp;
1435 	struct vnode *skipvp;
1436 	int flags;
1437 {
1438 	struct proc *p = curproc;	/* XXX */
1439 	struct vnode *vp, *nvp;
1440 	int busy = 0;
1441 
1442 	simple_lock(&mntvnode_slock);
1443 loop:
1444 	for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
1445 		if (vp->v_mount != mp)
1446 			goto loop;
1447 		nvp = LIST_NEXT(vp, v_mntvnodes);
1448 		/*
1449 		 * Skip over a selected vnode.
1450 		 */
1451 		if (vp == skipvp)
1452 			continue;
1453 		simple_lock(&vp->v_interlock);
1454 		/*
1455 		 * Skip over any vnodes marked VSYSTEM.
1456 		 */
1457 		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
1458 			simple_unlock(&vp->v_interlock);
1459 			continue;
1460 		}
1461 		/*
1462 		 * If WRITECLOSE is set, only flush out regular file
1463 		 * vnodes open for writing.
1464 		 */
1465 		if ((flags & WRITECLOSE) &&
1466 		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
1467 			simple_unlock(&vp->v_interlock);
1468 			continue;
1469 		}
1470 		/*
1471 		 * With v_usecount == 0, all we need to do is clear
1472 		 * out the vnode data structures and we are done.
1473 		 */
1474 		if (vp->v_usecount == 0) {
1475 			simple_unlock(&mntvnode_slock);
1476 			vgonel(vp, p);
1477 			simple_lock(&mntvnode_slock);
1478 			continue;
1479 		}
1480 		/*
1481 		 * If FORCECLOSE is set, forcibly close the vnode.
1482 		 * For block or character devices, revert to an
1483 		 * anonymous device. For all other files, just kill them.
1484 		 */
1485 		if (flags & FORCECLOSE) {
1486 			simple_unlock(&mntvnode_slock);
1487 			if (vp->v_type != VBLK && vp->v_type != VCHR) {
1488 				vgonel(vp, p);
1489 			} else {
1490 				vclean(vp, 0, p);
1491 				vp->v_op = spec_vnodeop_p;
1492 				insmntque(vp, (struct mount *)0);
1493 			}
1494 			simple_lock(&mntvnode_slock);
1495 			continue;
1496 		}
1497 #ifdef DEBUG
1498 		if (busyprt)
1499 			vprint("vflush: busy vnode", vp);
1500 #endif
1501 		simple_unlock(&vp->v_interlock);
1502 		busy++;
1503 	}
1504 	simple_unlock(&mntvnode_slock);
1505 	if (busy)
1506 		return (EBUSY);
1507 	return (0);
1508 }
1509 
1510 /*
1511  * Disassociate the underlying file system from a vnode.
1512  */
1513 void
1514 vclean(vp, flags, p)
1515 	struct vnode *vp;
1516 	int flags;
1517 	struct proc *p;
1518 {
1519 	int active;
1520 
1521 	LOCK_ASSERT(simple_lock_held(&vp->v_interlock));
1522 
1523 	/*
1524 	 * Check to see if the vnode is in use.
1525 	 * If so we have to reference it before we clean it out
1526 	 * so that its count cannot fall to zero and generate a
1527 	 * race against ourselves to recycle it.
1528 	 */
1529 
1530 	if ((active = vp->v_usecount) != 0) {
1531 		vp->v_usecount++;
1532 #ifdef DIAGNOSTIC
1533 		if (vp->v_usecount == 0) {
1534 			vprint("vclean", vp);
1535 			panic("vclean: usecount overflow");
1536 		}
1537 #endif
1538 	}
1539 
1540 	/*
1541 	 * Prevent the vnode from being recycled or
1542 	 * brought into use while we clean it out.
1543 	 */
1544 	if (vp->v_flag & VXLOCK)
1545 		panic("vclean: deadlock, vp %p", vp);
1546 	vp->v_flag |= VXLOCK;
1547 	if (vp->v_flag & VEXECMAP) {
1548 		uvmexp.execpages -= vp->v_uobj.uo_npages;
1549 		uvmexp.filepages += vp->v_uobj.uo_npages;
1550 	}
1551 	vp->v_flag &= ~(VTEXT|VEXECMAP);
1552 
1553 	/*
1554 	 * Even if the count is zero, the VOP_INACTIVE routine may still
1555 	 * have the object locked while it cleans it out. The VOP_LOCK
1556 	 * ensures that the VOP_INACTIVE routine is done with its work.
1557 	 * For active vnodes, it ensures that no other activity can
1558 	 * occur while the underlying object is being cleaned out.
1559 	 */
1560 	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK);
1561 
1562 	/*
1563 	 * Clean out any cached data associated with the vnode.
1564 	 */
1565 	if (flags & DOCLOSE) {
1566 		vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
1567 		KASSERT((vp->v_flag & VONWORKLST) == 0);
1568 	}
1569 	LOCK_ASSERT(!simple_lock_held(&vp->v_interlock));
1570 
1571 	/*
1572 	 * If purging an active vnode, it must be closed and
1573 	 * deactivated before being reclaimed. Note that the
1574 	 * VOP_INACTIVE will unlock the vnode.
1575 	 */
1576 	if (active) {
1577 		if (flags & DOCLOSE)
1578 			VOP_CLOSE(vp, FNONBLOCK, NOCRED, NULL);
1579 		VOP_INACTIVE(vp, p);
1580 	} else {
1581 		/*
1582 		 * Any other processes trying to obtain this lock must first
1583 		 * wait for VXLOCK to clear, then call the new lock operation.
1584 		 */
1585 		VOP_UNLOCK(vp, 0);
1586 	}
1587 	/*
1588 	 * Reclaim the vnode.
1589 	 */
1590 	if (VOP_RECLAIM(vp, p))
1591 		panic("vclean: cannot reclaim, vp %p", vp);
1592 	if (active) {
1593 		/*
1594 		 * Inline copy of vrele() since VOP_INACTIVE
1595 		 * has already been called.
1596 		 */
1597 		simple_lock(&vp->v_interlock);
1598 		if (--vp->v_usecount <= 0) {
1599 #ifdef DIAGNOSTIC
1600 			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
1601 				vprint("vclean: bad ref count", vp);
1602 				panic("vclean: ref cnt");
1603 			}
1604 #endif
1605 			/*
1606 			 * Insert at tail of LRU list.
1607 			 */
1608 
1609 			simple_unlock(&vp->v_interlock);
1610 			simple_lock(&vnode_free_list_slock);
1611 #ifdef DIAGNOSTIC
1612 			if (vp->v_holdcnt > 0)
1613 				panic("vclean: not clean, vp %p", vp);
1614 #endif
1615 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
1616 			simple_unlock(&vnode_free_list_slock);
1617 		} else
1618 			simple_unlock(&vp->v_interlock);
1619 	}
1620 
1621 	KASSERT(vp->v_uobj.uo_npages == 0);
1622 	cache_purge(vp);
1623 
1624 	/*
1625 	 * Done with purge, notify sleepers of the grim news.
1626 	 */
1627 	vp->v_op = dead_vnodeop_p;
1628 	vp->v_tag = VT_NON;
1629 	simple_lock(&vp->v_interlock);
1630 	VN_KNOTE(vp, NOTE_REVOKE);	/* FreeBSD has this in vn_pollgone() */
1631 	vp->v_flag &= ~VXLOCK;
1632 	if (vp->v_flag & VXWANT) {
1633 		vp->v_flag &= ~VXWANT;
1634 		simple_unlock(&vp->v_interlock);
1635 		wakeup((caddr_t)vp);
1636 	} else
1637 		simple_unlock(&vp->v_interlock);
1638 }
1639 
1640 /*
1641  * Recycle an unused vnode to the front of the free list.
1642  * Release the passed interlock if the vnode will be recycled.
1643  */
1644 int
1645 vrecycle(vp, inter_lkp, p)
1646 	struct vnode *vp;
1647 	struct simplelock *inter_lkp;
1648 	struct proc *p;
1649 {
1650 
1651 	simple_lock(&vp->v_interlock);
1652 	if (vp->v_usecount == 0) {
1653 		if (inter_lkp)
1654 			simple_unlock(inter_lkp);
1655 		vgonel(vp, p);
1656 		return (1);
1657 	}
1658 	simple_unlock(&vp->v_interlock);
1659 	return (0);
1660 }
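
/*
 * A file system's VOP_INACTIVE typically calls vrecycle() once it
 * knows the underlying object is gone, e.g. (a sketch along the lines
 * of ufs_inactive()):
 *
 *	if (ip->i_mode == 0)
 *		vrecycle(vp, NULL, p);
 */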
1661 
1662 /*
1663  * Eliminate all activity associated with a vnode
1664  * in preparation for reuse.
1665  */
1666 void
1667 vgone(vp)
1668 	struct vnode *vp;
1669 {
1670 	struct proc *p = curproc;	/* XXX */
1671 
1672 	simple_lock(&vp->v_interlock);
1673 	vgonel(vp, p);
1674 }
1675 
1676 /*
1677  * vgone, with the vp interlock held.
1678  */
1679 void
1680 vgonel(vp, p)
1681 	struct vnode *vp;
1682 	struct proc *p;
1683 {
1684 	struct vnode *vq;
1685 	struct vnode *vx;
1686 
1687 	LOCK_ASSERT(simple_lock_held(&vp->v_interlock));
1688 
1689 	/*
1690 	 * If a vgone (or vclean) is already in progress,
1691 	 * wait until it is done and return.
1692 	 */
1693 
1694 	if (vp->v_flag & VXLOCK) {
1695 		vp->v_flag |= VXWANT;
1696 		ltsleep(vp, PINOD | PNORELOCK, "vgone", 0, &vp->v_interlock);
1697 		return;
1698 	}
1699 
1700 	/*
1701 	 * Clean out the filesystem specific data.
1702 	 */
1703 
1704 	vclean(vp, DOCLOSE, p);
1705 	KASSERT((vp->v_flag & VONWORKLST) == 0);
1706 
1707 	/*
1708 	 * Delete from old mount point vnode list, if on one.
1709 	 */
1710 
1711 	if (vp->v_mount != NULL)
1712 		insmntque(vp, (struct mount *)0);
1713 
1714 	/*
1715 	 * If special device, remove it from the special device alias
1716 	 * list, if it is on one.
1717 	 */
1718 
1719 	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
1720 		simple_lock(&spechash_slock);
1721 		if (vp->v_hashchain != NULL) {
1722 			if (*vp->v_hashchain == vp) {
1723 				*vp->v_hashchain = vp->v_specnext;
1724 			} else {
1725 				for (vq = *vp->v_hashchain; vq;
1726 							vq = vq->v_specnext) {
1727 					if (vq->v_specnext != vp)
1728 						continue;
1729 					vq->v_specnext = vp->v_specnext;
1730 					break;
1731 				}
1732 				if (vq == NULL)
1733 					panic("missing bdev");
1734 			}
1735 			if (vp->v_flag & VALIASED) {
1736 				vx = NULL;
1737 				for (vq = *vp->v_hashchain; vq;
1738 							vq = vq->v_specnext) {
1739 					if (vq->v_rdev != vp->v_rdev ||
1740 					    vq->v_type != vp->v_type)
1741 						continue;
1742 					if (vx)
1743 						break;
1744 					vx = vq;
1745 				}
1746 				if (vx == NULL)
1747 					panic("missing alias");
1748 				if (vq == NULL)
1749 					vx->v_flag &= ~VALIASED;
1750 				vp->v_flag &= ~VALIASED;
1751 			}
1752 		}
1753 		simple_unlock(&spechash_slock);
1754 		FREE(vp->v_specinfo, M_VNODE);
1755 		vp->v_specinfo = NULL;
1756 	}
1757 
1758 	/*
1759 	 * If it is on the freelist and not already at the head,
1760 	 * move it to the head of the list. The test of the back
1761 	 * pointer and the reference count of zero is because
1762 	 * it will be removed from the free list by getnewvnode,
1763 	 * but will not have its reference count incremented until
1764 	 * after calling vgone. If the reference count were
1765 	 * incremented first, vgone would (incorrectly) try to
1766 	 * close the previous instance of the underlying object.
1767 	 * So, the back pointer is explicitly set to `0xdeadb' in
1768 	 * getnewvnode after removing it from the freelist to ensure
1769 	 * that we do not try to move it here.
1770 	 */
1771 
1772 	if (vp->v_usecount == 0) {
1773 		simple_lock(&vnode_free_list_slock);
1774 		if (vp->v_holdcnt > 0)
1775 			panic("vgonel: not clean, vp %p", vp);
1776 		if (vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb &&
1777 		    TAILQ_FIRST(&vnode_free_list) != vp) {
1778 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1779 			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1780 		}
1781 		simple_unlock(&vnode_free_list_slock);
1782 	}
1783 	vp->v_type = VBAD;
1784 }
1785 
1786 /*
1787  * Lookup a vnode by device number.
1788  */
1789 int
1790 vfinddev(dev, type, vpp)
1791 	dev_t dev;
1792 	enum vtype type;
1793 	struct vnode **vpp;
1794 {
1795 	struct vnode *vp;
1796 	int rc = 0;
1797 
1798 	simple_lock(&spechash_slock);
1799 	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
1800 		if (dev != vp->v_rdev || type != vp->v_type)
1801 			continue;
1802 		*vpp = vp;
1803 		rc = 1;
1804 		break;
1805 	}
1806 	simple_unlock(&spechash_slock);
1807 	return (rc);
1808 }
1809 
1810 /*
1811  * Revoke all the vnodes corresponding to the specified minor number
1812  * range (endpoints inclusive) of the specified major.
1813  */
1814 void
1815 vdevgone(maj, minl, minh, type)
1816 	int maj, minl, minh;
1817 	enum vtype type;
1818 {
1819 	struct vnode *vp;
1820 	int mn;
1821 
1822 	for (mn = minl; mn <= minh; mn++)
1823 		if (vfinddev(makedev(maj, mn), type, &vp))
1824 			VOP_REVOKE(vp, REVOKEALL);
1825 }
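
/*
 * Driver detach code is the usual vdevgone() caller (a sketch, with
 * bmaj/cmaj the block/character majors and mn the unit's minors):
 *
 *	vdevgone(bmaj, mn, mn, VBLK);
 *	vdevgone(cmaj, mn, mn, VCHR);
 */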
1826 
1827 /*
1828  * Calculate the total number of references to a special device.
1829  */
1830 int
1831 vcount(vp)
1832 	struct vnode *vp;
1833 {
1834 	struct vnode *vq, *vnext;
1835 	int count;
1836 
1837 loop:
1838 	if ((vp->v_flag & VALIASED) == 0)
1839 		return (vp->v_usecount);
1840 	simple_lock(&spechash_slock);
1841 	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
1842 		vnext = vq->v_specnext;
1843 		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
1844 			continue;
1845 		/*
1846 		 * Alias, but not in use, so flush it out.
1847 		 */
1848 		if (vq->v_usecount == 0 && vq != vp &&
1849 		    (vq->v_flag & VXLOCK) == 0) {
1850 			simple_unlock(&spechash_slock);
1851 			vgone(vq);
1852 			goto loop;
1853 		}
1854 		count += vq->v_usecount;
1855 	}
1856 	simple_unlock(&spechash_slock);
1857 	return (count);
1858 }
1859 
1860 /*
1861  * Print out a description of a vnode.
1862  */
1863 const char * const vnode_types[] = {
1864 	"VNON",
1865 	"VREG",
1866 	"VDIR",
1867 	"VBLK",
1868 	"VCHR",
1869 	"VLNK",
1870 	"VSOCK",
1871 	"VFIFO",
1872 	"VBAD"
1873 };
1874 
1875 void
1876 vprint(label, vp)
1877 	char *label;
1878 	struct vnode *vp;
1879 {
1880 	char buf[96];
1881 
1882 	if (label != NULL)
1883 		printf("%s: ", label);
1884 	printf("tag %d type %s, usecount %d, writecount %ld, refcount %ld,",
1885 	    vp->v_tag, vnode_types[vp->v_type],
1886 	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt);
1887 	buf[0] = '\0';
1888 	if (vp->v_flag & VROOT)
1889 		strlcat(buf, "|VROOT", sizeof(buf));
1890 	if (vp->v_flag & VTEXT)
1891 		strlcat(buf, "|VTEXT", sizeof(buf));
1892 	if (vp->v_flag & VEXECMAP)
1893 		strlcat(buf, "|VEXECMAP", sizeof(buf));
1894 	if (vp->v_flag & VSYSTEM)
1895 		strlcat(buf, "|VSYSTEM", sizeof(buf));
1896 	if (vp->v_flag & VXLOCK)
1897 		strlcat(buf, "|VXLOCK", sizeof(buf));
1898 	if (vp->v_flag & VXWANT)
1899 		strlcat(buf, "|VXWANT", sizeof(buf));
1900 	if (vp->v_flag & VBWAIT)
1901 		strlcat(buf, "|VBWAIT", sizeof(buf));
1902 	if (vp->v_flag & VALIASED)
1903 		strlcat(buf, "|VALIASED", sizeof(buf));
1904 	if (buf[0] != '\0')
1905 		printf(" flags (%s)", &buf[1]);
1906 	if (vp->v_data == NULL) {
1907 		printf("\n");
1908 	} else {
1909 		printf("\n\t");
1910 		VOP_PRINT(vp);
1911 	}
1912 }
1913 
1914 #ifdef DEBUG
1915 /*
1916  * List all of the locked vnodes in the system.
1917  * Called when debugging the kernel.
1918  */
1919 void
1920 printlockedvnodes()
1921 {
1922 	struct mount *mp, *nmp;
1923 	struct vnode *vp;
1924 
1925 	printf("Locked vnodes\n");
1926 	simple_lock(&mountlist_slock);
1927 	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
1928 	     mp = nmp) {
1929 		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock)) {
1930 			nmp = CIRCLEQ_NEXT(mp, mnt_list);
1931 			continue;
1932 		}
1933 		LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
1934 			if (VOP_ISLOCKED(vp))
1935 				vprint(NULL, vp);
1936 		}
1937 		simple_lock(&mountlist_slock);
1938 		nmp = CIRCLEQ_NEXT(mp, mnt_list);
1939 		vfs_unbusy(mp);
1940 	}
1941 	simple_unlock(&mountlist_slock);
1942 }
1943 #endif
1944 
1945 /*
1946  * Top level filesystem related information gathering.
1947  */
1948 int
1949 vfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p)
1950 	int *name;
1951 	u_int namelen;
1952 	void *oldp;
1953 	size_t *oldlenp;
1954 	void *newp;
1955 	size_t newlen;
1956 	struct proc *p;
1957 {
1958 #if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44)
1959 	struct vfsconf vfc;
1960 	extern const char * const mountcompatnames[];
1961 	extern int nmountcompatnames;
1962 #endif
1963 	struct vfsops *vfsp;
1964 
1965 	/* all sysctl names at this level are at least name and field */
1966 	if (namelen < 2)
1967 		return (ENOTDIR);		/* overloaded */
1968 
1969 	/* Not generic: goes to file system. */
1970 	if (name[0] != VFS_GENERIC) {
		static const struct ctlname vfsnames[VFS_MAXID + 1] =
		    CTL_VFS_NAMES;
1972 		const char *vfsname;
1973 
1974 		if (name[0] < 0 || name[0] > VFS_MAXID
1975 		    || (vfsname = vfsnames[name[0]].ctl_name) == NULL)
1976 			return (EOPNOTSUPP);
1977 
1978 		vfsp = vfs_getopsbyname(vfsname);
1979 		if (vfsp == NULL || vfsp->vfs_sysctl == NULL)
1980 			return (EOPNOTSUPP);
1981 		return ((*vfsp->vfs_sysctl)(&name[1], namelen - 1,
1982 		    oldp, oldlenp, newp, newlen, p));
1983 	}
1984 
1985 	/* The rest are generic vfs sysctls. */
1986 	switch (name[1]) {
1987 	case VFS_USERMOUNT:
1988 		return sysctl_int(oldp, oldlenp, newp, newlen, &dovfsusermount);
1989 #if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44)
1990 	case VFS_MAXTYPENUM:
1991 		/*
1992 		 * Provided for 4.4BSD-Lite2 compatibility.
1993 		 */
1994 		return (sysctl_rdint(oldp, oldlenp, newp, nmountcompatnames));
1995 	case VFS_CONF:
1996 		/*
1997 		 * Special: a node, next is a file system name.
1998 		 * Provided for 4.4BSD-Lite2 compatibility.
1999 		 */
2000 		if (namelen < 3)
2001 			return (ENOTDIR);	/* overloaded */
2002 		if (name[2] >= nmountcompatnames || name[2] < 0 ||
2003 		    mountcompatnames[name[2]] == NULL)
2004 			return (EOPNOTSUPP);
2005 		vfsp = vfs_getopsbyname(mountcompatnames[name[2]]);
2006 		if (vfsp == NULL)
2007 			return (EOPNOTSUPP);
2008 		vfc.vfc_vfsops = vfsp;
2009 		strncpy(vfc.vfc_name, vfsp->vfs_name, MFSNAMELEN);
2010 		vfc.vfc_typenum = name[2];
2011 		vfc.vfc_refcount = vfsp->vfs_refcount;
2012 		vfc.vfc_flags = 0;
2013 		vfc.vfc_mountroot = vfsp->vfs_mountroot;
2014 		vfc.vfc_next = NULL;
2015 		return (sysctl_rdstruct(oldp, oldlenp, newp, &vfc,
2016 		    sizeof(struct vfsconf)));
2017 #endif
2018 	default:
2019 		break;
2020 	}
2021 	return (EOPNOTSUPP);
2022 }
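
/*
 * Example (illustrative, userland): the VFS_USERMOUNT knob handled
 * above is reached from user programs via sysctl(3) as
 * vfs.generic.usermount.  Guarded out because it is user code.
 */
#if 0
#include <sys/param.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int mib[3] = { CTL_VFS, VFS_GENERIC, VFS_USERMOUNT };
	int val;
	size_t len = sizeof(val);

	if (sysctl(mib, 3, &val, &len, NULL, 0) == -1)
		return (1);
	printf("vfs.generic.usermount = %d\n", val);
	return (0);
}
#endif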
2023 
2024 int kinfo_vdebug = 1;
2025 int kinfo_vgetfailed;
2026 #define KINFO_VNODESLOP	10
2027 /*
2028  * Dump vnode list (via sysctl).
2029  * Copyout address of vnode followed by vnode.
2030  */
2031 /* ARGSUSED */
2032 int
2033 sysctl_vnode(where, sizep, p)
2034 	char *where;
2035 	size_t *sizep;
2036 	struct proc *p;
2037 {
2038 	struct mount *mp, *nmp;
2039 	struct vnode *nvp, *vp;
2040 	char *bp = where, *savebp;
2041 	char *ewhere;
2042 	int error;
2043 
2044 #define VPTRSZ	sizeof(struct vnode *)
2045 #define VNODESZ	sizeof(struct vnode)
2046 	if (where == NULL) {
2047 		*sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
2048 		return (0);
2049 	}
2050 	ewhere = where + *sizep;
2051 
2052 	simple_lock(&mountlist_slock);
2053 	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
2054 	     mp = nmp) {
2055 		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock)) {
2056 			nmp = CIRCLEQ_NEXT(mp, mnt_list);
2057 			continue;
2058 		}
2059 		savebp = bp;
2060 again:
2061 		simple_lock(&mntvnode_slock);
2062 		for (vp = LIST_FIRST(&mp->mnt_vnodelist);
2063 		     vp != NULL;
2064 		     vp = nvp) {
2065 			/*
2066 			 * Check that the vp is still associated with
2067 			 * this filesystem.  RACE: could have been
2068 			 * recycled onto the same filesystem.
2069 			 */
2070 			if (vp->v_mount != mp) {
2071 				simple_unlock(&mntvnode_slock);
2072 				if (kinfo_vdebug)
2073 					printf("kinfo: vp changed\n");
2074 				bp = savebp;
2075 				goto again;
2076 			}
2077 			nvp = LIST_NEXT(vp, v_mntvnodes);
2078 			if (bp + VPTRSZ + VNODESZ > ewhere) {
2079 				simple_unlock(&mntvnode_slock);
2080 				*sizep = bp - where;
2081 				return (ENOMEM);
2082 			}
2083 			simple_unlock(&mntvnode_slock);
2084 			if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) ||
2085 			   (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ)))
2086 				return (error);
2087 			bp += VPTRSZ + VNODESZ;
2088 			simple_lock(&mntvnode_slock);
2089 		}
2090 		simple_unlock(&mntvnode_slock);
2091 		simple_lock(&mountlist_slock);
2092 		nmp = CIRCLEQ_NEXT(mp, mnt_list);
2093 		vfs_unbusy(mp);
2094 	}
2095 	simple_unlock(&mountlist_slock);
2096 
2097 	*sizep = bp - where;
2098 	return (0);
2099 }
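
/*
 * Example (illustrative, userland): a consumer of the dump above, in
 * the style of pstat(8), sizes the buffer with a NULL oldp and then
 * walks the (kernel address, vnode image) pairs.  Guarded out because
 * it is user code; error handling is minimal.
 */
#if 0
#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <stdio.h>
#include <stdlib.h>

void
listvnodes(void)
{
	int mib[2] = { CTL_KERN, KERN_VNODE };
	size_t len, reclen = sizeof(struct vnode *) + sizeof(struct vnode);
	char *buf, *p;

	if (sysctl(mib, 2, NULL, &len, NULL, 0) == -1)
		return;
	if ((buf = malloc(len)) == NULL)
		return;
	if (sysctl(mib, 2, buf, &len, NULL, 0) == 0)
		for (p = buf; p + reclen <= buf + len; p += reclen)
			printf("vnode at %p\n", *(struct vnode **)(void *)p);
	free(buf);
}
#endif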
2100 
2101 /*
2102  * Check to see if a filesystem is mounted on a block device.
2103  */
2104 int
2105 vfs_mountedon(vp)
2106 	struct vnode *vp;
2107 {
2108 	struct vnode *vq;
2109 	int error = 0;
2110 
2111 	if (vp->v_specmountpoint != NULL)
2112 		return (EBUSY);
2113 	if (vp->v_flag & VALIASED) {
2114 		simple_lock(&spechash_slock);
2115 		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2116 			if (vq->v_rdev != vp->v_rdev ||
2117 			    vq->v_type != vp->v_type)
2118 				continue;
2119 			if (vq->v_specmountpoint != NULL) {
2120 				error = EBUSY;
2121 				break;
2122 			}
2123 		}
2124 		simple_unlock(&spechash_slock);
2125 	}
2126 	return (error);
2127 }
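
/*
 * Example (illustrative fragment): mount code calls vfs_mountedon()
 * on the block device vnode before claiming it, as the ffs mount path
 * does; "devvp" is the device vnode being mounted.
 */
#if 0
	if ((error = vfs_mountedon(devvp)) != 0)
		return (error);		/* device already has a mount */
#endif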
2128 
2129 static int
2130 sacheck(struct sockaddr *sa)
2131 {
2132 	switch (sa->sa_family) {
2133 #ifdef INET
2134 	case AF_INET: {
2135 		struct sockaddr_in *sin = (struct sockaddr_in *)sa;
2136 		char *p = (char *)sin->sin_zero;
2137 		size_t i;
2138 
2139 		if (sin->sin_len != sizeof(*sin))
2140 			return -1;
2141 		if (sin->sin_port != 0)
2142 			return -1;
2143 		for (i = 0; i < sizeof(sin->sin_zero); i++)
2144 			if (*p++ != '\0')
2145 				return -1;
2146 		return 0;
2147 	}
2148 #endif
2149 #ifdef INET6
2150 	case AF_INET6: {
2151 		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa;
2152 
2153 		if (sin6->sin6_len != sizeof(*sin6))
2154 			return -1;
2155 		if (sin6->sin6_port != 0)
2156 			return -1;
2157 		return 0;
2158 	}
2159 #endif
2160 	default:
2161 		return -1;
2162 	}
2163 }
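
/*
 * Example (illustrative): an AF_INET address that satisfies sacheck()
 * above -- exact sa_len, zero port and a cleared sin_zero pad.
 */
#if 0
static int
example_sacheck_ok(void)
{
	struct sockaddr_in sin;

	memset(&sin, 0, sizeof(sin));	/* zeroes sin_port and sin_zero */
	sin.sin_len = sizeof(sin);
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = htonl(0xc0a80100);	/* 192.168.1.0 */
	return (sacheck((struct sockaddr *)&sin));	/* returns 0 */
}
#endif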
2164 
2165 /*
2166  * Build hash lists of net addresses and hang them off the mount point.
 * Called by vfs_export() to set up the lists of export addresses.
2168  */
2169 static int
2170 vfs_hang_addrlist(mp, nep, argp)
2171 	struct mount *mp;
2172 	struct netexport *nep;
2173 	struct export_args *argp;
2174 {
2175 	struct netcred *np, *enp;
2176 	struct radix_node_head *rnh;
2177 	int i;
2178 	struct sockaddr *saddr, *smask = 0;
2179 	struct domain *dom;
2180 	int error;
2181 
2182 	if (argp->ex_addrlen == 0) {
2183 		if (mp->mnt_flag & MNT_DEFEXPORTED)
2184 			return (EPERM);
2185 		np = &nep->ne_defexported;
2186 		np->netc_exflags = argp->ex_flags;
2187 		crcvt(&np->netc_anon, &argp->ex_anon);
2188 		np->netc_anon.cr_ref = 1;
2189 		mp->mnt_flag |= MNT_DEFEXPORTED;
2190 		return (0);
2191 	}
2192 
2193 	if (argp->ex_addrlen > MLEN || argp->ex_masklen > MLEN)
2194 		return (EINVAL);
2195 
2196 	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
2197 	np = (struct netcred *)malloc(i, M_NETADDR, M_WAITOK);
2198 	memset((caddr_t)np, 0, i);
2199 	saddr = (struct sockaddr *)(np + 1);
2200 	error = copyin(argp->ex_addr, (caddr_t)saddr, argp->ex_addrlen);
2201 	if (error)
2202 		goto out;
2203 	if (saddr->sa_len > argp->ex_addrlen)
2204 		saddr->sa_len = argp->ex_addrlen;
	if (sacheck(saddr) == -1) {
		error = EINVAL;
		goto out;	/* don't leak np */
	}
2207 	if (argp->ex_masklen) {
2208 		smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen);
2209 		error = copyin(argp->ex_mask, (caddr_t)smask, argp->ex_masklen);
2210 		if (error)
2211 			goto out;
2212 		if (smask->sa_len > argp->ex_masklen)
2213 			smask->sa_len = argp->ex_masklen;
		if (smask->sa_family != saddr->sa_family) {
			error = EINVAL;
			goto out;	/* don't leak np */
		}
		if (sacheck(smask) == -1) {
			error = EINVAL;
			goto out;	/* don't leak np */
		}
2218 	}
2219 	i = saddr->sa_family;
2220 	if ((rnh = nep->ne_rtable[i]) == 0) {
2221 		/*
		 * It seems silly to initialize every address family when
		 * most are never used, so attach the routing table on
		 * demand here.
2224 		 */
2225 		for (dom = domains; dom; dom = dom->dom_next)
2226 			if (dom->dom_family == i && dom->dom_rtattach) {
2227 				dom->dom_rtattach((void **)&nep->ne_rtable[i],
2228 					dom->dom_rtoffset);
2229 				break;
2230 			}
2231 		if ((rnh = nep->ne_rtable[i]) == 0) {
2232 			error = ENOBUFS;
2233 			goto out;
2234 		}
2235 	}
2236 
2237 	enp = (struct netcred *)(*rnh->rnh_addaddr)(saddr, smask, rnh,
2238 	    np->netc_rnodes);
2239 	if (enp != np) {
2240 		if (enp == NULL) {
2241 			enp = (struct netcred *)(*rnh->rnh_lookup)(saddr,
2242 			    smask, rnh);
2243 			if (enp == NULL) {
2244 				error = EPERM;
2245 				goto out;
2246 			}
2247 		} else
2248 			enp->netc_refcnt++;
2249 
2250 		goto check;
2251 	} else
2252 		enp->netc_refcnt = 1;
2253 
2254 	np->netc_exflags = argp->ex_flags;
2255 	crcvt(&np->netc_anon, &argp->ex_anon);
2256 	np->netc_anon.cr_ref = 1;
2257 	return 0;
2258 check:
2259 	if (enp->netc_exflags != argp->ex_flags ||
2260 	    crcmp(&enp->netc_anon, &argp->ex_anon) != 0)
2261 		error = EPERM;
2262 	else
2263 		error = 0;
2264 out:
2265 	free(np, M_NETADDR);
2266 	return error;
2267 }
2268 
2269 /* ARGSUSED */
2270 static int
2271 vfs_free_netcred(rn, w)
2272 	struct radix_node *rn;
2273 	void *w;
2274 {
2275 	struct radix_node_head *rnh = (struct radix_node_head *)w;
2276 	struct netcred *np = (struct netcred *)(void *)rn;
2277 
2278 	(*rnh->rnh_deladdr)(rn->rn_key, rn->rn_mask, rnh);
2279 	if (--(np->netc_refcnt) <= 0)
2280 		free(np, M_NETADDR);
2281 	return (0);
2282 }
2283 
2284 /*
2285  * Free the net address hash lists that are hanging off the mount points.
2286  */
2287 static void
2288 vfs_free_addrlist(nep)
2289 	struct netexport *nep;
2290 {
2291 	int i;
2292 	struct radix_node_head *rnh;
2293 
2294 	for (i = 0; i <= AF_MAX; i++)
2295 		if ((rnh = nep->ne_rtable[i]) != NULL) {
2296 			(*rnh->rnh_walktree)(rnh, vfs_free_netcred, rnh);
2297 			free((caddr_t)rnh, M_RTABLE);
2298 			nep->ne_rtable[i] = 0;
2299 		}
2300 }
2301 
2302 int
2303 vfs_export(mp, nep, argp)
2304 	struct mount *mp;
2305 	struct netexport *nep;
2306 	struct export_args *argp;
2307 {
2308 	int error;
2309 
2310 	if (argp->ex_flags & MNT_DELEXPORT) {
2311 		if (mp->mnt_flag & MNT_EXPUBLIC) {
2312 			vfs_setpublicfs(NULL, NULL, NULL);
2313 			mp->mnt_flag &= ~MNT_EXPUBLIC;
2314 		}
2315 		vfs_free_addrlist(nep);
2316 		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
2317 	}
2318 	if (argp->ex_flags & MNT_EXPORTED) {
2319 		if (argp->ex_flags & MNT_EXPUBLIC) {
2320 			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
2321 				return (error);
2322 			mp->mnt_flag |= MNT_EXPUBLIC;
2323 		}
2324 		if ((error = vfs_hang_addrlist(mp, nep, argp)) != 0)
2325 			return (error);
2326 		mp->mnt_flag |= MNT_EXPORTED;
2327 	}
2328 	return (0);
2329 }
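
/*
 * Example (illustrative fragment): how a file system's mount entry
 * point hands export information to vfs_export(); the ufs-style
 * "args" and "ump->um_export" names are assumptions taken from the
 * ffs mount path.
 */
#if 0
	if (mp->mnt_flag & MNT_UPDATE) {
		/* No new device given: process export requests only. */
		if (args.fspec == NULL)
			return (vfs_export(mp, &ump->um_export, &args.export));
	}
#endif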
2330 
2331 /*
 * Set the publicly exported filesystem (WebNFS).  The specification
 * (RFC 2054 and RFC 2055) currently allows only one public filesystem.
2334  */
2335 int
2336 vfs_setpublicfs(mp, nep, argp)
2337 	struct mount *mp;
2338 	struct netexport *nep;
2339 	struct export_args *argp;
2340 {
2341 	int error;
2342 	struct vnode *rvp;
2343 	char *cp;
2344 
2345 	/*
	 * mp == NULL means invalidate the current info; the file
	 * system is no longer exported.  May be called from either
	 * vfs_export() or unmount, so check whether the work has
	 * already been done.
2349 	 */
2350 	if (mp == NULL) {
2351 		if (nfs_pub.np_valid) {
2352 			nfs_pub.np_valid = 0;
2353 			if (nfs_pub.np_index != NULL) {
2354 				FREE(nfs_pub.np_index, M_TEMP);
2355 				nfs_pub.np_index = NULL;
2356 			}
2357 		}
2358 		return (0);
2359 	}
2360 
2361 	/*
2362 	 * Only one allowed at a time.
2363 	 */
2364 	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
2365 		return (EBUSY);
2366 
2367 	/*
2368 	 * Get real filehandle for root of exported FS.
2369 	 */
2370 	memset((caddr_t)&nfs_pub.np_handle, 0, sizeof(nfs_pub.np_handle));
2371 	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
2372 
2373 	if ((error = VFS_ROOT(mp, &rvp)))
2374 		return (error);
2375 
	error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid);
	vput(rvp);		/* release rvp even on error */
	if (error)
		return (error);
2380 
2381 	/*
2382 	 * If an indexfile was specified, pull it in.
2383 	 */
2384 	if (argp->ex_indexfile != NULL) {
2385 		MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
2386 		    M_WAITOK);
2387 		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
2388 		    MAXNAMLEN, (size_t *)0);
2389 		if (!error) {
2390 			/*
2391 			 * Check for illegal filenames.
2392 			 */
2393 			for (cp = nfs_pub.np_index; *cp; cp++) {
2394 				if (*cp == '/') {
2395 					error = EINVAL;
2396 					break;
2397 				}
2398 			}
2399 		}
2400 		if (error) {
2401 			FREE(nfs_pub.np_index, M_TEMP);
2402 			return (error);
2403 		}
2404 	}
2405 
2406 	nfs_pub.np_mount = mp;
2407 	nfs_pub.np_valid = 1;
2408 	return (0);
2409 }
2410 
2411 struct netcred *
2412 vfs_export_lookup(mp, nep, nam)
2413 	struct mount *mp;
2414 	struct netexport *nep;
2415 	struct mbuf *nam;
2416 {
2417 	struct netcred *np;
2418 	struct radix_node_head *rnh;
2419 	struct sockaddr *saddr;
2420 
2421 	np = NULL;
2422 	if (mp->mnt_flag & MNT_EXPORTED) {
2423 		/*
2424 		 * Lookup in the export list first.
2425 		 */
2426 		if (nam != NULL) {
2427 			saddr = mtod(nam, struct sockaddr *);
2428 			rnh = nep->ne_rtable[saddr->sa_family];
2429 			if (rnh != NULL) {
2430 				np = (struct netcred *)
2431 					(*rnh->rnh_matchaddr)((caddr_t)saddr,
2432 							      rnh);
2433 				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
2434 					np = NULL;
2435 			}
2436 		}
2437 		/*
2438 		 * If no address match, use the default if it exists.
2439 		 */
2440 		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
2441 			np = &nep->ne_defexported;
2442 	}
2443 	return (np);
2444 }
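
/*
 * Example (illustrative fragment): an NFS-style server validates a
 * client against the export lists before honouring a request; "nam"
 * holds the client's socket address and "nep" is the mount's
 * netexport, both assumptions for the sketch.
 */
#if 0
	np = vfs_export_lookup(mp, nep, nam);
	if (np == NULL)
		return (EACCES);	/* host is not exported to */
	cred = &np->netc_anon;		/* e.g. map the request to anon */
#endif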
2445 
2446 /*
2447  * Do the usual access checking.
2448  * file_mode, uid and gid are from the vnode in question,
2449  * while acc_mode and cred are from the VOP_ACCESS parameter list
2450  */
2451 int
2452 vaccess(type, file_mode, uid, gid, acc_mode, cred)
2453 	enum vtype type;
2454 	mode_t file_mode;
2455 	uid_t uid;
2456 	gid_t gid;
2457 	mode_t acc_mode;
2458 	struct ucred *cred;
2459 {
2460 	mode_t mask;
2461 
2462 	/*
	 * The super-user always gets read/write access; execute access
	 * is granted only if at least one execute bit is set, except on
	 * directories, which the super-user may always search.
2465 	 */
2466 	if (cred->cr_uid == 0) {
2467 		if ((acc_mode & VEXEC) && type != VDIR &&
2468 		    (file_mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)
2469 			return (EACCES);
2470 		return (0);
2471 	}
2472 
2473 	mask = 0;
2474 
2475 	/* Otherwise, check the owner. */
2476 	if (cred->cr_uid == uid) {
2477 		if (acc_mode & VEXEC)
2478 			mask |= S_IXUSR;
2479 		if (acc_mode & VREAD)
2480 			mask |= S_IRUSR;
2481 		if (acc_mode & VWRITE)
2482 			mask |= S_IWUSR;
2483 		return ((file_mode & mask) == mask ? 0 : EACCES);
2484 	}
2485 
2486 	/* Otherwise, check the groups. */
2487 	if (cred->cr_gid == gid || groupmember(gid, cred)) {
2488 		if (acc_mode & VEXEC)
2489 			mask |= S_IXGRP;
2490 		if (acc_mode & VREAD)
2491 			mask |= S_IRGRP;
2492 		if (acc_mode & VWRITE)
2493 			mask |= S_IWGRP;
2494 		return ((file_mode & mask) == mask ? 0 : EACCES);
2495 	}
2496 
2497 	/* Otherwise, check everyone else. */
2498 	if (acc_mode & VEXEC)
2499 		mask |= S_IXOTH;
2500 	if (acc_mode & VREAD)
2501 		mask |= S_IROTH;
2502 	if (acc_mode & VWRITE)
2503 		mask |= S_IWOTH;
2504 	return ((file_mode & mask) == mask ? 0 : EACCES);
2505 }
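
/*
 * Example (illustrative fragment): a file system's access vop usually
 * ends by delegating to vaccess() with its on-disk ownership data, in
 * the style of ufs_access(); "ip" is the assumed in-core inode.
 */
#if 0
	return (vaccess(vp->v_type, ip->i_mode & ALLPERMS, ip->i_uid,
	    ip->i_gid, mode, ap->a_cred));
#endif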
2506 
2507 /*
2508  * Unmount all file systems.
2509  * We traverse the list in reverse order under the assumption that doing so
2510  * will avoid needing to worry about dependencies.
2511  */
2512 void
2513 vfs_unmountall(p)
2514 	struct proc *p;
2515 {
2516 	struct mount *mp, *nmp;
2517 	int allerror, error;
2518 
2519 	for (allerror = 0,
2520 	     mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
2521 		nmp = mp->mnt_list.cqe_prev;
2522 #ifdef DEBUG
2523 		printf("unmounting %s (%s)...\n",
2524 		    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname);
2525 #endif
2526 		/*
2527 		 * XXX Freeze syncer.  Must do this before locking the
2528 		 * mount point.  See dounmount() for details.
2529 		 */
2530 		lockmgr(&syncer_lock, LK_EXCLUSIVE, NULL);
2531 		if (vfs_busy(mp, 0, 0)) {
2532 			lockmgr(&syncer_lock, LK_RELEASE, NULL);
2533 			continue;
2534 		}
2535 		if ((error = dounmount(mp, MNT_FORCE, p)) != 0) {
2536 			printf("unmount of %s failed with error %d\n",
2537 			    mp->mnt_stat.f_mntonname, error);
2538 			allerror = 1;
2539 		}
2540 	}
2541 	if (allerror)
2542 		printf("WARNING: some file systems would not unmount\n");
2543 }
2544 
2545 /*
2546  * Sync and unmount file systems before shutting down.
2547  */
2548 void
2549 vfs_shutdown()
2550 {
2551 	struct buf *bp;
2552 	int iter, nbusy, nbusy_prev = 0, dcount, s;
2553 	struct lwp *l = curlwp;
2554 	struct proc *p;
2555 
2556 	/* XXX we're certainly not running in proc0's context! */
2557 	if (l == NULL || (p = l->l_proc) == NULL)
2558 		p = &proc0;
2559 
2560 	printf("syncing disks... ");
2561 
2562 	/* remove user process from run queue */
2563 	suspendsched();
2564 	(void) spl0();
2565 
2566 	/* avoid coming back this way again if we panic. */
2567 	doing_shutdown = 1;
2568 
2569 	sys_sync(l, NULL, NULL);
2570 
2571 	/* Wait for sync to finish. */
2572 	dcount = 10000;
2573 	for (iter = 0; iter < 20;) {
2574 		nbusy = 0;
2575 		for (bp = &buf[nbuf]; --bp >= buf; ) {
2576 			if ((bp->b_flags & (B_BUSY|B_INVAL|B_READ)) == B_BUSY)
2577 				nbusy++;
2578 			/*
2579 			 * With soft updates, some buffers that are
2580 			 * written will be remarked as dirty until other
2581 			 * buffers are written.
2582 			 */
2583 			if (bp->b_vp && bp->b_vp->v_mount
2584 			    && (bp->b_vp->v_mount->mnt_flag & MNT_SOFTDEP)
2585 			    && (bp->b_flags & B_DELWRI)) {
2586 				s = splbio();
2587 				bremfree(bp);
2588 				bp->b_flags |= B_BUSY;
2589 				splx(s);
2590 				nbusy++;
2591 				bawrite(bp);
2592 				if (dcount-- <= 0) {
2593 					printf("softdep ");
2594 					goto fail;
2595 				}
2596 			}
2597 		}
2598 		if (nbusy == 0)
2599 			break;
2600 		if (nbusy_prev == 0)
2601 			nbusy_prev = nbusy;
2602 		printf("%d ", nbusy);
2603 		tsleep(&nbusy, PRIBIO, "bflush",
2604 		    (iter == 0) ? 1 : hz / 25 * iter);
2605 		if (nbusy >= nbusy_prev) /* we didn't flush anything */
2606 			iter++;
2607 		else
2608 			nbusy_prev = nbusy;
2609 	}
2610 	if (nbusy) {
2611 fail:
2612 #if defined(DEBUG) || defined(DEBUG_HALT_BUSY)
2613 		printf("giving up\nPrinting vnodes for busy buffers\n");
2614 		for (bp = &buf[nbuf]; --bp >= buf; )
2615 			if ((bp->b_flags & (B_BUSY|B_INVAL|B_READ)) == B_BUSY)
2616 				vprint(NULL, bp->b_vp);
2617 
2618 #if defined(DDB) && defined(DEBUG_HALT_BUSY)
2619 		Debugger();
2620 #endif
2621 
2622 #else  /* defined(DEBUG) || defined(DEBUG_HALT_BUSY) */
2623 		printf("giving up\n");
2624 #endif /* defined(DEBUG) || defined(DEBUG_HALT_BUSY) */
2625 		return;
2626 	} else
2627 		printf("done\n");
2628 
2629 	/*
2630 	 * If we've panic'd, don't make the situation potentially
2631 	 * worse by unmounting the file systems.
2632 	 */
2633 	if (panicstr != NULL)
2634 		return;
2635 
2636 	/* Release inodes held by texts before update. */
2637 #ifdef notdef
2638 	vnshutdown();
2639 #endif
2640 	/* Unmount file systems. */
2641 	vfs_unmountall(p);
2642 }
2643 
2644 /*
2645  * Mount the root file system.  If the operator didn't specify a
2646  * file system to use, try all possible file systems until one
2647  * succeeds.
2648  */
2649 int
2650 vfs_mountroot()
2651 {
2652 	struct vfsops *v;
2653 
2654 	if (root_device == NULL)
2655 		panic("vfs_mountroot: root device unknown");
2656 
2657 	switch (root_device->dv_class) {
2658 	case DV_IFNET:
2659 		if (rootdev != NODEV)
2660 			panic("vfs_mountroot: rootdev set for DV_IFNET "
2661 			    "(0x%08x -> %d,%d)", rootdev,
2662 			    major(rootdev), minor(rootdev));
2663 		break;
2664 
2665 	case DV_DISK:
2666 		if (rootdev == NODEV)
2667 			panic("vfs_mountroot: rootdev not set for DV_DISK");
2668 		break;
2669 
2670 	default:
2671 		printf("%s: inappropriate for root file system\n",
2672 		    root_device->dv_xname);
2673 		return (ENODEV);
2674 	}
2675 
2676 	/*
2677 	 * If user specified a file system, use it.
2678 	 */
2679 	if (mountroot != NULL)
2680 		return ((*mountroot)());
2681 
2682 	/*
2683 	 * Try each file system currently configured into the kernel.
2684 	 */
2685 	for (v = LIST_FIRST(&vfs_list); v != NULL; v = LIST_NEXT(v, vfs_list)) {
2686 		if (v->vfs_mountroot == NULL)
2687 			continue;
2688 #ifdef DEBUG
2689 		aprint_normal("mountroot: trying %s...\n", v->vfs_name);
2690 #endif
2691 		if ((*v->vfs_mountroot)() == 0) {
2692 			aprint_normal("root file system type: %s\n",
2693 			    v->vfs_name);
2694 			break;
2695 		}
2696 	}
2697 
2698 	if (v == NULL) {
2699 		printf("no file system for %s", root_device->dv_xname);
2700 		if (root_device->dv_class == DV_DISK)
2701 			printf(" (dev 0x%x)", rootdev);
2702 		printf("\n");
2703 		return (EFTYPE);
2704 	}
2705 	return (0);
2706 }
2707 
2708 /*
2709  * Given a file system name, look up the vfsops for that
 * file system, or return NULL if the file system isn't present
2711  * in the kernel.
2712  */
2713 struct vfsops *
2714 vfs_getopsbyname(name)
2715 	const char *name;
2716 {
2717 	struct vfsops *v;
2718 
2719 	for (v = LIST_FIRST(&vfs_list); v != NULL; v = LIST_NEXT(v, vfs_list)) {
2720 		if (strcmp(v->vfs_name, name) == 0)
2721 			break;
2722 	}
2723 
2724 	return (v);
2725 }
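
/*
 * Example (illustrative fragment): mapping a user-supplied name to an
 * operations vector, as the mount system call does; "fstypename" is
 * the assumed user-supplied string.
 */
#if 0
	struct vfsops *vfsops;

	if ((vfsops = vfs_getopsbyname(fstypename)) == NULL)
		return (ENODEV);	/* file system not configured */
#endif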
2726 
2727 /*
2728  * Establish a file system and initialize it.
2729  */
2730 int
2731 vfs_attach(vfs)
2732 	struct vfsops *vfs;
2733 {
2734 	struct vfsops *v;
	int error = 0;

2738 	/*
2739 	 * Make sure this file system doesn't already exist.
2740 	 */
2741 	LIST_FOREACH(v, &vfs_list, vfs_list) {
2742 		if (strcmp(vfs->vfs_name, v->vfs_name) == 0) {
2743 			error = EEXIST;
2744 			goto out;
2745 		}
2746 	}
2747 
2748 	/*
2749 	 * Initialize the vnode operations for this file system.
2750 	 */
2751 	vfs_opv_init(vfs->vfs_opv_descs);
2752 
2753 	/*
2754 	 * Now initialize the file system itself.
2755 	 */
2756 	(*vfs->vfs_init)();
2757 
2758 	/*
2759 	 * ...and link it into the kernel's list.
2760 	 */
2761 	LIST_INSERT_HEAD(&vfs_list, vfs, vfs_list);
2762 
2763 	/*
2764 	 * Sanity: make sure the reference count is 0.
2765 	 */
2766 	vfs->vfs_refcount = 0;
2767 
2768  out:
2769 	return (error);
2770 }
2771 
2772 /*
2773  * Remove a file system from the kernel.
2774  */
2775 int
2776 vfs_detach(vfs)
2777 	struct vfsops *vfs;
2778 {
2779 	struct vfsops *v;
2780 
2781 	/*
2782 	 * Make sure no one is using the filesystem.
2783 	 */
2784 	if (vfs->vfs_refcount != 0)
2785 		return (EBUSY);
2786 
2787 	/*
2788 	 * ...and remove it from the kernel's list.
2789 	 */
2790 	LIST_FOREACH(v, &vfs_list, vfs_list) {
2791 		if (v == vfs) {
2792 			LIST_REMOVE(v, vfs_list);
2793 			break;
2794 		}
2795 	}
2796 
2797 	if (v == NULL)
2798 		return (ESRCH);
2799 
2800 	/*
2801 	 * Now run the file system-specific cleanups.
2802 	 */
2803 	(*vfs->vfs_done)();
2804 
2805 	/*
2806 	 * Free the vnode operations vector.
2807 	 */
2808 	vfs_opv_free(vfs->vfs_opv_descs);
2809 	return (0);
2810 }
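
/*
 * Example (illustrative sketch): a loadable file system module pairs
 * vfs_attach() at load time with vfs_detach() at unload time;
 * "example_vfsops" is an assumed, fully initialized vfsops.
 */
#if 0
int
example_fs_load(void)
{
	return (vfs_attach(&example_vfsops));
}

int
example_fs_unload(void)
{
	/* Fails with EBUSY while file systems of this type are mounted. */
	return (vfs_detach(&example_vfsops));
}
#endif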
2811 
2812 void
2813 vfs_reinit(void)
2814 {
2815 	struct vfsops *vfs;
2816 
2817 	LIST_FOREACH(vfs, &vfs_list, vfs_list) {
2818 		if (vfs->vfs_reinit) {
2819 			(*vfs->vfs_reinit)();
2820 		}
2821 	}
2822 }
2823 
2824 void
2825 copy_statfs_info(struct statfs *sbp, const struct mount *mp)
2826 {
2827 	const struct statfs *mbp;
2828 
2829 	if (sbp == (mbp = &mp->mnt_stat))
2830 		return;
2831 
2832 	sbp->f_oflags = mbp->f_oflags;
2833 	sbp->f_type = mbp->f_type;
2834 	(void)memcpy(&sbp->f_fsid, &mbp->f_fsid, sizeof(sbp->f_fsid));
2835 	sbp->f_owner = mbp->f_owner;
2836 	sbp->f_flags = mbp->f_flags;
2837 	sbp->f_syncwrites = mbp->f_syncwrites;
2838 	sbp->f_asyncwrites = mbp->f_asyncwrites;
2839 	sbp->f_spare[0] = mbp->f_spare[0];
2840 	(void)memcpy(sbp->f_fstypename, mbp->f_fstypename,
2841 	    sizeof(sbp->f_fstypename));
2842 	(void)memcpy(sbp->f_mntonname, mbp->f_mntonname,
2843 	    sizeof(sbp->f_mntonname));
2844 	(void)memcpy(sbp->f_mntfromname, mp->mnt_stat.f_mntfromname,
2845 	    sizeof(sbp->f_mntfromname));
2846 }
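
/*
 * Example (illustrative fragment): a file system's statfs op fills in
 * its own counters and then lets copy_statfs_info() supply the
 * generic mount-wide fields; "fs" is an assumed superblock and its
 * field names are placeholders.
 */
#if 0
	sbp->f_bsize = fs->fs_bsize;	/* file-system specific fields... */
	sbp->f_bfree = fs->fs_bfree;
	sbp->f_files = fs->fs_nfiles;
	copy_statfs_info(sbp, mp);	/* ...then the generic ones */
#endif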
2847 
2848 int
2849 set_statfs_info(const char *onp, int ukon, const char *fromp, int ukfrom,
2850     struct mount *mp, struct proc *p)
2851 {
2852 	int error;
2853 	size_t size;
2854 	struct statfs *sfs = &mp->mnt_stat;
2855 	int (*fun)(const void *, void *, size_t, size_t *);
2856 
2857 	(void)strncpy(mp->mnt_stat.f_fstypename, mp->mnt_op->vfs_name,
2858 	    sizeof(mp->mnt_stat.f_fstypename));
2859 
2860 	if (onp) {
2861 		struct cwdinfo *cwdi = p->p_cwdi;
2862 		fun = (ukon == UIO_SYSSPACE) ? copystr : copyinstr;
2863 		if (cwdi->cwdi_rdir != NULL) {
2864 			size_t len;
2865 			char *bp;
2866 			char *path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
2867 
2868 			if (!path)
2869 				return ENOMEM;
2870 
2871 			bp = path + MAXPATHLEN;
2872 			*--bp = '\0';
2873 			error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp,
2874 			    path, MAXPATHLEN / 2, 0, p);
2875 			if (error) {
2876 				free(path, M_TEMP);
2877 				return error;
2878 			}
2879 
2880 			len = strlen(bp);
2881 			if (len > sizeof(sfs->f_mntonname) - 1)
2882 				len = sizeof(sfs->f_mntonname) - 1;
2883 			(void)strncpy(sfs->f_mntonname, bp, len);
2884 			free(path, M_TEMP);
2885 
2886 			if (len < sizeof(sfs->f_mntonname) - 1) {
2887 				error = (*fun)(onp, &sfs->f_mntonname[len],
2888 				    sizeof(sfs->f_mntonname) - len - 1, &size);
2889 				if (error)
2890 					return error;
2891 				size += len;
2892 			} else {
2893 				size = len;
2894 			}
2895 		} else {
2896 			error = (*fun)(onp, &sfs->f_mntonname,
2897 			    sizeof(sfs->f_mntonname) - 1, &size);
2898 			if (error)
2899 				return error;
2900 		}
2901 		(void)memset(sfs->f_mntonname + size, 0,
2902 		    sizeof(sfs->f_mntonname) - size);
2903 	}
2904 
2905 	if (fromp) {
2906 		fun = (ukfrom == UIO_SYSSPACE) ? copystr : copyinstr;
2907 		error = (*fun)(fromp, sfs->f_mntfromname,
2908 		    sizeof(sfs->f_mntfromname) - 1, &size);
2909 		if (error)
2910 			return error;
2911 		(void)memset(sfs->f_mntfromname + size, 0,
2912 		    sizeof(sfs->f_mntfromname) - size);
2913 	}
2914 	return 0;
2915 }
2916 
2917 #ifdef DDB
2918 const char buf_flagbits[] =
2919 	"\20\1AGE\2NEEDCOMMIT\3ASYNC\4BAD\5BUSY\6SCANNED\7CALL\10DELWRI"
2920 	"\11DIRTY\12DONE\13EINTR\14ERROR\15GATHERED\16INVAL\17LOCKED\20NOCACHE"
2921 	"\21ORDERED\22CACHE\23PHYS\24RAW\25READ\26TAPE\30WANTED"
2922 	"\32XXX\33VFLUSH";
2923 
2924 void
2925 vfs_buf_print(bp, full, pr)
2926 	struct buf *bp;
2927 	int full;
2928 	void (*pr) __P((const char *, ...));
2929 {
2930 	char buf[1024];
2931 
2932 	(*pr)("  vp %p lblkno 0x%"PRIx64" blkno 0x%"PRIx64" dev 0x%x\n",
2933 		  bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_dev);
2934 
2935 	bitmask_snprintf(bp->b_flags, buf_flagbits, buf, sizeof(buf));
2936 	(*pr)("  error %d flags 0x%s\n", bp->b_error, buf);
2937 
2938 	(*pr)("  bufsize 0x%lx bcount 0x%lx resid 0x%lx\n",
2939 		  bp->b_bufsize, bp->b_bcount, bp->b_resid);
2940 	(*pr)("  data %p saveaddr %p dep %p\n",
2941 		  bp->b_data, bp->b_saveaddr, LIST_FIRST(&bp->b_dep));
2942 	(*pr)("  iodone %p\n", bp->b_iodone);
2943 }
2944 
2945 
2946 const char vnode_flagbits[] =
2947 	"\20\1ROOT\2TEXT\3SYSTEM\4ISTTY\5EXECMAP"
2948 	"\11XLOCK\12XWANT\13BWAIT\14ALIASED"
2949 	"\15DIROP\16LAYER\17ONWORKLIST\20DIRTY";
2950 
2951 const char * const vnode_tags[] = {
2952 	"VT_NON",
2953 	"VT_UFS",
2954 	"VT_NFS",
2955 	"VT_MFS",
2956 	"VT_MSDOSFS",
2957 	"VT_LFS",
2958 	"VT_LOFS",
2959 	"VT_FDESC",
2960 	"VT_PORTAL",
2961 	"VT_NULL",
2962 	"VT_UMAP",
2963 	"VT_KERNFS",
2964 	"VT_PROCFS",
2965 	"VT_AFS",
2966 	"VT_ISOFS",
2967 	"VT_UNION",
2968 	"VT_ADOSFS",
2969 	"VT_EXT2FS",
2970 	"VT_CODA",
2971 	"VT_FILECORE",
2972 	"VT_NTFS",
2973 	"VT_VFS",
2974 	"VT_OVERLAY",
2975 	"VT_SMBFS"
2976 };
2977 
2978 void
2979 vfs_vnode_print(vp, full, pr)
2980 	struct vnode *vp;
2981 	int full;
2982 	void (*pr) __P((const char *, ...));
2983 {
2984 	char buf[256];
2985 	const char *vtype, *vtag;
2986 
2987 	uvm_object_printit(&vp->v_uobj, full, pr);
2988 	bitmask_snprintf(vp->v_flag, vnode_flagbits, buf, sizeof(buf));
2989 	(*pr)("\nVNODE flags %s\n", buf);
2990 	(*pr)("mp %p numoutput %d size 0x%llx\n",
2991 	      vp->v_mount, vp->v_numoutput, vp->v_size);
2992 
2993 	(*pr)("data %p usecount %d writecount %ld holdcnt %ld numoutput %d\n",
2994 	      vp->v_data, vp->v_usecount, vp->v_writecount,
2995 	      vp->v_holdcnt, vp->v_numoutput);
2996 
2997 	vtype = (vp->v_type >= 0 &&
2998 		 vp->v_type < sizeof(vnode_types) / sizeof(vnode_types[0])) ?
2999 		vnode_types[vp->v_type] : "UNKNOWN";
3000 	vtag = (vp->v_tag >= 0 &&
3001 		vp->v_tag < sizeof(vnode_tags) / sizeof(vnode_tags[0])) ?
3002 		vnode_tags[vp->v_tag] : "UNKNOWN";
3003 
3004 	(*pr)("type %s(%d) tag %s(%d) id 0x%lx mount %p typedata %p\n",
3005 	      vtype, vp->v_type, vtag, vp->v_tag,
3006 	      vp->v_id, vp->v_mount, vp->v_mountedhere);
3007 
3008 	if (full) {
3009 		struct buf *bp;
3010 
3011 		(*pr)("clean bufs:\n");
3012 		LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) {
3013 			(*pr)(" bp %p\n", bp);
3014 			vfs_buf_print(bp, full, pr);
3015 		}
3016 
3017 		(*pr)("dirty bufs:\n");
3018 		LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
3019 			(*pr)(" bp %p\n", bp);
3020 			vfs_buf_print(bp, full, pr);
3021 		}
3022 	}
3023 }
3024 #endif
3025