1 /*	$NetBSD: vfs_subr.c,v 1.116 1999/12/15 07:10:32 perseant Exp $	*/
2 
3 /*-
4  * Copyright (c) 1997, 1998 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9  * NASA Ames Research Center.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. All advertising materials mentioning features or use of this software
20  *    must display the following acknowledgement:
21  *	This product includes software developed by the NetBSD
22  *	Foundation, Inc. and its contributors.
23  * 4. Neither the name of The NetBSD Foundation nor the names of its
24  *    contributors may be used to endorse or promote products derived
25  *    from this software without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
28  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
29  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
31  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37  * POSSIBILITY OF SUCH DAMAGE.
38  */
39 
40 /*
41  * Copyright (c) 1989, 1993
42  *	The Regents of the University of California.  All rights reserved.
43  * (c) UNIX System Laboratories, Inc.
44  * All or some portions of this file are derived from material licensed
45  * to the University of California by American Telephone and Telegraph
46  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
47  * the permission of UNIX System Laboratories, Inc.
48  *
49  * Redistribution and use in source and binary forms, with or without
50  * modification, are permitted provided that the following conditions
51  * are met:
52  * 1. Redistributions of source code must retain the above copyright
53  *    notice, this list of conditions and the following disclaimer.
54  * 2. Redistributions in binary form must reproduce the above copyright
55  *    notice, this list of conditions and the following disclaimer in the
56  *    documentation and/or other materials provided with the distribution.
57  * 3. All advertising materials mentioning features or use of this software
58  *    must display the following acknowledgement:
59  *	This product includes software developed by the University of
60  *	California, Berkeley and its contributors.
61  * 4. Neither the name of the University nor the names of its contributors
62  *    may be used to endorse or promote products derived from this software
63  *    without specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
66  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
69  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
70  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
71  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
72  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
73  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
74  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
75  * SUCH DAMAGE.
76  *
77  *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
78  */
79 
80 /*
81  * External virtual filesystem routines
82  */
83 
84 #include "opt_compat_netbsd.h"
85 #include "opt_compat_43.h"
86 
87 #include <sys/param.h>
88 #include <sys/systm.h>
89 #include <sys/proc.h>
90 #include <sys/mount.h>
91 #include <sys/time.h>
92 #include <sys/fcntl.h>
93 #include <sys/vnode.h>
94 #include <sys/stat.h>
95 #include <sys/namei.h>
96 #include <sys/ucred.h>
97 #include <sys/buf.h>
98 #include <sys/errno.h>
99 #include <sys/malloc.h>
100 #include <sys/domain.h>
101 #include <sys/mbuf.h>
102 #include <sys/syscallargs.h>
103 #include <sys/device.h>
104 #include <sys/dirent.h>
105 
106 #include <vm/vm.h>
107 #include <sys/sysctl.h>
108 
109 #include <miscfs/specfs/specdev.h>
110 #include <miscfs/genfs/genfs.h>
111 #include <miscfs/syncfs/syncfs.h>
112 
113 #include <uvm/uvm_extern.h>
114 
115 enum vtype iftovt_tab[16] = {
116 	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
117 	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
118 };
119 int	vttoif_tab[9] = {
120 	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
121 	S_IFSOCK, S_IFIFO, S_IFMT,
122 };
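
/*
 * These tables back the IFTOVT() and VTTOIF() macros in <sys/vnode.h>.
 * Worked example (illustrative): IFTOVT() indexes iftovt_tab with
 * (mode & S_IFMT) >> 12, so for a directory mode such as S_IFDIR|0755:
 *
 *	(S_IFDIR >> 12) == 4,  and iftovt_tab[4] == VDIR
 *	vttoif_tab[VDIR] == S_IFDIR, inverting the mapping
 */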
123 
124 int doforce = 1;		/* 1 => permit forcible unmounting */
125 int prtactive = 0;		/* 1 => print out reclaim of active vnodes */
126 
127 /*
128  * Insq/Remq for the vnode usage lists.
129  */
130 #define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
131 #define	bufremvn(bp) {							\
132 	LIST_REMOVE(bp, b_vnbufs);					\
133 	(bp)->b_vnbufs.le_next = NOLIST;				\
134 }
135 /* TAILQ_HEAD(freelst, vnode) vnode_free_list =	vnode free list (in vnode.h) */
136 struct freelst vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list);
137 struct freelst vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list);
138 
139 struct mntlist mountlist =			/* mounted filesystem list */
140     CIRCLEQ_HEAD_INITIALIZER(mountlist);
141 struct vfs_list_head vfs_list =			/* vfs list */
142 	 LIST_HEAD_INITIALIZER(vfs_list);
143 
144 struct nfs_public nfs_pub;			/* publicly exported FS */
145 
146 struct simplelock mountlist_slock;
147 static struct simplelock mntid_slock;
148 struct simplelock mntvnode_slock;
149 struct simplelock vnode_free_list_slock;
150 struct simplelock spechash_slock;
151 
152 /*
153  * These define the root filesystem and device.
154  */
155 struct mount *rootfs;
156 struct vnode *rootvnode;
157 struct device *root_device;			/* root device */
158 
159 struct pool vnode_pool;				/* memory pool for vnodes */
160 
161 /*
162  * Local declarations.
163  */
164 void insmntque __P((struct vnode *, struct mount *));
165 int getdevvp __P((dev_t, struct vnode **, enum vtype));
166 void vgoneall __P((struct vnode *));
167 
168 static int vfs_hang_addrlist __P((struct mount *, struct netexport *,
169 				  struct export_args *));
170 static int vfs_free_netcred __P((struct radix_node *, void *));
171 static void vfs_free_addrlist __P((struct netexport *));
172 
173 #ifdef DEBUG
174 void printlockedvnodes __P((void));
175 #endif
176 
177 /*
178  * Initialize the vnode management data structures.
179  */
180 void
181 vntblinit()
182 {
183 
184 	simple_lock_init(&mntvnode_slock);
185 	simple_lock_init(&mntid_slock);
186 	simple_lock_init(&spechash_slock);
187 	simple_lock_init(&vnode_free_list_slock);
188 
189 	pool_init(&vnode_pool, sizeof(struct vnode), 0, 0, 0, "vnodepl",
190 	    0, pool_page_alloc_nointr, pool_page_free_nointr, M_VNODE);
191 
192 	/*
193 	 * Initialize the filesystem syncer.
194 	 */
195 	vn_initialize_syncerd();
196 }
197 
198 /*
199  * Mark a mount point as busy. Used to synchronize access and to delay
200  * unmounting. Interlock is not released on failure.
201  */
202 int
203 vfs_busy(mp, flags, interlkp)
204 	struct mount *mp;
205 	int flags;
206 	struct simplelock *interlkp;
207 {
208 	int lkflags;
209 
210 	while (mp->mnt_flag & MNT_UNMOUNT) {
211 		int gone;
212 
213 		if (flags & LK_NOWAIT)
214 			return (ENOENT);
215 		if ((flags & LK_RECURSEFAIL) && mp->mnt_unmounter != NULL
216 		    && mp->mnt_unmounter == curproc)
217 			return (EDEADLK);
218 		if (interlkp)
219 			simple_unlock(interlkp);
220 		/*
221 		 * Since all busy locks are shared except the exclusive
222 		 * lock granted when unmounting, the only place that a
223 		 * wakeup needs to be done is at the release of the
224 		 * exclusive lock at the end of dounmount.
225 		 *
226 		 * XXX MP: add spinlock protecting mnt_wcnt here once you
227 		 * can atomically unlock-and-sleep.
228 		 */
229 		mp->mnt_wcnt++;
230 		tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
231 		mp->mnt_wcnt--;
232 		gone = mp->mnt_flag & MNT_GONE;
233 
234 		if (mp->mnt_wcnt == 0)
235 			wakeup(&mp->mnt_wcnt);
236 		if (interlkp)
237 			simple_lock(interlkp);
238 		if (gone)
239 			return (ENOENT);
240 	}
241 	lkflags = LK_SHARED;
242 	if (interlkp)
243 		lkflags |= LK_INTERLOCK;
244 	if (lockmgr(&mp->mnt_lock, lkflags, interlkp))
245 		panic("vfs_busy: unexpected lock failure");
246 	return (0);
247 }
248 
249 /*
250  * Free a busy filesystem.
251  */
252 void
253 vfs_unbusy(mp)
254 	struct mount *mp;
255 {
256 
257 	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL);
258 }
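
/*
 * Illustrative sketch (not part of the original source): callers
 * typically bracket work on a mount point with vfs_busy()/vfs_unbusy(),
 * handing over the mountlist interlock when walking the mount list,
 * as printlockedvnodes() and sysctl_vnode() below do:
 *
 *	simple_lock(&mountlist_slock);
 *	if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock) == 0) {
 *		... mountlist_slock has been released, mp is busied ...
 *		vfs_unbusy(mp);
 *	}
 */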
259 
260 /*
261  * Lookup a filesystem type, and if found allocate and initialize
262  * a mount structure for it.
263  *
264  * Devname is usually updated by mount(8) after booting.
265  */
266 int
267 vfs_rootmountalloc(fstypename, devname, mpp)
268 	char *fstypename;
269 	char *devname;
270 	struct mount **mpp;
271 {
272 	struct vfsops *vfsp = NULL;
273 	struct mount *mp;
274 
275 	for (vfsp = LIST_FIRST(&vfs_list); vfsp != NULL;
276 	     vfsp = LIST_NEXT(vfsp, vfs_list))
277 		if (!strncmp(vfsp->vfs_name, fstypename, MFSNAMELEN))
278 			break;
279 
280 	if (vfsp == NULL)
281 		return (ENODEV);
282 	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
283 	memset((char *)mp, 0, (u_long)sizeof(struct mount));
284 	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
285 	(void)vfs_busy(mp, LK_NOWAIT, 0);
286 	LIST_INIT(&mp->mnt_vnodelist);
287 	mp->mnt_op = vfsp;
288 	mp->mnt_flag = MNT_RDONLY;
289 	mp->mnt_vnodecovered = NULLVP;
290 	vfsp->vfs_refcount++;
291 	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name, MFSNAMELEN);
292 	mp->mnt_stat.f_mntonname[0] = '/';
293 	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
294 	*mpp = mp;
295 	return (0);
296 }
297 
298 /*
299  * Lookup a mount point by filesystem identifier.
300  */
301 struct mount *
302 vfs_getvfs(fsid)
303 	fsid_t *fsid;
304 {
305 	register struct mount *mp;
306 
307 	simple_lock(&mountlist_slock);
308 	for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
309 	     mp = mp->mnt_list.cqe_next) {
310 		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
311 		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
312 			simple_unlock(&mountlist_slock);
313 			return (mp);
314 		}
315 	}
316 	simple_unlock(&mountlist_slock);
317 	return ((struct mount *)0);
318 }
319 
320 /*
321  * Get a new unique fsid
322  */
323 void
324 vfs_getnewfsid(mp, fstypename)
325 	struct mount *mp;
326 	char *fstypename;
327 {
328 	static u_short xxxfs_mntid;
329 	fsid_t tfsid;
330 	int mtype;
331 
332 	simple_lock(&mntid_slock);
333 	mtype = makefstype(fstypename);
334 	mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
335 	mp->mnt_stat.f_fsid.val[1] = mtype;
336 	if (xxxfs_mntid == 0)
337 		++xxxfs_mntid;
338 	tfsid.val[0] = makedev((nblkdev + mtype) & 0xff, xxxfs_mntid);
339 	tfsid.val[1] = mtype;
340 	if (mountlist.cqh_first != (void *)&mountlist) {
341 		while (vfs_getvfs(&tfsid)) {
342 			tfsid.val[0]++;
343 			xxxfs_mntid++;
344 		}
345 	}
346 	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
347 	simple_unlock(&mntid_slock);
348 }
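
/*
 * Worked example (illustrative): with no collision and xxxfs_mntid == 1,
 * a mount of type `mtype' ends up with
 *
 *	f_fsid.val[0] = makedev((nblkdev + mtype) & 0xff, 1)
 *	f_fsid.val[1] = mtype
 *
 * On a collision, val[0] is incremented (in effect advancing the minor
 * number) until vfs_getvfs() no longer finds a match.
 */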
349 
350 /*
351  * Make a 'unique' number from a mount type name.
352  */
353 long
354 makefstype(type)
355 	char *type;
356 {
357 	long rv;
358 
359 	for (rv = 0; *type; type++) {
360 		rv <<= 2;
361 		rv ^= *type;
362 	}
363 	return rv;
364 }
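
/*
 * Worked example (illustrative): for the name "ffs" the hash unfolds as
 *
 *	rv = 'f'               = 102
 *	rv = (102 << 2) ^ 'f'  = 510
 *	rv = (510 << 2) ^ 's'  = 1931
 *
 * so makefstype("ffs") == 1931.  Distinct names can collide, hence the
 * quotes around `unique' above.
 */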
365 
366 
367 /*
368  * Set vnode attributes to VNOVAL
369  */
370 void
371 vattr_null(vap)
372 	register struct vattr *vap;
373 {
374 
375 	vap->va_type = VNON;
376 
377 	/*
378 	 * Assign individually so that it is safe even if size and
379 	 * sign of each member are varied.
380 	 */
381 	vap->va_mode = VNOVAL;
382 	vap->va_nlink = VNOVAL;
383 	vap->va_uid = VNOVAL;
384 	vap->va_gid = VNOVAL;
385 	vap->va_fsid = VNOVAL;
386 	vap->va_fileid = VNOVAL;
387 	vap->va_size = VNOVAL;
388 	vap->va_blocksize = VNOVAL;
389 	vap->va_atime.tv_sec =
390 	    vap->va_mtime.tv_sec =
391 	    vap->va_ctime.tv_sec = VNOVAL;
392 	vap->va_atime.tv_nsec =
393 	    vap->va_mtime.tv_nsec =
394 	    vap->va_ctime.tv_nsec = VNOVAL;
395 	vap->va_gen = VNOVAL;
396 	vap->va_flags = VNOVAL;
397 	vap->va_rdev = VNOVAL;
398 	vap->va_bytes = VNOVAL;
399 	vap->va_vaflags = 0;
400 }
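
/*
 * Illustrative sketch (hypothetical caller, not from this file):
 * callers of VOP_SETATTR() conventionally null the structure first,
 * so every attribute they do not set reads as VNOVAL ("do not change"):
 *
 *	struct vattr va;
 *
 *	vattr_null(&va);
 *	va.va_size = new_length;	(truncate only)
 *	error = VOP_SETATTR(vp, &va, cred, p);
 */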
401 
402 /*
403  * Routines having to do with the management of the vnode table.
404  */
405 extern int (**dead_vnodeop_p) __P((void *));
406 long numvnodes;
407 
408 /*
409  * Return the next vnode from the free list.
410  */
411 int
412 getnewvnode(tag, mp, vops, vpp)
413 	enum vtagtype tag;
414 	struct mount *mp;
415 	int (**vops) __P((void *));
416 	struct vnode **vpp;
417 {
418 	struct proc *p = curproc;	/* XXX */
419 	struct freelst *listhd;
420 	static int toggle;
421 	struct vnode *vp;
422 	int error = 0;
423 #ifdef DIAGNOSTIC
424 	int s;
425 #endif
426 	if (mp) {
427 		/*
428 		 * Mark filesystem busy while we're creating a vnode.
429 		 * If an unmount is in progress, this will wait; if the
430 		 * unmount succeeds (possible only with umount -f), this
431 		 * will return an error.  If the unmount fails, we'll
432 		 * keep going afterwards.
433 		 * (This puts the per-mount vnode list logically under
434 		 * the protection of the vfs_busy lock).
435 		 */
436 		error = vfs_busy(mp, LK_RECURSEFAIL, 0);
437 		if (error && error != EDEADLK)
438 			return error;
439 	}
440 
441 	/*
442 	 * We must choose whether to allocate a new vnode or recycle an
443 	 * existing one. The criterion for allocating a new one is that
444 	 * the total number of vnodes is less than the number desired or
445 	 * there are no vnodes on either free list. Generally we only
446 	 * want to recycle vnodes that have no buffers associated with
447 	 * them, so we look first on the vnode_free_list. If it is empty,
448 	 * we next consider vnodes with referencing buffers on the
449 	 * vnode_hold_list. The toggle ensures that half the time we
450 	 * will use a buffer from the vnode_hold_list, and half the time
451 	 * we will allocate a new one unless the list has grown to twice
452 	 * the desired size. We are reluctant to recycle vnodes from the
453 	 * vnode_hold_list because we will lose the identity of all its
454 	 * referencing buffers.
455 	 */
456 	toggle ^= 1;
457 	if (numvnodes > 2 * desiredvnodes)
458 		toggle = 0;
459 
460 	simple_lock(&vnode_free_list_slock);
461 	if (numvnodes < desiredvnodes ||
462 	    (TAILQ_FIRST(listhd = &vnode_free_list) == NULL &&
463 	    (TAILQ_FIRST(listhd = &vnode_hold_list) == NULL || toggle))) {
464 		simple_unlock(&vnode_free_list_slock);
465 		vp = pool_get(&vnode_pool, PR_WAITOK);
466 		memset((char *)vp, 0, sizeof(*vp));
467 		simple_lock_init(&vp->v_interlock);
468 		numvnodes++;
469 	} else {
470 		for (vp = TAILQ_FIRST(listhd); vp != NULLVP;
471 		    vp = TAILQ_NEXT(vp, v_freelist)) {
472 			if (simple_lock_try(&vp->v_interlock)) {
473 				if ((vp->v_flag & VLAYER) == 0) {
474 					break;
475 				}
476 				if (VOP_ISLOCKED(vp) == 0)
477 					break;
478 				else
479 					simple_unlock(&vp->v_interlock);
480 			}
481 		}
482 		/*
483 		 * Unless this is a bad time of the month, at most
484 		 * the first NCPUS items on the free list are
485 		 * locked, so this is close enough to being empty.
486 		 */
487 		if (vp == NULLVP) {
488 			simple_unlock(&vnode_free_list_slock);
489 			if (mp && error != EDEADLK)
490 				vfs_unbusy(mp);
491 			tablefull("vnode");
492 			*vpp = 0;
493 			return (ENFILE);
494 		}
495 		if (vp->v_usecount)
496 			panic("free vnode isn't");
497 		TAILQ_REMOVE(listhd, vp, v_freelist);
498 		/* see comment on why 0xdeadb is set at end of vgone (below) */
499 		vp->v_freelist.tqe_prev = (struct vnode **)0xdeadb;
500 		simple_unlock(&vnode_free_list_slock);
501 		vp->v_lease = NULL;
502 		if (vp->v_type != VBAD)
503 			vgonel(vp, p);
504 		else
505 			simple_unlock(&vp->v_interlock);
506 #ifdef DIAGNOSTIC
507 		if (vp->v_data)
508 			panic("cleaned vnode isn't");
509 		s = splbio();
510 		if (vp->v_numoutput)
511 			panic("Clean vnode has pending I/O's");
512 		splx(s);
513 #endif
514 		vp->v_flag = 0;
515 		vp->v_lastr = 0;
516 		vp->v_ralen = 0;
517 		vp->v_maxra = 0;
518 		vp->v_lastw = 0;
519 		vp->v_lasta = 0;
520 		vp->v_cstart = 0;
521 		vp->v_clen = 0;
522 		vp->v_socket = 0;
523 	}
524 	vp->v_type = VNON;
525 	vp->v_vnlock = &vp->v_lock;
526 	lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
527 	cache_purge(vp);
528 	vp->v_tag = tag;
529 	vp->v_op = vops;
530 	insmntque(vp, mp);
531 	*vpp = vp;
532 	vp->v_usecount = 1;
533 	vp->v_data = 0;
534 	simple_lock_init(&vp->v_uvm.u_obj.vmobjlock);
535 	if (mp && error != EDEADLK)
536 		vfs_unbusy(mp);
537 	return (0);
538 }
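
/*
 * Illustrative sketch (hypothetical filesystem code): a typical
 * fs-level vget routine obtains a fresh or recycled vnode and then
 * attaches its per-file data:
 *
 *	error = getnewvnode(VT_UFS, mp, ufs_vnodeop_p, &vp);
 *	if (error)
 *		return (error);
 *	vp->v_data = ip;	(hypothetical inode pointer)
 *
 * getdevvp() below is an in-tree caller, using VT_NON/spec_vnodeop_p.
 */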
539 
540 /*
541  * Move a vnode from one mount queue to another.
542  */
543 void
544 insmntque(vp, mp)
545 	register struct vnode *vp;
546 	register struct mount *mp;
547 {
548 
549 #ifdef DIAGNOSTIC
550 	if ((mp != NULL) &&
551 	    (mp->mnt_flag & MNT_UNMOUNT) &&
552 	    !(mp->mnt_flag & MNT_SOFTDEP) &&
553 	    vp->v_tag != VT_VFS) {
554 		panic("insmntque into dying filesystem");
555 	}
556 #endif
557 
558 	simple_lock(&mntvnode_slock);
559 	/*
560 	 * Delete from old mount point vnode list, if on one.
561 	 */
562 	if (vp->v_mount != NULL)
563 		LIST_REMOVE(vp, v_mntvnodes);
564 	/*
565 	 * Insert into list of vnodes for the new mount point, if available.
566 	 */
567 	if ((vp->v_mount = mp) != NULL)
568 		LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
569 	simple_unlock(&mntvnode_slock);
570 }
571 
572 /*
573  * Update outstanding I/O count and do wakeup if requested.
574  */
575 void
576 vwakeup(bp)
577 	register struct buf *bp;
578 {
579 	register struct vnode *vp;
580 
581 	bp->b_flags &= ~B_WRITEINPROG;
582 	if ((vp = bp->b_vp) != NULL) {
583 		if (--vp->v_numoutput < 0)
584 			panic("vwakeup: neg numoutput");
585 		if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) {
586 			vp->v_flag &= ~VBWAIT;
587 			wakeup((caddr_t)&vp->v_numoutput);
588 		}
589 	}
590 }
591 
592 /*
593  * Flush out and invalidate all buffers associated with a vnode.
594  * Called with the underlying object locked.
595  */
596 int
597 vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
598 	register struct vnode *vp;
599 	int flags;
600 	struct ucred *cred;
601 	struct proc *p;
602 	int slpflag, slptimeo;
603 {
604 	register struct buf *bp;
605 	struct buf *nbp, *blist;
606 	int s, error;
607 
608 	if (flags & V_SAVE) {
609 		s = splbio();
610 		while (vp->v_numoutput) {
611 			vp->v_flag |= VBWAIT;
612 			tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1,
613 			    "vbwait", 0);
614 		}
615 		if (vp->v_dirtyblkhd.lh_first != NULL) {
616 			splx(s);
617 			if ((error = VOP_FSYNC(vp, cred, FSYNC_WAIT, p)) != 0)
618 				return (error);
619 			s = splbio();
620 			if (vp->v_numoutput > 0 ||
621 			    vp->v_dirtyblkhd.lh_first != NULL)
622 				panic("vinvalbuf: dirty bufs");
623 		}
624 		splx(s);
625 	}
626 
627 	s = splbio();
628 
629 	for (;;) {
630 		if ((blist = vp->v_cleanblkhd.lh_first) && (flags & V_SAVEMETA))
631 			while (blist && blist->b_lblkno < 0)
632 				blist = blist->b_vnbufs.le_next;
633 		if (!blist && (blist = vp->v_dirtyblkhd.lh_first) &&
634 		    (flags & V_SAVEMETA)) {
635 			while (blist && blist->b_lblkno < 0)
636 				blist = blist->b_vnbufs.le_next;
637 		}
638 		if (!blist)
639 			break;
640 
641 		for (bp = blist; bp; bp = nbp) {
642 			nbp = bp->b_vnbufs.le_next;
643 			if (flags & V_SAVEMETA && bp->b_lblkno < 0)
644 				continue;
645 			if (bp->b_flags & B_BUSY) {
646 				bp->b_flags |= B_WANTED;
647 				error = tsleep((caddr_t)bp,
648 					slpflag | (PRIBIO + 1), "vinvalbuf",
649 					slptimeo);
650 				if (error) {
651 					splx(s);
652 					return (error);
653 				}
654 				break;
655 			}
656 			bp->b_flags |= B_BUSY | B_VFLUSH;
657 			/*
658 			 * XXX Since there are no node locks for NFS, I believe
659 			 * there is a slight chance that a delayed write will
660 			 * occur while sleeping just above, so check for it.
661 			 */
662 			if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) {
663 				VOP_BWRITE(bp);
664 #ifdef DEBUG
665 				printf("buffer still DELWRI\n");
666 #endif
667 				/* VOP_FSYNC(vp, cred, FSYNC_WAIT, p); */
668 				continue;
669 			}
670 			bp->b_flags |= B_INVAL;
671 			brelse(bp);
672 		}
673 	}
674 
675 	if (!(flags & V_SAVEMETA) &&
676 	    (vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first))
677 		panic("vinvalbuf: flush failed");
678 
679 	splx(s);
680 
681 	return (0);
682 }
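
/*
 * Illustrative note: vclean() below calls this as
 *
 *	vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
 *
 * i.e. flush dirty buffers to disk first (V_SAVE) with no special sleep
 * flags or timeout.  V_SAVEMETA instead preserves metadata: buffers
 * with negative logical block numbers (indirect blocks) are skipped.
 */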
683 
684 void
685 vflushbuf(vp, sync)
686 	register struct vnode *vp;
687 	int sync;
688 {
689 	register struct buf *bp, *nbp;
690 	int s;
691 
692 loop:
693 	s = splbio();
694 	for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) {
695 		nbp = bp->b_vnbufs.le_next;
696 		if ((bp->b_flags & B_BUSY))
697 			continue;
698 		if ((bp->b_flags & B_DELWRI) == 0)
699 			panic("vflushbuf: not dirty");
700 		bp->b_flags |= B_BUSY | B_VFLUSH;
701 		splx(s);
702 		/*
703 		 * Wait for I/O associated with indirect blocks to complete,
704 		 * since there is no way to quickly wait for them below.
705 		 */
706 		if (bp->b_vp == vp || sync == 0)
707 			(void) bawrite(bp);
708 		else
709 			(void) bwrite(bp);
710 		goto loop;
711 	}
712 	if (sync == 0) {
713 		splx(s);
714 		return;
715 	}
716 	while (vp->v_numoutput) {
717 		vp->v_flag |= VBWAIT;
718 		tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "vflushbuf", 0);
719 	}
720 	splx(s);
721 	if (vp->v_dirtyblkhd.lh_first != NULL) {
722 		vprint("vflushbuf: dirty", vp);
723 		goto loop;
724 	}
725 }
726 
727 /*
728  * Associate a buffer with a vnode.
729  */
730 void
731 bgetvp(vp, bp)
732 	register struct vnode *vp;
733 	register struct buf *bp;
734 {
735 	int s;
736 
737 	if (bp->b_vp)
738 		panic("bgetvp: not free");
739 	VHOLD(vp);
740 	s = splbio();
741 	bp->b_vp = vp;
742 	if (vp->v_type == VBLK || vp->v_type == VCHR)
743 		bp->b_dev = vp->v_rdev;
744 	else
745 		bp->b_dev = NODEV;
746 	/*
747 	 * Insert onto list for new vnode.
748 	 */
749 	bufinsvn(bp, &vp->v_cleanblkhd);
750 	splx(s);
751 }
752 
753 /*
754  * Disassociate a buffer from a vnode.
755  */
756 void
757 brelvp(bp)
758 	register struct buf *bp;
759 {
760 	struct vnode *vp;
761 	int s;
762 
763 	if (bp->b_vp == (struct vnode *) 0)
764 		panic("brelvp: NULL");
765 
766 	s = splbio();
767 	vp = bp->b_vp;
768 	/*
769 	 * Delete from old vnode list, if on one.
770 	 */
771 	if (bp->b_vnbufs.le_next != NOLIST)
772 		bufremvn(bp);
773 	if ((vp->v_flag & VONWORKLST) && LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
774 		vp->v_flag &= ~VONWORKLST;
775 		LIST_REMOVE(vp, v_synclist);
776 	}
777 	bp->b_vp = (struct vnode *) 0;
778 	HOLDRELE(vp);
779 	splx(s);
780 }
781 
782 /*
783  * Reassign a buffer from one vnode to another.
784  * Used to assign file specific control information
785  * (indirect blocks) to the vnode to which they belong.
786  *
787  * This function must be called at splbio().
788  */
789 void
790 reassignbuf(bp, newvp)
791 	struct buf *bp;
792 	struct vnode *newvp;
793 {
794 	struct buflists *listheadp;
795 	int delay;
796 
797 	if (newvp == NULL) {
798 		printf("reassignbuf: NULL");
799 		return;
800 	}
801 
802 	/*
803 	 * Delete from old vnode list, if on one.
804 	 */
805 	if (bp->b_vnbufs.le_next != NOLIST)
806 		bufremvn(bp);
807 	/*
808 	 * If dirty, put on list of dirty buffers;
809 	 * otherwise insert onto list of clean buffers.
810 	 */
811 	if ((bp->b_flags & B_DELWRI) == 0) {
812 		listheadp = &newvp->v_cleanblkhd;
813 		if ((newvp->v_flag & VONWORKLST) &&
814 		    LIST_FIRST(&newvp->v_dirtyblkhd) == NULL) {
815 			newvp->v_flag &= ~VONWORKLST;
816 			LIST_REMOVE(newvp, v_synclist);
817 		}
818 	} else {
819 		listheadp = &newvp->v_dirtyblkhd;
820 		if ((newvp->v_flag & VONWORKLST) == 0) {
821 			switch (newvp->v_type) {
822 			case VDIR:
823 				delay = dirdelay;
824 				break;
825 			case VBLK:
826 				if (newvp->v_specmountpoint != NULL) {
827 					delay = metadelay;
828 					break;
829 				}
830 				/* fall through */
831 			default:
832 				delay = filedelay;
833 			}
834 			vn_syncer_add_to_worklist(newvp, delay);
835 		}
836 	}
837 	bufinsvn(bp, listheadp);
838 }
839 
840 /*
841  * Create a vnode for a block device.
842  * Used for root filesystem and swap areas.
843  * Also used for memory file system special devices.
844  */
845 int
846 bdevvp(dev, vpp)
847 	dev_t dev;
848 	struct vnode **vpp;
849 {
850 
851 	return (getdevvp(dev, vpp, VBLK));
852 }
853 
854 /*
855  * Create a vnode for a character device.
856  * Used for kernfs and some console handling.
857  */
858 int
859 cdevvp(dev, vpp)
860 	dev_t dev;
861 	struct vnode **vpp;
862 {
863 
864 	return (getdevvp(dev, vpp, VCHR));
865 }
866 
867 /*
868  * Create a vnode for a device.
869  * Used by bdevvp (block device) for root file system etc.,
870  * and by cdevvp (character device) for console and kernfs.
871  */
872 int
873 getdevvp(dev, vpp, type)
874 	dev_t dev;
875 	struct vnode **vpp;
876 	enum vtype type;
877 {
878 	register struct vnode *vp;
879 	struct vnode *nvp;
880 	int error;
881 
882 	if (dev == NODEV) {
883 		*vpp = NULLVP;
884 		return (0);
885 	}
886 	error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp);
887 	if (error) {
888 		*vpp = NULLVP;
889 		return (error);
890 	}
891 	vp = nvp;
892 	vp->v_type = type;
893 	if ((nvp = checkalias(vp, dev, NULL)) != 0) {
894 		vput(vp);
895 		vp = nvp;
896 	}
897 	*vpp = vp;
898 	return (0);
899 }
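
/*
 * Illustrative sketch (hypothetical caller): machine-dependent root
 * setup typically obtains the root device vnode with
 *
 *	if (bdevvp(rootdev, &rootvp))
 *		panic("cannot set up root vnode");
 *
 * where rootdev/rootvp are the kernel's root device variables; the
 * panic message is invented for the example.
 */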
900 
901 /*
902  * Check to see if the new vnode represents a special device
903  * for which we already have a vnode (either because of
904  * bdevvp() or because of a different vnode representing
905  * the same block device). If such an alias exists, deallocate
906  * the existing contents and return the aliased vnode. The
907  * caller is responsible for filling it with its new contents.
908  */
909 struct vnode *
910 checkalias(nvp, nvp_rdev, mp)
911 	register struct vnode *nvp;
912 	dev_t nvp_rdev;
913 	struct mount *mp;
914 {
915 	struct proc *p = curproc;       /* XXX */
916 	register struct vnode *vp;
917 	struct vnode **vpp;
918 
919 	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
920 		return (NULLVP);
921 
922 	vpp = &speclisth[SPECHASH(nvp_rdev)];
923 loop:
924 	simple_lock(&spechash_slock);
925 	for (vp = *vpp; vp; vp = vp->v_specnext) {
926 		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
927 			continue;
928 		/*
929 		 * Alias, but not in use, so flush it out.
930 		 */
931 		simple_lock(&vp->v_interlock);
932 		if (vp->v_usecount == 0) {
933 			simple_unlock(&spechash_slock);
934 			vgonel(vp, p);
935 			goto loop;
936 		}
937 		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK)) {
938 			simple_unlock(&spechash_slock);
939 			goto loop;
940 		}
941 		break;
942 	}
943 	if (vp == NULL || vp->v_tag != VT_NON || vp->v_type != VBLK) {
944 		MALLOC(nvp->v_specinfo, struct specinfo *,
945 			sizeof(struct specinfo), M_VNODE, M_WAITOK);
946 		nvp->v_rdev = nvp_rdev;
947 		nvp->v_hashchain = vpp;
948 		nvp->v_specnext = *vpp;
949 		nvp->v_specmountpoint = NULL;
950 		simple_unlock(&spechash_slock);
951 		nvp->v_speclockf = NULL;
952 		*vpp = nvp;
953 		if (vp != NULLVP) {
954 			nvp->v_flag |= VALIASED;
955 			vp->v_flag |= VALIASED;
956 			vput(vp);
957 		}
958 		return (NULLVP);
959 	}
960 	simple_unlock(&spechash_slock);
961 	VOP_UNLOCK(vp, 0);
962 	simple_lock(&vp->v_interlock);
963 	vclean(vp, 0, p);
964 	vp->v_op = nvp->v_op;
965 	vp->v_tag = nvp->v_tag;
966 	vp->v_vnlock = &vp->v_lock;
967 	lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
968 	nvp->v_type = VNON;
969 	insmntque(vp, mp);
970 	return (vp);
971 }
972 
973 /*
974  * Grab a particular vnode from the free list, increment its
975  * reference count and lock it. If the vnode lock bit is set the
976  * vnode is being eliminated in vgone. In that case, we can not
977  * grab the vnode, so the process is awakened when the transition is
978  * completed, and an error returned to indicate that the vnode is no
979  * longer usable (possibly having been changed to a new file system type).
980  */
981 int
982 vget(vp, flags)
983 	struct vnode *vp;
984 	int flags;
985 {
986 	int error;
987 
988 	/*
989 	 * If the vnode is in the process of being cleaned out for
990 	 * another use, we wait for the cleaning to finish and then
991 	 * return failure. Cleaning is determined by checking that
992 	 * the VXLOCK flag is set.
993 	 */
994 	if ((flags & LK_INTERLOCK) == 0)
995 		simple_lock(&vp->v_interlock);
996 	if (vp->v_flag & VXLOCK) {
997 		vp->v_flag |= VXWANT;
998 		simple_unlock(&vp->v_interlock);
999 		tsleep((caddr_t)vp, PINOD, "vget", 0);
1000 		return (ENOENT);
1001 	}
1002 	if (vp->v_usecount == 0) {
1003 		simple_lock(&vnode_free_list_slock);
1004 		if (vp->v_holdcnt > 0)
1005 			TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
1006 		else
1007 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1008 		simple_unlock(&vnode_free_list_slock);
1009 	}
1010 	vp->v_usecount++;
1011 #ifdef DIAGNOSTIC
1012 	if (vp->v_usecount == 0) {
1013 		vprint("vget", vp);
1014 		panic("vget: usecount overflow");
1015 	}
1016 #endif
1017 	if (flags & LK_TYPE_MASK) {
1018 		if ((error = vn_lock(vp, flags | LK_INTERLOCK))) {
1019 			/*
1020 			 * must expand vrele here because we do not want
1021 			 * to call VOP_INACTIVE if the reference count
1022 			 * drops back to zero since it was never really
1023 			 * active. We must remove it from the free list
1024 			 * before sleeping so that multiple processes do
1025 			 * not try to recycle it.
1026 			 */
1027 			simple_lock(&vp->v_interlock);
1028 			vp->v_usecount--;
1029 			if (vp->v_usecount > 0) {
1030 				simple_unlock(&vp->v_interlock);
1031 				return (error);
1032 			}
1033 			/*
1034 			 * insert at tail of LRU list
1035 			 */
1036 			simple_lock(&vnode_free_list_slock);
1037 			if (vp->v_holdcnt > 0)
1038 				TAILQ_INSERT_TAIL(&vnode_hold_list, vp,
1039 				    v_freelist);
1040 			else
1041 				TAILQ_INSERT_TAIL(&vnode_free_list, vp,
1042 				    v_freelist);
1043 			simple_unlock(&vnode_free_list_slock);
1044 			simple_unlock(&vp->v_interlock);
1045 		}
1046 		return (error);
1047 	}
1048 	simple_unlock(&vp->v_interlock);
1049 	return (0);
1050 }
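
/*
 * Illustrative sketch: hash-lookup code holds the vnode interlock while
 * deciding to take a reference, then passes it in via LK_INTERLOCK, as
 * checkalias() above does:
 *
 *	simple_lock(&vp->v_interlock);
 *	if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK))
 *		goto loop;	(vnode was being recycled; retry)
 */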
1051 
1052 /*
1053  * vput(), just unlock and vrele()
1054  */
1055 void
1056 vput(vp)
1057 	struct vnode *vp;
1058 {
1059 	struct proc *p = curproc;	/* XXX */
1060 
1061 #ifdef DIAGNOSTIC
1062 	if (vp == NULL)
1063 		panic("vput: null vp");
1064 #endif
1065 	simple_lock(&vp->v_interlock);
1066 	vp->v_usecount--;
1067 	if (vp->v_usecount > 0) {
1068 		simple_unlock(&vp->v_interlock);
1069 		VOP_UNLOCK(vp, 0);
1070 		return;
1071 	}
1072 #ifdef DIAGNOSTIC
1073 	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
1074 		vprint("vput: bad ref count", vp);
1075 		panic("vput: ref cnt");
1076 	}
1077 #endif
1078 	/*
1079 	 * Insert at tail of LRU list.
1080 	 */
1081 	simple_lock(&vnode_free_list_slock);
1082 	if (vp->v_holdcnt > 0)
1083 		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
1084 	else
1085 		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
1086 	simple_unlock(&vnode_free_list_slock);
1087 	simple_unlock(&vp->v_interlock);
1088 	VOP_INACTIVE(vp, p);
1089 }
1090 
1091 /*
1092  * Vnode release.
1093  * If count drops to zero, call inactive routine and return to freelist.
1094  */
1095 void
1096 vrele(vp)
1097 	struct vnode *vp;
1098 {
1099 	struct proc *p = curproc;	/* XXX */
1100 
1101 #ifdef DIAGNOSTIC
1102 	if (vp == NULL)
1103 		panic("vrele: null vp");
1104 #endif
1105 	simple_lock(&vp->v_interlock);
1106 	vp->v_usecount--;
1107 	if (vp->v_usecount > 0) {
1108 		simple_unlock(&vp->v_interlock);
1109 		return;
1110 	}
1111 #ifdef DIAGNOSTIC
1112 	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
1113 		vprint("vrele: bad ref count", vp);
1114 		panic("vrele: ref cnt");
1115 	}
1116 #endif
1117 	/*
1118 	 * Insert at tail of LRU list.
1119 	 */
1120 	simple_lock(&vnode_free_list_slock);
1121 	if (vp->v_holdcnt > 0)
1122 		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
1123 	else
1124 		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
1125 	simple_unlock(&vnode_free_list_slock);
1126 	if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0)
1127 		VOP_INACTIVE(vp, p);
1128 }
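
/*
 * Illustrative note: vput() above is for callers that hold the vnode
 * lock (it unlocks, then releases); vrele() is for callers holding only
 * a reference.  Calling vrele() on a vnode you have locked risks
 * deadlock when it relocks the vnode for VOP_INACTIVE().
 */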
1129 
1130 #ifdef DIAGNOSTIC
1131 /*
1132  * Page or buffer structure gets a reference.
1133  */
1134 void
1135 vhold(vp)
1136 	register struct vnode *vp;
1137 {
1138 
1139 	/*
1140 	 * If it is on the freelist and the hold count is currently
1141 	 * zero, move it to the hold list. The test of the back
1142 	 * pointer and the use reference count of zero is because
1143 	 * it will be removed from a free list by getnewvnode,
1144 	 * but will not have its reference count incremented until
1145 	 * after calling vgone. If the reference count were
1146 	 * incremented first, vgone would (incorrectly) try to
1147 	 * close the previous instance of the underlying object.
1148 	 * So, the back pointer is explicitly set to `0xdeadb' in
1149 	 * getnewvnode after removing it from a freelist to ensure
1150 	 * that we do not try to move it here.
1151 	 */
1152   	simple_lock(&vp->v_interlock);
1153 	if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
1154 	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
1155 		simple_lock(&vnode_free_list_slock);
1156 		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1157 		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
1158 		simple_unlock(&vnode_free_list_slock);
1159 	}
1160 	vp->v_holdcnt++;
1161 	simple_unlock(&vp->v_interlock);
1162 }
1163 
1164 /*
1165  * Page or buffer structure frees a reference.
1166  */
1167 void
1168 holdrele(vp)
1169 	register struct vnode *vp;
1170 {
1171 
1172 	simple_lock(&vp->v_interlock);
1173 	if (vp->v_holdcnt <= 0)
1174 		panic("holdrele: holdcnt");
1175 	vp->v_holdcnt--;
1176 	/*
1177 	 * If it is on the holdlist and the hold count drops to
1178 	 * zero, move it to the free list. The test of the back
1179 	 * pointer and the use reference count of zero is because
1180 	 * it will be removed from a free list by getnewvnode,
1181 	 * but will not have its reference count incremented until
1182 	 * after calling vgone. If the reference count were
1183 	 * incremented first, vgone would (incorrectly) try to
1184 	 * close the previous instance of the underlying object.
1185 	 * So, the back pointer is explicitly set to `0xdeadb' in
1186 	 * getnewvnode after removing it from a freelist to ensure
1187 	 * that we do not try to move it here.
1188 	 */
1189 	if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
1190 	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
1191 		simple_lock(&vnode_free_list_slock);
1192 		TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
1193 		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
1194 		simple_unlock(&vnode_free_list_slock);
1195 	}
1196 	simple_unlock(&vp->v_interlock);
1197 }
1198 
1199 /*
1200  * Vnode reference.
1201  */
1202 void
1203 vref(vp)
1204 	struct vnode *vp;
1205 {
1206 
1207 	simple_lock(&vp->v_interlock);
1208 	if (vp->v_usecount <= 0)
1209 		panic("vref used where vget required");
1210 	vp->v_usecount++;
1211 #ifdef DIAGNOSTIC
1212 	if (vp->v_usecount == 0) {
1213 		vprint("vref", vp);
1214 		panic("vref: usecount overflow");
1215 	}
1216 #endif
1217 	simple_unlock(&vp->v_interlock);
1218 }
1219 #endif /* DIAGNOSTIC */
1220 
1221 /*
1222  * Remove any vnodes in the vnode table belonging to mount point mp.
1223  *
1224  * If MNT_NOFORCE is specified, there should not be any active ones,
1225  * return error if any are found (nb: this is a user error, not a
1226  * system error). If MNT_FORCE is specified, detach any active vnodes
1227  * that are found.
1228  */
1229 #ifdef DEBUG
1230 int busyprt = 0;	/* print out busy vnodes */
1231 struct ctldebug debug1 = { "busyprt", &busyprt };
1232 #endif
1233 
1234 int
1235 vflush(mp, skipvp, flags)
1236 	struct mount *mp;
1237 	struct vnode *skipvp;
1238 	int flags;
1239 {
1240 	struct proc *p = curproc;	/* XXX */
1241 	register struct vnode *vp, *nvp;
1242 	int busy = 0;
1243 
1244 	simple_lock(&mntvnode_slock);
1245 loop:
1246 	for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
1247 		if (vp->v_mount != mp)
1248 			goto loop;
1249 		nvp = vp->v_mntvnodes.le_next;
1250 		/*
1251 		 * Skip over a selected vnode.
1252 		 */
1253 		if (vp == skipvp)
1254 			continue;
1255 		simple_lock(&vp->v_interlock);
1256 		/*
1257 		 * Skip over vnodes marked VSYSTEM.
1258 		 */
1259 		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
1260 			simple_unlock(&vp->v_interlock);
1261 			continue;
1262 		}
1263 		/*
1264 		 * If WRITECLOSE is set, only flush out regular file
1265 		 * vnodes open for writing.
1266 		 */
1267 		if ((flags & WRITECLOSE) &&
1268 		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
1269 			simple_unlock(&vp->v_interlock);
1270 			continue;
1271 		}
1272 		/*
1273 		 * With v_usecount == 0, all we need to do is clear
1274 		 * out the vnode data structures and we are done.
1275 		 */
1276 		if (vp->v_usecount == 0) {
1277 			simple_unlock(&mntvnode_slock);
1278 			vgonel(vp, p);
1279 			simple_lock(&mntvnode_slock);
1280 			continue;
1281 		}
1282 		/*
1283 		 * If FORCECLOSE is set, forcibly close the vnode.
1284 		 * For block or character devices, revert to an
1285 		 * anonymous device. For all other files, just kill them.
1286 		 */
1287 		if (flags & FORCECLOSE) {
1288 			simple_unlock(&mntvnode_slock);
1289 			if (vp->v_type != VBLK && vp->v_type != VCHR) {
1290 				vgonel(vp, p);
1291 			} else {
1292 				vclean(vp, 0, p);
1293 				vp->v_op = spec_vnodeop_p;
1294 				insmntque(vp, (struct mount *)0);
1295 			}
1296 			simple_lock(&mntvnode_slock);
1297 			continue;
1298 		}
1299 #ifdef DEBUG
1300 		if (busyprt)
1301 			vprint("vflush: busy vnode", vp);
1302 #endif
1303 		simple_unlock(&vp->v_interlock);
1304 		busy++;
1305 	}
1306 	simple_unlock(&mntvnode_slock);
1307 	if (busy)
1308 		return (EBUSY);
1309 	return (0);
1310 }
1311 
1312 /*
1313  * Disassociate the underlying file system from a vnode.
1314  */
1315 void
1316 vclean(vp, flags, p)
1317 	register struct vnode *vp;
1318 	int flags;
1319 	struct proc *p;
1320 {
1321 	int active;
1322 
1323 	/*
1324 	 * Check to see if the vnode is in use.
1325 	 * If so we have to reference it before we clean it out
1326 	 * so that its count cannot fall to zero and generate a
1327 	 * race against ourselves to recycle it.
1328 	 */
1329 	if ((active = vp->v_usecount) != 0) {
1330 		/* We have the vnode interlock. */
1331 		vp->v_usecount++;
1332 #ifdef DIAGNOSTIC
1333 		if (vp->v_usecount == 0) {
1334 			vprint("vclean", vp);
1335 			panic("vclean: usecount overflow");
1336 		}
1337 #endif
1338 	}
1339 
1340 	/*
1341 	 * Prevent the vnode from being recycled or
1342 	 * brought into use while we clean it out.
1343 	 */
1344 	if (vp->v_flag & VXLOCK)
1345 		panic("vclean: deadlock");
1346 	vp->v_flag |= VXLOCK;
1347 	/*
1348 	 * Even if the count is zero, the VOP_INACTIVE routine may still
1349 	 * have the object locked while it cleans it out. The VOP_LOCK
1350 	 * ensures that the VOP_INACTIVE routine is done with its work.
1351 	 * For active vnodes, it ensures that no other activity can
1352 	 * occur while the underlying object is being cleaned out.
1353 	 */
1354 	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK);
1355 
1356 	/*
1357 	 * clean out any VM data associated with the vnode.
1358 	 */
1359 	uvm_vnp_terminate(vp);
1360 	/*
1361 	 * Clean out any buffers associated with the vnode.
1362 	 */
1363 	if (flags & DOCLOSE)
1364 		vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
1365 
1366 	/*
1367 	 * If purging an active vnode, it must be closed and
1368 	 * deactivated before being reclaimed. Note that the
1369 	 * VOP_INACTIVE will unlock the vnode.
1370 	 */
1371 	if (active) {
1372 		if (flags & DOCLOSE)
1373 			VOP_CLOSE(vp, FNONBLOCK, NOCRED, NULL);
1374 		VOP_INACTIVE(vp, p);
1375 	} else {
1376 		/*
1377 		 * Any other processes trying to obtain this lock must first
1378 		 * wait for VXLOCK to clear, then call the new lock operation.
1379 		 */
1380 		VOP_UNLOCK(vp, 0);
1381 	}
1382 	/*
1383 	 * Reclaim the vnode.
1384 	 */
1385 	if (VOP_RECLAIM(vp, p))
1386 		panic("vclean: cannot reclaim");
1387 
1388 	if (active) {
1389 		/*
1390 		 * Inline copy of vrele() since VOP_INACTIVE
1391 		 * has already been called.
1392 		 */
1393 		simple_lock(&vp->v_interlock);
1394 		if (--vp->v_usecount <= 0) {
1395 #ifdef DIAGNOSTIC
1396 			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
1397 				vprint("vclean: bad ref count", vp);
1398 				panic("vclean: ref cnt");
1399 			}
1400 #endif
1401 			/*
1402 			 * Insert at tail of LRU list.
1403 			 */
1404 			simple_unlock(&vp->v_interlock);
1405 			simple_lock(&vnode_free_list_slock);
1406 #ifdef DIAGNOSTIC
1407 			if (vp->v_vnlock) {
1408 				if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0)
1409 					vprint("vclean: lock not drained", vp);
1410 			}
1411 			if (vp->v_holdcnt > 0)
1412 				panic("vclean: not clean");
1413 #endif
1414 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
1415 			simple_unlock(&vnode_free_list_slock);
1416 		} else
1417 			simple_unlock(&vp->v_interlock);
1418 	}
1419 
1420 	cache_purge(vp);
1421 
1422 	/*
1423 	 * Done with purge, notify sleepers of the grim news.
1424 	 */
1425 	vp->v_op = dead_vnodeop_p;
1426 	vp->v_tag = VT_NON;
1427 	vp->v_flag &= ~VXLOCK;
1428 	if (vp->v_flag & VXWANT) {
1429 		vp->v_flag &= ~VXWANT;
1430 		wakeup((caddr_t)vp);
1431 	}
1432 }
1433 
1434 /*
1435  * Recycle an unused vnode to the front of the free list.
1436  * Release the passed interlock if the vnode will be recycled.
1437  */
1438 int
1439 vrecycle(vp, inter_lkp, p)
1440 	struct vnode *vp;
1441 	struct simplelock *inter_lkp;
1442 	struct proc *p;
1443 {
1444 
1445 	simple_lock(&vp->v_interlock);
1446 	if (vp->v_usecount == 0) {
1447 		if (inter_lkp)
1448 			simple_unlock(inter_lkp);
1449 		vgonel(vp, p);
1450 		return (1);
1451 	}
1452 	simple_unlock(&vp->v_interlock);
1453 	return (0);
1454 }
1455 
1456 /*
1457  * Eliminate all activity associated with a vnode
1458  * in preparation for reuse.
1459  */
1460 void
1461 vgone(vp)
1462 	struct vnode *vp;
1463 {
1464 	struct proc *p = curproc;	/* XXX */
1465 
1466 	simple_lock(&vp->v_interlock);
1467 	vgonel(vp, p);
1468 }
1469 
1470 /*
1471  * vgone, with the vp interlock held.
1472  */
1473 void
1474 vgonel(vp, p)
1475 	register struct vnode *vp;
1476 	struct proc *p;
1477 {
1478 	struct vnode *vq;
1479 	struct vnode *vx;
1480 
1481 	/*
1482 	 * If a vgone (or vclean) is already in progress,
1483 	 * wait until it is done and return.
1484 	 */
1485 	if (vp->v_flag & VXLOCK) {
1486 		vp->v_flag |= VXWANT;
1487 		simple_unlock(&vp->v_interlock);
1488 		tsleep((caddr_t)vp, PINOD, "vgone", 0);
1489 		return;
1490 	}
1491 	/*
1492 	 * Clean out the filesystem specific data.
1493 	 */
1494 	vclean(vp, DOCLOSE, p);
1495 	/*
1496 	 * Delete from old mount point vnode list, if on one.
1497 	 */
1498 	if (vp->v_mount != NULL)
1499 		insmntque(vp, (struct mount *)0);
1500 	/*
1501 	 * If special device, remove it from special device alias list,
1502 	 * if it is on one.
1503 	 */
1504 	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
1505 		simple_lock(&spechash_slock);
1506 		if (vp->v_hashchain != NULL) {
1507 			if (*vp->v_hashchain == vp) {
1508 				*vp->v_hashchain = vp->v_specnext;
1509 			} else {
1510 				for (vq = *vp->v_hashchain; vq;
1511 							vq = vq->v_specnext) {
1512 					if (vq->v_specnext != vp)
1513 						continue;
1514 					vq->v_specnext = vp->v_specnext;
1515 					break;
1516 				}
1517 				if (vq == NULL)
1518 					panic("missing bdev");
1519 			}
1520 			if (vp->v_flag & VALIASED) {
1521 				vx = NULL;
1522 				for (vq = *vp->v_hashchain; vq;
1523 							vq = vq->v_specnext) {
1524 					if (vq->v_rdev != vp->v_rdev ||
1525 					    vq->v_type != vp->v_type)
1526 						continue;
1527 					if (vx)
1528 						break;
1529 					vx = vq;
1530 				}
1531 				if (vx == NULL)
1532 					panic("missing alias");
1533 				if (vq == NULL)
1534 					vx->v_flag &= ~VALIASED;
1535 				vp->v_flag &= ~VALIASED;
1536 			}
1537 		}
1538 		simple_unlock(&spechash_slock);
1539 		FREE(vp->v_specinfo, M_VNODE);
1540 		vp->v_specinfo = NULL;
1541 	}
1542 	/*
1543 	 * If it is on the freelist and not already at the head,
1544 	 * move it to the head of the list. The test of the back
1545 	 * pointer and the reference count of zero is because
1546 	 * it will be removed from the free list by getnewvnode,
1547 	 * but will not have its reference count incremented until
1548 	 * after calling vgone. If the reference count were
1549 	 * incremented first, vgone would (incorrectly) try to
1550 	 * close the previous instance of the underlying object.
1551 	 * So, the back pointer is explicitly set to `0xdeadb' in
1552 	 * getnewvnode after removing it from the freelist to ensure
1553 	 * that we do not try to move it here.
1554 	 */
1555 	if (vp->v_usecount == 0) {
1556 		simple_lock(&vnode_free_list_slock);
1557 		if (vp->v_holdcnt > 0)
1558 			panic("vgonel: not clean");
1559 		if (vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb &&
1560 		    TAILQ_FIRST(&vnode_free_list) != vp) {
1561 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1562 			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1563 		}
1564 		simple_unlock(&vnode_free_list_slock);
1565 	}
1566 	vp->v_type = VBAD;
1567 }
1568 
1569 /*
1570  * Lookup a vnode by device number.
1571  */
1572 int
1573 vfinddev(dev, type, vpp)
1574 	dev_t dev;
1575 	enum vtype type;
1576 	struct vnode **vpp;
1577 {
1578 	struct vnode *vp;
1579 	int rc = 0;
1580 
1581 	simple_lock(&spechash_slock);
1582 	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
1583 		if (dev != vp->v_rdev || type != vp->v_type)
1584 			continue;
1585 		*vpp = vp;
1586 		rc = 1;
1587 		break;
1588 	}
1589 	simple_unlock(&spechash_slock);
1590 	return (rc);
1591 }
1592 
1593 /*
1594  * Revoke all the vnodes corresponding to the specified minor number
1595  * range (endpoints inclusive) of the specified major.
1596  */
1597 void
1598 vdevgone(maj, minl, minh, type)
1599 	int maj, minl, minh;
1600 	enum vtype type;
1601 {
1602 	struct vnode *vp;
1603 	int mn;
1604 
1605 	for (mn = minl; mn <= minh; mn++)
1606 		if (vfinddev(makedev(maj, mn), type, &vp))
1607 			VOP_REVOKE(vp, REVOKEALL);
1608 }
1609 
1610 /*
1611  * Calculate the total number of references to a special device.
1612  */
1613 int
1614 vcount(vp)
1615 	register struct vnode *vp;
1616 {
1617 	register struct vnode *vq, *vnext;
1618 	int count;
1619 
1620 loop:
1621 	if ((vp->v_flag & VALIASED) == 0)
1622 		return (vp->v_usecount);
1623 	simple_lock(&spechash_slock);
1624 	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
1625 		vnext = vq->v_specnext;
1626 		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
1627 			continue;
1628 		/*
1629 		 * Alias, but not in use, so flush it out.
1630 		 */
1631 		if (vq->v_usecount == 0 && vq != vp) {
1632 			simple_unlock(&spechash_slock);
1633 			vgone(vq);
1634 			goto loop;
1635 		}
1636 		count += vq->v_usecount;
1637 	}
1638 	simple_unlock(&spechash_slock);
1639 	return (count);
1640 }
1641 
1642 /*
1643  * Print out a description of a vnode.
1644  */
1645 static char *typename[] =
1646    { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };
1647 
1648 void
1649 vprint(label, vp)
1650 	char *label;
1651 	register struct vnode *vp;
1652 {
1653 	char buf[64];
1654 
1655 	if (label != NULL)
1656 		printf("%s: ", label);
1657 	printf("tag %d type %s, usecount %ld, writecount %ld, refcount %ld,",
1658 	    vp->v_tag, typename[vp->v_type], vp->v_usecount, vp->v_writecount,
1659 	    vp->v_holdcnt);
1660 	buf[0] = '\0';
1661 	if (vp->v_flag & VROOT)
1662 		strcat(buf, "|VROOT");
1663 	if (vp->v_flag & VTEXT)
1664 		strcat(buf, "|VTEXT");
1665 	if (vp->v_flag & VSYSTEM)
1666 		strcat(buf, "|VSYSTEM");
1667 	if (vp->v_flag & VXLOCK)
1668 		strcat(buf, "|VXLOCK");
1669 	if (vp->v_flag & VXWANT)
1670 		strcat(buf, "|VXWANT");
1671 	if (vp->v_flag & VBWAIT)
1672 		strcat(buf, "|VBWAIT");
1673 	if (vp->v_flag & VALIASED)
1674 		strcat(buf, "|VALIASED");
1675 	if (buf[0] != '\0')
1676 		printf(" flags (%s)", &buf[1]);
1677 	if (vp->v_data == NULL) {
1678 		printf("\n");
1679 	} else {
1680 		printf("\n\t");
1681 		VOP_PRINT(vp);
1682 	}
1683 }
1684 
1685 #ifdef DEBUG
1686 /*
1687  * List all of the locked vnodes in the system.
1688  * Called when debugging the kernel.
1689  */
1690 void
1691 printlockedvnodes()
1692 {
1693 	struct mount *mp, *nmp;
1694 	struct vnode *vp;
1695 
1696 	printf("Locked vnodes\n");
1697 	simple_lock(&mountlist_slock);
1698 	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
1699 		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock)) {
1700 			nmp = mp->mnt_list.cqe_next;
1701 			continue;
1702 		}
1703 		for (vp = mp->mnt_vnodelist.lh_first;
1704 		     vp != NULL;
1705 		     vp = vp->v_mntvnodes.le_next) {
1706 			if (VOP_ISLOCKED(vp))
1707 				vprint((char *)0, vp);
1708 		}
1709 		simple_lock(&mountlist_slock);
1710 		nmp = mp->mnt_list.cqe_next;
1711 		vfs_unbusy(mp);
1712 	}
1713 	simple_unlock(&mountlist_slock);
1714 }
1715 #endif
1716 
1717 extern const char *mountcompatnames[];
1718 extern const int nmountcompatnames;
1719 
1720 /*
1721  * Top level filesystem related information gathering.
1722  */
1723 int
1724 vfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p)
1725 	int *name;
1726 	u_int namelen;
1727 	void *oldp;
1728 	size_t *oldlenp;
1729 	void *newp;
1730 	size_t newlen;
1731 	struct proc *p;
1732 {
1733 #if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44)
1734 	struct vfsconf vfc;
1735 #endif
1736 	struct vfsops *vfsp;
1737 
1738 	/* all sysctl names at this level are at least name and field */
1739 	if (namelen < 2)
1740 		return (ENOTDIR);		/* overloaded */
1741 
1742 	/* Not generic: goes to file system. */
1743 	if (name[0] != VFS_GENERIC) {
1744 		if (name[0] >= nmountcompatnames || name[0] < 0 ||
1745 		    mountcompatnames[name[0]] == NULL)
1746 			return (EOPNOTSUPP);
1747 		vfsp = vfs_getopsbyname(mountcompatnames[name[0]]);
1748 		if (vfsp == NULL || vfsp->vfs_sysctl == NULL)
1749 			return (EOPNOTSUPP);
1750 		return ((*vfsp->vfs_sysctl)(&name[1], namelen - 1,
1751 		    oldp, oldlenp, newp, newlen, p));
1752 	}
1753 
1754 	/* The rest are generic vfs sysctls. */
1755 	switch (name[1]) {
1756 #if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44)
1757 	case VFS_MAXTYPENUM:
1758 		/*
1759 		 * Provided for 4.4BSD-Lite2 compatibility.
1760 		 */
1761 		return (sysctl_rdint(oldp, oldlenp, newp, nmountcompatnames));
1762 	case VFS_CONF:
1763 		/*
1764 		 * Special: a node, next is a file system name.
1765 		 * Provided for 4.4BSD-Lite2 compatibility.
1766 		 */
1767 		if (namelen < 3)
1768 			return (ENOTDIR);	/* overloaded */
1769 		if (name[2] >= nmountcompatnames || name[2] < 0 ||
1770 		    mountcompatnames[name[2]] == NULL)
1771 			return (EOPNOTSUPP);
1772 		vfsp = vfs_getopsbyname(mountcompatnames[name[2]]);
1773 		if (vfsp == NULL)
1774 			return (EOPNOTSUPP);
1775 		vfc.vfc_vfsops = vfsp;
1776 		strncpy(vfc.vfc_name, vfsp->vfs_name, MFSNAMELEN);
1777 		vfc.vfc_typenum = name[2];
1778 		vfc.vfc_refcount = vfsp->vfs_refcount;
1779 		vfc.vfc_flags = 0;
1780 		vfc.vfc_mountroot = vfsp->vfs_mountroot;
1781 		vfc.vfc_next = NULL;
1782 		return (sysctl_rdstruct(oldp, oldlenp, newp, &vfc,
1783 		    sizeof(struct vfsconf)));
1784 #endif
1785 	default:
1786 		break;
1787 	}
1788 	return (EOPNOTSUPP);
1789 }
1790 
1791 int kinfo_vdebug = 1;
1792 int kinfo_vgetfailed;
1793 #define KINFO_VNODESLOP	10
1794 /*
1795  * Dump vnode list (via sysctl).
1796  * Copyout address of vnode followed by vnode.
1797  */
1798 /* ARGSUSED */
1799 int
1800 sysctl_vnode(where, sizep, p)
1801 	char *where;
1802 	size_t *sizep;
1803 	struct proc *p;
1804 {
1805 	struct mount *mp, *nmp;
1806 	struct vnode *nvp, *vp;
1807 	char *bp = where, *savebp;
1808 	char *ewhere;
1809 	int error;
1810 
1811 #define VPTRSZ	sizeof(struct vnode *)
1812 #define VNODESZ	sizeof(struct vnode)
1813 	if (where == NULL) {
1814 		*sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
1815 		return (0);
1816 	}
1817 	ewhere = where + *sizep;
1818 
1819 	simple_lock(&mountlist_slock);
1820 	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
1821 		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock)) {
1822 			nmp = mp->mnt_list.cqe_next;
1823 			continue;
1824 		}
1825 		savebp = bp;
1826 again:
1827 		simple_lock(&mntvnode_slock);
1828 		for (vp = mp->mnt_vnodelist.lh_first;
1829 		     vp != NULL;
1830 		     vp = nvp) {
1831 			/*
1832 			 * Check that the vp is still associated with
1833 			 * this filesystem.  RACE: could have been
1834 			 * recycled onto the same filesystem.
1835 			 */
1836 			if (vp->v_mount != mp) {
1837 				simple_unlock(&mntvnode_slock);
1838 				if (kinfo_vdebug)
1839 					printf("kinfo: vp changed\n");
1840 				bp = savebp;
1841 				goto again;
1842 			}
1843 			nvp = vp->v_mntvnodes.le_next;
1844 			if (bp + VPTRSZ + VNODESZ > ewhere) {
1845 				simple_unlock(&mntvnode_slock);
1846 				*sizep = bp - where;
1847 				return (ENOMEM);
1848 			}
1849 			simple_unlock(&mntvnode_slock);
1850 			if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) ||
1851 			   (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ)))
1852 				return (error);
1853 			bp += VPTRSZ + VNODESZ;
1854 			simple_lock(&mntvnode_slock);
1855 		}
1856 		simple_unlock(&mntvnode_slock);
1857 		simple_lock(&mountlist_slock);
1858 		nmp = mp->mnt_list.cqe_next;
1859 		vfs_unbusy(mp);
1860 	}
1861 	simple_unlock(&mountlist_slock);
1862 
1863 	*sizep = bp - where;
1864 	return (0);
1865 }
1866 
1867 /*
1868  * Check to see if a filesystem is mounted on a block device.
1869  */
1870 int
1871 vfs_mountedon(vp)
1872 	struct vnode *vp;
1873 {
1874 	struct vnode *vq;
1875 	int error = 0;
1876 
1877 	if (vp->v_specmountpoint != NULL)
1878 		return (EBUSY);
1879 	if (vp->v_flag & VALIASED) {
1880 		simple_lock(&spechash_slock);
1881 		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1882 			if (vq->v_rdev != vp->v_rdev ||
1883 			    vq->v_type != vp->v_type)
1884 				continue;
1885 			if (vq->v_specmountpoint != NULL) {
1886 				error = EBUSY;
1887 				break;
1888 			}
1889 		}
1890 		simple_unlock(&spechash_slock);
1891 	}
1892 	return (error);
1893 }
1894 
1895 /*
1896  * Build hash lists of net addresses and hang them off the mount point.
1897  * Called by ufs_mount() to set up the lists of export addresses.
1898  */
1899 static int
1900 vfs_hang_addrlist(mp, nep, argp)
1901 	struct mount *mp;
1902 	struct netexport *nep;
1903 	struct export_args *argp;
1904 {
1905 	register struct netcred *np, *enp;
1906 	register struct radix_node_head *rnh;
1907 	register int i;
1908 	struct radix_node *rn;
1909 	struct sockaddr *saddr, *smask = 0;
1910 	struct domain *dom;
1911 	int error;
1912 
1913 	if (argp->ex_addrlen == 0) {
1914 		if (mp->mnt_flag & MNT_DEFEXPORTED)
1915 			return (EPERM);
1916 		np = &nep->ne_defexported;
1917 		np->netc_exflags = argp->ex_flags;
1918 		np->netc_anon = argp->ex_anon;
1919 		np->netc_anon.cr_ref = 1;
1920 		mp->mnt_flag |= MNT_DEFEXPORTED;
1921 		return (0);
1922 	}
1923 	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
1924 	np = (struct netcred *)malloc(i, M_NETADDR, M_WAITOK);
1925 	memset((caddr_t)np, 0, i);
1926 	saddr = (struct sockaddr *)(np + 1);
1927 	error = copyin(argp->ex_addr, (caddr_t)saddr, argp->ex_addrlen);
1928 	if (error)
1929 		goto out;
1930 	if (saddr->sa_len > argp->ex_addrlen)
1931 		saddr->sa_len = argp->ex_addrlen;
1932 	if (argp->ex_masklen) {
1933 		smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen);
1934 		error = copyin(argp->ex_mask, (caddr_t)smask, argp->ex_masklen);
1935 		if (error)
1936 			goto out;
1937 		if (smask->sa_len > argp->ex_masklen)
1938 			smask->sa_len = argp->ex_masklen;
1939 	}
1940 	i = saddr->sa_family;
1941 	if ((rnh = nep->ne_rtable[i]) == 0) {
1942 		/*
1943 		 * It seems silly to initialize every address family
1944 		 * when most are never used; do it on demand here.
1945 		 */
1946 		for (dom = domains; dom; dom = dom->dom_next)
1947 			if (dom->dom_family == i && dom->dom_rtattach) {
1948 				dom->dom_rtattach((void **)&nep->ne_rtable[i],
1949 					dom->dom_rtoffset);
1950 				break;
1951 			}
1952 		if ((rnh = nep->ne_rtable[i]) == 0) {
1953 			error = ENOBUFS;
1954 			goto out;
1955 		}
1956 	}
1957 	rn = (*rnh->rnh_addaddr)((caddr_t)saddr, (caddr_t)smask, rnh,
1958 		np->netc_rnodes);
1959 	if (rn == 0 || np != (struct netcred *)rn) { /* already exists */
1960 		if (rn == 0) {
1961 			enp = (struct netcred *)(*rnh->rnh_lookup)(saddr,
1962 				smask, rnh);
1963 			if (enp == 0) {
1964 				error = EPERM;
1965 				goto out;
1966 			}
1967 		} else
1968 			enp = (struct netcred *)rn;
1969 
1970 		if (enp->netc_exflags != argp->ex_flags ||
1971 		    enp->netc_anon.cr_uid != argp->ex_anon.cr_uid ||
1972 		    enp->netc_anon.cr_gid != argp->ex_anon.cr_gid ||
1973 		    enp->netc_anon.cr_ngroups != argp->ex_anon.cr_ngroups ||
1974 		    memcmp(&enp->netc_anon.cr_groups, &argp->ex_anon.cr_groups,
1975 			enp->netc_anon.cr_ngroups * sizeof(gid_t)))
1976 				error = EPERM;
1977 		else
1978 			error = 0;
1979 		goto out;
1980 	}
1981 	np->netc_exflags = argp->ex_flags;
1982 	np->netc_anon = argp->ex_anon;
1983 	np->netc_anon.cr_ref = 1;
1984 	return (0);
1985 out:
1986 	free(np, M_NETADDR);
1987 	return (error);
1988 }
1989 
1990 /* ARGSUSED */
1991 static int
1992 vfs_free_netcred(rn, w)
1993 	struct radix_node *rn;
1994 	void *w;
1995 {
1996 	register struct radix_node_head *rnh = (struct radix_node_head *)w;
1997 
1998 	(*rnh->rnh_deladdr)(rn->rn_key, rn->rn_mask, rnh);
1999 	free((caddr_t)rn, M_NETADDR);
2000 	return (0);
2001 }
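
/*
 * Note that vfs_free_netcred() is only invoked through rnh_walktree
 * below, with the radix_node_head itself passed as the opaque cookie
 * so that the callback can delete the node it is visiting.
 */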
2002 
2003 /*
2004  * Free the net address hash lists that are hanging off the mount points.
2005  */
2006 static void
2007 vfs_free_addrlist(nep)
2008 	struct netexport *nep;
2009 {
2010 	register int i;
2011 	register struct radix_node_head *rnh;
2012 
2013 	for (i = 0; i <= AF_MAX; i++)
2014 		if ((rnh = nep->ne_rtable[i]) != NULL) {
2015 			(*rnh->rnh_walktree)(rnh, vfs_free_netcred, rnh);
2016 			free((caddr_t)rnh, M_RTABLE);
2017 			nep->ne_rtable[i] = 0;
2018 		}
2019 }
2020 
2021 int
2022 vfs_export(mp, nep, argp)
2023 	struct mount *mp;
2024 	struct netexport *nep;
2025 	struct export_args *argp;
2026 {
2027 	int error;
2028 
2029 	if (argp->ex_flags & MNT_DELEXPORT) {
2030 		if (mp->mnt_flag & MNT_EXPUBLIC) {
2031 			vfs_setpublicfs(NULL, NULL, NULL);
2032 			mp->mnt_flag &= ~MNT_EXPUBLIC;
2033 		}
2034 		vfs_free_addrlist(nep);
2035 		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
2036 	}
2037 	if (argp->ex_flags & MNT_EXPORTED) {
2038 		if (argp->ex_flags & MNT_EXPUBLIC) {
2039 			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
2040 				return (error);
2041 			mp->mnt_flag |= MNT_EXPUBLIC;
2042 		}
2043 		if ((error = vfs_hang_addrlist(mp, nep, argp)) != 0)
2044 			return (error);
2045 		mp->mnt_flag |= MNT_EXPORTED;
2046 	}
2047 	return (0);
2048 }
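
/*
 * A file system's mount entry point typically funnels its export_args
 * through vfs_export() when asked to update an existing mount.  A
 * hypothetical sketch of the pattern (the names args and ump are
 * illustrative, not taken from this file):
 *
 *	if (mp->mnt_flag & MNT_UPDATE) {
 *		if (args.fspec == NULL)
 *			return (vfs_export(mp, &ump->um_export,
 *			    &args.export));
 *	}
 *
 * MNT_DELEXPORT in ex_flags tears exports down; MNT_EXPORTED
 * (optionally with MNT_EXPUBLIC) sets them up.
 */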
2049 
2050 /*
2051  * Set the publicly exported filesystem (WebNFS).  The specification
2052  * (RFC 2054 and RFC 2055) allows only one public filesystem.
2053  */
2054 int
2055 vfs_setpublicfs(mp, nep, argp)
2056 	struct mount *mp;
2057 	struct netexport *nep;
2058 	struct export_args *argp;
2059 {
2060 	int error;
2061 	struct vnode *rvp;
2062 	char *cp;
2063 
2064 	/*
2065 	 * mp == NULL means invalidate the current info; the FS is
2066 	 * no longer exported.  This may be called from either vfs_export
2067 	 * or unmount, so check whether the work has already been done.
2068 	 */
2069 	if (mp == NULL) {
2070 		if (nfs_pub.np_valid) {
2071 			nfs_pub.np_valid = 0;
2072 			if (nfs_pub.np_index != NULL) {
2073 				FREE(nfs_pub.np_index, M_TEMP);
2074 				nfs_pub.np_index = NULL;
2075 			}
2076 		}
2077 		return (0);
2078 	}
2079 
2080 	/*
2081 	 * Only one allowed at a time.
2082 	 */
2083 	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
2084 		return (EBUSY);
2085 
2086 	/*
2087 	 * Get real filehandle for root of exported FS.
2088 	 */
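	/*
	 * The public filehandle is simply the pair (f_fsid of the
	 * mount, fid of the root vnode): fh_fsid is filled in here,
	 * and VFS_VPTOFH() below supplies fh_fid.
	 */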
2089 	memset((caddr_t)&nfs_pub.np_handle, 0, sizeof(nfs_pub.np_handle));
2090 	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
2091 
2092 	if ((error = VFS_ROOT(mp, &rvp)))
2093 		return (error);
2094 
2095 	error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid);
2096 	vput(rvp);
2097 	if (error)
2098 		return (error);
2099 
2100 	/*
2101 	 * If an index file was specified, pull it in.
2102 	 */
2103 	if (argp->ex_indexfile != NULL) {
2104 		MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
2105 		    M_WAITOK);
2106 		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
2107 		    MAXNAMLEN, (size_t *)0);
2108 		if (!error) {
2109 			/*
2110 			 * Check for illegal filenames.
2111 			 */
2112 			for (cp = nfs_pub.np_index; *cp; cp++) {
2113 				if (*cp == '/') {
2114 					error = EINVAL;
2115 					break;
2116 				}
2117 			}
2118 		}
2119 		if (error) {
2120 			FREE(nfs_pub.np_index, M_TEMP);
			nfs_pub.np_index = NULL;  /* don't leave a dangling pointer */
2121 			return (error);
2122 		}
2123 	}
2124 
2125 	nfs_pub.np_mount = mp;
2126 	nfs_pub.np_valid = 1;
2127 	return (0);
2128 }
2129 
2130 struct netcred *
2131 vfs_export_lookup(mp, nep, nam)
2132 	register struct mount *mp;
2133 	struct netexport *nep;
2134 	struct mbuf *nam;
2135 {
2136 	register struct netcred *np;
2137 	register struct radix_node_head *rnh;
2138 	struct sockaddr *saddr;
2139 
2140 	np = NULL;
2141 	if (mp->mnt_flag & MNT_EXPORTED) {
2142 		/*
2143 		 * Look up the address in the export list first.
2144 		 */
2145 		if (nam != NULL) {
2146 			saddr = mtod(nam, struct sockaddr *);
2147 			rnh = nep->ne_rtable[saddr->sa_family];
2148 			if (rnh != NULL) {
2149 				np = (struct netcred *)
2150 					(*rnh->rnh_matchaddr)((caddr_t)saddr,
2151 							      rnh);
2152 				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
2153 					np = NULL;
2154 			}
2155 		}
2156 		/*
2157 		 * If no address match, use the default if it exists.
2158 		 */
2159 		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
2160 			np = &nep->ne_defexported;
2161 	}
2162 	return (np);
2163 }
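
/*
 * vfs_export_lookup() is used by a file system's fhtovp code, on
 * behalf of the NFS server, to map a client address to its export
 * credentials: given the mbuf holding the peer's sockaddr, it returns
 * the matching netcred (whose netc_exflags and netc_anon then govern
 * the request), or NULL if the host may not access the export.
 */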
2164 
2165 /*
2166  * Do the usual access checking.
2167  * file_mode, uid and gid are from the vnode in question,
2168  * while acc_mode and cred are from the VOP_ACCESS parameter list.
2169  */
2170 int
2171 vaccess(type, file_mode, uid, gid, acc_mode, cred)
2172 	enum vtype type;
2173 	mode_t file_mode;
2174 	uid_t uid;
2175 	gid_t gid;
2176 	mode_t acc_mode;
2177 	struct ucred *cred;
2178 {
2179 	mode_t mask;
2180 
2181 	/*
2182 	 * Super-user always gets read/write access, but execute access depends
2183 	 * on at least one execute bit being set.
2184 	 */
2185 	if (cred->cr_uid == 0) {
2186 		if ((acc_mode & VEXEC) && type != VDIR &&
2187 		    (file_mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)
2188 			return (EACCES);
2189 		return (0);
2190 	}
2191 
2192 	mask = 0;
2193 
2194 	/* Otherwise, check the owner. */
2195 	if (cred->cr_uid == uid) {
2196 		if (acc_mode & VEXEC)
2197 			mask |= S_IXUSR;
2198 		if (acc_mode & VREAD)
2199 			mask |= S_IRUSR;
2200 		if (acc_mode & VWRITE)
2201 			mask |= S_IWUSR;
2202 		return ((file_mode & mask) == mask ? 0 : EACCES);
2203 	}
2204 
2205 	/* Otherwise, check the groups. */
2206 	if (cred->cr_gid == gid || groupmember(gid, cred)) {
2207 		if (acc_mode & VEXEC)
2208 			mask |= S_IXGRP;
2209 		if (acc_mode & VREAD)
2210 			mask |= S_IRGRP;
2211 		if (acc_mode & VWRITE)
2212 			mask |= S_IWGRP;
2213 		return ((file_mode & mask) == mask ? 0 : EACCES);
2214 	}
2215 
2216 	/* Otherwise, check everyone else. */
2217 	if (acc_mode & VEXEC)
2218 		mask |= S_IXOTH;
2219 	if (acc_mode & VREAD)
2220 		mask |= S_IROTH;
2221 	if (acc_mode & VWRITE)
2222 		mask |= S_IWOTH;
2223 	return ((file_mode & mask) == mask ? 0 : EACCES);
2224 }
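
/*
 * A worked example of the mask computation above: with file_mode 0754
 * (rwxr-xr--), a request for VREAD|VWRITE by an unprivileged cred that
 * matches only the group class builds mask = S_IRGRP|S_IWGRP = 0060;
 * file_mode & 0060 is 0040, the group write bit is missing, and
 * vaccess() returns EACCES.
 */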
2225 
2226 /*
2227  * Unmount all file systems.
2228  * We traverse the list in reverse order: later mounts may sit on top of
2229  * earlier ones, so unmounting newest-first avoids dependency problems.
2230  */
2231 void
2232 vfs_unmountall()
2233 {
2234 	register struct mount *mp, *nmp;
2235 	int allerror, error;
2236 	struct proc *p = curproc;	/* XXX */
2237 
2238 	/*
2239 	 * Unmounting a file system may block the requesting process.
2240 	 * However, this routine can be called when curproc is NULL
2241 	 * (e.g. in a panic situation, or from the debugger).  Any
2242 	 * attempt to sleep without a process context would fault,
2243 	 * so just abort in that case.
2244 	 */
2245 	if (p == NULL) {
2246 		printf("vfs_unmountall: no context, aborting\n");
2247 		return;
2248 	}
2249 
2250 	for (allerror = 0,
2251 	     mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
2252 		nmp = mp->mnt_list.cqe_prev;
2253 #ifdef DEBUG
2254 		printf("unmounting %s (%s)...\n",
2255 		    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname);
2256 #endif
2257 		if (vfs_busy(mp, 0, 0))
2258 			continue;
2259 		if ((error = dounmount(mp, MNT_FORCE, p)) != 0) {
2260 			printf("unmount of %s failed with error %d\n",
2261 			    mp->mnt_stat.f_mntonname, error);
2262 			allerror = 1;
2263 		}
2264 	}
2265 	if (allerror)
2266 		printf("WARNING: some file systems would not unmount\n");
2267 }
2268 
2269 /*
2270  * Sync and unmount file systems before shutting down.
2271  */
2272 void
2273 vfs_shutdown()
2274 {
2275 	register struct buf *bp;
2276 	int iter, nbusy, dcount, s;
2277 
2278 	printf("syncing disks... ");
2279 
2280 	/* XXX Should suspend scheduling. */
2281 	(void) spl0();
2282 
2283 	sys_sync(&proc0, (void *)0, (register_t *)0);
2284 
2285 	/* Wait for sync to finish. */
2286 	dcount = 10000;
2287 	for (iter = 0; iter < 20; iter++) {
2288 		nbusy = 0;
2289 		for (bp = &buf[nbuf]; --bp >= buf; ) {
2290 			if ((bp->b_flags & (B_BUSY|B_INVAL)) == B_BUSY)
2291 				nbusy++;
2292 			/*
2293 			 * With soft updates, a buffer that has been
2294 			 * written may be re-dirtied until the buffers
2295 			 * it depends on have also been written.
2296 			 */
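			/*
			 * The bremfree()/bawrite() below pushes such
			 * a buffer out again to drive the dependency
			 * chain forward; dcount bounds the total
			 * number of these writes so that a huge (or
			 * cyclic) dependency list cannot stall the
			 * shutdown forever.
			 */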
2297 			if (bp->b_vp && bp->b_vp->v_mount
2298 			    && (bp->b_vp->v_mount->mnt_flag & MNT_SOFTDEP)
2299 			    && (bp->b_flags & B_DELWRI)) {
2300 				s = splbio();
2301 				bremfree(bp);
2302 				bp->b_flags |= B_BUSY;
2303 				splx(s);
2304 				nbusy++;
2305 				bawrite(bp);
2306 				if (dcount-- <= 0) {
2307 					printf("softdep ");
2308 					goto fail;
2309 				}
2310 			}
2311 		}
2312 		if (nbusy == 0)
2313 			break;
2314 		printf("%d ", nbusy);
2315 		DELAY(40000 * iter);
2316 	}
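	/*
	 * With 20 passes and DELAY(40000 * iter), the loop above
	 * busy-waits at most 40000 * (0 + 1 + ... + 19) = 7,600,000 us,
	 * i.e. about 7.6 seconds, before giving up.
	 */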
2317 	if (nbusy) {
2318 fail:
2319 #ifdef DEBUG
2320 		printf("giving up\nPrinting vnodes for busy buffers\n");
2321 		for (bp = &buf[nbuf]; --bp >= buf; )
2322 			if ((bp->b_flags & (B_BUSY|B_INVAL)) == B_BUSY)
2323 				vprint(NULL, bp->b_vp);
2324 #else
2325 		printf("giving up\n");
2326 #endif
2327 		return;
2328 	} else
2329 		printf("done\n");
2330 
2331 	/*
2332 	 * If we've panic'd, don't make the situation potentially
2333 	 * worse by unmounting the file systems.
2334 	 */
2335 	if (panicstr != NULL)
2336 		return;
2337 
2338 	/* Release inodes held by texts before update. */
2339 #ifdef notdef
2340 	vnshutdown();
2341 #endif
2342 	/* Unmount file systems. */
2343 	vfs_unmountall();
2344 }
2345 
2346 /*
2347  * Mount the root file system.  If the operator didn't specify a
2348  * file system to use, try all possible file systems until one
2349  * succeeds.
2350  */
2351 int
2352 vfs_mountroot()
2353 {
2354 	extern int (*mountroot) __P((void));
2355 	struct vfsops *v;
2356 
2357 	if (root_device == NULL)
2358 		panic("vfs_mountroot: root device unknown");
2359 
2360 	switch (root_device->dv_class) {
2361 	case DV_IFNET:
2362 		if (rootdev != NODEV)
2363 			panic("vfs_mountroot: rootdev set for DV_IFNET");
2364 		break;
2365 
2366 	case DV_DISK:
2367 		if (rootdev == NODEV)
2368 			panic("vfs_mountroot: rootdev not set for DV_DISK");
2369 		break;
2370 
2371 	default:
2372 		printf("%s: inappropriate for root file system\n",
2373 		    root_device->dv_xname);
2374 		return (ENODEV);
2375 	}
2376 
2377 	/*
2378 	 * If user specified a file system, use it.
2379 	 */
2380 	if (mountroot != NULL)
2381 		return ((*mountroot)());
2382 
2383 	/*
2384 	 * Try each file system currently configured into the kernel.
2385 	 */
2386 	for (v = LIST_FIRST(&vfs_list); v != NULL; v = LIST_NEXT(v, vfs_list)) {
2387 		if (v->vfs_mountroot == NULL)
2388 			continue;
2389 #ifdef DEBUG
2390 		printf("mountroot: trying %s...\n", v->vfs_name);
2391 #endif
2392 		if ((*v->vfs_mountroot)() == 0) {
2393 			printf("root file system type: %s\n", v->vfs_name);
2394 			break;
2395 		}
2396 	}
2397 
2398 	if (v == NULL) {
2399 		printf("no file system for %s", root_device->dv_xname);
2400 		if (root_device->dv_class == DV_DISK)
2401 			printf(" (dev 0x%x)", rootdev);
2402 		printf("\n");
2403 		return (EFTYPE);
2404 	}
2405 	return (0);
2406 }
2407 
2408 /*
2409  * Given a file system name, look up the vfsops for that
2410  * file system, or return NULL if the file system isn't present
2411  * in the kernel.
2412  */
2413 struct vfsops *
2414 vfs_getopsbyname(name)
2415 	const char *name;
2416 {
2417 	struct vfsops *v;
2418 
2419 	for (v = LIST_FIRST(&vfs_list); v != NULL; v = LIST_NEXT(v, vfs_list)) {
2420 		if (strcmp(v->vfs_name, name) == 0)
2421 			break;
2422 	}
2423 
2424 	return (v);
2425 }
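
/*
 * Typical use (illustrative sketch): the mount(2) path resolves the
 * user-supplied file system type string up front, e.g.
 *
 *	if ((vfsp = vfs_getopsbyname(fstypename)) == NULL)
 *		return (ENODEV);
 *
 * The returned pointer remains valid for as long as the file system
 * stays attached (see vfs_attach()/vfs_detach() below).
 */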
2426 
2427 /*
2428  * Establish a file system and initialize it.
2429  */
2430 int
2431 vfs_attach(vfs)
2432 	struct vfsops *vfs;
2433 {
2434 	struct vfsops *v;
2435 	int error = 0;
2436 
2437 
2438 	/*
2439 	 * Make sure this file system doesn't already exist.
2440 	 * Make sure a file system with this name isn't already attached.
2441 	for (v = LIST_FIRST(&vfs_list); v != NULL; v = LIST_NEXT(v, vfs_list)) {
2442 		if (strcmp(vfs->vfs_name, v->vfs_name) == 0) {
2443 			error = EEXIST;
2444 			goto out;
2445 		}
2446 	}
2447 
2448 	/*
2449 	 * Initialize the vnode operations for this file system.
2450 	 */
2451 	vfs_opv_init(vfs->vfs_opv_descs);
2452 
2453 	/*
2454 	 * Now initialize the file system itself.
2455 	 */
2456 	(*vfs->vfs_init)();
2457 
2458 	/*
2459 	 * ...and link it into the kernel's list.
2460 	 */
2461 	LIST_INSERT_HEAD(&vfs_list, vfs, vfs_list);
2462 
2463 	/*
2464 	 * Sanity: make sure the reference count is 0.
2465 	 */
2466 	vfs->vfs_refcount = 0;
2467 
2468  out:
2469 	return (error);
2470 }
2471 
2472 /*
2473  * Remove a file system from the kernel.
2474  */
2475 int
2476 vfs_detach(vfs)
2477 	struct vfsops *vfs;
2478 {
2479 	struct vfsops *v;
2480 
2481 	/*
2482 	 * Make sure no one is using the filesystem.
2483 	 */
2484 	if (vfs->vfs_refcount != 0)
2485 		return (EBUSY);
2486 
2487 	/*
2488 	 * ...and remove it from the kernel's list.
2489 	 */
2490 	for (v = LIST_FIRST(&vfs_list); v != NULL; v = LIST_NEXT(v, vfs_list)) {
2491 		if (v == vfs) {
2492 			LIST_REMOVE(v, vfs_list);
2493 			break;
2494 		}
2495 	}
2496 
2497 	if (v == NULL)
2498 		return (ESRCH);
2499 
2500 	/*
2501 	 * Free the vnode operations vector.
2502 	 */
2503 	vfs_opv_free(vfs->vfs_opv_descs);
2504 	return (0);
2505 }
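
/*
 * A loadable file system would register itself with something like
 * the following (hypothetical sketch; myfs_vfsops is not part of
 * this file):
 *
 *	extern struct vfsops myfs_vfsops;
 *
 *	error = vfs_attach(&myfs_vfsops);
 *
 * and call vfs_detach(&myfs_vfsops) at unload time, which fails with
 * EBUSY while the file system is still in use (vfs_refcount != 0) and
 * ESRCH if it was never attached.
 */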
2506