xref: /netbsd-src/sys/kern/vfs_subr.c (revision ae1bfcddc410612bc8c58b807e1830becb69a24c)
1 /*
2  * Copyright (c) 1989 The Regents of the University of California.
3  * All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	from: @(#)vfs_subr.c	7.60 (Berkeley) 6/21/91
39  *	$Id: vfs_subr.c,v 1.29 1994/05/17 04:22:04 cgd Exp $
40  */
41 
42 /*
43  * External virtual filesystem routines
44  */
45 
46 #include <sys/param.h>
47 #include <sys/proc.h>
48 #include <sys/mount.h>
49 #include <sys/time.h>
50 #include <sys/vnode.h>
51 #include <miscfs/specfs/specdev.h> /* XXX */
52 #include <sys/namei.h>
53 #include <sys/ucred.h>
54 #include <sys/buf.h>
55 #include <sys/errno.h>
56 #include <sys/malloc.h>
57 #include <sys/systm.h>
58 #include <vm/vm.h>
59 #include <sys/sysctl.h>
60 
61 /*
62  * Flag to allow forcible unmounting.
63  */
64 int doforce = 1;
65 
66 int prtactive;	/* 1 => print out reclaim of active vnodes */
67 
68 void vprint __P((char *label, struct vnode *vp));
69 
/*
 * Insert/remove helpers for the per-vnode buffer lists.
 *
 * bufinsvn(bp, dp) puts buffer bp at the head of list dp.
 * bufremvn(bp) unlinks bp from whichever list it is on and marks its
 * link as NOLIST so later code can tell it is on no list.
 *
 * bufremvn is wrapped in do { } while (0) so that it behaves as a
 * single statement, e.g. inside an if/else without braces.
 */
#define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
#define	bufremvn(bp) do {						\
	LIST_REMOVE(bp, b_vnbufs);					\
	(bp)->b_vnbufs.le_next = NOLIST;				\
} while (0)
78 TAILQ_HEAD(freelst, vnode) vnode_free_list;	/* vnode free list */
79 struct mntlist mountlist;			/* mounted filesystem list */
80 
81 /*
82  * Remove a mount point from the list of mounted filesystems.
83  * Unmount of the root is illegal.
84  */
85 void
86 vfs_remove(mp)
87 	register struct mount *mp;
88 {
89 
90 	if (mp == rootfs)
91 		panic("vfs_remove: unmounting root");
92 	TAILQ_REMOVE(&mountlist, mp, mnt_list);
93 	mp->mnt_vnodecovered->v_mountedhere = (struct mount *)0;
94 	vfs_unlock(mp);
95 }
96 
97 /*
98  * Lock a filesystem.
99  * Used to prevent access to it while mounting and unmounting.
100  */
101 vfs_lock(mp)
102 	register struct mount *mp;
103 {
104 
105 	while(mp->mnt_flag & MNT_MLOCK) {
106 		mp->mnt_flag |= MNT_MWAIT;
107 		tsleep((caddr_t)mp, PVFS, "vfslock", 0);
108 	}
109 	mp->mnt_flag |= MNT_MLOCK;
110 	return (0);
111 }
112 
113 /*
114  * Unlock a locked filesystem.
115  * Panic if filesystem is not locked.
116  */
117 void
118 vfs_unlock(mp)
119 	register struct mount *mp;
120 {
121 
122 	if ((mp->mnt_flag & MNT_MLOCK) == 0)
123 		panic("vfs_unlock: not locked");
124 	mp->mnt_flag &= ~MNT_MLOCK;
125 	if (mp->mnt_flag & MNT_MWAIT) {
126 		mp->mnt_flag &= ~MNT_MWAIT;
127 		wakeup((caddr_t)mp);
128 	}
129 }
130 
131 /*
132  * Mark a mount point as busy.
133  * Used to synchronize access and to delay unmounting.
134  */
135 vfs_busy(mp)
136 	register struct mount *mp;
137 {
138 
139 	while(mp->mnt_flag & MNT_MPBUSY) {
140 		mp->mnt_flag |= MNT_MPWANT;
141 		tsleep((caddr_t)&mp->mnt_flag, PVFS, "vfsbusy", 0);
142 	}
143 	if (mp->mnt_flag & MNT_UNMOUNT)
144 		return (1);
145 	mp->mnt_flag |= MNT_MPBUSY;
146 	return (0);
147 }
148 
149 /*
150  * Free a busy filesystem.
151  * Panic if filesystem is not busy.
152  */
153 vfs_unbusy(mp)
154 	register struct mount *mp;
155 {
156 
157 	if ((mp->mnt_flag & MNT_MPBUSY) == 0)
158 		panic("vfs_unbusy: not busy");
159 	mp->mnt_flag &= ~MNT_MPBUSY;
160 	if (mp->mnt_flag & MNT_MPWANT) {
161 		mp->mnt_flag &= ~MNT_MPWANT;
162 		wakeup((caddr_t)&mp->mnt_flag);
163 	}
164 }
165 
166 /*
167  * Lookup a mount point by filesystem identifier.
168  */
169 struct mount *
170 getvfs(fsid)
171 	fsid_t *fsid;
172 {
173 	register struct mount *mp;
174 
175 	for (mp = mountlist.tqh_first; mp != NULL; mp = mp->mnt_list.tqe_next)
176 		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
177 		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
178 			return (mp);
179 	return ((struct mount *)0);
180 }
181 
182 /*
183  * Check to see if a filesystem is mounted on a block device.
184  */
185 mountedon(vp)
186 	register struct vnode *vp;
187 {
188 	register struct vnode *vq;
189 
190 	if (vp->v_specflags & SI_MOUNTEDON)
191 		return (EBUSY);
192 	if (vp->v_flag & VALIASED) {
193 		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
194 			if (vq->v_rdev != vp->v_rdev ||
195 			    vq->v_type != vp->v_type)
196 				continue;
197 			if (vq->v_specflags & SI_MOUNTEDON)
198 				return (EBUSY);
199 		}
200 	}
201 	return (0);
202 }
203 
204 /*
205  * Set vnode attributes to VNOVAL
206  */
207 void
208 vattr_null(vap)
209 	register struct vattr *vap;
210 {
211 
212 	vap->va_type = VNON;
213 	vap->va_mode = vap->va_nlink = vap->va_uid = vap->va_gid =
214 		vap->va_fsid = vap->va_fileid = vap->va_size =
215 		vap->va_blocksize = vap->va_rdev = vap->va_bytes =
216 		vap->va_atime.ts_sec = vap->va_atime.ts_nsec =
217 		vap->va_mtime.ts_sec = vap->va_mtime.ts_nsec =
218 		vap->va_ctime.ts_sec = vap->va_ctime.ts_nsec =
219 		vap->va_flags = vap->va_gen = VNOVAL;
220 	vap->va_vaflags = 0;
221 }
222 
223 /*
224  * Routines having to do with the management of the vnode table.
225  */
226 extern struct vnodeops dead_vnodeops, spec_vnodeops;
227 extern void vclean();
228 long numvnodes;
229 struct vattr va_null;
230 
231 /*
232  * Initialize the vnode structures and initialize each file system type.
233  */
234 void
235 vfsinit()
236 {
237 	int i;
238 
239 	/* initialize the vnode management data structures */
240 	TAILQ_INIT(&vnode_free_list);
241 	TAILQ_INIT(&mountlist);
242 	/*
243 	 * Initialize the vnode name cache
244 	 */
245 	nchinit();
246 	/*
247 	 * Initialize each file system type.
248 	 */
249 	vattr_null(&va_null);
250 	for (i = 0; i < nvfssw; i++) {
251 		if (vfssw[i] == NULL)
252 			continue;
253 		(*(vfssw[i]->vfs_init))();
254 	}
255 }
256 
257 /*
258  * Get a new unique fsid
259  */
260 void
261 getnewfsid(mp, mtype)
262 	struct mount *mp;
263 	int mtype;
264 {
265 	static u_short xxxfs_mntid;
266 
267 	fsid_t tfsid;
268 
269 	mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + 11, 0);	/* XXX */
270 	mp->mnt_stat.f_fsid.val[1] = mtype;
271 	if (xxxfs_mntid == 0)
272 		++xxxfs_mntid;
273 	tfsid.val[0] = makedev(nblkdev+mtype, xxxfs_mntid);
274 	tfsid.val[1] = mtype;
275 	if (rootfs) {
276 		while (getvfs(&tfsid)) {
277 			tfsid.val[0]++;
278 			xxxfs_mntid++;
279 		}
280 	}
281 	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
282 }
283 
/*
 * Make a 'unique' number from a mount type name.
 *
 * For each character the running value is shifted left by two bits
 * and XORed with the character.  An empty name yields 0.
 *
 * The value is accumulated in an unsigned long so that long names,
 * whose bits shift past the top of the word, wrap instead of
 * overflowing a signed shift.  Each character is widened through
 * long first, so the bits XORed in are the same as with a plain
 * long accumulator.
 */
long
makefstype(type)
	char *type;
{
	unsigned long hash = 0;

	for (; *type != '\0'; type++) {
		hash <<= 2;
		hash ^= (unsigned long)(long)*type;
	}
	return ((long)hash);
}
299 /*
300  * Return the next vnode from the free list.
301  */
302 getnewvnode(tag, mp, vops, vpp)
303 	enum vtagtype tag;
304 	struct mount *mp;
305 	struct vnodeops *vops;
306 	struct vnode **vpp;
307 {
308 	register struct vnode *vp, *vq;
309 
310 	if ((vnode_free_list.tqh_first == NULL &&
311 	     numvnodes < 2 * desiredvnodes) ||
312 	    numvnodes < desiredvnodes) {
313 		vp = (struct vnode *)malloc((u_long)sizeof *vp,
314 		    M_VNODE, M_WAITOK);
315 		bzero((char *)vp, sizeof *vp);
316 		numvnodes++;
317 	} else {
318 		if ((vp = vnode_free_list.tqh_first) == NULL) {
319 			tablefull("vnode");
320 			*vpp = 0;
321 			return (ENFILE);
322 		}
323 		if (vp->v_usecount)
324 			panic("free vnode isn't");
325 
326 		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
327 		vp->v_freelist.tqe_prev = (struct vnode **)0xdeadb;
328 		if (vp->v_type != VBAD)
329 			vgone(vp);
330 		vp->v_flag = 0;
331 		vp->v_lastr = 0;
332 		vp->v_socket = 0;
333 	}
334 	vp->v_type = VNON;
335 	cache_purge(vp);
336 	vp->v_tag = tag;
337 	vp->v_op = vops;
338 	insmntque(vp, mp);
339 	vp->v_usecount = 1;
340 	*vpp = vp;
341 	return (0);
342 }
343 
344 /*
345  * Move a vnode from one mount queue to another.
346  */
347 insmntque(vp, mp)
348 	register struct vnode *vp;
349 	register struct mount *mp;
350 {
351 	register struct vnode *vq;
352 
353 	/*
354 	 * Delete from old mount point vnode list, if on one.
355 	 */
356 	if (vp->v_mount != NULL)
357 		LIST_REMOVE(vp, v_mntvnodes);
358 	/*
359 	 * Insert into list of vnodes for the new mount point, if available.
360 	 */
361 	if ((vp->v_mount = mp) == NULL)
362 		return;
363 	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
364 }
365 
366 /*
367  * Make sure all write-behind blocks associated
368  * with mount point are flushed out (from sync).
369  */
370 mntflushbuf(mountp, flags)
371 	struct mount *mountp;
372 	int flags;
373 {
374 	register struct vnode *vp;
375 
376 	if ((mountp->mnt_flag & MNT_MPBUSY) == 0)
377 		panic("mntflushbuf: not busy");
378 loop:
379 	for (vp = mountp->mnt_vnodelist.lh_first; vp;
380 	    vp = vp->v_mntvnodes.le_next) {
381 		if (VOP_ISLOCKED(vp))
382 			continue;
383 		if (vget(vp, 1))
384 			goto loop;
385 		vflushbuf(vp, flags);
386 		vput(vp);
387 		if (vp->v_mount != mountp)
388 			goto loop;
389 	}
390 }
391 
392 /*
393  * Flush all dirty buffers associated with a vnode.
394  */
395 vflushbuf(vp, flags)
396 	register struct vnode *vp;
397 	int flags;
398 {
399 	register struct buf *bp;
400 	struct buf *nbp;
401 	int s;
402 
403 loop:
404 	s = splbio();
405 	for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) {
406 		nbp = bp->b_vnbufs.le_next;
407 		if ((bp->b_flags & B_BUSY))
408 			continue;
409 		if ((bp->b_flags & B_DELWRI) == 0)
410 			panic("vflushbuf: not dirty");
411 		bremfree(bp);
412 		bp->b_flags |= B_BUSY;
413 		splx(s);
414 		/*
415 		 * Wait for I/O associated with indirect blocks to complete,
416 		 * since there is no way to quickly wait for them below.
417 		 * NB: This is really specific to ufs, but is done here
418 		 * as it is easier and quicker.
419 		 */
420 		if (bp->b_vp == vp || (flags & B_SYNC) == 0)
421 			(void) bawrite(bp);
422 		else
423 			(void) bwrite(bp);
424 		goto loop;
425 	}
426 	splx(s);
427 	if ((flags & B_SYNC) == 0)
428 		return;
429 	s = splbio();
430 	while (vp->v_numoutput) {
431 		vp->v_flag |= VBWAIT;
432 		tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "vflshbuf", 0);
433 	}
434 	splx(s);
435 	if (vp->v_dirtyblkhd.lh_first != NULL) {
436 		vprint("vflushbuf: dirty", vp);
437 		goto loop;
438 	}
439 }
440 
441 /*
442  * Update outstanding I/O count and do wakeup if requested.
443  */
444 vwakeup(bp)
445 	register struct buf *bp;
446 {
447 	register struct vnode *vp;
448 
449 	bp->b_dirtyoff = bp->b_dirtyend = 0;
450 	if (vp = bp->b_vp) {
451 		vp->v_numoutput--;
452 		if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) {
453 			if (vp->v_numoutput < 0)
454 				panic("vwakeup: neg numoutput");
455 			vp->v_flag &= ~VBWAIT;
456 			wakeup((caddr_t)&vp->v_numoutput);
457 		}
458 	}
459 }
460 
461 /*
462  * Invalidate in core blocks belonging to closed or umounted filesystem
463  *
464  * Go through the list of vnodes associated with the file system;
465  * for each vnode invalidate any buffers that it holds. Normally
466  * this routine is preceeded by a bflush call, so that on a quiescent
467  * filesystem there will be no dirty buffers when we are done. Binval
468  * returns the count of dirty buffers when it is finished.
469  */
470 mntinvalbuf(mountp)
471 	struct mount *mountp;
472 {
473 	register struct vnode *vp;
474 	int dirty = 0;
475 
476 	if ((mountp->mnt_flag & MNT_MPBUSY) == 0)
477 		panic("mntinvalbuf: not busy");
478 loop:
479 	for (vp = mountp->mnt_vnodelist.lh_first; vp;
480 	    vp = vp->v_mntvnodes.le_next) {
481 		if (vget(vp, 1))
482 			goto loop;
483 		dirty += vinvalbuf(vp, 1);
484 		vput(vp);
485 		if (vp->v_mount != mountp)
486 			goto loop;
487 	}
488 	return (dirty);
489 }
490 
491 /*
492  * Flush out and invalidate all buffers associated with a vnode.
493  * Called with the underlying object locked.
494  */
495 vinvalbuf(vp, save)
496 	register struct vnode *vp;
497 	int save;
498 {
499 	register struct buf *bp;
500 	struct buf *nbp, *blist;
501 	int s, dirty = 0;
502 
503 	for (;;) {
504 		if (blist = vp->v_dirtyblkhd.lh_first)
505 			/* void */;
506 		else if (blist = vp->v_cleanblkhd.lh_first)
507 			/* void */;
508 		else
509 			break;
510 		for (bp = blist; bp; bp = nbp) {
511 			nbp = bp->b_vnbufs.le_next;
512 			s = splbio();
513 			if (bp->b_flags & B_BUSY) {
514 				bp->b_flags |= B_WANTED;
515 				tsleep((caddr_t)bp, PRIBIO + 1, "vinvalbuf", 0);
516 				splx(s);
517 				break;
518 			}
519 			bremfree(bp);
520 			bp->b_flags |= B_BUSY;
521 			splx(s);
522 			if (save && (bp->b_flags & B_DELWRI)) {
523 				dirty++;
524 				(void) bwrite(bp);
525 				break;
526 			}
527 			if (bp->b_vp != vp)
528 				reassignbuf(bp, bp->b_vp);
529 			else
530 				bp->b_flags |= B_INVAL;
531 			brelse(bp);
532 		}
533 	}
534 	if (vp->v_dirtyblkhd.lh_first != NULL ||
535 	    vp->v_cleanblkhd.lh_first != NULL)
536 		panic("vinvalbuf: flush failed");
537 	return (dirty);
538 }
539 
540 /*
541  * Associate a buffer with a vnode.
542  */
543 bgetvp(vp, bp)
544 	register struct vnode *vp;
545 	register struct buf *bp;
546 {
547 	register struct vnode *vq;
548 	register struct buf *bq;
549 
550 	if (bp->b_vp)
551 		panic("bgetvp: not free");
552 	VHOLD(vp);
553 	bp->b_vp = vp;
554 	if (vp->v_type == VBLK || vp->v_type == VCHR)
555 		bp->b_dev = vp->v_rdev;
556 	else
557 		bp->b_dev = NODEV;
558 	/*
559 	 * Insert onto list for new vnode.
560 	 */
561 	bufinsvn(bp, &vp->v_cleanblkhd);
562 }
563 
564 /*
565  * Disassociate a buffer from a vnode.
566  */
567 brelvp(bp)
568 	register struct buf *bp;
569 {
570 	struct buf *bq;
571 	struct vnode *vp;
572 
573 	if (bp->b_vp == (struct vnode *) 0)
574 		panic("brelvp: NULL");
575 	/*
576 	 * Delete from old vnode list, if on one.
577 	 */
578 	if (bp->b_vnbufs.le_next != NOLIST)
579 		bufremvn(bp);
580 	vp = bp->b_vp;
581 	bp->b_vp = (struct vnode *) 0;
582 	HOLDRELE(vp);
583 }
584 
585 /*
586  * Reassign a buffer from one vnode to another.
587  * Used to assign file specific control information
588  * (indirect blocks) to the vnode to which they belong.
589  */
590 reassignbuf(bp, newvp)
591 	register struct buf *bp;
592 	register struct vnode *newvp;
593 {
594 	struct buf *bq;
595 	struct buflists *listheadp;
596 
597 	if (newvp == NULL)
598 		panic("reassignbuf: NULL");
599 	/*
600 	 * Delete from old vnode list, if on one.
601 	 */
602 	if (bp->b_vnbufs.le_next != NOLIST)
603 		bufremvn(bp);
604 	/*
605 	 * If dirty, put on list of dirty buffers;
606 	 * otherwise insert onto list of clean buffers.
607 	 */
608 	if (bp->b_flags & B_DELWRI)
609 		listheadp = &newvp->v_dirtyblkhd;
610 	else
611 		listheadp = &newvp->v_cleanblkhd;
612 	bufinsvn(bp, listheadp);
613 }
614 
615 /*
616  * Create a vnode for a block device.
617  * Used for root filesystem, argdev, and swap areas.
618  * Also used for memory file system special devices.
619  */
620 bdevvp(dev, vpp)
621 	dev_t dev;
622 	struct vnode **vpp;
623 {
624 	return(getdevvp(dev, vpp, VBLK));
625 }
626 
627 /*
628  * Create a vnode for a character device.
629  * Used for kernfs and some console handling.
630  */
631 cdevvp(dev, vpp)
632 	dev_t dev;
633 	struct vnode **vpp;
634 {
635 	return(getdevvp(dev, vpp, VCHR));
636 }
637 
638 /*
639  * Create a vnode for a device.
640  * Used by bdevvp (block device) for root file system etc.,
641  * and by cdevvp (character device) for console and kernfs.
642  */
643 getdevvp(dev, vpp, type)
644 	dev_t dev;
645 	struct vnode **vpp;
646 	enum vtype type;
647 {
648 	register struct vnode *vp;
649 	struct vnode *nvp;
650 	int error;
651 
652 	if (dev == NODEV)
653 		return (0);
654 	error = getnewvnode(VT_NON, (struct mount *)0, &spec_vnodeops, &nvp);
655 	if (error) {
656 		*vpp = NULLVP;
657 		return (error);
658 	}
659 	vp = nvp;
660 	vp->v_type = type;
661 	if (nvp = checkalias(vp, dev, (struct mount *)0)) {
662 		vput(vp);
663 		vp = nvp;
664 	}
665 	*vpp = vp;
666 	return (0);
667 }
668 
669 /*
670  * Check to see if the new vnode represents a special device
671  * for which we already have a vnode (either because of
672  * bdevvp() or because of a different vnode representing
673  * the same block device). If such an alias exists, deallocate
674  * the existing contents and return the aliased vnode. The
675  * caller is responsible for filling it with its new contents.
676  */
677 struct vnode *
678 checkalias(nvp, nvp_rdev, mp)
679 	register struct vnode *nvp;
680 	dev_t nvp_rdev;
681 	struct mount *mp;
682 {
683 	register struct vnode *vp;
684 	struct vnode **vpp;
685 
686 	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
687 		return (NULLVP);
688 
689 	vpp = &speclisth[SPECHASH(nvp_rdev)];
690 loop:
691 	for (vp = *vpp; vp; vp = vp->v_specnext) {
692 		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
693 			continue;
694 		/*
695 		 * Alias, but not in use, so flush it out.
696 		 */
697 		if (vp->v_usecount == 0) {
698 			vgone(vp);
699 			goto loop;
700 		}
701 		if (vget(vp, 1))
702 			goto loop;
703 		break;
704 	}
705 	if (vp == NULL || vp->v_tag != VT_NON || vp->v_type != VBLK) {
706 		MALLOC(nvp->v_specinfo, struct specinfo *,
707 			sizeof(struct specinfo), M_VNODE, M_WAITOK);
708 		nvp->v_rdev = nvp_rdev;
709 		nvp->v_hashchain = vpp;
710 		nvp->v_specnext = *vpp;
711 		nvp->v_specflags = 0;
712 		*vpp = nvp;
713 		if (vp != NULL) {
714 			nvp->v_flag |= VALIASED;
715 			vp->v_flag |= VALIASED;
716 			vput(vp);
717 		}
718 		return (NULLVP);
719 	}
720 	VOP_UNLOCK(vp);
721 	vclean(vp, 0);
722 	vp->v_op = nvp->v_op;
723 	vp->v_tag = nvp->v_tag;
724 	nvp->v_type = VNON;
725 	insmntque(vp, mp);
726 	return (vp);
727 }
728 
729 /*
730  * Grab a particular vnode from the free list, increment its
731  * reference count and lock it. The vnode lock bit is set the
732  * vnode is being eliminated in vgone. The process is awakened
733  * when the transition is completed, and an error returned to
734  * indicate that the vnode is no longer usable (possibly having
735  * been changed to a new file system type).
736  */
737 vget(vp, lockflag)
738 	register struct vnode *vp;
739 	int lockflag;
740 {
741 	register struct vnode *vq;
742 
743 	if ((vp->v_flag & VXLOCK) ||
744 	    (vp->v_usecount == 0 &&
745 	     vp->v_freelist.tqe_prev == (struct vnode **)0xdeadb)) {
746 		vp->v_flag |= VXWANT;
747 		tsleep((caddr_t)vp, PINOD, "vget", 0);
748 		return (1);
749 	}
750 	if (vp->v_usecount == 0)
751 		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
752 	vp->v_usecount++;
753 	if (lockflag)
754 		VOP_LOCK(vp);
755 	return (0);
756 }
757 
758 /*
759  * Vnode reference, just increment the count
760  */
761 void
762 vref(vp)
763 	struct vnode *vp;
764 {
765 
766 	if (vp->v_usecount <= 0)
767 		panic("vref used where vget required");
768 	vp->v_usecount++;
769 }
770 
/*
 * Unlock a vnode and then drop one reference to it with vrele().
 */
void
vput(vp)
	register struct vnode *vp;
{

	VOP_UNLOCK(vp);
	vrele(vp);
}
781 
782 /*
783  * Vnode release.
784  * If count drops to zero, call inactive routine and return to freelist.
785  */
786 void
787 vrele(vp)
788 	register struct vnode *vp;
789 {
790 	struct proc *p = curproc;		/* XXX */
791 
792 #ifdef DIAGNOSTIC
793 	if (vp == NULL)
794 		panic("vrele: null vp");
795 #endif
796 	vp->v_usecount--;
797 	if (vp->v_usecount > 0)
798 		return;
799 #ifdef DIAGNOSTIC
800 	if (vp->v_usecount != 0 || vp->v_writecount != 0) {
801 		vprint("vrele: bad ref count", vp);
802 		panic("vrele: ref cnt");
803 	}
804 #endif
805 	TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
806 	VOP_INACTIVE(vp, p);
807 }
808 
809 /*
810  * Page or buffer structure gets a reference.
811  */
812 vhold(vp)
813 	register struct vnode *vp;
814 {
815 
816 	vp->v_holdcnt++;
817 }
818 
819 /*
820  * Page or buffer structure frees a reference.
821  */
822 holdrele(vp)
823 	register struct vnode *vp;
824 {
825 
826 	if (vp->v_holdcnt <= 0)
827 		panic("holdrele: holdcnt");
828 	vp->v_holdcnt--;
829 }
830 
831 /*
832  * Remove any vnodes in the vnode table belonging to mount point mp.
833  *
834  * If MNT_NOFORCE is specified, there should not be any active ones,
835  * return error if any are found (nb: this is a user error, not a
836  * system error). If MNT_FORCE is specified, detach any active vnodes
837  * that are found.
838  */
839 int busyprt = 0;	/* patch to print out busy vnodes */
840 
841 vflush(mp, skipvp, flags)
842 	struct mount *mp;
843 	struct vnode *skipvp;
844 	int flags;
845 {
846 	register struct vnode *vp, *nvp;
847 	int busy = 0;
848 
849 	if ((mp->mnt_flag & MNT_MPBUSY) == 0)
850 		panic("vflush: not busy");
851 loop:
852 	for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
853 		if (vp->v_mount != mp)
854 			goto loop;
855 		nvp = vp->v_mntvnodes.le_next;
856 		/*
857 		 * Skip over a selected vnode.
858 		 */
859 		if (vp == skipvp)
860 			continue;
861 		/*
862 		 * Skip over a vnodes marked VSYSTEM.
863 		 */
864 		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM))
865 			continue;
866 		/*
867 		 * With v_usecount == 0, all we need to do is clear
868 		 * out the vnode data structures and we are done.
869 		 */
870 		if (vp->v_usecount == 0) {
871 			vgone(vp);
872 			continue;
873 		}
874 		/*
875 		 * For block or character devices, revert to an
876 		 * anonymous device. For all other files, just kill them.
877 		 */
878 		if (flags & FORCECLOSE) {
879 			if (vp->v_type != VBLK && vp->v_type != VCHR) {
880 				vgone(vp);
881 			} else {
882 				vclean(vp, 0);
883 				vp->v_op = &spec_vnodeops;
884 				insmntque(vp, (struct mount *)0);
885 			}
886 			continue;
887 		}
888 		if (busyprt)
889 			vprint("vflush: busy vnode", vp);
890 		busy++;
891 	}
892 	if (busy)
893 		return (EBUSY);
894 	return (0);
895 }
896 
897 /*
898  * Disassociate the underlying file system from a vnode.
899  */
900 void
901 vclean(vp, flags)
902 	register struct vnode *vp;
903 	int flags;
904 {
905 	struct vnodeops *origops;
906 	int active;
907 	struct proc *p = curproc;	/* XXX */
908 
909 	/*
910 	 * Check to see if the vnode is in use.
911 	 * If so we have to reference it before we clean it out
912 	 * so that its count cannot fall to zero and generate a
913 	 * race against ourselves to recycle it.
914 	 */
915 	if (active = vp->v_usecount)
916 		VREF(vp);
917 	/*
918 	 * Prevent the vnode from being recycled or
919 	 * brought into use while we clean it out.
920 	 */
921 	if (vp->v_flag & VXLOCK)
922 		panic("vclean: deadlock");
923 	vp->v_flag |= VXLOCK;
924 	/*
925 	 * Even if the count is zero, the VOP_INACTIVE routine may still
926 	 * have the object locked while it cleans it out. The VOP_LOCK
927 	 * ensures that the VOP_INACTIVE routine is done with its work.
928 	 * For active vnodes, it ensures that no other activity can
929 	 * occur while the buffer list is being cleaned out.
930 	 */
931 	VOP_LOCK(vp);
932 	if (flags & DOCLOSE)
933 		vinvalbuf(vp, 1);
934 	/*
935 	 * Prevent any further operations on the vnode from
936 	 * being passed through to the old file system.
937 	 */
938 	origops = vp->v_op;
939 	vp->v_op = &dead_vnodeops;
940 	vp->v_tag = VT_NON;
941 	/*
942 	 * If purging an active vnode, it must be unlocked, closed,
943 	 * and deactivated before being reclaimed.
944 	 */
945 	(*(origops->vop_unlock))(vp);
946 	if (active) {
947 		if (flags & DOCLOSE)
948 			(*(origops->vop_close))(vp, IO_NDELAY, NOCRED, p);
949 		(*(origops->vop_inactive))(vp, p);
950 	}
951 	/*
952 	 * Reclaim the vnode.
953 	 */
954 	if ((*(origops->vop_reclaim))(vp))
955 		panic("vclean: cannot reclaim");
956 	if (active)
957 		vrele(vp);
958 	/*
959 	 * Done with purge, notify sleepers in vget of the grim news.
960 	 */
961 	vp->v_flag &= ~VXLOCK;
962 	if (vp->v_flag & VXWANT) {
963 		vp->v_flag &= ~VXWANT;
964 		wakeup((caddr_t)vp);
965 	}
966 }
967 
968 /*
969  * Eliminate all activity associated with  the requested vnode
970  * and with all vnodes aliased to the requested vnode.
971  */
972 void
973 vgoneall(vp)
974 	register struct vnode *vp;
975 {
976 	register struct vnode *vq;
977 
978 	if (vp->v_flag & VALIASED) {
979 		/*
980 		 * If a vgone (or vclean) is already in progress,
981 		 * wait until it is done and return.
982 		 */
983 		if (vp->v_flag & VXLOCK) {
984 			vp->v_flag |= VXWANT;
985 			tsleep((caddr_t)vp, PINOD, "vgoneall", 0);
986 			return;
987 		}
988 		/*
989 		 * Ensure that vp will not be vgone'd while we
990 		 * are eliminating its aliases.
991 		 */
992 		vp->v_flag |= VXLOCK;
993 		while (vp->v_flag & VALIASED) {
994 			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
995 				if (vq->v_rdev != vp->v_rdev ||
996 				    vq->v_type != vp->v_type || vp == vq)
997 					continue;
998 				vgone(vq);
999 				break;
1000 			}
1001 		}
1002 		/*
1003 		 * Remove the lock so that vgone below will
1004 		 * really eliminate the vnode after which time
1005 		 * vgone will awaken any sleepers.
1006 		 */
1007 		vp->v_flag &= ~VXLOCK;
1008 	}
1009 	vgone(vp);
1010 }
1011 
1012 /*
1013  * Eliminate all activity associated with a vnode
1014  * in preparation for reuse.
1015  */
1016 void
1017 vgone(vp)
1018 	register struct vnode *vp;
1019 {
1020 	register struct vnode *vq;
1021 	struct vnode *vx;
1022 
1023 	/*
1024 	 * If a vgone (or vclean) is already in progress,
1025 	 * wait until it is done and return.
1026 	 */
1027 	if (vp->v_flag & VXLOCK) {
1028 		vp->v_flag |= VXWANT;
1029 		tsleep((caddr_t)vp, PINOD, "vgone", 0);
1030 		return;
1031 	}
1032 	/*
1033 	 * Clean out the filesystem specific data.
1034 	 */
1035 	vclean(vp, DOCLOSE);
1036 	/*
1037 	 * Delete from old mount point vnode list, if on one.
1038 	 */
1039 	if (vp->v_mount != NULL) {
1040 		LIST_REMOVE(vp, v_mntvnodes);
1041 		vp->v_mount = NULL;
1042 	}
1043 	/*
1044 	 * If special device, remove it from special device alias list.
1045 	 */
1046 	if (vp->v_type == VBLK || vp->v_type == VCHR) {
1047 		if (*vp->v_hashchain == vp) {
1048 			*vp->v_hashchain = vp->v_specnext;
1049 		} else {
1050 			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1051 				if (vq->v_specnext != vp)
1052 					continue;
1053 				vq->v_specnext = vp->v_specnext;
1054 				break;
1055 			}
1056 			if (vq == NULL)
1057 				panic("missing bdev");
1058 		}
1059 		if (vp->v_flag & VALIASED) {
1060 			vx = NULL;
1061 			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1062 				if (vq->v_rdev != vp->v_rdev ||
1063 				    vq->v_type != vp->v_type)
1064 					continue;
1065 				if (vx != NULL)
1066 					break;
1067 				vx = vq;
1068 			}
1069 			if (vx == NULL)
1070 				panic("missing alias");
1071 			if (vq == NULL)
1072 				vx->v_flag &= ~VALIASED;
1073 			vp->v_flag &= ~VALIASED;
1074 		}
1075 		FREE(vp->v_specinfo, M_VNODE);
1076 		vp->v_specinfo = NULL;
1077 	}
1078 	/*
1079 	 * If it is on the freelist, move it to the head of the list.
1080 	 */
1081 	if (vp->v_usecount == 0 &&
1082 	    vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb &&
1083 	    vnode_free_list.tqh_first != vp) {
1084 		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1085 		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1086 	}
1087 	vp->v_type = VBAD;
1088 }
1089 
1090 /*
1091  * Lookup a vnode by device number.
1092  */
1093 vfinddev(dev, type, vpp)
1094 	dev_t dev;
1095 	enum vtype type;
1096 	struct vnode **vpp;
1097 {
1098 	register struct vnode *vp;
1099 
1100 	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
1101 		if (dev != vp->v_rdev || type != vp->v_type)
1102 			continue;
1103 		*vpp = vp;
1104 		return (0);
1105 	}
1106 	return (1);
1107 }
1108 
1109 /*
1110  * Calculate the total number of references to a special device.
1111  */
1112 vcount(vp)
1113 	register struct vnode *vp;
1114 {
1115 	register struct vnode *vq;
1116 	int count;
1117 
1118 loop:
1119 	if ((vp->v_flag & VALIASED) == 0)
1120 		return (vp->v_usecount);
1121 
1122 	for (count = 0, vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1123 		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
1124 			continue;
1125 		/*
1126 		 * Alias, but not in use, so flush it out.
1127 		 */
1128 		if (vq->v_usecount == 0 && vq != vp) {
1129 			vgone(vq);
1130 			goto loop;
1131 		}
1132 		count += vq->v_usecount;
1133 	}
1134 	return (count);
1135 }
1136 
1137 /*
1138  * Print out a description of a vnode.
1139  */
1140 static char *typename[] =
1141    { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };
1142 
1143 void
1144 vprint(label, vp)
1145 	char *label;
1146 	register struct vnode *vp;
1147 {
1148 	char buf[64];
1149 
1150 	if (label != NULL)
1151 		printf("%s: ", label);
1152 	printf("type %s, usecount %d, writecount %d, refcount %d,",
1153 		typename[vp->v_type], vp->v_usecount, vp->v_writecount,
1154 		vp->v_holdcnt);
1155 	buf[0] = '\0';
1156 	if (vp->v_flag & VROOT)
1157 		strcat(buf, "|VROOT");
1158 	if (vp->v_flag & VTEXT)
1159 		strcat(buf, "|VTEXT");
1160 	if (vp->v_flag & VSYSTEM)
1161 		strcat(buf, "|VSYSTEM");
1162 	if (vp->v_flag & VXLOCK)
1163 		strcat(buf, "|VXLOCK");
1164 	if (vp->v_flag & VXWANT)
1165 		strcat(buf, "|VXWANT");
1166 	if (vp->v_flag & VBWAIT)
1167 		strcat(buf, "|VBWAIT");
1168 	if (vp->v_flag & VALIASED)
1169 		strcat(buf, "|VALIASED");
1170 	if (buf[0] != '\0')
1171 		printf(" flags (%s)", &buf[1]);
1172 	printf("\n\t");
1173 	VOP_PRINT(vp);
1174 }
1175 
#ifdef DEBUG
/*
 * Print every locked vnode on every mounted filesystem.
 * Called when debugging the kernel.
 */
printlockedvnodes()
{
	struct mount *fs;
	struct vnode *cur;

	printf("Locked vnodes\n");
	fs = mountlist.tqh_first;
	while (fs != NULL) {
		cur = fs->mnt_vnodelist.lh_first;
		while (cur != NULL) {
			if (VOP_ISLOCKED(cur))
				vprint((char *)0, cur);
			cur = cur->v_mntvnodes.le_next;
		}
		fs = fs->mnt_list.tqe_next;
	}
}
#endif
1195 
1196 int kinfo_vdebug = 1;
1197 int kinfo_vgetfailed;
1198 #define KINFO_VNODESLOP	10
1199 /*
1200  * Dump vnode list (via sysctl).
1201  * Copyout address of vnode followed by vnode.
1202  */
1203 /* ARGSUSED */
1204 sysctl_vnode(where, sizep)
1205 	char *where;
1206 	size_t *sizep;
1207 {
1208 	register struct mount *mp, *nmp;
1209 	struct vnode *vp;
1210 	register char *bp = where, *savebp;
1211 	char *ewhere;
1212 	int error;
1213 
1214 #define VPTRSZ	sizeof (struct vnode *)
1215 #define VNODESZ	sizeof (struct vnode)
1216 	if (where == NULL) {
1217 		*sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
1218 		return (0);
1219 	}
1220 	ewhere = where + *sizep;
1221 
1222 	for (mp = mountlist.tqh_first; mp != NULL; mp = nmp) {
1223 		nmp = mp->mnt_list.tqe_next;
1224 		if (vfs_busy(mp))
1225 			continue;
1226 		savebp = bp;
1227 again:
1228 		for (vp = mp->mnt_vnodelist.lh_first;
1229 		     vp != NULL;
1230 		     vp = vp->v_mntvnodes.le_next) {
1231 			/*
1232 			 * Check that the vp is still associated with
1233 			 * this filesystem.  RACE: could have been
1234 			 * recycled onto the same filesystem.
1235 			 */
1236 			if (vp->v_mount != mp) {
1237 				if (kinfo_vdebug)
1238 					printf("kinfo: vp changed\n");
1239 				bp = savebp;
1240 				goto again;
1241 			}
1242 			if (bp + VPTRSZ + VNODESZ > ewhere) {
1243 				*sizep = bp - where;
1244 				return (ENOMEM);
1245 			}
1246 			if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) ||
1247 			   (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ)))
1248 				return (error);
1249 			bp += VPTRSZ + VNODESZ;
1250 		}
1251 		vfs_unbusy(mp);
1252 	}
1253 
1254 	*sizep = bp - where;
1255 	return (0);
1256 }
1257