xref: /netbsd-src/sys/kern/vfs_vnode.c (revision 46f5119e40af2e51998f686b2fdcc76b5488f7f3)
1 /*	$NetBSD: vfs_vnode.c,v 1.5 2011/04/04 02:46:57 rmind Exp $	*/
2 
3 /*-
4  * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9  * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 /*
34  * Copyright (c) 1989, 1993
35  *	The Regents of the University of California.  All rights reserved.
36  * (c) UNIX System Laboratories, Inc.
37  * All or some portions of this file are derived from material licensed
38  * to the University of California by American Telephone and Telegraph
39  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
40  * the permission of UNIX System Laboratories, Inc.
41  *
42  * Redistribution and use in source and binary forms, with or without
43  * modification, are permitted provided that the following conditions
44  * are met:
45  * 1. Redistributions of source code must retain the above copyright
46  *    notice, this list of conditions and the following disclaimer.
47  * 2. Redistributions in binary form must reproduce the above copyright
48  *    notice, this list of conditions and the following disclaimer in the
49  *    documentation and/or other materials provided with the distribution.
50  * 3. Neither the name of the University nor the names of its contributors
51  *    may be used to endorse or promote products derived from this software
52  *    without specific prior written permission.
53  *
54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64  * SUCH DAMAGE.
65  *
66  *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
67  */
68 
69 /*
70  * Note on v_usecount and locking:
71  *
72  * At nearly all points where it is known that v_usecount could be zero,
73  * the vnode interlock will be held.
74  *
75  * To change v_usecount away from zero, the interlock must be held.  To
76  * change from a non-zero value to zero, again the interlock must be
77  * held.
78  *
79  * There's a flag bit, VC_XLOCK, embedded in v_usecount.
80  * To raise v_usecount, if the VC_XLOCK bit is set in it, the interlock
81  * must be held.
82  * To modify the VC_XLOCK bit, the interlock must be held.
83  * We always keep the usecount (v_usecount & VC_MASK) non-zero while the
84  * VC_XLOCK bit is set.
85  *
86  * Unless the VC_XLOCK bit is set, changing the usecount from a non-zero
87  * value to a non-zero value can safely be done using atomic operations,
88  * without the interlock held.
89  * Even if the VC_XLOCK bit is set, decreasing the usecount to a non-zero
90  * value can be done using atomic operations, without the interlock held.
91  */
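
/*
 * Illustrative sketch of the lock-free reference gain the rules above
 * permit (assumes VC_XLOCK is a high-order flag bit in v_usecount and
 * VC_MASK covers the count itself, as defined in sys/vnode.h); vtryget()
 * below implements this pattern:
 *
 *	u_int use = vp->v_usecount;
 *	if (use != 0 && (use & VC_XLOCK) == 0 &&
 *	    atomic_cas_uint(&vp->v_usecount, use, use + 1) == use)
 *		a reference has been gained without taking v_interlock
 */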
92 
93 #include <sys/cdefs.h>
94 __KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.5 2011/04/04 02:46:57 rmind Exp $");
95 
96 #include <sys/param.h>
97 #include <sys/kernel.h>
98 
99 #include <sys/atomic.h>
100 #include <sys/buf.h>
101 #include <sys/conf.h>
102 #include <sys/device.h>
103 #include <sys/kauth.h>
104 #include <sys/kmem.h>
105 #include <sys/kthread.h>
106 #include <sys/module.h>
107 #include <sys/mount.h>
108 #include <sys/namei.h>
109 #include <sys/syscallargs.h>
110 #include <sys/sysctl.h>
111 #include <sys/systm.h>
112 #include <sys/vnode.h>
113 #include <sys/wapbl.h>
114 
115 #include <uvm/uvm.h>
116 #include <uvm/uvm_readahead.h>
117 
118 u_int			numvnodes;
119 
120 static pool_cache_t	vnode_cache;
121 static kmutex_t		vnode_free_list_lock;
122 
123 static vnodelst_t	vnode_free_list;
124 static vnodelst_t	vnode_hold_list;
125 static vnodelst_t	vrele_list;
126 
127 static kmutex_t		vrele_lock;
128 static kcondvar_t	vrele_cv;
129 static lwp_t *		vrele_lwp;
130 static int		vrele_pending;
131 static int		vrele_gen;
132 
133 static vnode_t *	getcleanvnode(void);
134 static void		vrele_thread(void *);
135 static void		vpanic(vnode_t *, const char *);
136 
137 /* Routines having to do with the management of the vnode table. */
138 extern int		(**dead_vnodeop_p)(void *);
139 
140 void
141 vfs_vnode_sysinit(void)
142 {
143 	int error;
144 
145 	vnode_cache = pool_cache_init(sizeof(vnode_t), 0, 0, 0, "vnodepl",
146 	    NULL, IPL_NONE, NULL, NULL, NULL);
147 	KASSERT(vnode_cache != NULL);
148 
149 	mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE);
150 	TAILQ_INIT(&vnode_free_list);
151 	TAILQ_INIT(&vnode_hold_list);
152 	TAILQ_INIT(&vrele_list);
153 
154 	mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
155 	cv_init(&vrele_cv, "vrele");
156 	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
157 	    NULL, &vrele_lwp, "vrele");
158 	KASSERT(error == 0);
159 }
160 
161 /*
162  * Allocate a new, uninitialized vnode.  If 'mp' is non-NULL, this is a
163  * marker vnode and we are prepared to wait for the allocation.
164  */
165 vnode_t *
166 vnalloc(struct mount *mp)
167 {
168 	vnode_t *vp;
169 
170 	vp = pool_cache_get(vnode_cache, (mp != NULL ? PR_WAITOK : PR_NOWAIT));
171 	if (vp == NULL) {
172 		return NULL;
173 	}
174 
175 	memset(vp, 0, sizeof(*vp));
176 	UVM_OBJ_INIT(&vp->v_uobj, &uvm_vnodeops, 0);
177 	cv_init(&vp->v_cv, "vnode");
178 	/*
179 	 * Done by memset() above.
180 	 *	LIST_INIT(&vp->v_nclist);
181 	 *	LIST_INIT(&vp->v_dnclist);
182 	 */
183 
184 	if (mp != NULL) {
185 		vp->v_mount = mp;
186 		vp->v_type = VBAD;
187 		vp->v_iflag = VI_MARKER;
188 	} else {
189 		rw_init(&vp->v_lock);
190 	}
191 
192 	return vp;
193 }
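
/*
 * Typical marker-vnode usage (a sketch; the iteration helpers live in
 * other files): a marker is threaded onto a mount's vnode list so a
 * scan can drop locks and later resume where it left off.
 *
 *	vnode_t *mvp = vnalloc(mp);	marker, VI_MARKER is set
 *	...insert mvp into mp->mnt_vnodelist, walk, resume from mvp...
 *	vnfree(mvp);
 *
 * Marker vnodes never gain references and are never passed to VOPs.
 */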
194 
195 /*
196  * Free an unused, unreferenced vnode.
197  */
198 void
199 vnfree(vnode_t *vp)
200 {
201 
202 	KASSERT(vp->v_usecount == 0);
203 
204 	if ((vp->v_iflag & VI_MARKER) == 0) {
205 		rw_destroy(&vp->v_lock);
206 		mutex_enter(&vnode_free_list_lock);
207 		numvnodes--;
208 		mutex_exit(&vnode_free_list_lock);
209 	}
210 
211 	UVM_OBJ_DESTROY(&vp->v_uobj);
212 	cv_destroy(&vp->v_cv);
213 	pool_cache_put(vnode_cache, vp);
214 }
215 
216 /*
217  * getcleanvnode: grab a vnode from freelist and clean it.
218  *
219  * => Releases vnode_free_list_lock.
220  * => Returns referenced vnode on success.
221  */
222 static vnode_t *
223 getcleanvnode(void)
224 {
225 	vnode_t *vp;
226 	vnodelst_t *listhd;
227 
228 	KASSERT(mutex_owned(&vnode_free_list_lock));
229 retry:
230 	listhd = &vnode_free_list;
231 try_nextlist:
232 	TAILQ_FOREACH(vp, listhd, v_freelist) {
233 		/*
234 		 * It's safe to test v_usecount and v_iflag
235 		 * without holding the interlock here: vnodes
236 		 * that are referenced or being cleaned out
237 		 * should never appear on these lists.
238 		 */
239 		KASSERT(vp->v_usecount == 0);
240 		KASSERT((vp->v_iflag & VI_CLEAN) == 0);
241 		KASSERT(vp->v_freelisthd == listhd);
242 
243 		if (!mutex_tryenter(&vp->v_interlock))
244 			continue;
245 		if ((vp->v_iflag & VI_XLOCK) == 0)
246 			break;
247 		mutex_exit(&vp->v_interlock);
248 	}
249 
250 	if (vp == NULL) {
251 		if (listhd == &vnode_free_list) {
252 			listhd = &vnode_hold_list;
253 			goto try_nextlist;
254 		}
255 		mutex_exit(&vnode_free_list_lock);
256 		return NULL;
257 	}
258 
259 	/* Remove it from the freelist. */
260 	TAILQ_REMOVE(listhd, vp, v_freelist);
261 	vp->v_freelisthd = NULL;
262 	mutex_exit(&vnode_free_list_lock);
263 
264 	KASSERT(vp->v_usecount == 0);
265 
266 	/*
267 	 * The vnode is still associated with a file system, so we must
268 	 * clean it out before reusing it.  We need to add a reference
269 	 * before doing this.  If the vnode gains another reference while
270 	 * being cleaned out then we lose - retry.
271 	 */
272 	atomic_add_int(&vp->v_usecount, 1 + VC_XLOCK);
273 	vclean(vp, DOCLOSE);
274 	KASSERT(vp->v_usecount >= 1 + VC_XLOCK);
275 	atomic_add_int(&vp->v_usecount, -VC_XLOCK);
276 	if (vp->v_usecount == 1) {
277 		/* We're about to dirty it. */
278 		vp->v_iflag &= ~VI_CLEAN;
279 		mutex_exit(&vp->v_interlock);
280 		if (vp->v_type == VBLK || vp->v_type == VCHR) {
281 			spec_node_destroy(vp);
282 		}
283 		vp->v_type = VNON;
284 	} else {
285 		/*
286 		 * Don't return to freelist - the holder of the last
287 		 * reference will destroy it.
288 		 */
289 		vrelel(vp, 0); /* releases vp->v_interlock */
290 		mutex_enter(&vnode_free_list_lock);
291 		goto retry;
292 	}
293 
294 	KASSERT(vp->v_data == NULL);
295 	KASSERT(vp->v_uobj.uo_npages == 0);
296 	KASSERT(TAILQ_EMPTY(&vp->v_uobj.memq));
297 	KASSERT(vp->v_numoutput == 0);
298 	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
299 
300 	return vp;
301 }
302 
303 /*
304  * getnewvnode: allocate a new vnode or recycle one from the free list.
305  *
306  * => Returns referenced vnode, moved into the mount queue.
307  */
308 int
309 getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
310 	    vnode_t **vpp)
311 {
312 	struct uvm_object *uobj;
313 	static int toggle;
314 	vnode_t *vp;
315 	int error = 0, tryalloc;
316 
317 try_again:
318 	if (mp != NULL) {
319 		/*
320 		 * Mark filesystem busy while we are creating a vnode.
321 		 * If unmount is in progress, this will fail.
322 		 */
323 		error = vfs_busy(mp, NULL);
324 		if (error)
325 			return error;
326 	}
327 
328 	/*
329 	 * We must choose whether to allocate a new vnode or recycle an
330 	 * existing one. The criterion for allocating a new one is that
331 	 * the total number of vnodes is less than the number desired or
332 	 * there are no vnodes on either free list. Generally we only
333 	 * want to recycle vnodes that have no buffers associated with
334 	 * them, so we look first on the vnode_free_list. If it is empty,
335 	 * we next consider vnodes with referencing buffers on the
336 	 * vnode_hold_list. The toggle ensures that half the time we
337 	 * will recycle a vnode from the vnode_hold_list, and half the
338 	 * time we will allocate a new one unless the list has grown to
339 	 * twice the desired size. We are reluctant to recycle vnodes
340 	 * from the vnode_hold_list because we would lose the identity
341 	 * of all their referencing buffers.
342 	 */
343 
344 	vp = NULL;
345 
346 	mutex_enter(&vnode_free_list_lock);
347 
348 	toggle ^= 1;
349 	if (numvnodes > 2 * desiredvnodes)
350 		toggle = 0;
351 
352 	tryalloc = numvnodes < desiredvnodes ||
353 	    (TAILQ_FIRST(&vnode_free_list) == NULL &&
354 	    (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle));
355 
356 	if (tryalloc) {
357 		/* Allocate a new vnode. */
358 		numvnodes++;
359 		mutex_exit(&vnode_free_list_lock);
360 		if ((vp = vnalloc(NULL)) == NULL) {
361 			mutex_enter(&vnode_free_list_lock);
362 			numvnodes--;
363 		} else
364 			vp->v_usecount = 1;
365 	}
366 
367 	if (vp == NULL) {
368 		/* Recycle and get vnode clean. */
369 		vp = getcleanvnode();
370 		if (vp == NULL) {
371 			if (mp != NULL) {
372 				vfs_unbusy(mp, false, NULL);
373 			}
374 			if (tryalloc) {
375 				printf("WARNING: unable to allocate new "
376 				    "vnode, retrying...\n");
377 				kpause("newvn", false, hz, NULL);
378 				goto try_again;
379 			}
380 			tablefull("vnode", "increase kern.maxvnodes or NVNODE");
381 			*vpp = NULL;
382 			return ENFILE;
383 		}
384 		vp->v_iflag = 0;
385 		vp->v_vflag = 0;
386 		vp->v_uflag = 0;
387 		vp->v_socket = NULL;
388 	}
389 
390 	KASSERT(vp->v_usecount == 1);
391 	KASSERT(vp->v_freelisthd == NULL);
392 	KASSERT(LIST_EMPTY(&vp->v_nclist));
393 	KASSERT(LIST_EMPTY(&vp->v_dnclist));
394 
395 	/* Initialize vnode. */
396 	vp->v_type = VNON;
397 	vp->v_tag = tag;
398 	vp->v_op = vops;
399 	vp->v_data = NULL;
400 
401 	uobj = &vp->v_uobj;
402 	KASSERT(uobj->pgops == &uvm_vnodeops);
403 	KASSERT(uobj->uo_npages == 0);
404 	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
405 	vp->v_size = vp->v_writesize = VSIZENOTSET;
406 
407 	/* Finally, move vnode into the mount queue. */
408 	vfs_insmntque(vp, mp);
409 
410 	if (mp != NULL) {
411 		if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
412 			vp->v_vflag |= VV_MPSAFE;
413 		vfs_unbusy(mp, true, NULL);
414 	}
415 
416 	*vpp = vp;
417 	return 0;
418 }
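
/*
 * Sketch of the usual caller pattern (illustrative; the identifiers are
 * borrowed from ufs and are not part of this file): a file system's
 * vget routine allocates a fresh vnode and then attaches its private
 * data before publishing it.
 *
 *	error = getnewvnode(VT_UFS, mp, ufs_vnodeop_p, &vp);
 *	if (error)
 *		return error;
 *	vp->v_data = ip;			attach fs-private data
 *	vp->v_type = IFTOVT(ip->i_mode);
 */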
419 
420 /*
421  * This is really just the reverse of getnewvnode(). Needed for
422  * VFS_VGET functions that may need to push back a vnode in case
423  * of a locking race.
424  */
425 void
426 ungetnewvnode(vnode_t *vp)
427 {
428 
429 	KASSERT(vp->v_usecount == 1);
430 	KASSERT(vp->v_data == NULL);
431 	KASSERT(vp->v_freelisthd == NULL);
432 
433 	mutex_enter(&vp->v_interlock);
434 	vp->v_iflag |= VI_CLEAN;
435 	vrelel(vp, 0);
436 }
437 
438 /*
439  * Remove a vnode from its freelist.
440  */
441 void
442 vremfree(vnode_t *vp)
443 {
444 
445 	KASSERT(mutex_owned(&vp->v_interlock));
446 	KASSERT(vp->v_usecount == 0);
447 
448 	/*
449 	 * Note that the reference count must not change until
450 	 * the vnode is removed.
451 	 */
452 	mutex_enter(&vnode_free_list_lock);
453 	if (vp->v_holdcnt > 0) {
454 		KASSERT(vp->v_freelisthd == &vnode_hold_list);
455 	} else {
456 		KASSERT(vp->v_freelisthd == &vnode_free_list);
457 	}
458 	TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
459 	vp->v_freelisthd = NULL;
460 	mutex_exit(&vnode_free_list_lock);
461 }
462 
463 /*
464  * Try to gain a reference to a vnode, without acquiring its interlock.
465  * The caller must hold a lock that will prevent the vnode from being
466  * recycled or freed.
467  */
468 bool
469 vtryget(vnode_t *vp)
470 {
471 	u_int use, next;
472 
473 	/*
474 	 * If the vnode is being freed, don't make life any harder
475 	 * for vclean() by adding another reference without waiting.
476 	 * This is not strictly necessary, but we'll do it anyway.
477 	 */
478 	if (__predict_false((vp->v_iflag & VI_XLOCK) != 0)) {
479 		return false;
480 	}
481 	for (use = vp->v_usecount;; use = next) {
482 		if (use == 0 || __predict_false((use & VC_XLOCK) != 0)) {
483 			/* Need interlock held if first reference. */
484 			return false;
485 		}
486 		next = atomic_cas_uint(&vp->v_usecount, use, use + 1);
487 		if (__predict_true(next == use)) {
488 			return true;
489 		}
490 	}
491 }
492 
493 /*
494  * vget: get a particular vnode from the free list, increment its reference
495  * count and lock it.
496  *
497  * => Should be called with v_interlock held.
498  *
499  * If VI_XLOCK is set, the vnode is being eliminated in vgone()/vclean().
500  * In that case, we cannot grab the vnode, so the process is awakened when
501  * the transition is completed, and an error returned to indicate that the
502  * vnode is no longer usable (e.g. changed to a new file system type).
503  */
504 int
505 vget(vnode_t *vp, int flags)
506 {
507 	int error = 0;
508 
509 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
510 	KASSERT(mutex_owned(&vp->v_interlock));
511 	KASSERT((flags & ~(LK_SHARED|LK_EXCLUSIVE|LK_NOWAIT)) == 0);
512 
513 	/*
514 	 * Before adding a reference, we must remove the vnode
515 	 * from its freelist.
516 	 */
517 	if (vp->v_usecount == 0) {
518 		vremfree(vp);
519 		vp->v_usecount = 1;
520 	} else {
521 		atomic_inc_uint(&vp->v_usecount);
522 	}
523 
524 	/*
525 	 * If the vnode is in the process of being cleaned out for
526 	 * another use, we wait for the cleaning to finish and then
527 	 * return failure.  Cleaning is determined by checking if
528 	 * the VI_XLOCK flag is set.
529 	 */
530 	if ((vp->v_iflag & VI_XLOCK) != 0) {
531 		if ((flags & LK_NOWAIT) != 0) {
532 			vrelel(vp, 0);
533 			return EBUSY;
534 		}
535 		vwait(vp, VI_XLOCK);
536 		vrelel(vp, 0);
537 		return ENOENT;
538 	}
539 
540 	/*
541 	 * Ok, we got it in good shape.  Just locking left.
542 	 */
543 	KASSERT((vp->v_iflag & VI_CLEAN) == 0);
544 	mutex_exit(&vp->v_interlock);
545 	if (flags & (LK_EXCLUSIVE | LK_SHARED)) {
546 		error = vn_lock(vp, flags);
547 		if (error != 0) {
548 			vrele(vp);
549 		}
550 	}
551 	return error;
552 }
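
/*
 * Sketch of a typical vget() caller (illustrative, not from this file):
 * a file system finds a vnode in its hash table, takes v_interlock, and
 * retries the lookup if the vnode was reclaimed underneath it.
 *
 *	mutex_enter(&vp->v_interlock);
 *	error = vget(vp, LK_EXCLUSIVE);
 *	if (error == ENOENT)
 *		goto retry_lookup;	vnode was cleaned out, look again
 */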
553 
554 /*
555  * vput: unlock and release the reference.
556  */
557 void
558 vput(vnode_t *vp)
559 {
560 
561 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
562 
563 	VOP_UNLOCK(vp);
564 	vrele(vp);
565 }
566 
567 /*
568  * Try to drop reference on a vnode.  Abort if we are releasing the
569  * last reference.  Note: this _must_ succeed if not the last reference.
570  */
571 static inline bool
572 vtryrele(vnode_t *vp)
573 {
574 	u_int use, next;
575 
576 	for (use = vp->v_usecount;; use = next) {
577 		if (use == 1) {
578 			return false;
579 		}
580 		KASSERT((use & VC_MASK) > 1);
581 		next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
582 		if (__predict_true(next == use)) {
583 			return true;
584 		}
585 	}
586 }
587 
588 /*
589  * Vnode release.  If the reference count drops to zero, call the inactive
590  * routine and either return the vnode to the freelist or free it to the pool.
591  */
592 void
593 vrelel(vnode_t *vp, int flags)
594 {
595 	bool recycle, defer;
596 	int error;
597 
598 	KASSERT(mutex_owned(&vp->v_interlock));
599 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
600 	KASSERT(vp->v_freelisthd == NULL);
601 
602 	if (__predict_false(vp->v_op == dead_vnodeop_p &&
603 	    (vp->v_iflag & (VI_CLEAN|VI_XLOCK)) == 0)) {
604 		vpanic(vp, "dead but not clean");
605 	}
606 
607 	/*
608 	 * If not the last reference, just drop the reference count
609 	 * and unlock.
610 	 */
611 	if (vtryrele(vp)) {
612 		vp->v_iflag |= VI_INACTREDO;
613 		mutex_exit(&vp->v_interlock);
614 		return;
615 	}
616 	if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
617 		vpanic(vp, "vrelel: bad ref count");
618 	}
619 
620 	KASSERT((vp->v_iflag & VI_XLOCK) == 0);
621 
622 	/*
623 	 * If not clean, deactivate the vnode, but preserve
624 	 * our reference across the call to VOP_INACTIVE().
625 	 */
626 retry:
627 	if ((vp->v_iflag & VI_CLEAN) == 0) {
628 		recycle = false;
629 		vp->v_iflag |= VI_INACTNOW;
630 
631 		/*
632 		 * XXX This ugly block can be largely eliminated if
633 		 * locking is pushed down into the file systems.
634 		 *
635 		 * Defer vnode release to vrele_thread if caller
636 		 * requests it explicitly.
637 		 */
638 		if ((curlwp == uvm.pagedaemon_lwp) ||
639 		    (flags & VRELEL_ASYNC_RELE) != 0) {
640 			/* The pagedaemon can't wait around; defer. */
641 			defer = true;
642 		} else if (curlwp == vrele_lwp) {
643 			/* We have to try harder. */
644 			vp->v_iflag &= ~VI_INACTREDO;
645 			mutex_exit(&vp->v_interlock);
646 			error = vn_lock(vp, LK_EXCLUSIVE);
647 			if (error != 0) {
648 				/* XXX */
649 				vpanic(vp, "vrelel: unable to lock vnode");
650 			}
651 			defer = false;
652 		} else if ((vp->v_iflag & VI_LAYER) != 0) {
653 			/*
654 			 * Acquiring the stack's lock in vclean() even
655 			 * for an honest vput/vrele is dangerous because
656 			 * our caller may hold other vnode locks; defer.
657 			 */
658 			defer = true;
659 		} else {
660 			/* If we can't acquire the lock, then defer. */
661 			vp->v_iflag &= ~VI_INACTREDO;
662 			mutex_exit(&vp->v_interlock);
663 			error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
664 			if (error != 0) {
665 				defer = true;
666 				mutex_enter(&vp->v_interlock);
667 			} else {
668 				defer = false;
669 			}
670 		}
671 
672 		if (defer) {
673 			/*
674 			 * Defer reclaim to the kthread; it's not safe to
675 			 * clean it here.  We donate it our last reference.
676 			 */
677 			KASSERT(mutex_owned(&vp->v_interlock));
678 			KASSERT((vp->v_iflag & VI_INACTPEND) == 0);
679 			vp->v_iflag &= ~VI_INACTNOW;
680 			vp->v_iflag |= VI_INACTPEND;
681 			mutex_enter(&vrele_lock);
682 			TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist);
683 			if (++vrele_pending > (desiredvnodes >> 8))
684 				cv_signal(&vrele_cv);
685 			mutex_exit(&vrele_lock);
686 			mutex_exit(&vp->v_interlock);
687 			return;
688 		}
689 
690 #ifdef DIAGNOSTIC
691 		if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
692 		    vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
693 			vprint("vrelel: missing VOP_CLOSE()", vp);
694 		}
695 #endif
696 
697 		/*
698 		 * The vnode can gain another reference while being
699 		 * deactivated.  If VOP_INACTIVE() indicates that
700 		 * the described file has been deleted, then recycle
701 		 * the vnode irrespective of additional references.
702 		 * Another thread may be waiting to re-use the on-disk
703 		 * inode.
704 		 *
705 		 * Note that VOP_INACTIVE() will drop the vnode lock.
706 		 */
707 		VOP_INACTIVE(vp, &recycle);
708 		mutex_enter(&vp->v_interlock);
709 		vp->v_iflag &= ~VI_INACTNOW;
710 		if (!recycle) {
711 			if (vtryrele(vp)) {
712 				mutex_exit(&vp->v_interlock);
713 				return;
714 			}
715 
716 			/*
717 			 * If we grew another reference while
718 			 * VOP_INACTIVE() was underway, retry.
719 			 */
720 			if ((vp->v_iflag & VI_INACTREDO) != 0) {
721 				goto retry;
722 			}
723 		}
724 
725 		/* Take care of space accounting. */
726 		if (vp->v_iflag & VI_EXECMAP) {
727 			atomic_add_int(&uvmexp.execpages,
728 			    -vp->v_uobj.uo_npages);
729 			atomic_add_int(&uvmexp.filepages,
730 			    vp->v_uobj.uo_npages);
731 		}
732 		vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
733 		vp->v_vflag &= ~VV_MAPPED;
734 
735 		/*
736 		 * Recycle the vnode if the file is now unused (unlinked),
737 		 * otherwise just free it.
738 		 */
739 		if (recycle) {
740 			vclean(vp, DOCLOSE);
741 		}
742 		KASSERT(vp->v_usecount > 0);
743 	}
744 
745 	if (atomic_dec_uint_nv(&vp->v_usecount) != 0) {
746 		/* Gained another reference while being reclaimed. */
747 		mutex_exit(&vp->v_interlock);
748 		return;
749 	}
750 
751 	if ((vp->v_iflag & VI_CLEAN) != 0) {
752 		/*
753 		 * It's clean so destroy it.  It isn't referenced
754 		 * anywhere since it has been reclaimed.
755 		 */
756 		KASSERT(vp->v_holdcnt == 0);
757 		KASSERT(vp->v_writecount == 0);
758 		mutex_exit(&vp->v_interlock);
759 		vfs_insmntque(vp, NULL);
760 		if (vp->v_type == VBLK || vp->v_type == VCHR) {
761 			spec_node_destroy(vp);
762 		}
763 		vnfree(vp);
764 	} else {
765 		/*
766 		 * Otherwise, put it back onto the freelist.  It
767 		 * can't be destroyed while still associated with
768 		 * a file system.
769 		 */
770 		mutex_enter(&vnode_free_list_lock);
771 		if (vp->v_holdcnt > 0) {
772 			vp->v_freelisthd = &vnode_hold_list;
773 		} else {
774 			vp->v_freelisthd = &vnode_free_list;
775 		}
776 		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
777 		mutex_exit(&vnode_free_list_lock);
778 		mutex_exit(&vp->v_interlock);
779 	}
780 }
781 
782 void
783 vrele(vnode_t *vp)
784 {
785 
786 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
787 
788 	if ((vp->v_iflag & VI_INACTNOW) == 0 && vtryrele(vp)) {
789 		return;
790 	}
791 	mutex_enter(&vp->v_interlock);
792 	vrelel(vp, 0);
793 }
794 
795 /*
796  * Asynchronous vnode release: the vnode is released later, in a different context.
797  */
798 void
799 vrele_async(vnode_t *vp)
800 {
801 
802 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
803 
804 	if ((vp->v_iflag & VI_INACTNOW) == 0 && vtryrele(vp)) {
805 		return;
806 	}
807 	mutex_enter(&vp->v_interlock);
808 	vrelel(vp, VRELEL_ASYNC_RELE);
809 }
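
/*
 * vrele_async() is intended for callers that must not risk sleeping in
 * VOP_INACTIVE(), e.g. code holding other locks.  A plain
 *
 *	vrele_async(vp);
 *
 * returns immediately; if this was the last reference, the final release
 * is performed later by vrele_thread() below.
 */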
810 
811 static void
812 vrele_thread(void *cookie)
813 {
814 	vnode_t *vp;
815 
816 	for (;;) {
817 		mutex_enter(&vrele_lock);
818 		while (TAILQ_EMPTY(&vrele_list)) {
819 			vrele_gen++;
820 			cv_broadcast(&vrele_cv);
821 			cv_timedwait(&vrele_cv, &vrele_lock, hz);
822 		}
823 		vp = TAILQ_FIRST(&vrele_list);
824 		TAILQ_REMOVE(&vrele_list, vp, v_freelist);
825 		vrele_pending--;
826 		mutex_exit(&vrele_lock);
827 
828 		/*
829 		 * If not the last reference, then ignore the vnode
830 		 * and look for more work.
831 		 */
832 		mutex_enter(&vp->v_interlock);
833 		KASSERT((vp->v_iflag & VI_INACTPEND) != 0);
834 		vp->v_iflag &= ~VI_INACTPEND;
835 		vrelel(vp, 0);
836 	}
837 }
838 
839 void
840 vrele_flush(void)
841 {
842 	int gen;
843 
844 	mutex_enter(&vrele_lock);
845 	gen = vrele_gen;
846 	while (vrele_pending && gen == vrele_gen) {
847 		cv_broadcast(&vrele_cv);
848 		cv_wait(&vrele_cv, &vrele_lock);
849 	}
850 	mutex_exit(&vrele_lock);
851 }
852 
853 /*
854  * Vnode reference, where a reference is already held by some other
855  * object (for example, a file structure).
856  */
857 void
858 vref(vnode_t *vp)
859 {
860 
861 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
862 	KASSERT(vp->v_usecount != 0);
863 
864 	atomic_inc_uint(&vp->v_usecount);
865 }
866 
867 /*
868  * Page or buffer structure gets a reference.
869  * Called with v_interlock held.
870  */
871 void
872 vholdl(vnode_t *vp)
873 {
874 
875 	KASSERT(mutex_owned(&vp->v_interlock));
876 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
877 
878 	if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) {
879 		mutex_enter(&vnode_free_list_lock);
880 		KASSERT(vp->v_freelisthd == &vnode_free_list);
881 		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
882 		vp->v_freelisthd = &vnode_hold_list;
883 		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
884 		mutex_exit(&vnode_free_list_lock);
885 	}
886 }
887 
888 /*
889  * Page or buffer structure frees a reference.
890  * Called with v_interlock held.
891  */
892 void
893 holdrelel(vnode_t *vp)
894 {
895 
896 	KASSERT(mutex_owned(&vp->v_interlock));
897 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
898 
899 	if (vp->v_holdcnt <= 0) {
900 		vpanic(vp, "holdrelel: bad holdcnt");
901 	}
902 
903 	vp->v_holdcnt--;
904 	if (vp->v_holdcnt == 0 && vp->v_usecount == 0) {
905 		mutex_enter(&vnode_free_list_lock);
906 		KASSERT(vp->v_freelisthd == &vnode_hold_list);
907 		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
908 		vp->v_freelisthd = &vnode_free_list;
909 		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
910 		mutex_exit(&vnode_free_list_lock);
911 	}
912 }
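
/*
 * Hold counts keep a vnode off vnode_free_list while pages or buffers
 * still reference it.  For example (in code outside this file), bgetvp()
 * takes a hold when associating a buffer with a vnode and brelvp() drops
 * it, so an unreferenced vnode with cached buffers sits on
 * vnode_hold_list instead.
 */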
913 
914 /*
915  * Disassociate the underlying file system from a vnode.
916  *
917  * Must be called with the interlock held, and will return with it held.
918  */
919 void
920 vclean(vnode_t *vp, int flags)
921 {
922 	lwp_t *l = curlwp;
923 	bool recycle, active;
924 	int error;
925 
926 	KASSERT(mutex_owned(&vp->v_interlock));
927 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
928 	KASSERT(vp->v_usecount != 0);
929 
930 	/* If cleaning is already in progress wait until done and return. */
931 	if (vp->v_iflag & VI_XLOCK) {
932 		vwait(vp, VI_XLOCK);
933 		return;
934 	}
935 
936 	/* If already clean, nothing to do. */
937 	if ((vp->v_iflag & VI_CLEAN) != 0) {
938 		return;
939 	}
940 
941 	/*
942 	 * Prevent the vnode from being recycled or brought into use
943 	 * while we clean it out.
944 	 */
945 	vp->v_iflag |= VI_XLOCK;
946 	if (vp->v_iflag & VI_EXECMAP) {
947 		atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
948 		atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
949 	}
950 	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
951 	active = (vp->v_usecount & VC_MASK) > 1;
952 
953 	/* XXXAD should not lock vnode under layer */
954 	mutex_exit(&vp->v_interlock);
955 	VOP_LOCK(vp, LK_EXCLUSIVE);
956 
957 	/*
958 	 * Clean out any cached data associated with the vnode.
959 	 * If purging an active vnode, it must be closed and
960 	 * deactivated before being reclaimed. Note that the
961 	 * VOP_INACTIVE will unlock the vnode.
962 	 */
963 	if (flags & DOCLOSE) {
964 		error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
965 		if (error != 0) {
966 			/* XXX, fix vn_start_write's grab of mp and use that. */
967 
968 			if (wapbl_vphaswapbl(vp))
969 				WAPBL_DISCARD(wapbl_vptomp(vp));
970 			error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
971 		}
972 		KASSERT(error == 0);
973 		KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
974 		if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
975 			 spec_node_revoke(vp);
976 		}
977 	}
978 	if (active) {
979 		VOP_INACTIVE(vp, &recycle);
980 	} else {
981 		/*
982 		 * Any other processes trying to obtain this lock must first
983 		 * wait for VI_XLOCK to clear, then call the new lock operation.
984 		 */
985 		VOP_UNLOCK(vp);
986 	}
987 
988 	/* Disassociate the underlying file system from the vnode. */
989 	if (VOP_RECLAIM(vp)) {
990 		vpanic(vp, "vclean: cannot reclaim");
991 	}
992 
993 	KASSERT(vp->v_uobj.uo_npages == 0);
994 	if (vp->v_type == VREG && vp->v_ractx != NULL) {
995 		uvm_ra_freectx(vp->v_ractx);
996 		vp->v_ractx = NULL;
997 	}
998 	cache_purge(vp);
999 
1000 	/* Done with purge, notify sleepers of the grim news. */
1001 	mutex_enter(&vp->v_interlock);
1002 	vp->v_op = dead_vnodeop_p;
1003 	vp->v_tag = VT_NON;
1004 	KNOTE(&vp->v_klist, NOTE_REVOKE);
1005 	vp->v_iflag &= ~VI_XLOCK;
1006 	vp->v_vflag &= ~VV_LOCKSWORK;
1007 	if ((flags & DOCLOSE) != 0) {
1008 		vp->v_iflag |= VI_CLEAN;
1009 	}
1010 	cv_broadcast(&vp->v_cv);
1011 
1012 	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1013 }
1014 
1015 /*
1016  * Recycle an unused, unreferenced vnode: clean it out and release it.
1017  * Releases the passed interlock if the vnode will be recycled.
1018  */
1019 int
1020 vrecycle(vnode_t *vp, kmutex_t *inter_lkp, struct lwp *l)
1021 {
1022 
1023 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
1024 
1025 	mutex_enter(&vp->v_interlock);
1026 	if (vp->v_usecount != 0) {
1027 		mutex_exit(&vp->v_interlock);
1028 		return 0;
1029 	}
1030 	if (inter_lkp) {
1031 		mutex_exit(inter_lkp);
1032 	}
1033 	vremfree(vp);
1034 	vp->v_usecount = 1;
1035 	vclean(vp, DOCLOSE);
1036 	vrelel(vp, 0);
1037 	return 1;
1038 }
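
/*
 * Sketch of a typical vrecycle() caller (illustrative; resembles what a
 * file system's inactive routine does once the on-disk object is gone):
 *
 *	VOP_UNLOCK(vp);
 *	if (ip->i_mode == 0)		inode has been freed on disk
 *		vrecycle(vp, NULL, curlwp);
 */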
1039 
1040 /*
1041  * Eliminate all activity associated with the requested vnode
1042  * and with all vnodes aliased to the requested vnode.
1043  */
1044 void
1045 vrevoke(vnode_t *vp)
1046 {
1047 	vnode_t *vq, **vpp;
1048 	enum vtype type;
1049 	dev_t dev;
1050 
1051 	KASSERT(vp->v_usecount > 0);
1052 
1053 	mutex_enter(&vp->v_interlock);
1054 	if ((vp->v_iflag & VI_CLEAN) != 0) {
1055 		mutex_exit(&vp->v_interlock);
1056 		return;
1057 	} else if (vp->v_type != VBLK && vp->v_type != VCHR) {
1058 		atomic_inc_uint(&vp->v_usecount);
1059 		vclean(vp, DOCLOSE);
1060 		vrelel(vp, 0);
1061 		return;
1062 	} else {
1063 		dev = vp->v_rdev;
1064 		type = vp->v_type;
1065 		mutex_exit(&vp->v_interlock);
1066 	}
1067 
1068 	vpp = &specfs_hash[SPECHASH(dev)];
1069 	mutex_enter(&device_lock);
1070 	for (vq = *vpp; vq != NULL;) {
1071 		/* If clean or being cleaned, then ignore it. */
1072 		mutex_enter(&vq->v_interlock);
1073 		if ((vq->v_iflag & (VI_CLEAN | VI_XLOCK)) != 0 ||
1074 		    vq->v_rdev != dev || vq->v_type != type) {
1075 			mutex_exit(&vq->v_interlock);
1076 			vq = vq->v_specnext;
1077 			continue;
1078 		}
1079 		mutex_exit(&device_lock);
1080 		if (vq->v_usecount == 0) {
1081 			vremfree(vq);
1082 			vq->v_usecount = 1;
1083 		} else {
1084 			atomic_inc_uint(&vq->v_usecount);
1085 		}
1086 		vclean(vq, DOCLOSE);
1087 		vrelel(vq, 0);
1088 		mutex_enter(&device_lock);
1089 		vq = *vpp;
1090 	}
1091 	mutex_exit(&device_lock);
1092 }
1093 
1094 /*
1095  * Eliminate all activity associated with a vnode in preparation for
1096  * reuse.  Drops a reference from the vnode.
1097  */
1098 void
1099 vgone(vnode_t *vp)
1100 {
1101 
1102 	mutex_enter(&vp->v_interlock);
1103 	vclean(vp, DOCLOSE);
1104 	vrelel(vp, 0);
1105 }
1106 
1107 /*
1108  * Update the outstanding I/O count and wake up waiters when it reaches zero.
1109  */
1110 void
1111 vwakeup(struct buf *bp)
1112 {
1113 	vnode_t *vp;
1114 
1115 	if ((vp = bp->b_vp) == NULL)
1116 		return;
1117 
1118 	KASSERT(bp->b_objlock == &vp->v_interlock);
1119 	KASSERT(mutex_owned(bp->b_objlock));
1120 
1121 	if (--vp->v_numoutput < 0)
1122 		panic("vwakeup: neg numoutput, vp %p", vp);
1123 	if (vp->v_numoutput == 0)
1124 		cv_broadcast(&vp->v_cv);
1125 }
1126 
1127 /*
1128  * Wait for a vnode (typically with VI_XLOCK set) to be cleaned or
1129  * recycled.
1130  */
1131 void
1132 vwait(vnode_t *vp, int flags)
1133 {
1134 
1135 	KASSERT(mutex_owned(&vp->v_interlock));
1136 	KASSERT(vp->v_usecount != 0);
1137 
1138 	while ((vp->v_iflag & flags) != 0)
1139 		cv_wait(&vp->v_cv, &vp->v_interlock);
1140 }
1141 
1142 int
1143 vfs_drainvnodes(long target)
1144 {
1145 
1146 	while (numvnodes > target) {
1147 		vnode_t *vp;
1148 
1149 		mutex_enter(&vnode_free_list_lock);
1150 		vp = getcleanvnode();
1151 		if (vp == NULL) {
1152 			return EBUSY;
1153 		}
1154 		ungetnewvnode(vp);
1155 	}
1156 	return 0;
1157 }
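
/*
 * vfs_drainvnodes() is used when the vnode limit is lowered, e.g. by the
 * kern.maxvnodes sysctl: after desiredvnodes has been reduced, it keeps
 * recycling free vnodes until numvnodes falls to the new target, or
 * returns EBUSY if it cannot get there.
 */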
1158 
1159 void
1160 vpanic(vnode_t *vp, const char *msg)
1161 {
1162 #ifdef DIAGNOSTIC
1163 
1164 	vprint(NULL, vp);
1165 	panic("%s\n", msg);
1166 #endif
1167 }
1168