1 /*	$NetBSD: vfs_vnode.c,v 1.37 2014/07/05 09:33:15 hannken Exp $	*/
2 
3 /*-
4  * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9  * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 /*
34  * Copyright (c) 1989, 1993
35  *	The Regents of the University of California.  All rights reserved.
36  * (c) UNIX System Laboratories, Inc.
37  * All or some portions of this file are derived from material licensed
38  * to the University of California by American Telephone and Telegraph
39  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
40  * the permission of UNIX System Laboratories, Inc.
41  *
42  * Redistribution and use in source and binary forms, with or without
43  * modification, are permitted provided that the following conditions
44  * are met:
45  * 1. Redistributions of source code must retain the above copyright
46  *    notice, this list of conditions and the following disclaimer.
47  * 2. Redistributions in binary form must reproduce the above copyright
48  *    notice, this list of conditions and the following disclaimer in the
49  *    documentation and/or other materials provided with the distribution.
50  * 3. Neither the name of the University nor the names of its contributors
51  *    may be used to endorse or promote products derived from this software
52  *    without specific prior written permission.
53  *
54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64  * SUCH DAMAGE.
65  *
66  *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
67  */
68 
69 /*
70  * The vnode cache subsystem.
71  *
72  * Life-cycle
73  *
74  *	Normally, there are two points where new vnodes are created:
75  *	VOP_CREATE(9) and VOP_LOOKUP(9).  The life-cycle of a vnode
76  *	starts in one of the following ways:
77  *
78  *	- Allocation, via getnewvnode(9) and/or vnalloc(9).
79  *	- Reclamation of an inactive vnode, via vget(9).
80  *
81  *	Recycling from a free list, via getnewvnode(9) -> getcleanvnode(9),
82  *	was another, traditional way.  Currently, only the draining thread
83  *	recycles vnodes.  This behaviour might be revisited.
84  *
85  *	The life-cycle ends when the last reference is dropped, usually
86  *	in VOP_REMOVE(9).  In that case, VOP_INACTIVE(9) is called to inform
87  *	the file system that the vnode is inactive.  Via this call, the file
88  *	system indicates whether the vnode can be recycled (usually, it checks
89  *	its own references, e.g. the link count, or whether the file was removed).
90  *
91  *	Depending on that indication, the vnode can be put onto a free list
92  *	(cache), or cleaned via vclean(9), which calls VOP_RECLAIM(9) to
93  *	disassociate the underlying file system from the vnode and finally destroy it.
94  *
95  * Reference counting
96  *
97  *	A vnode is considered active if its reference count
98  *	(vnode_t::v_usecount) is non-zero.  The count is maintained using the
99  *	vref(9), vrele(9) and vput(9) routines.  Common reference holders
100  *	are e.g. open files, current working directories and mount points.
101  *
102  * Note on v_usecount and its locking
103  *
104  *	At nearly all points where it is known that v_usecount could be zero,
105  *	vnode_t::v_interlock will be held.  To change v_usecount away
106  *	from zero, the interlock must be held.  To change it from a non-zero
107  *	value to zero, the interlock must again be held.
108  *
109  *	Changing the usecount from a non-zero value to a non-zero value can
110  *	safely be done using atomic operations, without the interlock held.
111  *
112  *	Note: if VI_CLEAN is set, vnode_t::v_interlock will be released while
113  *	mntvnode_lock is still held.
114  *
115  *	See PR 41374.
116  */
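/*
 * Example (illustrative sketch only, not part of the code): the usual
 * pattern for taking and dropping a reference under the locking rules
 * above.  The vnode pointer "vp" and the surrounding error handling are
 * assumptions made for this sketch.
 *
 *	mutex_enter(vp->v_interlock);
 *	error = vget(vp, LK_EXCLUSIVE);		(vget() releases v_interlock)
 *	if (error == 0) {
 *		...use the referenced, locked vnode...
 *		vput(vp);			(unlock and drop the reference)
 *	}
 *
 * While another reference is known to be held (v_usecount != 0), an
 * additional reference may be taken and dropped without the interlock:
 *
 *	vref(vp);
 *	...
 *	vrele(vp);
 */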
117 
118 #include <sys/cdefs.h>
119 __KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.37 2014/07/05 09:33:15 hannken Exp $");
120 
121 #define _VFS_VNODE_PRIVATE
122 
123 #include <sys/param.h>
124 #include <sys/kernel.h>
125 
126 #include <sys/atomic.h>
127 #include <sys/buf.h>
128 #include <sys/conf.h>
129 #include <sys/device.h>
130 #include <sys/hash.h>
131 #include <sys/kauth.h>
132 #include <sys/kmem.h>
133 #include <sys/kthread.h>
134 #include <sys/module.h>
135 #include <sys/mount.h>
136 #include <sys/namei.h>
137 #include <sys/syscallargs.h>
138 #include <sys/sysctl.h>
139 #include <sys/systm.h>
140 #include <sys/vnode.h>
141 #include <sys/wapbl.h>
142 #include <sys/fstrans.h>
143 
144 #include <uvm/uvm.h>
145 #include <uvm/uvm_readahead.h>
146 
147 /* Flags to vrelel. */
148 #define	VRELEL_ASYNC_RELE	0x0001	/* Always defer to vrele thread. */
149 #define	VRELEL_CHANGING_SET	0x0002	/* VI_CHANGING set by caller. */
150 
151 struct vcache_key {
152 	struct mount *vk_mount;
153 	const void *vk_key;
154 	size_t vk_key_len;
155 };
156 struct vcache_node {
157 	SLIST_ENTRY(vcache_node) vn_hash;
158 	struct vnode *vn_vnode;
159 	struct vcache_key vn_key;
160 };
161 
162 u_int			numvnodes		__cacheline_aligned;
163 
164 static pool_cache_t	vnode_cache		__read_mostly;
165 static struct mount	*dead_mount;
166 
167 /*
168  * There are two free lists: one is for vnodes which have no buffer/page
169  * references and one for those which do (i.e. v_holdcnt is non-zero).
170  * The vnode recycling mechanism looks into the former list first.
171  */
172 static kmutex_t		vnode_free_list_lock	__cacheline_aligned;
173 static vnodelst_t	vnode_free_list		__cacheline_aligned;
174 static vnodelst_t	vnode_hold_list		__cacheline_aligned;
175 static kcondvar_t	vdrain_cv		__cacheline_aligned;
176 
177 static vnodelst_t	vrele_list		__cacheline_aligned;
178 static kmutex_t		vrele_lock		__cacheline_aligned;
179 static kcondvar_t	vrele_cv		__cacheline_aligned;
180 static lwp_t *		vrele_lwp		__cacheline_aligned;
181 static int		vrele_pending		__cacheline_aligned;
182 static int		vrele_gen		__cacheline_aligned;
183 
184 static struct {
185 	kmutex_t	lock;
186 	u_long		hashmask;
187 	SLIST_HEAD(hashhead, vcache_node)	*hashtab;
188 	pool_cache_t	pool;
189 }			vcache			__cacheline_aligned;
190 
191 static int		cleanvnode(void);
192 static void		vcache_init(void);
193 static void		vcache_reinit(void);
194 static void		vclean(vnode_t *);
195 static void		vrelel(vnode_t *, int);
196 static void		vdrain_thread(void *);
197 static void		vrele_thread(void *);
198 static void		vnpanic(vnode_t *, const char *, ...)
199     __printflike(2, 3);
200 static void		vwait(vnode_t *, int);
201 
202 /* Routines having to do with the management of the vnode table. */
203 extern int		(**dead_vnodeop_p)(void *);
204 extern struct vfsops	dead_vfsops;
205 
206 void
207 vfs_vnode_sysinit(void)
208 {
209 	int error __diagused;
210 
211 	vnode_cache = pool_cache_init(sizeof(vnode_t), 0, 0, 0, "vnodepl",
212 	    NULL, IPL_NONE, NULL, NULL, NULL);
213 	KASSERT(vnode_cache != NULL);
214 
215 	dead_mount = vfs_mountalloc(&dead_vfsops, NULL);
216 	KASSERT(dead_mount != NULL);
217 	dead_mount->mnt_iflag = IMNT_MPSAFE;
218 
219 	mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE);
220 	TAILQ_INIT(&vnode_free_list);
221 	TAILQ_INIT(&vnode_hold_list);
222 	TAILQ_INIT(&vrele_list);
223 
224 	vcache_init();
225 
226 	mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
227 	cv_init(&vdrain_cv, "vdrain");
228 	cv_init(&vrele_cv, "vrele");
229 	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread,
230 	    NULL, NULL, "vdrain");
231 	KASSERT(error == 0);
232 	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
233 	    NULL, &vrele_lwp, "vrele");
234 	KASSERT(error == 0);
235 }
236 
237 /*
238  * Allocate a new, uninitialized vnode.  If 'mp' is non-NULL, this is a
239  * marker vnode.
240  */
241 vnode_t *
242 vnalloc(struct mount *mp)
243 {
244 	vnode_t *vp;
245 
246 	vp = pool_cache_get(vnode_cache, PR_WAITOK);
247 	KASSERT(vp != NULL);
248 
249 	memset(vp, 0, sizeof(*vp));
250 	uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
251 	cv_init(&vp->v_cv, "vnode");
252 	/*
253 	 * Done by memset() above.
254 	 *	LIST_INIT(&vp->v_nclist);
255 	 *	LIST_INIT(&vp->v_dnclist);
256 	 */
257 
258 	if (mp != NULL) {
259 		vp->v_mount = mp;
260 		vp->v_type = VBAD;
261 		vp->v_iflag = VI_MARKER;
262 		return vp;
263 	}
264 
265 	mutex_enter(&vnode_free_list_lock);
266 	numvnodes++;
267 	if (numvnodes > desiredvnodes + desiredvnodes / 10)
268 		cv_signal(&vdrain_cv);
269 	mutex_exit(&vnode_free_list_lock);
270 
271 	rw_init(&vp->v_lock);
272 	vp->v_usecount = 1;
273 	vp->v_type = VNON;
274 	vp->v_size = vp->v_writesize = VSIZENOTSET;
275 
276 	return vp;
277 }
278 
279 /*
280  * Free an unused, unreferenced vnode.
281  */
282 void
283 vnfree(vnode_t *vp)
284 {
285 
286 	KASSERT(vp->v_usecount == 0);
287 
288 	if ((vp->v_iflag & VI_MARKER) == 0) {
289 		rw_destroy(&vp->v_lock);
290 		mutex_enter(&vnode_free_list_lock);
291 		numvnodes--;
292 		mutex_exit(&vnode_free_list_lock);
293 	}
294 
295 	/*
296 	 * Note: the vnode interlock will either be freed, or its reference
297 	 * dropped (if VI_LOCKSHARE was in use).
298 	 */
299 	uvm_obj_destroy(&vp->v_uobj, true);
300 	cv_destroy(&vp->v_cv);
301 	pool_cache_put(vnode_cache, vp);
302 }
303 
304 /*
305  * cleanvnode: grab a vnode from freelist, clean and free it.
306  *
307  * => Releases vnode_free_list_lock.
308  */
309 static int
310 cleanvnode(void)
311 {
312 	vnode_t *vp;
313 	vnodelst_t *listhd;
314 	struct mount *mp;
315 
316 	KASSERT(mutex_owned(&vnode_free_list_lock));
317 
318 	listhd = &vnode_free_list;
319 try_nextlist:
320 	TAILQ_FOREACH(vp, listhd, v_freelist) {
321 		/*
322 		 * It's safe to test v_usecount and v_iflag
323 		 * without holding the interlock here, since
324 		 * these vnodes should never appear on the
325 		 * lists.
326 		 */
327 		KASSERT(vp->v_usecount == 0);
328 		KASSERT((vp->v_iflag & VI_CLEAN) == 0);
329 		KASSERT(vp->v_freelisthd == listhd);
330 
331 		if (!mutex_tryenter(vp->v_interlock))
332 			continue;
333 		if ((vp->v_iflag & VI_XLOCK) != 0) {
334 			mutex_exit(vp->v_interlock);
335 			continue;
336 		}
337 		mp = vp->v_mount;
338 		if (fstrans_start_nowait(mp, FSTRANS_SHARED) != 0) {
339 			mutex_exit(vp->v_interlock);
340 			continue;
341 		}
342 		break;
343 	}
344 
345 	if (vp == NULL) {
346 		if (listhd == &vnode_free_list) {
347 			listhd = &vnode_hold_list;
348 			goto try_nextlist;
349 		}
350 		mutex_exit(&vnode_free_list_lock);
351 		return EBUSY;
352 	}
353 
354 	/* Remove it from the freelist. */
355 	TAILQ_REMOVE(listhd, vp, v_freelist);
356 	vp->v_freelisthd = NULL;
357 	mutex_exit(&vnode_free_list_lock);
358 
359 	KASSERT(vp->v_usecount == 0);
360 
361 	/*
362 	 * The vnode is still associated with a file system, so we must
363 	 * clean it out before freeing it.  We need to add a reference
364 	 * before doing this.
365 	 */
366 	vp->v_usecount = 1;
367 	KASSERT((vp->v_iflag & VI_CHANGING) == 0);
368 	vp->v_iflag |= VI_CHANGING;
369 	vclean(vp);
370 	vrelel(vp, VRELEL_CHANGING_SET);
371 	fstrans_done(mp);
372 
373 	return 0;
374 }
375 
376 /*
377  * getnewvnode: return a fresh vnode.
378  *
379  * => Returns referenced vnode, moved into the mount queue.
380  * => Shares the interlock specified by 'slock', if it is not NULL.
381  */
382 int
383 getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
384     kmutex_t *slock, vnode_t **vpp)
385 {
386 	struct uvm_object *uobj __diagused;
387 	vnode_t *vp;
388 	int error = 0;
389 
390 	if (mp != NULL) {
391 		/*
392 		 * Mark filesystem busy while we are creating a vnode.
393 		 * If unmount is in progress, this will fail.
394 		 */
395 		error = vfs_busy(mp, NULL);
396 		if (error)
397 			return error;
398 	}
399 
400 	vp = NULL;
401 
402 	/* Allocate a new vnode. */
403 	vp = vnalloc(NULL);
404 
405 	KASSERT(vp->v_freelisthd == NULL);
406 	KASSERT(LIST_EMPTY(&vp->v_nclist));
407 	KASSERT(LIST_EMPTY(&vp->v_dnclist));
408 	KASSERT(vp->v_data == NULL);
409 
410 	/* Initialize vnode. */
411 	vp->v_tag = tag;
412 	vp->v_op = vops;
413 
414 	uobj = &vp->v_uobj;
415 	KASSERT(uobj->pgops == &uvm_vnodeops);
416 	KASSERT(uobj->uo_npages == 0);
417 	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
418 
419 	/* Share the vnode_t::v_interlock, if requested. */
420 	if (slock) {
421 		/* Set the interlock and mark that it is shared. */
422 		KASSERT(vp->v_mount == NULL);
423 		mutex_obj_hold(slock);
424 		uvm_obj_setlock(&vp->v_uobj, slock);
425 		KASSERT(vp->v_interlock == slock);
426 		vp->v_iflag |= VI_LOCKSHARE;
427 	}
428 
429 	/* Finally, move vnode into the mount queue. */
430 	vfs_insmntque(vp, mp);
431 
432 	if (mp != NULL) {
433 		if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
434 			vp->v_vflag |= VV_MPSAFE;
435 		vfs_unbusy(mp, true, NULL);
436 	}
437 
438 	*vpp = vp;
439 	return 0;
440 }
441 
442 /*
443  * This is really just the reverse of getnewvnode(). Needed for
444  * VFS_VGET functions that may need to push back a vnode in case
445  * of a locking race.
446  */
447 void
448 ungetnewvnode(vnode_t *vp)
449 {
450 
451 	KASSERT(vp->v_usecount == 1);
452 	KASSERT(vp->v_data == NULL);
453 	KASSERT(vp->v_freelisthd == NULL);
454 
455 	mutex_enter(vp->v_interlock);
456 	vp->v_iflag |= VI_CLEAN;
457 	vrelel(vp, 0);
458 }
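/*
 * Illustrative sketch only: how a VFS_VGET-style routine might use
 * getnewvnode()/ungetnewvnode() around a lookup race.  The names
 * examplefs_vnodeop_p and examplefs_hashget are hypothetical and not
 * part of this file.
 *
 *	error = getnewvnode(VT_NON, mp, examplefs_vnodeop_p, NULL, &vp);
 *	if (error != 0)
 *		return error;
 *	if ((ovp = examplefs_hashget(mp, ino)) != NULL) {
 *		ungetnewvnode(vp);	(another thread won the race)
 *		*vpp = ovp;
 *		return 0;
 *	}
 *	...attach the file system node to vp and enter it into the hash...
 */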
459 
460 /*
461  * Helper thread to keep the number of vnodes below desiredvnodes.
462  */
463 static void
464 vdrain_thread(void *cookie)
465 {
466 	int error;
467 
468 	mutex_enter(&vnode_free_list_lock);
469 
470 	for (;;) {
471 		cv_timedwait(&vdrain_cv, &vnode_free_list_lock, hz);
472 		while (numvnodes > desiredvnodes) {
473 			error = cleanvnode();
474 			if (error)
475 				kpause("vndsbusy", false, hz, NULL);
476 			mutex_enter(&vnode_free_list_lock);
477 			if (error)
478 				break;
479 		}
480 	}
481 }
482 
483 /*
484  * Remove a vnode from its freelist.
485  */
486 void
487 vremfree(vnode_t *vp)
488 {
489 
490 	KASSERT(mutex_owned(vp->v_interlock));
491 	KASSERT(vp->v_usecount == 0);
492 
493 	/*
494 	 * Note that the reference count must not change until
495 	 * the vnode is removed.
496 	 */
497 	mutex_enter(&vnode_free_list_lock);
498 	if (vp->v_holdcnt > 0) {
499 		KASSERT(vp->v_freelisthd == &vnode_hold_list);
500 	} else {
501 		KASSERT(vp->v_freelisthd == &vnode_free_list);
502 	}
503 	TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
504 	vp->v_freelisthd = NULL;
505 	mutex_exit(&vnode_free_list_lock);
506 }
507 
508 /*
509  * vget: get a particular vnode from the free list, increment its reference
510  * count and lock it.
511  *
512  * => Should be called with v_interlock held.
513  *
514  * If VI_CHANGING is set, the vnode may be eliminated in vgone()/vclean().
515  * In that case, we cannot grab the vnode, so the process is awakened when
516  * the transition is completed, and an error is returned to indicate that
517  * the vnode is no longer usable.
518  */
519 int
520 vget(vnode_t *vp, int flags)
521 {
522 	int error = 0;
523 
524 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
525 	KASSERT(mutex_owned(vp->v_interlock));
526 	KASSERT((flags & ~(LK_SHARED|LK_EXCLUSIVE|LK_NOWAIT)) == 0);
527 
528 	/*
529 	 * Before adding a reference, we must remove the vnode
530 	 * from its freelist.
531 	 */
532 	if (vp->v_usecount == 0) {
533 		vremfree(vp);
534 		vp->v_usecount = 1;
535 	} else {
536 		atomic_inc_uint(&vp->v_usecount);
537 	}
538 
539 	/*
540 	 * If the vnode is in the process of changing state we wait
541 	 * for the change to complete and take care not to return
542 	 * a clean vnode.
543 	 */
544 	if ((vp->v_iflag & VI_CHANGING) != 0) {
545 		if ((flags & LK_NOWAIT) != 0) {
546 			vrelel(vp, 0);
547 			return EBUSY;
548 		}
549 		vwait(vp, VI_CHANGING);
550 		if ((vp->v_iflag & VI_CLEAN) != 0) {
551 			vrelel(vp, 0);
552 			return ENOENT;
553 		}
554 	}
555 
556 	/*
557 	 * Ok, we got it in good shape.  Just locking left.
558 	 */
559 	KASSERT((vp->v_iflag & VI_CLEAN) == 0);
560 	mutex_exit(vp->v_interlock);
561 	if (flags & (LK_EXCLUSIVE | LK_SHARED)) {
562 		error = vn_lock(vp, flags);
563 		if (error != 0) {
564 			vrele(vp);
565 		}
566 	}
567 	return error;
568 }
569 
570 /*
571  * vput: unlock and release the reference.
572  */
573 void
574 vput(vnode_t *vp)
575 {
576 
577 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
578 
579 	VOP_UNLOCK(vp);
580 	vrele(vp);
581 }
582 
583 /*
584  * Try to drop a reference on a vnode.  Abort if we are releasing the
585  * last reference.  Note: this _must_ succeed if not the last reference.
586  */
587 static inline bool
588 vtryrele(vnode_t *vp)
589 {
590 	u_int use, next;
591 
592 	for (use = vp->v_usecount;; use = next) {
593 		if (use == 1) {
594 			return false;
595 		}
596 		KASSERT(use > 1);
597 		next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
598 		if (__predict_true(next == use)) {
599 			return true;
600 		}
601 	}
602 }
603 
604 /*
605  * Vnode release.  If the reference count drops to zero, call the inactive
606  * routine and either return the vnode to the freelist or free it to the pool.
607  */
608 static void
609 vrelel(vnode_t *vp, int flags)
610 {
611 	bool recycle, defer;
612 	int error;
613 
614 	KASSERT(mutex_owned(vp->v_interlock));
615 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
616 	KASSERT(vp->v_freelisthd == NULL);
617 
618 	if (__predict_false(vp->v_op == dead_vnodeop_p &&
619 	    (vp->v_iflag & (VI_CLEAN|VI_XLOCK)) == 0)) {
620 		vnpanic(vp, "dead but not clean");
621 	}
622 
623 	/*
624 	 * If not the last reference, just drop the reference count
625 	 * and unlock.
626 	 */
627 	if (vtryrele(vp)) {
628 		if ((flags & VRELEL_CHANGING_SET) != 0) {
629 			KASSERT((vp->v_iflag & VI_CHANGING) != 0);
630 			vp->v_iflag &= ~VI_CHANGING;
631 			cv_broadcast(&vp->v_cv);
632 		}
633 		mutex_exit(vp->v_interlock);
634 		return;
635 	}
636 	if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
637 		vnpanic(vp, "%s: bad ref count", __func__);
638 	}
639 
640 	KASSERT((vp->v_iflag & VI_XLOCK) == 0);
641 
642 #ifdef DIAGNOSTIC
643 	if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
644 	    vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
645 		vprint("vrelel: missing VOP_CLOSE()", vp);
646 	}
647 #endif
648 
649 	/*
650 	 * If not clean, deactivate the vnode, but preserve
651 	 * our reference across the call to VOP_INACTIVE().
652 	 */
653 	if ((vp->v_iflag & VI_CLEAN) == 0) {
654 		recycle = false;
655 
656 		/*
657 		 * XXX This ugly block can be largely eliminated if
658 		 * locking is pushed down into the file systems.
659 		 *
660 		 * Defer vnode release to vrele_thread if caller
661 		 * requests it explicitly or is the pagedaemon.
662 		 */
663 		if ((curlwp == uvm.pagedaemon_lwp) ||
664 		    (flags & VRELEL_ASYNC_RELE) != 0) {
665 			defer = true;
666 		} else if (curlwp == vrele_lwp) {
667 			/*
668 			 * We have to try harder.
669 			 */
670 			mutex_exit(vp->v_interlock);
671 			error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
672 			KASSERT(error == 0);
673 			mutex_enter(vp->v_interlock);
674 			defer = false;
675 		} else {
676 			/* If we can't acquire the lock, then defer. */
677 			mutex_exit(vp->v_interlock);
678 			error = vn_lock(vp,
679 			    LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
680 			defer = (error != 0);
681 			mutex_enter(vp->v_interlock);
682 		}
683 
684 		KASSERT(mutex_owned(vp->v_interlock));
685 		KASSERT(! (curlwp == vrele_lwp && defer));
686 
687 		if (defer) {
688 			/*
689 			 * Defer reclaim to the kthread; it's not safe to
690 			 * clean it here.  We donate it our last reference.
691 			 */
692 			if ((flags & VRELEL_CHANGING_SET) != 0) {
693 				KASSERT((vp->v_iflag & VI_CHANGING) != 0);
694 				vp->v_iflag &= ~VI_CHANGING;
695 				cv_broadcast(&vp->v_cv);
696 			}
697 			mutex_enter(&vrele_lock);
698 			TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist);
699 			if (++vrele_pending > (desiredvnodes >> 8))
700 				cv_signal(&vrele_cv);
701 			mutex_exit(&vrele_lock);
702 			mutex_exit(vp->v_interlock);
703 			return;
704 		}
705 
706 		/*
707 		 * If the node got another reference while we
708 		 * released the interlock, don't try to inactivate it yet.
709 		 */
710 		if (__predict_false(vtryrele(vp))) {
711 			VOP_UNLOCK(vp);
712 			if ((flags & VRELEL_CHANGING_SET) != 0) {
713 				KASSERT((vp->v_iflag & VI_CHANGING) != 0);
714 				vp->v_iflag &= ~VI_CHANGING;
715 				cv_broadcast(&vp->v_cv);
716 			}
717 			mutex_exit(vp->v_interlock);
718 			return;
719 		}
720 
721 		if ((flags & VRELEL_CHANGING_SET) == 0) {
722 			KASSERT((vp->v_iflag & VI_CHANGING) == 0);
723 			vp->v_iflag |= VI_CHANGING;
724 		}
725 		mutex_exit(vp->v_interlock);
726 
727 		/*
728 		 * The vnode can gain another reference while being
729 		 * deactivated.  If VOP_INACTIVE() indicates that
730 		 * the described file has been deleted, then recycle
731 		 * the vnode irrespective of additional references.
732 		 * Another thread may be waiting to re-use the on-disk
733 		 * inode.
734 		 *
735 		 * Note that VOP_INACTIVE() will drop the vnode lock.
736 		 */
737 		VOP_INACTIVE(vp, &recycle);
738 		mutex_enter(vp->v_interlock);
739 		if (!recycle) {
740 			if (vtryrele(vp)) {
741 				KASSERT((vp->v_iflag & VI_CHANGING) != 0);
742 				vp->v_iflag &= ~VI_CHANGING;
743 				cv_broadcast(&vp->v_cv);
744 				mutex_exit(vp->v_interlock);
745 				return;
746 			}
747 		}
748 
749 		/* Take care of space accounting. */
750 		if (vp->v_iflag & VI_EXECMAP) {
751 			atomic_add_int(&uvmexp.execpages,
752 			    -vp->v_uobj.uo_npages);
753 			atomic_add_int(&uvmexp.filepages,
754 			    vp->v_uobj.uo_npages);
755 		}
756 		vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
757 		vp->v_vflag &= ~VV_MAPPED;
758 
759 		/*
760 		 * Recycle the vnode if the file is now unused (unlinked),
761 		 * otherwise just free it.
762 		 */
763 		if (recycle) {
764 			vclean(vp);
765 		}
766 		KASSERT(vp->v_usecount > 0);
767 	} else { /* vnode was already clean */
768 		if ((flags & VRELEL_CHANGING_SET) == 0) {
769 			KASSERT((vp->v_iflag & VI_CHANGING) == 0);
770 			vp->v_iflag |= VI_CHANGING;
771 		}
772 	}
773 
774 	if (atomic_dec_uint_nv(&vp->v_usecount) != 0) {
775 		/* Gained another reference while being reclaimed. */
776 		KASSERT((vp->v_iflag & VI_CHANGING) != 0);
777 		vp->v_iflag &= ~VI_CHANGING;
778 		cv_broadcast(&vp->v_cv);
779 		mutex_exit(vp->v_interlock);
780 		return;
781 	}
782 
783 	if ((vp->v_iflag & VI_CLEAN) != 0) {
784 		/*
785 		 * It's clean so destroy it.  It isn't referenced
786 		 * anywhere since it has been reclaimed.
787 		 */
788 		KASSERT(vp->v_holdcnt == 0);
789 		KASSERT(vp->v_writecount == 0);
790 		mutex_exit(vp->v_interlock);
791 		vfs_insmntque(vp, NULL);
792 		if (vp->v_type == VBLK || vp->v_type == VCHR) {
793 			spec_node_destroy(vp);
794 		}
795 		vnfree(vp);
796 	} else {
797 		/*
798 		 * Otherwise, put it back onto the freelist.  It
799 		 * can't be destroyed while still associated with
800 		 * a file system.
801 		 */
802 		mutex_enter(&vnode_free_list_lock);
803 		if (vp->v_holdcnt > 0) {
804 			vp->v_freelisthd = &vnode_hold_list;
805 		} else {
806 			vp->v_freelisthd = &vnode_free_list;
807 		}
808 		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
809 		mutex_exit(&vnode_free_list_lock);
810 		KASSERT((vp->v_iflag & VI_CHANGING) != 0);
811 		vp->v_iflag &= ~VI_CHANGING;
812 		cv_broadcast(&vp->v_cv);
813 		mutex_exit(vp->v_interlock);
814 	}
815 }
816 
817 void
818 vrele(vnode_t *vp)
819 {
820 
821 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
822 
823 	if (vtryrele(vp)) {
824 		return;
825 	}
826 	mutex_enter(vp->v_interlock);
827 	vrelel(vp, 0);
828 }
829 
830 /*
831  * Asynchronous vnode release: the vnode is released in a different context.
832  */
833 void
834 vrele_async(vnode_t *vp)
835 {
836 
837 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
838 
839 	if (vtryrele(vp)) {
840 		return;
841 	}
842 	mutex_enter(vp->v_interlock);
843 	vrelel(vp, VRELEL_ASYNC_RELE);
844 }
845 
846 static void
847 vrele_thread(void *cookie)
848 {
849 	vnodelst_t skip_list;
850 	vnode_t *vp;
851 	struct mount *mp;
852 
853 	TAILQ_INIT(&skip_list);
854 
855 	mutex_enter(&vrele_lock);
856 	for (;;) {
857 		while (TAILQ_EMPTY(&vrele_list)) {
858 			vrele_gen++;
859 			cv_broadcast(&vrele_cv);
860 			cv_timedwait(&vrele_cv, &vrele_lock, hz);
861 			TAILQ_CONCAT(&vrele_list, &skip_list, v_freelist);
862 		}
863 		vp = TAILQ_FIRST(&vrele_list);
864 		mp = vp->v_mount;
865 		TAILQ_REMOVE(&vrele_list, vp, v_freelist);
866 		if (fstrans_start_nowait(mp, FSTRANS_LAZY) != 0) {
867 			TAILQ_INSERT_TAIL(&skip_list, vp, v_freelist);
868 			continue;
869 		}
870 		vrele_pending--;
871 		mutex_exit(&vrele_lock);
872 
873 		/*
874 		 * If not the last reference, then ignore the vnode
875 		 * and look for more work.
876 		 */
877 		mutex_enter(vp->v_interlock);
878 		vrelel(vp, 0);
879 		fstrans_done(mp);
880 		mutex_enter(&vrele_lock);
881 	}
882 }
883 
884 void
885 vrele_flush(void)
886 {
887 	int gen;
888 
889 	mutex_enter(&vrele_lock);
890 	gen = vrele_gen;
891 	while (vrele_pending && gen == vrele_gen) {
892 		cv_broadcast(&vrele_cv);
893 		cv_wait(&vrele_cv, &vrele_lock);
894 	}
895 	mutex_exit(&vrele_lock);
896 }
897 
898 /*
899  * Vnode reference, where a reference is already held by some other
900  * object (for example, a file structure).
901  */
902 void
903 vref(vnode_t *vp)
904 {
905 
906 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
907 	KASSERT(vp->v_usecount != 0);
908 
909 	atomic_inc_uint(&vp->v_usecount);
910 }
911 
912 /*
913  * Page or buffer structure gets a reference.
914  * Called with v_interlock held.
915  */
916 void
917 vholdl(vnode_t *vp)
918 {
919 
920 	KASSERT(mutex_owned(vp->v_interlock));
921 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
922 
923 	if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) {
924 		mutex_enter(&vnode_free_list_lock);
925 		KASSERT(vp->v_freelisthd == &vnode_free_list);
926 		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
927 		vp->v_freelisthd = &vnode_hold_list;
928 		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
929 		mutex_exit(&vnode_free_list_lock);
930 	}
931 }
932 
933 /*
934  * Page or buffer structure frees a reference.
935  * Called with v_interlock held.
936  */
937 void
938 holdrelel(vnode_t *vp)
939 {
940 
941 	KASSERT(mutex_owned(vp->v_interlock));
942 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
943 
944 	if (vp->v_holdcnt <= 0) {
945 		vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
946 	}
947 
948 	vp->v_holdcnt--;
949 	if (vp->v_holdcnt == 0 && vp->v_usecount == 0) {
950 		mutex_enter(&vnode_free_list_lock);
951 		KASSERT(vp->v_freelisthd == &vnode_hold_list);
952 		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
953 		vp->v_freelisthd = &vnode_free_list;
954 		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
955 		mutex_exit(&vnode_free_list_lock);
956 	}
957 }
958 
959 /*
960  * Disassociate the underlying file system from a vnode.
961  *
962  * Must be called with the interlock held, and will return with it held.
963  */
964 static void
965 vclean(vnode_t *vp)
966 {
967 	lwp_t *l = curlwp;
968 	bool recycle, active, doclose;
969 	int error;
970 
971 	KASSERT(mutex_owned(vp->v_interlock));
972 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
973 	KASSERT(vp->v_usecount != 0);
974 
975 	/* If already clean, nothing to do. */
976 	if ((vp->v_iflag & VI_CLEAN) != 0) {
977 		return;
978 	}
979 
980 	active = (vp->v_usecount > 1);
981 	doclose = ! (active && vp->v_type == VBLK &&
982 	    spec_node_getmountedfs(vp) != NULL);
983 	mutex_exit(vp->v_interlock);
984 
985 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
986 
987 	/*
988 	 * Prevent the vnode from being recycled or brought into use
989 	 * while we clean it out.
990 	 */
991 	mutex_enter(vp->v_interlock);
992 	KASSERT((vp->v_iflag & (VI_XLOCK | VI_CLEAN)) == 0);
993 	vp->v_iflag |= VI_XLOCK;
994 	if (vp->v_iflag & VI_EXECMAP) {
995 		atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
996 		atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
997 	}
998 	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
999 	mutex_exit(vp->v_interlock);
1000 
1001 	/*
1002 	 * Clean out any cached data associated with the vnode.
1003 	 * If purging an active vnode, it must be closed and
1004 	 * deactivated before being reclaimed.  Note that
1005 	 * VOP_INACTIVE() will unlock the vnode.
1006 	 */
1007 	if (doclose) {
1008 		error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
1009 		if (error != 0) {
1010 			if (wapbl_vphaswapbl(vp))
1011 				WAPBL_DISCARD(wapbl_vptomp(vp));
1012 			error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
1013 		}
1014 		KASSERT(error == 0);
1015 		KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1016 		if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
1017 			 spec_node_revoke(vp);
1018 		}
1019 	}
1020 	if (active) {
1021 		VOP_INACTIVE(vp, &recycle);
1022 	} else {
1023 		/*
1024 		 * Any other processes trying to obtain this lock must first
1025 		 * wait for VI_XLOCK to clear, then call the new lock operation.
1026 		 */
1027 		VOP_UNLOCK(vp);
1028 	}
1029 
1030 	/* Disassociate the underlying file system from the vnode. */
1031 	if (VOP_RECLAIM(vp)) {
1032 		vnpanic(vp, "%s: cannot reclaim", __func__);
1033 	}
1034 
1035 	KASSERT(vp->v_data == NULL);
1036 	KASSERT(vp->v_uobj.uo_npages == 0);
1037 
1038 	if (vp->v_type == VREG && vp->v_ractx != NULL) {
1039 		uvm_ra_freectx(vp->v_ractx);
1040 		vp->v_ractx = NULL;
1041 	}
1042 
1043 	/* Purge name cache. */
1044 	cache_purge(vp);
1045 
1046 	/* Move to dead mount. */
1047 	vp->v_vflag &= ~VV_ROOT;
1048 	atomic_inc_uint(&dead_mount->mnt_refcnt);
1049 	vfs_insmntque(vp, dead_mount);
1050 
1051 	/* Done with purge, notify sleepers of the grim news. */
1052 	mutex_enter(vp->v_interlock);
1053 	if (doclose) {
1054 		vp->v_op = dead_vnodeop_p;
1055 		vp->v_vflag |= VV_LOCKSWORK;
1056 		vp->v_iflag |= VI_CLEAN;
1057 	} else {
1058 		vp->v_op = spec_vnodeop_p;
1059 		vp->v_vflag &= ~VV_LOCKSWORK;
1060 	}
1061 	vp->v_tag = VT_NON;
1062 	KNOTE(&vp->v_klist, NOTE_REVOKE);
1063 	vp->v_iflag &= ~VI_XLOCK;
1064 	cv_broadcast(&vp->v_cv);
1065 
1066 	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1067 }
1068 
1069 /*
1070  * Recycle an unused vnode if caller holds the last reference.
1071  */
1072 bool
1073 vrecycle(vnode_t *vp)
1074 {
1075 
1076 	mutex_enter(vp->v_interlock);
1077 
1078 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
1079 
1080 	if (vp->v_usecount != 1) {
1081 		mutex_exit(vp->v_interlock);
1082 		return false;
1083 	}
1084 	if ((vp->v_iflag & VI_CHANGING) != 0)
1085 		vwait(vp, VI_CHANGING);
1086 	if (vp->v_usecount != 1) {
1087 		mutex_exit(vp->v_interlock);
1088 		return false;
1089 	} else if ((vp->v_iflag & VI_CLEAN) != 0) {
1090 		mutex_exit(vp->v_interlock);
1091 		return true;
1092 	}
1093 	vp->v_iflag |= VI_CHANGING;
1094 	vclean(vp);
1095 	vrelel(vp, VRELEL_CHANGING_SET);
1096 	return true;
1097 }
1098 
1099 /*
1100  * Eliminate all activity associated with the requested vnode
1101  * and with all vnodes aliased to the requested vnode.
1102  */
1103 void
1104 vrevoke(vnode_t *vp)
1105 {
1106 	vnode_t *vq;
1107 	enum vtype type;
1108 	dev_t dev;
1109 
1110 	KASSERT(vp->v_usecount > 0);
1111 
1112 	mutex_enter(vp->v_interlock);
1113 	if ((vp->v_iflag & VI_CLEAN) != 0) {
1114 		mutex_exit(vp->v_interlock);
1115 		return;
1116 	} else if (vp->v_type != VBLK && vp->v_type != VCHR) {
1117 		atomic_inc_uint(&vp->v_usecount);
1118 		mutex_exit(vp->v_interlock);
1119 		vgone(vp);
1120 		return;
1121 	} else {
1122 		dev = vp->v_rdev;
1123 		type = vp->v_type;
1124 		mutex_exit(vp->v_interlock);
1125 	}
1126 
1127 	while (spec_node_lookup_by_dev(type, dev, &vq) == 0) {
1128 		vgone(vq);
1129 	}
1130 }
1131 
1132 /*
1133  * Eliminate all activity associated with a vnode in preparation for
1134  * reuse.  Drops a reference from the vnode.
1135  */
1136 void
1137 vgone(vnode_t *vp)
1138 {
1139 
1140 	mutex_enter(vp->v_interlock);
1141 	if ((vp->v_iflag & VI_CHANGING) != 0)
1142 		vwait(vp, VI_CHANGING);
1143 	vp->v_iflag |= VI_CHANGING;
1144 	vclean(vp);
1145 	vrelel(vp, VRELEL_CHANGING_SET);
1146 }
1147 
1148 static inline uint32_t
1149 vcache_hash(const struct vcache_key *key)
1150 {
1151 	uint32_t hash = HASH32_BUF_INIT;
1152 
1153 	hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash);
1154 	hash = hash32_buf(key->vk_key, key->vk_key_len, hash);
1155 	return hash;
1156 }
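/*
 * Example (illustrative sketch only): for a file system that identifies
 * its nodes by inode number, the key is simply the raw bytes of that
 * number, so the hash covers both the mount and the inode number:
 *
 *	ino_t ino = ...;			(hypothetical key)
 *	struct vcache_key k = { mp, &ino, sizeof(ino) };
 *	hash = vcache_hash(&k);
 */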
1157 
1158 static void
1159 vcache_init(void)
1160 {
1161 
1162 	vcache.pool = pool_cache_init(sizeof(struct vcache_node), 0, 0, 0,
1163 	    "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL);
1164 	KASSERT(vcache.pool != NULL);
1165 	mutex_init(&vcache.lock, MUTEX_DEFAULT, IPL_NONE);
1166 	vcache.hashtab = hashinit(desiredvnodes, HASH_SLIST, true,
1167 	    &vcache.hashmask);
1168 }
1169 
1170 static void
1171 vcache_reinit(void)
1172 {
1173 	int i;
1174 	uint32_t hash;
1175 	u_long oldmask, newmask;
1176 	struct hashhead *oldtab, *newtab;
1177 	struct vcache_node *node;
1178 
1179 	newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask);
1180 	mutex_enter(&vcache.lock);
1181 	oldtab = vcache.hashtab;
1182 	oldmask = vcache.hashmask;
1183 	vcache.hashtab = newtab;
1184 	vcache.hashmask = newmask;
1185 	for (i = 0; i <= oldmask; i++) {
1186 		while ((node = SLIST_FIRST(&oldtab[i])) != NULL) {
1187 			SLIST_REMOVE(&oldtab[i], node, vcache_node, vn_hash);
1188 			hash = vcache_hash(&node->vn_key);
1189 			SLIST_INSERT_HEAD(&newtab[hash & vcache.hashmask],
1190 			    node, vn_hash);
1191 		}
1192 	}
1193 	mutex_exit(&vcache.lock);
1194 	hashdone(oldtab, HASH_SLIST, oldmask);
1195 }
1196 
1197 static inline struct vcache_node *
1198 vcache_hash_lookup(const struct vcache_key *key, uint32_t hash)
1199 {
1200 	struct hashhead *hashp;
1201 	struct vcache_node *node;
1202 
1203 	KASSERT(mutex_owned(&vcache.lock));
1204 
1205 	hashp = &vcache.hashtab[hash & vcache.hashmask];
1206 	SLIST_FOREACH(node, hashp, vn_hash) {
1207 		if (key->vk_mount != node->vn_key.vk_mount)
1208 			continue;
1209 		if (key->vk_key_len != node->vn_key.vk_key_len)
1210 			continue;
1211 		if (memcmp(key->vk_key, node->vn_key.vk_key, key->vk_key_len))
1212 			continue;
1213 		return node;
1214 	}
1215 	return NULL;
1216 }
1217 
1218 /*
1219  * Get a vnode / fs node pair by key and return it referenced through vpp.
1220  */
1221 int
1222 vcache_get(struct mount *mp, const void *key, size_t key_len,
1223     struct vnode **vpp)
1224 {
1225 	int error;
1226 	uint32_t hash;
1227 	const void *new_key;
1228 	struct vnode *vp;
1229 	struct vcache_key vcache_key;
1230 	struct vcache_node *node, *new_node;
1231 
1232 	new_key = NULL;
1233 	*vpp = NULL;
1234 
1235 	vcache_key.vk_mount = mp;
1236 	vcache_key.vk_key = key;
1237 	vcache_key.vk_key_len = key_len;
1238 	hash = vcache_hash(&vcache_key);
1239 
1240 again:
1241 	mutex_enter(&vcache.lock);
1242 	node = vcache_hash_lookup(&vcache_key, hash);
1243 
1244 	/* If found, take a reference or retry. */
1245 	if (__predict_true(node != NULL && node->vn_vnode != NULL)) {
1246 		vp = node->vn_vnode;
1247 		mutex_enter(vp->v_interlock);
1248 		mutex_exit(&vcache.lock);
1249 		error = vget(vp, 0);
1250 		if (error == ENOENT)
1251 			goto again;
1252 		if (error == 0)
1253 			*vpp = vp;
1254 		KASSERT((error != 0) == (*vpp == NULL));
1255 		return error;
1256 	}
1257 
1258 	/* If another thread is loading this node, wait and retry. */
1259 	if (node != NULL) {
1260 		KASSERT(node->vn_vnode == NULL);
1261 		mutex_exit(&vcache.lock);
1262 		kpause("vcache", false, mstohz(20), NULL);
1263 		goto again;
1264 	}
1265 	mutex_exit(&vcache.lock);
1266 
1267 	/* Allocate and initialize a new vcache / vnode pair. */
1268 	error = vfs_busy(mp, NULL);
1269 	if (error)
1270 		return error;
1271 	new_node = pool_cache_get(vcache.pool, PR_WAITOK);
1272 	new_node->vn_vnode = NULL;
1273 	new_node->vn_key = vcache_key;
1274 	vp = vnalloc(NULL);
1275 	mutex_enter(&vcache.lock);
1276 	node = vcache_hash_lookup(&vcache_key, hash);
1277 	if (node == NULL) {
1278 		SLIST_INSERT_HEAD(&vcache.hashtab[hash & vcache.hashmask],
1279 		    new_node, vn_hash);
1280 		node = new_node;
1281 	}
1282 	mutex_exit(&vcache.lock);
1283 
1284 	/* If another thread beat us inserting this node, retry. */
1285 	if (node != new_node) {
1286 		pool_cache_put(vcache.pool, new_node);
1287 		KASSERT(vp->v_usecount == 1);
1288 		vp->v_usecount = 0;
1289 		vnfree(vp);
1290 		vfs_unbusy(mp, false, NULL);
1291 		goto again;
1292 	}
1293 
1294 	/* Load the fs node.  Exclusive as new_node->vn_vnode is NULL. */
1295 	error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key);
1296 	if (error) {
1297 		mutex_enter(&vcache.lock);
1298 		SLIST_REMOVE(&vcache.hashtab[hash & vcache.hashmask],
1299 		    new_node, vcache_node, vn_hash);
1300 		mutex_exit(&vcache.lock);
1301 		pool_cache_put(vcache.pool, new_node);
1302 		KASSERT(vp->v_usecount == 1);
1303 		vp->v_usecount = 0;
1304 		vnfree(vp);
1305 		vfs_unbusy(mp, false, NULL);
1306 		KASSERT(*vpp == NULL);
1307 		return error;
1308 	}
1309 	KASSERT(new_key != NULL);
1310 	KASSERT(memcmp(key, new_key, key_len) == 0);
1311 	KASSERT(vp->v_op != NULL);
1312 	vfs_insmntque(vp, mp);
1313 	if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
1314 		vp->v_vflag |= VV_MPSAFE;
1315 	vfs_unbusy(mp, true, NULL);
1316 
1317 	/* Finished loading, finalize node. */
1318 	mutex_enter(&vcache.lock);
1319 	new_node->vn_key.vk_key = new_key;
1320 	new_node->vn_vnode = vp;
1321 	mutex_exit(&vcache.lock);
1322 	*vpp = vp;
1323 	return 0;
1324 }
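/*
 * Illustrative sketch only: a file system keying its vnodes by inode
 * number would look one up (creating it via VFS_LOADVNODE() on a miss)
 * roughly as follows.  The variable names are assumptions.
 *
 *	error = vcache_get(mp, &ino, sizeof(ino), &vp);
 *	if (error != 0)
 *		return error;
 *	(vp is returned referenced but unlocked)
 *	error = vn_lock(vp, LK_EXCLUSIVE);
 */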
1325 
1326 /*
1327  * Prepare key change: lock old and new cache node.
1328  * Return an error if the new node already exists.
1329  */
1330 int
1331 vcache_rekey_enter(struct mount *mp, struct vnode *vp,
1332     const void *old_key, size_t old_key_len,
1333     const void *new_key, size_t new_key_len)
1334 {
1335 	uint32_t old_hash, new_hash;
1336 	struct vcache_key old_vcache_key, new_vcache_key;
1337 	struct vcache_node *node, *new_node;
1338 
1339 	old_vcache_key.vk_mount = mp;
1340 	old_vcache_key.vk_key = old_key;
1341 	old_vcache_key.vk_key_len = old_key_len;
1342 	old_hash = vcache_hash(&old_vcache_key);
1343 
1344 	new_vcache_key.vk_mount = mp;
1345 	new_vcache_key.vk_key = new_key;
1346 	new_vcache_key.vk_key_len = new_key_len;
1347 	new_hash = vcache_hash(&new_vcache_key);
1348 
1349 	new_node = pool_cache_get(vcache.pool, PR_WAITOK);
1350 	new_node->vn_vnode = NULL;
1351 	new_node->vn_key = new_vcache_key;
1352 
1353 	mutex_enter(&vcache.lock);
1354 	node = vcache_hash_lookup(&new_vcache_key, new_hash);
1355 	if (node != NULL) {
1356 		mutex_exit(&vcache.lock);
1357 		pool_cache_put(vcache.pool, new_node);
1358 		return EEXIST;
1359 	}
1360 	SLIST_INSERT_HEAD(&vcache.hashtab[new_hash & vcache.hashmask],
1361 	    new_node, vn_hash);
1362 	node = vcache_hash_lookup(&old_vcache_key, old_hash);
1363 	KASSERT(node != NULL);
1364 	KASSERT(node->vn_vnode == vp);
1365 	node->vn_vnode = NULL;
1366 	node->vn_key = old_vcache_key;
1367 	mutex_exit(&vcache.lock);
1368 	return 0;
1369 }
1370 
1371 /*
1372  * Key change complete: remove old node and unlock new node.
1373  */
1374 void
1375 vcache_rekey_exit(struct mount *mp, struct vnode *vp,
1376     const void *old_key, size_t old_key_len,
1377     const void *new_key, size_t new_key_len)
1378 {
1379 	uint32_t old_hash, new_hash;
1380 	struct vcache_key old_vcache_key, new_vcache_key;
1381 	struct vcache_node *node;
1382 
1383 	old_vcache_key.vk_mount = mp;
1384 	old_vcache_key.vk_key = old_key;
1385 	old_vcache_key.vk_key_len = old_key_len;
1386 	old_hash = vcache_hash(&old_vcache_key);
1387 
1388 	new_vcache_key.vk_mount = mp;
1389 	new_vcache_key.vk_key = new_key;
1390 	new_vcache_key.vk_key_len = new_key_len;
1391 	new_hash = vcache_hash(&new_vcache_key);
1392 
1393 	mutex_enter(&vcache.lock);
1394 	node = vcache_hash_lookup(&new_vcache_key, new_hash);
1395 	KASSERT(node != NULL && node->vn_vnode == NULL);
1396 	KASSERT(node->vn_key.vk_key_len == new_key_len);
1397 	node->vn_vnode = vp;
1398 	node->vn_key = new_vcache_key;
1399 	node = vcache_hash_lookup(&old_vcache_key, old_hash);
1400 	KASSERT(node != NULL);
1401 	KASSERT(node->vn_vnode == NULL);
1402 	SLIST_REMOVE(&vcache.hashtab[old_hash & vcache.hashmask],
1403 	    node, vcache_node, vn_hash);
1404 	mutex_exit(&vcache.lock);
1405 	pool_cache_put(vcache.pool, node);
1406 }
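/*
 * Illustrative sketch only: the two rekey operations are used as a pair
 * when a file system changes the key under which a cached vnode is
 * registered.  The old_ino/new_ino variables are assumptions.
 *
 *	error = vcache_rekey_enter(mp, vp, &old_ino, sizeof(old_ino),
 *	    &new_ino, sizeof(new_ino));
 *	if (error != 0)
 *		return error;		(the new key already exists)
 *	...update the file system's own record of the node's identity...
 *	vcache_rekey_exit(mp, vp, &old_ino, sizeof(old_ino),
 *	    &new_ino, sizeof(new_ino));
 */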
1407 
1408 /*
1409  * Remove a vnode / fs node pair from the cache.
1410  */
1411 void
1412 vcache_remove(struct mount *mp, const void *key, size_t key_len)
1413 {
1414 	uint32_t hash;
1415 	struct vcache_key vcache_key;
1416 	struct vcache_node *node;
1417 
1418 	vcache_key.vk_mount = mp;
1419 	vcache_key.vk_key = key;
1420 	vcache_key.vk_key_len = key_len;
1421 	hash = vcache_hash(&vcache_key);
1422 
1423 	mutex_enter(&vcache.lock);
1424 	node = vcache_hash_lookup(&vcache_key, hash);
1425 	KASSERT(node != NULL);
1426 	SLIST_REMOVE(&vcache.hashtab[hash & vcache.hashmask],
1427 	    node, vcache_node, vn_hash);
1428 	mutex_exit(&vcache.lock);
1429 	pool_cache_put(vcache.pool, node);
1430 }
1431 
1432 /*
1433  * Update outstanding I/O count and do wakeup if requested.
1434  */
1435 void
1436 vwakeup(struct buf *bp)
1437 {
1438 	vnode_t *vp;
1439 
1440 	if ((vp = bp->b_vp) == NULL)
1441 		return;
1442 
1443 	KASSERT(bp->b_objlock == vp->v_interlock);
1444 	KASSERT(mutex_owned(bp->b_objlock));
1445 
1446 	if (--vp->v_numoutput < 0)
1447 		vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp);
1448 	if (vp->v_numoutput == 0)
1449 		cv_broadcast(&vp->v_cv);
1450 }
1451 
1452 /*
1453  * Test a vnode for being or becoming dead.  Returns one of:
1454  * EBUSY:  vnode is becoming dead, with "flags == VDEAD_NOWAIT" only.
1455  * ENOENT: vnode is dead.
1456  * 0:      otherwise.
1457  *
1458  * Whenever this function returns a non-zero value all future
1459  * calls will also return a non-zero value.
1460  */
1461 int
1462 vdead_check(struct vnode *vp, int flags)
1463 {
1464 
1465 	KASSERT(mutex_owned(vp->v_interlock));
1466 	if (ISSET(vp->v_iflag, VI_XLOCK)) {
1467 		if (ISSET(flags, VDEAD_NOWAIT))
1468 			return EBUSY;
1469 		vwait(vp, VI_XLOCK);
1470 		KASSERT(ISSET(vp->v_iflag, VI_CLEAN));
1471 	}
1472 	if (ISSET(vp->v_iflag, VI_CLEAN))
1473 		return ENOENT;
1474 	return 0;
1475 }
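/*
 * Illustrative sketch only: a caller that already holds v_interlock and
 * must not sleep would use VDEAD_NOWAIT:
 *
 *	mutex_enter(vp->v_interlock);
 *	error = vdead_check(vp, VDEAD_NOWAIT);
 *	mutex_exit(vp->v_interlock);
 *	if (error != 0)
 *		return error;		(EBUSY: becoming dead, ENOENT: dead)
 */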
1476 
1477 /*
1478  * Wait for a vnode (typically with VI_XLOCK set) to be cleaned or
1479  * recycled.
1480  */
1481 static void
1482 vwait(vnode_t *vp, int flags)
1483 {
1484 
1485 	KASSERT(mutex_owned(vp->v_interlock));
1486 	KASSERT(vp->v_usecount != 0);
1487 
1488 	while ((vp->v_iflag & flags) != 0)
1489 		cv_wait(&vp->v_cv, vp->v_interlock);
1490 }
1491 
1492 int
1493 vfs_drainvnodes(long target)
1494 {
1495 	int error;
1496 
1497 	mutex_enter(&vnode_free_list_lock);
1498 
1499 	while (numvnodes > target) {
1500 		error = cleanvnode();
1501 		if (error != 0)
1502 			return error;
1503 		mutex_enter(&vnode_free_list_lock);
1504 	}
1505 
1506 	mutex_exit(&vnode_free_list_lock);
1507 
1508 	vcache_reinit();
1509 
1510 	return 0;
1511 }
1512 
1513 void
1514 vnpanic(vnode_t *vp, const char *fmt, ...)
1515 {
1516 	va_list ap;
1517 
1518 #ifdef DIAGNOSTIC
1519 	vprint(NULL, vp);
1520 #endif
1521 	va_start(ap, fmt);
1522 	vpanic(fmt, ap);
1523 	va_end(ap);
1524 }
1525