1 /*	$NetBSD: vfs_vnode.c,v 1.153 2023/11/27 16:13:59 hannken Exp $	*/
2 
3 /*-
4  * Copyright (c) 1997-2011, 2019, 2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9  * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 /*
34  * Copyright (c) 1989, 1993
35  *	The Regents of the University of California.  All rights reserved.
36  * (c) UNIX System Laboratories, Inc.
37  * All or some portions of this file are derived from material licensed
38  * to the University of California by American Telephone and Telegraph
39  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
40  * the permission of UNIX System Laboratories, Inc.
41  *
42  * Redistribution and use in source and binary forms, with or without
43  * modification, are permitted provided that the following conditions
44  * are met:
45  * 1. Redistributions of source code must retain the above copyright
46  *    notice, this list of conditions and the following disclaimer.
47  * 2. Redistributions in binary form must reproduce the above copyright
48  *    notice, this list of conditions and the following disclaimer in the
49  *    documentation and/or other materials provided with the distribution.
50  * 3. Neither the name of the University nor the names of its contributors
51  *    may be used to endorse or promote products derived from this software
52  *    without specific prior written permission.
53  *
54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64  * SUCH DAMAGE.
65  *
66  *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
67  */
68 
69 /*
70  * The vnode cache subsystem.
71  *
72  * Life-cycle
73  *
74  *	Normally, there are two points where new vnodes are created:
75  *	VOP_CREATE(9) and VOP_LOOKUP(9).  The life-cycle of a vnode
76  *	starts in one of the following ways:
77  *
78  *	- Allocation, via vcache_get(9) or vcache_new(9).
79  *	- Reclamation of an inactive vnode, via vcache_vget(9).
80  *
81  *	Recycling from a free list, via getnewvnode(9) -> getcleanvnode(9),
82  *	was another, traditional way.  Currently, only the draining thread
83  *	recycles vnodes.  This behaviour might be revisited.
84  *
85  *	The life-cycle ends when the last reference is dropped, usually
86  *	in VOP_REMOVE(9).  In that case, VOP_INACTIVE(9) is called to inform
87  *	the file system that the vnode is inactive.  Via this call, the file
88  *	system indicates whether the vnode can be recycled (usually, it checks
89  *	its own references, e.g. link count, whether the file was removed).
90  *
91  *	Depending on that indication, the vnode is either put onto a free list
92  *	(the cache) or cleaned via vcache_reclaim(), which calls VOP_RECLAIM(9)
93  *	to disassociate the underlying file system from the vnode before it is
94  *	finally destroyed.
95  *
96  * Vnode state
97  *
98  *	A vnode is always in one of six states:
99  *	- MARKER	This is a marker vnode to help list traversal.  It
100  *			will never change its state.
101  *	- LOADING	Vnode is associating with the underlying file system
102  *			and is not yet ready to use.
103  *	- LOADED	Vnode has an associated underlying file system and is
104  *			ready to use.
105  *	- BLOCKED	Vnode is active but cannot get new references.
106  *	- RECLAIMING	Vnode is disassociating from the underlying file
107  *			system.
108  *	- RECLAIMED	Vnode has disassociated from the underlying file
109  *			system and is dead.
110  *
111  *	Valid state changes are:
112  *	LOADING -> LOADED
113  *			Vnode has been initialised in vcache_get() or
114  *			vcache_new() and is ready to use.
115  *	BLOCKED -> RECLAIMING
116  *			Vnode starts disassociation from underlying file
117  *			system in vcache_reclaim().
118  *	RECLAIMING -> RECLAIMED
119  *			Vnode finished disassociation from underlying file
120  *			system in vcache_reclaim().
121  *	LOADED -> BLOCKED
122  *			Either vcache_rekey*() is changing the vnode key or
123  *			vrelel() is about to call VOP_INACTIVE().
124  *	BLOCKED -> LOADED
125  *			The block condition is over.
126  *	LOADING -> RECLAIMED
127  *			Either vcache_get() or vcache_new() failed to
128  *			associate the underlying file system, or vcache_rekey*()
129  *			drops a vnode used as a placeholder.
130  *
131  *	Of these states LOADING, BLOCKED and RECLAIMING are intermediate,
132  *	and it is possible to wait for a state change.
133  *
134  *	State is protected with v_interlock, with one exception:
135  *	changing from LOADING requires both v_interlock and vcache_lock,
136  *	so holding vcache_lock alone is sufficient to check for
137  *	"state == LOADING".  See vcache_get() for details.
138  *
139  * Reference counting
140  *
141  *	A vnode is considered active if its reference count
142  *	(vnode_t::v_usecount) is non-zero.  The count is maintained with the
143  *	vref(9), vrele(9) and vput(9) routines.  Typical holders of references
144  *	are open files, current working directories, mount points, etc.
145  *
146  *	v_usecount is adjusted with atomic operations; however, to change
147  *	from a non-zero value to zero the interlock must also be held.
148  */
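
/*
 * Illustrative sketch (editor's addition, kept inside a comment so the
 * file compiles unchanged): the typical get/use/release sequence
 * described above, as a hypothetical file system might perform it.  The
 * inode-number key "ino" and the attribute buffer "va" are assumed names;
 * vcache_get() returns a referenced, unlocked vnode and vput() unlocks it
 * and drops that reference.
 *
 *	struct vattr va;
 *	struct vnode *vp;
 *	int error;
 *
 *	error = vcache_get(mp, &ino, sizeof(ino), &vp);
 *	if (error)
 *		return error;
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	error = VOP_GETATTR(vp, &va, curlwp->l_cred);
 *	vput(vp);
 *	return error;
 */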
149 
150 #include <sys/cdefs.h>
151 __KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.153 2023/11/27 16:13:59 hannken Exp $");
152 
153 #ifdef _KERNEL_OPT
154 #include "opt_pax.h"
155 #endif
156 
157 #include <sys/param.h>
158 #include <sys/kernel.h>
159 
160 #include <sys/atomic.h>
161 #include <sys/buf.h>
162 #include <sys/conf.h>
163 #include <sys/device.h>
164 #include <sys/hash.h>
165 #include <sys/kauth.h>
166 #include <sys/kmem.h>
167 #include <sys/module.h>
168 #include <sys/mount.h>
169 #include <sys/namei.h>
170 #include <sys/pax.h>
171 #include <sys/syscallargs.h>
172 #include <sys/sysctl.h>
173 #include <sys/systm.h>
174 #include <sys/threadpool.h>
175 #include <sys/vnode_impl.h>
176 #include <sys/wapbl.h>
177 #include <sys/fstrans.h>
178 
179 #include <miscfs/deadfs/deadfs.h>
180 #include <miscfs/specfs/specdev.h>
181 
182 #include <uvm/uvm.h>
183 #include <uvm/uvm_readahead.h>
184 #include <uvm/uvm_stat.h>
185 
186 /* Flags to vrelel. */
187 #define	VRELEL_ASYNC	0x0001	/* Always defer to vrele thread. */
188 
189 #define	LRU_VRELE	0
190 #define	LRU_FREE	1
191 #define	LRU_HOLD	2
192 #define	LRU_COUNT	3
193 
194 /*
195  * There are three lru lists: one holds vnodes waiting for async release,
196  * one is for vnodes which have no buffer/page references and one for those
197  * which do (i.e.  v_holdcnt is non-zero).  We put the lists into a single,
198  * private cache line as vnodes migrate between them while under the same
199  * lock (vdrain_lock).
200  */
201 
202 typedef struct {
203 	vnode_impl_t *li_marker;
204 } lru_iter_t;
205 
206 u_int			numvnodes		__cacheline_aligned;
207 static vnodelst_t	lru_list[LRU_COUNT]	__cacheline_aligned;
208 static struct threadpool *threadpool;
209 static struct threadpool_job vdrain_job;
210 static struct threadpool_job vrele_job;
211 static kmutex_t		vdrain_lock		__cacheline_aligned;
212 SLIST_HEAD(hashhead, vnode_impl);
213 static kmutex_t		vcache_lock		__cacheline_aligned;
214 static kcondvar_t	vcache_cv;
215 static u_int		vcache_hashsize;
216 static u_long		vcache_hashmask;
217 static struct hashhead	*vcache_hashtab;
218 static pool_cache_t	vcache_pool;
219 static void		lru_requeue(vnode_t *, vnodelst_t *);
220 static vnodelst_t *	lru_which(vnode_t *);
221 static vnode_impl_t *	lru_iter_first(int, lru_iter_t *);
222 static vnode_impl_t *	lru_iter_next(lru_iter_t *);
223 static void		lru_iter_release(lru_iter_t *);
224 static vnode_impl_t *	vcache_alloc(void);
225 static void		vcache_dealloc(vnode_impl_t *);
226 static void		vcache_free(vnode_impl_t *);
227 static void		vcache_init(void);
228 static void		vcache_reinit(void);
229 static void		vcache_reclaim(vnode_t *);
230 static void		vrele_deferred(vnode_impl_t *);
231 static void		vrelel(vnode_t *, int, int);
232 static void		vnpanic(vnode_t *, const char *, ...)
233     __printflike(2, 3);
234 static bool		vdrain_one(u_int);
235 static void		vdrain_task(struct threadpool_job *);
236 static void		vrele_task(struct threadpool_job *);
237 
238 /* Routines having to do with the management of the vnode table. */
239 
240 /*
241  * The high bit of v_usecount is a gate for vcache_tryvget().  It's set
242  * only when the vnode state is LOADED.
243  * The next bit of v_usecount is a flag for vrelel().  It's set
244  * from vcache_vget() and vcache_tryvget() whenever the operation succeeds.
245  */
246 #define	VUSECOUNT_MASK	0x3fffffff
247 #define	VUSECOUNT_GATE	0x80000000
248 #define	VUSECOUNT_VGET	0x40000000
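
/*
 * Editor's note, a worked example of the encoding above: the value
 * 0xc0000003 has VUSECOUNT_GATE set (the vnode is LOADED, so
 * vcache_tryvget() may succeed), VUSECOUNT_VGET set (a vcache_vget() or
 * vcache_tryvget() succeeded), and carries three references, which is
 * what vrefcnt() below extracts:
 *
 *	vrefcnt(vp) == (0xc0000003 & VUSECOUNT_MASK) == 3
 */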
249 
250 /*
251  * Return the current usecount of a vnode.
252  */
253 inline int
254 vrefcnt(struct vnode *vp)
255 {
256 
257 	return atomic_load_relaxed(&vp->v_usecount) & VUSECOUNT_MASK;
258 }
259 
260 /* Vnode state operations and diagnostics. */
261 
262 #if defined(DIAGNOSTIC)
263 
264 #define VSTATE_VALID(state) \
265 	((state) != VS_ACTIVE && (state) != VS_MARKER)
266 #define VSTATE_GET(vp) \
267 	vstate_assert_get((vp), __func__, __LINE__)
268 #define VSTATE_CHANGE(vp, from, to) \
269 	vstate_assert_change((vp), (from), (to), __func__, __LINE__)
270 #define VSTATE_WAIT_STABLE(vp) \
271 	vstate_assert_wait_stable((vp), __func__, __LINE__)
272 
273 void
274 _vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line,
275     bool has_lock)
276 {
277 	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
278 	int refcnt = vrefcnt(vp);
279 
280 	if (!has_lock) {
281 		enum vnode_state vstate = atomic_load_relaxed(&vip->vi_state);
282 
283 		if (state == VS_ACTIVE && refcnt > 0 &&
284 		    (vstate == VS_LOADED || vstate == VS_BLOCKED))
285 			return;
286 		if (vstate == state)
287 			return;
288 		mutex_enter((vp)->v_interlock);
289 	}
290 
291 	KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
292 
293 	if ((state == VS_ACTIVE && refcnt > 0 &&
294 	    (vip->vi_state == VS_LOADED || vip->vi_state == VS_BLOCKED)) ||
295 	    vip->vi_state == state) {
296 		if (!has_lock)
297 			mutex_exit((vp)->v_interlock);
298 		return;
299 	}
300 	vnpanic(vp, "state is %s, usecount %d, expected %s at %s:%d",
301 	    vstate_name(vip->vi_state), refcnt,
302 	    vstate_name(state), func, line);
303 }
304 
305 static enum vnode_state
306 vstate_assert_get(vnode_t *vp, const char *func, int line)
307 {
308 	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
309 
310 	KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
311 	if (! VSTATE_VALID(vip->vi_state))
312 		vnpanic(vp, "state is %s at %s:%d",
313 		    vstate_name(vip->vi_state), func, line);
314 
315 	return vip->vi_state;
316 }
317 
318 static void
319 vstate_assert_wait_stable(vnode_t *vp, const char *func, int line)
320 {
321 	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
322 
323 	KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
324 	if (! VSTATE_VALID(vip->vi_state))
325 		vnpanic(vp, "state is %s at %s:%d",
326 		    vstate_name(vip->vi_state), func, line);
327 
328 	while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED)
329 		cv_wait(&vp->v_cv, vp->v_interlock);
330 
331 	if (! VSTATE_VALID(vip->vi_state))
332 		vnpanic(vp, "state is %s at %s:%d",
333 		    vstate_name(vip->vi_state), func, line);
334 }
335 
336 static void
337 vstate_assert_change(vnode_t *vp, enum vnode_state from, enum vnode_state to,
338     const char *func, int line)
339 {
340 	bool gated = (atomic_load_relaxed(&vp->v_usecount) & VUSECOUNT_GATE);
341 	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
342 
343 	KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
344 	if (from == VS_LOADING)
345 		KASSERTMSG(mutex_owned(&vcache_lock), "at %s:%d", func, line);
346 
347 	if (! VSTATE_VALID(from))
348 		vnpanic(vp, "from is %s at %s:%d",
349 		    vstate_name(from), func, line);
350 	if (! VSTATE_VALID(to))
351 		vnpanic(vp, "to is %s at %s:%d",
352 		    vstate_name(to), func, line);
353 	if (vip->vi_state != from)
354 		vnpanic(vp, "from is %s, expected %s at %s:%d\n",
355 		    vstate_name(vip->vi_state), vstate_name(from), func, line);
356 	if ((from == VS_LOADED) != gated)
357 		vnpanic(vp, "state is %s, gate %d does not match at %s:%d\n",
358 		    vstate_name(vip->vi_state), gated, func, line);
359 
360 	/* Open/close the gate for vcache_tryvget(). */
361 	if (to == VS_LOADED) {
362 		membar_release();
363 		atomic_or_uint(&vp->v_usecount, VUSECOUNT_GATE);
364 	} else {
365 		atomic_and_uint(&vp->v_usecount, ~VUSECOUNT_GATE);
366 	}
367 
368 	atomic_store_relaxed(&vip->vi_state, to);
369 	if (from == VS_LOADING)
370 		cv_broadcast(&vcache_cv);
371 	if (to == VS_LOADED || to == VS_RECLAIMED)
372 		cv_broadcast(&vp->v_cv);
373 }
374 
375 #else /* defined(DIAGNOSTIC) */
376 
377 #define VSTATE_GET(vp) \
378 	(VNODE_TO_VIMPL((vp))->vi_state)
379 #define VSTATE_CHANGE(vp, from, to) \
380 	vstate_change((vp), (from), (to))
381 #define VSTATE_WAIT_STABLE(vp) \
382 	vstate_wait_stable((vp))
383 void
384 _vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line,
385     bool has_lock)
386 {
387 
388 }
389 
390 static void
391 vstate_wait_stable(vnode_t *vp)
392 {
393 	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
394 
395 	while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED)
396 		cv_wait(&vp->v_cv, vp->v_interlock);
397 }
398 
399 static void
400 vstate_change(vnode_t *vp, enum vnode_state from, enum vnode_state to)
401 {
402 	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
403 
404 	/* Open/close the gate for vcache_tryvget(). */
405 	if (to == VS_LOADED) {
406 		membar_release();
407 		atomic_or_uint(&vp->v_usecount, VUSECOUNT_GATE);
408 	} else {
409 		atomic_and_uint(&vp->v_usecount, ~VUSECOUNT_GATE);
410 	}
411 
412 	atomic_store_relaxed(&vip->vi_state, to);
413 	if (from == VS_LOADING)
414 		cv_broadcast(&vcache_cv);
415 	if (to == VS_LOADED || to == VS_RECLAIMED)
416 		cv_broadcast(&vp->v_cv);
417 }
418 
419 #endif /* defined(DIAGNOSTIC) */
420 
421 void
422 vfs_vnode_sysinit(void)
423 {
424 	int error __diagused, i;
425 
426 	dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL);
427 	KASSERT(dead_rootmount != NULL);
428 	dead_rootmount->mnt_iflag |= IMNT_MPSAFE;
429 
430 	mutex_init(&vdrain_lock, MUTEX_DEFAULT, IPL_NONE);
431 	for (i = 0; i < LRU_COUNT; i++) {
432 		TAILQ_INIT(&lru_list[i]);
433 	}
434 	vcache_init();
435 
436 	error = threadpool_get(&threadpool, PRI_NONE);
437 	KASSERTMSG((error == 0), "threadpool_get failed: %d", error);
438 	threadpool_job_init(&vdrain_job, vdrain_task, &vdrain_lock, "vdrain");
439 	threadpool_job_init(&vrele_job, vrele_task, &vdrain_lock, "vrele");
440 }
441 
442 /*
443  * Allocate a new marker vnode.
444  */
445 vnode_t *
446 vnalloc_marker(struct mount *mp)
447 {
448 	vnode_impl_t *vip;
449 	vnode_t *vp;
450 
451 	vip = pool_cache_get(vcache_pool, PR_WAITOK);
452 	memset(vip, 0, sizeof(*vip));
453 	vp = VIMPL_TO_VNODE(vip);
454 	uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1);
455 	vp->v_mount = mp;
456 	vp->v_type = VBAD;
457 	vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
458 	klist_init(&vip->vi_klist.vk_klist);
459 	vp->v_klist = &vip->vi_klist;
460 	vip->vi_state = VS_MARKER;
461 
462 	return vp;
463 }
464 
465 /*
466  * Free a marker vnode.
467  */
468 void
469 vnfree_marker(vnode_t *vp)
470 {
471 	vnode_impl_t *vip;
472 
473 	vip = VNODE_TO_VIMPL(vp);
474 	KASSERT(vip->vi_state == VS_MARKER);
475 	mutex_obj_free(vp->v_interlock);
476 	uvm_obj_destroy(&vp->v_uobj, true);
477 	klist_fini(&vip->vi_klist.vk_klist);
478 	pool_cache_put(vcache_pool, vip);
479 }
480 
481 /*
482  * Test a vnode for being a marker vnode.
483  */
484 bool
485 vnis_marker(vnode_t *vp)
486 {
487 
488 	return (VNODE_TO_VIMPL(vp)->vi_state == VS_MARKER);
489 }
490 
491 /*
492  * Return the lru list this node should be on.
493  */
494 static vnodelst_t *
495 lru_which(vnode_t *vp)
496 {
497 
498 	KASSERT(mutex_owned(vp->v_interlock));
499 
500 	if (vp->v_holdcnt > 0)
501 		return &lru_list[LRU_HOLD];
502 	else
503 		return &lru_list[LRU_FREE];
504 }
505 
506 /*
507  * Put the vnode at the end of the given list.
508  * Both the current and the new list may be NULL, used on vnode alloc/free.
509  * Adjust numvnodes and signal vdrain thread if there is work.
510  */
511 static void
512 lru_requeue(vnode_t *vp, vnodelst_t *listhd)
513 {
514 	vnode_impl_t *vip;
515 	int d;
516 
517 	/*
518 	 * If the vnode is on the correct list, and was put there recently,
519 	 * then leave it be, thus avoiding huge cache and lock contention.
520 	 */
521 	vip = VNODE_TO_VIMPL(vp);
522 	if (listhd == vip->vi_lrulisthd &&
523 	    (getticks() - vip->vi_lrulisttm) < hz) {
524 		return;
525 	}
526 
527 	mutex_enter(&vdrain_lock);
528 	d = 0;
529 	if (vip->vi_lrulisthd != NULL)
530 		TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
531 	else
532 		d++;
533 	vip->vi_lrulisthd = listhd;
534 	vip->vi_lrulisttm = getticks();
535 	if (vip->vi_lrulisthd != NULL)
536 		TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
537 	else
538 		d--;
539 	if (d != 0) {
540 		/*
541 		 * Looks strange?  This is not a bug.  Don't store
542 		 * numvnodes unless there is a change - avoid false
543 		 * sharing on MP.
544 		 */
545 		numvnodes += d;
546 	}
547 	if (listhd == &lru_list[LRU_VRELE])
548 		threadpool_schedule_job(threadpool, &vrele_job);
549 	if (d > 0 && numvnodes > desiredvnodes)
550 		threadpool_schedule_job(threadpool, &vdrain_job);
551 	if (d > 0 && numvnodes > desiredvnodes + desiredvnodes / 16)
552 		kpause("vnfull", false, MAX(1, mstohz(10)), &vdrain_lock);
553 	mutex_exit(&vdrain_lock);
554 }
555 
556 /*
557  * LRU list iterator.
558  * Caller holds vdrain_lock.
559  */
560 static vnode_impl_t *
561 lru_iter_first(int idx, lru_iter_t *iterp)
562 {
563 	vnode_impl_t *marker;
564 
565 	KASSERT(mutex_owned(&vdrain_lock));
566 
567 	mutex_exit(&vdrain_lock);
568 	marker = VNODE_TO_VIMPL(vnalloc_marker(NULL));
569 	mutex_enter(&vdrain_lock);
570 	marker->vi_lrulisthd = &lru_list[idx];
571 	iterp->li_marker = marker;
572 
573 	TAILQ_INSERT_HEAD(marker->vi_lrulisthd, marker, vi_lrulist);
574 
575 	return lru_iter_next(iterp);
576 }
577 
578 static vnode_impl_t *
579 lru_iter_next(lru_iter_t *iter)
580 {
581 	vnode_impl_t *vip, *marker;
582 	vnodelst_t *listhd;
583 
584 	KASSERT(mutex_owned(&vdrain_lock));
585 
586 	marker = iter->li_marker;
587 	listhd = marker->vi_lrulisthd;
588 
589 	while ((vip = TAILQ_NEXT(marker, vi_lrulist))) {
590 		TAILQ_REMOVE(listhd, marker, vi_lrulist);
591 		TAILQ_INSERT_AFTER(listhd, vip, marker, vi_lrulist);
592 		if (!vnis_marker(VIMPL_TO_VNODE(vip)))
593 			break;
594 	}
595 
596 	return vip;
597 }
598 
599 static void
600 lru_iter_release(lru_iter_t *iter)
601 {
602 	vnode_impl_t *marker;
603 
604 	KASSERT(mutex_owned(&vdrain_lock));
605 
606 	marker = iter->li_marker;
607 	TAILQ_REMOVE(marker->vi_lrulisthd, marker, vi_lrulist);
608 
609 	mutex_exit(&vdrain_lock);
610 	vnfree_marker(VIMPL_TO_VNODE(marker));
611 	mutex_enter(&vdrain_lock);
612 }
613 
614 /*
615  * Release deferred vrele vnodes for this mount.
616  * Called with file system suspended.
617  */
618 void
619 vrele_flush(struct mount *mp)
620 {
621 	lru_iter_t iter;
622 	vnode_impl_t *vip;
623 
624 	KASSERT(fstrans_is_owner(mp));
625 
626 	mutex_enter(&vdrain_lock);
627 	for (vip = lru_iter_first(LRU_VRELE, &iter); vip != NULL;
628 	    vip = lru_iter_next(&iter)) {
629 		if (VIMPL_TO_VNODE(vip)->v_mount != mp)
630 			continue;
631 		vrele_deferred(vip);
632 	}
633 	lru_iter_release(&iter);
634 	mutex_exit(&vdrain_lock);
635 }
636 
637 /*
638  * One pass through the LRU lists to keep the number of allocated
639  * vnodes below target.  Returns true if target met.
640  */
641 static bool
642 vdrain_one(u_int target)
643 {
644 	int ix, lists[] = { LRU_FREE, LRU_HOLD };
645 	lru_iter_t iter;
646 	vnode_impl_t *vip;
647 	vnode_t *vp;
648 	struct mount *mp;
649 
650 	KASSERT(mutex_owned(&vdrain_lock));
651 
652 	for (ix = 0; ix < __arraycount(lists); ix++) {
653 		for (vip = lru_iter_first(lists[ix], &iter); vip != NULL;
654 		    vip = lru_iter_next(&iter)) {
655 			if (numvnodes < target) {
656 				lru_iter_release(&iter);
657 				return true;
658 			}
659 
660 			vp = VIMPL_TO_VNODE(vip);
661 
662 			/* Probe usecount (unlocked). */
663 			if (vrefcnt(vp) > 0)
664 				continue;
665 			/* Try v_interlock -- we lock the wrong direction! */
666 			if (!mutex_tryenter(vp->v_interlock))
667 				continue;
668 			/* Probe usecount and state. */
669 			if (vrefcnt(vp) > 0 || VSTATE_GET(vp) != VS_LOADED) {
670 				mutex_exit(vp->v_interlock);
671 				continue;
672 			}
673 			mutex_exit(&vdrain_lock);
674 
675 			mp = vp->v_mount;
676 			if (fstrans_start_nowait(mp) != 0) {
677 				mutex_exit(vp->v_interlock);
678 				mutex_enter(&vdrain_lock);
679 				continue;
680 			}
681 
682 			if (vcache_vget(vp) == 0) {
683 				if (!vrecycle(vp)) {
684 					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
685 					mutex_enter(vp->v_interlock);
686 					vrelel(vp, 0, LK_EXCLUSIVE);
687 				}
688 			}
689 			fstrans_done(mp);
690 
691 			mutex_enter(&vdrain_lock);
692 		}
693 		lru_iter_release(&iter);
694 	}
695 
696 	return false;
697 }
698 
699 /*
700  * threadpool task to keep the number of vnodes below desiredvnodes.
701  */
702 static void
703 vdrain_task(struct threadpool_job *job)
704 {
705 	u_int target;
706 
707 	target = desiredvnodes - desiredvnodes / 16;
708 
709 	mutex_enter(&vdrain_lock);
710 
711 	while (!vdrain_one(target))
712 		kpause("vdrain", false, 1, &vdrain_lock);
713 
714 	threadpool_job_done(job);
715 	mutex_exit(&vdrain_lock);
716 }
717 
718 /*
719  * threadpool task to process asynchronous vrele.
720  */
721 static void
722 vrele_task(struct threadpool_job *job)
723 {
724 	int skipped;
725 	lru_iter_t iter;
726 	vnode_impl_t *vip;
727 	struct mount *mp;
728 
729 	mutex_enter(&vdrain_lock);
730 	while ((vip = lru_iter_first(LRU_VRELE, &iter)) != NULL) {
731 		for (skipped = 0; vip != NULL; vip = lru_iter_next(&iter)) {
732 			mp = VIMPL_TO_VNODE(vip)->v_mount;
733 			if (fstrans_start_nowait(mp) == 0) {
734 				vrele_deferred(vip);
735 				fstrans_done(mp);
736 			} else {
737 				skipped++;
738 			}
739 		}
740 
741 		lru_iter_release(&iter);
742 		if (skipped)
743 			kpause("vrele", false, MAX(1, mstohz(10)), &vdrain_lock);
744 	}
745 
746 	threadpool_job_done(job);
747 	lru_iter_release(&iter);
748 	mutex_exit(&vdrain_lock);
749 }
750 
751 /*
752  * Try to drop reference on a vnode.  Abort if we are releasing the
753  * last reference.  Note: this _must_ succeed if not the last reference.
754  */
755 static bool
756 vtryrele(vnode_t *vp)
757 {
758 	u_int use, next;
759 
760 	membar_release();
761 	for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
762 		if (__predict_false((use & VUSECOUNT_MASK) == 1)) {
763 			return false;
764 		}
765 		KASSERT((use & VUSECOUNT_MASK) > 1);
766 		next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
767 		if (__predict_true(next == use)) {
768 			return true;
769 		}
770 	}
771 }
772 
773 /*
774  * vput: unlock and release the reference.
775  */
776 void
777 vput(vnode_t *vp)
778 {
779 	int lktype;
780 
781 	/*
782 	 * Do an unlocked check of the usecount.  If it looks like we're not
783 	 * about to drop the last reference, then unlock the vnode and try
784 	 * to drop the reference.  If it ends up being the last reference
785 	 * after all, vrelel() can fix it all up.  Most of the time this
786 	 * will all go to plan.
787 	 */
788 	if (vrefcnt(vp) > 1) {
789 		VOP_UNLOCK(vp);
790 		if (vtryrele(vp)) {
791 			return;
792 		}
793 		lktype = LK_NONE;
794 	} else {
795 		lktype = VOP_ISLOCKED(vp);
796 		KASSERT(lktype != LK_NONE);
797 	}
798 	mutex_enter(vp->v_interlock);
799 	vrelel(vp, 0, lktype);
800 }
801 
802 /*
803  * Release a vnode from the deferred list.
804  */
805 static void
806 vrele_deferred(vnode_impl_t *vip)
807 {
808 	vnode_t *vp;
809 
810 	KASSERT(mutex_owned(&vdrain_lock));
811 	KASSERT(vip->vi_lrulisthd == &lru_list[LRU_VRELE]);
812 
813 	vp = VIMPL_TO_VNODE(vip);
814 
815 	/*
816 	 * First remove the vnode from the vrele list.
817 	 * Put it onto the last lru list; the last vrele()
818 	 * will put it back onto the right list before
819 	 * its usecount reaches zero.
820 	 */
821 	TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
822 	vip->vi_lrulisthd = &lru_list[LRU_HOLD];
823 	vip->vi_lrulisttm = getticks();
824 	TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
825 
826 	mutex_exit(&vdrain_lock);
827 
828 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
829 	mutex_enter(vp->v_interlock);
830 	vrelel(vp, 0, LK_EXCLUSIVE);
831 
832 	mutex_enter(&vdrain_lock);
833 }
834 
835 /*
836  * Vnode release.  If reference count drops to zero, call inactive
837  * routine and either return to freelist or free to the pool.
838  */
839 static void
840 vrelel(vnode_t *vp, int flags, int lktype)
841 {
842 	const bool async = ((flags & VRELEL_ASYNC) != 0);
843 	bool recycle, defer, objlock_held;
844 	u_int use, next;
845 	int error;
846 
847 	objlock_held = false;
848 
849 retry:
850 	KASSERT(mutex_owned(vp->v_interlock));
851 
852 	if (__predict_false(vp->v_op == dead_vnodeop_p &&
853 	    VSTATE_GET(vp) != VS_RECLAIMED)) {
854 		vnpanic(vp, "dead but not clean");
855 	}
856 
857 	/*
858 	 * If not the last reference, just unlock and drop the reference count.
859 	 *
860 	 * Otherwise make sure we pass a point in time where we hold the
861 	 * last reference with VGET flag unset.
862 	 */
863 	for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
864 		if (__predict_false((use & VUSECOUNT_MASK) > 1)) {
865 			if (objlock_held) {
866 				objlock_held = false;
867 				rw_exit(vp->v_uobj.vmobjlock);
868 			}
869 			if (lktype != LK_NONE) {
870 				mutex_exit(vp->v_interlock);
871 				lktype = LK_NONE;
872 				VOP_UNLOCK(vp);
873 				mutex_enter(vp->v_interlock);
874 			}
875 			if (vtryrele(vp)) {
876 				mutex_exit(vp->v_interlock);
877 				return;
878 			}
879 			next = atomic_load_relaxed(&vp->v_usecount);
880 			continue;
881 		}
882 		KASSERT((use & VUSECOUNT_MASK) == 1);
883 		next = use & ~VUSECOUNT_VGET;
884 		if (next != use) {
885 			next = atomic_cas_uint(&vp->v_usecount, use, next);
886 		}
887 		if (__predict_true(next == use)) {
888 			break;
889 		}
890 	}
891 	membar_acquire();
892 	if (vrefcnt(vp) <= 0 || vp->v_writecount != 0) {
893 		vnpanic(vp, "%s: bad ref count", __func__);
894 	}
895 
896 #ifdef DIAGNOSTIC
897 	if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
898 	    vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
899 		vprint("vrelel: missing VOP_CLOSE()", vp);
900 	}
901 #endif
902 
903 	/*
904 	 * If already clean there is no need to lock, defer or
905 	 * deactivate this node.
906 	 */
907 	if (VSTATE_GET(vp) == VS_RECLAIMED) {
908 		if (objlock_held) {
909 			objlock_held = false;
910 			rw_exit(vp->v_uobj.vmobjlock);
911 		}
912 		if (lktype != LK_NONE) {
913 			mutex_exit(vp->v_interlock);
914 			lktype = LK_NONE;
915 			VOP_UNLOCK(vp);
916 			mutex_enter(vp->v_interlock);
917 		}
918 		goto out;
919 	}
920 
921 	/*
922 	 * First try to get the vnode locked for VOP_INACTIVE().
923 	 * Defer vnode release to the vrele task if the caller requests
924 	 * it explicitly, we are the pagedaemon, or the lock failed.
925 	 */
926 	defer = false;
927 	if ((curlwp == uvm.pagedaemon_lwp) || async) {
928 		defer = true;
929 	} else if (lktype == LK_SHARED) {
930 		/* Excellent chance of getting the lock, if the last ref. */
931 		error = vn_lock(vp, LK_UPGRADE | LK_RETRY | LK_NOWAIT);
932 		if (error != 0) {
933 			defer = true;
934 		} else {
935 			lktype = LK_EXCLUSIVE;
936 		}
937 	} else if (lktype == LK_NONE) {
938 		/* Excellent chance of getting the lock, if the last ref. */
939 		error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
940 		if (error != 0) {
941 			defer = true;
942 		} else {
943 			lktype = LK_EXCLUSIVE;
944 		}
945 	}
946 	KASSERT(mutex_owned(vp->v_interlock));
947 	if (defer) {
948 		/*
949 		 * Defer reclaim to the vrele task; it's not safe to
950 		 * clean it here.  We donate it our last reference.
951 		 */
952 		if (lktype != LK_NONE) {
953 			mutex_exit(vp->v_interlock);
954 			VOP_UNLOCK(vp);
955 			mutex_enter(vp->v_interlock);
956 		}
957 		lru_requeue(vp, &lru_list[LRU_VRELE]);
958 		mutex_exit(vp->v_interlock);
959 		return;
960 	}
961 	KASSERT(lktype == LK_EXCLUSIVE);
962 
963 	/* If the node gained another reference, retry. */
964 	use = atomic_load_relaxed(&vp->v_usecount);
965 	if ((use & VUSECOUNT_VGET) != 0) {
966 		goto retry;
967 	}
968 	KASSERT((use & VUSECOUNT_MASK) == 1);
969 
970 	if ((vp->v_iflag & (VI_TEXT|VI_EXECMAP|VI_WRMAP)) != 0 ||
971 	    (vp->v_vflag & VV_MAPPED) != 0) {
972 		/* Take care of space accounting. */
973 		if (!objlock_held) {
974 			objlock_held = true;
975 			if (!rw_tryenter(vp->v_uobj.vmobjlock, RW_WRITER)) {
976 				mutex_exit(vp->v_interlock);
977 				rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
978 				mutex_enter(vp->v_interlock);
979 				goto retry;
980 			}
981 		}
982 		if ((vp->v_iflag & VI_EXECMAP) != 0) {
983 			cpu_count(CPU_COUNT_EXECPAGES, -vp->v_uobj.uo_npages);
984 		}
985 		vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
986 		vp->v_vflag &= ~VV_MAPPED;
987 	}
988 	if (objlock_held) {
989 		objlock_held = false;
990 		rw_exit(vp->v_uobj.vmobjlock);
991 	}
992 
993 	/*
994 	 * Deactivate the vnode, but preserve our reference across
995 	 * the call to VOP_INACTIVE().
996 	 *
997 	 * If VOP_INACTIVE() indicates that the file has been
998 	 * deleted, then recycle the vnode.
999 	 *
1000 	 * Note that VOP_INACTIVE() will not drop the vnode lock.
1001 	 */
1002 	mutex_exit(vp->v_interlock);
1003 	recycle = false;
1004 	VOP_INACTIVE(vp, &recycle);
1005 	if (!recycle) {
1006 		lktype = LK_NONE;
1007 		VOP_UNLOCK(vp);
1008 	}
1009 	mutex_enter(vp->v_interlock);
1010 
1011 	/*
1012 	 * Block new references then check again to see if a
1013 	 * new reference was acquired in the meantime.  If
1014 	 * it was, restore the vnode state and try again.
1015 	 */
1016 	if (recycle) {
1017 		VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
1018 		use = atomic_load_relaxed(&vp->v_usecount);
1019 		if ((use & VUSECOUNT_VGET) != 0) {
1020 			VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
1021 			goto retry;
1022 		}
1023 		KASSERT((use & VUSECOUNT_MASK) == 1);
1024 	}
1025 
1026 	/*
1027 	 * Recycle the vnode if the file is now unused (unlinked).
1028 	 */
1029 	if (recycle) {
1030 		VSTATE_ASSERT(vp, VS_BLOCKED);
1031 		KASSERT(lktype == LK_EXCLUSIVE);
1032 		/* vcache_reclaim drops the lock. */
1033 		lktype = LK_NONE;
1034 		vcache_reclaim(vp);
1035 	}
1036 	KASSERT(vrefcnt(vp) > 0);
1037 	KASSERT(lktype == LK_NONE);
1038 
1039 out:
1040 	for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
1041 		if (__predict_false((use & VUSECOUNT_VGET) != 0 &&
1042 		    (use & VUSECOUNT_MASK) == 1)) {
1043 			/* Gained and released another reference, retry. */
1044 			goto retry;
1045 		}
1046 		next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
1047 		if (__predict_true(next == use)) {
1048 			if (__predict_false((use & VUSECOUNT_MASK) != 1)) {
1049 				/* Gained another reference. */
1050 				mutex_exit(vp->v_interlock);
1051 				return;
1052 			}
1053 			break;
1054 		}
1055 	}
1056 	membar_acquire();
1057 
1058 	if (VSTATE_GET(vp) == VS_RECLAIMED && vp->v_holdcnt == 0) {
1059 		/*
1060 		 * It's clean so destroy it.  It isn't referenced
1061 		 * anywhere since it has been reclaimed.
1062 		 */
1063 		vcache_free(VNODE_TO_VIMPL(vp));
1064 	} else {
1065 		/*
1066 		 * Otherwise, put it back onto the freelist.  It
1067 		 * can't be destroyed while still associated with
1068 		 * a file system.
1069 		 */
1070 		lru_requeue(vp, lru_which(vp));
1071 		mutex_exit(vp->v_interlock);
1072 	}
1073 }
1074 
1075 void
1076 vrele(vnode_t *vp)
1077 {
1078 
1079 	if (vtryrele(vp)) {
1080 		return;
1081 	}
1082 	mutex_enter(vp->v_interlock);
1083 	vrelel(vp, 0, LK_NONE);
1084 }
1085 
1086 /*
1087  * Asynchronous vnode release: the vnode is released in a different context.
1088  */
1089 void
1090 vrele_async(vnode_t *vp)
1091 {
1092 
1093 	if (vtryrele(vp)) {
1094 		return;
1095 	}
1096 	mutex_enter(vp->v_interlock);
1097 	vrelel(vp, VRELEL_ASYNC, LK_NONE);
1098 }
1099 
1100 /*
1101  * Vnode reference, where a reference is already held by some other
1102  * object (for example, a file structure).
1103  *
1104  * NB: lockless code sequences may rely on this not blocking.
1105  */
1106 void
1107 vref(vnode_t *vp)
1108 {
1109 
1110 	KASSERT(vrefcnt(vp) > 0);
1111 
1112 	atomic_inc_uint(&vp->v_usecount);
1113 }
1114 
1115 /*
1116  * Page or buffer structure gets a reference.
1117  * Called with v_interlock held.
1118  */
1119 void
1120 vholdl(vnode_t *vp)
1121 {
1122 
1123 	KASSERT(mutex_owned(vp->v_interlock));
1124 
1125 	if (vp->v_holdcnt++ == 0 && vrefcnt(vp) == 0)
1126 		lru_requeue(vp, lru_which(vp));
1127 }
1128 
1129 /*
1130  * Page or buffer structure gets a reference.
1131  */
1132 void
1133 vhold(vnode_t *vp)
1134 {
1135 
1136 	mutex_enter(vp->v_interlock);
1137 	vholdl(vp);
1138 	mutex_exit(vp->v_interlock);
1139 }
1140 
1141 /*
1142  * Page or buffer structure frees a reference.
1143  * Called with v_interlock held.
1144  */
1145 void
1146 holdrelel(vnode_t *vp)
1147 {
1148 
1149 	KASSERT(mutex_owned(vp->v_interlock));
1150 
1151 	if (vp->v_holdcnt <= 0) {
1152 		vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
1153 	}
1154 
1155 	vp->v_holdcnt--;
1156 	if (vp->v_holdcnt == 0 && vrefcnt(vp) == 0)
1157 		lru_requeue(vp, lru_which(vp));
1158 }
1159 
1160 /*
1161  * Page or buffer structure frees a reference.
1162  */
1163 void
1164 holdrele(vnode_t *vp)
1165 {
1166 
1167 	mutex_enter(vp->v_interlock);
1168 	holdrelel(vp);
1169 	mutex_exit(vp->v_interlock);
1170 }
1171 
1172 /*
1173  * Recycle an unused vnode if caller holds the last reference.
1174  */
1175 bool
1176 vrecycle(vnode_t *vp)
1177 {
1178 	int error __diagused;
1179 
1180 	mutex_enter(vp->v_interlock);
1181 
1182 	/* If the vnode is already clean we're done. */
1183 	VSTATE_WAIT_STABLE(vp);
1184 	if (VSTATE_GET(vp) != VS_LOADED) {
1185 		VSTATE_ASSERT(vp, VS_RECLAIMED);
1186 		vrelel(vp, 0, LK_NONE);
1187 		return true;
1188 	}
1189 
1190 	/* Prevent further references until the vnode is locked. */
1191 	VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
1192 
1193 	/* Make sure we hold the last reference. */
1194 	if (vrefcnt(vp) != 1) {
1195 		VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
1196 		mutex_exit(vp->v_interlock);
1197 		return false;
1198 	}
1199 
1200 	mutex_exit(vp->v_interlock);
1201 
1202 	/*
1203 	 * On a leaf file system this lock will always succeed as we hold
1204 	 * the last reference and prevent further references.
1205 	 * On layered file systems waiting for the lock would open a can of
1206 	 * deadlocks as the lower vnodes may have other active references.
1207 	 */
1208 	error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
1209 
1210 	mutex_enter(vp->v_interlock);
1211 	if (error) {
1212 		VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
1213 		mutex_exit(vp->v_interlock);
1214 		return false;
1215 	}
1216 
1217 	KASSERT(vrefcnt(vp) == 1);
1218 	vcache_reclaim(vp);
1219 	vrelel(vp, 0, LK_NONE);
1220 
1221 	return true;
1222 }
1223 
1224 /*
1225  * Helper for vrevoke() to propagate suspension from lastmp
1226  * to thismp.  Both args may be NULL.
1227  * Returns the currently suspended file system or NULL.
1228  */
1229 static struct mount *
1230 vrevoke_suspend_next(struct mount *lastmp, struct mount *thismp)
1231 {
1232 	int error;
1233 
1234 	if (lastmp == thismp)
1235 		return thismp;
1236 
1237 	if (lastmp != NULL)
1238 		vfs_resume(lastmp);
1239 
1240 	if (thismp == NULL)
1241 		return NULL;
1242 
1243 	do {
1244 		error = vfs_suspend(thismp, 0);
1245 	} while (error == EINTR || error == ERESTART);
1246 
1247 	if (error == 0)
1248 		return thismp;
1249 
1250 	KASSERT(error == EOPNOTSUPP || error == ENOENT);
1251 	return NULL;
1252 }
1253 
1254 /*
1255  * Eliminate all activity associated with the requested vnode
1256  * and with all vnodes aliased to the requested vnode.
1257  */
1258 void
1259 vrevoke(vnode_t *vp)
1260 {
1261 	struct mount *mp;
1262 	vnode_t *vq;
1263 	enum vtype type;
1264 	dev_t dev;
1265 
1266 	KASSERT(vrefcnt(vp) > 0);
1267 
1268 	mp = vrevoke_suspend_next(NULL, vp->v_mount);
1269 
1270 	mutex_enter(vp->v_interlock);
1271 	VSTATE_WAIT_STABLE(vp);
1272 	if (VSTATE_GET(vp) == VS_RECLAIMED) {
1273 		mutex_exit(vp->v_interlock);
1274 	} else if (vp->v_type != VBLK && vp->v_type != VCHR) {
1275 		atomic_inc_uint(&vp->v_usecount);
1276 		mutex_exit(vp->v_interlock);
1277 		vgone(vp);
1278 	} else {
1279 		dev = vp->v_rdev;
1280 		type = vp->v_type;
1281 		mutex_exit(vp->v_interlock);
1282 
1283 		while (spec_node_lookup_by_dev(type, dev, VDEAD_NOWAIT, &vq)
1284 		    == 0) {
1285 			mp = vrevoke_suspend_next(mp, vq->v_mount);
1286 			vgone(vq);
1287 		}
1288 	}
1289 	vrevoke_suspend_next(mp, NULL);
1290 }
1291 
1292 /*
1293  * Eliminate all activity associated with a vnode in preparation for
1294  * reuse.  Drops a reference from the vnode.
1295  */
1296 void
1297 vgone(vnode_t *vp)
1298 {
1299 	int lktype;
1300 
1301 	KASSERT(vp->v_mount == dead_rootmount || fstrans_is_owner(vp->v_mount));
1302 
1303 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1304 	lktype = LK_EXCLUSIVE;
1305 	mutex_enter(vp->v_interlock);
1306 	VSTATE_WAIT_STABLE(vp);
1307 	if (VSTATE_GET(vp) == VS_LOADED) {
1308 		VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
1309 		vcache_reclaim(vp);
1310 		lktype = LK_NONE;
1311 	}
1312 	VSTATE_ASSERT(vp, VS_RECLAIMED);
1313 	vrelel(vp, 0, lktype);
1314 }
1315 
1316 static inline uint32_t
1317 vcache_hash(const struct vcache_key *key)
1318 {
1319 	uint32_t hash = HASH32_BUF_INIT;
1320 
1321 	KASSERT(key->vk_key_len > 0);
1322 
1323 	hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash);
1324 	hash = hash32_buf(key->vk_key, key->vk_key_len, hash);
1325 	return hash;
1326 }
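
/*
 * Illustrative sketch (editor's addition): the cache key is the mount
 * pointer plus an opaque, file-system-defined byte string.  For a
 * hypothetical inode-number key "ino" the hash bucket would be found as:
 *
 *	struct vcache_key k = {
 *		.vk_mount = mp,
 *		.vk_key = &ino,
 *		.vk_key_len = sizeof(ino),
 *	};
 *	struct hashhead *bucket =
 *	    &vcache_hashtab[vcache_hash(&k) & vcache_hashmask];
 */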
1327 
1328 static int
1329 vcache_stats(struct hashstat_sysctl *hs, bool fill)
1330 {
1331 	vnode_impl_t *vip;
1332 	uint64_t chain;
1333 
1334 	strlcpy(hs->hash_name, "vcache", sizeof(hs->hash_name));
1335 	strlcpy(hs->hash_desc, "vnode cache hash", sizeof(hs->hash_desc));
1336 	if (!fill)
1337 		return 0;
1338 
1339 	hs->hash_size = vcache_hashmask + 1;
1340 
1341 	for (size_t i = 0; i < hs->hash_size; i++) {
1342 		chain = 0;
1343 		mutex_enter(&vcache_lock);
1344 		SLIST_FOREACH(vip, &vcache_hashtab[i], vi_hash) {
1345 			chain++;
1346 		}
1347 		mutex_exit(&vcache_lock);
1348 		if (chain > 0) {
1349 			hs->hash_used++;
1350 			hs->hash_items += chain;
1351 			if (chain > hs->hash_maxchain)
1352 				hs->hash_maxchain = chain;
1353 		}
1354 		preempt_point();
1355 	}
1356 
1357 	return 0;
1358 }
1359 
1360 static void
1361 vcache_init(void)
1362 {
1363 
1364 	vcache_pool = pool_cache_init(sizeof(vnode_impl_t), coherency_unit,
1365 	    0, 0, "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL);
1366 	KASSERT(vcache_pool != NULL);
1367 	mutex_init(&vcache_lock, MUTEX_DEFAULT, IPL_NONE);
1368 	cv_init(&vcache_cv, "vcache");
1369 	vcache_hashsize = desiredvnodes;
1370 	vcache_hashtab = hashinit(desiredvnodes, HASH_SLIST, true,
1371 	    &vcache_hashmask);
1372 	hashstat_register("vcache", vcache_stats);
1373 }
1374 
1375 static void
1376 vcache_reinit(void)
1377 {
1378 	int i;
1379 	uint32_t hash;
1380 	u_long oldmask, newmask;
1381 	struct hashhead *oldtab, *newtab;
1382 	vnode_impl_t *vip;
1383 
1384 	newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask);
1385 	mutex_enter(&vcache_lock);
1386 	oldtab = vcache_hashtab;
1387 	oldmask = vcache_hashmask;
1388 	vcache_hashsize = desiredvnodes;
1389 	vcache_hashtab = newtab;
1390 	vcache_hashmask = newmask;
1391 	for (i = 0; i <= oldmask; i++) {
1392 		while ((vip = SLIST_FIRST(&oldtab[i])) != NULL) {
1393 			SLIST_REMOVE(&oldtab[i], vip, vnode_impl, vi_hash);
1394 			hash = vcache_hash(&vip->vi_key);
1395 			SLIST_INSERT_HEAD(&newtab[hash & vcache_hashmask],
1396 			    vip, vi_hash);
1397 		}
1398 	}
1399 	mutex_exit(&vcache_lock);
1400 	hashdone(oldtab, HASH_SLIST, oldmask);
1401 }
1402 
1403 static inline vnode_impl_t *
1404 vcache_hash_lookup(const struct vcache_key *key, uint32_t hash)
1405 {
1406 	struct hashhead *hashp;
1407 	vnode_impl_t *vip;
1408 
1409 	KASSERT(mutex_owned(&vcache_lock));
1410 
1411 	hashp = &vcache_hashtab[hash & vcache_hashmask];
1412 	SLIST_FOREACH(vip, hashp, vi_hash) {
1413 		if (key->vk_mount != vip->vi_key.vk_mount)
1414 			continue;
1415 		if (key->vk_key_len != vip->vi_key.vk_key_len)
1416 			continue;
1417 		if (memcmp(key->vk_key, vip->vi_key.vk_key, key->vk_key_len))
1418 			continue;
1419 		return vip;
1420 	}
1421 	return NULL;
1422 }
1423 
1424 /*
1425  * Allocate a new, uninitialized vcache node.
1426  */
1427 static vnode_impl_t *
1428 vcache_alloc(void)
1429 {
1430 	vnode_impl_t *vip;
1431 	vnode_t *vp;
1432 
1433 	vip = pool_cache_get(vcache_pool, PR_WAITOK);
1434 	vp = VIMPL_TO_VNODE(vip);
1435 	memset(vip, 0, sizeof(*vip));
1436 
1437 	rw_init(&vip->vi_lock);
1438 	vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
1439 
1440 	uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1);
1441 	klist_init(&vip->vi_klist.vk_klist);
1442 	vp->v_klist = &vip->vi_klist;
1443 	cv_init(&vp->v_cv, "vnode");
1444 	cache_vnode_init(vp);
1445 
1446 	vp->v_usecount = 1;
1447 	vp->v_type = VNON;
1448 	vp->v_size = vp->v_writesize = VSIZENOTSET;
1449 
1450 	vip->vi_state = VS_LOADING;
1451 
1452 	lru_requeue(vp, &lru_list[LRU_FREE]);
1453 
1454 	return vip;
1455 }
1456 
1457 /*
1458  * Deallocate a vcache node in state VS_LOADING.
1459  *
1460  * vcache_lock held on entry and released on return.
1461  */
1462 static void
1463 vcache_dealloc(vnode_impl_t *vip)
1464 {
1465 	vnode_t *vp;
1466 
1467 	KASSERT(mutex_owned(&vcache_lock));
1468 
1469 	vp = VIMPL_TO_VNODE(vip);
1470 	vfs_ref(dead_rootmount);
1471 	vfs_insmntque(vp, dead_rootmount);
1472 	mutex_enter(vp->v_interlock);
1473 	vp->v_op = dead_vnodeop_p;
1474 	VSTATE_CHANGE(vp, VS_LOADING, VS_RECLAIMED);
1475 	mutex_exit(&vcache_lock);
1476 	vrelel(vp, 0, LK_NONE);
1477 }
1478 
1479 /*
1480  * Free an unused, unreferenced vcache node.
1481  * v_interlock locked on entry.
1482  */
1483 static void
1484 vcache_free(vnode_impl_t *vip)
1485 {
1486 	vnode_t *vp;
1487 
1488 	vp = VIMPL_TO_VNODE(vip);
1489 	KASSERT(mutex_owned(vp->v_interlock));
1490 
1491 	KASSERT(vrefcnt(vp) == 0);
1492 	KASSERT(vp->v_holdcnt == 0);
1493 	KASSERT(vp->v_writecount == 0);
1494 	lru_requeue(vp, NULL);
1495 	mutex_exit(vp->v_interlock);
1496 
1497 	vfs_insmntque(vp, NULL);
1498 	if (vp->v_type == VBLK || vp->v_type == VCHR)
1499 		spec_node_destroy(vp);
1500 
1501 	mutex_obj_free(vp->v_interlock);
1502 	rw_destroy(&vip->vi_lock);
1503 	uvm_obj_destroy(&vp->v_uobj, true);
1504 	KASSERT(vp->v_klist == &vip->vi_klist);
1505 	klist_fini(&vip->vi_klist.vk_klist);
1506 	cv_destroy(&vp->v_cv);
1507 	cache_vnode_fini(vp);
1508 	pool_cache_put(vcache_pool, vip);
1509 }
1510 
1511 /*
1512  * Try to get an initial reference on this cached vnode.
1513  * Returns zero on success or EBUSY if the vnode state is not LOADED.
1514  *
1515  * NB: lockless code sequences may rely on this not blocking.
1516  */
1517 int
1518 vcache_tryvget(vnode_t *vp)
1519 {
1520 	u_int use, next;
1521 
1522 	for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
1523 		if (__predict_false((use & VUSECOUNT_GATE) == 0)) {
1524 			return EBUSY;
1525 		}
1526 		next = atomic_cas_uint(&vp->v_usecount,
1527 		    use, (use + 1) | VUSECOUNT_VGET);
1528 		if (__predict_true(next == use)) {
1529 			membar_acquire();
1530 			return 0;
1531 		}
1532 	}
1533 }
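
/*
 * Illustrative sketch (editor's addition): how a caller might combine the
 * non-blocking fast path above with the blocking vcache_vget() below.  The
 * surrounding lookup logic is hypothetical; only the two cache functions
 * and v_interlock are real.
 *
 *	if (vcache_tryvget(vp) != 0) {
 *		mutex_enter(vp->v_interlock);
 *		error = vcache_vget(vp);
 *		if (error != 0)
 *			return error;
 *	}
 *	... vp is now referenced and the interlock is not held ...
 */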
1534 
1535 /*
1536  * Try to get an initial reference on this cached vnode.
1537  * Returns zero on success or ENOENT if the vnode has been reclaimed.
1538  * Will wait for the vnode state to be stable.
1539  *
1540  * v_interlock locked on entry and unlocked on exit.
1541  */
1542 int
1543 vcache_vget(vnode_t *vp)
1544 {
1545 	int error;
1546 
1547 	KASSERT(mutex_owned(vp->v_interlock));
1548 
1549 	/* Increment hold count to prevent vnode from disappearing. */
1550 	vp->v_holdcnt++;
1551 	VSTATE_WAIT_STABLE(vp);
1552 	vp->v_holdcnt--;
1553 
1554 	/* If this was the last reference to a reclaimed vnode free it now. */
1555 	if (__predict_false(VSTATE_GET(vp) == VS_RECLAIMED)) {
1556 		if (vp->v_holdcnt == 0 && vrefcnt(vp) == 0)
1557 			vcache_free(VNODE_TO_VIMPL(vp));
1558 		else
1559 			mutex_exit(vp->v_interlock);
1560 		return ENOENT;
1561 	}
1562 	VSTATE_ASSERT(vp, VS_LOADED);
1563 	error = vcache_tryvget(vp);
1564 	KASSERT(error == 0);
1565 	mutex_exit(vp->v_interlock);
1566 
1567 	return 0;
1568 }
1569 
1570 /*
1571  * Get a vnode / fs node pair by key and return it referenced through vpp.
1572  */
1573 int
1574 vcache_get(struct mount *mp, const void *key, size_t key_len,
1575     struct vnode **vpp)
1576 {
1577 	int error;
1578 	uint32_t hash;
1579 	const void *new_key;
1580 	struct vnode *vp;
1581 	struct vcache_key vcache_key;
1582 	vnode_impl_t *vip, *new_vip;
1583 
1584 	new_key = NULL;
1585 	*vpp = NULL;
1586 
1587 	vcache_key.vk_mount = mp;
1588 	vcache_key.vk_key = key;
1589 	vcache_key.vk_key_len = key_len;
1590 	hash = vcache_hash(&vcache_key);
1591 
1592 again:
1593 	mutex_enter(&vcache_lock);
1594 	vip = vcache_hash_lookup(&vcache_key, hash);
1595 
1596 	/* If found, take a reference or retry. */
1597 	if (__predict_true(vip != NULL)) {
1598 		/*
1599 		 * If the vnode is loading we cannot take the v_interlock
1600 		 * here as it might change during load (see uvm_obj_setlock()).
1601 		 * As changing state from VS_LOADING requires both vcache_lock
1602 		 * and v_interlock it is safe to test with vcache_lock held.
1603 		 *
1604 		 * Wait for vnodes changing state from VS_LOADING and retry.
1605 		 */
1606 		if (__predict_false(vip->vi_state == VS_LOADING)) {
1607 			cv_wait(&vcache_cv, &vcache_lock);
1608 			mutex_exit(&vcache_lock);
1609 			goto again;
1610 		}
1611 		vp = VIMPL_TO_VNODE(vip);
1612 		mutex_enter(vp->v_interlock);
1613 		mutex_exit(&vcache_lock);
1614 		error = vcache_vget(vp);
1615 		if (error == ENOENT)
1616 			goto again;
1617 		if (error == 0)
1618 			*vpp = vp;
1619 		KASSERT((error != 0) == (*vpp == NULL));
1620 		return error;
1621 	}
1622 	mutex_exit(&vcache_lock);
1623 
1624 	/* Allocate and initialize a new vcache / vnode pair. */
1625 	error = vfs_busy(mp);
1626 	if (error)
1627 		return error;
1628 	new_vip = vcache_alloc();
1629 	new_vip->vi_key = vcache_key;
1630 	vp = VIMPL_TO_VNODE(new_vip);
1631 	mutex_enter(&vcache_lock);
1632 	vip = vcache_hash_lookup(&vcache_key, hash);
1633 	if (vip == NULL) {
1634 		SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask],
1635 		    new_vip, vi_hash);
1636 		vip = new_vip;
1637 	}
1638 
1639 	/* If another thread beat us inserting this node, retry. */
1640 	if (vip != new_vip) {
1641 		vcache_dealloc(new_vip);
1642 		vfs_unbusy(mp);
1643 		goto again;
1644 	}
1645 	mutex_exit(&vcache_lock);
1646 
1647 	/* Load the fs node.  Exclusive as new_vip is VS_LOADING. */
1648 	error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key);
1649 	if (error) {
1650 		mutex_enter(&vcache_lock);
1651 		SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
1652 		    new_vip, vnode_impl, vi_hash);
1653 		vcache_dealloc(new_vip);
1654 		vfs_unbusy(mp);
1655 		KASSERT(*vpp == NULL);
1656 		return error;
1657 	}
1658 	KASSERT(new_key != NULL);
1659 	KASSERT(memcmp(key, new_key, key_len) == 0);
1660 	KASSERT(vp->v_op != NULL);
1661 	vfs_insmntque(vp, mp);
1662 	if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
1663 		vp->v_vflag |= VV_MPSAFE;
1664 	vfs_ref(mp);
1665 	vfs_unbusy(mp);
1666 
1667 	/* Finished loading, finalize node. */
1668 	mutex_enter(&vcache_lock);
1669 	new_vip->vi_key.vk_key = new_key;
1670 	mutex_enter(vp->v_interlock);
1671 	VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED);
1672 	mutex_exit(vp->v_interlock);
1673 	mutex_exit(&vcache_lock);
1674 	*vpp = vp;
1675 	return 0;
1676 }
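
/*
 * Illustrative sketch (editor's addition): the file system side of the
 * contract above.  A hypothetical VFS_LOADVNODE implementation fills in
 * v_op, v_type and v_data, and hands back a key pointer with the same
 * lifetime as the fs node; vcache_get() then publishes the vnode as
 * VS_LOADED.  All "myfs_*" names are assumptions for illustration only.
 *
 *	static int
 *	myfs_loadvnode(struct mount *mp, struct vnode *vp,
 *	    const void *key, size_t key_len, const void **new_key)
 *	{
 *		struct myfs_node *ip;
 *
 *		KASSERT(key_len == sizeof(ip->i_number));
 *		ip = myfs_read_inode(mp, key);
 *		if (ip == NULL)
 *			return ENOENT;
 *		vp->v_op = myfs_vnodeop_p;
 *		vp->v_type = myfs_vtype(ip);
 *		vp->v_data = ip;
 *		*new_key = &ip->i_number;
 *		return 0;
 *	}
 */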
1677 
1678 /*
1679  * Create a new vnode / fs node pair and return it referenced through vpp.
1680  */
1681 int
1682 vcache_new(struct mount *mp, struct vnode *dvp, struct vattr *vap,
1683     kauth_cred_t cred, void *extra, struct vnode **vpp)
1684 {
1685 	int error;
1686 	uint32_t hash;
1687 	struct vnode *vp, *ovp;
1688 	vnode_impl_t *vip, *ovip;
1689 
1690 	*vpp = NULL;
1691 
1692 	/* Allocate and initialize a new vcache / vnode pair. */
1693 	error = vfs_busy(mp);
1694 	if (error)
1695 		return error;
1696 	vip = vcache_alloc();
1697 	vip->vi_key.vk_mount = mp;
1698 	vp = VIMPL_TO_VNODE(vip);
1699 
1700 	/* Create and load the fs node. */
1701 	error = VFS_NEWVNODE(mp, dvp, vp, vap, cred, extra,
1702 	    &vip->vi_key.vk_key_len, &vip->vi_key.vk_key);
1703 	if (error) {
1704 		mutex_enter(&vcache_lock);
1705 		vcache_dealloc(vip);
1706 		vfs_unbusy(mp);
1707 		KASSERT(*vpp == NULL);
1708 		return error;
1709 	}
1710 	KASSERT(vp->v_op != NULL);
1711 	KASSERT((vip->vi_key.vk_key_len == 0) == (mp == dead_rootmount));
1712 	if (vip->vi_key.vk_key_len > 0) {
1713 		KASSERT(vip->vi_key.vk_key != NULL);
1714 		hash = vcache_hash(&vip->vi_key);
1715 
1716 		/*
1717 		 * Wait for previous instance to be reclaimed,
1718 		 * then insert new node.
1719 		 */
1720 		mutex_enter(&vcache_lock);
1721 		while ((ovip = vcache_hash_lookup(&vip->vi_key, hash))) {
1722 			ovp = VIMPL_TO_VNODE(ovip);
1723 			mutex_enter(ovp->v_interlock);
1724 			mutex_exit(&vcache_lock);
1725 			error = vcache_vget(ovp);
1726 			KASSERT(error == ENOENT);
1727 			mutex_enter(&vcache_lock);
1728 		}
1729 		SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask],
1730 		    vip, vi_hash);
1731 		mutex_exit(&vcache_lock);
1732 	}
1733 	vfs_insmntque(vp, mp);
1734 	if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
1735 		vp->v_vflag |= VV_MPSAFE;
1736 	vfs_ref(mp);
1737 	vfs_unbusy(mp);
1738 
1739 	/* Finished loading, finalize node. */
1740 	mutex_enter(&vcache_lock);
1741 	mutex_enter(vp->v_interlock);
1742 	VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED);
1743 	mutex_exit(&vcache_lock);
1744 	mutex_exit(vp->v_interlock);
1745 	*vpp = vp;
1746 	return 0;
1747 }
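
/*
 * Illustrative sketch (editor's addition): a hypothetical create path
 * using vcache_new().  The new vnode comes back referenced and unlocked;
 * the caller locks it if its protocol requires a locked result.  "dvp",
 * "vap", "cnp" and the surrounding error handling are assumptions taken
 * from an imaginary VOP_CREATE implementation.
 *
 *	error = vcache_new(dvp->v_mount, dvp, vap, cnp->cn_cred, NULL, &vp);
 *	if (error)
 *		return error;
 *	error = vn_lock(vp, LK_EXCLUSIVE);
 *	if (error) {
 *		vrele(vp);
 *		return error;
 *	}
 *	*vpp = vp;
 *	return 0;
 */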
1748 
1749 /*
1750  * Prepare key change: update the old cache node's key and lock the new
1751  * cache node.  Return an error if the new node already exists.
1752  */
1753 int
1754 vcache_rekey_enter(struct mount *mp, struct vnode *vp,
1755     const void *old_key, size_t old_key_len,
1756     const void *new_key, size_t new_key_len)
1757 {
1758 	uint32_t old_hash, new_hash;
1759 	struct vcache_key old_vcache_key, new_vcache_key;
1760 	vnode_impl_t *vip, *new_vip;
1761 
1762 	old_vcache_key.vk_mount = mp;
1763 	old_vcache_key.vk_key = old_key;
1764 	old_vcache_key.vk_key_len = old_key_len;
1765 	old_hash = vcache_hash(&old_vcache_key);
1766 
1767 	new_vcache_key.vk_mount = mp;
1768 	new_vcache_key.vk_key = new_key;
1769 	new_vcache_key.vk_key_len = new_key_len;
1770 	new_hash = vcache_hash(&new_vcache_key);
1771 
1772 	new_vip = vcache_alloc();
1773 	new_vip->vi_key = new_vcache_key;
1774 
1775 	/* Insert locked new node used as placeholder. */
1776 	mutex_enter(&vcache_lock);
1777 	vip = vcache_hash_lookup(&new_vcache_key, new_hash);
1778 	if (vip != NULL) {
1779 		vcache_dealloc(new_vip);
1780 		return EEXIST;
1781 	}
1782 	SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask],
1783 	    new_vip, vi_hash);
1784 
1785 	/* Replace the old node's key with the temporary copy. */
1786 	vip = vcache_hash_lookup(&old_vcache_key, old_hash);
1787 	KASSERT(vip != NULL);
1788 	KASSERT(VIMPL_TO_VNODE(vip) == vp);
1789 	KASSERT(vip->vi_key.vk_key != old_vcache_key.vk_key);
1790 	vip->vi_key = old_vcache_key;
1791 	mutex_exit(&vcache_lock);
1792 	return 0;
1793 }
1794 
1795 /*
1796  * Key change complete: update old node and remove placeholder.
1797  */
1798 void
1799 vcache_rekey_exit(struct mount *mp, struct vnode *vp,
1800     const void *old_key, size_t old_key_len,
1801     const void *new_key, size_t new_key_len)
1802 {
1803 	uint32_t old_hash, new_hash;
1804 	struct vcache_key old_vcache_key, new_vcache_key;
1805 	vnode_impl_t *vip, *new_vip;
1806 	struct vnode *new_vp;
1807 
1808 	old_vcache_key.vk_mount = mp;
1809 	old_vcache_key.vk_key = old_key;
1810 	old_vcache_key.vk_key_len = old_key_len;
1811 	old_hash = vcache_hash(&old_vcache_key);
1812 
1813 	new_vcache_key.vk_mount = mp;
1814 	new_vcache_key.vk_key = new_key;
1815 	new_vcache_key.vk_key_len = new_key_len;
1816 	new_hash = vcache_hash(&new_vcache_key);
1817 
1818 	mutex_enter(&vcache_lock);
1819 
1820 	/* Lookup old and new node. */
1821 	vip = vcache_hash_lookup(&old_vcache_key, old_hash);
1822 	KASSERT(vip != NULL);
1823 	KASSERT(VIMPL_TO_VNODE(vip) == vp);
1824 
1825 	new_vip = vcache_hash_lookup(&new_vcache_key, new_hash);
1826 	KASSERT(new_vip != NULL);
1827 	KASSERT(new_vip->vi_key.vk_key_len == new_key_len);
1828 	new_vp = VIMPL_TO_VNODE(new_vip);
1829 	mutex_enter(new_vp->v_interlock);
1830 	VSTATE_ASSERT(VIMPL_TO_VNODE(new_vip), VS_LOADING);
1831 	mutex_exit(new_vp->v_interlock);
1832 
1833 	/* Rekey old node and put it onto its new hashlist. */
1834 	vip->vi_key = new_vcache_key;
1835 	if (old_hash != new_hash) {
1836 		SLIST_REMOVE(&vcache_hashtab[old_hash & vcache_hashmask],
1837 		    vip, vnode_impl, vi_hash);
1838 		SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask],
1839 		    vip, vi_hash);
1840 	}
1841 
1842 	/* Remove new node used as placeholder. */
1843 	SLIST_REMOVE(&vcache_hashtab[new_hash & vcache_hashmask],
1844 	    new_vip, vnode_impl, vi_hash);
1845 	vcache_dealloc(new_vip);
1846 }
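
/*
 * Illustrative sketch (editor's addition): the two-phase rekey protocol
 * implemented by vcache_rekey_enter() / vcache_rekey_exit() above, as a
 * hypothetical file system might drive it when a node's key changes.
 * "old_ino" and "new_ino" are assumed names.
 *
 *	error = vcache_rekey_enter(mp, vp, &old_ino, sizeof(old_ino),
 *	    &new_ino, sizeof(new_ino));
 *	if (error)
 *		return error;
 *	... update the fs node so that it now answers to new_ino ...
 *	vcache_rekey_exit(mp, vp, &old_ino, sizeof(old_ino),
 *	    &new_ino, sizeof(new_ino));
 */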
1847 
1848 /*
1849  * Disassociate the underlying file system from a vnode.
1850  *
1851  * Must be called with vnode locked and will return unlocked.
1852  * Must be called with the interlock held, and will return with it held.
1853  */
1854 static void
1855 vcache_reclaim(vnode_t *vp)
1856 {
1857 	lwp_t *l = curlwp;
1858 	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
1859 	struct mount *mp = vp->v_mount;
1860 	uint32_t hash;
1861 	uint8_t temp_buf[64], *temp_key;
1862 	size_t temp_key_len;
1863 	bool recycle;
1864 	int error;
1865 
1866 	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
1867 	KASSERT(mutex_owned(vp->v_interlock));
1868 	KASSERT(vrefcnt(vp) != 0);
1869 
1870 	temp_key_len = vip->vi_key.vk_key_len;
1871 	/*
1872 	 * Prevent the vnode from being recycled or brought into use
1873 	 * while we clean it out.
1874 	 */
1875 	VSTATE_CHANGE(vp, VS_BLOCKED, VS_RECLAIMING);
1876 
1877 	/*
1878 	 * Send NOTE_REVOKE now, before we call VOP_RECLAIM(),
1879 	 * because VOP_RECLAIM() could cause vp->v_klist to
1880 	 * become invalid.  Don't check for interest in NOTE_REVOKE
1881 	 * here; it's always posted because it sets EV_EOF.
1882 	 *
1883 	 * Once it's been posted, reset vp->v_klist to point to
1884 	 * our own local storage, in case we were sharing with
1885 	 * someone else.
1886 	 */
1887 	KNOTE(&vp->v_klist->vk_klist, NOTE_REVOKE);
1888 	vp->v_klist = &vip->vi_klist;
1889 	mutex_exit(vp->v_interlock);
1890 
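	/*
	 * Clear the text/execmap flags, adjusting the executable page
	 * count, and mark the vnode so genfs_getpages() will check for
	 * a dying vnode.
	 */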
1891 	rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
1892 	mutex_enter(vp->v_interlock);
1893 	if ((vp->v_iflag & VI_EXECMAP) != 0) {
1894 		cpu_count(CPU_COUNT_EXECPAGES, -vp->v_uobj.uo_npages);
1895 	}
1896 	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
1897 	vp->v_iflag |= VI_DEADCHECK; /* for genfs_getpages() */
1898 	mutex_exit(vp->v_interlock);
1899 	rw_exit(vp->v_uobj.vmobjlock);
1900 
1901 	/*
1902 	 * With vnode state set to reclaiming, purge name cache immediately
1903 	 * to prevent new handles on the vnode, and wait for existing threads
1904 	 * trying to get a handle to notice VS_RECLAIMED status and abort.
1905 	 */
1906 	cache_purge(vp);
1907 
1908 	/* Replace the vnode key with a temporary copy. */
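	/*
	 * (The original key usually points into file system private
	 * memory that VOP_RECLAIM will free, while the node stays on
	 * its hash list until after reclaim.)
	 */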
1909 	if (vip->vi_key.vk_key_len > sizeof(temp_buf)) {
1910 		temp_key = kmem_alloc(temp_key_len, KM_SLEEP);
1911 	} else {
1912 		temp_key = temp_buf;
1913 	}
1914 	if (vip->vi_key.vk_key_len > 0) {
1915 		mutex_enter(&vcache_lock);
1916 		memcpy(temp_key, vip->vi_key.vk_key, temp_key_len);
1917 		vip->vi_key.vk_key = temp_key;
1918 		mutex_exit(&vcache_lock);
1919 	}
1920 
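	/*
	 * Start an fstrans transaction on the mount so the file system
	 * calls below do not run against a suspending or suspended
	 * file system.
	 */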
1921 	fstrans_start(mp);
1922 
1923 	/*
1924 	 * Clean out any cached data associated with the vnode.
1925 	 */
1926 	error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
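	/*
	 * If flushing the dirty buffers failed, discard any WAPBL
	 * journal attached to the mount and drop the buffers instead.
	 */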
1927 	if (error != 0) {
1928 		if (wapbl_vphaswapbl(vp))
1929 			WAPBL_DISCARD(wapbl_vptomp(vp));
1930 		error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
1931 	}
1932 	KASSERTMSG((error == 0), "vinvalbuf failed: %d", error);
1933 	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
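	/* For device vnodes, revoke the associated special node. */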
1934 	if (vp->v_type == VBLK || vp->v_type == VCHR) {
1935 		spec_node_revoke(vp);
1936 	}
1937 
1938 	/*
1939 	 * Disassociate the underlying file system from the vnode.
1940 	 * VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks
1941 	 * the vnode, and may destroy the vnode so that VOP_UNLOCK
1942 	 * would no longer function.
1943 	 */
1944 	VOP_INACTIVE(vp, &recycle);
1945 	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
1946 	if (VOP_RECLAIM(vp)) {
1947 		vnpanic(vp, "%s: cannot reclaim", __func__);
1948 	}
1949 
1950 	KASSERT(vp->v_data == NULL);
1951 	KASSERT((vp->v_iflag & VI_PAGES) == 0);
1952 
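	/* Free any read-ahead context attached to a regular file. */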
1953 	if (vp->v_type == VREG && vp->v_ractx != NULL) {
1954 		uvm_ra_freectx(vp->v_ractx);
1955 		vp->v_ractx = NULL;
1956 	}
1957 
1958 	if (vip->vi_key.vk_key_len > 0) {
1959 		/* Remove from vnode cache. */
1960 		hash = vcache_hash(&vip->vi_key);
1961 		mutex_enter(&vcache_lock);
1962 		KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash));
1963 		SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
1964 		    vip, vnode_impl, vi_hash);
1965 		mutex_exit(&vcache_lock);
1966 	}
1967 	if (temp_key != temp_buf)
1968 		kmem_free(temp_key, temp_key_len);
1969 
1970 	/* Done with purge, notify sleepers of the grim news. */
1971 	mutex_enter(vp->v_interlock);
1972 	vp->v_op = dead_vnodeop_p;
1973 	VSTATE_CHANGE(vp, VS_RECLAIMING, VS_RECLAIMED);
1974 	vp->v_tag = VT_NON;
1975 	mutex_exit(vp->v_interlock);
1976 
1977 	/*
1978 	 * Move to dead mount.  Must be after changing the operations
1979 	 * vector as vnode operations enter the mount before using the
1980 	 * operations vector.  See sys/kern/vnode_if.c.
1981 	 */
1982 	vp->v_vflag &= ~VV_ROOT;
1983 	vfs_ref(dead_rootmount);
1984 	vfs_insmntque(vp, dead_rootmount);
1985 
1986 #ifdef PAX_SEGVGUARD
1987 	pax_segvguard_cleanup(vp);
1988 #endif /* PAX_SEGVGUARD */
1989 
1990 	mutex_enter(vp->v_interlock);
1991 	fstrans_done(mp);
1992 	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1993 }
1994 
1995 /*
1996  * Disassociate the underlying file system from an open device vnode
1997  * and make it anonymous.
1998  *
1999  * Vnode unlocked on entry, drops a reference to the vnode.
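 *
 * On return the vnode carries the spec_vnodeop_p operations vector and
 * hangs off the dead mount, so it remains usable as an anonymous
 * device vnode.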
2000  */
2001 void
2002 vcache_make_anon(vnode_t *vp)
2003 {
2004 	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
2005 	uint32_t hash;
2006 	bool recycle;
2007 
2008 	KASSERT(vp->v_type == VBLK || vp->v_type == VCHR);
2009 	KASSERT(vp->v_mount == dead_rootmount || fstrans_is_owner(vp->v_mount));
2010 	VSTATE_ASSERT_UNLOCKED(vp, VS_ACTIVE);
2011 
2012 	/* Remove from vnode cache. */
2013 	hash = vcache_hash(&vip->vi_key);
2014 	mutex_enter(&vcache_lock);
2015 	KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash));
2016 	SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
2017 	    vip, vnode_impl, vi_hash);
2018 	vip->vi_key.vk_mount = dead_rootmount;
2019 	vip->vi_key.vk_key_len = 0;
2020 	vip->vi_key.vk_key = NULL;
2021 	mutex_exit(&vcache_lock);
2022 
2023 	/*
2024 	 * Disassociate the underlying file system from the vnode.
2025 	 * VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks
2026 	 * the vnode, and may destroy the vnode so that VOP_UNLOCK
2027 	 * would no longer function.
2028 	 */
2029 	if (vn_lock(vp, LK_EXCLUSIVE)) {
2030 		vnpanic(vp, "%s: cannot lock", __func__);
2031 	}
2032 	VOP_INACTIVE(vp, &recycle);
2033 	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
2034 	if (VOP_RECLAIM(vp)) {
2035 		vnpanic(vp, "%s: cannot reclaim", __func__);
2036 	}
2037 
2038 	/* Purge name cache. */
2039 	cache_purge(vp);
2040 
2041 	/* Done with purge, change operations vector. */
2042 	mutex_enter(vp->v_interlock);
2043 	vp->v_op = spec_vnodeop_p;
2044 	vp->v_vflag |= VV_MPSAFE;
2045 	mutex_exit(vp->v_interlock);
2046 
2047 	/*
2048 	 * Move to dead mount.  Must be after changing the operations
2049 	 * vector as vnode operations enter the mount before using the
2050 	 * operations vector.  See sys/kern/vnode_if.c.
2051 	 */
2052 	vfs_ref(dead_rootmount);
2053 	vfs_insmntque(vp, dead_rootmount);
2054 
2055 	vrele(vp);
2056 }
2057 
2058 /*
2059  * Decrement the outstanding I/O count and wake up waiters when it reaches zero.
2060  */
2061 void
2062 vwakeup(struct buf *bp)
2063 {
2064 	vnode_t *vp;
2065 
2066 	if ((vp = bp->b_vp) == NULL)
2067 		return;
2068 
2069 	KASSERT(bp->b_objlock == vp->v_interlock);
2070 	KASSERT(mutex_owned(bp->b_objlock));
2071 
2072 	if (--vp->v_numoutput < 0)
2073 		vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp);
2074 	if (vp->v_numoutput == 0)
2075 		cv_broadcast(&vp->v_cv);
2076 }
2077 
2078 /*
2079  * Test a vnode for being or becoming dead.  Returns one of:
2080  * EBUSY:  vnode is becoming dead; only returned with VDEAD_NOWAIT set in "flags".
2081  * ENOENT: vnode is dead.
2082  * 0:      otherwise.
2083  *
2084  * Whenever this function returns a non-zero value all future
2085  * calls will also return a non-zero value.
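 *
 * Typical caller pattern (illustrative sketch only, not lifted from a
 * particular caller):
 *
 *	mutex_enter(vp->v_interlock);
 *	error = vdead_check(vp, VDEAD_NOWAIT);
 *	mutex_exit(vp->v_interlock);
 *	if (error != 0)
 *		...the vnode is dead or becoming dead...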
2086  */
2087 int
2088 vdead_check(struct vnode *vp, int flags)
2089 {
2090 
2091 	KASSERT(mutex_owned(vp->v_interlock));
2092 
2093 	if (! ISSET(flags, VDEAD_NOWAIT))
2094 		VSTATE_WAIT_STABLE(vp);
2095 
2096 	if (VSTATE_GET(vp) == VS_RECLAIMING) {
2097 		KASSERT(ISSET(flags, VDEAD_NOWAIT));
2098 		return EBUSY;
2099 	} else if (VSTATE_GET(vp) == VS_RECLAIMED) {
2100 		return ENOENT;
2101 	}
2102 
2103 	return 0;
2104 }
2105 
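/*
 * Try to drain the vnode cache down to the current "desiredvnodes"
 * target and resize the cache hash table if the target has changed.
 * Returns EBUSY if the cache could not be drained far enough.
 */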
2106 int
2107 vfs_drainvnodes(void)
2108 {
2109 
2110 	mutex_enter(&vdrain_lock);
2111 
2112 	if (!vdrain_one(desiredvnodes)) {
2113 		mutex_exit(&vdrain_lock);
2114 		return EBUSY;
2115 	}
2116 
2117 	mutex_exit(&vdrain_lock);
2118 
2119 	if (vcache_hashsize != desiredvnodes)
2120 		vcache_reinit();
2121 
2122 	return 0;
2123 }
2124 
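/*
 * Panic with a vnode related message, printing the vnode first on
 * DIAGNOSTIC kernels.
 */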
2125 void
2126 vnpanic(vnode_t *vp, const char *fmt, ...)
2127 {
2128 	va_list ap;
2129 
2130 #ifdef DIAGNOSTIC
2131 	vprint(NULL, vp);
2132 #endif
2133 	va_start(ap, fmt);
2134 	vpanic(fmt, ap);
2135 	va_end(ap);
2136 }
2137 
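/*
 * Make "tvp" share "fvp"'s interlock: take a reference on fvp's
 * interlock, install it on tvp, and release tvp's old interlock.
 */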
2138 void
2139 vshareilock(vnode_t *tvp, vnode_t *fvp)
2140 {
2141 	kmutex_t *oldlock;
2142 
2143 	oldlock = tvp->v_interlock;
2144 	mutex_obj_hold(fvp->v_interlock);
2145 	tvp->v_interlock = fvp->v_interlock;
2146 	mutex_obj_free(oldlock);
2147 }
2148 
2149 void
2150 vshareklist(vnode_t *tvp, vnode_t *fvp)
2151 {
2152 	/*
2153 	 * If two vnodes share klist state, they must also share
2154 	 * an interlock.
2155 	 */
2156 	KASSERT(tvp->v_interlock == fvp->v_interlock);
2157 
2158 	/*
2159 	 * We make the following assumptions:
2160 	 *
2161 	 * ==> Some other synchronization is happening outside of
2162 	 *     our view to make this safe.
2163 	 *
2164 	 * ==> That the "to" vnode will have the necessary references
2165 	 *     on the "from" vnode so that the storage for the klist
2166 	 *     won't be yanked out from beneath us (the vnode_impl).
2167 	 *
2168 	 * ==> If "from" is also sharing, we then assume that "from"
2169 	 *     has the necessary references, and so on.
2170 	 */
2171 	tvp->v_klist = fvp->v_klist;
2172 }
2173