1 /*	$NetBSD: vfs_vnode.c,v 1.56 2016/08/20 12:37:08 hannken Exp $	*/
2 
3 /*-
4  * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9  * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 /*
34  * Copyright (c) 1989, 1993
35  *	The Regents of the University of California.  All rights reserved.
36  * (c) UNIX System Laboratories, Inc.
37  * All or some portions of this file are derived from material licensed
38  * to the University of California by American Telephone and Telegraph
39  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
40  * the permission of UNIX System Laboratories, Inc.
41  *
42  * Redistribution and use in source and binary forms, with or without
43  * modification, are permitted provided that the following conditions
44  * are met:
45  * 1. Redistributions of source code must retain the above copyright
46  *    notice, this list of conditions and the following disclaimer.
47  * 2. Redistributions in binary form must reproduce the above copyright
48  *    notice, this list of conditions and the following disclaimer in the
49  *    documentation and/or other materials provided with the distribution.
50  * 3. Neither the name of the University nor the names of its contributors
51  *    may be used to endorse or promote products derived from this software
52  *    without specific prior written permission.
53  *
54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64  * SUCH DAMAGE.
65  *
66  *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
67  */
68 
69 /*
70  * The vnode cache subsystem.
71  *
72  * Life-cycle
73  *
74  *	Normally, there are two points where new vnodes are created:
75  *	VOP_CREATE(9) and VOP_LOOKUP(9).  The life-cycle of a vnode
76  *	starts in one of the following ways:
77  *
78  *	- Allocation, via vcache_get(9) or vcache_new(9).
79  *	- Reclamation of an inactive vnode, via vget(9).
80  *
81  *	Recycling from a free list, via getnewvnode(9) -> getcleanvnode(9),
82  *	was another, traditional way.  Currently, only the draining thread
83  *	recycles vnodes.  This behaviour might be revisited.
84  *
85  *	The life-cycle ends when the last reference is dropped, usually
86  *	in VOP_REMOVE(9).  In such a case, VOP_INACTIVE(9) is called to
87  *	inform the file system that the vnode is inactive.  Via this call,
88  *	the file system indicates whether the vnode can be recycled
89  *	(usually, it checks its own references, e.g. the link count,
90  *	i.e. whether the file was removed).
91  *
92  *	Depending on the indication, the vnode can be put onto a free list
93  *	(cache), or cleaned via vcache_reclaim(), which calls VOP_RECLAIM(9)
94  *	to disassociate the underlying file system, before it is destroyed.
95  *
96  * Vnode state
97  *
98  *	A vnode is always in one of six states:
99  *	- MARKER	This is a marker vnode to help list traversal.  It
100  *			will never change its state.
101  *	- LOADING	Vnode is associating with the underlying file system
102  *			and is not yet ready to use.
103  *	- ACTIVE	Vnode has an associated underlying file system and is
104  *			ready to use.
105  *	- BLOCKED	Vnode is active but cannot get new references.
106  *	- RECLAIMING	Vnode is disassociating from the underlying file
107  *			system.
108  *	- RECLAIMED	Vnode has disassociated from the underlying file
109  *			system and is dead.
110  *
111  *	Valid state changes are:
112  *	LOADING -> ACTIVE
113  *			Vnode has been initialised in vcache_get() or
114  *			vcache_new() and is ready to use.
115  *	ACTIVE -> RECLAIMING
116  *			Vnode starts disassociation from underlying file
117  *			system in vcache_reclaim().
118  *	RECLAIMING -> RECLAIMED
119  *			Vnode finished disassociation from underlying file
120  *			system in vcache_reclaim().
121  *	ACTIVE -> BLOCKED
122  *			Either vcache_rekey*() is changing the vnode key or
123  *			vrelel() is about to call VOP_INACTIVE().
124  *	BLOCKED -> ACTIVE
125  *			The block condition is over.
126  *	LOADING -> RECLAIMED
127  *			Either vcache_get() or vcache_new() failed to
128  *			associate the underlying file system, or vcache_rekey*()
129  *			drops a vnode used as a placeholder.
130  *
131  *	Of these states, LOADING, BLOCKED and RECLAIMING are intermediate,
132  *	and it is possible to wait for a state change.
133  *
134  *	The state is protected with v_interlock, with one exception:
135  *	to change from LOADING both v_interlock and vcache.lock must be held,
136  *	so it is possible to check "state == LOADING" while holding only
137  *	vcache.lock.  See vcache_get() for details.
138  *
139  * Reference counting
140  *
141  *	A vnode is considered active if its reference count
142  *	(vnode_t::v_usecount) is non-zero.  It is maintained using the
143  *	vref(9), vrele(9) and vput(9) routines.  Typical holders of
144  *	references are open files, current working directories, mount points, etc.
145  *
146  * Note on v_usecount and its locking
147  *
148  *	At nearly all points where it is known that v_usecount could be
149  *	zero, vnode_t::v_interlock will be held.  To change v_usecount away
150  *	from zero, the interlock must be held.  To change from a non-zero
151  *	value to zero, again the interlock must be held.
152  *
153  *	Changing the usecount from a non-zero value to a non-zero value can
154  *	safely be done using atomic operations, without the interlock held.
155  *
156  */
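/*
 * Illustrative sketch of how a file system typically drives this
 * life-cycle from its lookup path.  The mount point "mp" and the inode
 * number key "ino" are assumptions for the example only.
 *
 *	struct vnode *vp;
 *	int error;
 *
 *	error = vcache_get(mp, &ino, sizeof(ino), &vp);
 *	if (error)
 *		return error;
 *	vn_lock(vp, LK_EXCLUSIVE);
 *	...					(use the active vnode)
 *	vput(vp);				(unlock and drop the reference;
 *						 VOP_INACTIVE(9) may run here)
 */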
157 
158 #include <sys/cdefs.h>
159 __KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.56 2016/08/20 12:37:08 hannken Exp $");
160 
161 #define _VFS_VNODE_PRIVATE
162 
163 #include <sys/param.h>
164 #include <sys/kernel.h>
165 
166 #include <sys/atomic.h>
167 #include <sys/buf.h>
168 #include <sys/conf.h>
169 #include <sys/device.h>
170 #include <sys/hash.h>
171 #include <sys/kauth.h>
172 #include <sys/kmem.h>
173 #include <sys/kthread.h>
174 #include <sys/module.h>
175 #include <sys/mount.h>
176 #include <sys/namei.h>
177 #include <sys/syscallargs.h>
178 #include <sys/sysctl.h>
179 #include <sys/systm.h>
180 #include <sys/vnode.h>
181 #include <sys/wapbl.h>
182 #include <sys/fstrans.h>
183 
184 #include <uvm/uvm.h>
185 #include <uvm/uvm_readahead.h>
186 
187 /* Flags to vrelel. */
188 #define	VRELEL_ASYNC_RELE	0x0001	/* Always defer to vrele thread. */
189 
190 enum vcache_state {
191 	VN_MARKER,	/* Stable, used as marker. Will not change. */
192 	VN_LOADING,	/* Intermediate, initialising the fs node. */
193 	VN_ACTIVE,	/* Stable, valid fs node attached. */
194 	VN_BLOCKED,	/* Intermediate, active, no new references allowed. */
195 	VN_RECLAIMING,	/* Intermediate, detaching the fs node. */
196 	VN_RECLAIMED	/* Stable, no fs node attached. */
197 };
198 struct vcache_key {
199 	struct mount *vk_mount;
200 	const void *vk_key;
201 	size_t vk_key_len;
202 };
203 struct vcache_node {
204 	struct vnode vn_vnode;
205 	enum vcache_state vn_state;
206 	SLIST_ENTRY(vcache_node) vn_hash;
207 	struct vcache_key vn_key;
208 };
209 
210 #define VN_TO_VP(node)	((vnode_t *)(node))
211 #define VP_TO_VN(vp)	((struct vcache_node *)(vp))
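/*
 * Note: the casts above are only safe because vn_vnode is the first
 * member of struct vcache_node, so both pointers refer to the same
 * address.  For example (illustrative only):
 *
 *	struct vcache_node *node = vcache_alloc();
 *	vnode_t *vp = VN_TO_VP(node);
 *	KASSERT(VP_TO_VN(vp) == node);
 */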
212 
213 u_int			numvnodes		__cacheline_aligned;
214 
215 /*
216  * There are two free lists: one is for vnodes which have no buffer/page
217  * references and one for those which do (i.e. v_holdcnt is non-zero).
218  * The vnode recycling mechanism first attempts to look into the former list.
219  */
220 static kmutex_t		vnode_free_list_lock	__cacheline_aligned;
221 static vnodelst_t	vnode_free_list		__cacheline_aligned;
222 static vnodelst_t	vnode_hold_list		__cacheline_aligned;
223 static kcondvar_t	vdrain_cv		__cacheline_aligned;
224 
225 static vnodelst_t	vrele_list		__cacheline_aligned;
226 static kmutex_t		vrele_lock		__cacheline_aligned;
227 static kcondvar_t	vrele_cv		__cacheline_aligned;
228 static lwp_t *		vrele_lwp		__cacheline_aligned;
229 static int		vrele_pending		__cacheline_aligned;
230 static int		vrele_gen		__cacheline_aligned;
231 
232 SLIST_HEAD(hashhead, vcache_node);
233 static struct {
234 	kmutex_t	lock;
235 	kcondvar_t	cv;
236 	u_long		hashmask;
237 	struct hashhead	*hashtab;
238 	pool_cache_t	pool;
239 }			vcache			__cacheline_aligned;
240 
241 static int		cleanvnode(void);
242 static struct vcache_node *vcache_alloc(void);
243 static void		vcache_free(struct vcache_node *);
244 static void		vcache_init(void);
245 static void		vcache_reinit(void);
246 static void		vcache_reclaim(vnode_t *);
247 static void		vrelel(vnode_t *, int);
248 static void		vdrain_thread(void *);
249 static void		vrele_thread(void *);
250 static void		vnpanic(vnode_t *, const char *, ...)
251     __printflike(2, 3);
252 
253 /* Routines having to do with the management of the vnode table. */
254 extern struct mount	*dead_rootmount;
255 extern int		(**dead_vnodeop_p)(void *);
256 extern struct vfsops	dead_vfsops;
257 
258 /* Vnode state operations and diagnostics. */
259 
260 static const char *
261 vstate_name(enum vcache_state state)
262 {
263 
264 	switch (state) {
265 	case VN_MARKER:
266 		return "MARKER";
267 	case VN_LOADING:
268 		return "LOADING";
269 	case VN_ACTIVE:
270 		return "ACTIVE";
271 	case VN_BLOCKED:
272 		return "BLOCKED";
273 	case VN_RECLAIMING:
274 		return "RECLAIMING";
275 	case VN_RECLAIMED:
276 		return "RECLAIMED";
277 	default:
278 		return "ILLEGAL";
279 	}
280 }
281 
282 #if defined(DIAGNOSTIC)
283 
284 #define VSTATE_GET(vp) \
285 	vstate_assert_get((vp), __func__, __LINE__)
286 #define VSTATE_CHANGE(vp, from, to) \
287 	vstate_assert_change((vp), (from), (to), __func__, __LINE__)
288 #define VSTATE_WAIT_STABLE(vp) \
289 	vstate_assert_wait_stable((vp), __func__, __LINE__)
290 #define VSTATE_ASSERT(vp, state) \
291 	vstate_assert((vp), (state), __func__, __LINE__)
292 
293 static void
294 vstate_assert(vnode_t *vp, enum vcache_state state, const char *func, int line)
295 {
296 	struct vcache_node *node = VP_TO_VN(vp);
297 
298 	KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
299 
300 	if (__predict_true(node->vn_state == state))
301 		return;
302 	vnpanic(vp, "state is %s, expected %s at %s:%d",
303 	    vstate_name(node->vn_state), vstate_name(state), func, line);
304 }
305 
306 static enum vcache_state
307 vstate_assert_get(vnode_t *vp, const char *func, int line)
308 {
309 	struct vcache_node *node = VP_TO_VN(vp);
310 
311 	KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
312 	if (node->vn_state == VN_MARKER)
313 		vnpanic(vp, "state is %s at %s:%d",
314 		    vstate_name(node->vn_state), func, line);
315 
316 	return node->vn_state;
317 }
318 
319 static void
320 vstate_assert_wait_stable(vnode_t *vp, const char *func, int line)
321 {
322 	struct vcache_node *node = VP_TO_VN(vp);
323 
324 	KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
325 	if (node->vn_state == VN_MARKER)
326 		vnpanic(vp, "state is %s at %s:%d",
327 		    vstate_name(node->vn_state), func, line);
328 
329 	while (node->vn_state != VN_ACTIVE && node->vn_state != VN_RECLAIMED)
330 		cv_wait(&vp->v_cv, vp->v_interlock);
331 
332 	if (node->vn_state == VN_MARKER)
333 		vnpanic(vp, "state is %s at %s:%d",
334 		    vstate_name(node->vn_state), func, line);
335 }
336 
337 static void
338 vstate_assert_change(vnode_t *vp, enum vcache_state from, enum vcache_state to,
339     const char *func, int line)
340 {
341 	struct vcache_node *node = VP_TO_VN(vp);
342 
343 	KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
344 	if (from == VN_LOADING)
345 		KASSERTMSG(mutex_owned(&vcache.lock), "at %s:%d", func, line);
346 
347 	if (from == VN_MARKER)
348 		vnpanic(vp, "from is %s at %s:%d",
349 		    vstate_name(from), func, line);
350 	if (to == VN_MARKER)
351 		vnpanic(vp, "to is %s at %s:%d",
352 		    vstate_name(to), func, line);
353 	if (node->vn_state != from)
354 		vnpanic(vp, "from is %s, expected %s at %s:%d\n",
355 		    vstate_name(node->vn_state), vstate_name(from), func, line);
356 
357 	node->vn_state = to;
358 	if (from == VN_LOADING)
359 		cv_broadcast(&vcache.cv);
360 	if (to == VN_ACTIVE || to == VN_RECLAIMED)
361 		cv_broadcast(&vp->v_cv);
362 }
363 
364 #else /* defined(DIAGNOSTIC) */
365 
366 #define VSTATE_GET(vp) \
367 	(VP_TO_VN((vp))->vn_state)
368 #define VSTATE_CHANGE(vp, from, to) \
369 	vstate_change((vp), (from), (to))
370 #define VSTATE_WAIT_STABLE(vp) \
371 	vstate_wait_stable((vp))
372 #define VSTATE_ASSERT(vp, state)
373 
374 static void
375 vstate_wait_stable(vnode_t *vp)
376 {
377 	struct vcache_node *node = VP_TO_VN(vp);
378 
379 	while (node->vn_state != VN_ACTIVE && node->vn_state != VN_RECLAIMED)
380 		cv_wait(&vp->v_cv, vp->v_interlock);
381 }
382 
383 static void
384 vstate_change(vnode_t *vp, enum vcache_state from, enum vcache_state to)
385 {
386 	struct vcache_node *node = VP_TO_VN(vp);
387 
388 	node->vn_state = to;
389 	if (from == VN_LOADING)
390 		cv_broadcast(&vcache.cv);
391 	if (to == VN_ACTIVE || to == VN_RECLAIMED)
392 		cv_broadcast(&vp->v_cv);
393 }
394 
395 #endif /* defined(DIAGNOSTIC) */
396 
397 void
398 vfs_vnode_sysinit(void)
399 {
400 	int error __diagused;
401 
402 	dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL);
403 	KASSERT(dead_rootmount != NULL);
404 	dead_rootmount->mnt_iflag = IMNT_MPSAFE;
405 
406 	mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE);
407 	TAILQ_INIT(&vnode_free_list);
408 	TAILQ_INIT(&vnode_hold_list);
409 	TAILQ_INIT(&vrele_list);
410 
411 	vcache_init();
412 
413 	mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
414 	cv_init(&vdrain_cv, "vdrain");
415 	cv_init(&vrele_cv, "vrele");
416 	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread,
417 	    NULL, NULL, "vdrain");
418 	KASSERTMSG((error == 0), "kthread_create(vdrain) failed: %d", error);
419 	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
420 	    NULL, &vrele_lwp, "vrele");
421 	KASSERTMSG((error == 0), "kthread_create(vrele) failed: %d", error);
422 }
423 
424 /*
425  * Allocate a new marker vnode.
426  */
427 vnode_t *
428 vnalloc_marker(struct mount *mp)
429 {
430 	struct vcache_node *node;
431 	vnode_t *vp;
432 
433 	node = pool_cache_get(vcache.pool, PR_WAITOK);
434 	memset(node, 0, sizeof(*node));
435 	vp = VN_TO_VP(node);
436 	uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
437 	vp->v_mount = mp;
438 	vp->v_type = VBAD;
439 	node->vn_state = VN_MARKER;
440 
441 	return vp;
442 }
443 
444 /*
445  * Free a marker vnode.
446  */
447 void
448 vnfree_marker(vnode_t *vp)
449 {
450 	struct vcache_node *node;
451 
452 	node = VP_TO_VN(vp);
453 	KASSERT(node->vn_state == VN_MARKER);
454 	uvm_obj_destroy(&vp->v_uobj, true);
455 	pool_cache_put(vcache.pool, node);
456 }
457 
458 /*
459  * Test a vnode for being a marker vnode.
460  */
461 bool
462 vnis_marker(vnode_t *vp)
463 {
464 
465 	return (VP_TO_VN(vp)->vn_state == VN_MARKER);
466 }
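/*
 * Illustrative sketch: marker vnodes let a caller keep its place while
 * iterating a vnode list that may change whenever locks are dropped.
 * The list head mp->mnt_vnodelist is real; the loop shape below is an
 * assumption for the example only.
 *
 *	vnode_t *mvp, *vp;
 *
 *	mvp = vnalloc_marker(mp);
 *	...	(insert mvp after vp, drop the list lock, work on vp,
 *		 re-take the lock and continue from mvp)
 *	vnfree_marker(mvp);
 *
 * Real traversals also skip other markers via vnis_marker(vp).
 */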
467 
468 /*
469  * cleanvnode: grab a vnode from the freelist, clean it out and free it.
470  *
471  * => Releases vnode_free_list_lock.
472  */
473 static int
474 cleanvnode(void)
475 {
476 	vnode_t *vp;
477 	vnodelst_t *listhd;
478 	struct mount *mp;
479 
480 	KASSERT(mutex_owned(&vnode_free_list_lock));
481 
482 	listhd = &vnode_free_list;
483 try_nextlist:
484 	TAILQ_FOREACH(vp, listhd, v_freelist) {
485 		/*
486 		 * It's safe to test v_usecount and v_iflag
487 		 * without holding the interlock here, since
488 		 * vnodes with a non-zero v_usecount never
489 		 * appear on these lists.
490 		 */
491 		KASSERT(vp->v_usecount == 0);
492 		KASSERT(vp->v_freelisthd == listhd);
493 
494 		if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
495 			continue;
496 		if (!mutex_tryenter(vp->v_interlock)) {
497 			VOP_UNLOCK(vp);
498 			continue;
499 		}
500 		mp = vp->v_mount;
501 		if (fstrans_start_nowait(mp, FSTRANS_SHARED) != 0) {
502 			mutex_exit(vp->v_interlock);
503 			VOP_UNLOCK(vp);
504 			continue;
505 		}
506 		break;
507 	}
508 
509 	if (vp == NULL) {
510 		if (listhd == &vnode_free_list) {
511 			listhd = &vnode_hold_list;
512 			goto try_nextlist;
513 		}
514 		mutex_exit(&vnode_free_list_lock);
515 		return EBUSY;
516 	}
517 
518 	/* Remove it from the freelist. */
519 	TAILQ_REMOVE(listhd, vp, v_freelist);
520 	vp->v_freelisthd = NULL;
521 	mutex_exit(&vnode_free_list_lock);
522 
523 	KASSERT(vp->v_usecount == 0);
524 
525 	/*
526 	 * The vnode is still associated with a file system, so we must
527 	 * clean it out before freeing it.  We need to add a reference
528 	 * before doing this.
529 	 */
530 	vp->v_usecount = 1;
531 	vcache_reclaim(vp);
532 	vrelel(vp, 0);
533 	fstrans_done(mp);
534 
535 	return 0;
536 }
537 
538 /*
539  * Helper thread to keep the number of vnodes below desiredvnodes.
540  */
541 static void
542 vdrain_thread(void *cookie)
543 {
544 	int error;
545 
546 	mutex_enter(&vnode_free_list_lock);
547 
548 	for (;;) {
549 		cv_timedwait(&vdrain_cv, &vnode_free_list_lock, hz);
550 		while (numvnodes > desiredvnodes) {
551 			error = cleanvnode();
552 			if (error)
553 				kpause("vndsbusy", false, hz, NULL);
554 			mutex_enter(&vnode_free_list_lock);
555 			if (error)
556 				break;
557 		}
558 	}
559 }
560 
561 /*
562  * Remove a vnode from its freelist.
563  */
564 void
565 vremfree(vnode_t *vp)
566 {
567 
568 	KASSERT(mutex_owned(vp->v_interlock));
569 	KASSERT(vp->v_usecount == 0);
570 
571 	/*
572 	 * Note that the reference count must not change until
573 	 * the vnode is removed.
574 	 */
575 	mutex_enter(&vnode_free_list_lock);
576 	if (vp->v_holdcnt > 0) {
577 		KASSERT(vp->v_freelisthd == &vnode_hold_list);
578 	} else {
579 		KASSERT(vp->v_freelisthd == &vnode_free_list);
580 	}
581 	TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
582 	vp->v_freelisthd = NULL;
583 	mutex_exit(&vnode_free_list_lock);
584 }
585 
586 /*
587  * vget: get a particular vnode (removing it from its free list if
588  * necessary), increment its reference count and return it.
589  *
590  * => Must be called with v_interlock held.
591  *
592  * If state is VN_RECLAIMING, the vnode may be eliminated in vcache_reclaim().
593  * In that case, we cannot grab the vnode, so the process is awakened when
594  * the transition is completed, and an error is returned to indicate that the
595  * vnode is no longer usable.
596  *
597  * If state is VN_LOADING or VN_BLOCKED, wait until the vnode enters a
598  * stable state (VN_ACTIVE or VN_RECLAIMED).
599  */
600 int
601 vget(vnode_t *vp, int flags, bool waitok)
602 {
603 
604 	KASSERT(mutex_owned(vp->v_interlock));
605 	KASSERT((flags & ~LK_NOWAIT) == 0);
606 	KASSERT(waitok == ((flags & LK_NOWAIT) == 0));
607 
608 	/*
609 	 * Before adding a reference, we must remove the vnode
610 	 * from its freelist.
611 	 */
612 	if (vp->v_usecount == 0) {
613 		vremfree(vp);
614 		vp->v_usecount = 1;
615 	} else {
616 		atomic_inc_uint(&vp->v_usecount);
617 	}
618 
619 	/*
620 	 * If the vnode is in the process of changing state we wait
621 	 * for the change to complete and take care not to return
622 	 * a clean vnode.
623 	 */
624 	if (! ISSET(flags, LK_NOWAIT))
625 		VSTATE_WAIT_STABLE(vp);
626 	if (VSTATE_GET(vp) == VN_RECLAIMED) {
627 		vrelel(vp, 0);
628 		return ENOENT;
629 	} else if (VSTATE_GET(vp) != VN_ACTIVE) {
630 		KASSERT(ISSET(flags, LK_NOWAIT));
631 		vrelel(vp, 0);
632 		return EBUSY;
633 	}
634 
635 	/*
636 	 * Ok, we got it in good shape.
637 	 */
638 	VSTATE_ASSERT(vp, VN_ACTIVE);
639 	mutex_exit(vp->v_interlock);
640 
641 	return 0;
642 }
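/*
 * Illustrative sketch of the calling convention: v_interlock is held on
 * entry and is released on both the success and the error path.
 *
 *	mutex_enter(vp->v_interlock);
 *	error = vget(vp, 0, true);		(wait for a stable state)
 *	if (error == 0) {
 *		...				(vp is referenced and ACTIVE)
 *		vrele(vp);
 *	}
 *
 * With LK_NOWAIT (and waitok == false) EBUSY may be returned as well.
 */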
643 
644 /*
645  * vput: unlock and release the reference.
646  */
647 void
648 vput(vnode_t *vp)
649 {
650 
651 	VOP_UNLOCK(vp);
652 	vrele(vp);
653 }
654 
655 /*
656  * Try to drop a reference on a vnode.  Abort if we would be releasing
657  * the last reference.  Note: this _must_ succeed if not the last reference.
658  */
659 static inline bool
660 vtryrele(vnode_t *vp)
661 {
662 	u_int use, next;
663 
664 	for (use = vp->v_usecount;; use = next) {
665 		if (use == 1) {
666 			return false;
667 		}
668 		KASSERT(use > 1);
669 		next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
670 		if (__predict_true(next == use)) {
671 			return true;
672 		}
673 	}
674 }
675 
676 /*
677  * Vnode release.  If the reference count drops to zero, call the inactive
678  * routine and either return the vnode to the freelist or free it to the pool.
679  */
680 static void
681 vrelel(vnode_t *vp, int flags)
682 {
683 	bool recycle, defer;
684 	int error;
685 
686 	KASSERT(mutex_owned(vp->v_interlock));
687 	KASSERT(vp->v_freelisthd == NULL);
688 
689 	if (__predict_false(vp->v_op == dead_vnodeop_p &&
690 	    VSTATE_GET(vp) != VN_RECLAIMED)) {
691 		vnpanic(vp, "dead but not clean");
692 	}
693 
694 	/*
695 	 * If not the last reference, just drop the reference count
696 	 * and unlock.
697 	 */
698 	if (vtryrele(vp)) {
699 		mutex_exit(vp->v_interlock);
700 		return;
701 	}
702 	if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
703 		vnpanic(vp, "%s: bad ref count", __func__);
704 	}
705 
706 #ifdef DIAGNOSTIC
707 	if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
708 	    vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
709 		vprint("vrelel: missing VOP_CLOSE()", vp);
710 	}
711 #endif
712 
713 	/*
714 	 * If not clean, deactivate the vnode, but preserve
715 	 * our reference across the call to VOP_INACTIVE().
716 	 */
717 	if (VSTATE_GET(vp) != VN_RECLAIMED) {
718 		recycle = false;
719 
720 		/*
721 		 * XXX This ugly block can be largely eliminated if
722 		 * locking is pushed down into the file systems.
723 		 *
724 		 * Defer vnode release to vrele_thread if caller
725 		 * requests it explicitly or is the pagedaemon.
726 		 */
727 		if ((curlwp == uvm.pagedaemon_lwp) ||
728 		    (flags & VRELEL_ASYNC_RELE) != 0) {
729 			defer = true;
730 		} else if (curlwp == vrele_lwp) {
731 			/*
732 			 * We have to try harder.
733 			 */
734 			mutex_exit(vp->v_interlock);
735 			error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
736 			KASSERTMSG((error == 0), "vn_lock failed: %d", error);
737 			mutex_enter(vp->v_interlock);
738 			defer = false;
739 		} else {
740 			/* If we can't acquire the lock, then defer. */
741 			mutex_exit(vp->v_interlock);
742 			error = vn_lock(vp,
743 			    LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
744 			defer = (error != 0);
745 			mutex_enter(vp->v_interlock);
746 		}
747 
748 		KASSERT(mutex_owned(vp->v_interlock));
749 		KASSERT(! (curlwp == vrele_lwp && defer));
750 
751 		if (defer) {
752 			/*
753 			 * Defer reclaim to the kthread; it's not safe to
754 			 * clean it here.  We donate it our last reference.
755 			 */
756 			mutex_enter(&vrele_lock);
757 			TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist);
758 			if (++vrele_pending > (desiredvnodes >> 8))
759 				cv_signal(&vrele_cv);
760 			mutex_exit(&vrele_lock);
761 			mutex_exit(vp->v_interlock);
762 			return;
763 		}
764 
765 		/*
766 		 * If the node got another reference while we
767 		 * released the interlock, don't try to inactivate it yet.
768 		 */
769 		if (__predict_false(vtryrele(vp))) {
770 			VOP_UNLOCK(vp);
771 			mutex_exit(vp->v_interlock);
772 			return;
773 		}
774 		VSTATE_CHANGE(vp, VN_ACTIVE, VN_BLOCKED);
775 		mutex_exit(vp->v_interlock);
776 
777 		/*
778 		 * The vnode must not gain another reference while being
779 		 * deactivated.  If VOP_INACTIVE() indicates that
780 		 * the described file has been deleted, then recycle
781 		 * the vnode.
782 		 *
783 		 * Note that VOP_INACTIVE() will drop the vnode lock.
784 		 */
785 		VOP_INACTIVE(vp, &recycle);
786 		if (recycle) {
787 			/* vcache_reclaim() below will drop the lock. */
788 			if (vn_lock(vp, LK_EXCLUSIVE) != 0)
789 				recycle = false;
790 		}
791 		mutex_enter(vp->v_interlock);
792 		VSTATE_CHANGE(vp, VN_BLOCKED, VN_ACTIVE);
793 		if (!recycle) {
794 			if (vtryrele(vp)) {
795 				mutex_exit(vp->v_interlock);
796 				return;
797 			}
798 		}
799 
800 		/* Take care of space accounting. */
801 		if (vp->v_iflag & VI_EXECMAP) {
802 			atomic_add_int(&uvmexp.execpages,
803 			    -vp->v_uobj.uo_npages);
804 			atomic_add_int(&uvmexp.filepages,
805 			    vp->v_uobj.uo_npages);
806 		}
807 		vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
808 		vp->v_vflag &= ~VV_MAPPED;
809 
810 		/*
811 		 * Recycle the vnode if the file is now unused (unlinked),
812 		 * otherwise just free it.
813 		 */
814 		if (recycle) {
815 			VSTATE_ASSERT(vp, VN_ACTIVE);
816 			vcache_reclaim(vp);
817 		}
818 		KASSERT(vp->v_usecount > 0);
819 	}
820 
821 	if (atomic_dec_uint_nv(&vp->v_usecount) != 0) {
822 		/* Gained another reference while being reclaimed. */
823 		mutex_exit(vp->v_interlock);
824 		return;
825 	}
826 
827 	if (VSTATE_GET(vp) == VN_RECLAIMED) {
828 		/*
829 		 * It's clean so destroy it.  It isn't referenced
830 		 * anywhere since it has been reclaimed.
831 		 */
832 		KASSERT(vp->v_holdcnt == 0);
833 		KASSERT(vp->v_writecount == 0);
834 		mutex_exit(vp->v_interlock);
835 		vfs_insmntque(vp, NULL);
836 		if (vp->v_type == VBLK || vp->v_type == VCHR) {
837 			spec_node_destroy(vp);
838 		}
839 		vcache_free(VP_TO_VN(vp));
840 	} else {
841 		/*
842 		 * Otherwise, put it back onto the freelist.  It
843 		 * can't be destroyed while still associated with
844 		 * a file system.
845 		 */
846 		mutex_enter(&vnode_free_list_lock);
847 		if (vp->v_holdcnt > 0) {
848 			vp->v_freelisthd = &vnode_hold_list;
849 		} else {
850 			vp->v_freelisthd = &vnode_free_list;
851 		}
852 		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
853 		mutex_exit(&vnode_free_list_lock);
854 		mutex_exit(vp->v_interlock);
855 	}
856 }
857 
858 void
859 vrele(vnode_t *vp)
860 {
861 
862 	if (vtryrele(vp)) {
863 		return;
864 	}
865 	mutex_enter(vp->v_interlock);
866 	vrelel(vp, 0);
867 }
868 
869 /*
870  * Asynchronous vnode release: the vnode is released in a different context.
871  */
872 void
873 vrele_async(vnode_t *vp)
874 {
875 
876 	if (vtryrele(vp)) {
877 		return;
878 	}
879 	mutex_enter(vp->v_interlock);
880 	vrelel(vp, VRELEL_ASYNC_RELE);
881 }
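/*
 * Illustrative sketch: contexts that must not sleep taking the vnode lock
 * (for example the pagedaemon) hand the final release to the vrele thread:
 *
 *	vrele_async(vp);	(returns without the vnode lock; vrele_thread
 *				 calls vrelel() and VOP_INACTIVE(9) later)
 */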
882 
883 static void
884 vrele_thread(void *cookie)
885 {
886 	vnodelst_t skip_list;
887 	vnode_t *vp;
888 	struct mount *mp;
889 
890 	TAILQ_INIT(&skip_list);
891 
892 	mutex_enter(&vrele_lock);
893 	for (;;) {
894 		while (TAILQ_EMPTY(&vrele_list)) {
895 			vrele_gen++;
896 			cv_broadcast(&vrele_cv);
897 			cv_timedwait(&vrele_cv, &vrele_lock, hz);
898 			TAILQ_CONCAT(&vrele_list, &skip_list, v_freelist);
899 		}
900 		vp = TAILQ_FIRST(&vrele_list);
901 		mp = vp->v_mount;
902 		TAILQ_REMOVE(&vrele_list, vp, v_freelist);
903 		if (fstrans_start_nowait(mp, FSTRANS_LAZY) != 0) {
904 			TAILQ_INSERT_TAIL(&skip_list, vp, v_freelist);
905 			continue;
906 		}
907 		vrele_pending--;
908 		mutex_exit(&vrele_lock);
909 
910 		/*
911 		 * If not the last reference, then ignore the vnode
912 		 * and look for more work.
913 		 */
914 		mutex_enter(vp->v_interlock);
915 		vrelel(vp, 0);
916 		fstrans_done(mp);
917 		mutex_enter(&vrele_lock);
918 	}
919 }
920 
921 void
922 vrele_flush(void)
923 {
924 	int gen;
925 
926 	mutex_enter(&vrele_lock);
927 	gen = vrele_gen;
928 	while (vrele_pending && gen == vrele_gen) {
929 		cv_broadcast(&vrele_cv);
930 		cv_wait(&vrele_cv, &vrele_lock);
931 	}
932 	mutex_exit(&vrele_lock);
933 }
934 
935 /*
936  * Vnode reference, where a reference is already held by some other
937  * object (for example, a file structure).
938  */
939 void
940 vref(vnode_t *vp)
941 {
942 
943 	KASSERT(vp->v_usecount != 0);
944 
945 	atomic_inc_uint(&vp->v_usecount);
946 }
947 
948 /*
949  * Page or buffer structure gets a reference.
950  * Called with v_interlock held.
951  */
952 void
953 vholdl(vnode_t *vp)
954 {
955 
956 	KASSERT(mutex_owned(vp->v_interlock));
957 
958 	if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) {
959 		mutex_enter(&vnode_free_list_lock);
960 		KASSERT(vp->v_freelisthd == &vnode_free_list);
961 		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
962 		vp->v_freelisthd = &vnode_hold_list;
963 		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
964 		mutex_exit(&vnode_free_list_lock);
965 	}
966 }
967 
968 /*
969  * Page or buffer structure frees a reference.
970  * Called with v_interlock held.
971  */
972 void
973 holdrelel(vnode_t *vp)
974 {
975 
976 	KASSERT(mutex_owned(vp->v_interlock));
977 
978 	if (vp->v_holdcnt <= 0) {
979 		vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
980 	}
981 
982 	vp->v_holdcnt--;
983 	if (vp->v_holdcnt == 0 && vp->v_usecount == 0) {
984 		mutex_enter(&vnode_free_list_lock);
985 		KASSERT(vp->v_freelisthd == &vnode_hold_list);
986 		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
987 		vp->v_freelisthd = &vnode_free_list;
988 		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
989 		mutex_exit(&vnode_free_list_lock);
990 	}
991 }
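/*
 * Illustrative sketch: buffer and page handling code keeps a hold count
 * while cached data stays attached to the vnode, which moves an otherwise
 * unreferenced vnode between the free and hold lists:
 *
 *	mutex_enter(vp->v_interlock);
 *	vholdl(vp);			(e.g. when a buffer is attached)
 *	mutex_exit(vp->v_interlock);
 *	...
 *	mutex_enter(vp->v_interlock);
 *	holdrelel(vp);			(when the buffer goes away)
 *	mutex_exit(vp->v_interlock);
 */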
992 
993 /*
994  * Recycle an unused vnode if caller holds the last reference.
995  */
996 bool
997 vrecycle(vnode_t *vp)
998 {
999 
1000 	if (vn_lock(vp, LK_EXCLUSIVE) != 0)
1001 		return false;
1002 
1003 	mutex_enter(vp->v_interlock);
1004 
1005 	if (vp->v_usecount != 1) {
1006 		mutex_exit(vp->v_interlock);
1007 		VOP_UNLOCK(vp);
1008 		return false;
1009 	}
1010 	vcache_reclaim(vp);
1011 	vrelel(vp, 0);
1012 	return true;
1013 }
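/*
 * Illustrative sketch: a caller holding what it expects to be the last
 * reference can try to reclaim the vnode right away:
 *
 *	if (vrecycle(vp))
 *		return;		(vp was reclaimed; the reference is gone)
 *	vrele(vp);		(still in use elsewhere; just release it)
 */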
1014 
1015 /*
1016  * Eliminate all activity associated with the requested vnode
1017  * and with all vnodes aliased to the requested vnode.
1018  */
1019 void
1020 vrevoke(vnode_t *vp)
1021 {
1022 	vnode_t *vq;
1023 	enum vtype type;
1024 	dev_t dev;
1025 
1026 	KASSERT(vp->v_usecount > 0);
1027 
1028 	mutex_enter(vp->v_interlock);
1029 	VSTATE_WAIT_STABLE(vp);
1030 	if (VSTATE_GET(vp) == VN_RECLAIMED) {
1031 		mutex_exit(vp->v_interlock);
1032 		return;
1033 	} else if (vp->v_type != VBLK && vp->v_type != VCHR) {
1034 		atomic_inc_uint(&vp->v_usecount);
1035 		mutex_exit(vp->v_interlock);
1036 		vgone(vp);
1037 		return;
1038 	} else {
1039 		dev = vp->v_rdev;
1040 		type = vp->v_type;
1041 		mutex_exit(vp->v_interlock);
1042 	}
1043 
1044 	while (spec_node_lookup_by_dev(type, dev, &vq) == 0) {
1045 		vgone(vq);
1046 	}
1047 }
1048 
1049 /*
1050  * Eliminate all activity associated with a vnode in preparation for
1051  * reuse.  Drops a reference from the vnode.
1052  */
1053 void
1054 vgone(vnode_t *vp)
1055 {
1056 
1057 	if (vn_lock(vp, LK_EXCLUSIVE) != 0) {
1058 		VSTATE_ASSERT(vp, VN_RECLAIMED);
1059 		vrele(vp);
		return;
1060 	}
1061 
1062 	mutex_enter(vp->v_interlock);
1063 	vcache_reclaim(vp);
1064 	vrelel(vp, 0);
1065 }
1066 
1067 static inline uint32_t
1068 vcache_hash(const struct vcache_key *key)
1069 {
1070 	uint32_t hash = HASH32_BUF_INIT;
1071 
1072 	hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash);
1073 	hash = hash32_buf(key->vk_key, key->vk_key_len, hash);
1074 	return hash;
1075 }
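/*
 * Illustrative example: for a file system keyed by a 32-bit inode number
 * (an assumption; each file system chooses its own key format), the hash
 * chain for inode "ino" on mount "mp" is found as:
 *
 *	struct vcache_key k = {
 *		.vk_mount = mp, .vk_key = &ino, .vk_key_len = sizeof(ino)
 *	};
 *	struct hashhead *hp =
 *	    &vcache.hashtab[vcache_hash(&k) & vcache.hashmask];
 */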
1076 
1077 static void
1078 vcache_init(void)
1079 {
1080 
1081 	vcache.pool = pool_cache_init(sizeof(struct vcache_node), 0, 0, 0,
1082 	    "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL);
1083 	KASSERT(vcache.pool != NULL);
1084 	mutex_init(&vcache.lock, MUTEX_DEFAULT, IPL_NONE);
1085 	cv_init(&vcache.cv, "vcache");
1086 	vcache.hashtab = hashinit(desiredvnodes, HASH_SLIST, true,
1087 	    &vcache.hashmask);
1088 }
1089 
1090 static void
1091 vcache_reinit(void)
1092 {
1093 	int i;
1094 	uint32_t hash;
1095 	u_long oldmask, newmask;
1096 	struct hashhead *oldtab, *newtab;
1097 	struct vcache_node *node;
1098 
1099 	newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask);
1100 	mutex_enter(&vcache.lock);
1101 	oldtab = vcache.hashtab;
1102 	oldmask = vcache.hashmask;
1103 	vcache.hashtab = newtab;
1104 	vcache.hashmask = newmask;
1105 	for (i = 0; i <= oldmask; i++) {
1106 		while ((node = SLIST_FIRST(&oldtab[i])) != NULL) {
1107 			SLIST_REMOVE(&oldtab[i], node, vcache_node, vn_hash);
1108 			hash = vcache_hash(&node->vn_key);
1109 			SLIST_INSERT_HEAD(&newtab[hash & vcache.hashmask],
1110 			    node, vn_hash);
1111 		}
1112 	}
1113 	mutex_exit(&vcache.lock);
1114 	hashdone(oldtab, HASH_SLIST, oldmask);
1115 }
1116 
1117 static inline struct vcache_node *
1118 vcache_hash_lookup(const struct vcache_key *key, uint32_t hash)
1119 {
1120 	struct hashhead *hashp;
1121 	struct vcache_node *node;
1122 
1123 	KASSERT(mutex_owned(&vcache.lock));
1124 
1125 	hashp = &vcache.hashtab[hash & vcache.hashmask];
1126 	SLIST_FOREACH(node, hashp, vn_hash) {
1127 		if (key->vk_mount != node->vn_key.vk_mount)
1128 			continue;
1129 		if (key->vk_key_len != node->vn_key.vk_key_len)
1130 			continue;
1131 		if (memcmp(key->vk_key, node->vn_key.vk_key, key->vk_key_len))
1132 			continue;
1133 		return node;
1134 	}
1135 	return NULL;
1136 }
1137 
1138 /*
1139  * Allocate a new vcache node, set up in the LOADING state with one reference.
1140  */
1141 static struct vcache_node *
1142 vcache_alloc(void)
1143 {
1144 	struct vcache_node *node;
1145 	vnode_t *vp;
1146 
1147 	node = pool_cache_get(vcache.pool, PR_WAITOK);
1148 	memset(node, 0, sizeof(*node));
1149 
1150 	/* SLIST_INIT(&node->vn_hash); */
1151 
1152 	vp = VN_TO_VP(node);
1153 	uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
1154 	cv_init(&vp->v_cv, "vnode");
1155 	/* LIST_INIT(&vp->v_nclist); */
1156 	/* LIST_INIT(&vp->v_dnclist); */
1157 
1158 	mutex_enter(&vnode_free_list_lock);
1159 	numvnodes++;
1160 	if (numvnodes > desiredvnodes + desiredvnodes / 10)
1161 		cv_signal(&vdrain_cv);
1162 	mutex_exit(&vnode_free_list_lock);
1163 
1164 	rw_init(&vp->v_lock);
1165 	vp->v_usecount = 1;
1166 	vp->v_type = VNON;
1167 	vp->v_size = vp->v_writesize = VSIZENOTSET;
1168 
1169 	node->vn_state = VN_LOADING;
1170 
1171 	return node;
1172 }
1173 
1174 /*
1175  * Free an unused, unreferenced vcache node.
1176  */
1177 static void
1178 vcache_free(struct vcache_node *node)
1179 {
1180 	vnode_t *vp;
1181 
1182 	vp = VN_TO_VP(node);
1183 
1184 	KASSERT(vp->v_usecount == 0);
1185 
1186 	rw_destroy(&vp->v_lock);
1187 	mutex_enter(&vnode_free_list_lock);
1188 	numvnodes--;
1189 	mutex_exit(&vnode_free_list_lock);
1190 
1191 	uvm_obj_destroy(&vp->v_uobj, true);
1192 	cv_destroy(&vp->v_cv);
1193 	pool_cache_put(vcache.pool, node);
1194 }
1195 
1196 /*
1197  * Get a vnode / fs node pair by key and return it referenced through vpp.
1198  */
1199 int
1200 vcache_get(struct mount *mp, const void *key, size_t key_len,
1201     struct vnode **vpp)
1202 {
1203 	int error;
1204 	uint32_t hash;
1205 	const void *new_key;
1206 	struct vnode *vp;
1207 	struct vcache_key vcache_key;
1208 	struct vcache_node *node, *new_node;
1209 
1210 	new_key = NULL;
1211 	*vpp = NULL;
1212 
1213 	vcache_key.vk_mount = mp;
1214 	vcache_key.vk_key = key;
1215 	vcache_key.vk_key_len = key_len;
1216 	hash = vcache_hash(&vcache_key);
1217 
1218 again:
1219 	mutex_enter(&vcache.lock);
1220 	node = vcache_hash_lookup(&vcache_key, hash);
1221 
1222 	/* If found, take a reference or retry. */
1223 	if (__predict_true(node != NULL)) {
1224 		/*
1225 		 * If the vnode is loading we cannot take the v_interlock
1226 		 * here as it might change during load (see uvm_obj_setlock()).
1227 		 * As changing state from VN_LOADING requires both vcache.lock
1228 		 * and v_interlock it is safe to test with vcache.lock held.
1229 		 *
1230 		 * Wait for vnodes changing state from VN_LOADING and retry.
1231 		 */
1232 		if (__predict_false(node->vn_state == VN_LOADING)) {
1233 			cv_wait(&vcache.cv, &vcache.lock);
1234 			mutex_exit(&vcache.lock);
1235 			goto again;
1236 		}
1237 		vp = VN_TO_VP(node);
1238 		mutex_enter(vp->v_interlock);
1239 		mutex_exit(&vcache.lock);
1240 		error = vget(vp, 0, true /* wait */);
1241 		if (error == ENOENT)
1242 			goto again;
1243 		if (error == 0)
1244 			*vpp = vp;
1245 		KASSERT((error != 0) == (*vpp == NULL));
1246 		return error;
1247 	}
1248 	mutex_exit(&vcache.lock);
1249 
1250 	/* Allocate and initialize a new vcache / vnode pair. */
1251 	error = vfs_busy(mp, NULL);
1252 	if (error)
1253 		return error;
1254 	new_node = vcache_alloc();
1255 	new_node->vn_key = vcache_key;
1256 	vp = VN_TO_VP(new_node);
1257 	mutex_enter(&vcache.lock);
1258 	node = vcache_hash_lookup(&vcache_key, hash);
1259 	if (node == NULL) {
1260 		SLIST_INSERT_HEAD(&vcache.hashtab[hash & vcache.hashmask],
1261 		    new_node, vn_hash);
1262 		node = new_node;
1263 	}
1264 
1265 	/* If another thread beat us inserting this node, retry. */
1266 	if (node != new_node) {
1267 		mutex_enter(vp->v_interlock);
1268 		VSTATE_CHANGE(vp, VN_LOADING, VN_RECLAIMED);
1269 		mutex_exit(&vcache.lock);
1270 		vrelel(vp, 0);
1271 		vfs_unbusy(mp, false, NULL);
1272 		goto again;
1273 	}
1274 	mutex_exit(&vcache.lock);
1275 
1276 	/* Load the fs node.  Exclusive as new_node is VN_LOADING. */
1277 	error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key);
1278 	if (error) {
1279 		mutex_enter(&vcache.lock);
1280 		SLIST_REMOVE(&vcache.hashtab[hash & vcache.hashmask],
1281 		    new_node, vcache_node, vn_hash);
1282 		mutex_enter(vp->v_interlock);
1283 		VSTATE_CHANGE(vp, VN_LOADING, VN_RECLAIMED);
1284 		mutex_exit(&vcache.lock);
1285 		vrelel(vp, 0);
1286 		vfs_unbusy(mp, false, NULL);
1287 		KASSERT(*vpp == NULL);
1288 		return error;
1289 	}
1290 	KASSERT(new_key != NULL);
1291 	KASSERT(memcmp(key, new_key, key_len) == 0);
1292 	KASSERT(vp->v_op != NULL);
1293 	vfs_insmntque(vp, mp);
1294 	if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
1295 		vp->v_vflag |= VV_MPSAFE;
1296 	vfs_unbusy(mp, true, NULL);
1297 
1298 	/* Finished loading, finalize node. */
1299 	mutex_enter(&vcache.lock);
1300 	new_node->vn_key.vk_key = new_key;
1301 	mutex_enter(vp->v_interlock);
1302 	VSTATE_CHANGE(vp, VN_LOADING, VN_ACTIVE);
1303 	mutex_exit(vp->v_interlock);
1304 	mutex_exit(&vcache.lock);
1305 	*vpp = vp;
1306 	return 0;
1307 }
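/*
 * Illustrative sketch of the VFS_LOADVNODE() contract used above: the
 * file system initialises the vnode and returns, through new_key, a
 * pointer to a key copy that stays valid for the vnode's lifetime.  The
 * "myfs_" names are assumptions for the example only.
 *
 *	static int
 *	myfs_loadvnode(struct mount *mp, struct vnode *vp,
 *	    const void *key, size_t key_len, const void **new_key)
 *	{
 *		struct myfs_node *np;
 *
 *		np = ...;			(read the on-disk node)
 *		memcpy(&np->n_ino, key, key_len);
 *		vp->v_data = np;
 *		vp->v_op = myfs_vnodeop_p;
 *		vp->v_type = ...;
 *		uvm_vnp_setsize(vp, np->n_size);
 *		*new_key = &np->n_ino;
 *		return 0;
 *	}
 */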
1308 
1309 /*
1310  * Create a new vnode / fs node pair and return it referenced through vpp.
1311  */
1312 int
1313 vcache_new(struct mount *mp, struct vnode *dvp, struct vattr *vap,
1314     kauth_cred_t cred, struct vnode **vpp)
1315 {
1316 	int error;
1317 	uint32_t hash;
1318 	struct vnode *ovp, *vp;
1319 	struct vcache_node *new_node;
1320 	struct vcache_node *old_node __diagused;
1321 
1322 	*vpp = NULL;
1323 
1324 	/* Allocate and initialize a new vcache / vnode pair. */
1325 	error = vfs_busy(mp, NULL);
1326 	if (error)
1327 		return error;
1328 	new_node = vcache_alloc();
1329 	new_node->vn_key.vk_mount = mp;
1330 	vp = VN_TO_VP(new_node);
1331 
1332 	/* Create and load the fs node. */
1333 	error = VFS_NEWVNODE(mp, dvp, vp, vap, cred,
1334 	    &new_node->vn_key.vk_key_len, &new_node->vn_key.vk_key);
1335 	if (error) {
1336 		mutex_enter(&vcache.lock);
1337 		mutex_enter(vp->v_interlock);
1338 		VSTATE_CHANGE(vp, VN_LOADING, VN_RECLAIMED);
1339 		mutex_exit(&vcache.lock);
1340 		vrelel(vp, 0);
1341 		vfs_unbusy(mp, false, NULL);
1342 		KASSERT(*vpp == NULL);
1343 		return error;
1344 	}
1345 	KASSERT(new_node->vn_key.vk_key != NULL);
1346 	KASSERT(vp->v_op != NULL);
1347 	hash = vcache_hash(&new_node->vn_key);
1348 
1349 	/* Wait for previous instance to be reclaimed, then insert new node. */
1350 	mutex_enter(&vcache.lock);
1351 	while ((old_node = vcache_hash_lookup(&new_node->vn_key, hash))) {
1352 		ovp = VN_TO_VP(old_node);
1353 		mutex_enter(ovp->v_interlock);
1354 		mutex_exit(&vcache.lock);
1355 		error = vget(ovp, 0, true /* wait */);
1356 		KASSERT(error == ENOENT);
1357 		mutex_enter(&vcache.lock);
1358 	}
1359 	SLIST_INSERT_HEAD(&vcache.hashtab[hash & vcache.hashmask],
1360 	    new_node, vn_hash);
1361 	mutex_exit(&vcache.lock);
1362 	vfs_insmntque(vp, mp);
1363 	if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
1364 		vp->v_vflag |= VV_MPSAFE;
1365 	vfs_unbusy(mp, true, NULL);
1366 
1367 	/* Finished loading, finalize node. */
1368 	mutex_enter(&vcache.lock);
1369 	mutex_enter(vp->v_interlock);
1370 	VSTATE_CHANGE(vp, VN_LOADING, VN_ACTIVE);
1371 	mutex_exit(&vcache.lock);
1372 	mutex_exit(vp->v_interlock);
1373 	*vpp = vp;
1374 	return 0;
1375 }
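/*
 * Illustrative sketch: VOP_CREATE(9) style usage, where the key is not
 * known until the file system has allocated the new node, hence
 * VFS_NEWVNODE() rather than VFS_LOADVNODE().  Names are assumptions.
 *
 *	error = vcache_new(dvp->v_mount, dvp, vap, cred, &vp);
 *	if (error)
 *		return error;
 *	...			(enter the new node into the directory
 *				 and return vp referenced to the caller)
 */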
1376 
1377 /*
1378  * Prepare key change: lock old and new cache node.
1379  * Return an error if the new node already exists.
1380  */
1381 int
1382 vcache_rekey_enter(struct mount *mp, struct vnode *vp,
1383     const void *old_key, size_t old_key_len,
1384     const void *new_key, size_t new_key_len)
1385 {
1386 	uint32_t old_hash, new_hash;
1387 	struct vcache_key old_vcache_key, new_vcache_key;
1388 	struct vcache_node *node, *new_node;
1389 	struct vnode *tvp;
1390 
1391 	old_vcache_key.vk_mount = mp;
1392 	old_vcache_key.vk_key = old_key;
1393 	old_vcache_key.vk_key_len = old_key_len;
1394 	old_hash = vcache_hash(&old_vcache_key);
1395 
1396 	new_vcache_key.vk_mount = mp;
1397 	new_vcache_key.vk_key = new_key;
1398 	new_vcache_key.vk_key_len = new_key_len;
1399 	new_hash = vcache_hash(&new_vcache_key);
1400 
1401 	new_node = vcache_alloc();
1402 	new_node->vn_key = new_vcache_key;
1403 	tvp = VN_TO_VP(new_node);
1404 
1405 	/* Insert locked new node used as placeholder. */
1406 	mutex_enter(&vcache.lock);
1407 	node = vcache_hash_lookup(&new_vcache_key, new_hash);
1408 	if (node != NULL) {
1409 		mutex_enter(tvp->v_interlock);
1410 		VSTATE_CHANGE(tvp, VN_LOADING, VN_RECLAIMED);
1411 		mutex_exit(&vcache.lock);
1412 		vrelel(tvp, 0);
1413 		return EEXIST;
1414 	}
1415 	SLIST_INSERT_HEAD(&vcache.hashtab[new_hash & vcache.hashmask],
1416 	    new_node, vn_hash);
1417 
1418 	/* Lock old node. */
1419 	node = vcache_hash_lookup(&old_vcache_key, old_hash);
1420 	KASSERT(node != NULL);
1421 	KASSERT(VN_TO_VP(node) == vp);
1422 	mutex_enter(vp->v_interlock);
1423 	VSTATE_CHANGE(vp, VN_ACTIVE, VN_BLOCKED);
1424 	node->vn_key = old_vcache_key;
1425 	mutex_exit(vp->v_interlock);
1426 	mutex_exit(&vcache.lock);
1427 	return 0;
1428 }
1429 
1430 /*
1431  * Key change complete: remove old node and unlock new node.
1432  */
1433 void
1434 vcache_rekey_exit(struct mount *mp, struct vnode *vp,
1435     const void *old_key, size_t old_key_len,
1436     const void *new_key, size_t new_key_len)
1437 {
1438 	uint32_t old_hash, new_hash;
1439 	struct vcache_key old_vcache_key, new_vcache_key;
1440 	struct vcache_node *old_node, *new_node;
1441 	struct vnode *tvp;
1442 
1443 	old_vcache_key.vk_mount = mp;
1444 	old_vcache_key.vk_key = old_key;
1445 	old_vcache_key.vk_key_len = old_key_len;
1446 	old_hash = vcache_hash(&old_vcache_key);
1447 
1448 	new_vcache_key.vk_mount = mp;
1449 	new_vcache_key.vk_key = new_key;
1450 	new_vcache_key.vk_key_len = new_key_len;
1451 	new_hash = vcache_hash(&new_vcache_key);
1452 
1453 	mutex_enter(&vcache.lock);
1454 
1455 	/* Lookup old and new node. */
1456 	old_node = vcache_hash_lookup(&old_vcache_key, old_hash);
1457 	KASSERT(old_node != NULL);
1458 	KASSERT(VN_TO_VP(old_node) == vp);
1459 	mutex_enter(vp->v_interlock);
1460 	VSTATE_ASSERT(vp, VN_BLOCKED);
1461 
1462 	new_node = vcache_hash_lookup(&new_vcache_key, new_hash);
1463 	KASSERT(new_node != NULL);
1464 	KASSERT(new_node->vn_key.vk_key_len == new_key_len);
1465 	tvp = VN_TO_VP(new_node);
1466 	mutex_enter(tvp->v_interlock);
1467 	VSTATE_ASSERT(VN_TO_VP(new_node), VN_LOADING);
1468 
1469 	/* Rekey old node and put it onto its new hashlist. */
1470 	old_node->vn_key = new_vcache_key;
1471 	if (old_hash != new_hash) {
1472 		SLIST_REMOVE(&vcache.hashtab[old_hash & vcache.hashmask],
1473 		    old_node, vcache_node, vn_hash);
1474 		SLIST_INSERT_HEAD(&vcache.hashtab[new_hash & vcache.hashmask],
1475 		    old_node, vn_hash);
1476 	}
1477 	VSTATE_CHANGE(vp, VN_BLOCKED, VN_ACTIVE);
1478 	mutex_exit(vp->v_interlock);
1479 
1480 	/* Remove new node used as placeholder. */
1481 	SLIST_REMOVE(&vcache.hashtab[new_hash & vcache.hashmask],
1482 	    new_node, vcache_node, vn_hash);
1483 	VSTATE_CHANGE(tvp, VN_LOADING, VN_RECLAIMED);
1484 	mutex_exit(&vcache.lock);
1485 	vrelel(tvp, 0);
1486 }
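/*
 * Illustrative sketch of the rekey protocol implemented by the two
 * functions above, e.g. for a file system whose key changes on rename:
 *
 *	error = vcache_rekey_enter(mp, vp, &old_key, sizeof(old_key),
 *	    &new_key, sizeof(new_key));
 *	if (error)		(EEXIST: the new key is already cached)
 *		return error;
 *	...			(change the fs node's identity; vp stays
 *				 VN_BLOCKED until the exit call)
 *	vcache_rekey_exit(mp, vp, &old_key, sizeof(old_key),
 *	    &new_key, sizeof(new_key));
 */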
1487 
1488 /*
1489  * Disassociate the underlying file system from a vnode.
1490  *
1491  * Must be called with vnode locked and will return unlocked.
1492  * Must be called with the interlock held, and will return with it held.
1493  */
1494 static void
1495 vcache_reclaim(vnode_t *vp)
1496 {
1497 	lwp_t *l = curlwp;
1498 	struct vcache_node *node = VP_TO_VN(vp);
1499 	uint32_t hash;
1500 	uint8_t temp_buf[64], *temp_key;
1501 	size_t temp_key_len;
1502 	bool recycle, active;
1503 	int error;
1504 
1505 	KASSERT((vp->v_vflag & VV_LOCKSWORK) == 0 ||
1506 	    VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
1507 	KASSERT(mutex_owned(vp->v_interlock));
1508 	KASSERT(vp->v_usecount != 0);
1509 
1510 	active = (vp->v_usecount > 1);
1511 	temp_key_len = node->vn_key.vk_key_len;
1512 	/*
1513 	 * Prevent the vnode from being recycled or brought into use
1514 	 * while we clean it out.
1515 	 */
1516 	VSTATE_CHANGE(vp, VN_ACTIVE, VN_RECLAIMING);
1517 	if (vp->v_iflag & VI_EXECMAP) {
1518 		atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
1519 		atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
1520 	}
1521 	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
1522 	mutex_exit(vp->v_interlock);
1523 
1524 	/* Replace the vnode key with a temporary copy. */
1525 	if (node->vn_key.vk_key_len > sizeof(temp_buf)) {
1526 		temp_key = kmem_alloc(temp_key_len, KM_SLEEP);
1527 	} else {
1528 		temp_key = temp_buf;
1529 	}
1530 	mutex_enter(&vcache.lock);
1531 	memcpy(temp_key, node->vn_key.vk_key, temp_key_len);
1532 	node->vn_key.vk_key = temp_key;
1533 	mutex_exit(&vcache.lock);
1534 
1535 	/*
1536 	 * Clean out any cached data associated with the vnode.
1537 	 * If purging an active vnode, it must be closed and
1538 	 * deactivated before being reclaimed. Note that the
1539 	 * VOP_INACTIVE will unlock the vnode.
1540 	 */
1541 	error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
1542 	if (error != 0) {
1543 		if (wapbl_vphaswapbl(vp))
1544 			WAPBL_DISCARD(wapbl_vptomp(vp));
1545 		error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
1546 	}
1547 	KASSERTMSG((error == 0), "vinvalbuf failed: %d", error);
1548 	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1549 	if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
1550 		 spec_node_revoke(vp);
1551 	}
1552 	if (active) {
1553 		VOP_INACTIVE(vp, &recycle);
1554 	} else {
1555 		/*
1556 		 * Any other processes trying to obtain this lock must first
1557 		 * wait for VN_RECLAIMED, then call the new lock operation.
1558 		 */
1559 		VOP_UNLOCK(vp);
1560 	}
1561 
1562 	/* Disassociate the underlying file system from the vnode. */
1563 	if (VOP_RECLAIM(vp)) {
1564 		vnpanic(vp, "%s: cannot reclaim", __func__);
1565 	}
1566 
1567 	KASSERT(vp->v_data == NULL);
1568 	KASSERT(vp->v_uobj.uo_npages == 0);
1569 
1570 	if (vp->v_type == VREG && vp->v_ractx != NULL) {
1571 		uvm_ra_freectx(vp->v_ractx);
1572 		vp->v_ractx = NULL;
1573 	}
1574 
1575 	/* Purge name cache. */
1576 	cache_purge(vp);
1577 
1578 	/* Move to dead mount. */
1579 	vp->v_vflag &= ~VV_ROOT;
1580 	atomic_inc_uint(&dead_rootmount->mnt_refcnt);
1581 	vfs_insmntque(vp, dead_rootmount);
1582 
1583 	/* Remove from vnode cache. */
1584 	hash = vcache_hash(&node->vn_key);
1585 	mutex_enter(&vcache.lock);
1586 	KASSERT(node == vcache_hash_lookup(&node->vn_key, hash));
1587 	SLIST_REMOVE(&vcache.hashtab[hash & vcache.hashmask],
1588 	    node, vcache_node, vn_hash);
1589 	mutex_exit(&vcache.lock);
1590 	if (temp_key != temp_buf)
1591 		kmem_free(temp_key, temp_key_len);
1592 
1593 	/* Done with purge, notify sleepers of the grim news. */
1594 	mutex_enter(vp->v_interlock);
1595 	vp->v_op = dead_vnodeop_p;
1596 	vp->v_vflag |= VV_LOCKSWORK;
1597 	VSTATE_CHANGE(vp, VN_RECLAIMING, VN_RECLAIMED);
1598 	vp->v_tag = VT_NON;
1599 	KNOTE(&vp->v_klist, NOTE_REVOKE);
1600 
1601 	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1602 }
1603 
1604 /*
1605  * Print a vcache node.
1606  */
1607 void
1608 vcache_print(vnode_t *vp, const char *prefix, void (*pr)(const char *, ...))
1609 {
1610 	int n;
1611 	const uint8_t *cp;
1612 	struct vcache_node *node;
1613 
1614 	node = VP_TO_VN(vp);
1615 	n = node->vn_key.vk_key_len;
1616 	cp = node->vn_key.vk_key;
1617 
1618 	(*pr)("%sstate %s, key(%d)", prefix, vstate_name(node->vn_state), n);
1619 
1620 	while (n-- > 0)
1621 		(*pr)(" %02x", *cp++);
1622 	(*pr)("\n");
1623 }
1624 
1625 /*
1626  * Update outstanding I/O count and do wakeup if requested.
1627  */
1628 void
1629 vwakeup(struct buf *bp)
1630 {
1631 	vnode_t *vp;
1632 
1633 	if ((vp = bp->b_vp) == NULL)
1634 		return;
1635 
1636 	KASSERT(bp->b_objlock == vp->v_interlock);
1637 	KASSERT(mutex_owned(bp->b_objlock));
1638 
1639 	if (--vp->v_numoutput < 0)
1640 		vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp);
1641 	if (vp->v_numoutput == 0)
1642 		cv_broadcast(&vp->v_cv);
1643 }
1644 
1645 /*
1646  * Test a vnode for being or becoming dead.  Returns one of:
1647  * EBUSY:  vnode is becoming dead, with "flags == VDEAD_NOWAIT" only.
1648  * ENOENT: vnode is dead.
1649  * 0:      otherwise.
1650  *
1651  * Whenever this function returns a non-zero value all future
1652  * calls will also return a non-zero value.
1653  */
1654 int
1655 vdead_check(struct vnode *vp, int flags)
1656 {
1657 
1658 	KASSERT(mutex_owned(vp->v_interlock));
1659 
1660 	if (! ISSET(flags, VDEAD_NOWAIT))
1661 		VSTATE_WAIT_STABLE(vp);
1662 
1663 	if (VSTATE_GET(vp) == VN_RECLAIMING) {
1664 		KASSERT(ISSET(flags, VDEAD_NOWAIT));
1665 		return EBUSY;
1666 	} else if (VSTATE_GET(vp) == VN_RECLAIMED) {
1667 		return ENOENT;
1668 	}
1669 
1670 	return 0;
1671 }
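/*
 * Illustrative sketch: callers already holding v_interlock use this to
 * learn whether the vnode still has a living file system behind it:
 *
 *	mutex_enter(vp->v_interlock);
 *	error = vdead_check(vp, VDEAD_NOWAIT);
 *	mutex_exit(vp->v_interlock);
 *	if (error != 0)
 *		...		(treat vp as dead or becoming dead)
 */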
1672 
1673 int
1674 vfs_drainvnodes(long target)
1675 {
1676 	int error;
1677 
1678 	mutex_enter(&vnode_free_list_lock);
1679 
1680 	while (numvnodes > target) {
1681 		error = cleanvnode();
1682 		if (error != 0)
1683 			return error;
1684 		mutex_enter(&vnode_free_list_lock);
1685 	}
1686 
1687 	mutex_exit(&vnode_free_list_lock);
1688 
1689 	vcache_reinit();
1690 
1691 	return 0;
1692 }
1693 
1694 void
1695 vnpanic(vnode_t *vp, const char *fmt, ...)
1696 {
1697 	va_list ap;
1698 
1699 #ifdef DIAGNOSTIC
1700 	vprint(NULL, vp);
1701 #endif
1702 	va_start(ap, fmt);
1703 	vpanic(fmt, ap);
1704 	va_end(ap);
1705 }
1706