xref: /dflybsd-src/sys/kern/vfs_lock.c (revision fc36a10bce8c5678d103e0498db849506d9dac68)
1 /*
2  * Copyright (c) 2004,2013-2017 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 /*
36  * External lock/ref-related vnode functions
37  *
38  * vs_state transition locking requirements:
39  *
40  *	INACTIVE -> CACHED|DYING	vx_lock(excl) + vi->spin
41  *	DYING    -> CACHED		vx_lock(excl)
42  *	ACTIVE   -> INACTIVE		(none)       + v_spin + vi->spin
43  *	INACTIVE -> ACTIVE		vn_lock(any) + v_spin + vi->spin
44  *	CACHED   -> ACTIVE		vn_lock(any) + v_spin + vi->spin
45  *
46  * NOTE: Switching to/from ACTIVE/INACTIVE requires v_spin and vi->spin.
47  *
48  *	 Switching into ACTIVE also requires a vref and vnode lock; however,
49  *	 the vnode lock is allowed to be SHARED.
50  *
51  *	 Switching into a CACHED or DYING state requires an exclusive vnode
52  *	 lock or vx_lock (which is almost the same thing but not quite).
53  */
54 
55 #include <sys/param.h>
56 #include <sys/systm.h>
57 #include <sys/kernel.h>
58 #include <sys/malloc.h>
59 #include <sys/mount.h>
60 #include <sys/proc.h>
61 #include <sys/vnode.h>
62 #include <sys/spinlock2.h>
63 #include <sys/sysctl.h>
64 
65 #include <machine/limits.h>
66 
67 #include <vm/vm.h>
68 #include <vm/vm_object.h>
69 
70 #define VACT_MAX	10
71 #define VACT_INC	2
72 
73 static void vnode_terminate(struct vnode *vp);
74 
75 static MALLOC_DEFINE(M_VNODE, "vnodes", "vnode structures");
76 
77 /*
78  * The vnode free list holds inactive vnodes.  Aged inactive vnodes
79  * are inserted prior to the mid point, and otherwise inserted
80  * at the tail.
81  *
82  * The vnode code goes to great lengths to avoid moving vnodes between
83  * lists, but sometimes it is unavoidable.  For this situation we try to
84  * avoid lock contention but we do not try very hard to avoid cache line
85  * congestion.  A modestly sized hash table is used.
86  */
87 #define VLIST_PRIME2	123462047LU
88 #define VLIST_XOR	(uintptr_t)0xab4582fa8322fb71LLU
89 
90 #define VLIST_HASH(vp)	(((uintptr_t)vp ^ VLIST_XOR) % \
91 			 VLIST_PRIME2 % (unsigned)ncpus)
92 
93 static struct vnode_index *vnode_list_hash;
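
/*
 * A minimal sketch of how a vnode pointer maps to its per-cpu
 * vnode_index bucket (the example_vlist_bucket() helper below is
 * hypothetical and compiled out).  The xor and the large-prime
 * modulus scramble the pointer bits before the result is folded
 * into the cpu range, spreading unrelated vnodes across the
 * per-cpu lists.
 */
#if 0
static struct vnode_index *
example_vlist_bucket(struct vnode *vp)
{
	/* VLIST_HASH() always yields an index in [0, ncpus) */
	return (&vnode_list_hash[VLIST_HASH(vp)]);
}
#endif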
94 
95 int  activevnodes = 0;
96 SYSCTL_INT(_debug, OID_AUTO, activevnodes, CTLFLAG_RD,
97 	&activevnodes, 0, "Number of active nodes");
98 int  cachedvnodes = 0;
99 SYSCTL_INT(_debug, OID_AUTO, cachedvnodes, CTLFLAG_RD,
100 	&cachedvnodes, 0, "Number of total cached nodes");
101 int  inactivevnodes = 0;
102 SYSCTL_INT(_debug, OID_AUTO, inactivevnodes, CTLFLAG_RD,
103 	&inactivevnodes, 0, "Number of inactive nodes");
104 static int batchfreevnodes = 5;
105 SYSCTL_INT(_debug, OID_AUTO, batchfreevnodes, CTLFLAG_RW,
106 	&batchfreevnodes, 0, "Number of vnodes to free at once");
107 #ifdef TRACKVNODE
108 static u_long trackvnode;
109 SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
110 		&trackvnode, 0, "");
111 #endif
112 
113 /*
114  * Called from vfsinit()
115  */
116 void
117 vfs_lock_init(void)
118 {
119 	int i;
120 
121 	kmalloc_raise_limit(M_VNODE, 0);	/* unlimited */
122 	vnode_list_hash = kmalloc(sizeof(*vnode_list_hash) * ncpus,
123 				  M_VNODE, M_ZERO | M_WAITOK);
124 	for (i = 0; i < ncpus; ++i) {
125 		struct vnode_index *vi = &vnode_list_hash[i];
126 
127 		TAILQ_INIT(&vi->inactive_list);
128 		TAILQ_INIT(&vi->active_list);
129 		TAILQ_INSERT_TAIL(&vi->active_list, &vi->active_rover, v_list);
130 		spin_init(&vi->spin, "vfslock");
131 	}
132 }
133 
134 /*
135  * Misc functions
136  */
137 static __inline
138 void
139 _vsetflags(struct vnode *vp, int flags)
140 {
141 	atomic_set_int(&vp->v_flag, flags);
142 }
143 
144 static __inline
145 void
146 _vclrflags(struct vnode *vp, int flags)
147 {
148 	atomic_clear_int(&vp->v_flag, flags);
149 }
150 
151 void
152 vsetflags(struct vnode *vp, int flags)
153 {
154 	_vsetflags(vp, flags);
155 }
156 
157 void
158 vclrflags(struct vnode *vp, int flags)
159 {
160 	_vclrflags(vp, flags);
161 }
162 
163 /*
164  * Place the vnode on the active list.
165  *
166  * Caller must hold vp->v_spin
167  */
168 static __inline
169 void
170 _vactivate(struct vnode *vp)
171 {
172 	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];
173 
174 #ifdef TRACKVNODE
175 	if ((u_long)vp == trackvnode)
176 		kprintf("_vactivate %p %08x\n", vp, vp->v_flag);
177 #endif
178 	spin_lock(&vi->spin);
179 
180 	switch(vp->v_state) {
181 	case VS_ACTIVE:
182 		spin_unlock(&vi->spin);
183 		panic("_vactivate: already active");
184 		/* NOT REACHED */
185 		return;
186 	case VS_INACTIVE:
187 		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
188 		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
189 		break;
190 	case VS_CACHED:
191 	case VS_DYING:
192 		break;
193 	}
194 	TAILQ_INSERT_TAIL(&vi->active_list, vp, v_list);
195 	vp->v_state = VS_ACTIVE;
196 	spin_unlock(&vi->spin);
197 	atomic_add_int(&mycpu->gd_activevnodes, 1);
198 }
199 
200 /*
201  * Put a vnode on the inactive list.
202  *
203  * Caller must hold v_spin
204  */
205 static __inline
206 void
207 _vinactive(struct vnode *vp)
208 {
209 	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];
210 
211 #ifdef TRACKVNODE
212 	if ((u_long)vp == trackvnode) {
213 		kprintf("_vinactive %p %08x\n", vp, vp->v_flag);
214 		print_backtrace(-1);
215 	}
216 #endif
217 	spin_lock(&vi->spin);
218 
219 	/*
220 	 * Remove from active list if it is sitting on it
221 	 */
222 	switch(vp->v_state) {
223 	case VS_ACTIVE:
224 		TAILQ_REMOVE(&vi->active_list, vp, v_list);
225 		atomic_add_int(&mycpu->gd_activevnodes, -1);
226 		break;
227 	case VS_INACTIVE:
228 		spin_unlock(&vi->spin);
229 		panic("_vinactive: already inactive");
230 		/* NOT REACHED */
231 		return;
232 	case VS_CACHED:
233 	case VS_DYING:
234 		break;
235 	}
236 
237 	/*
238 	 * Distinguish between basically dead vnodes, vnodes with cached
239 	 * data, and vnodes without cached data.  A rover will shift the
240 	 * vnodes around as their cache status is lost.
241 	 */
242 	if (vp->v_flag & VRECLAIMED) {
243 		TAILQ_INSERT_HEAD(&vi->inactive_list, vp, v_list);
244 	} else {
245 		TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
246 	}
247 	vp->v_state = VS_INACTIVE;
248 	spin_unlock(&vi->spin);
249 	atomic_add_int(&mycpu->gd_inactivevnodes, 1);
250 }
251 
252 /*
253  * Add a ref to an active vnode.  This function should never be called
254  * with an inactive vnode (use vget() instead), but might be called
255  * with the vnode in other states.
256  */
257 void
258 vref(struct vnode *vp)
259 {
260 	KASSERT((VREFCNT(vp) > 0 && vp->v_state != VS_INACTIVE),
261 		("vref: bad refcnt %08x %d", vp->v_refcnt, vp->v_state));
262 	atomic_add_int(&vp->v_refcnt, 1);
263 }
264 
265 void
266 synchronizevnodecount(void)
267 {
268 	int nca = 0;
269 	int act = 0;
270 	int ina = 0;
271 	int i;
272 
273 	for (i = 0; i < ncpus; ++i) {
274 		globaldata_t gd = globaldata_find(i);
275 		nca += gd->gd_cachedvnodes;
276 		act += gd->gd_activevnodes;
277 		ina += gd->gd_inactivevnodes;
278 	}
279 	cachedvnodes = nca;
280 	activevnodes = act;
281 	inactivevnodes = ina;
282 }
283 
284 /*
285  * Count number of cached vnodes.  This is middling expensive so be
286  * careful not to make this call in the critical path.  Each cpu tracks
287  * its own accumulator.  The individual accumulators must be summed
288  * together to get an accurate value.
289  */
290 int
291 countcachedvnodes(void)
292 {
293 	int i;
294 	int n = 0;
295 
296 	for (i = 0; i < ncpus; ++i) {
297 		globaldata_t gd = globaldata_find(i);
298 		n += gd->gd_cachedvnodes;
299 	}
300 	return n;
301 }
302 
303 int
304 countcachedandinactivevnodes(void)
305 {
306 	int i;
307 	int n = 0;
308 
309 	for (i = 0; i < ncpus; ++i) {
310 		globaldata_t gd = globaldata_find(i);
311 		n += gd->gd_cachedvnodes + gd->gd_inactivevnodes;
312 	}
313 	return n;
314 }
315 
316 /*
317  * Release a ref on an active or inactive vnode.
318  *
319  * Caller has no other requirements.
320  *
321  * If VREF_FINALIZE is set this will deactivate the vnode on the 1->0
322  * transition, otherwise we leave the vnode in the active list and
323  * do a lockless transition to 0, which is very important for the
324  * critical path.
325  *
326  * (vrele() is not called when a vnode is being destroyed w/kfree)
327  */
328 void
329 vrele(struct vnode *vp)
330 {
331 	int count;
332 
333 #if 1
334 	count = vp->v_refcnt;
335 	cpu_ccfence();
336 
337 	for (;;) {
338 		KKASSERT((count & VREF_MASK) > 0);
339 		KKASSERT(vp->v_state == VS_ACTIVE ||
340 			 vp->v_state == VS_INACTIVE);
341 
342 		/*
343 		 * 2+ case
344 		 */
345 		if ((count & VREF_MASK) > 1) {
346 			if (atomic_fcmpset_int(&vp->v_refcnt,
347 					       &count, count - 1)) {
348 				break;
349 			}
350 			continue;
351 		}
352 
353 		/*
354 		 * 1->0 transition case must handle possible finalization.
355 		 * When finalizing we transition 1->0x40000000.  Note that
356 		 * cachedvnodes is only adjusted on transitions to ->0.
357 		 *
358 		 * WARNING! VREF_TERMINATE can be cleared at any point
359 		 *	    when the refcnt is non-zero (by vget()) and
360 		 *	    the vnode has not been reclaimed.  Thus
361 		 *	    transitions out of VREF_TERMINATE do not have
362 		 *	    to mess with cachedvnodes.
363 		 */
364 		if (count & VREF_FINALIZE) {
365 			vx_lock(vp);
366 			if (atomic_fcmpset_int(&vp->v_refcnt,
367 					      &count, VREF_TERMINATE)) {
368 				vnode_terminate(vp);
369 				break;
370 			}
371 			vx_unlock(vp);
372 		} else {
373 			if (atomic_fcmpset_int(&vp->v_refcnt, &count, 0)) {
374 				atomic_add_int(&mycpu->gd_cachedvnodes, 1);
375 				break;
376 			}
377 		}
378 		cpu_pause();
379 		/* retry */
380 	}
381 #else
382 	/*
383 	 * XXX NOT YET WORKING!  Multiple threads can reference the vnode
384 	 * after dropping their count, racing destruction, because this
385 	 * code is not directly transitioning from 1->VREF_FINALIZE.
386 	 */
387         /*
388          * Drop the ref-count.  On the 1->0 transition we check VREF_FINALIZE
389          * and attempt to acquire VREF_TERMINATE if set.  It is possible for
390          * concurrent vref/vrele to race and bounce 0->1, 1->0, etc, but
391          * only one will be able to transition the vnode into the
392          * VREF_TERMINATE state.
393          *
394          * NOTE: VREF_TERMINATE is *in* VREF_MASK, so the vnode may only enter
395          *       this state once.
396          */
397         count = atomic_fetchadd_int(&vp->v_refcnt, -1);
398         if ((count & VREF_MASK) == 1) {
399                 atomic_add_int(&mycpu->gd_cachedvnodes, 1);
400                 --count;
401                 while ((count & (VREF_MASK | VREF_FINALIZE)) == VREF_FINALIZE) {
402                         vx_lock(vp);
403                         if (atomic_fcmpset_int(&vp->v_refcnt,
404                                                &count, VREF_TERMINATE)) {
405                                 atomic_add_int(&mycpu->gd_cachedvnodes, -1);
406                                 vnode_terminate(vp);
407                                 break;
408                         }
409                         vx_unlock(vp);
410                 }
411         }
412 #endif
413 }
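
/*
 * A minimal sketch of forcing deactivation on the final release, as
 * described above (the example_drop_and_deactivate() helper is
 * hypothetical and compiled out; the caller is assumed to hold the
 * only real ref on an active vnode and to not hold the vnode lock).
 */
#if 0
static void
example_drop_and_deactivate(struct vnode *vp)
{
	/*
	 * Without VREF_FINALIZE the 1->0 transition below would be
	 * lockless and leave the vnode on the active list.  With it
	 * set, vrele() routes the last release through
	 * vnode_terminate() and deactivates the vnode.
	 */
	atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
	vrele(vp);
}
#endif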
414 
415 /*
416  * Add an auxiliary data structure reference to the vnode.  Auxiliary
417  * references do not change the state of the vnode or prevent deactivation
418  * or reclamation of the vnode, but will prevent the vnode from being
419  * destroyed (kfree()'d).
420  *
421  * WARNING!  vhold() must not acquire v_spin.  The spinlock may or may not
422  *	     already be held by the caller.  vdrop() will clean up the
423  *	     free list state.
424  */
425 void
426 vhold(struct vnode *vp)
427 {
428 	atomic_add_int(&vp->v_auxrefs, 1);
429 }
430 
431 /*
432  * Remove an auxiliary reference from the vnode.
433  */
434 void
435 vdrop(struct vnode *vp)
436 {
437 	atomic_add_int(&vp->v_auxrefs, -1);
438 }
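
/*
 * A minimal sketch of an auxiliary reference protecting a vnode
 * pointer embedded in some other structure (struct example_holder
 * and its attach/detach helpers are hypothetical and compiled out).
 * The auxref does not keep the vnode active or prevent reclamation;
 * it only prevents the vnode from being kfree()'d out from under
 * the holder.
 */
#if 0
struct example_holder {
	struct vnode *eh_vp;		/* vhold()'d, not vref()'d */
};

static void
example_holder_attach(struct example_holder *eh, struct vnode *vp)
{
	vhold(vp);
	eh->eh_vp = vp;
}

static void
example_holder_detach(struct example_holder *eh)
{
	struct vnode *vp;

	if ((vp = eh->eh_vp) != NULL) {
		eh->eh_vp = NULL;
		vdrop(vp);		/* vnode may now be destroyed */
	}
}
#endif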
439 
440 /*
441  * This function is called on the 1->0 transition (which is actually
442  * 1->VREF_TERMINATE) when VREF_FINALIZE is set, forcing deactivation
443  * of the vnode.
444  *
445  * Additional vrefs are allowed to race but will not result in a reentrant
446  * call to vnode_terminate() due to refcnt being VREF_TERMINATE.  This
447  * prevents additional 1->0 transitions.
448  *
449  * ONLY A VGET() CAN REACTIVATE THE VNODE.
450  *
451  * Caller must hold the VX lock.
452  *
453  * NOTE: v_mount may be NULL due to assignment to dead_vnode_vops
454  *
455  * NOTE: The vnode may be marked inactive with dirty buffers
456  *	 or dirty pages in its cached VM object still present.
457  *
458  * NOTE: VS_FREE should not be set on entry (the vnode was expected to
459  *	 previously be active).  We lose control of the vnode the instant
460  *	 it is placed on the free list.
461  *
462  *	 The VX lock is required when transitioning to VS_CACHED but is
463  *	 not sufficient for the vshouldfree() interlocked test or when
464  *	 transitioning away from VS_CACHED.  v_spin is also required for
465  *	 those cases.
466  */
467 static
468 void
469 vnode_terminate(struct vnode *vp)
470 {
471 	KKASSERT(vp->v_state == VS_ACTIVE);
472 
473 	if ((vp->v_flag & VINACTIVE) == 0) {
474 		_vsetflags(vp, VINACTIVE);
475 		if (vp->v_mount)
476 			VOP_INACTIVE(vp);
477 	}
478 	spin_lock(&vp->v_spin);
479 	_vinactive(vp);
480 	spin_unlock(&vp->v_spin);
481 
482 	vx_unlock(vp);
483 }
484 
485 /****************************************************************
486  *			VX LOCKING FUNCTIONS			*
487  ****************************************************************
488  *
489  * These functions lock vnodes for reclamation and deactivation related
490  * activities.  The caller must already be holding some sort of reference
491  * on the vnode.
492  */
493 void
494 vx_lock(struct vnode *vp)
495 {
496 	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
497 	spin_lock_update_only(&vp->v_spin);
498 }
499 
500 void
501 vx_unlock(struct vnode *vp)
502 {
503 	spin_unlock_update_only(&vp->v_spin);
504 	lockmgr(&vp->v_lock, LK_RELEASE);
505 }
506 
507 /*
508  * Downgrades a VX lock to a normal VN lock.  The lock remains EXCLUSIVE.
509  *
510  * Generally required after calling getnewvnode() if the intention is
511  * to return a normal locked vnode to the caller.
512  */
513 void
514 vx_downgrade(struct vnode *vp)
515 {
516 	spin_unlock_update_only(&vp->v_spin);
517 }
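
/*
 * A minimal sketch of the allocvnode()/vx_downgrade() pattern the
 * comment above refers to (example_new_vnode() is hypothetical and
 * compiled out; filesystem-specific initialization is elided and
 * real filesystems normally go through getnewvnode()).  The vnode
 * comes back VX locked; the downgrade converts that into a normal
 * exclusive vnode lock suitable for returning to the caller.
 */
#if 0
static int
example_new_vnode(struct vnode **vpp)
{
	struct vnode *vp;

	vp = allocvnode(0, 0);	/* returned VX locked, refcnt 1 */
	vp->v_type = VREG;	/* filesystem-specific setup elided */
	vx_downgrade(vp);	/* still EXCLUSIVE, now a normal vn lock */
	*vpp = vp;
	return (0);
}
#endif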
518 
519 /****************************************************************
520  *			VNODE ACQUISITION FUNCTIONS		*
521  ****************************************************************
522  *
523  * These functions must be used when accessing a vnode that has no
524  * chance of being destroyed in an SMP race.  That means the caller will
525  * usually either hold an auxiliary reference (such as the namecache)
526  * or hold some other lock that ensures that the vnode cannot be destroyed.
527  *
528  * These functions are MANDATORY for any code chain accessing a vnode
529  * whose activation state is not known.
530  *
531  * vget() can be called with LK_NOWAIT and will return EBUSY if the
532  * lock cannot be immediately acquired.
533  *
534  * vget()/vput() are used when reactivation is desired.
535  *
536  * vx_get() and vx_put() are used when reactivation is not desired.
537  */
538 int
539 vget(struct vnode *vp, int flags)
540 {
541 	int error;
542 
543 	/*
544 	 * A lock type must be passed
545 	 */
546 	if ((flags & LK_TYPE_MASK) == 0) {
547 		panic("vget() called with no lock specified!");
548 		/* NOT REACHED */
549 	}
550 
551 	/*
552 	 * Reference the structure and then acquire the lock.
553 	 *
554 	 * NOTE: The requested lock might be a shared lock and does
555 	 *	 not protect our access to the refcnt or other fields.
556 	 */
557 	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
558 		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
559 
560 	if ((error = vn_lock(vp, flags | LK_FAILRECLAIM)) != 0) {
561 		/*
562 		 * The lock failed, undo and return an error.  This will not
563 		 * normally trigger a termination.
564 		 */
565 		vrele(vp);
566 	} else if (vp->v_flag & VRECLAIMED) {
567 		/*
568 		 * The node is being reclaimed and cannot be reactivated
569 		 * any more, undo and return ENOENT.
570 		 */
571 		vn_unlock(vp);
572 		vrele(vp);
573 		error = ENOENT;
574 	} else if (vp->v_state == VS_ACTIVE) {
575 		/*
576 		 * A VS_ACTIVE vnode coupled with the fact that we have
577 		 * a vnode lock (even if shared) prevents v_state from
578 		 * changing.  Since the vnode is not in a VRECLAIMED state,
579 		 * we can safely clear VINACTIVE.
580 		 *
581 		 * It is possible for a shared lock to cause a race with
582 		 * another thread that is also in the process of clearing
583 		 * VREF_TERMINATE, meaning that we might return with it still
584 		 * set and then assert in a later vref().  The solution is to
585 		 * unconditionally clear VREF_TERMINATE here as well.
586 		 *
587 		 * NOTE! Multiple threads may clear VINACTIVE if this is
588  *	 a shared lock.  This race is allowed.
589 		 */
590 		if (vp->v_flag & VINACTIVE)
591 			_vclrflags(vp, VINACTIVE);	/* SMP race ok */
592 		if (vp->v_act < VACT_MAX) {
593 			vp->v_act += VACT_INC;
594 			if (vp->v_act > VACT_MAX)	/* SMP race ok */
595 				vp->v_act = VACT_MAX;
596 		}
597 		error = 0;
598 		if (vp->v_refcnt & VREF_TERMINATE)	/* SMP race ok */
599 			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE);
600 	} else {
601 		/*
602 		 * If the vnode is not VS_ACTIVE it must be reactivated
603 		 * in addition to clearing VINACTIVE.  An exclusive spin_lock
604 		 * is needed to manipulate the vnode's list.
605 		 *
606 		 * Because the lockmgr lock might be shared, we might race
607 		 * another reactivation, which we handle.  In this situation,
608 		 * however, the refcnt prevents other v_state races.
609 		 *
610 		 * As with above, clearing VINACTIVE is allowed to race other
611 		 * clearings of VINACTIVE.
612 		 *
613 		 * VREF_TERMINATE and VREF_FINALIZE can only be cleared when
614 		 * the refcnt is non-zero and the vnode has not been
615 		 * reclaimed.  This also means that the transitions do
616 		 * not affect cachedvnodes.
617 		 *
618 		 * It is possible for a shared lock to cause a race with
619 		 * another thread that is also in the process of clearing
620 		 * VREF_TERMINATE, meaning that we might return with it still
621 		 * set and then assert in a later vref().  The solution is to
622 		 * unconditionally clear VREF_TERMINATE here as well.
623 		 */
624 		_vclrflags(vp, VINACTIVE);
625 		vp->v_act += VACT_INC;
626 		if (vp->v_act > VACT_MAX)	/* SMP race ok */
627 			vp->v_act = VACT_MAX;
628 		spin_lock(&vp->v_spin);
629 
630 		switch(vp->v_state) {
631 		case VS_INACTIVE:
632 			_vactivate(vp);
633 			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
634 							VREF_FINALIZE);
635 			spin_unlock(&vp->v_spin);
636 			break;
637 		case VS_CACHED:
638 			_vactivate(vp);
639 			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
640 							VREF_FINALIZE);
641 			spin_unlock(&vp->v_spin);
642 			break;
643 		case VS_ACTIVE:
644 			atomic_clear_int(&vp->v_refcnt, VREF_FINALIZE |
645 							VREF_TERMINATE);
646 			spin_unlock(&vp->v_spin);
647 			break;
648 		case VS_DYING:
649 			spin_unlock(&vp->v_spin);
650 			panic("Impossible VS_DYING state");
651 			break;
652 		}
653 		error = 0;
654 	}
655 	return(error);
656 }
657 
658 #ifdef DEBUG_VPUT
659 
660 void
661 debug_vput(struct vnode *vp, const char *filename, int line)
662 {
663 	kprintf("vput(%p) %s:%d\n", vp, filename, line);
664 	vn_unlock(vp);
665 	vrele(vp);
666 }
667 
668 #else
669 
670 void
671 vput(struct vnode *vp)
672 {
673 	vn_unlock(vp);
674 	vrele(vp);
675 }
676 
677 #endif
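
/*
 * A minimal sketch of the vget()/vput() pattern described above
 * (example_use_held_vnode() is hypothetical and compiled out).  The
 * caller is assumed to hold something, e.g. a namecache entry or an
 * auxiliary reference, that prevents the vnode from being destroyed
 * while vget() runs.
 */
#if 0
static int
example_use_held_vnode(struct vnode *vp)
{
	int error;

	error = vget(vp, LK_SHARED);	/* ref + lock + reactivate */
	if (error == 0) {
		/* vnode is referenced, locked, and VS_ACTIVE here */
		vput(vp);		/* unlock and drop the ref */
	}
	return (error);
}
#endif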
678 
679 /*
680  * Acquire the vnode lock unguarded.
681  *
682  * The non-blocking version also uses a slightly different mechanic.
683  * This function will explicitly fail not only if it cannot acquire
684  * the lock normally, but also if the caller already holds a lock.
685  *
686  * The adjusted mechanic is used to close a loophole where complex
687  * VOP_RECLAIM code can circle around recursively and allocate the
688  * same vnode it is trying to destroy from the freelist.
689  *
690  * Any filesystem (e.g. UFS) which puts LK_CANRECURSE in lk_flags can
691  * cause the incorrect behavior to occur.  If not for that, lockmgr()
692  * would do the right thing.
693  *
694  * XXX The vx_*() locks should use auxrefs, not the main reference counter.
695  */
696 void
697 vx_get(struct vnode *vp)
698 {
699 	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
700 		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
701 	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
702 	spin_lock_update_only(&vp->v_spin);
703 }
704 
705 int
706 vx_get_nonblock(struct vnode *vp)
707 {
708 	int error;
709 
710 	if (lockinuse(&vp->v_lock))
711 		return(EBUSY);
712 	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
713 	if (error == 0) {
714 		spin_lock_update_only(&vp->v_spin);
715 		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
716 			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
717 	}
718 	return(error);
719 }
720 
721 /*
722  * Release a VX lock that also held a ref on the vnode.  vrele() will handle
723  * any needed state transitions.
724  *
725  * However, filesystems use this function to get rid of unwanted new vnodes
726  * so try to get the vnode on the correct queue in that case.
727  */
728 void
729 vx_put(struct vnode *vp)
730 {
731 	if (vp->v_type == VNON || vp->v_type == VBAD)
732 		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
733 	spin_unlock_update_only(&vp->v_spin);
734 	lockmgr(&vp->v_lock, LK_RELEASE);
735 	vrele(vp);
736 }
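
/*
 * A minimal sketch of vx_get()/vx_put() for work that must not
 * reactivate the vnode (example_scan_one() is hypothetical and
 * compiled out).  Unlike vget(), vx_get() does not move the vnode
 * back to the active list, so an inactive vnode stays inactive
 * across the operation.
 */
#if 0
static void
example_scan_one(struct vnode *vp)
{
	if (vx_get_nonblock(vp) == 0) {
		/* vnode is VX locked and referenced, state unchanged */
		vx_put(vp);
	}
}
#endif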
737 
738 /*
739  * Try to reuse a vnode from the free list.  This function is somewhat
740  * advisory in that NULL can be returned as a normal case, even if free
741  * vnodes are present.
742  *
743  * The scan is limited because it can result in excessive CPU use during
744  * periods of extreme vnode use.
745  *
746  * NOTE: The returned vnode is not completely initialized.
747  *	 The returned vnode will be VX locked.
748  */
749 static
750 struct vnode *
751 cleanfreevnode(int maxcount)
752 {
753 	struct vnode_index *vi;
754 	struct vnode *vp;
755 	int count;
756 	int trigger = (long)vmstats.v_page_count / (activevnodes * 2 + 1);
757 	int ri;
758 	int cpu_count;
759 
760 	/*
761 	 * Try to deactivate some vnodes cached on the active list.
762 	 */
763 	if (countcachedvnodes() < inactivevnodes)
764 		goto skip;
765 
766 	ri = vnode_list_hash[mycpu->gd_cpuid].deac_rover + 1;
767 
768 	for (count = 0; count < maxcount * 2; ++count, ++ri) {
769 		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];
770 
771 		spin_lock(&vi->spin);
772 
773 		vp = TAILQ_NEXT(&vi->active_rover, v_list);
774 		TAILQ_REMOVE(&vi->active_list, &vi->active_rover, v_list);
775 		if (vp == NULL) {
776 			TAILQ_INSERT_HEAD(&vi->active_list,
777 					  &vi->active_rover, v_list);
778 		} else {
779 			TAILQ_INSERT_AFTER(&vi->active_list, vp,
780 					   &vi->active_rover, v_list);
781 		}
782 		if (vp == NULL) {
783 			spin_unlock(&vi->spin);
784 			continue;
785 		}
786 		if ((vp->v_refcnt & VREF_MASK) != 0) {
787 			spin_unlock(&vi->spin);
788 			vp->v_act += VACT_INC;
789 			if (vp->v_act > VACT_MAX)	/* SMP race ok */
790 				vp->v_act = VACT_MAX;
791 			continue;
792 		}
793 
794 		/*
795 		 * decrement by less if the vnode's object has a lot of
796 		 * VM pages.  XXX possible SMP races.
797 		 */
798 		if (vp->v_act > 0) {
799 			vm_object_t obj;
800 			if ((obj = vp->v_object) != NULL &&
801 			    obj->resident_page_count >= trigger) {
802 				vp->v_act -= 1;
803 			} else {
804 				vp->v_act -= VACT_INC;
805 			}
806 			if (vp->v_act < 0)
807 				vp->v_act = 0;
808 			spin_unlock(&vi->spin);
809 			continue;
810 		}
811 
812 		/*
813 		 * Try to deactivate the vnode.
814 		 */
815 		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
816 			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
817 		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
818 
819 		spin_unlock(&vi->spin);
820 		vrele(vp);
821 	}
822 
823 	vnode_list_hash[mycpu->gd_cpuid].deac_rover = ri;
824 
825 skip:
826 	/*
827 	 * Loop trying to lock the first vnode on the free list.
828 	 * Cycle if we can't.
829 	 */
830 	cpu_count = ncpus;
831 	ri = vnode_list_hash[mycpu->gd_cpuid].free_rover + 1;
832 
833 	for (count = 0; count < maxcount; ++count, ++ri) {
834 		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];
835 
836 		spin_lock(&vi->spin);
837 
838 		vp = TAILQ_FIRST(&vi->inactive_list);
839 		if (vp == NULL) {
840 			spin_unlock(&vi->spin);
841 			if (--cpu_count == 0)
842 				break;
843 			ri = (ri + 16) & ~15;
844 			--ri;
845 			continue;
846 		}
847 
848 		/*
849 		 * non-blocking vx_get will also ref the vnode on success.
850 		 */
851 		if (vx_get_nonblock(vp)) {
852 			KKASSERT(vp->v_state == VS_INACTIVE);
853 			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
854 			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
855 			spin_unlock(&vi->spin);
856 			continue;
857 		}
858 
859 		/*
860 		 * Because we are holding vi->spin the vnode should currently
861 		 * be inactive and VREF_TERMINATE should still be set.
862 		 *
863 		 * Once vi->spin is released the vnode's state should remain
864 		 * unmodified due to both the lock and ref on it.
865 		 */
866 		KKASSERT(vp->v_state == VS_INACTIVE);
867 		spin_unlock(&vi->spin);
868 #ifdef TRACKVNODE
869 		if ((u_long)vp == trackvnode)
870 			kprintf("cleanfreevnode %p %08x\n", vp, vp->v_flag);
871 #endif
872 
873 		/*
874 		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
875 		 * This includes namecache refs due to a related ncp being
876 		 * locked or having children, a VM object association, or
877 		 * other hold users.
878 		 *
879 		 * Do not reclaim/reuse a vnode if someone else has a real
880 		 * ref on it.  This can occur if a filesystem temporarily
881 		 * releases the vnode lock during VOP_RECLAIM.
882 		 */
883 		if (vp->v_auxrefs != vp->v_namecache_count ||
884 		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
885 failed:
886 			if (vp->v_state == VS_INACTIVE) {
887 				spin_lock(&vi->spin);
888 				if (vp->v_state == VS_INACTIVE) {
889 					TAILQ_REMOVE(&vi->inactive_list,
890 						     vp, v_list);
891 					TAILQ_INSERT_TAIL(&vi->inactive_list,
892 							  vp, v_list);
893 				}
894 				spin_unlock(&vi->spin);
895 			}
896 			vx_put(vp);
897 			continue;
898 		}
899 
900 		/*
901 		 * VINACTIVE and VREF_TERMINATE are expected to both be set
902 		 * for vnodes pulled from the inactive list, and cannot be
903 		 * changed while we hold the vx lock.
904 		 *
905 		 * Try to reclaim the vnode.
906 		 *
907 		 * The cache_inval_vp_nonblock() call can fail if any of the
908 		 * namecache elements are actively locked, preventing the
909 		 * vnode from being reclaimed.  This is the desired operation
910 		 * as it gives the namecache code certain guarantees just by
911 		 * holding an ncp.
912 		 */
913 		KKASSERT(vp->v_flag & VINACTIVE);
914 		KKASSERT(vp->v_refcnt & VREF_TERMINATE);
915 
916 		if ((vp->v_flag & VRECLAIMED) == 0) {
917 			if (cache_inval_vp_nonblock(vp))
918 				goto failed;
919 			vgone_vxlocked(vp);
920 			/* vnode is still VX locked */
921 		}
922 
923 		/*
924 		 * At this point if there are no other refs or auxrefs on
925 		 * the vnode with the inactive list locked, and we remove
926 		 * the vnode from the inactive list, it should not be
927 		 * possible for anyone else to access the vnode any more.
928 		 *
929 		 * Since the vnode is in a VRECLAIMED state, no new
930 		 * namecache associations could have been made and the
931 		 * vnode should have already been removed from its mountlist.
932 		 *
933 		 * Since we hold a VX lock on the vnode it cannot have been
934 		 * reactivated (moved out of the inactive list).
935 		 */
936 		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
937 		spin_lock(&vi->spin);
938 		if (vp->v_auxrefs ||
939 		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
940 			spin_unlock(&vi->spin);
941 			goto failed;
942 		}
943 		KKASSERT(vp->v_state == VS_INACTIVE);
944 		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
945 		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
946 		vp->v_state = VS_DYING;
947 		spin_unlock(&vi->spin);
948 
949 		/*
950 		 * Nothing should have been able to access this vp.  Only
951 		 * our ref should remain now.
952 		 */
953 		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
954 		KASSERT(vp->v_refcnt == 1,
955 			("vp %p badrefs %08x", vp, vp->v_refcnt));
956 
957 		/*
958 		 * Return a VX locked vnode suitable for reuse.
959 		 */
960 		vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
961 		return(vp);
962 	}
963 	vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
964 	return(NULL);
965 }
966 
967 /*
968  * Obtain a new vnode.  The returned vnode is VX locked & vrefd.
969  *
970  * All new vnodes set the VAGE flags.  An open() of the vnode will
971  * decrement the (2-bit) flags.  Vnodes which are opened several times
972  * are thus retained in the cache over vnodes which are merely stat()d.
973  *
974  * We attempt to reuse an already-recycled vnode from our pcpu inactive
975  * queue first, and allocate otherwise.  Attempting to recycle inactive
976  * vnodes here can lead to numerous deadlocks, particularly with
977  * softupdates.
978  */
979 struct vnode *
980 allocvnode(int lktimeout, int lkflags)
981 {
982 	struct vnode *vp;
983 	struct vnode_index *vi;
984 
985 	/*
986 	 * lktimeout only applies when LK_TIMELOCK is used, and only
987 	 * the pageout daemon uses it.  The timeout may not be zero
988 	 * or the pageout daemon can deadlock in low-VM situations.
989 	 */
990 	if (lktimeout == 0)
991 		lktimeout = hz / 10;
992 
993 	/*
994 	 * Do not flag for synchronous recycling unless there are enough
995 	 * freeable vnodes to recycle and the number of vnodes has
996 	 * significantly exceeded our target.  We want the normal vnlru
997 	 * process to handle the cleaning (at 9/10's) before we are forced
998 	 * to flag it here at 11/10's for userexit path processing.
999 	 */
1000 	if (numvnodes >= maxvnodes * 11 / 10 &&
1001 	    cachedvnodes + inactivevnodes >= maxvnodes * 5 / 10) {
1002 		struct thread *td = curthread;
1003 		if (td->td_lwp)
1004 			atomic_set_int(&td->td_lwp->lwp_mpflags, LWP_MP_VNLRU);
1005 	}
1006 
1007 	/*
1008 	 * Try to trivially reuse a reclaimed vnode from the head of the
1009 	 * inactive list for this cpu.  Any vnode cycling that terminates
1010 	 * the vnode will cause it to be returned to the same pcpu structure
1011 	 * (e.g. unlink calls).
1012 	 */
1013 	vi = &vnode_list_hash[mycpuid];
1014 	spin_lock(&vi->spin);
1015 
1016 	vp = TAILQ_FIRST(&vi->inactive_list);
1017 	if (vp && (vp->v_flag & VRECLAIMED)) {
1018 		/*
1019 		 * non-blocking vx_get will also ref the vnode on success.
1020 		 */
1021 		if (vx_get_nonblock(vp)) {
1022 			KKASSERT(vp->v_state == VS_INACTIVE);
1023 			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
1024 			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
1025 			spin_unlock(&vi->spin);
1026 			goto slower;
1027 		}
1028 
1029 		/*
1030 		 * Because we are holding vi->spin the vnode should currently
1031 		 * be inactive and VREF_TERMINATE should still be set.
1032 		 *
1033 		 * Once vi->spin is released the vnode's state should remain
1034 		 * unmodified due to both the lock and ref on it.
1035 		 */
1036 		KKASSERT(vp->v_state == VS_INACTIVE);
1037 #ifdef TRACKVNODE
1038 		if ((u_long)vp == trackvnode)
1039 			kprintf("allocvnode %p %08x\n", vp, vp->v_flag);
1040 #endif
1041 
1042 		/*
1043 		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
1044 		 * This includes namecache refs due to a related ncp being
1045 		 * locked or having children, a VM object association, or
1046 		 * other hold users.
1047 		 *
1048 		 * Do not reclaim/reuse a vnode if someone else has a real
1049 		 * ref on it.  This can occur if a filesystem temporarily
1050 		 * releases the vnode lock during VOP_RECLAIM.
1051 		 */
1052 		if (vp->v_auxrefs ||
1053 		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
1054 			if (vp->v_state == VS_INACTIVE) {
1055 				TAILQ_REMOVE(&vi->inactive_list,
1056 					     vp, v_list);
1057 				TAILQ_INSERT_TAIL(&vi->inactive_list,
1058 						  vp, v_list);
1059 			}
1060 			spin_unlock(&vi->spin);
1061 			vx_put(vp);
1062 			goto slower;
1063 		}
1064 
1065 		/*
1066 		 * VINACTIVE and VREF_TERMINATE are expected to both be set
1067 		 * for vnodes pulled from the inactive list, and cannot be
1068 		 * changed while we hold the vx lock.
1069 		 *
1070 		 * Try to reclaim the vnode.
1071 		 */
1072 		KKASSERT(vp->v_flag & VINACTIVE);
1073 		KKASSERT(vp->v_refcnt & VREF_TERMINATE);
1074 
1075 		if ((vp->v_flag & VRECLAIMED) == 0) {
1076 			spin_unlock(&vi->spin);
1077 			vx_put(vp);
1078 			goto slower;
1079 		}
1080 
1081 		/*
1082 		 * At this point if there are no other refs or auxrefs on
1083 		 * the vnode with the inactive list locked, and we remove
1084 		 * the vnode from the inactive list, it should not be
1085 		 * possible for anyone else to access the vnode any more.
1086 		 *
1087 		 * Since the vnode is in a VRECLAIMED state, no new
1088 		 * namecache associations could have been made and the
1089 		 * vnode should have already been removed from its mountlist.
1090 		 *
1091 		 * Since we hold a VX lock on the vnode it cannot have been
1092 		 * reactivated (moved out of the inactive list).
1093 		 */
1094 		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
1095 		KKASSERT(vp->v_state == VS_INACTIVE);
1096 		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
1097 		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
1098 		vp->v_state = VS_DYING;
1099 		spin_unlock(&vi->spin);
1100 
1101 		/*
1102 		 * Nothing should have been able to access this vp.  Only
1103 		 * our ref should remain now.
1104 		 *
1105 		 * At this point we can kfree() the vnode if we want to.
1106 		 * Instead, we reuse it for the allocation.
1107 		 */
1108 		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
1109 		KASSERT(vp->v_refcnt == 1,
1110 			("vp %p badrefs %08x", vp, vp->v_refcnt));
1111 		vx_unlock(vp);		/* safety: keep the API clean */
1112 		bzero(vp, sizeof(*vp));
1113 	} else {
1114 		spin_unlock(&vi->spin);
1115 slower:
1116 		vp = kmalloc(sizeof(*vp), M_VNODE, M_ZERO | M_WAITOK);
1117 		atomic_add_int(&numvnodes, 1);
1118 	}
1119 
1120 	lwkt_token_init(&vp->v_token, "vnode");
1121 	lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
1122 	TAILQ_INIT(&vp->v_namecache);
1123 	RB_INIT(&vp->v_rbclean_tree);
1124 	RB_INIT(&vp->v_rbdirty_tree);
1125 	RB_INIT(&vp->v_rbhash_tree);
1126 	spin_init(&vp->v_spin, "allocvnode");
1127 
1128 	vx_lock(vp);
1129 	vp->v_refcnt = 1;
1130 	vp->v_flag = VAGE0 | VAGE1;
1131 	vp->v_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;
1132 
1133 	KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
1134 	/* exclusive lock still held */
1135 
1136 	vp->v_filesize = NOOFFSET;
1137 	vp->v_type = VNON;
1138 	vp->v_tag = 0;
1139 	vp->v_state = VS_CACHED;
1140 	_vactivate(vp);
1141 
1142 	return (vp);
1143 }
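
/*
 * A minimal sketch of discarding an unwanted new vnode (the
 * example_alloc_or_bail() helper and its example_fs_init() call are
 * hypothetical and compiled out).  Leaving v_type as VNON and
 * releasing with vx_put() sets VREF_FINALIZE, so the final vrele()
 * pushes the vnode back through deactivation instead of caching it
 * as if it held useful data.
 */
#if 0
static int
example_alloc_or_bail(struct vnode **vpp)
{
	struct vnode *vp;
	int error;

	vp = allocvnode(0, 0);		/* VX locked, refcnt 1, VNON */
	error = example_fs_init(vp);	/* hypothetical fs-specific setup */
	if (error) {
		*vpp = NULL;
		vx_put(vp);		/* v_type == VNON -> finalize */
		return (error);
	}
	vx_downgrade(vp);		/* return a normally locked vnode */
	*vpp = vp;
	return (0);
}
#endif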
1144 
1145 /*
1146  * Called after a process has allocated a vnode via allocvnode()
1147  * and we detected that too many vnodes were present.
1148  *
1149  * This function is called just prior to a return to userland if the
1150  * process at some point had to allocate a new vnode during the last
1151  * system call and the vnode count was found to be excessive.
1152  *
1153  * This is a synchronous path that we do not normally want to execute.
1154  *
1155  * Flagged at >= 11/10's, runs if >= 10/10, vnlru runs at 9/10.
1156  *
1157  * WARNING: Sometimes numvnodes can blow out due to children being
1158  *	    present under directory vnodes in the namecache.  For the
1159  *	    moment use an if() instead of a while() and note that if
1160  *	    we were to use a while() we would still have to break out
1161  *	    if freesomevnodes() returned 0.  vnlru will also be trying
1162  *	    hard to free vnodes at the same time (with a lower trigger
1163  *	    pointer).
1164  */
1165 void
1166 allocvnode_gc(void)
1167 {
1168 	if (numvnodes >= maxvnodes &&
1169 	    countcachedandinactivevnodes() >= maxvnodes * 5 / 10) {
1170 		freesomevnodes(batchfreevnodes);
1171 	}
1172 }
1173 
1174 int
1175 freesomevnodes(int n)
1176 {
1177 	struct vnode *vp;
1178 	int count = 0;
1179 
1180 	while (n) {
1181 		if ((vp = cleanfreevnode(n)) == NULL)
1182 			break;
1183 		vx_unlock(vp);
1184 		--n;
1185 		++count;
1186 		kfree(vp, M_VNODE);
1187 		atomic_add_int(&numvnodes, -1);
1188 	}
1189 	return(count);
1190 }
1191