/*
 * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/kern/vfs_lock.c,v 1.30 2008/06/30 03:57:41 dillon Exp $
 */

/*
 * External virtual filesystem routines
 */
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/sysctl.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

#include <sys/buf2.h>
#include <sys/thread2.h>
#include <sys/sysref2.h>

static void vnode_terminate(struct vnode *vp);
static boolean_t vnode_ctor(void *obj, void *private, int ocflags);
static void vnode_dtor(void *obj, void *private);

static MALLOC_DEFINE(M_VNODE, "vnodes", "vnode structures");
static struct sysref_class vnode_sysref_class = {
	.name =		"vnode",
	.mtype =	M_VNODE,
	.proto =	SYSREF_PROTO_VNODE,
	.offset =	offsetof(struct vnode, v_sysref),
	.objsize =	sizeof(struct vnode),
	.mag_capacity =	256,
	.flags =	SRC_MANAGEDINIT,
	.ctor =		vnode_ctor,
	.dtor =		vnode_dtor,
	.ops = {
		.terminate = (sysref_terminate_func_t)vnode_terminate
	}
};

/*
 * The vnode free list holds inactive vnodes.  Aged inactive vnodes
 * are inserted prior to the mid point; all other inactive vnodes are
 * inserted at the tail.
 */
static TAILQ_HEAD(freelst, vnode) vnode_free_list;
static struct vnode	vnode_free_mid;
static struct spinlock	vfs_spin = SPINLOCK_INITIALIZER(vfs_spin);

int  freevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD,
		&freevnodes, 0, "");
static int wantfreevnodes = 25;
SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW,
		&wantfreevnodes, 0, "");
#ifdef TRACKVNODE
static ulong trackvnode;
SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
		&trackvnode, 0, "");
#endif

/*
 * Called from vfsinit()
 */
void
vfs_lock_init(void)
{
	TAILQ_INIT(&vnode_free_list);
	TAILQ_INSERT_HEAD(&vnode_free_list, &vnode_free_mid, v_freelist);
	spin_init(&vfs_spin);
}

/*
 * Misc functions
 */
static __inline
void
_vsetflags(struct vnode *vp, int flags)
{
	atomic_set_int(&vp->v_flag, flags);
}

static __inline
void
_vclrflags(struct vnode *vp, int flags)
{
	atomic_clear_int(&vp->v_flag, flags);
}

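/*
 * Exported wrappers for the atomic flag helpers above.
 */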
void
vsetflags(struct vnode *vp, int flags)
{
	_vsetflags(vp, flags);
}

void
vclrflags(struct vnode *vp, int flags)
{
	_vclrflags(vp, flags);
}

/*
 * Inline helper functions.  __vbusy() and __vfree() must be called while
 * vp->v_spinlock is held.
 *
 * WARNING!  This function is typically called with v_spinlock held.
 *
 * MPSAFE
 */
static __inline
void
__vbusy(struct vnode *vp)
{
#ifdef TRACKVNODE
	if ((ulong)vp == trackvnode)
		kprintf("__vbusy %p %08x\n", vp, vp->v_flag);
#endif
	spin_lock_wr(&vfs_spin);
	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
	freevnodes--;
	_vclrflags(vp, VFREE);
	spin_unlock_wr(&vfs_spin);
}

/*
 * WARNING!  This function is typically called with v_spinlock held.
 *
 * MPSAFE
 */
static __inline
void
__vfree(struct vnode *vp)
{
#ifdef TRACKVNODE
	if ((ulong)vp == trackvnode) {
		kprintf("__vfree %p %08x\n", vp, vp->v_flag);
		print_backtrace();
	}
#endif
	spin_lock_wr(&vfs_spin);
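	/*
	 * Reclaimed vnodes go to the head of the list (reused first),
	 * aged vnodes are inserted before the mid-point marker, and
	 * everything else goes to the tail.
	 */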
	if (vp->v_flag & VRECLAIMED)
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	else if (vp->v_flag & (VAGE0 | VAGE1))
		TAILQ_INSERT_BEFORE(&vnode_free_mid, vp, v_freelist);
	else
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	freevnodes++;
	_vsetflags(vp, VFREE);
	spin_unlock_wr(&vfs_spin);
}

/*
 * WARNING!  This function is typically called with v_spinlock held.
 *
 * MPSAFE
 */
static __inline
void
__vfreetail(struct vnode *vp)
{
#ifdef TRACKVNODE
	if ((ulong)vp == trackvnode)
		kprintf("__vfreetail %p %08x\n", vp, vp->v_flag);
#endif
	spin_lock_wr(&vfs_spin);
	TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	freevnodes++;
	_vsetflags(vp, VFREE);
	spin_unlock_wr(&vfs_spin);
}

/*
 * Return TRUE if we should put the vnode on the freelist (VFREE), or
 * FALSE if we should leave it alone / mark it as VCACHED.
 *
 * This routine is only valid if the vnode is already either VFREE or
 * VCACHED, or if it can become VFREE or VCACHED via vnode_terminate().
 *
 * WARNING!  This function is typically called with v_spinlock held.
 *
 * MPSAFE
 */
static __inline boolean_t
vshouldfree(struct vnode *vp)
{
	return (vp->v_auxrefs == 0 &&
	    (vp->v_object == NULL || vp->v_object->resident_page_count == 0));
}

/*
 * Add a ref to an active vnode.  This function should never be called
 * with an inactive vnode (use vget() instead).
 *
 * MPSAFE
 */
void
vref(struct vnode *vp)
{
	KKASSERT(vp->v_sysref.refcnt > 0 &&
		 (vp->v_flag & (VFREE|VINACTIVE)) == 0);
	sysref_get(&vp->v_sysref);
}

/*
 * Release a ref on an active or inactive vnode.  The sysref termination
 * function will be called when the last active reference is released,
 * and the vnode is returned to the objcache when the last inactive
 * reference is released.
 */
void
vrele(struct vnode *vp)
{
	sysref_put(&vp->v_sysref);
}

/*
 * Add an auxiliary data structure reference to the vnode.  Auxiliary
 * references do not change the state of the vnode or prevent it
 * from being deactivated, reclaimed, or placed on the free list.
 *
 * An auxiliary reference DOES prevent the vnode from being destroyed,
 * allowing you to vx_lock() it, test state, etc.
 *
 * An auxiliary reference DOES NOT move a vnode out of the VFREE state
 * once it has entered it.
 *
 * MPSAFE
 */
void
vhold(struct vnode *vp)
{
	KKASSERT(vp->v_sysref.refcnt != 0);
	atomic_add_int(&vp->v_auxrefs, 1);
}

/*
 * Remove an auxiliary reference from the vnode.
 *
 * vdrop needs to check for a VCACHED->VFREE transition to catch cases
 * where a vnode is held past its reclamation.
 *
 * MPSAFE
 */
void
vdrop(struct vnode *vp)
{
	KKASSERT(vp->v_sysref.refcnt != 0 && vp->v_auxrefs > 0);
	spin_lock_wr(&vp->v_spinlock);
	atomic_subtract_int(&vp->v_auxrefs, 1);
	if ((vp->v_flag & VCACHED) && vshouldfree(vp)) {
		_vclrflags(vp, VCACHED);
		__vfree(vp);
	}
	spin_unlock_wr(&vp->v_spinlock);
}

/*
 * This function is called when the last active reference on the vnode
 * is released, typically via vrele().  SYSREF will give the vnode a
 * negative ref count, indicating that it is undergoing termination or
 * is being set aside for the cache, and one final sysref_put() is
 * required to actually return it to the memory subsystem.
 *
 * However, because vnodes may have auxiliary structural references via
 * v_auxrefs, we must interlock auxiliary references against termination
 * via the VX lock mechanism.  It is possible for the vnode to have been
 * reactivated while we were blocked on the lock.
 *
 * MPSAFE
 */
void
vnode_terminate(struct vnode *vp)
{
	vx_lock(vp);
	if (sysref_isinactive(&vp->v_sysref)) {
		/*
		 * Deactivate the vnode by marking it VFREE or VCACHED.
		 * The vnode can be reactivated from either state until
		 * reclaimed.  These states inherit the 'last' sysref on the
		 * vnode.
		 *
		 * NOTE: There may be additional inactive references from
		 * other entities blocking on the VX lock while we hold it,
		 * but this does not prevent us from changing the vnode's
		 * state.
		 *
		 * NOTE: The vnode could already be marked inactive.  XXX
		 *	 how?
		 *
		 * NOTE: v_mount may be NULL due to assignment to
		 *	 dead_vnode_vops
		 *
		 * NOTE: The vnode may be marked inactive with dirty buffers
		 *	 or dirty pages in its cached VM object still present.
		 */
		if ((vp->v_flag & VINACTIVE) == 0) {
			_vsetflags(vp, VINACTIVE);
			if (vp->v_mount)
				VOP_INACTIVE(vp);
		}
		spin_lock_wr(&vp->v_spinlock);
		KKASSERT((vp->v_flag & (VFREE|VCACHED)) == 0);
		if (vshouldfree(vp))
			__vfree(vp);
		else
			_vsetflags(vp, VCACHED); /* inactive but not yet free */
		spin_unlock_wr(&vp->v_spinlock);
		vx_unlock(vp);
	} else {
		/*
		 * Someone reactivated the vnode while we were blocked on
		 * the VX lock.  Release the VX lock and release the (now
		 * active) last reference which is no longer last.
		 */
		vx_unlock(vp);
		vrele(vp);
	}
}

/*
 * Physical vnode constructor / destructor.  These are only executed on
 * the backend of the objcache.  They are NOT executed on every vnode
 * allocation or deallocation.
 *
 * MPSAFE
 */
boolean_t
vnode_ctor(void *obj, void *private, int ocflags)
{
	struct vnode *vp = obj;

	lwkt_token_init(&vp->v_token);
	lockinit(&vp->v_lock, "vnode", 0, 0);
	ccms_dataspace_init(&vp->v_ccms);
	TAILQ_INIT(&vp->v_namecache);
	RB_INIT(&vp->v_rbclean_tree);
	RB_INIT(&vp->v_rbdirty_tree);
	RB_INIT(&vp->v_rbhash_tree);
	return(TRUE);
}

/*
 * MPSAFE
 */
void
vnode_dtor(void *obj, void *private)
{
	struct vnode *vp = obj;

	ccms_dataspace_destroy(&vp->v_ccms);
}

/****************************************************************
 *			VX LOCKING FUNCTIONS			*
 ****************************************************************
 *
 * These functions lock vnodes for reclamation and deactivation related
 * activities.  The caller must already be holding some sort of reference
 * on the vnode.
 *
 * MPSAFE
 */
void
vx_lock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

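/*
 * Non-blocking version of vx_lock(); returns non-zero if the lock could
 * not be acquired immediately.
 */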
static int
vx_lock_nonblock(struct vnode *vp)
{
	return(lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT));
}

void
vx_unlock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_RELEASE);
}

/****************************************************************
 *			VNODE ACQUISITION FUNCTIONS		*
 ****************************************************************
 *
 * These functions must be used when accessing a vnode via an auxiliary
 * reference such as the namecache or free list, or when you wish to
 * do a combo ref+lock sequence.
 *
 * These functions are MANDATORY for any code chain accessing a vnode
 * whose activation state is not known.
 *
 * vget() can be called with LK_NOWAIT and will return EBUSY if the
 * lock cannot be immediately acquired.
 *
 * vget()/vput() are used when reactivation is desired.
 *
 * vx_get() and vx_put() are used when reactivation is not desired.
 */
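/*
 * vget: ref and lock a vnode, reactivating it if it was free or cached.
 * A minimal usage sketch (illustrative only):
 *
 *	if (vget(vp, LK_EXCLUSIVE) == 0) {
 *		... operate on the referenced, locked vnode ...
 *		vput(vp);
 *	}
 *
 * A non-zero return (e.g. ENOENT for a reclaimed vnode) means no ref or
 * lock is held and vput() must not be called.
 */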
int
vget(struct vnode *vp, int flags)
{
	int error;

	/*
	 * A lock type must be passed
	 */
	if ((flags & LK_TYPE_MASK) == 0) {
		panic("vget() called with no lock specified!");
		/* NOT REACHED */
	}

	/*
	 * Reference the structure and then acquire the lock.  0->1
	 * transitions and refs during termination are allowed here so
	 * call sysref directly.
	 *
	 * NOTE: The requested lock might be a shared lock and does
	 *	 not protect our access to the refcnt or other fields.
	 */
	sysref_get(&vp->v_sysref);
	if ((error = vn_lock(vp, flags)) != 0) {
		/*
		 * The lock failed, undo and return an error.
		 */
		sysref_put(&vp->v_sysref);
	} else if (vp->v_flag & VRECLAIMED) {
		/*
		 * The node is being reclaimed and cannot be reactivated
		 * any more, undo and return ENOENT.
		 */
		vn_unlock(vp);
		vrele(vp);
		error = ENOENT;
	} else {
		/*
		 * If the vnode is marked VFREE or VCACHED it needs to be
		 * reactivated, otherwise it had better already be active.
		 * VINACTIVE must also be cleared.
		 *
		 * In the VFREE/VCACHED case we have to throw away the
		 * sysref that was earmarking those cases and preventing
		 * the vnode from being destroyed.  Our sysref is still held.
		 *
		 * The spinlock is our only real protection here.
		 */
		spin_lock_wr(&vp->v_spinlock);
		if (vp->v_flag & VFREE) {
			__vbusy(vp);
			sysref_activate(&vp->v_sysref);
			spin_unlock_wr(&vp->v_spinlock);
			sysref_put(&vp->v_sysref);
		} else if (vp->v_flag & VCACHED) {
			_vclrflags(vp, VCACHED);
			sysref_activate(&vp->v_sysref);
			spin_unlock_wr(&vp->v_spinlock);
			sysref_put(&vp->v_sysref);
		} else {
			if (sysref_isinactive(&vp->v_sysref)) {
				sysref_activate(&vp->v_sysref);
				kprintf("Warning vp %p reactivation race\n",
					vp);
			}
			spin_unlock_wr(&vp->v_spinlock);
		}
		_vclrflags(vp, VINACTIVE);
		error = 0;
	}
	return(error);
}

/*
 * Unlock a vnode and release its reference; the normal counterpart to a
 * successful vget().
 *
 * MPSAFE
 */
void
vput(struct vnode *vp)
{
	vn_unlock(vp);
	vrele(vp);
}

/*
 * Acquire a ref and the VX (exclusive) lock on a vnode without
 * reactivating it.
 *
 * XXX The vx_*() locks should use auxrefs, not the main reference counter.
 *
 * MPSAFE
 */
void
vx_get(struct vnode *vp)
{
	sysref_get(&vp->v_sysref);
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

/*
 * MPSAFE
 */
int
vx_get_nonblock(struct vnode *vp)
{
	int error;

	sysref_get(&vp->v_sysref);
	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
	if (error)
		sysref_put(&vp->v_sysref);
	return(error);
}

/*
 * Release a VX lock that also held a ref on the vnode.
 *
 * vx_put needs to check for a VCACHED->VFREE transition to catch the
 * case where e.g. vnlru issues a vgone*().
 *
 * MPSAFE
 */
void
vx_put(struct vnode *vp)
{
	spin_lock_wr(&vp->v_spinlock);
	if ((vp->v_flag & VCACHED) && vshouldfree(vp)) {
		_vclrflags(vp, VCACHED);
		__vfree(vp);
	}
	spin_unlock_wr(&vp->v_spinlock);
	lockmgr(&vp->v_lock, LK_RELEASE);
	sysref_put(&vp->v_sysref);
}

/*
 * Try to reuse a vnode from the free list.  NOTE: The returned vnode
 * is not completely initialized.
 *
 * MPSAFE
 */
static
struct vnode *
allocfreevnode(void)
{
	struct vnode *vp;
	int count;

	for (count = 0; count < freevnodes; count++) {
		/*
		 * Note that regardless of how we block in this loop,
		 * we only get here if freevnodes != 0 so there
		 * had better be something on the list.
		 *
		 * Try to lock the first vnode on the free list.
		 * Cycle if we can't.
		 *
		 * XXX NOT MP SAFE
		 */
		spin_lock_wr(&vfs_spin);
		vp = TAILQ_FIRST(&vnode_free_list);
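		/*
		 * Skip the vnode_free_mid placeholder; it is only a list
		 * marker, not a real vnode.
		 */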
		if (vp == &vnode_free_mid)
			vp = TAILQ_NEXT(vp, v_freelist);
		if (vx_lock_nonblock(vp)) {
			KKASSERT(vp->v_flag & VFREE);
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
			TAILQ_INSERT_TAIL(&vnode_free_list,
					  vp, v_freelist);
			spin_unlock_wr(&vfs_spin);
			continue;
		}
		spin_unlock_wr(&vfs_spin);
#ifdef TRACKVNODE
		if ((ulong)vp == trackvnode)
			kprintf("allocfreevnode %p %08x\n", vp, vp->v_flag);
#endif
		/*
		 * Do not reclaim a vnode with auxiliary refs.  This includes
		 * namecache refs due to a related ncp being locked or having
		 * children.
		 */
		if (vp->v_auxrefs) {
			__vfreetail(vp);
			vx_unlock(vp);
			continue;
		}

		/*
		 * With the vnode locked we can safely remove it
		 * from the free list.  We inherit the reference
		 * that was previously associated with the vnode
		 * being on the free list.
		 */
		KKASSERT((vp->v_flag & (VFREE|VINACTIVE)) ==
			  (VFREE|VINACTIVE));
		KKASSERT(sysref_isinactive(&vp->v_sysref));
		__vbusy(vp);

		/*
		 * Holding the VX lock on an inactive vnode prevents it
		 * from being reactivated or reused.  New namecache
		 * associations can only be made using active vnodes.
		 *
		 * Another thread may be blocked on our vnode lock while
		 * holding a namecache lock.  We can only reuse this vnode
		 * if we can clear all namecache associations without
		 * blocking.
		 */
		if ((vp->v_flag & VRECLAIMED) == 0) {
			if (cache_inval_vp_nonblock(vp)) {
				__vfreetail(vp);
				vx_unlock(vp);
				continue;
			}
			vgone_vxlocked(vp);
			/* vnode is still VX locked */
		}

		/*
		 * We can reuse the vnode if no primary or auxiliary
		 * references remain other than ours, else put it
		 * back on the free list and keep looking.
		 *
		 * Either the free list inherits the last reference
		 * or we fall through and sysref_activate() the last
		 * reference.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made.
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		if (vp->v_auxrefs ||
		    !sysref_islastdeactivation(&vp->v_sysref)) {
			__vfreetail(vp);
			vx_unlock(vp);
			continue;
		}

		/*
		 * Return a VX locked vnode suitable for reuse.  The caller
		 * inherits the sysref.
		 */
		return(vp);
	}
	return(NULL);
}

/*
 * Obtain a new vnode from the freelist, allocating more if necessary.
 * The returned vnode is VX locked & refd.
 *
 * All new vnodes set the VAGE flags.  An open() of the vnode will
 * decrement the (2-bit) flags.  Vnodes which are opened several times
 * are thus retained in the cache over vnodes which are merely stat()d.
 *
 * MPSAFE
 */
struct vnode *
allocvnode(int lktimeout, int lkflags)
{
	struct vnode *vp;

	/*
	 * Try to reuse vnodes if we hit the max.  This case only occurs
	 * in certain large-memory (2G+) configurations.  We cannot
	 * attempt to directly reclaim vnodes due to nasty recursion
	 * problems.
	 */
	while (numvnodes - freevnodes > desiredvnodes)
		vnlru_proc_wait();

	/*
	 * Try to build up as many vnodes as we can before reallocating
	 * from the free list.  A vnode on the free list simply means
	 * that it is inactive with no resident pages.  It may or may not
	 * have been reclaimed and could have valuable information associated
	 * with it that we shouldn't throw away unless we really need to.
	 *
	 * HAMMER NOTE: Re-establishing a vnode is a fairly expensive
	 * operation for HAMMER but this should benefit UFS as well.
	 */
	if (freevnodes >= wantfreevnodes && numvnodes >= desiredvnodes)
		vp = allocfreevnode();
	else
		vp = NULL;
	if (vp == NULL) {
		vp = sysref_alloc(&vnode_sysref_class);
		lockmgr(&vp->v_lock, LK_EXCLUSIVE);
		numvnodes++;
	}

	/*
	 * We are using a managed sysref class, so vnode fields are only
	 * zeroed on initial allocation from the backing store, not
	 * on reallocation.  Thus we have to clear these fields for both
	 * reallocation and reuse.
	 */
#ifdef INVARIANTS
	if (vp->v_data)
		panic("cleaned vnode isn't");
	if (bio_track_active(&vp->v_track_read) ||
	    bio_track_active(&vp->v_track_write)) {
		panic("Clean vnode has pending I/O's");
	}
	if (vp->v_flag & VONWORKLST)
		panic("Clean vnode still pending on syncer worklist!");
	if (!RB_EMPTY(&vp->v_rbdirty_tree))
		panic("Clean vnode still has dirty buffers!");
	if (!RB_EMPTY(&vp->v_rbclean_tree))
		panic("Clean vnode still has clean buffers!");
	if (!RB_EMPTY(&vp->v_rbhash_tree))
		panic("Clean vnode still on hash tree!");
	KKASSERT(vp->v_mount == NULL);
#endif
	vp->v_flag = VAGE0 | VAGE1;
	vp->v_lastw = 0;
	vp->v_lasta = 0;
	vp->v_cstart = 0;
	vp->v_clen = 0;
	vp->v_socket = 0;
	vp->v_opencount = 0;
	vp->v_writecount = 0;	/* XXX */

	/*
	 * lktimeout only applies when LK_TIMELOCK is used, and only
	 * the pageout daemon uses it.  The timeout may not be zero
	 * or the pageout daemon can deadlock in low-VM situations.
	 */
	if (lktimeout == 0)
		lktimeout = hz / 10;
	lockreinit(&vp->v_lock, "vnode", lktimeout, lkflags);
	KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
	/* exclusive lock still held */

	/*
	 * Note: sysref needs to be activated to convert -0x40000000 to +1.
	 * The -0x40000000 comes from the last ref on reuse, and from
	 * sysref_init() on allocate.
	 */
	sysref_activate(&vp->v_sysref);
	vp->v_filesize = NOOFFSET;
	vp->v_type = VNON;
	vp->v_tag = 0;
	vp->v_ops = NULL;
	vp->v_data = NULL;
	KKASSERT(vp->v_mount == NULL);

	return (vp);
}

/*
 * Try to release up to n vnodes from the free list, returning the
 * number actually freed.
 *
 * MPSAFE
 */
int
freesomevnodes(int n)
{
	struct vnode *vp;
	int count = 0;

	while (n) {
		--n;
		if ((vp = allocfreevnode()) == NULL)
			break;
		vx_put(vp);
		--numvnodes;
		++count;	/* count the vnodes actually released */
	}
	return(count);
}