/*
 * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * External lock/ref-related vnode functions
 *
 * vs_state transition locking requirements:
 *
 *	INACTIVE -> CACHED|DYING	vx_lock(excl) + vfs_spin
 *	DYING    -> CACHED		vx_lock(excl)
 *	ACTIVE   -> INACTIVE		(none)       + v_spin + vfs_spin
 *	INACTIVE -> ACTIVE		vn_lock(any) + v_spin + vfs_spin
 *	CACHED   -> ACTIVE		vn_lock(any) + v_spin + vfs_spin
 *
 * NOTE: Switching to/from ACTIVE/INACTIVE requires v_spin and vfs_spin.
 *
 *	 Switching into ACTIVE also requires a vref and vnode lock, however
 *	 the vnode lock is allowed to be SHARED.
 *
 *	 Switching into a CACHED or DYING state requires an exclusive vnode
 *	 lock or vx_lock (which is almost the same thing).
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/sysctl.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

#include <sys/buf2.h>
#include <sys/thread2.h>

static void vnode_terminate(struct vnode *vp);

static MALLOC_DEFINE(M_VNODE, "vnodes", "vnode structures");

/*
 * The vnode free list holds inactive vnodes.  Aged inactive vnodes
 * are inserted prior to the mid point, and otherwise inserted
 * at the tail.
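 *
 * Editorial note (inferred from _vinactive() and the rover code below,
 * not part of the original comment): the inactive list is effectively
 * split into three sections by the vnode_inactive_mid1 and
 * vnode_inactive_mid2 marker vnodes -- section 1 (head..mid1) holds
 * vnodes with no cached data, section 2 (mid1..mid2) holds vnodes with
 * only swap-backed data, and section 3 (mid2..tail) holds vnodes with
 * resident pages.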
 */
TAILQ_HEAD(freelst, vnode);
static struct freelst	vnode_active_list;
static struct freelst	vnode_inactive_list;
static struct vnode	vnode_active_rover;
static struct vnode	vnode_inactive_mid1;
static struct vnode	vnode_inactive_mid2;
static struct vnode	vnode_inactive_rover;
static struct spinlock	vfs_spin = SPINLOCK_INITIALIZER(vfs_spin);
static enum { ROVER_MID1, ROVER_MID2 } rover_state = ROVER_MID2;

int activevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, activevnodes, CTLFLAG_RD,
	&activevnodes, 0, "Number of active nodes");
int cachedvnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, cachedvnodes, CTLFLAG_RD,
	&cachedvnodes, 0, "Number of total cached nodes");
int inactivevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, inactivevnodes, CTLFLAG_RD,
	&inactivevnodes, 0, "Number of inactive nodes");
static int wantfreevnodes = 25;
SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW,
	&wantfreevnodes, 0, "Desired number of free vnodes");
static int batchfreevnodes = 5;
SYSCTL_INT(_debug, OID_AUTO, batchfreevnodes, CTLFLAG_RW,
	&batchfreevnodes, 0, "Number of vnodes to free at once");
#ifdef TRACKVNODE
static ulong trackvnode;
SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
	&trackvnode, 0, "");
#endif

/*
 * Called from vfsinit()
 */
void
vfs_lock_init(void)
{
	TAILQ_INIT(&vnode_inactive_list);
	TAILQ_INIT(&vnode_active_list);
	TAILQ_INSERT_TAIL(&vnode_active_list, &vnode_active_rover, v_list);
	TAILQ_INSERT_TAIL(&vnode_inactive_list, &vnode_inactive_mid1, v_list);
	TAILQ_INSERT_TAIL(&vnode_inactive_list, &vnode_inactive_mid2, v_list);
	TAILQ_INSERT_TAIL(&vnode_inactive_list, &vnode_inactive_rover, v_list);
	spin_init(&vfs_spin);
	kmalloc_raise_limit(M_VNODE, 0);	/* unlimited */
}

/*
 * Misc functions
 */
static __inline
void
_vsetflags(struct vnode *vp, int flags)
{
	atomic_set_int(&vp->v_flag, flags);
}

static __inline
void
_vclrflags(struct vnode *vp, int flags)
{
	atomic_clear_int(&vp->v_flag, flags);
}

void
vsetflags(struct vnode *vp, int flags)
{
	_vsetflags(vp, flags);
}

void
vclrflags(struct vnode *vp, int flags)
{
	_vclrflags(vp, flags);
}

/*
 * Remove the vnode from the inactive list.
 *
 * _vactivate() may only be called while the vnode lock or VX lock is held.
 * The vnode spinlock need not be held.
 */
static __inline
void
_vactivate(struct vnode *vp)
{
#ifdef TRACKVNODE
	if ((ulong)vp == trackvnode)
		kprintf("_vactivate %p %08x\n", vp, vp->v_flag);
#endif
	spin_lock(&vfs_spin);
	KKASSERT(vp->v_state == VS_INACTIVE || vp->v_state == VS_CACHED);
	if (vp->v_state == VS_INACTIVE) {
		TAILQ_REMOVE(&vnode_inactive_list, vp, v_list);
		--inactivevnodes;
	}
	TAILQ_INSERT_TAIL(&vnode_active_list, vp, v_list);
	vp->v_state = VS_ACTIVE;
	++activevnodes;
	spin_unlock(&vfs_spin);
}

/*
 * Put a vnode on the inactive list.  The vnode must not currently reside on
 * any list (must be VS_CACHED).  Vnode should be VINACTIVE.
 *
 * Caller must hold v_spin
 */
static __inline
void
_vinactive(struct vnode *vp)
{
#ifdef TRACKVNODE
	if ((ulong)vp == trackvnode) {
		kprintf("_vinactive %p %08x\n", vp, vp->v_flag);
		print_backtrace(-1);
	}
#endif
	spin_lock(&vfs_spin);
	KKASSERT(vp->v_state == VS_CACHED);

	/*
	 * Distinguish between basically dead vnodes, vnodes with cached
	 * data, and vnodes without cached data.  A rover will shift the
	 * vnodes around as their cache status is lost.
	 */
	if (vp->v_flag & VRECLAIMED) {
		TAILQ_INSERT_HEAD(&vnode_inactive_list, vp, v_list);
	} else if (vp->v_object && vp->v_object->resident_page_count) {
		TAILQ_INSERT_TAIL(&vnode_inactive_list, vp, v_list);
	} else if (vp->v_object && vp->v_object->swblock_count) {
		TAILQ_INSERT_BEFORE(&vnode_inactive_mid2, vp, v_list);
	} else {
		TAILQ_INSERT_BEFORE(&vnode_inactive_mid1, vp, v_list);
	}
	++inactivevnodes;
	vp->v_state = VS_INACTIVE;
	spin_unlock(&vfs_spin);
}

static __inline
void
_vinactive_tail(struct vnode *vp)
{
#ifdef TRACKVNODE
	if ((ulong)vp == trackvnode)
		kprintf("_vinactive_tail %p %08x\n", vp, vp->v_flag);
#endif
	spin_lock(&vfs_spin);
	KKASSERT(vp->v_state == VS_CACHED);
	TAILQ_INSERT_TAIL(&vnode_inactive_list, vp, v_list);
	++inactivevnodes;
	vp->v_state = VS_INACTIVE;
	spin_unlock(&vfs_spin);
}

/*
 * Return a C boolean indicating whether we should put the vnode on the
 * inactive list (VS_INACTIVE) or leave it alone.
 *
 * This routine is only valid if the vnode is already either VS_INACTIVE or
 * VS_CACHED, or if it can become VS_INACTIVE or VS_CACHED via
 * vnode_terminate().
 *
 * WARNING! We used to indicate FALSE if the vnode had an object with
 *	    resident pages but we no longer do that because it makes
 *	    managing kern.maxvnodes difficult.  Instead we rely on vfree()
 *	    to place the vnode properly on the list.
 *
 * WARNING! This function is typically called with v_spin held.
 */
static __inline boolean_t
vshouldfree(struct vnode *vp)
{
	return (vp->v_auxrefs == 0);
}

/*
 * Add a ref to an active vnode.  This function should never be called
 * with an inactive vnode (use vget() instead), but might be called
 * with other states.
 */
void
vref(struct vnode *vp)
{
	KASSERT((VREFCNT(vp) > 0 && vp->v_state != VS_INACTIVE),
		("vref: bad refcnt %08x %d", vp->v_refcnt, vp->v_state));
	atomic_add_int(&vp->v_refcnt, 1);
}

/*
 * Release a ref on an active or inactive vnode.
 *
 * If VREF_FINALIZE is set this will deactivate the vnode on the 1->0
 * transition, otherwise we leave the vnode in the active list and
 * do a lockless transition to 0, which is very important for the
 * critical path.
 */
void
vrele(struct vnode *vp)
{
	for (;;) {
		int count = vp->v_refcnt;
		cpu_ccfence();
		KKASSERT((count & VREF_MASK) > 0);

		/*
		 * 2+ case
		 */
		if ((count & VREF_MASK) > 1) {
			if (atomic_cmpset_int(&vp->v_refcnt, count, count - 1))
				break;
			continue;
		}

		/*
		 * 1->0 transition case must handle possible finalization.
		 * When finalizing we transition 1->0x40000000.  Note that
		 * cachedvnodes is only adjusted on transitions to ->0.
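		 *
		 * (Editorial note, not part of the original comment: the
		 * 0x40000000 mentioned above is taken to be the value of
		 * VREF_TERMINATE, the placeholder bit the cmpset below
		 * installs in place of the last real reference.)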
		 */
		if (count & VREF_FINALIZE) {
			vx_lock(vp);
			if (atomic_cmpset_int(&vp->v_refcnt,
					      count, VREF_TERMINATE)) {
				vnode_terminate(vp);
				break;
			}
			vx_unlock(vp);
		} else {
			if (atomic_cmpset_int(&vp->v_refcnt, count, 0)) {
				atomic_add_int(&cachedvnodes, 1);
				break;
			}
		}
		/* retry */
	}
}

/*
 * Add an auxiliary data structure reference to the vnode.  Auxiliary
 * references do not change the state of the vnode or prevent it
 * from being deactivated, reclaimed, or placed on or removed from
 * the free list.
 *
 * An auxiliary reference DOES prevent the vnode from being destroyed,
 * allowing you to vx_lock() it, test state, etc.
 *
 * An auxiliary reference DOES NOT move a vnode out of the VS_INACTIVE
 * state once it has entered it.
 *
 * WARNING! vhold() must not acquire v_spin.  The spinlock may or may not
 *	    already be held by the caller.  vdrop() will clean up the
 *	    free list state.
 */
void
vhold(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, 1);
}

/*
 * Remove an auxiliary reference from the vnode.
 *
 * vdrop() must check for the case where a vnode is held past its
 * reclamation.  We use v_spin to interlock VS_CACHED -> VS_INACTIVE
 * transitions.
 */
void
vdrop(struct vnode *vp)
{
	for (;;) {
		int count = vp->v_auxrefs;
		cpu_ccfence();
		KKASSERT(count > 0);

		/*
		 * 2+ case
		 */
		if (count > 1) {
			if (atomic_cmpset_int(&vp->v_auxrefs, count, count - 1))
				break;
			continue;
		}

		/*
		 * 1->0 transition case must check for reclaimed vnodes that
		 * are expected to be placed on the inactive list.
		 *
		 * v_spin is required for the 1->0 transition.
		 *
		 * 1->0 and 0->1 transitions are allowed to race.  The
		 * vp simply remains on the inactive list.
		 */
		spin_lock(&vp->v_spin);
		if (atomic_cmpset_int(&vp->v_auxrefs, 1, 0)) {
			if (vp->v_state == VS_CACHED && vshouldfree(vp))
				_vinactive(vp);
			spin_unlock(&vp->v_spin);
			break;
		}
		spin_unlock(&vp->v_spin);
		/* retry */
	}
}

/*
 * This function is called with vp vx_lock'd when the last active reference
 * on the vnode is released, typically via vrele().  v_refcnt will be set
 * to VREF_TERMINATE.
 *
 * Additional vrefs are allowed to race but will not result in a reentrant
 * call to vnode_terminate() due to VREF_TERMINATE.
 *
 * NOTE: v_mount may be NULL due to assignment to dead_vnode_vops
 *
 * NOTE: The vnode may be marked inactive with dirty buffers
 *	 or dirty pages in its cached VM object still present.
 *
 * NOTE: VS_FREE should not be set on entry (the vnode was expected to
 *	 previously be active).  We lose control of the vnode the instant
 *	 it is placed on the free list.
 *
 *	 The VX lock is required when transitioning to VS_CACHED but is
 *	 not sufficient for the vshouldfree() interlocked test or when
 *	 transitioning away from VS_CACHED.  v_spin is also required for
 *	 those cases.
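 *
 *	 Editorial summary of the sequence implemented below (not part of
 *	 the original comment):
 *
 *	 1. Call VOP_INACTIVE() at most once, guarded by the VINACTIVE
 *	    flag (skipped if v_mount is NULL).
 *	 2. Move the vnode VS_ACTIVE -> VS_CACHED under v_spin + vfs_spin.
 *	 3. If no auxiliary refs remain, move it VS_CACHED -> VS_INACTIVE.
 *	 4. Release the VX lock.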
 */
void
vnode_terminate(struct vnode *vp)
{
	KKASSERT(vp->v_state == VS_ACTIVE);

	if ((vp->v_flag & VINACTIVE) == 0) {
		_vsetflags(vp, VINACTIVE);
		if (vp->v_mount)
			VOP_INACTIVE(vp);
		/* might deactivate page */
	}
	spin_lock(&vp->v_spin);
	if (vp->v_state == VS_ACTIVE) {
		spin_lock(&vfs_spin);
		KKASSERT(vp->v_state == VS_ACTIVE);
		TAILQ_REMOVE(&vnode_active_list, vp, v_list);
		--activevnodes;
		vp->v_state = VS_CACHED;
		spin_unlock(&vfs_spin);
	}
	if (vp->v_state == VS_CACHED && vshouldfree(vp))
		_vinactive(vp);
	spin_unlock(&vp->v_spin);
	vx_unlock(vp);
}

/****************************************************************
 *			VX LOCKING FUNCTIONS			*
 ****************************************************************
 *
 * These functions lock vnodes for reclamation and deactivation related
 * activities.  The caller must already be holding some sort of reference
 * on the vnode.
 *
 * MPSAFE
 */
void
vx_lock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

/*
 * The non-blocking version also uses a slightly different mechanic.
 * This function will explicitly fail not only if it cannot acquire
 * the lock normally, but also if the caller already holds a lock.
 *
 * The adjusted mechanic is used to close a loophole where complex
 * VOP_RECLAIM code can circle around recursively and allocate the
 * same vnode it is trying to destroy from the freelist.
 *
 * Any filesystem (aka UFS) which puts LK_CANRECURSE in lk_flags can
 * cause the incorrect behavior to occur.  If not for that lockmgr()
 * would do the right thing.
 */
static int
vx_lock_nonblock(struct vnode *vp)
{
	if (lockcountnb(&vp->v_lock))
		return(EBUSY);
	return(lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT));
}

void
vx_unlock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_RELEASE);
}

/****************************************************************
 *		VNODE ACQUISITION FUNCTIONS			*
 ****************************************************************
 *
 * These functions must be used when accessing a vnode that has no
 * chance of being destroyed in an SMP race.  That means the caller will
 * usually either hold an auxiliary reference (such as the namecache)
 * or hold some other lock that ensures that the vnode cannot be destroyed.
 *
 * These functions are MANDATORY for any code chain accessing a vnode
 * whose activation state is not known.
 *
 * vget() can be called with LK_NOWAIT and will return EBUSY if the
 * lock cannot be immediately acquired.
 *
 * vget()/vput() are used when reactivation is desired.
 *
 * vx_get() and vx_put() are used when reactivation is not desired.
 */
int
vget(struct vnode *vp, int flags)
{
	int error;

	/*
	 * A lock type must be passed
	 */
	if ((flags & LK_TYPE_MASK) == 0) {
		panic("vget() called with no lock specified!");
		/* NOT REACHED */
	}

	/*
	 * Reference the structure and then acquire the lock.
	 *
	 * NOTE: The requested lock might be a shared lock and does
	 *	 not protect our access to the refcnt or other fields.
	 */
	if (atomic_fetchadd_int(&vp->v_refcnt, 1) == 0)
		atomic_add_int(&cachedvnodes, -1);

	if ((error = vn_lock(vp, flags)) != 0) {
		/*
		 * The lock failed, undo and return an error.  This will not
		 * normally trigger a termination.
		 */
		vrele(vp);
	} else if (vp->v_flag & VRECLAIMED) {
		/*
		 * The node is being reclaimed and cannot be reactivated
		 * any more, undo and return ENOENT.
		 */
		vn_unlock(vp);
		vrele(vp);
		error = ENOENT;
	} else if (vp->v_state == VS_ACTIVE) {
		/*
		 * A VS_ACTIVE vnode coupled with the fact that we have
		 * a vnode lock (even if shared) prevents v_state from
		 * changing.  Since the vnode is not in a VRECLAIMED state,
		 * we can safely clear VINACTIVE.
		 *
		 * NOTE! Multiple threads may clear VINACTIVE if this is
		 *	 a shared lock.  This race is allowed.
		 */
		_vclrflags(vp, VINACTIVE);
		error = 0;
	} else {
		/*
		 * If the vnode is not VS_ACTIVE it must be reactivated
		 * in addition to clearing VINACTIVE.  An exclusive spin_lock
		 * is needed to manipulate the vnode's list.
		 *
		 * Because the lockmgr lock might be shared, we might race
		 * another reactivation, which we handle.  In this situation,
		 * however, the refcnt prevents other v_state races.
		 *
		 * As with above, clearing VINACTIVE is allowed to race other
		 * clearings of VINACTIVE.
		 */
		_vclrflags(vp, VINACTIVE);
		spin_lock(&vp->v_spin);

		switch(vp->v_state) {
		case VS_INACTIVE:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_CACHED:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_ACTIVE:
			spin_unlock(&vp->v_spin);
			break;
		case VS_DYING:
			spin_unlock(&vp->v_spin);
			panic("Impossible VS_DYING state");
			break;
		}
		error = 0;
	}
	return(error);
}

#ifdef DEBUG_VPUT

void
debug_vput(struct vnode *vp, const char *filename, int line)
{
	kprintf("vput(%p) %s:%d\n", vp, filename, line);
	vn_unlock(vp);
	vrele(vp);
}

#else

/*
 * MPSAFE
 */
void
vput(struct vnode *vp)
{
	vn_unlock(vp);
	vrele(vp);
}

#endif

/*
 * Acquire the vnode lock unguarded.
 *
 * XXX The vx_*() locks should use auxrefs, not the main reference counter.
 */
void
vx_get(struct vnode *vp)
{
	if (atomic_fetchadd_int(&vp->v_refcnt, 1) == 0)
		atomic_add_int(&cachedvnodes, -1);
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

int
vx_get_nonblock(struct vnode *vp)
{
	int error;

	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
	if (error == 0) {
		if (atomic_fetchadd_int(&vp->v_refcnt, 1) == 0)
			atomic_add_int(&cachedvnodes, -1);
	}
	return(error);
}

/*
 * Release a VX lock that also held a ref on the vnode.
 *
 * vx_put() needs to check for VS_CACHED->VS_INACTIVE transitions to catch
 * the case where e.g. vnlru issues a vgone*(), but should otherwise
 * not mess with the v_state.
 */
void
vx_put(struct vnode *vp)
{
	if (vp->v_state == VS_CACHED && vshouldfree(vp))
		_vinactive(vp);
	lockmgr(&vp->v_lock, LK_RELEASE);
	vrele(vp);
}

/*
 * The rover looks for vnodes past the midline with no cached data and
 * moves them to before the midline.  If we do not do this the midline
 * can wind up in a degenerate state.
 */
static
void
vnode_free_rover_scan_locked(void)
{
	struct vnode *vp;

	/*
	 * Get the vnode after the rover.  The rover roves between mid1 and
	 * the end so the only special vnode it can encounter is mid2.
	 */
	vp = TAILQ_NEXT(&vnode_inactive_rover, v_list);
	if (vp == &vnode_inactive_mid2) {
		vp = TAILQ_NEXT(vp, v_list);
		rover_state = ROVER_MID2;
	}
	KKASSERT(vp != &vnode_inactive_mid1);

	/*
	 * Start over if we finished the scan.
	 */
	TAILQ_REMOVE(&vnode_inactive_list, &vnode_inactive_rover, v_list);
	if (vp == NULL) {
		TAILQ_INSERT_AFTER(&vnode_inactive_list, &vnode_inactive_mid1,
				   &vnode_inactive_rover, v_list);
		rover_state = ROVER_MID1;
		return;
	}
	TAILQ_INSERT_AFTER(&vnode_inactive_list, vp,
			   &vnode_inactive_rover, v_list);

	/*
	 * Shift vp if appropriate.
	 */
	if (vp->v_object && vp->v_object->resident_page_count) {
		/*
		 * Promote vnode with resident pages to section 3.
		 */
		if (rover_state == ROVER_MID1) {
			TAILQ_REMOVE(&vnode_inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vnode_inactive_list, vp, v_list);
		}
	} else if (vp->v_object && vp->v_object->swblock_count) {
		/*
		 * Demote vnode with only swap pages to section 2.
		 */
		if (rover_state == ROVER_MID2) {
			TAILQ_REMOVE(&vnode_inactive_list, vp, v_list);
			TAILQ_INSERT_BEFORE(&vnode_inactive_mid2, vp, v_list);
		}
	} else {
		/*
		 * Demote vnode with no cached data to section 1.
		 */
		TAILQ_REMOVE(&vnode_inactive_list, vp, v_list);
		TAILQ_INSERT_BEFORE(&vnode_inactive_mid1, vp, v_list);
	}
}

/*
 * Called from vnlru_proc()
 */
void
vnode_free_rover_scan(int count)
{
	spin_lock(&vfs_spin);
	while (count > 0) {
		--count;
		vnode_free_rover_scan_locked();
	}
	spin_unlock(&vfs_spin);
}

/*
 * Try to reuse a vnode from the free list.  This function is somewhat
 * advisory in that NULL can be returned as a normal case, even if free
 * vnodes are present.
 *
 * The scan is limited because it can result in excessive CPU use during
 * periods of extreme vnode use.
 *
 * NOTE: The returned vnode is not completely initialized.
 *
 * MPSAFE
 */
static
struct vnode *
cleanfreevnode(int maxcount)
{
	struct vnode *vp;
	int count;

	/*
	 * Try to deactivate some vnodes cached on the active list.
	 */
	for (count = 0; count < maxcount; count++) {
		if (cachedvnodes - inactivevnodes < inactivevnodes)
			break;

		spin_lock(&vfs_spin);
		vp = TAILQ_NEXT(&vnode_active_rover, v_list);
		TAILQ_REMOVE(&vnode_active_list, &vnode_active_rover, v_list);
		if (vp == NULL) {
			TAILQ_INSERT_HEAD(&vnode_active_list,
					  &vnode_active_rover, v_list);
		} else {
			TAILQ_INSERT_AFTER(&vnode_active_list, vp,
					   &vnode_active_rover, v_list);
		}
		if (vp == NULL || vp->v_refcnt != 0) {
			spin_unlock(&vfs_spin);
			continue;
		}

		/*
		 * Try to deactivate the vnode.
		 */
		if (atomic_fetchadd_int(&vp->v_refcnt, 1) == 0)
			atomic_add_int(&cachedvnodes, -1);
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
		spin_unlock(&vfs_spin);
		vrele(vp);
	}

	/*
	 * Loop trying to lock the first vnode on the free list.
	 * Cycle if we can't.
	 *
	 * We use a bad hack in vx_lock_nonblock() which avoids
	 * the lock order reversal between vfs_spin and v_spin.
	 * This is very fragile code and I don't want to use
	 * vhold here.
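	 *
	 * (Editorial note, not part of the original comment: the "hack"
	 * is that vx_lock_nonblock() refuses the lock whenever any lock
	 * count is present and otherwise uses LK_NOWAIT, so we never
	 * block on v_lock while holding vfs_spin here.)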
	 */
	for (count = 0; count < maxcount; count++) {
		spin_lock(&vfs_spin);
		vnode_free_rover_scan_locked();
		vnode_free_rover_scan_locked();
		vp = TAILQ_FIRST(&vnode_inactive_list);
		while (vp == &vnode_inactive_mid1 ||
		       vp == &vnode_inactive_mid2 ||
		       vp == &vnode_inactive_rover) {
			vp = TAILQ_NEXT(vp, v_list);
		}
		if (vp == NULL) {
			spin_unlock(&vfs_spin);
			break;
		}
		if (vx_lock_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vnode_inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vnode_inactive_list, vp, v_list);
			spin_unlock(&vfs_spin);
			continue;
		}

		/*
		 * The vnode should be inactive (VREF_TERMINATE should still
		 * be set in v_refcnt).  Since we pulled it from the inactive
		 * list it should obviously not be VS_CACHED.  Activate the
		 * vnode.
		 *
		 * Once removed from the inactive list we inherit the
		 * VREF_TERMINATE which prevents loss of control while
		 * we mess with the vnode.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vnode_inactive_list, vp, v_list);
		--inactivevnodes;
		vp->v_state = VS_DYING;
		spin_unlock(&vfs_spin);
#ifdef TRACKVNODE
		if ((ulong)vp == trackvnode)
			kprintf("cleanfreevnode %p %08x\n", vp, vp->v_flag);
#endif
		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other hold users.
		 *
		 * We will make this test several times as auxrefs can
		 * get incremented on us without any spinlocks being held
		 * until we have removed all namecache and inode references
		 * to the vnode.
		 *
		 * The inactive list association reinherits the v_refcnt.
		 */
		if (vp->v_auxrefs) {
			vp->v_state = VS_CACHED;
			_vinactive_tail(vp);
			vx_unlock(vp);
			continue;
		}

		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		/*
		 * Holding the VX lock on an inactive vnode prevents it
		 * from being reactivated or reused.  New namecache
		 * associations can only be made using active vnodes.
		 *
		 * Another thread may be blocked on our vnode lock while
		 * holding a namecache lock.  We can only reuse this vnode
		 * if we can clear all namecache associations without
		 * blocking.
		 *
		 * Because VCACHED is already in the correct state (cleared)
		 * we cannot race other vdrop()s occurring at the same time
		 * and can safely place vp on the free list.
		 */
		if ((vp->v_flag & VRECLAIMED) == 0) {
			if (cache_inval_vp_nonblock(vp)) {
				vp->v_state = VS_CACHED;
				_vinactive_tail(vp);
				vx_unlock(vp);
				continue;
			}
			vgone_vxlocked(vp);
			/* vnode is still VX locked */
		}

		/*
		 * We can destroy the vnode if no primary or auxiliary
		 * references remain other than ours, else put it
		 * back on the free list and keep looking.
		 *
		 * Either the free list inherits the last reference
		 * or we fall through and sysref_activate() the last
		 * reference.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made.
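		 *
		 * (Editorial note, not part of the original comment: the
		 * refcnt test below masks off VREF_FINALIZE and requires
		 * that only the VREF_TERMINATE placeholder remain, i.e.
		 * no new real references were gained while reclaiming.)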
		 */
		KKASSERT(vp->v_state == VS_DYING);
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE) {
			vp->v_state = VS_CACHED;
			_vinactive_tail(vp);
			vx_unlock(vp);
			continue;
		}

		/*
		 * Nothing should have been able to access this vp.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 0,
			("vp %p badrefs %08x", vp, vp->v_refcnt));

		/*
		 * Return a VX locked vnode suitable for reuse.  The caller
		 * inherits the sysref.
		 */
		KKASSERT(vp->v_state == VS_DYING);
		return(vp);
	}
	return(NULL);
}

/*
 * Obtain a new vnode.  The returned vnode is VX locked & vrefd.
 *
 * All new vnodes set the VAGE flags.  An open() of the vnode will
 * decrement the (2-bit) flags.  Vnodes which are opened several times
 * are thus retained in the cache over vnodes which are merely stat()d.
 *
 * We always allocate the vnode.  Attempting to recycle existing vnodes
 * here can lead to numerous deadlocks, particularly with softupdates.
 */
struct vnode *
allocvnode(int lktimeout, int lkflags)
{
	struct vnode *vp;

	/*
	 * Do not flag for recycling unless there are enough free vnodes
	 * to recycle and the number of vnodes has exceeded our target.
	 */
	if (inactivevnodes >= wantfreevnodes && numvnodes >= desiredvnodes) {
		struct thread *td = curthread;
		if (td->td_lwp)
			atomic_set_int(&td->td_lwp->lwp_mpflags, LWP_MP_VNLRU);
	}

	vp = kmalloc(sizeof(*vp), M_VNODE, M_ZERO | M_WAITOK);

	lwkt_token_init(&vp->v_token, "vnode");
	lockinit(&vp->v_lock, "vnode", 0, 0);
	TAILQ_INIT(&vp->v_namecache);
	RB_INIT(&vp->v_rbclean_tree);
	RB_INIT(&vp->v_rbdirty_tree);
	RB_INIT(&vp->v_rbhash_tree);
	spin_init(&vp->v_spin);

	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
	atomic_add_int(&numvnodes, 1);
	vp->v_refcnt = 1;
	vp->v_flag = VAGE0 | VAGE1;

	/*
	 * lktimeout only applies when LK_TIMELOCK is used, and only
	 * the pageout daemon uses it.  The timeout may not be zero
	 * or the pageout daemon can deadlock in low-VM situations.
	 */
	if (lktimeout == 0)
		lktimeout = hz / 10;
	lockreinit(&vp->v_lock, "vnode", lktimeout, lkflags);
	KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
	/* exclusive lock still held */

	vp->v_filesize = NOOFFSET;
	vp->v_type = VNON;
	vp->v_tag = 0;
	vp->v_state = VS_CACHED;
	_vactivate(vp);

	return (vp);
}

/*
 * Called after a process has allocated a vnode via allocvnode()
 * and we detected that too many vnodes were present.
 *
 * Try to reuse vnodes if we hit the max.  This situation only
 * occurs in certain large-memory (2G+) situations on 32 bit systems,
 * or if kern.maxvnodes is set to very low values.
 *
 * This function is called just prior to a return to userland if the
 * process at some point had to allocate a new vnode during the last
 * system call and the vnode count was found to be excessive.
 *
 * WARNING: Sometimes numvnodes can blow out due to children being
 *	    present under directory vnodes in the namecache.  For the
 *	    moment use an if() instead of a while() and note that if
 *	    we were to use a while() we would still have to break out
 *	    if freesomevnodes() returned 0.
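 *
 * Editorial note (not part of the original comment): each call frees at
 * most batchfreevnodes vnodes (see the debug.batchfreevnodes sysctl
 * above), so an excessive vnode count is worked off incrementally over
 * successive returns to userland rather than all at once.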
 */
void
allocvnode_gc(void)
{
	if (numvnodes > desiredvnodes && cachedvnodes > wantfreevnodes)
		freesomevnodes(batchfreevnodes);
}

/*
 * MPSAFE
 */
int
freesomevnodes(int n)
{
	struct vnode *vp;
	int count = 0;

	while (n) {
		if ((vp = cleanfreevnode(n * 2)) == NULL)
			break;
		--n;
		++count;
		kfree(vp, M_VNODE);
		atomic_add_int(&numvnodes, -1);
	}
	return(count);
}