/*
 * Copyright (c) 2004,2013-2017 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * External lock/ref-related vnode functions
 *
 * vs_state transition locking requirements:
 *
 *	INACTIVE -> CACHED|DYING	vx_lock(excl) + vi->spin
 *	DYING    -> CACHED		vx_lock(excl)
 *	ACTIVE   -> INACTIVE		(none)	      + v_spin + vi->spin
 *	INACTIVE -> ACTIVE		vn_lock(any)  + v_spin + vi->spin
 *	CACHED   -> ACTIVE		vn_lock(any)  + v_spin + vi->spin
 *
 * NOTE: Switching to/from ACTIVE/INACTIVE requires v_spin and vi->spin.
 *
 *	 Switching into ACTIVE also requires a vref and vnode lock, however
 *	 the vnode lock is allowed to be SHARED.
 *
 *	 Switching into a CACHED or DYING state requires an exclusive vnode
 *	 lock or vx_lock (which is almost the same thing).
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/spinlock2.h>
#include <sys/sysctl.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

#define VACT_MAX	10
#define VACT_INC	2

static void vnode_terminate(struct vnode *vp);

static MALLOC_DEFINE(M_VNODE, "vnodes", "vnode structures");

/*
 * The vnode free list holds inactive vnodes.  Aged inactive vnodes
 * are inserted prior to the mid point, and otherwise inserted
 * at the tail.
 *
 * The vnode code goes to great lengths to avoid moving vnodes between
 * lists, but sometimes it is unavoidable.  For this situation we try to
 * avoid lock contention but we do not try very hard to avoid cache line
 * congestion.  A modestly sized hash table is used.
 */
#define VLIST_PRIME2	123462047LU
#define VLIST_XOR	(uintptr_t)0xab4582fa8322fb71LLU

#define VLIST_HASH(vp)	(((uintptr_t)vp ^ VLIST_XOR) % \
			 VLIST_PRIME2 % (unsigned)ncpus)

static struct vnode_index *vnode_list_hash;

int activevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, activevnodes, CTLFLAG_RD,
	&activevnodes, 0, "Number of active nodes");
int cachedvnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, cachedvnodes, CTLFLAG_RD,
	&cachedvnodes, 0, "Number of total cached nodes");
int inactivevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, inactivevnodes, CTLFLAG_RD,
	&inactivevnodes, 0, "Number of inactive nodes");
static int batchfreevnodes = 5;
SYSCTL_INT(_debug, OID_AUTO, batchfreevnodes, CTLFLAG_RW,
	&batchfreevnodes, 0, "Number of vnodes to free at once");
#ifdef TRACKVNODE
static u_long trackvnode;
SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
	&trackvnode, 0, "");
#endif

/*
 * Called from vfsinit()
 */
void
vfs_lock_init(void)
{
	int i;

	kmalloc_raise_limit(M_VNODE, 0);	/* unlimited */
	vnode_list_hash = kmalloc(sizeof(*vnode_list_hash) * ncpus,
				  M_VNODE, M_ZERO | M_WAITOK);
	for (i = 0; i < ncpus; ++i) {
		struct vnode_index *vi = &vnode_list_hash[i];

		TAILQ_INIT(&vi->inactive_list);
		TAILQ_INIT(&vi->active_list);
		TAILQ_INSERT_TAIL(&vi->active_list, &vi->active_rover, v_list);
		spin_init(&vi->spin, "vfslock");
	}
}

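/*
 * Illustrative sketch (not part of the original code): every per-cpu
 * list operation below locates a vnode's bucket the same way,
 *
 *	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];
 *
 *	spin_lock(&vi->spin);
 *	...manipulate vi->active_list / vi->inactive_list...
 *	spin_unlock(&vi->spin);
 *
 * VLIST_HASH() xors the vnode address with a constant and reduces it
 * modulo a large prime before taking the result modulo ncpus, spreading
 * vnodes across the buckets independently of allocator alignment.
 */
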
/*
 * Misc functions
 */
static __inline
void
_vsetflags(struct vnode *vp, int flags)
{
	atomic_set_int(&vp->v_flag, flags);
}

static __inline
void
_vclrflags(struct vnode *vp, int flags)
{
	atomic_clear_int(&vp->v_flag, flags);
}

void
vsetflags(struct vnode *vp, int flags)
{
	_vsetflags(vp, flags);
}

void
vclrflags(struct vnode *vp, int flags)
{
	_vclrflags(vp, flags);
}

/*
 * Place the vnode on the active list.
 *
 * Caller must hold vp->v_spin
 */
static __inline
void
_vactivate(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
	if ((u_long)vp == trackvnode)
		kprintf("_vactivate %p %08x\n", vp, vp->v_flag);
#endif
	spin_lock(&vi->spin);

	switch(vp->v_state) {
	case VS_ACTIVE:
		spin_unlock(&vi->spin);
		panic("_vactivate: already active");
		/* NOT REACHED */
		return;
	case VS_INACTIVE:
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		break;
	case VS_CACHED:
	case VS_DYING:
		break;
	}
	TAILQ_INSERT_TAIL(&vi->active_list, vp, v_list);
	vp->v_state = VS_ACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_activevnodes, 1);
}

/*
 * Put a vnode on the inactive list.
 *
 * Caller must hold v_spin
 */
static __inline
void
_vinactive(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
	if ((u_long)vp == trackvnode) {
		kprintf("_vinactive %p %08x\n", vp, vp->v_flag);
		print_backtrace(-1);
	}
#endif
	spin_lock(&vi->spin);

	/*
	 * Remove from active list if it is sitting on it
	 */
	switch(vp->v_state) {
	case VS_ACTIVE:
		TAILQ_REMOVE(&vi->active_list, vp, v_list);
		atomic_add_int(&mycpu->gd_activevnodes, -1);
		break;
	case VS_INACTIVE:
		spin_unlock(&vi->spin);
		panic("_vinactive: already inactive");
		/* NOT REACHED */
		return;
	case VS_CACHED:
	case VS_DYING:
		break;
	}

	/*
	 * Distinguish between basically dead vnodes, vnodes with cached
	 * data, and vnodes without cached data.  A rover will shift the
	 * vnodes around as their cache status is lost.
	 */
	if (vp->v_flag & VRECLAIMED) {
		TAILQ_INSERT_HEAD(&vi->inactive_list, vp, v_list);
	} else {
		TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
	}
	vp->v_state = VS_INACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_inactivevnodes, 1);
}

/*
 * Add a ref to an active vnode.  This function should never be called
 * with an inactive vnode (use vget() instead), but might be called
 * with other states.
 */
void
vref(struct vnode *vp)
{
	KASSERT((VREFCNT(vp) > 0 && vp->v_state != VS_INACTIVE),
		("vref: bad refcnt %08x %d", vp->v_refcnt, vp->v_state));
	atomic_add_int(&vp->v_refcnt, 1);
}

void
synchronizevnodecount(void)
{
	int nca = 0;
	int act = 0;
	int ina = 0;
	int i;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		nca += gd->gd_cachedvnodes;
		act += gd->gd_activevnodes;
		ina += gd->gd_inactivevnodes;
	}
	cachedvnodes = nca;
	activevnodes = act;
	inactivevnodes = ina;
}

/*
 * Count number of cached vnodes.  This is middling expensive so be
 * careful not to make this call in the critical path.  Each cpu tracks
 * its own accumulator.  The individual accumulators must be summed
 * together to get an accurate value.
 */
int
countcachedvnodes(void)
{
	int i;
	int n = 0;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		n += gd->gd_cachedvnodes;
	}
	return n;
}

int
countcachedandinactivevnodes(void)
{
	int i;
	int n = 0;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		n += gd->gd_cachedvnodes + gd->gd_inactivevnodes;
	}
	return n;
}

/*
 * Release a ref on an active or inactive vnode.
 *
 * Caller has no other requirements.
 *
 * If VREF_FINALIZE is set this will deactivate the vnode on the 1->0
 * transition, otherwise we leave the vnode in the active list and
 * do a lockless transition to 0, which is very important for the
 * critical path.
 *
 * (vrele() is not called when a vnode is being destroyed w/kfree)
 */
void
vrele(struct vnode *vp)
{
	int count;

	count = vp->v_refcnt;
	cpu_ccfence();

	for (;;) {
		KKASSERT((count & VREF_MASK) > 0);
		KKASSERT(vp->v_state == VS_ACTIVE ||
			 vp->v_state == VS_INACTIVE);

		/*
		 * 2+ case
		 */
		if ((count & VREF_MASK) > 1) {
			if (atomic_fcmpset_int(&vp->v_refcnt,
					       &count, count - 1)) {
				break;
			}
			continue;
		}

		/*
		 * 1->0 transition case must handle possible finalization.
		 * When finalizing we transition 1->0x40000000.  Note that
		 * cachedvnodes is only adjusted on transitions to ->0.
		 *
		 * WARNING! VREF_TERMINATE can be cleared at any point
		 *	    when the refcnt is non-zero (by vget()) and
		 *	    the vnode has not been reclaimed.  Thus
		 *	    transitions out of VREF_TERMINATE do not have
		 *	    to mess with cachedvnodes.
		 */
		if (count & VREF_FINALIZE) {
			vx_lock(vp);
			if (atomic_fcmpset_int(&vp->v_refcnt,
					       &count, VREF_TERMINATE)) {
				vnode_terminate(vp);
				break;
			}
			vx_unlock(vp);
		} else {
			if (atomic_fcmpset_int(&vp->v_refcnt, &count, 0)) {
				atomic_add_int(&mycpu->gd_cachedvnodes, 1);
				break;
			}
		}
		cpu_pause();
		/* retry */
	}
}

/*
 * Add an auxiliary data structure reference to the vnode.  Auxiliary
 * references do not change the state of the vnode or prevent deactivation
 * or reclamation of the vnode, but will prevent the vnode from being
 * destroyed (kfree()'d).
 *
 * WARNING!  vhold() must not acquire v_spin.  The spinlock may or may not
 *	     already be held by the caller.  vdrop() will clean up the
 *	     free list state.
 */
void
vhold(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, 1);
}

/*
 * Remove an auxiliary reference from the vnode.
 */
void
vdrop(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, -1);
}

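/*
 * Illustrative sketch (not part of the original code): how the two kinds
 * of references above are typically paired, assuming the caller already
 * has a vnode known to be referenced and active (e.g. from a prior
 * vget()):
 *
 *	vref(vp);	(additional real ref, vnode stays ACTIVE)
 *	...
 *	vrele(vp);	(may deactivate the vnode on the last release)
 *
 *	vhold(vp);	(aux ref: blocks kfree() of the vnode, but not
 *	...		 deactivation or reclamation)
 *	vdrop(vp);
 */
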
/*
 * This function is called on the 1->0 transition (which is actually
 * 1->VREF_TERMINATE) when VREF_FINALIZE is set, forcing deactivation
 * of the vnode.
 *
 * Additional vrefs are allowed to race but will not result in a reentrant
 * call to vnode_terminate() due to refcnt being VREF_TERMINATE.  This
 * prevents additional 1->0 transitions.
 *
 * ONLY A VGET() CAN REACTIVATE THE VNODE.
 *
 * Caller must hold the VX lock.
 *
 * NOTE: v_mount may be NULL due to assignment to dead_vnode_vops
 *
 * NOTE: The vnode may be marked inactive with dirty buffers
 *	 or dirty pages in its cached VM object still present.
 *
 * NOTE: VS_FREE should not be set on entry (the vnode was expected to
 *	 previously be active).  We lose control of the vnode the instant
 *	 it is placed on the free list.
 *
 *	 The VX lock is required when transitioning to VS_CACHED but is
 *	 not sufficient for the vshouldfree() interlocked test or when
 *	 transitioning away from VS_CACHED.  v_spin is also required for
 *	 those cases.
 */
static
void
vnode_terminate(struct vnode *vp)
{
	KKASSERT(vp->v_state == VS_ACTIVE);

	if ((vp->v_flag & VINACTIVE) == 0) {
		_vsetflags(vp, VINACTIVE);
		if (vp->v_mount)
			VOP_INACTIVE(vp);
	}
	spin_lock(&vp->v_spin);
	_vinactive(vp);
	spin_unlock(&vp->v_spin);

	vx_unlock(vp);
}

/****************************************************************
 *			VX LOCKING FUNCTIONS			*
 ****************************************************************
 *
 * These functions lock vnodes for reclamation and deactivation related
 * activities.  The caller must already be holding some sort of reference
 * on the vnode.
 */
void
vx_lock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

void
vx_unlock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_RELEASE);
}

/****************************************************************
 *		   VNODE ACQUISITION FUNCTIONS			*
 ****************************************************************
 *
 * These functions must be used when accessing a vnode that has no
 * chance of being destroyed in a SMP race.  That means the caller will
 * usually either hold an auxiliary reference (such as the namecache)
 * or hold some other lock that ensures that the vnode cannot be destroyed.
 *
 * These functions are MANDATORY for any code chain accessing a vnode
 * whose activation state is not known.
 *
 * vget() can be called with LK_NOWAIT and will return EBUSY if the
 * lock cannot be immediately acquired.
 *
 * vget()/vput() are used when reactivation is desired.
 *
 * vx_get() and vx_put() are used when reactivation is not desired.
 */
int
vget(struct vnode *vp, int flags)
{
	int error;

	/*
	 * A lock type must be passed
	 */
	if ((flags & LK_TYPE_MASK) == 0) {
		panic("vget() called with no lock specified!");
		/* NOT REACHED */
	}

	/*
	 * Reference the structure and then acquire the lock.
	 *
	 * NOTE: The requested lock might be a shared lock and does
	 *	 not protect our access to the refcnt or other fields.
	 */
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);

	if ((error = vn_lock(vp, flags | LK_FAILRECLAIM)) != 0) {
		/*
		 * The lock failed, undo and return an error.  This will not
		 * normally trigger a termination.
		 */
		vrele(vp);
	} else if (vp->v_flag & VRECLAIMED) {
		/*
		 * The node is being reclaimed and cannot be reactivated
		 * any more, undo and return ENOENT.
		 */
		vn_unlock(vp);
		vrele(vp);
		error = ENOENT;
	} else if (vp->v_state == VS_ACTIVE) {
		/*
		 * A VS_ACTIVE vnode coupled with the fact that we have
		 * a vnode lock (even if shared) prevents v_state from
		 * changing.  Since the vnode is not in a VRECLAIMED state,
		 * we can safely clear VINACTIVE.
		 *
		 * It is possible for a shared lock to cause a race with
		 * another thread that is also in the process of clearing
		 * VREF_TERMINATE, meaning that we might return with it still
		 * set and then assert in a later vref().  The solution is to
		 * unconditionally clear VREF_TERMINATE here as well.
		 *
		 * NOTE! Multiple threads may clear VINACTIVE if this is
		 *	 a shared lock.  This race is allowed.
		 */
		_vclrflags(vp, VINACTIVE);	/* SMP race ok */
		vp->v_act += VACT_INC;
		if (vp->v_act > VACT_MAX)	/* SMP race ok */
			vp->v_act = VACT_MAX;
		error = 0;
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE);
	} else {
		/*
		 * If the vnode is not VS_ACTIVE it must be reactivated
		 * in addition to clearing VINACTIVE.  An exclusive spin_lock
		 * is needed to manipulate the vnode's list.
		 *
		 * Because the lockmgr lock might be shared, we might race
		 * another reactivation, which we handle.  In this situation,
		 * however, the refcnt prevents other v_state races.
		 *
		 * As with above, clearing VINACTIVE is allowed to race other
		 * clearings of VINACTIVE.
		 *
		 * VREF_TERMINATE and VREF_FINALIZE can only be cleared when
		 * the refcnt is non-zero and the vnode has not been
		 * reclaimed.  This also means that the transitions do
		 * not affect cachedvnodes.
		 *
		 * It is possible for a shared lock to cause a race with
		 * another thread that is also in the process of clearing
		 * VREF_TERMINATE, meaning that we might return with it still
		 * set and then assert in a later vref().  The solution is to
		 * unconditionally clear VREF_TERMINATE here as well.
		 */
		_vclrflags(vp, VINACTIVE);
		vp->v_act += VACT_INC;
		if (vp->v_act > VACT_MAX)	/* SMP race ok */
			vp->v_act = VACT_MAX;
		spin_lock(&vp->v_spin);

		switch(vp->v_state) {
		case VS_INACTIVE:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_CACHED:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_ACTIVE:
			atomic_clear_int(&vp->v_refcnt, VREF_FINALIZE |
							VREF_TERMINATE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_DYING:
			spin_unlock(&vp->v_spin);
			panic("Impossible VS_DYING state");
			break;
		}
		error = 0;
	}
	return(error);
}

#ifdef DEBUG_VPUT

void
debug_vput(struct vnode *vp, const char *filename, int line)
{
	kprintf("vput(%p) %s:%d\n", vp, filename, line);
	vn_unlock(vp);
	vrele(vp);
}

#else

void
vput(struct vnode *vp)
{
	vn_unlock(vp);
	vrele(vp);
}

#endif

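/*
 * Illustrative sketch (not part of the original code): the usual pattern
 * for code that reaches a vnode through an auxiliary reference (such as
 * the namecache) and wants it locked and (re)activated:
 *
 *	error = vget(vp, LK_SHARED);		(or LK_EXCLUSIVE)
 *	if (error == 0) {
 *		...operate on the locked, referenced vnode...
 *		vput(vp);			(vn_unlock() + vrele())
 *	}
 *
 * With LK_NOWAIT, vget() returns EBUSY if the lock cannot be acquired
 * immediately; ENOENT indicates the vnode was reclaimed while the lock
 * was being acquired.
 */
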
/*
 * Acquire the vnode lock unguarded.
 *
 * The non-blocking version also uses a slightly different mechanic.
 * This function will explicitly fail not only if it cannot acquire
 * the lock normally, but also if the caller already holds a lock.
 *
 * The adjusted mechanic is used to close a loophole where complex
 * VOP_RECLAIM code can circle around recursively and allocate the
 * same vnode it is trying to destroy from the freelist.
 *
 * Any filesystem (aka UFS) which puts LK_CANRECURSE in lk_flags can
 * cause the incorrect behavior to occur.  If not for that, lockmgr()
 * would do the right thing.
 *
 * XXX The vx_*() locks should use auxrefs, not the main reference counter.
 */
void
vx_get(struct vnode *vp)
{
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

int
vx_get_nonblock(struct vnode *vp)
{
	int error;

	if (lockinuse(&vp->v_lock))
		return(EBUSY);
	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
	if (error == 0) {
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	}
	return(error);
}

/*
 * Release a VX lock that also held a ref on the vnode.  vrele() will handle
 * any needed state transitions.
 *
 * However, filesystems use this function to get rid of unwanted new vnodes
 * so try to get the vnode on the correct queue in that case.
 */
void
vx_put(struct vnode *vp)
{
	if (vp->v_type == VNON || vp->v_type == VBAD)
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
	lockmgr(&vp->v_lock, LK_RELEASE);
	vrele(vp);
}

/*
 * Try to reuse a vnode from the free list.  This function is somewhat
 * advisory in that NULL can be returned as a normal case, even if free
 * vnodes are present.
 *
 * The scan is limited because it can result in excessive CPU use during
 * periods of extreme vnode use.
 *
 * NOTE: The returned vnode is not completely initialized.
 */
static
struct vnode *
cleanfreevnode(int maxcount)
{
	struct vnode_index *vi;
	struct vnode *vp;
	int count;
	int trigger = (long)vmstats.v_page_count / (activevnodes * 2 + 1);
	int ri;
	int cpu_count;

	/*
	 * Try to deactivate some vnodes cached on the active list.
	 */
	if (countcachedvnodes() < inactivevnodes)
		goto skip;

	ri = vnode_list_hash[mycpu->gd_cpuid].deac_rover + 1;

	for (count = 0; count < maxcount * 2; ++count, ++ri) {
		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

		spin_lock(&vi->spin);

		vp = TAILQ_NEXT(&vi->active_rover, v_list);
		TAILQ_REMOVE(&vi->active_list, &vi->active_rover, v_list);
		if (vp == NULL) {
			TAILQ_INSERT_HEAD(&vi->active_list,
					  &vi->active_rover, v_list);
		} else {
			TAILQ_INSERT_AFTER(&vi->active_list, vp,
					   &vi->active_rover, v_list);
		}
		if (vp == NULL) {
			spin_unlock(&vi->spin);
			continue;
		}
		if ((vp->v_refcnt & VREF_MASK) != 0) {
			spin_unlock(&vi->spin);
			vp->v_act += VACT_INC;
			if (vp->v_act > VACT_MAX)	/* SMP race ok */
				vp->v_act = VACT_MAX;
			continue;
		}

		/*
		 * decrement by less if the vnode's object has a lot of
		 * VM pages.  XXX possible SMP races.
		 */
		if (vp->v_act > 0) {
			vm_object_t obj;
			if ((obj = vp->v_object) != NULL &&
			    obj->resident_page_count >= trigger) {
				vp->v_act -= 1;
			} else {
				vp->v_act -= VACT_INC;
			}
			if (vp->v_act < 0)
				vp->v_act = 0;
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * Try to deactivate the vnode.
		 */
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);

		spin_unlock(&vi->spin);
		vrele(vp);
	}

	vnode_list_hash[mycpu->gd_cpuid].deac_rover = ri;

skip:
	/*
	 * Loop trying to lock the first vnode on the free list.
	 * Cycle if we can't.
	 */
	cpu_count = ncpus;
	ri = vnode_list_hash[mycpu->gd_cpuid].free_rover + 1;

	for (count = 0; count < maxcount; ++count, ++ri) {
		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

		spin_lock(&vi->spin);

		vp = TAILQ_FIRST(&vi->inactive_list);
		if (vp == NULL) {
			spin_unlock(&vi->spin);
			if (--cpu_count == 0)
				break;
			ri = (ri + 16) & ~15;
			--ri;
			continue;
		}

		/*
		 * non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * Because we are holding vfs_spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vfs_spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
		spin_unlock(&vi->spin);
#ifdef TRACKVNODE
		if ((u_long)vp == trackvnode)
			kprintf("cleanfreevnode %p %08x\n", vp, vp->v_flag);
#endif

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other hold users.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
		 */
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
failed:
			if (vp->v_state == VS_INACTIVE) {
				spin_lock(&vi->spin);
				if (vp->v_state == VS_INACTIVE) {
					TAILQ_REMOVE(&vi->inactive_list,
						     vp, v_list);
					TAILQ_INSERT_TAIL(&vi->inactive_list,
							  vp, v_list);
				}
				spin_unlock(&vi->spin);
			}
			vx_put(vp);
			continue;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			if (cache_inval_vp_nonblock(vp))
				goto failed;
			vgone_vxlocked(vp);
			/* vnode is still VX locked */
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		spin_lock(&vi->spin);
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			spin_unlock(&vi->spin);
			goto failed;
		}
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		vp->v_state = VS_DYING;
		spin_unlock(&vi->spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));

		/*
		 * Return a VX locked vnode suitable for reuse.
		 */
		vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
		return(vp);
	}
	vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
	return(NULL);
}

/*
 * Obtain a new vnode.  The returned vnode is VX locked & vrefd.
 *
 * All new vnodes set the VAGE flags.  An open() of the vnode will
 * decrement the (2-bit) flags.  Vnodes which are opened several times
 * are thus retained in the cache over vnodes which are merely stat()d.
 *
 * We attempt to reuse an already-recycled vnode from our pcpu inactive
 * queue first, and allocate otherwise.  Attempting to recycle inactive
 * vnodes here can lead to numerous deadlocks, particularly with
 * softupdates.
 */
struct vnode *
allocvnode(int lktimeout, int lkflags)
{
	struct vnode *vp;
	struct vnode_index *vi;

	/*
	 * lktimeout only applies when LK_TIMELOCK is used, and only
	 * the pageout daemon uses it.  The timeout may not be zero
	 * or the pageout daemon can deadlock in low-VM situations.
	 */
	if (lktimeout == 0)
		lktimeout = hz / 10;

	/*
	 * Do not flag for synchronous recyclement unless there are enough
	 * freeable vnodes to recycle and the number of vnodes has
	 * significantly exceeded our target.  We want the normal vnlru
	 * process to handle the cleaning (at 9/10's) before we are forced
	 * to flag it here at 11/10's for userexit path processing.
	 */
	if (numvnodes >= maxvnodes * 11 / 10 &&
	    cachedvnodes + inactivevnodes >= maxvnodes * 5 / 10) {
		struct thread *td = curthread;
		if (td->td_lwp)
			atomic_set_int(&td->td_lwp->lwp_mpflags, LWP_MP_VNLRU);
	}

	/*
	 * Try to trivially reuse a reclaimed vnode from the head of the
	 * inactive list for this cpu.  Any vnode cycling which occurs
	 * which terminates the vnode will cause it to be returned to the
	 * same pcpu structure (e.g. unlink calls).
	 */
	vi = &vnode_list_hash[mycpuid];
	spin_lock(&vi->spin);

	vp = TAILQ_FIRST(&vi->inactive_list);
	if (vp && (vp->v_flag & VRECLAIMED)) {
		/*
		 * non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
			spin_unlock(&vi->spin);
			goto slower;
		}

		/*
		 * Because we are holding vfs_spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vfs_spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
#ifdef TRACKVNODE
		if ((u_long)vp == trackvnode)
			kprintf("allocvnode %p %08x\n", vp, vp->v_flag);
#endif

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other hold users.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
		 */
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			if (vp->v_state == VS_INACTIVE) {
				TAILQ_REMOVE(&vi->inactive_list,
					     vp, v_list);
				TAILQ_INSERT_TAIL(&vi->inactive_list,
						  vp, v_list);
			}
			spin_unlock(&vi->spin);
			vx_put(vp);
			goto slower;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			spin_unlock(&vi->spin);
			vx_put(vp);
			goto slower;
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		vp->v_state = VS_DYING;
		spin_unlock(&vi->spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
		 *
		 * At this point we can kfree() the vnode if we want to.
		 * Instead, we reuse it for the allocation.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));
		bzero(vp, sizeof(*vp));
	} else {
		spin_unlock(&vi->spin);
slower:
		vp = kmalloc(sizeof(*vp), M_VNODE, M_ZERO | M_WAITOK);
		atomic_add_int(&numvnodes, 1);
	}

	lwkt_token_init(&vp->v_token, "vnode");
	lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
	TAILQ_INIT(&vp->v_namecache);
	RB_INIT(&vp->v_rbclean_tree);
	RB_INIT(&vp->v_rbdirty_tree);
	RB_INIT(&vp->v_rbhash_tree);
	spin_init(&vp->v_spin, "allocvnode");

	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
	vp->v_refcnt = 1;
	vp->v_flag = VAGE0 | VAGE1;
	vp->v_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;

	KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
	/* exclusive lock still held */

	vp->v_filesize = NOOFFSET;
	vp->v_type = VNON;
	vp->v_tag = 0;
	vp->v_state = VS_CACHED;
	_vactivate(vp);

	return (vp);
}

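/*
 * Illustrative sketch (not part of the original code): a consumer of
 * allocvnode() receives a vnode that is VX locked and holds one ref.
 * If the new vnode turns out to be unwanted, vx_put() above is the
 * documented way to dispose of it:
 *
 *	vp = allocvnode(lktimeout, lkflags);
 *	...initialize v_type, v_data and associate with the mount...
 *	if (the setup fails)
 *		vx_put(vp);	(re-queues the unwanted vnode)
 */
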
/*
 * Called after a process has allocated a vnode via allocvnode()
 * and we detected that too many vnodes were present.
 *
 * This function is called just prior to a return to userland if the
 * process at some point had to allocate a new vnode during the last
 * system call and the vnode count was found to be excessive.
 *
 * This is a synchronous path that we do not normally want to execute.
 *
 * Flagged at >= 11/10's, runs if >= 10/10, vnlru runs at 9/10.
 *
 * WARNING: Sometimes numvnodes can blow out due to children being
 *	    present under directory vnodes in the namecache.  For the
 *	    moment use an if() instead of a while() and note that if
 *	    we were to use a while() we would still have to break out
 *	    if freesomevnodes() returned 0.  vnlru will also be trying
 *	    hard to free vnodes at the same time (with a lower trigger
 *	    pointer).
 */
void
allocvnode_gc(void)
{
	if (numvnodes >= maxvnodes &&
	    countcachedandinactivevnodes() >= maxvnodes * 5 / 10) {
		freesomevnodes(batchfreevnodes);
	}
}

int
freesomevnodes(int n)
{
	struct vnode *vp;
	int count = 0;

	while (n) {
		if ((vp = cleanfreevnode(n)) == NULL)
			break;
		vx_unlock(vp);
		--n;
		++count;
		kfree(vp, M_VNODE);
		atomic_add_int(&numvnodes, -1);
	}
	return(count);
}