/*
 * Copyright (c) 2004,2013-2017 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * External lock/ref-related vnode functions
 *
 * vs_state transition locking requirements:
 *
 *	INACTIVE -> CACHED|DYING	vx_lock(excl) + vi->spin
 *	DYING    -> CACHED		vx_lock(excl)
 *	ACTIVE   -> INACTIVE		(none)       + v_spin + vi->spin
 *	INACTIVE -> ACTIVE		vn_lock(any) + v_spin + vi->spin
 *	CACHED   -> ACTIVE		vn_lock(any) + v_spin + vi->spin
 *
 * NOTE: Switching to/from ACTIVE/INACTIVE requires v_spin and vi->spin.
 *
 *	 Switching into ACTIVE also requires a vref and vnode lock, however
 *	 the vnode lock is allowed to be SHARED.
 *
 *	 Switching into a CACHED or DYING state requires an exclusive vnode
 *	 lock or vx_lock (which is almost the same thing).
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/spinlock2.h>
#include <sys/sysctl.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

#define VACT_MAX	10
#define VACT_INC	2

static void vnode_terminate(struct vnode *vp);

static MALLOC_DEFINE(M_VNODE, "vnodes", "vnode structures");

/*
 * The vnode free list holds inactive vnodes.  Aged inactive vnodes
 * are inserted prior to the mid point, and otherwise inserted
 * at the tail.
 *
 * The vnode code goes to great lengths to avoid moving vnodes between
 * lists, but sometimes it is unavoidable.  For this situation we try to
 * avoid lock contention but we do not try very hard to avoid cache line
 * congestion.  A modestly sized hash table is used.
 */
#define VLIST_PRIME2	123462047LU
#define VLIST_XOR	(uintptr_t)0xab4582fa8322fb71LLU

#define VLIST_HASH(vp)	(((uintptr_t)vp ^ VLIST_XOR) % \
			 VLIST_PRIME2 % (unsigned)ncpus)

static struct vnode_index *vnode_list_hash;

int activevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, activevnodes, CTLFLAG_RD,
	&activevnodes, 0, "Number of active nodes");
int cachedvnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, cachedvnodes, CTLFLAG_RD,
	&cachedvnodes, 0, "Number of total cached nodes");
int inactivevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, inactivevnodes, CTLFLAG_RD,
	&inactivevnodes, 0, "Number of inactive nodes");
static int batchfreevnodes = 5;
SYSCTL_INT(_debug, OID_AUTO, batchfreevnodes, CTLFLAG_RW,
	&batchfreevnodes, 0, "Number of vnodes to free at once");
#ifdef TRACKVNODE
static u_long trackvnode;
SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
	&trackvnode, 0, "");
#endif

/*
 * Called from vfsinit()
 */
void
vfs_lock_init(void)
{
	int i;

	kmalloc_raise_limit(M_VNODE, 0);	/* unlimited */
	vnode_list_hash = kmalloc(sizeof(*vnode_list_hash) * ncpus,
				  M_VNODE, M_ZERO | M_WAITOK);
	for (i = 0; i < ncpus; ++i) {
		struct vnode_index *vi = &vnode_list_hash[i];

		TAILQ_INIT(&vi->inactive_list);
		TAILQ_INIT(&vi->active_list);
		TAILQ_INSERT_TAIL(&vi->active_list, &vi->active_rover, v_list);
		spin_init(&vi->spin, "vfslock");
	}
}

/*
 * Misc functions
 */
static __inline
void
_vsetflags(struct vnode *vp, int flags)
{
	atomic_set_int(&vp->v_flag, flags);
}

static __inline
void
_vclrflags(struct vnode *vp, int flags)
{
	atomic_clear_int(&vp->v_flag, flags);
}

void
vsetflags(struct vnode *vp, int flags)
{
	_vsetflags(vp, flags);
}

void
vclrflags(struct vnode *vp, int flags)
{
	_vclrflags(vp, flags);
}

/*
 * Place the vnode on the active list.
 *
 * Caller must hold vp->v_spin
 */
static __inline
void
_vactivate(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
	if ((u_long)vp == trackvnode)
		kprintf("_vactivate %p %08x\n", vp, vp->v_flag);
#endif
	spin_lock(&vi->spin);

	switch(vp->v_state) {
	case VS_ACTIVE:
		spin_unlock(&vi->spin);
		panic("_vactivate: already active");
		/* NOT REACHED */
		return;
	case VS_INACTIVE:
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		break;
	case VS_CACHED:
	case VS_DYING:
		break;
	}
	TAILQ_INSERT_TAIL(&vi->active_list, vp, v_list);
	vp->v_state = VS_ACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_activevnodes, 1);
}
/*
 * Put a vnode on the inactive list.
 *
 * Caller must hold v_spin
 */
static __inline
void
_vinactive(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
	if ((u_long)vp == trackvnode) {
		kprintf("_vinactive %p %08x\n", vp, vp->v_flag);
		print_backtrace(-1);
	}
#endif
	spin_lock(&vi->spin);

	/*
	 * Remove from active list if it is sitting on it
	 */
	switch(vp->v_state) {
	case VS_ACTIVE:
		TAILQ_REMOVE(&vi->active_list, vp, v_list);
		atomic_add_int(&mycpu->gd_activevnodes, -1);
		break;
	case VS_INACTIVE:
		spin_unlock(&vi->spin);
		panic("_vinactive: already inactive");
		/* NOT REACHED */
		return;
	case VS_CACHED:
	case VS_DYING:
		break;
	}

	/*
	 * Distinguish between basically dead vnodes, vnodes with cached
	 * data, and vnodes without cached data.  A rover will shift the
	 * vnodes around as their cache status is lost.
	 */
	if (vp->v_flag & VRECLAIMED) {
		TAILQ_INSERT_HEAD(&vi->inactive_list, vp, v_list);
	} else {
		TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
	}
	vp->v_state = VS_INACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_inactivevnodes, 1);
}

/*
 * Add a ref to an active vnode.  This function should never be called
 * with an inactive vnode (use vget() instead), but might be called
 * with other states.
 */
void
vref(struct vnode *vp)
{
	KASSERT((VREFCNT(vp) > 0 && vp->v_state != VS_INACTIVE),
		("vref: bad refcnt %08x %d", vp->v_refcnt, vp->v_state));
	atomic_add_int(&vp->v_refcnt, 1);
}

void
synchronizevnodecount(void)
{
	int nca = 0;
	int act = 0;
	int ina = 0;
	int i;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		nca += gd->gd_cachedvnodes;
		act += gd->gd_activevnodes;
		ina += gd->gd_inactivevnodes;
	}
	cachedvnodes = nca;
	activevnodes = act;
	inactivevnodes = ina;
}

/*
 * Count number of cached vnodes.  This is middling expensive so be
 * careful not to make this call in the critical path.  Each cpu tracks
 * its own accumulator.  The individual accumulators must be summed
 * together to get an accurate value.
 */
int
countcachedvnodes(void)
{
	int i;
	int n = 0;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		n += gd->gd_cachedvnodes;
	}
	return n;
}

int
countcachedandinactivevnodes(void)
{
	int i;
	int n = 0;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		n += gd->gd_cachedvnodes + gd->gd_inactivevnodes;
	}
	return n;
}
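
/*
 * Illustrative sketch (not part of the original source): how a caller
 * outside the critical path might consume the per-cpu accumulators
 * summed above.  The function name and the kprintf formatting are
 * hypothetical and exist only to show the intended use of
 * synchronizevnodecount() before reading the global totals.
 */
#if 0
static void
example_report_vnode_counts(void)
{
	/*
	 * Fold the per-cpu gd_*vnodes accumulators into the global
	 * activevnodes/cachedvnodes/inactivevnodes totals, then report.
	 */
	synchronizevnodecount();
	kprintf("vnodes: %d active, %d cached, %d inactive\n",
		activevnodes, cachedvnodes, inactivevnodes);
}
#endif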
/*
 * Release a ref on an active or inactive vnode.
 *
 * Caller has no other requirements.
 *
 * If VREF_FINALIZE is set this will deactivate the vnode on the 1->0
 * transition, otherwise we leave the vnode in the active list and
 * do a lockless transition to 0, which is very important for the
 * critical path.
 *
 * (vrele() is not called when a vnode is being destroyed w/kfree)
 */
void
vrele(struct vnode *vp)
{
	int count;

#if 1
	count = vp->v_refcnt;
	cpu_ccfence();

	for (;;) {
		KKASSERT((count & VREF_MASK) > 0);
		KKASSERT(vp->v_state == VS_ACTIVE ||
			 vp->v_state == VS_INACTIVE);

		/*
		 * 2+ case
		 */
		if ((count & VREF_MASK) > 1) {
			if (atomic_fcmpset_int(&vp->v_refcnt,
					       &count, count - 1)) {
				break;
			}
			continue;
		}

		/*
		 * 1->0 transition case must handle possible finalization.
		 * When finalizing we transition 1->0x40000000.  Note that
		 * cachedvnodes is only adjusted on transitions to ->0.
		 *
		 * WARNING! VREF_TERMINATE can be cleared at any point
		 *	    when the refcnt is non-zero (by vget()) and
		 *	    the vnode has not been reclaimed.  Thus
		 *	    transitions out of VREF_TERMINATE do not have
		 *	    to mess with cachedvnodes.
		 */
		if (count & VREF_FINALIZE) {
			vx_lock(vp);
			if (atomic_fcmpset_int(&vp->v_refcnt,
					       &count, VREF_TERMINATE)) {
				vnode_terminate(vp);
				break;
			}
			vx_unlock(vp);
		} else {
			if (atomic_fcmpset_int(&vp->v_refcnt, &count, 0)) {
				atomic_add_int(&mycpu->gd_cachedvnodes, 1);
				break;
			}
		}
		cpu_pause();
		/* retry */
	}
#else
	/*
	 * XXX NOT YET WORKING!  Multiple threads can reference the vnode
	 * after dropping their count, racing destruction, because this
	 * code is not directly transitioning from 1->VREF_FINALIZE.
	 */
	/*
	 * Drop the ref-count.  On the 1->0 transition we check VREF_FINALIZE
	 * and attempt to acquire VREF_TERMINATE if set.  It is possible for
	 * concurrent vref/vrele to race and bounce 0->1, 1->0, etc, but
	 * only one will be able to transition the vnode into the
	 * VREF_TERMINATE state.
	 *
	 * NOTE: VREF_TERMINATE is *in* VREF_MASK, so the vnode may only enter
	 *	 this state once.
	 */
	count = atomic_fetchadd_int(&vp->v_refcnt, -1);
	if ((count & VREF_MASK) == 1) {
		atomic_add_int(&mycpu->gd_cachedvnodes, 1);
		--count;
		while ((count & (VREF_MASK | VREF_FINALIZE)) == VREF_FINALIZE) {
			vx_lock(vp);
			if (atomic_fcmpset_int(&vp->v_refcnt,
					       &count, VREF_TERMINATE)) {
				atomic_add_int(&mycpu->gd_cachedvnodes, -1);
				vnode_terminate(vp);
				break;
			}
			vx_unlock(vp);
		}
	}
#endif
}

/*
 * Add an auxiliary data structure reference to the vnode.  Auxiliary
 * references do not change the state of the vnode or prevent deactivation
 * or reclamation of the vnode, but will prevent the vnode from being
 * destroyed (kfree()'d).
 *
 * WARNING!  vhold() must not acquire v_spin.  The spinlock may or may not
 *	     already be held by the caller.  vdrop() will clean up the
 *	     free list state.
 */
void
vhold(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, 1);
}

/*
 * Remove an auxiliary reference from the vnode.
 */
void
vdrop(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, -1);
}
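
/*
 * Illustrative sketch (not part of the original source): the intended
 * pairing of the reference primitives above.  A long-lived structure
 * that only needs to keep the vnode from being kfree()'d, but does not
 * care whether it is deactivated or reclaimed, takes an auxiliary
 * reference; code that needs the vnode to remain usable takes a real
 * reference instead.  The structure and field names are hypothetical.
 */
#if 0
struct example_tracker {
	struct vnode *t_vp;		/* hypothetical back-pointer */
};

static void
example_track_vnode(struct example_tracker *t, struct vnode *vp)
{
	vhold(vp);			/* prevents kfree(), not reclaim */
	t->t_vp = vp;
}

static void
example_untrack_vnode(struct example_tracker *t)
{
	vdrop(t->t_vp);			/* matching auxiliary release */
	t->t_vp = NULL;
}
#endif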
/*
 * This function is called on the 1->0 transition (which is actually
 * 1->VREF_TERMINATE) when VREF_FINALIZE is set, forcing deactivation
 * of the vnode.
 *
 * Additional vrefs are allowed to race but will not result in a reentrant
 * call to vnode_terminate() due to refcnt being VREF_TERMINATE.  This
 * prevents additional 1->0 transitions.
 *
 * ONLY A VGET() CAN REACTIVATE THE VNODE.
 *
 * Caller must hold the VX lock.
 *
 * NOTE: v_mount may be NULL due to assignment to dead_vnode_vops
 *
 * NOTE: The vnode may be marked inactive with dirty buffers
 *	 or dirty pages in its cached VM object still present.
 *
 * NOTE: VS_FREE should not be set on entry (the vnode was expected to
 *	 previously be active).  We lose control of the vnode the instant
 *	 it is placed on the free list.
 *
 *	 The VX lock is required when transitioning to VS_CACHED but is
 *	 not sufficient for the vshouldfree() interlocked test or when
 *	 transitioning away from VS_CACHED.  v_spin is also required for
 *	 those cases.
 */
static
void
vnode_terminate(struct vnode *vp)
{
	KKASSERT(vp->v_state == VS_ACTIVE);

	if ((vp->v_flag & VINACTIVE) == 0) {
		_vsetflags(vp, VINACTIVE);
		if (vp->v_mount)
			VOP_INACTIVE(vp);
	}
	spin_lock(&vp->v_spin);
	_vinactive(vp);
	spin_unlock(&vp->v_spin);

	vx_unlock(vp);
}

/****************************************************************
 *			VX LOCKING FUNCTIONS			*
 ****************************************************************
 *
 * These functions lock vnodes for reclamation and deactivation related
 * activities.  The caller must already be holding some sort of reference
 * on the vnode.
 */
void
vx_lock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

void
vx_unlock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_RELEASE);
}
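
/*
 * Illustrative sketch (not part of the original source): the typical
 * shape of reclamation-related code built on the VX primitives above,
 * mirroring what cleanfreevnode() does further down.  The function name
 * is hypothetical; vx_get_nonblock(), vgone_vxlocked() and vx_put() are
 * the real interfaces.
 */
#if 0
static int
example_try_reclaim(struct vnode *vp)
{
	/* ref + exclusive VX lock, or fail immediately */
	if (vx_get_nonblock(vp) != 0)
		return EBUSY;
	if ((vp->v_flag & VRECLAIMED) == 0)
		vgone_vxlocked(vp);	/* vnode remains VX locked */
	vx_put(vp);			/* drop lock and ref */
	return 0;
}
#endif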
/****************************************************************
 *		    VNODE ACQUISITION FUNCTIONS			*
 ****************************************************************
 *
 * These functions must be used when accessing a vnode that has no
 * chance of being destroyed in a SMP race.  That means the caller will
 * usually either hold an auxiliary reference (such as the namecache)
 * or hold some other lock that ensures that the vnode cannot be destroyed.
 *
 * These functions are MANDATORY for any code chain accessing a vnode
 * whose activation state is not known.
 *
 * vget() can be called with LK_NOWAIT and will return EBUSY if the
 * lock cannot be immediately acquired.
 *
 * vget()/vput() are used when reactivation is desired.
 *
 * vx_get() and vx_put() are used when reactivation is not desired.
 */
int
vget(struct vnode *vp, int flags)
{
	int error;

	/*
	 * A lock type must be passed
	 */
	if ((flags & LK_TYPE_MASK) == 0) {
		panic("vget() called with no lock specified!");
		/* NOT REACHED */
	}

	/*
	 * Reference the structure and then acquire the lock.
	 *
	 * NOTE: The requested lock might be a shared lock and does
	 *	 not protect our access to the refcnt or other fields.
	 */
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);

	if ((error = vn_lock(vp, flags | LK_FAILRECLAIM)) != 0) {
		/*
		 * The lock failed, undo and return an error.  This will not
		 * normally trigger a termination.
		 */
		vrele(vp);
	} else if (vp->v_flag & VRECLAIMED) {
		/*
		 * The node is being reclaimed and cannot be reactivated
		 * any more, undo and return ENOENT.
		 */
		vn_unlock(vp);
		vrele(vp);
		error = ENOENT;
	} else if (vp->v_state == VS_ACTIVE) {
		/*
		 * A VS_ACTIVE vnode coupled with the fact that we have
		 * a vnode lock (even if shared) prevents v_state from
		 * changing.  Since the vnode is not in a VRECLAIMED state,
		 * we can safely clear VINACTIVE.
		 *
		 * It is possible for a shared lock to cause a race with
		 * another thread that is also in the process of clearing
		 * VREF_TERMINATE, meaning that we might return with it still
		 * set and then assert in a later vref().  The solution is to
		 * unconditionally clear VREF_TERMINATE here as well.
		 *
		 * NOTE! Multiple threads may clear VINACTIVE if this is
		 *	 a shared lock.  This race is allowed.
		 */
		if (vp->v_flag & VINACTIVE)
			_vclrflags(vp, VINACTIVE);	/* SMP race ok */
		if (vp->v_act < VACT_MAX) {
			vp->v_act += VACT_INC;
			if (vp->v_act > VACT_MAX)	/* SMP race ok */
				vp->v_act = VACT_MAX;
		}
		error = 0;
		if (vp->v_refcnt & VREF_TERMINATE)	/* SMP race ok */
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE);
	} else {
		/*
		 * If the vnode is not VS_ACTIVE it must be reactivated
		 * in addition to clearing VINACTIVE.  An exclusive spin_lock
		 * is needed to manipulate the vnode's list.
		 *
		 * Because the lockmgr lock might be shared, we might race
		 * another reactivation, which we handle.  In this situation,
		 * however, the refcnt prevents other v_state races.
		 *
		 * As with above, clearing VINACTIVE is allowed to race other
		 * clearings of VINACTIVE.
		 *
		 * VREF_TERMINATE and VREF_FINALIZE can only be cleared when
		 * the refcnt is non-zero and the vnode has not been
		 * reclaimed.  This also means that the transitions do
		 * not affect cachedvnodes.
		 *
		 * It is possible for a shared lock to cause a race with
		 * another thread that is also in the process of clearing
		 * VREF_TERMINATE, meaning that we might return with it still
		 * set and then assert in a later vref().  The solution is to
		 * unconditionally clear VREF_TERMINATE here as well.
		 */
		_vclrflags(vp, VINACTIVE);
		vp->v_act += VACT_INC;
		if (vp->v_act > VACT_MAX)	/* SMP race ok */
			vp->v_act = VACT_MAX;
		spin_lock(&vp->v_spin);

		switch(vp->v_state) {
		case VS_INACTIVE:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_CACHED:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_ACTIVE:
			atomic_clear_int(&vp->v_refcnt, VREF_FINALIZE |
							VREF_TERMINATE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_DYING:
			spin_unlock(&vp->v_spin);
			panic("Impossible VS_DYING state");
			break;
		}
		error = 0;
	}
	return(error);
}

#ifdef DEBUG_VPUT

void
debug_vput(struct vnode *vp, const char *filename, int line)
{
	kprintf("vput(%p) %s:%d\n", vp, filename, line);
	vn_unlock(vp);
	vrele(vp);
}

#else

void
vput(struct vnode *vp)
{
	vn_unlock(vp);
	vrele(vp);
}

#endif
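
/*
 * Illustrative sketch (not part of the original source): the usual
 * vget()/vput() pattern for a caller holding only a namecache or other
 * auxiliary reference.  The function name and the work performed while
 * the vnode is locked are hypothetical; the error handling mirrors the
 * cases vget() actually returns.
 */
#if 0
static int
example_use_vnode(struct vnode *vp)
{
	int error;

	error = vget(vp, LK_SHARED);
	if (error)
		return error;	/* e.g. ENOENT if the vnode was reclaimed */

	/* ... operate on the now-active, locked, referenced vnode ... */

	vput(vp);		/* unlock + vrele in one call */
	return 0;
}
#endif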
/*
 * Acquire the vnode lock unguarded.
 *
 * The non-blocking version also uses a slightly different mechanic.
 * This function will explicitly fail not only if it cannot acquire
 * the lock normally, but also if the caller already holds a lock.
 *
 * The adjusted mechanic is used to close a loophole where complex
 * VOP_RECLAIM code can circle around recursively and allocate the
 * same vnode it is trying to destroy from the freelist.
 *
 * Any filesystem (aka UFS) which puts LK_CANRECURSE in lk_flags can
 * cause the incorrect behavior to occur.  If not for that, lockmgr()
 * would do the right thing.
 *
 * XXX The vx_*() locks should use auxrefs, not the main reference counter.
 */
void
vx_get(struct vnode *vp)
{
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

int
vx_get_nonblock(struct vnode *vp)
{
	int error;

	if (lockinuse(&vp->v_lock))
		return(EBUSY);
	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
	if (error == 0) {
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	}
	return(error);
}

/*
 * Release a VX lock that also held a ref on the vnode.  vrele() will handle
 * any needed state transitions.
 *
 * However, filesystems use this function to get rid of unwanted new vnodes
 * so try to get the vnode on the correct queue in that case.
 */
void
vx_put(struct vnode *vp)
{
	if (vp->v_type == VNON || vp->v_type == VBAD)
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
	lockmgr(&vp->v_lock, LK_RELEASE);
	vrele(vp);
}

/*
 * Try to reuse a vnode from the free list.  This function is somewhat
 * advisory in that NULL can be returned as a normal case, even if free
 * vnodes are present.
 *
 * The scan is limited because it can result in excessive CPU use during
 * periods of extreme vnode use.
 *
 * NOTE: The returned vnode is not completely initialized.
 */
static
struct vnode *
cleanfreevnode(int maxcount)
{
	struct vnode_index *vi;
	struct vnode *vp;
	int count;
	int trigger = (long)vmstats.v_page_count / (activevnodes * 2 + 1);
	int ri;
	int cpu_count;

	/*
	 * Try to deactivate some vnodes cached on the active list.
	 */
	if (countcachedvnodes() < inactivevnodes)
		goto skip;

	ri = vnode_list_hash[mycpu->gd_cpuid].deac_rover + 1;

	for (count = 0; count < maxcount * 2; ++count, ++ri) {
		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

		spin_lock(&vi->spin);

		vp = TAILQ_NEXT(&vi->active_rover, v_list);
		TAILQ_REMOVE(&vi->active_list, &vi->active_rover, v_list);
		if (vp == NULL) {
			TAILQ_INSERT_HEAD(&vi->active_list,
					  &vi->active_rover, v_list);
		} else {
			TAILQ_INSERT_AFTER(&vi->active_list, vp,
					   &vi->active_rover, v_list);
		}
		if (vp == NULL) {
			spin_unlock(&vi->spin);
			continue;
		}
		if ((vp->v_refcnt & VREF_MASK) != 0) {
			spin_unlock(&vi->spin);
			vp->v_act += VACT_INC;
			if (vp->v_act > VACT_MAX)	/* SMP race ok */
				vp->v_act = VACT_MAX;
			continue;
		}

		/*
		 * decrement by less if the vnode's object has a lot of
		 * VM pages.  XXX possible SMP races.
		 */
		if (vp->v_act > 0) {
			vm_object_t obj;
			if ((obj = vp->v_object) != NULL &&
			    obj->resident_page_count >= trigger) {
				vp->v_act -= 1;
			} else {
				vp->v_act -= VACT_INC;
			}
			if (vp->v_act < 0)
				vp->v_act = 0;
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * Try to deactivate the vnode.
		 */
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);

		spin_unlock(&vi->spin);
		vrele(vp);
	}

	vnode_list_hash[mycpu->gd_cpuid].deac_rover = ri;

skip:
	/*
	 * Loop trying to lock the first vnode on the free list.
	 * Cycle if we can't.
	 */
	cpu_count = ncpus;
	ri = vnode_list_hash[mycpu->gd_cpuid].free_rover + 1;

	for (count = 0; count < maxcount; ++count, ++ri) {
		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

		spin_lock(&vi->spin);

		vp = TAILQ_FIRST(&vi->inactive_list);
		if (vp == NULL) {
			spin_unlock(&vi->spin);
			if (--cpu_count == 0)
				break;
			ri = (ri + 16) & ~15;
			--ri;
			continue;
		}

		/*
		 * non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * Because we are holding vfs_spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vfs_spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
		spin_unlock(&vi->spin);
#ifdef TRACKVNODE
		if ((u_long)vp == trackvnode)
			kprintf("cleanfreevnode %p %08x\n", vp, vp->v_flag);
#endif

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other hold users.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
		 */
		if (vp->v_auxrefs != vp->v_namecache_count ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
failed:
			if (vp->v_state == VS_INACTIVE) {
				spin_lock(&vi->spin);
				if (vp->v_state == VS_INACTIVE) {
					TAILQ_REMOVE(&vi->inactive_list,
						     vp, v_list);
					TAILQ_INSERT_TAIL(&vi->inactive_list,
							  vp, v_list);
				}
				spin_unlock(&vi->spin);
			}
			vx_put(vp);
			continue;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 *
		 * The cache_inval_vp() can fail if any of the namecache
		 * elements are actively locked, preventing the vnode from
		 * being reclaimed.  This is the desired operation as it
		 * gives the namecache code certain guarantees just by
		 * holding a ncp.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			if (cache_inval_vp_nonblock(vp))
				goto failed;
			vgone_vxlocked(vp);
			/* vnode is still VX locked */
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		spin_lock(&vi->spin);
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			spin_unlock(&vi->spin);
			goto failed;
		}
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		vp->v_state = VS_DYING;
		spin_unlock(&vi->spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));

		/*
		 * Return a VX locked vnode suitable for reuse.
		 */
		vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
		return(vp);
	}
	vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
	return(NULL);
}

/*
 * Obtain a new vnode.  The returned vnode is VX locked & vrefd.
 *
 * All new vnodes set the VAGE flags.  An open() of the vnode will
 * decrement the (2-bit) flags.  Vnodes which are opened several times
 * are thus retained in the cache over vnodes which are merely stat()d.
 *
 * We attempt to reuse an already-recycled vnode from our pcpu inactive
 * queue first, and allocate otherwise.  Attempting to recycle inactive
 * vnodes here can lead to numerous deadlocks, particularly with
 * softupdates.
 */
struct vnode *
allocvnode(int lktimeout, int lkflags)
{
	struct vnode *vp;
	struct vnode_index *vi;

	/*
	 * lktimeout only applies when LK_TIMELOCK is used, and only
	 * the pageout daemon uses it.  The timeout may not be zero
	 * or the pageout daemon can deadlock in low-VM situations.
	 */
	if (lktimeout == 0)
		lktimeout = hz / 10;

	/*
	 * Do not flag for synchronous recyclement unless there are enough
	 * freeable vnodes to recycle and the number of vnodes has
	 * significantly exceeded our target.  We want the normal vnlru
	 * process to handle the cleaning (at 9/10's) before we are forced
	 * to flag it here at 11/10's for userexit path processing.
	 */
	if (numvnodes >= maxvnodes * 11 / 10 &&
	    cachedvnodes + inactivevnodes >= maxvnodes * 5 / 10) {
		struct thread *td = curthread;
		if (td->td_lwp)
			atomic_set_int(&td->td_lwp->lwp_mpflags, LWP_MP_VNLRU);
	}

	/*
	 * Try to trivially reuse a reclaimed vnode from the head of the
	 * inactive list for this cpu.  Any vnode cycling which occurs
	 * which terminates the vnode will cause it to be returned to the
	 * same pcpu structure (e.g. unlink calls).
	 */
	vi = &vnode_list_hash[mycpuid];
	spin_lock(&vi->spin);

	vp = TAILQ_FIRST(&vi->inactive_list);
	if (vp && (vp->v_flag & VRECLAIMED)) {
		/*
		 * non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
			spin_unlock(&vi->spin);
			goto slower;
		}

		/*
		 * Because we are holding vfs_spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vfs_spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
#ifdef TRACKVNODE
		if ((u_long)vp == trackvnode)
			kprintf("allocvnode %p %08x\n", vp, vp->v_flag);
#endif

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other hold users.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
		 */
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			if (vp->v_state == VS_INACTIVE) {
				TAILQ_REMOVE(&vi->inactive_list,
					     vp, v_list);
				TAILQ_INSERT_TAIL(&vi->inactive_list,
						  vp, v_list);
			}
			spin_unlock(&vi->spin);
			vx_put(vp);
			goto slower;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			spin_unlock(&vi->spin);
			vx_put(vp);
			goto slower;
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		vp->v_state = VS_DYING;
		spin_unlock(&vi->spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
		 *
		 * At this point we can kfree() the vnode if we want to.
		 * Instead, we reuse it for the allocation.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));
		bzero(vp, sizeof(*vp));
	} else {
		spin_unlock(&vi->spin);
slower:
		vp = kmalloc(sizeof(*vp), M_VNODE, M_ZERO | M_WAITOK);
		atomic_add_int(&numvnodes, 1);
	}

	lwkt_token_init(&vp->v_token, "vnode");
	lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
	TAILQ_INIT(&vp->v_namecache);
	RB_INIT(&vp->v_rbclean_tree);
	RB_INIT(&vp->v_rbdirty_tree);
	RB_INIT(&vp->v_rbhash_tree);
	spin_init(&vp->v_spin, "allocvnode");

	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
	vp->v_refcnt = 1;
	vp->v_flag = VAGE0 | VAGE1;
	vp->v_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;

	KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
	/* exclusive lock still held */

	vp->v_filesize = NOOFFSET;
	vp->v_type = VNON;
	vp->v_tag = 0;
	vp->v_state = VS_CACHED;
	_vactivate(vp);

	return (vp);
}

/*
 * Called after a process has allocated a vnode via allocvnode()
 * and we detected that too many vnodes were present.
 *
 * This function is called just prior to a return to userland if the
 * process at some point had to allocate a new vnode during the last
 * system call and the vnode count was found to be excessive.
 *
 * This is a synchronous path that we do not normally want to execute.
 *
 * Flagged at >= 11/10's, runs if >= 10/10, vnlru runs at 9/10.
 *
 * WARNING: Sometimes numvnodes can blow out due to children being
 *	    present under directory vnodes in the namecache.  For the
 *	    moment use an if() instead of a while() and note that if
 *	    we were to use a while() we would still have to break out
 *	    if freesomevnodes() returned 0.  vnlru will also be trying
 *	    hard to free vnodes at the same time (with a lower trigger
 *	    pointer).
 */
void
allocvnode_gc(void)
{
	if (numvnodes >= maxvnodes &&
	    countcachedandinactivevnodes() >= maxvnodes * 5 / 10) {
		freesomevnodes(batchfreevnodes);
	}
}

int
freesomevnodes(int n)
{
	struct vnode *vp;
	int count = 0;

	while (n) {
		if ((vp = cleanfreevnode(n)) == NULL)
			break;
		vx_unlock(vp);
		--n;
		++count;
		kfree(vp, M_VNODE);
		atomic_add_int(&numvnodes, -1);
	}
	return(count);
}
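
/*
 * Illustrative sketch (not part of the original source): how a
 * filesystem-facing caller might consume allocvnode().  The returned
 * vnode arrives VX locked and referenced; if setup fails the caller
 * marks it VBAD and uses vx_put(), which sets VREF_FINALIZE so the
 * vnode lands on the correct queue.  The setup hook shown here is
 * hypothetical.
 */
#if 0
static int
example_new_vnode(int (*setup)(struct vnode *), struct vnode **vpp)
{
	struct vnode *vp;
	int error;

	vp = allocvnode(0, 0);		/* VX locked + vref'd */
	error = setup(vp);		/* hypothetical fs-specific init */
	if (error) {
		vp->v_type = VBAD;	/* tell vx_put() to finalize */
		vx_put(vp);
		*vpp = NULL;
		return error;
	}
	*vpp = vp;			/* returned locked and referenced */
	return 0;
}
#endif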