/*	$NetBSD: vfs_vnode.c,v 1.56 2016/08/20 12:37:08 hannken Exp $	*/

/*-
 * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * The vnode cache subsystem.
 *
 * Life-cycle
 *
 *	Normally, there are two points where new vnodes are created:
 *	VOP_CREATE(9) and VOP_LOOKUP(9).  The life-cycle of a vnode
 *	starts in one of the following ways:
 *
 *	- Allocation, via vcache_get(9) or vcache_new(9).
 *	- Reclamation of an inactive vnode, via vget(9).
 *
 *	Recycling from a free list, via getnewvnode(9) -> getcleanvnode(9),
 *	was another, traditional way.  Currently, only the draining thread
 *	recycles vnodes.  This behaviour might be revisited.
 *
 *	The life-cycle ends when the last reference is dropped, usually
 *	in VOP_REMOVE(9).  In that case, VOP_INACTIVE(9) is called to inform
 *	the file system that the vnode is inactive.  Via this call, the file
 *	system indicates whether the vnode can be recycled (usually by
 *	checking its own references, e.g. the link count, or whether the
 *	file was removed).
 *
 *	Depending on that indication, the vnode can be put onto a free list
 *	(cache), or cleaned via vcache_reclaim(), which calls VOP_RECLAIM(9)
 *	to disassociate the underlying file system from the vnode, and is
 *	finally destroyed.
 *
 * Vnode state
 *
 *	A vnode is always in one of six states:
 *	- MARKER	This is a marker vnode to help list traversal.  It
 *			will never change its state.
 *	- LOADING	Vnode is associating with the underlying file system
 *			and is not yet ready to use.
 *	- ACTIVE	Vnode has an associated underlying file system and is
 *			ready to use.
 *	- BLOCKED	Vnode is active but cannot get new references.
 *	- RECLAIMING	Vnode is disassociating from the underlying file
 *			system.
 *	- RECLAIMED	Vnode has disassociated from the underlying file
 *			system and is dead.
 *
 *	Valid state changes are:
 *	LOADING -> ACTIVE
 *			Vnode has been initialised in vcache_get() or
 *			vcache_new() and is ready to use.
 *	ACTIVE -> RECLAIMING
 *			Vnode starts disassociation from the underlying file
 *			system in vcache_reclaim().
 *	RECLAIMING -> RECLAIMED
 *			Vnode finished disassociation from the underlying
 *			file system in vcache_reclaim().
 *	ACTIVE -> BLOCKED
 *			Either vcache_rekey*() is changing the vnode key or
 *			vrelel() is about to call VOP_INACTIVE().
 *	BLOCKED -> ACTIVE
 *			The block condition is over.
 *	LOADING -> RECLAIMED
 *			Either vcache_get() or vcache_new() failed to
 *			associate the underlying file system, or
 *			vcache_rekey*() drops a vnode used as a placeholder.
 *
 *	Of these states LOADING, BLOCKED and RECLAIMING are intermediate,
 *	and it is possible to wait for a state change.
 *
 *	State is protected with v_interlock, with one exception:
 *	to change away from LOADING both v_interlock and vcache.lock must be
 *	held, so it is possible to check "state == LOADING" while holding
 *	only vcache.lock.  See vcache_get() for details.
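 *
 *	As a rough illustration of the machinery described above (a sketch,
 *	not lifted from any particular caller; mp, key and vp are
 *	placeholders), the common consumer pattern is:
 *
 *		error = vcache_get(mp, &key, key_len, &vp);
 *		if (error == 0) {
 *			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *			... VOP_*() calls on the referenced, locked vnode ...
 *			vput(vp);	releases the lock and the reference
 *		}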
 *
 * Reference counting
 *
 *	A vnode is considered active if its reference count
 *	(vnode_t::v_usecount) is non-zero.  The count is maintained using
 *	the vref(9) and vrele(9) routines, as well as vput(9).  Typical
 *	holders of references are e.g. open files, the current working
 *	directory, mount points, etc.
 *
 * Note on v_usecount and its locking
 *
 *	At nearly all points where it is known that v_usecount could be
 *	zero, vnode_t::v_interlock will be held.  To change v_usecount away
 *	from zero, the interlock must be held.  To change from a non-zero
 *	value to zero, again the interlock must be held.
 *
 *	Changing the usecount from a non-zero value to a non-zero value can
 *	safely be done using atomic operations, without the interlock held.
 *
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.56 2016/08/20 12:37:08 hannken Exp $");

#define _VFS_VNODE_PRIVATE

#include <sys/param.h>
#include <sys/kernel.h>

#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/hash.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/wapbl.h>
#include <sys/fstrans.h>

#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>

/* Flags to vrelel. */
#define	VRELEL_ASYNC_RELE	0x0001	/* Always defer to vrele thread. */

enum vcache_state {
	VN_MARKER,	/* Stable, used as marker.  Will not change. */
	VN_LOADING,	/* Intermediate, initialising the fs node. */
	VN_ACTIVE,	/* Stable, valid fs node attached. */
	VN_BLOCKED,	/* Intermediate, active, no new references allowed. */
	VN_RECLAIMING,	/* Intermediate, detaching the fs node. */
	VN_RECLAIMED	/* Stable, no fs node attached. */
};
struct vcache_key {
	struct mount *vk_mount;
	const void *vk_key;
	size_t vk_key_len;
};
struct vcache_node {
	struct vnode vn_vnode;
	enum vcache_state vn_state;
	SLIST_ENTRY(vcache_node) vn_hash;
	struct vcache_key vn_key;
};

#define	VN_TO_VP(node)	((vnode_t *)(node))
#define	VP_TO_VN(vp)	((struct vcache_node *)(vp))

u_int	numvnodes	__cacheline_aligned;

/*
 * There are two free lists: one is for vnodes which have no buffer/page
 * references and one for those which do (i.e. v_holdcnt is non-zero).
 * The vnode recycling mechanism first looks at the former list.
219 */ 220 static kmutex_t vnode_free_list_lock __cacheline_aligned; 221 static vnodelst_t vnode_free_list __cacheline_aligned; 222 static vnodelst_t vnode_hold_list __cacheline_aligned; 223 static kcondvar_t vdrain_cv __cacheline_aligned; 224 225 static vnodelst_t vrele_list __cacheline_aligned; 226 static kmutex_t vrele_lock __cacheline_aligned; 227 static kcondvar_t vrele_cv __cacheline_aligned; 228 static lwp_t * vrele_lwp __cacheline_aligned; 229 static int vrele_pending __cacheline_aligned; 230 static int vrele_gen __cacheline_aligned; 231 232 SLIST_HEAD(hashhead, vcache_node); 233 static struct { 234 kmutex_t lock; 235 kcondvar_t cv; 236 u_long hashmask; 237 struct hashhead *hashtab; 238 pool_cache_t pool; 239 } vcache __cacheline_aligned; 240 241 static int cleanvnode(void); 242 static struct vcache_node *vcache_alloc(void); 243 static void vcache_free(struct vcache_node *); 244 static void vcache_init(void); 245 static void vcache_reinit(void); 246 static void vcache_reclaim(vnode_t *); 247 static void vrelel(vnode_t *, int); 248 static void vdrain_thread(void *); 249 static void vrele_thread(void *); 250 static void vnpanic(vnode_t *, const char *, ...) 251 __printflike(2, 3); 252 253 /* Routines having to do with the management of the vnode table. */ 254 extern struct mount *dead_rootmount; 255 extern int (**dead_vnodeop_p)(void *); 256 extern struct vfsops dead_vfsops; 257 258 /* Vnode state operations and diagnostics. */ 259 260 static const char * 261 vstate_name(enum vcache_state state) 262 { 263 264 switch (state) { 265 case VN_MARKER: 266 return "MARKER"; 267 case VN_LOADING: 268 return "LOADING"; 269 case VN_ACTIVE: 270 return "ACTIVE"; 271 case VN_BLOCKED: 272 return "BLOCKED"; 273 case VN_RECLAIMING: 274 return "RECLAIMING"; 275 case VN_RECLAIMED: 276 return "RECLAIMED"; 277 default: 278 return "ILLEGAL"; 279 } 280 } 281 282 #if defined(DIAGNOSTIC) 283 284 #define VSTATE_GET(vp) \ 285 vstate_assert_get((vp), __func__, __LINE__) 286 #define VSTATE_CHANGE(vp, from, to) \ 287 vstate_assert_change((vp), (from), (to), __func__, __LINE__) 288 #define VSTATE_WAIT_STABLE(vp) \ 289 vstate_assert_wait_stable((vp), __func__, __LINE__) 290 #define VSTATE_ASSERT(vp, state) \ 291 vstate_assert((vp), (state), __func__, __LINE__) 292 293 static void 294 vstate_assert(vnode_t *vp, enum vcache_state state, const char *func, int line) 295 { 296 struct vcache_node *node = VP_TO_VN(vp); 297 298 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line); 299 300 if (__predict_true(node->vn_state == state)) 301 return; 302 vnpanic(vp, "state is %s, expected %s at %s:%d", 303 vstate_name(node->vn_state), vstate_name(state), func, line); 304 } 305 306 static enum vcache_state 307 vstate_assert_get(vnode_t *vp, const char *func, int line) 308 { 309 struct vcache_node *node = VP_TO_VN(vp); 310 311 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line); 312 if (node->vn_state == VN_MARKER) 313 vnpanic(vp, "state is %s at %s:%d", 314 vstate_name(node->vn_state), func, line); 315 316 return node->vn_state; 317 } 318 319 static void 320 vstate_assert_wait_stable(vnode_t *vp, const char *func, int line) 321 { 322 struct vcache_node *node = VP_TO_VN(vp); 323 324 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line); 325 if (node->vn_state == VN_MARKER) 326 vnpanic(vp, "state is %s at %s:%d", 327 vstate_name(node->vn_state), func, line); 328 329 while (node->vn_state != VN_ACTIVE && node->vn_state != VN_RECLAIMED) 330 cv_wait(&vp->v_cv, vp->v_interlock); 331 332 if 
(node->vn_state == VN_MARKER) 333 vnpanic(vp, "state is %s at %s:%d", 334 vstate_name(node->vn_state), func, line); 335 } 336 337 static void 338 vstate_assert_change(vnode_t *vp, enum vcache_state from, enum vcache_state to, 339 const char *func, int line) 340 { 341 struct vcache_node *node = VP_TO_VN(vp); 342 343 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line); 344 if (from == VN_LOADING) 345 KASSERTMSG(mutex_owned(&vcache.lock), "at %s:%d", func, line); 346 347 if (from == VN_MARKER) 348 vnpanic(vp, "from is %s at %s:%d", 349 vstate_name(from), func, line); 350 if (to == VN_MARKER) 351 vnpanic(vp, "to is %s at %s:%d", 352 vstate_name(to), func, line); 353 if (node->vn_state != from) 354 vnpanic(vp, "from is %s, expected %s at %s:%d\n", 355 vstate_name(node->vn_state), vstate_name(from), func, line); 356 357 node->vn_state = to; 358 if (from == VN_LOADING) 359 cv_broadcast(&vcache.cv); 360 if (to == VN_ACTIVE || to == VN_RECLAIMED) 361 cv_broadcast(&vp->v_cv); 362 } 363 364 #else /* defined(DIAGNOSTIC) */ 365 366 #define VSTATE_GET(vp) \ 367 (VP_TO_VN((vp))->vn_state) 368 #define VSTATE_CHANGE(vp, from, to) \ 369 vstate_change((vp), (from), (to)) 370 #define VSTATE_WAIT_STABLE(vp) \ 371 vstate_wait_stable((vp)) 372 #define VSTATE_ASSERT(vp, state) 373 374 static void 375 vstate_wait_stable(vnode_t *vp) 376 { 377 struct vcache_node *node = VP_TO_VN(vp); 378 379 while (node->vn_state != VN_ACTIVE && node->vn_state != VN_RECLAIMED) 380 cv_wait(&vp->v_cv, vp->v_interlock); 381 } 382 383 static void 384 vstate_change(vnode_t *vp, enum vcache_state from, enum vcache_state to) 385 { 386 struct vcache_node *node = VP_TO_VN(vp); 387 388 node->vn_state = to; 389 if (from == VN_LOADING) 390 cv_broadcast(&vcache.cv); 391 if (to == VN_ACTIVE || to == VN_RECLAIMED) 392 cv_broadcast(&vp->v_cv); 393 } 394 395 #endif /* defined(DIAGNOSTIC) */ 396 397 void 398 vfs_vnode_sysinit(void) 399 { 400 int error __diagused; 401 402 dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL); 403 KASSERT(dead_rootmount != NULL); 404 dead_rootmount->mnt_iflag = IMNT_MPSAFE; 405 406 mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE); 407 TAILQ_INIT(&vnode_free_list); 408 TAILQ_INIT(&vnode_hold_list); 409 TAILQ_INIT(&vrele_list); 410 411 vcache_init(); 412 413 mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE); 414 cv_init(&vdrain_cv, "vdrain"); 415 cv_init(&vrele_cv, "vrele"); 416 error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread, 417 NULL, NULL, "vdrain"); 418 KASSERTMSG((error == 0), "kthread_create(vdrain) failed: %d", error); 419 error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread, 420 NULL, &vrele_lwp, "vrele"); 421 KASSERTMSG((error == 0), "kthread_create(vrele) failed: %d", error); 422 } 423 424 /* 425 * Allocate a new marker vnode. 426 */ 427 vnode_t * 428 vnalloc_marker(struct mount *mp) 429 { 430 struct vcache_node *node; 431 vnode_t *vp; 432 433 node = pool_cache_get(vcache.pool, PR_WAITOK); 434 memset(node, 0, sizeof(*node)); 435 vp = VN_TO_VP(node); 436 uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0); 437 vp->v_mount = mp; 438 vp->v_type = VBAD; 439 node->vn_state = VN_MARKER; 440 441 return vp; 442 } 443 444 /* 445 * Free a marker vnode. 446 */ 447 void 448 vnfree_marker(vnode_t *vp) 449 { 450 struct vcache_node *node; 451 452 node = VP_TO_VN(vp); 453 KASSERT(node->vn_state == VN_MARKER); 454 uvm_obj_destroy(&vp->v_uobj, true); 455 pool_cache_put(vcache.pool, node); 456 } 457 458 /* 459 * Test a vnode for being a marker vnode. 
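 * Marker vnodes are used by list iterators (for example the per-mount
 * vnode iterator) to record a position in a vnode list across unlock and
 * relock; they never carry a file system node.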
460 */ 461 bool 462 vnis_marker(vnode_t *vp) 463 { 464 465 return (VP_TO_VN(vp)->vn_state == VN_MARKER); 466 } 467 468 /* 469 * cleanvnode: grab a vnode from freelist, clean and free it. 470 * 471 * => Releases vnode_free_list_lock. 472 */ 473 static int 474 cleanvnode(void) 475 { 476 vnode_t *vp; 477 vnodelst_t *listhd; 478 struct mount *mp; 479 480 KASSERT(mutex_owned(&vnode_free_list_lock)); 481 482 listhd = &vnode_free_list; 483 try_nextlist: 484 TAILQ_FOREACH(vp, listhd, v_freelist) { 485 /* 486 * It's safe to test v_usecount and v_iflag 487 * without holding the interlock here, since 488 * these vnodes should never appear on the 489 * lists. 490 */ 491 KASSERT(vp->v_usecount == 0); 492 KASSERT(vp->v_freelisthd == listhd); 493 494 if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) 495 continue; 496 if (!mutex_tryenter(vp->v_interlock)) { 497 VOP_UNLOCK(vp); 498 continue; 499 } 500 mp = vp->v_mount; 501 if (fstrans_start_nowait(mp, FSTRANS_SHARED) != 0) { 502 mutex_exit(vp->v_interlock); 503 VOP_UNLOCK(vp); 504 continue; 505 } 506 break; 507 } 508 509 if (vp == NULL) { 510 if (listhd == &vnode_free_list) { 511 listhd = &vnode_hold_list; 512 goto try_nextlist; 513 } 514 mutex_exit(&vnode_free_list_lock); 515 return EBUSY; 516 } 517 518 /* Remove it from the freelist. */ 519 TAILQ_REMOVE(listhd, vp, v_freelist); 520 vp->v_freelisthd = NULL; 521 mutex_exit(&vnode_free_list_lock); 522 523 KASSERT(vp->v_usecount == 0); 524 525 /* 526 * The vnode is still associated with a file system, so we must 527 * clean it out before freeing it. We need to add a reference 528 * before doing this. 529 */ 530 vp->v_usecount = 1; 531 vcache_reclaim(vp); 532 vrelel(vp, 0); 533 fstrans_done(mp); 534 535 return 0; 536 } 537 538 /* 539 * Helper thread to keep the number of vnodes below desiredvnodes. 540 */ 541 static void 542 vdrain_thread(void *cookie) 543 { 544 int error; 545 546 mutex_enter(&vnode_free_list_lock); 547 548 for (;;) { 549 cv_timedwait(&vdrain_cv, &vnode_free_list_lock, hz); 550 while (numvnodes > desiredvnodes) { 551 error = cleanvnode(); 552 if (error) 553 kpause("vndsbusy", false, hz, NULL); 554 mutex_enter(&vnode_free_list_lock); 555 if (error) 556 break; 557 } 558 } 559 } 560 561 /* 562 * Remove a vnode from its freelist. 563 */ 564 void 565 vremfree(vnode_t *vp) 566 { 567 568 KASSERT(mutex_owned(vp->v_interlock)); 569 KASSERT(vp->v_usecount == 0); 570 571 /* 572 * Note that the reference count must not change until 573 * the vnode is removed. 574 */ 575 mutex_enter(&vnode_free_list_lock); 576 if (vp->v_holdcnt > 0) { 577 KASSERT(vp->v_freelisthd == &vnode_hold_list); 578 } else { 579 KASSERT(vp->v_freelisthd == &vnode_free_list); 580 } 581 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist); 582 vp->v_freelisthd = NULL; 583 mutex_exit(&vnode_free_list_lock); 584 } 585 586 /* 587 * vget: get a particular vnode from the free list, increment its reference 588 * count and return it. 589 * 590 * => Must be called with v_interlock held. 591 * 592 * If state is VN_RECLAIMING, the vnode may be eliminated in vcache_reclaim(). 593 * In that case, we cannot grab the vnode, so the process is awakened when 594 * the transition is completed, and an error returned to indicate that the 595 * vnode is no longer usable. 596 * 597 * If state is VN_LOADING or VN_BLOCKED, wait until the vnode enters a 598 * stable state (VN_ACTIVE or VN_RECLAIMED). 
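 *
 * Illustrative use (a sketch, assuming the caller found vp under
 * v_interlock, e.g. via a hash lookup):
 *
 *	mutex_enter(vp->v_interlock);
 *	error = vget(vp, 0, true);	v_interlock is released on return
 *	if (error != 0)
 *		return error;		vnode was reclaimed
 *	... use the referenced vnode, then vrele(vp) ...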
599 */ 600 int 601 vget(vnode_t *vp, int flags, bool waitok) 602 { 603 604 KASSERT(mutex_owned(vp->v_interlock)); 605 KASSERT((flags & ~LK_NOWAIT) == 0); 606 KASSERT(waitok == ((flags & LK_NOWAIT) == 0)); 607 608 /* 609 * Before adding a reference, we must remove the vnode 610 * from its freelist. 611 */ 612 if (vp->v_usecount == 0) { 613 vremfree(vp); 614 vp->v_usecount = 1; 615 } else { 616 atomic_inc_uint(&vp->v_usecount); 617 } 618 619 /* 620 * If the vnode is in the process of changing state we wait 621 * for the change to complete and take care not to return 622 * a clean vnode. 623 */ 624 if (! ISSET(flags, LK_NOWAIT)) 625 VSTATE_WAIT_STABLE(vp); 626 if (VSTATE_GET(vp) == VN_RECLAIMED) { 627 vrelel(vp, 0); 628 return ENOENT; 629 } else if (VSTATE_GET(vp) != VN_ACTIVE) { 630 KASSERT(ISSET(flags, LK_NOWAIT)); 631 vrelel(vp, 0); 632 return EBUSY; 633 } 634 635 /* 636 * Ok, we got it in good shape. 637 */ 638 VSTATE_ASSERT(vp, VN_ACTIVE); 639 mutex_exit(vp->v_interlock); 640 641 return 0; 642 } 643 644 /* 645 * vput: unlock and release the reference. 646 */ 647 void 648 vput(vnode_t *vp) 649 { 650 651 VOP_UNLOCK(vp); 652 vrele(vp); 653 } 654 655 /* 656 * Try to drop reference on a vnode. Abort if we are releasing the 657 * last reference. Note: this _must_ succeed if not the last reference. 658 */ 659 static inline bool 660 vtryrele(vnode_t *vp) 661 { 662 u_int use, next; 663 664 for (use = vp->v_usecount;; use = next) { 665 if (use == 1) { 666 return false; 667 } 668 KASSERT(use > 1); 669 next = atomic_cas_uint(&vp->v_usecount, use, use - 1); 670 if (__predict_true(next == use)) { 671 return true; 672 } 673 } 674 } 675 676 /* 677 * Vnode release. If reference count drops to zero, call inactive 678 * routine and either return to freelist or free to the pool. 679 */ 680 static void 681 vrelel(vnode_t *vp, int flags) 682 { 683 bool recycle, defer; 684 int error; 685 686 KASSERT(mutex_owned(vp->v_interlock)); 687 KASSERT(vp->v_freelisthd == NULL); 688 689 if (__predict_false(vp->v_op == dead_vnodeop_p && 690 VSTATE_GET(vp) != VN_RECLAIMED)) { 691 vnpanic(vp, "dead but not clean"); 692 } 693 694 /* 695 * If not the last reference, just drop the reference count 696 * and unlock. 697 */ 698 if (vtryrele(vp)) { 699 mutex_exit(vp->v_interlock); 700 return; 701 } 702 if (vp->v_usecount <= 0 || vp->v_writecount != 0) { 703 vnpanic(vp, "%s: bad ref count", __func__); 704 } 705 706 #ifdef DIAGNOSTIC 707 if ((vp->v_type == VBLK || vp->v_type == VCHR) && 708 vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) { 709 vprint("vrelel: missing VOP_CLOSE()", vp); 710 } 711 #endif 712 713 /* 714 * If not clean, deactivate the vnode, but preserve 715 * our reference across the call to VOP_INACTIVE(). 716 */ 717 if (VSTATE_GET(vp) != VN_RECLAIMED) { 718 recycle = false; 719 720 /* 721 * XXX This ugly block can be largely eliminated if 722 * locking is pushed down into the file systems. 723 * 724 * Defer vnode release to vrele_thread if caller 725 * requests it explicitly or is the pagedaemon. 726 */ 727 if ((curlwp == uvm.pagedaemon_lwp) || 728 (flags & VRELEL_ASYNC_RELE) != 0) { 729 defer = true; 730 } else if (curlwp == vrele_lwp) { 731 /* 732 * We have to try harder. 733 */ 734 mutex_exit(vp->v_interlock); 735 error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 736 KASSERTMSG((error == 0), "vn_lock failed: %d", error); 737 mutex_enter(vp->v_interlock); 738 defer = false; 739 } else { 740 /* If we can't acquire the lock, then defer. 
*/ 741 mutex_exit(vp->v_interlock); 742 error = vn_lock(vp, 743 LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT); 744 defer = (error != 0); 745 mutex_enter(vp->v_interlock); 746 } 747 748 KASSERT(mutex_owned(vp->v_interlock)); 749 KASSERT(! (curlwp == vrele_lwp && defer)); 750 751 if (defer) { 752 /* 753 * Defer reclaim to the kthread; it's not safe to 754 * clean it here. We donate it our last reference. 755 */ 756 mutex_enter(&vrele_lock); 757 TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist); 758 if (++vrele_pending > (desiredvnodes >> 8)) 759 cv_signal(&vrele_cv); 760 mutex_exit(&vrele_lock); 761 mutex_exit(vp->v_interlock); 762 return; 763 } 764 765 /* 766 * If the node got another reference while we 767 * released the interlock, don't try to inactivate it yet. 768 */ 769 if (__predict_false(vtryrele(vp))) { 770 VOP_UNLOCK(vp); 771 mutex_exit(vp->v_interlock); 772 return; 773 } 774 VSTATE_CHANGE(vp, VN_ACTIVE, VN_BLOCKED); 775 mutex_exit(vp->v_interlock); 776 777 /* 778 * The vnode must not gain another reference while being 779 * deactivated. If VOP_INACTIVE() indicates that 780 * the described file has been deleted, then recycle 781 * the vnode. 782 * 783 * Note that VOP_INACTIVE() will drop the vnode lock. 784 */ 785 VOP_INACTIVE(vp, &recycle); 786 if (recycle) { 787 /* vcache_reclaim() below will drop the lock. */ 788 if (vn_lock(vp, LK_EXCLUSIVE) != 0) 789 recycle = false; 790 } 791 mutex_enter(vp->v_interlock); 792 VSTATE_CHANGE(vp, VN_BLOCKED, VN_ACTIVE); 793 if (!recycle) { 794 if (vtryrele(vp)) { 795 mutex_exit(vp->v_interlock); 796 return; 797 } 798 } 799 800 /* Take care of space accounting. */ 801 if (vp->v_iflag & VI_EXECMAP) { 802 atomic_add_int(&uvmexp.execpages, 803 -vp->v_uobj.uo_npages); 804 atomic_add_int(&uvmexp.filepages, 805 vp->v_uobj.uo_npages); 806 } 807 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP); 808 vp->v_vflag &= ~VV_MAPPED; 809 810 /* 811 * Recycle the vnode if the file is now unused (unlinked), 812 * otherwise just free it. 813 */ 814 if (recycle) { 815 VSTATE_ASSERT(vp, VN_ACTIVE); 816 vcache_reclaim(vp); 817 } 818 KASSERT(vp->v_usecount > 0); 819 } 820 821 if (atomic_dec_uint_nv(&vp->v_usecount) != 0) { 822 /* Gained another reference while being reclaimed. */ 823 mutex_exit(vp->v_interlock); 824 return; 825 } 826 827 if (VSTATE_GET(vp) == VN_RECLAIMED) { 828 /* 829 * It's clean so destroy it. It isn't referenced 830 * anywhere since it has been reclaimed. 831 */ 832 KASSERT(vp->v_holdcnt == 0); 833 KASSERT(vp->v_writecount == 0); 834 mutex_exit(vp->v_interlock); 835 vfs_insmntque(vp, NULL); 836 if (vp->v_type == VBLK || vp->v_type == VCHR) { 837 spec_node_destroy(vp); 838 } 839 vcache_free(VP_TO_VN(vp)); 840 } else { 841 /* 842 * Otherwise, put it back onto the freelist. It 843 * can't be destroyed while still associated with 844 * a file system. 845 */ 846 mutex_enter(&vnode_free_list_lock); 847 if (vp->v_holdcnt > 0) { 848 vp->v_freelisthd = &vnode_hold_list; 849 } else { 850 vp->v_freelisthd = &vnode_free_list; 851 } 852 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist); 853 mutex_exit(&vnode_free_list_lock); 854 mutex_exit(vp->v_interlock); 855 } 856 } 857 858 void 859 vrele(vnode_t *vp) 860 { 861 862 if (vtryrele(vp)) { 863 return; 864 } 865 mutex_enter(vp->v_interlock); 866 vrelel(vp, 0); 867 } 868 869 /* 870 * Asynchronous vnode release, vnode is released in different context. 
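 * The final-reference work (VOP_INACTIVE() and a possible reclaim) is
 * handed off to the vrele kthread instead of being done in the caller's
 * context, so the caller never has to take the vnode lock here.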
871 */ 872 void 873 vrele_async(vnode_t *vp) 874 { 875 876 if (vtryrele(vp)) { 877 return; 878 } 879 mutex_enter(vp->v_interlock); 880 vrelel(vp, VRELEL_ASYNC_RELE); 881 } 882 883 static void 884 vrele_thread(void *cookie) 885 { 886 vnodelst_t skip_list; 887 vnode_t *vp; 888 struct mount *mp; 889 890 TAILQ_INIT(&skip_list); 891 892 mutex_enter(&vrele_lock); 893 for (;;) { 894 while (TAILQ_EMPTY(&vrele_list)) { 895 vrele_gen++; 896 cv_broadcast(&vrele_cv); 897 cv_timedwait(&vrele_cv, &vrele_lock, hz); 898 TAILQ_CONCAT(&vrele_list, &skip_list, v_freelist); 899 } 900 vp = TAILQ_FIRST(&vrele_list); 901 mp = vp->v_mount; 902 TAILQ_REMOVE(&vrele_list, vp, v_freelist); 903 if (fstrans_start_nowait(mp, FSTRANS_LAZY) != 0) { 904 TAILQ_INSERT_TAIL(&skip_list, vp, v_freelist); 905 continue; 906 } 907 vrele_pending--; 908 mutex_exit(&vrele_lock); 909 910 /* 911 * If not the last reference, then ignore the vnode 912 * and look for more work. 913 */ 914 mutex_enter(vp->v_interlock); 915 vrelel(vp, 0); 916 fstrans_done(mp); 917 mutex_enter(&vrele_lock); 918 } 919 } 920 921 void 922 vrele_flush(void) 923 { 924 int gen; 925 926 mutex_enter(&vrele_lock); 927 gen = vrele_gen; 928 while (vrele_pending && gen == vrele_gen) { 929 cv_broadcast(&vrele_cv); 930 cv_wait(&vrele_cv, &vrele_lock); 931 } 932 mutex_exit(&vrele_lock); 933 } 934 935 /* 936 * Vnode reference, where a reference is already held by some other 937 * object (for example, a file structure). 938 */ 939 void 940 vref(vnode_t *vp) 941 { 942 943 KASSERT(vp->v_usecount != 0); 944 945 atomic_inc_uint(&vp->v_usecount); 946 } 947 948 /* 949 * Page or buffer structure gets a reference. 950 * Called with v_interlock held. 951 */ 952 void 953 vholdl(vnode_t *vp) 954 { 955 956 KASSERT(mutex_owned(vp->v_interlock)); 957 958 if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) { 959 mutex_enter(&vnode_free_list_lock); 960 KASSERT(vp->v_freelisthd == &vnode_free_list); 961 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist); 962 vp->v_freelisthd = &vnode_hold_list; 963 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist); 964 mutex_exit(&vnode_free_list_lock); 965 } 966 } 967 968 /* 969 * Page or buffer structure frees a reference. 970 * Called with v_interlock held. 971 */ 972 void 973 holdrelel(vnode_t *vp) 974 { 975 976 KASSERT(mutex_owned(vp->v_interlock)); 977 978 if (vp->v_holdcnt <= 0) { 979 vnpanic(vp, "%s: holdcnt vp %p", __func__, vp); 980 } 981 982 vp->v_holdcnt--; 983 if (vp->v_holdcnt == 0 && vp->v_usecount == 0) { 984 mutex_enter(&vnode_free_list_lock); 985 KASSERT(vp->v_freelisthd == &vnode_hold_list); 986 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist); 987 vp->v_freelisthd = &vnode_free_list; 988 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist); 989 mutex_exit(&vnode_free_list_lock); 990 } 991 } 992 993 /* 994 * Recycle an unused vnode if caller holds the last reference. 995 */ 996 bool 997 vrecycle(vnode_t *vp) 998 { 999 1000 if (vn_lock(vp, LK_EXCLUSIVE) != 0) 1001 return false; 1002 1003 mutex_enter(vp->v_interlock); 1004 1005 if (vp->v_usecount != 1) { 1006 mutex_exit(vp->v_interlock); 1007 VOP_UNLOCK(vp); 1008 return false; 1009 } 1010 vcache_reclaim(vp); 1011 vrelel(vp, 0); 1012 return true; 1013 } 1014 1015 /* 1016 * Eliminate all activity associated with the requested vnode 1017 * and with all vnodes aliased to the requested vnode. 
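 * For block and character devices, every vnode aliased to the same device
 * (as found via spec_node_lookup_by_dev()) is reclaimed as well.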
1018 */ 1019 void 1020 vrevoke(vnode_t *vp) 1021 { 1022 vnode_t *vq; 1023 enum vtype type; 1024 dev_t dev; 1025 1026 KASSERT(vp->v_usecount > 0); 1027 1028 mutex_enter(vp->v_interlock); 1029 VSTATE_WAIT_STABLE(vp); 1030 if (VSTATE_GET(vp) == VN_RECLAIMED) { 1031 mutex_exit(vp->v_interlock); 1032 return; 1033 } else if (vp->v_type != VBLK && vp->v_type != VCHR) { 1034 atomic_inc_uint(&vp->v_usecount); 1035 mutex_exit(vp->v_interlock); 1036 vgone(vp); 1037 return; 1038 } else { 1039 dev = vp->v_rdev; 1040 type = vp->v_type; 1041 mutex_exit(vp->v_interlock); 1042 } 1043 1044 while (spec_node_lookup_by_dev(type, dev, &vq) == 0) { 1045 vgone(vq); 1046 } 1047 } 1048 1049 /* 1050 * Eliminate all activity associated with a vnode in preparation for 1051 * reuse. Drops a reference from the vnode. 1052 */ 1053 void 1054 vgone(vnode_t *vp) 1055 { 1056 1057 if (vn_lock(vp, LK_EXCLUSIVE) != 0) { 1058 VSTATE_ASSERT(vp, VN_RECLAIMED); 1059 vrele(vp); 1060 } 1061 1062 mutex_enter(vp->v_interlock); 1063 vcache_reclaim(vp); 1064 vrelel(vp, 0); 1065 } 1066 1067 static inline uint32_t 1068 vcache_hash(const struct vcache_key *key) 1069 { 1070 uint32_t hash = HASH32_BUF_INIT; 1071 1072 hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash); 1073 hash = hash32_buf(key->vk_key, key->vk_key_len, hash); 1074 return hash; 1075 } 1076 1077 static void 1078 vcache_init(void) 1079 { 1080 1081 vcache.pool = pool_cache_init(sizeof(struct vcache_node), 0, 0, 0, 1082 "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL); 1083 KASSERT(vcache.pool != NULL); 1084 mutex_init(&vcache.lock, MUTEX_DEFAULT, IPL_NONE); 1085 cv_init(&vcache.cv, "vcache"); 1086 vcache.hashtab = hashinit(desiredvnodes, HASH_SLIST, true, 1087 &vcache.hashmask); 1088 } 1089 1090 static void 1091 vcache_reinit(void) 1092 { 1093 int i; 1094 uint32_t hash; 1095 u_long oldmask, newmask; 1096 struct hashhead *oldtab, *newtab; 1097 struct vcache_node *node; 1098 1099 newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask); 1100 mutex_enter(&vcache.lock); 1101 oldtab = vcache.hashtab; 1102 oldmask = vcache.hashmask; 1103 vcache.hashtab = newtab; 1104 vcache.hashmask = newmask; 1105 for (i = 0; i <= oldmask; i++) { 1106 while ((node = SLIST_FIRST(&oldtab[i])) != NULL) { 1107 SLIST_REMOVE(&oldtab[i], node, vcache_node, vn_hash); 1108 hash = vcache_hash(&node->vn_key); 1109 SLIST_INSERT_HEAD(&newtab[hash & vcache.hashmask], 1110 node, vn_hash); 1111 } 1112 } 1113 mutex_exit(&vcache.lock); 1114 hashdone(oldtab, HASH_SLIST, oldmask); 1115 } 1116 1117 static inline struct vcache_node * 1118 vcache_hash_lookup(const struct vcache_key *key, uint32_t hash) 1119 { 1120 struct hashhead *hashp; 1121 struct vcache_node *node; 1122 1123 KASSERT(mutex_owned(&vcache.lock)); 1124 1125 hashp = &vcache.hashtab[hash & vcache.hashmask]; 1126 SLIST_FOREACH(node, hashp, vn_hash) { 1127 if (key->vk_mount != node->vn_key.vk_mount) 1128 continue; 1129 if (key->vk_key_len != node->vn_key.vk_key_len) 1130 continue; 1131 if (memcmp(key->vk_key, node->vn_key.vk_key, key->vk_key_len)) 1132 continue; 1133 return node; 1134 } 1135 return NULL; 1136 } 1137 1138 /* 1139 * Allocate a new, uninitialized vcache node. 
1140 */ 1141 static struct vcache_node * 1142 vcache_alloc(void) 1143 { 1144 struct vcache_node *node; 1145 vnode_t *vp; 1146 1147 node = pool_cache_get(vcache.pool, PR_WAITOK); 1148 memset(node, 0, sizeof(*node)); 1149 1150 /* SLIST_INIT(&node->vn_hash); */ 1151 1152 vp = VN_TO_VP(node); 1153 uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0); 1154 cv_init(&vp->v_cv, "vnode"); 1155 /* LIST_INIT(&vp->v_nclist); */ 1156 /* LIST_INIT(&vp->v_dnclist); */ 1157 1158 mutex_enter(&vnode_free_list_lock); 1159 numvnodes++; 1160 if (numvnodes > desiredvnodes + desiredvnodes / 10) 1161 cv_signal(&vdrain_cv); 1162 mutex_exit(&vnode_free_list_lock); 1163 1164 rw_init(&vp->v_lock); 1165 vp->v_usecount = 1; 1166 vp->v_type = VNON; 1167 vp->v_size = vp->v_writesize = VSIZENOTSET; 1168 1169 node->vn_state = VN_LOADING; 1170 1171 return node; 1172 } 1173 1174 /* 1175 * Free an unused, unreferenced vcache node. 1176 */ 1177 static void 1178 vcache_free(struct vcache_node *node) 1179 { 1180 vnode_t *vp; 1181 1182 vp = VN_TO_VP(node); 1183 1184 KASSERT(vp->v_usecount == 0); 1185 1186 rw_destroy(&vp->v_lock); 1187 mutex_enter(&vnode_free_list_lock); 1188 numvnodes--; 1189 mutex_exit(&vnode_free_list_lock); 1190 1191 uvm_obj_destroy(&vp->v_uobj, true); 1192 cv_destroy(&vp->v_cv); 1193 pool_cache_put(vcache.pool, node); 1194 } 1195 1196 /* 1197 * Get a vnode / fs node pair by key and return it referenced through vpp. 1198 */ 1199 int 1200 vcache_get(struct mount *mp, const void *key, size_t key_len, 1201 struct vnode **vpp) 1202 { 1203 int error; 1204 uint32_t hash; 1205 const void *new_key; 1206 struct vnode *vp; 1207 struct vcache_key vcache_key; 1208 struct vcache_node *node, *new_node; 1209 1210 new_key = NULL; 1211 *vpp = NULL; 1212 1213 vcache_key.vk_mount = mp; 1214 vcache_key.vk_key = key; 1215 vcache_key.vk_key_len = key_len; 1216 hash = vcache_hash(&vcache_key); 1217 1218 again: 1219 mutex_enter(&vcache.lock); 1220 node = vcache_hash_lookup(&vcache_key, hash); 1221 1222 /* If found, take a reference or retry. */ 1223 if (__predict_true(node != NULL)) { 1224 /* 1225 * If the vnode is loading we cannot take the v_interlock 1226 * here as it might change during load (see uvm_obj_setlock()). 1227 * As changing state from VN_LOADING requires both vcache.lock 1228 * and v_interlock it is safe to test with vcache.lock held. 1229 * 1230 * Wait for vnodes changing state from VN_LOADING and retry. 1231 */ 1232 if (__predict_false(node->vn_state == VN_LOADING)) { 1233 cv_wait(&vcache.cv, &vcache.lock); 1234 mutex_exit(&vcache.lock); 1235 goto again; 1236 } 1237 vp = VN_TO_VP(node); 1238 mutex_enter(vp->v_interlock); 1239 mutex_exit(&vcache.lock); 1240 error = vget(vp, 0, true /* wait */); 1241 if (error == ENOENT) 1242 goto again; 1243 if (error == 0) 1244 *vpp = vp; 1245 KASSERT((error != 0) == (*vpp == NULL)); 1246 return error; 1247 } 1248 mutex_exit(&vcache.lock); 1249 1250 /* Allocate and initialize a new vcache / vnode pair. */ 1251 error = vfs_busy(mp, NULL); 1252 if (error) 1253 return error; 1254 new_node = vcache_alloc(); 1255 new_node->vn_key = vcache_key; 1256 vp = VN_TO_VP(new_node); 1257 mutex_enter(&vcache.lock); 1258 node = vcache_hash_lookup(&vcache_key, hash); 1259 if (node == NULL) { 1260 SLIST_INSERT_HEAD(&vcache.hashtab[hash & vcache.hashmask], 1261 new_node, vn_hash); 1262 node = new_node; 1263 } 1264 1265 /* If another thread beat us inserting this node, retry. 
*/ 1266 if (node != new_node) { 1267 mutex_enter(vp->v_interlock); 1268 VSTATE_CHANGE(vp, VN_LOADING, VN_RECLAIMED); 1269 mutex_exit(&vcache.lock); 1270 vrelel(vp, 0); 1271 vfs_unbusy(mp, false, NULL); 1272 goto again; 1273 } 1274 mutex_exit(&vcache.lock); 1275 1276 /* Load the fs node. Exclusive as new_node is VN_LOADING. */ 1277 error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key); 1278 if (error) { 1279 mutex_enter(&vcache.lock); 1280 SLIST_REMOVE(&vcache.hashtab[hash & vcache.hashmask], 1281 new_node, vcache_node, vn_hash); 1282 mutex_enter(vp->v_interlock); 1283 VSTATE_CHANGE(vp, VN_LOADING, VN_RECLAIMED); 1284 mutex_exit(&vcache.lock); 1285 vrelel(vp, 0); 1286 vfs_unbusy(mp, false, NULL); 1287 KASSERT(*vpp == NULL); 1288 return error; 1289 } 1290 KASSERT(new_key != NULL); 1291 KASSERT(memcmp(key, new_key, key_len) == 0); 1292 KASSERT(vp->v_op != NULL); 1293 vfs_insmntque(vp, mp); 1294 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0) 1295 vp->v_vflag |= VV_MPSAFE; 1296 vfs_unbusy(mp, true, NULL); 1297 1298 /* Finished loading, finalize node. */ 1299 mutex_enter(&vcache.lock); 1300 new_node->vn_key.vk_key = new_key; 1301 mutex_enter(vp->v_interlock); 1302 VSTATE_CHANGE(vp, VN_LOADING, VN_ACTIVE); 1303 mutex_exit(vp->v_interlock); 1304 mutex_exit(&vcache.lock); 1305 *vpp = vp; 1306 return 0; 1307 } 1308 1309 /* 1310 * Create a new vnode / fs node pair and return it referenced through vpp. 1311 */ 1312 int 1313 vcache_new(struct mount *mp, struct vnode *dvp, struct vattr *vap, 1314 kauth_cred_t cred, struct vnode **vpp) 1315 { 1316 int error; 1317 uint32_t hash; 1318 struct vnode *ovp, *vp; 1319 struct vcache_node *new_node; 1320 struct vcache_node *old_node __diagused; 1321 1322 *vpp = NULL; 1323 1324 /* Allocate and initialize a new vcache / vnode pair. */ 1325 error = vfs_busy(mp, NULL); 1326 if (error) 1327 return error; 1328 new_node = vcache_alloc(); 1329 new_node->vn_key.vk_mount = mp; 1330 vp = VN_TO_VP(new_node); 1331 1332 /* Create and load the fs node. */ 1333 error = VFS_NEWVNODE(mp, dvp, vp, vap, cred, 1334 &new_node->vn_key.vk_key_len, &new_node->vn_key.vk_key); 1335 if (error) { 1336 mutex_enter(&vcache.lock); 1337 mutex_enter(vp->v_interlock); 1338 VSTATE_CHANGE(vp, VN_LOADING, VN_RECLAIMED); 1339 mutex_exit(&vcache.lock); 1340 vrelel(vp, 0); 1341 vfs_unbusy(mp, false, NULL); 1342 KASSERT(*vpp == NULL); 1343 return error; 1344 } 1345 KASSERT(new_node->vn_key.vk_key != NULL); 1346 KASSERT(vp->v_op != NULL); 1347 hash = vcache_hash(&new_node->vn_key); 1348 1349 /* Wait for previous instance to be reclaimed, then insert new node. */ 1350 mutex_enter(&vcache.lock); 1351 while ((old_node = vcache_hash_lookup(&new_node->vn_key, hash))) { 1352 ovp = VN_TO_VP(old_node); 1353 mutex_enter(ovp->v_interlock); 1354 mutex_exit(&vcache.lock); 1355 error = vget(ovp, 0, true /* wait */); 1356 KASSERT(error == ENOENT); 1357 mutex_enter(&vcache.lock); 1358 } 1359 SLIST_INSERT_HEAD(&vcache.hashtab[hash & vcache.hashmask], 1360 new_node, vn_hash); 1361 mutex_exit(&vcache.lock); 1362 vfs_insmntque(vp, mp); 1363 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0) 1364 vp->v_vflag |= VV_MPSAFE; 1365 vfs_unbusy(mp, true, NULL); 1366 1367 /* Finished loading, finalize node. */ 1368 mutex_enter(&vcache.lock); 1369 mutex_enter(vp->v_interlock); 1370 VSTATE_CHANGE(vp, VN_LOADING, VN_ACTIVE); 1371 mutex_exit(&vcache.lock); 1372 mutex_exit(vp->v_interlock); 1373 *vpp = vp; 1374 return 0; 1375 } 1376 1377 /* 1378 * Prepare key change: lock old and new cache node. 1379 * Return an error if the new node already exists. 
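 *
 * Expected calling pattern (a sketch; the key layout is file system
 * specific):
 *
 *	error = vcache_rekey_enter(mp, vp, &old_key, sizeof(old_key),
 *	    &new_key, sizeof(new_key));
 *	if (error == 0) {
 *		... update the file system's own copy of the key ...
 *		vcache_rekey_exit(mp, vp, &old_key, sizeof(old_key),
 *		    &new_key, sizeof(new_key));
 *	}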
1380 */ 1381 int 1382 vcache_rekey_enter(struct mount *mp, struct vnode *vp, 1383 const void *old_key, size_t old_key_len, 1384 const void *new_key, size_t new_key_len) 1385 { 1386 uint32_t old_hash, new_hash; 1387 struct vcache_key old_vcache_key, new_vcache_key; 1388 struct vcache_node *node, *new_node; 1389 struct vnode *tvp; 1390 1391 old_vcache_key.vk_mount = mp; 1392 old_vcache_key.vk_key = old_key; 1393 old_vcache_key.vk_key_len = old_key_len; 1394 old_hash = vcache_hash(&old_vcache_key); 1395 1396 new_vcache_key.vk_mount = mp; 1397 new_vcache_key.vk_key = new_key; 1398 new_vcache_key.vk_key_len = new_key_len; 1399 new_hash = vcache_hash(&new_vcache_key); 1400 1401 new_node = vcache_alloc(); 1402 new_node->vn_key = new_vcache_key; 1403 tvp = VN_TO_VP(new_node); 1404 1405 /* Insert locked new node used as placeholder. */ 1406 mutex_enter(&vcache.lock); 1407 node = vcache_hash_lookup(&new_vcache_key, new_hash); 1408 if (node != NULL) { 1409 mutex_enter(tvp->v_interlock); 1410 VSTATE_CHANGE(tvp, VN_LOADING, VN_RECLAIMED); 1411 mutex_exit(&vcache.lock); 1412 vrelel(tvp, 0); 1413 return EEXIST; 1414 } 1415 SLIST_INSERT_HEAD(&vcache.hashtab[new_hash & vcache.hashmask], 1416 new_node, vn_hash); 1417 1418 /* Lock old node. */ 1419 node = vcache_hash_lookup(&old_vcache_key, old_hash); 1420 KASSERT(node != NULL); 1421 KASSERT(VN_TO_VP(node) == vp); 1422 mutex_enter(vp->v_interlock); 1423 VSTATE_CHANGE(vp, VN_ACTIVE, VN_BLOCKED); 1424 node->vn_key = old_vcache_key; 1425 mutex_exit(vp->v_interlock); 1426 mutex_exit(&vcache.lock); 1427 return 0; 1428 } 1429 1430 /* 1431 * Key change complete: remove old node and unlock new node. 1432 */ 1433 void 1434 vcache_rekey_exit(struct mount *mp, struct vnode *vp, 1435 const void *old_key, size_t old_key_len, 1436 const void *new_key, size_t new_key_len) 1437 { 1438 uint32_t old_hash, new_hash; 1439 struct vcache_key old_vcache_key, new_vcache_key; 1440 struct vcache_node *old_node, *new_node; 1441 struct vnode *tvp; 1442 1443 old_vcache_key.vk_mount = mp; 1444 old_vcache_key.vk_key = old_key; 1445 old_vcache_key.vk_key_len = old_key_len; 1446 old_hash = vcache_hash(&old_vcache_key); 1447 1448 new_vcache_key.vk_mount = mp; 1449 new_vcache_key.vk_key = new_key; 1450 new_vcache_key.vk_key_len = new_key_len; 1451 new_hash = vcache_hash(&new_vcache_key); 1452 1453 mutex_enter(&vcache.lock); 1454 1455 /* Lookup old and new node. */ 1456 old_node = vcache_hash_lookup(&old_vcache_key, old_hash); 1457 KASSERT(old_node != NULL); 1458 KASSERT(VN_TO_VP(old_node) == vp); 1459 mutex_enter(vp->v_interlock); 1460 VSTATE_ASSERT(vp, VN_BLOCKED); 1461 1462 new_node = vcache_hash_lookup(&new_vcache_key, new_hash); 1463 KASSERT(new_node != NULL); 1464 KASSERT(new_node->vn_key.vk_key_len == new_key_len); 1465 tvp = VN_TO_VP(new_node); 1466 mutex_enter(tvp->v_interlock); 1467 VSTATE_ASSERT(VN_TO_VP(new_node), VN_LOADING); 1468 1469 /* Rekey old node and put it onto its new hashlist. */ 1470 old_node->vn_key = new_vcache_key; 1471 if (old_hash != new_hash) { 1472 SLIST_REMOVE(&vcache.hashtab[old_hash & vcache.hashmask], 1473 old_node, vcache_node, vn_hash); 1474 SLIST_INSERT_HEAD(&vcache.hashtab[new_hash & vcache.hashmask], 1475 old_node, vn_hash); 1476 } 1477 VSTATE_CHANGE(vp, VN_BLOCKED, VN_ACTIVE); 1478 mutex_exit(vp->v_interlock); 1479 1480 /* Remove new node used as placeholder. 
*/ 1481 SLIST_REMOVE(&vcache.hashtab[new_hash & vcache.hashmask], 1482 new_node, vcache_node, vn_hash); 1483 VSTATE_CHANGE(tvp, VN_LOADING, VN_RECLAIMED); 1484 mutex_exit(&vcache.lock); 1485 vrelel(tvp, 0); 1486 } 1487 1488 /* 1489 * Disassociate the underlying file system from a vnode. 1490 * 1491 * Must be called with vnode locked and will return unlocked. 1492 * Must be called with the interlock held, and will return with it held. 1493 */ 1494 static void 1495 vcache_reclaim(vnode_t *vp) 1496 { 1497 lwp_t *l = curlwp; 1498 struct vcache_node *node = VP_TO_VN(vp); 1499 uint32_t hash; 1500 uint8_t temp_buf[64], *temp_key; 1501 size_t temp_key_len; 1502 bool recycle, active; 1503 int error; 1504 1505 KASSERT((vp->v_vflag & VV_LOCKSWORK) == 0 || 1506 VOP_ISLOCKED(vp) == LK_EXCLUSIVE); 1507 KASSERT(mutex_owned(vp->v_interlock)); 1508 KASSERT(vp->v_usecount != 0); 1509 1510 active = (vp->v_usecount > 1); 1511 temp_key_len = node->vn_key.vk_key_len; 1512 /* 1513 * Prevent the vnode from being recycled or brought into use 1514 * while we clean it out. 1515 */ 1516 VSTATE_CHANGE(vp, VN_ACTIVE, VN_RECLAIMING); 1517 if (vp->v_iflag & VI_EXECMAP) { 1518 atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages); 1519 atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages); 1520 } 1521 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP); 1522 mutex_exit(vp->v_interlock); 1523 1524 /* Replace the vnode key with a temporary copy. */ 1525 if (node->vn_key.vk_key_len > sizeof(temp_buf)) { 1526 temp_key = kmem_alloc(temp_key_len, KM_SLEEP); 1527 } else { 1528 temp_key = temp_buf; 1529 } 1530 mutex_enter(&vcache.lock); 1531 memcpy(temp_key, node->vn_key.vk_key, temp_key_len); 1532 node->vn_key.vk_key = temp_key; 1533 mutex_exit(&vcache.lock); 1534 1535 /* 1536 * Clean out any cached data associated with the vnode. 1537 * If purging an active vnode, it must be closed and 1538 * deactivated before being reclaimed. Note that the 1539 * VOP_INACTIVE will unlock the vnode. 1540 */ 1541 error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0); 1542 if (error != 0) { 1543 if (wapbl_vphaswapbl(vp)) 1544 WAPBL_DISCARD(wapbl_vptomp(vp)); 1545 error = vinvalbuf(vp, 0, NOCRED, l, 0, 0); 1546 } 1547 KASSERTMSG((error == 0), "vinvalbuf failed: %d", error); 1548 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); 1549 if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) { 1550 spec_node_revoke(vp); 1551 } 1552 if (active) { 1553 VOP_INACTIVE(vp, &recycle); 1554 } else { 1555 /* 1556 * Any other processes trying to obtain this lock must first 1557 * wait for VN_RECLAIMED, then call the new lock operation. 1558 */ 1559 VOP_UNLOCK(vp); 1560 } 1561 1562 /* Disassociate the underlying file system from the vnode. */ 1563 if (VOP_RECLAIM(vp)) { 1564 vnpanic(vp, "%s: cannot reclaim", __func__); 1565 } 1566 1567 KASSERT(vp->v_data == NULL); 1568 KASSERT(vp->v_uobj.uo_npages == 0); 1569 1570 if (vp->v_type == VREG && vp->v_ractx != NULL) { 1571 uvm_ra_freectx(vp->v_ractx); 1572 vp->v_ractx = NULL; 1573 } 1574 1575 /* Purge name cache. */ 1576 cache_purge(vp); 1577 1578 /* Move to dead mount. */ 1579 vp->v_vflag &= ~VV_ROOT; 1580 atomic_inc_uint(&dead_rootmount->mnt_refcnt); 1581 vfs_insmntque(vp, dead_rootmount); 1582 1583 /* Remove from vnode cache. 
*/ 1584 hash = vcache_hash(&node->vn_key); 1585 mutex_enter(&vcache.lock); 1586 KASSERT(node == vcache_hash_lookup(&node->vn_key, hash)); 1587 SLIST_REMOVE(&vcache.hashtab[hash & vcache.hashmask], 1588 node, vcache_node, vn_hash); 1589 mutex_exit(&vcache.lock); 1590 if (temp_key != temp_buf) 1591 kmem_free(temp_key, temp_key_len); 1592 1593 /* Done with purge, notify sleepers of the grim news. */ 1594 mutex_enter(vp->v_interlock); 1595 vp->v_op = dead_vnodeop_p; 1596 vp->v_vflag |= VV_LOCKSWORK; 1597 VSTATE_CHANGE(vp, VN_RECLAIMING, VN_RECLAIMED); 1598 vp->v_tag = VT_NON; 1599 KNOTE(&vp->v_klist, NOTE_REVOKE); 1600 1601 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); 1602 } 1603 1604 /* 1605 * Print a vcache node. 1606 */ 1607 void 1608 vcache_print(vnode_t *vp, const char *prefix, void (*pr)(const char *, ...)) 1609 { 1610 int n; 1611 const uint8_t *cp; 1612 struct vcache_node *node; 1613 1614 node = VP_TO_VN(vp); 1615 n = node->vn_key.vk_key_len; 1616 cp = node->vn_key.vk_key; 1617 1618 (*pr)("%sstate %s, key(%d)", prefix, vstate_name(node->vn_state), n); 1619 1620 while (n-- > 0) 1621 (*pr)(" %02x", *cp++); 1622 (*pr)("\n"); 1623 } 1624 1625 /* 1626 * Update outstanding I/O count and do wakeup if requested. 1627 */ 1628 void 1629 vwakeup(struct buf *bp) 1630 { 1631 vnode_t *vp; 1632 1633 if ((vp = bp->b_vp) == NULL) 1634 return; 1635 1636 KASSERT(bp->b_objlock == vp->v_interlock); 1637 KASSERT(mutex_owned(bp->b_objlock)); 1638 1639 if (--vp->v_numoutput < 0) 1640 vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp); 1641 if (vp->v_numoutput == 0) 1642 cv_broadcast(&vp->v_cv); 1643 } 1644 1645 /* 1646 * Test a vnode for being or becoming dead. Returns one of: 1647 * EBUSY: vnode is becoming dead, with "flags == VDEAD_NOWAIT" only. 1648 * ENOENT: vnode is dead. 1649 * 0: otherwise. 1650 * 1651 * Whenever this function returns a non-zero value all future 1652 * calls will also return a non-zero value. 1653 */ 1654 int 1655 vdead_check(struct vnode *vp, int flags) 1656 { 1657 1658 KASSERT(mutex_owned(vp->v_interlock)); 1659 1660 if (! ISSET(flags, VDEAD_NOWAIT)) 1661 VSTATE_WAIT_STABLE(vp); 1662 1663 if (VSTATE_GET(vp) == VN_RECLAIMING) { 1664 KASSERT(ISSET(flags, VDEAD_NOWAIT)); 1665 return EBUSY; 1666 } else if (VSTATE_GET(vp) == VN_RECLAIMED) { 1667 return ENOENT; 1668 } 1669 1670 return 0; 1671 } 1672 1673 int 1674 vfs_drainvnodes(long target) 1675 { 1676 int error; 1677 1678 mutex_enter(&vnode_free_list_lock); 1679 1680 while (numvnodes > target) { 1681 error = cleanvnode(); 1682 if (error != 0) 1683 return error; 1684 mutex_enter(&vnode_free_list_lock); 1685 } 1686 1687 mutex_exit(&vnode_free_list_lock); 1688 1689 vcache_reinit(); 1690 1691 return 0; 1692 } 1693 1694 void 1695 vnpanic(vnode_t *vp, const char *fmt, ...) 1696 { 1697 va_list ap; 1698 1699 #ifdef DIAGNOSTIC 1700 vprint(NULL, vp); 1701 #endif 1702 va_start(ap, fmt); 1703 vpanic(fmt, ap); 1704 va_end(ap); 1705 } 1706