/*	$NetBSD: vfs_vnode.c,v 1.53 2016/07/07 06:55:43 msaitoh Exp $	*/

/*-
 * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * The vnode cache subsystem.
 *
 * Life-cycle
 *
 *	Normally, there are two points where new vnodes are created:
 *	VOP_CREATE(9) and VOP_LOOKUP(9).  The life-cycle of a vnode
 *	starts in one of the following ways:
 *
 *	- Allocation, via vcache_get(9) or vcache_new(9).
 *	- Reclamation of an inactive vnode, via vget(9).
 *
 *	Recycling from a free list, via getnewvnode(9) -> getcleanvnode(9),
 *	was another, traditional way.  Currently, only the draining thread
 *	recycles vnodes.  This behaviour might be revisited.
 *
 *	The life-cycle ends when the last reference is dropped, usually
 *	in VOP_REMOVE(9).  In that case, VOP_INACTIVE(9) is called to inform
 *	the file system that the vnode is becoming inactive.  Via this call,
 *	the file system indicates whether the vnode can be recycled (usually,
 *	it checks its own references, e.g. the link count, or whether the
 *	file was removed).
 *
 *	Depending on that indication, the vnode is either put onto a free
 *	list (the cache), or cleaned via vclean(9), which calls
 *	VOP_RECLAIM(9) to disassociate the underlying file system from the
 *	vnode, and finally destroyed.
 *
 * Vnode state
 *
 *	A vnode is always in one of six states:
 *	- MARKER	This is a marker vnode to help list traversal.  It
 *			will never change its state.
 *	- LOADING	Vnode is associating with the underlying file system
 *			and is not yet ready to use.
 *	- ACTIVE	Vnode has an associated underlying file system and is
 *			ready to use.
 *	- BLOCKED	Vnode is active but cannot get new references.
 *	- RECLAIMING	Vnode is disassociating from the underlying file
 *			system.
 *	- RECLAIMED	Vnode has disassociated from the underlying file
 *			system and is dead.
 *
 *	Valid state changes are:
 *	LOADING -> ACTIVE
 *			Vnode has been initialised in vcache_get() or
 *			vcache_new() and is ready to use.
 *	ACTIVE -> RECLAIMING
 *			Vnode starts disassociation from the underlying file
 *			system in vclean().
 *	RECLAIMING -> RECLAIMED
 *			Vnode has finished disassociation from the underlying
 *			file system in vclean().
 *	ACTIVE -> BLOCKED
 *			Either vcache_rekey*() is changing the vnode key or
 *			vrelel() is about to call VOP_INACTIVE().
 *	BLOCKED -> ACTIVE
 *			The block condition is over.
 *	LOADING -> RECLAIMED
 *			Either vcache_get() or vcache_new() failed to
 *			associate the underlying file system, or
 *			vcache_rekey*() drops a vnode used as a placeholder.
 *
 *	Of these states LOADING, BLOCKED and RECLAIMING are intermediate,
 *	and it is possible to wait for a state change.
 *
 *	The state is protected with v_interlock, with one exception:
 *	changing away from LOADING requires both v_interlock and vcache.lock
 *	to be held, so it is possible to check "state == LOADING" while
 *	holding only vcache.lock.  See vcache_get() for details.
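 *
 *	Illustrative sketch (not lifted from a particular caller): code
 *	holding v_interlock that needs to know whether a vnode is still
 *	usable can ask with vdead_check(), defined later in this file:
 *
 *		mutex_enter(vp->v_interlock);
 *		error = vdead_check(vp, VDEAD_NOWAIT);
 *		mutex_exit(vp->v_interlock);
 *		if (error != 0)
 *			...EBUSY means RECLAIMING, ENOENT means RECLAIMED...
 *
 *	Passing 0 instead of VDEAD_NOWAIT makes vdead_check() wait for the
 *	intermediate states above to settle before answering.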
 *
 * Reference counting
 *
 *	A vnode is considered active if its reference count
 *	(vnode_t::v_usecount) is non-zero.  The count is maintained with the
 *	vref(9), vrele(9) and vput(9) routines.  Common points holding
 *	references are e.g. open files, the current working directory,
 *	mount points, etc.
 *
 * Note on v_usecount and its locking
 *
 *	At nearly every point where it is known that v_usecount could be
 *	zero, vnode_t::v_interlock will be held.  To change v_usecount away
 *	from zero, the interlock must be held.  To change from a non-zero
 *	value to zero, again the interlock must be held.
 *
 *	Changing the usecount from one non-zero value to another non-zero
 *	value can safely be done using atomic operations, without the
 *	interlock held.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.53 2016/07/07 06:55:43 msaitoh Exp $");

#define _VFS_VNODE_PRIVATE

#include <sys/param.h>
#include <sys/kernel.h>

#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/hash.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/wapbl.h>
#include <sys/fstrans.h>

#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>

/* Flags to vrelel. */
#define	VRELEL_ASYNC_RELE	0x0001	/* Always defer to vrele thread. */

enum vcache_state {
	VN_MARKER,	/* Stable, used as marker. Will not change. */
	VN_LOADING,	/* Intermediate, initialising the fs node. */
	VN_ACTIVE,	/* Stable, valid fs node attached. */
	VN_BLOCKED,	/* Intermediate, active, no new references allowed. */
	VN_RECLAIMING,	/* Intermediate, detaching the fs node. */
	VN_RECLAIMED	/* Stable, no fs node attached. */
};
struct vcache_key {
	struct mount *vk_mount;
	const void *vk_key;
	size_t vk_key_len;
};
struct vcache_node {
	struct vnode vn_vnode;
	enum vcache_state vn_state;
	SLIST_ENTRY(vcache_node) vn_hash;
	struct vcache_key vn_key;
};

#define	VN_TO_VP(node)	((vnode_t *)(node))
#define	VP_TO_VN(vp)	((struct vcache_node *)(vp))

u_int	numvnodes		__cacheline_aligned;

/*
 * There are two free lists: one is for vnodes which have no buffer/page
 * references and one for those which do (i.e. v_holdcnt is non-zero).
 * The vnode recycling mechanism first attempts to reclaim vnodes from
 * the former list.
218 */ 219 static kmutex_t vnode_free_list_lock __cacheline_aligned; 220 static vnodelst_t vnode_free_list __cacheline_aligned; 221 static vnodelst_t vnode_hold_list __cacheline_aligned; 222 static kcondvar_t vdrain_cv __cacheline_aligned; 223 224 static vnodelst_t vrele_list __cacheline_aligned; 225 static kmutex_t vrele_lock __cacheline_aligned; 226 static kcondvar_t vrele_cv __cacheline_aligned; 227 static lwp_t * vrele_lwp __cacheline_aligned; 228 static int vrele_pending __cacheline_aligned; 229 static int vrele_gen __cacheline_aligned; 230 231 SLIST_HEAD(hashhead, vcache_node); 232 static struct { 233 kmutex_t lock; 234 kcondvar_t cv; 235 u_long hashmask; 236 struct hashhead *hashtab; 237 pool_cache_t pool; 238 } vcache __cacheline_aligned; 239 240 static int cleanvnode(void); 241 static struct vcache_node *vcache_alloc(void); 242 static void vcache_free(struct vcache_node *); 243 static void vcache_init(void); 244 static void vcache_reinit(void); 245 static void vclean(vnode_t *); 246 static void vrelel(vnode_t *, int); 247 static void vdrain_thread(void *); 248 static void vrele_thread(void *); 249 static void vnpanic(vnode_t *, const char *, ...) 250 __printflike(2, 3); 251 252 /* Routines having to do with the management of the vnode table. */ 253 extern struct mount *dead_rootmount; 254 extern int (**dead_vnodeop_p)(void *); 255 extern struct vfsops dead_vfsops; 256 257 /* Vnode state operations and diagnostics. */ 258 259 static const char * 260 vstate_name(enum vcache_state state) 261 { 262 263 switch (state) { 264 case VN_MARKER: 265 return "MARKER"; 266 case VN_LOADING: 267 return "LOADING"; 268 case VN_ACTIVE: 269 return "ACTIVE"; 270 case VN_BLOCKED: 271 return "BLOCKED"; 272 case VN_RECLAIMING: 273 return "RECLAIMING"; 274 case VN_RECLAIMED: 275 return "RECLAIMED"; 276 default: 277 return "ILLEGAL"; 278 } 279 } 280 281 #if defined(DIAGNOSTIC) 282 283 #define VSTATE_GET(vp) \ 284 vstate_assert_get((vp), __func__, __LINE__) 285 #define VSTATE_CHANGE(vp, from, to) \ 286 vstate_assert_change((vp), (from), (to), __func__, __LINE__) 287 #define VSTATE_WAIT_STABLE(vp) \ 288 vstate_assert_wait_stable((vp), __func__, __LINE__) 289 #define VSTATE_ASSERT(vp, state) \ 290 vstate_assert((vp), (state), __func__, __LINE__) 291 292 static void 293 vstate_assert(vnode_t *vp, enum vcache_state state, const char *func, int line) 294 { 295 struct vcache_node *node = VP_TO_VN(vp); 296 297 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line); 298 299 if (__predict_true(node->vn_state == state)) 300 return; 301 vnpanic(vp, "state is %s, expected %s at %s:%d", 302 vstate_name(node->vn_state), vstate_name(state), func, line); 303 } 304 305 static enum vcache_state 306 vstate_assert_get(vnode_t *vp, const char *func, int line) 307 { 308 struct vcache_node *node = VP_TO_VN(vp); 309 310 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line); 311 if (node->vn_state == VN_MARKER) 312 vnpanic(vp, "state is %s at %s:%d", 313 vstate_name(node->vn_state), func, line); 314 315 return node->vn_state; 316 } 317 318 static void 319 vstate_assert_wait_stable(vnode_t *vp, const char *func, int line) 320 { 321 struct vcache_node *node = VP_TO_VN(vp); 322 323 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line); 324 if (node->vn_state == VN_MARKER) 325 vnpanic(vp, "state is %s at %s:%d", 326 vstate_name(node->vn_state), func, line); 327 328 while (node->vn_state != VN_ACTIVE && node->vn_state != VN_RECLAIMED) 329 cv_wait(&vp->v_cv, vp->v_interlock); 330 331 if (node->vn_state == 
VN_MARKER) 332 vnpanic(vp, "state is %s at %s:%d", 333 vstate_name(node->vn_state), func, line); 334 } 335 336 static void 337 vstate_assert_change(vnode_t *vp, enum vcache_state from, enum vcache_state to, 338 const char *func, int line) 339 { 340 struct vcache_node *node = VP_TO_VN(vp); 341 342 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line); 343 if (from == VN_LOADING) 344 KASSERTMSG(mutex_owned(&vcache.lock), "at %s:%d", func, line); 345 346 if (from == VN_MARKER) 347 vnpanic(vp, "from is %s at %s:%d", 348 vstate_name(from), func, line); 349 if (to == VN_MARKER) 350 vnpanic(vp, "to is %s at %s:%d", 351 vstate_name(to), func, line); 352 if (node->vn_state != from) 353 vnpanic(vp, "from is %s, expected %s at %s:%d\n", 354 vstate_name(node->vn_state), vstate_name(from), func, line); 355 356 node->vn_state = to; 357 if (from == VN_LOADING) 358 cv_broadcast(&vcache.cv); 359 if (to == VN_ACTIVE || to == VN_RECLAIMED) 360 cv_broadcast(&vp->v_cv); 361 } 362 363 #else /* defined(DIAGNOSTIC) */ 364 365 #define VSTATE_GET(vp) \ 366 (VP_TO_VN((vp))->vn_state) 367 #define VSTATE_CHANGE(vp, from, to) \ 368 vstate_change((vp), (from), (to)) 369 #define VSTATE_WAIT_STABLE(vp) \ 370 vstate_wait_stable((vp)) 371 #define VSTATE_ASSERT(vp, state) 372 373 static void 374 vstate_wait_stable(vnode_t *vp) 375 { 376 struct vcache_node *node = VP_TO_VN(vp); 377 378 while (node->vn_state != VN_ACTIVE && node->vn_state != VN_RECLAIMED) 379 cv_wait(&vp->v_cv, vp->v_interlock); 380 } 381 382 static void 383 vstate_change(vnode_t *vp, enum vcache_state from, enum vcache_state to) 384 { 385 struct vcache_node *node = VP_TO_VN(vp); 386 387 node->vn_state = to; 388 if (from == VN_LOADING) 389 cv_broadcast(&vcache.cv); 390 if (to == VN_ACTIVE || to == VN_RECLAIMED) 391 cv_broadcast(&vp->v_cv); 392 } 393 394 #endif /* defined(DIAGNOSTIC) */ 395 396 void 397 vfs_vnode_sysinit(void) 398 { 399 int error __diagused; 400 401 dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL); 402 KASSERT(dead_rootmount != NULL); 403 dead_rootmount->mnt_iflag = IMNT_MPSAFE; 404 405 mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE); 406 TAILQ_INIT(&vnode_free_list); 407 TAILQ_INIT(&vnode_hold_list); 408 TAILQ_INIT(&vrele_list); 409 410 vcache_init(); 411 412 mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE); 413 cv_init(&vdrain_cv, "vdrain"); 414 cv_init(&vrele_cv, "vrele"); 415 error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread, 416 NULL, NULL, "vdrain"); 417 KASSERTMSG((error == 0), "kthread_create(vdrain) failed: %d", error); 418 error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread, 419 NULL, &vrele_lwp, "vrele"); 420 KASSERTMSG((error == 0), "kthread_create(vrele) failed: %d", error); 421 } 422 423 /* 424 * Allocate a new marker vnode. 425 */ 426 vnode_t * 427 vnalloc_marker(struct mount *mp) 428 { 429 struct vcache_node *node; 430 vnode_t *vp; 431 432 node = pool_cache_get(vcache.pool, PR_WAITOK); 433 memset(node, 0, sizeof(*node)); 434 vp = VN_TO_VP(node); 435 uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0); 436 vp->v_mount = mp; 437 vp->v_type = VBAD; 438 node->vn_state = VN_MARKER; 439 440 return vp; 441 } 442 443 /* 444 * Free a marker vnode. 445 */ 446 void 447 vnfree_marker(vnode_t *vp) 448 { 449 struct vcache_node *node; 450 451 node = VP_TO_VN(vp); 452 KASSERT(node->vn_state == VN_MARKER); 453 uvm_obj_destroy(&vp->v_uobj, true); 454 pool_cache_put(vcache.pool, node); 455 } 456 457 /* 458 * Test a vnode for being a marker vnode. 
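 *
 *	Marker vnodes exist so that code which iterates over a vnode list
 *	can remember its place while it drops the list lock.  A sketch of
 *	the intended pattern (illustrative only, with a hypothetical list):
 *
 *		vnode_t *mvp = vnalloc_marker(mp);
 *		...insert mvp after the current entry, unlock the list,
 *		   process the vnode, relock, continue from mvp and skip
 *		   any entry for which vnis_marker() is true...
 *		vnfree_marker(mvp);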
459 */ 460 bool 461 vnis_marker(vnode_t *vp) 462 { 463 464 return (VP_TO_VN(vp)->vn_state == VN_MARKER); 465 } 466 467 /* 468 * cleanvnode: grab a vnode from freelist, clean and free it. 469 * 470 * => Releases vnode_free_list_lock. 471 */ 472 static int 473 cleanvnode(void) 474 { 475 vnode_t *vp; 476 vnodelst_t *listhd; 477 struct mount *mp; 478 479 KASSERT(mutex_owned(&vnode_free_list_lock)); 480 481 listhd = &vnode_free_list; 482 try_nextlist: 483 TAILQ_FOREACH(vp, listhd, v_freelist) { 484 /* 485 * It's safe to test v_usecount and v_iflag 486 * without holding the interlock here, since 487 * these vnodes should never appear on the 488 * lists. 489 */ 490 KASSERT(vp->v_usecount == 0); 491 KASSERT(vp->v_freelisthd == listhd); 492 493 if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) 494 continue; 495 if (!mutex_tryenter(vp->v_interlock)) { 496 VOP_UNLOCK(vp); 497 continue; 498 } 499 mp = vp->v_mount; 500 if (fstrans_start_nowait(mp, FSTRANS_SHARED) != 0) { 501 mutex_exit(vp->v_interlock); 502 VOP_UNLOCK(vp); 503 continue; 504 } 505 break; 506 } 507 508 if (vp == NULL) { 509 if (listhd == &vnode_free_list) { 510 listhd = &vnode_hold_list; 511 goto try_nextlist; 512 } 513 mutex_exit(&vnode_free_list_lock); 514 return EBUSY; 515 } 516 517 /* Remove it from the freelist. */ 518 TAILQ_REMOVE(listhd, vp, v_freelist); 519 vp->v_freelisthd = NULL; 520 mutex_exit(&vnode_free_list_lock); 521 522 KASSERT(vp->v_usecount == 0); 523 524 /* 525 * The vnode is still associated with a file system, so we must 526 * clean it out before freeing it. We need to add a reference 527 * before doing this. 528 */ 529 vp->v_usecount = 1; 530 vclean(vp); 531 vrelel(vp, 0); 532 fstrans_done(mp); 533 534 return 0; 535 } 536 537 /* 538 * Helper thread to keep the number of vnodes below desiredvnodes. 539 */ 540 static void 541 vdrain_thread(void *cookie) 542 { 543 int error; 544 545 mutex_enter(&vnode_free_list_lock); 546 547 for (;;) { 548 cv_timedwait(&vdrain_cv, &vnode_free_list_lock, hz); 549 while (numvnodes > desiredvnodes) { 550 error = cleanvnode(); 551 if (error) 552 kpause("vndsbusy", false, hz, NULL); 553 mutex_enter(&vnode_free_list_lock); 554 if (error) 555 break; 556 } 557 } 558 } 559 560 /* 561 * Remove a vnode from its freelist. 562 */ 563 void 564 vremfree(vnode_t *vp) 565 { 566 567 KASSERT(mutex_owned(vp->v_interlock)); 568 KASSERT(vp->v_usecount == 0); 569 570 /* 571 * Note that the reference count must not change until 572 * the vnode is removed. 573 */ 574 mutex_enter(&vnode_free_list_lock); 575 if (vp->v_holdcnt > 0) { 576 KASSERT(vp->v_freelisthd == &vnode_hold_list); 577 } else { 578 KASSERT(vp->v_freelisthd == &vnode_free_list); 579 } 580 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist); 581 vp->v_freelisthd = NULL; 582 mutex_exit(&vnode_free_list_lock); 583 } 584 585 /* 586 * vget: get a particular vnode from the free list, increment its reference 587 * count and return it. 588 * 589 * => Must be called with v_interlock held. 590 * 591 * If state is VN_RECLAIMING, the vnode may be eliminated in vgone()/vclean(). 592 * In that case, we cannot grab the vnode, so the process is awakened when 593 * the transition is completed, and an error returned to indicate that the 594 * vnode is no longer usable. 595 * 596 * If state is VN_LOADING or VN_BLOCKED, wait until the vnode enters a 597 * stable state (VN_ACTIVE or VN_RECLAIMED). 
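 *
 * A minimal usage sketch (not taken from a specific caller):
 *
 *	mutex_enter(vp->v_interlock);
 *	error = vget(vp, 0, true);
 *	if (error != 0)
 *		return error;	...vnode was reclaimed...
 *	...use the referenced vnode...
 *	vrele(vp);
 *
 * vget() returns with v_interlock released in all cases; on failure it
 * also drops the reference it took, so the caller must not vrele().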
598 */ 599 int 600 vget(vnode_t *vp, int flags, bool waitok) 601 { 602 603 KASSERT(mutex_owned(vp->v_interlock)); 604 KASSERT((flags & ~LK_NOWAIT) == 0); 605 KASSERT(waitok == ((flags & LK_NOWAIT) == 0)); 606 607 /* 608 * Before adding a reference, we must remove the vnode 609 * from its freelist. 610 */ 611 if (vp->v_usecount == 0) { 612 vremfree(vp); 613 vp->v_usecount = 1; 614 } else { 615 atomic_inc_uint(&vp->v_usecount); 616 } 617 618 /* 619 * If the vnode is in the process of changing state we wait 620 * for the change to complete and take care not to return 621 * a clean vnode. 622 */ 623 if (! ISSET(flags, LK_NOWAIT)) 624 VSTATE_WAIT_STABLE(vp); 625 if (VSTATE_GET(vp) == VN_RECLAIMED) { 626 vrelel(vp, 0); 627 return ENOENT; 628 } else if (VSTATE_GET(vp) != VN_ACTIVE) { 629 KASSERT(ISSET(flags, LK_NOWAIT)); 630 vrelel(vp, 0); 631 return EBUSY; 632 } 633 634 /* 635 * Ok, we got it in good shape. 636 */ 637 VSTATE_ASSERT(vp, VN_ACTIVE); 638 mutex_exit(vp->v_interlock); 639 640 return 0; 641 } 642 643 /* 644 * vput: unlock and release the reference. 645 */ 646 void 647 vput(vnode_t *vp) 648 { 649 650 VOP_UNLOCK(vp); 651 vrele(vp); 652 } 653 654 /* 655 * Try to drop reference on a vnode. Abort if we are releasing the 656 * last reference. Note: this _must_ succeed if not the last reference. 657 */ 658 static inline bool 659 vtryrele(vnode_t *vp) 660 { 661 u_int use, next; 662 663 for (use = vp->v_usecount;; use = next) { 664 if (use == 1) { 665 return false; 666 } 667 KASSERT(use > 1); 668 next = atomic_cas_uint(&vp->v_usecount, use, use - 1); 669 if (__predict_true(next == use)) { 670 return true; 671 } 672 } 673 } 674 675 /* 676 * Vnode release. If reference count drops to zero, call inactive 677 * routine and either return to freelist or free to the pool. 678 */ 679 static void 680 vrelel(vnode_t *vp, int flags) 681 { 682 bool recycle, defer; 683 int error; 684 685 KASSERT(mutex_owned(vp->v_interlock)); 686 KASSERT(vp->v_freelisthd == NULL); 687 688 if (__predict_false(vp->v_op == dead_vnodeop_p && 689 VSTATE_GET(vp) != VN_RECLAIMED)) { 690 vnpanic(vp, "dead but not clean"); 691 } 692 693 /* 694 * If not the last reference, just drop the reference count 695 * and unlock. 696 */ 697 if (vtryrele(vp)) { 698 mutex_exit(vp->v_interlock); 699 return; 700 } 701 if (vp->v_usecount <= 0 || vp->v_writecount != 0) { 702 vnpanic(vp, "%s: bad ref count", __func__); 703 } 704 705 #ifdef DIAGNOSTIC 706 if ((vp->v_type == VBLK || vp->v_type == VCHR) && 707 vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) { 708 vprint("vrelel: missing VOP_CLOSE()", vp); 709 } 710 #endif 711 712 /* 713 * If not clean, deactivate the vnode, but preserve 714 * our reference across the call to VOP_INACTIVE(). 715 */ 716 if (VSTATE_GET(vp) != VN_RECLAIMED) { 717 recycle = false; 718 719 /* 720 * XXX This ugly block can be largely eliminated if 721 * locking is pushed down into the file systems. 722 * 723 * Defer vnode release to vrele_thread if caller 724 * requests it explicitly or is the pagedaemon. 725 */ 726 if ((curlwp == uvm.pagedaemon_lwp) || 727 (flags & VRELEL_ASYNC_RELE) != 0) { 728 defer = true; 729 } else if (curlwp == vrele_lwp) { 730 /* 731 * We have to try harder. 732 */ 733 mutex_exit(vp->v_interlock); 734 error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 735 KASSERTMSG((error == 0), "vn_lock failed: %d", error); 736 mutex_enter(vp->v_interlock); 737 defer = false; 738 } else { 739 /* If we can't acquire the lock, then defer. 
*/ 740 mutex_exit(vp->v_interlock); 741 error = vn_lock(vp, 742 LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT); 743 defer = (error != 0); 744 mutex_enter(vp->v_interlock); 745 } 746 747 KASSERT(mutex_owned(vp->v_interlock)); 748 KASSERT(! (curlwp == vrele_lwp && defer)); 749 750 if (defer) { 751 /* 752 * Defer reclaim to the kthread; it's not safe to 753 * clean it here. We donate it our last reference. 754 */ 755 mutex_enter(&vrele_lock); 756 TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist); 757 if (++vrele_pending > (desiredvnodes >> 8)) 758 cv_signal(&vrele_cv); 759 mutex_exit(&vrele_lock); 760 mutex_exit(vp->v_interlock); 761 return; 762 } 763 764 /* 765 * If the node got another reference while we 766 * released the interlock, don't try to inactivate it yet. 767 */ 768 if (__predict_false(vtryrele(vp))) { 769 VOP_UNLOCK(vp); 770 mutex_exit(vp->v_interlock); 771 return; 772 } 773 VSTATE_CHANGE(vp, VN_ACTIVE, VN_BLOCKED); 774 mutex_exit(vp->v_interlock); 775 776 /* 777 * The vnode must not gain another reference while being 778 * deactivated. If VOP_INACTIVE() indicates that 779 * the described file has been deleted, then recycle 780 * the vnode. 781 * 782 * Note that VOP_INACTIVE() will drop the vnode lock. 783 */ 784 VOP_INACTIVE(vp, &recycle); 785 if (recycle) { 786 /* vclean() below will drop the lock. */ 787 if (vn_lock(vp, LK_EXCLUSIVE) != 0) 788 recycle = false; 789 } 790 mutex_enter(vp->v_interlock); 791 VSTATE_CHANGE(vp, VN_BLOCKED, VN_ACTIVE); 792 if (!recycle) { 793 if (vtryrele(vp)) { 794 mutex_exit(vp->v_interlock); 795 return; 796 } 797 } 798 799 /* Take care of space accounting. */ 800 if (vp->v_iflag & VI_EXECMAP) { 801 atomic_add_int(&uvmexp.execpages, 802 -vp->v_uobj.uo_npages); 803 atomic_add_int(&uvmexp.filepages, 804 vp->v_uobj.uo_npages); 805 } 806 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP); 807 vp->v_vflag &= ~VV_MAPPED; 808 809 /* 810 * Recycle the vnode if the file is now unused (unlinked), 811 * otherwise just free it. 812 */ 813 if (recycle) { 814 VSTATE_ASSERT(vp, VN_ACTIVE); 815 vclean(vp); 816 } 817 KASSERT(vp->v_usecount > 0); 818 } 819 820 if (atomic_dec_uint_nv(&vp->v_usecount) != 0) { 821 /* Gained another reference while being reclaimed. */ 822 mutex_exit(vp->v_interlock); 823 return; 824 } 825 826 if (VSTATE_GET(vp) == VN_RECLAIMED) { 827 /* 828 * It's clean so destroy it. It isn't referenced 829 * anywhere since it has been reclaimed. 830 */ 831 KASSERT(vp->v_holdcnt == 0); 832 KASSERT(vp->v_writecount == 0); 833 mutex_exit(vp->v_interlock); 834 vfs_insmntque(vp, NULL); 835 if (vp->v_type == VBLK || vp->v_type == VCHR) { 836 spec_node_destroy(vp); 837 } 838 vcache_free(VP_TO_VN(vp)); 839 } else { 840 /* 841 * Otherwise, put it back onto the freelist. It 842 * can't be destroyed while still associated with 843 * a file system. 844 */ 845 mutex_enter(&vnode_free_list_lock); 846 if (vp->v_holdcnt > 0) { 847 vp->v_freelisthd = &vnode_hold_list; 848 } else { 849 vp->v_freelisthd = &vnode_free_list; 850 } 851 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist); 852 mutex_exit(&vnode_free_list_lock); 853 mutex_exit(vp->v_interlock); 854 } 855 } 856 857 void 858 vrele(vnode_t *vp) 859 { 860 861 if (vtryrele(vp)) { 862 return; 863 } 864 mutex_enter(vp->v_interlock); 865 vrelel(vp, 0); 866 } 867 868 /* 869 * Asynchronous vnode release, vnode is released in different context. 
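 *
 * Illustratively, a caller that must not risk sleeping in
 * VOP_INACTIVE()/VOP_RECLAIM(), e.g. because of the locks it holds,
 * can hand its reference to the vrele thread with vrele_async(vp)
 * instead of calling vrele(vp) directly.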
870 */ 871 void 872 vrele_async(vnode_t *vp) 873 { 874 875 if (vtryrele(vp)) { 876 return; 877 } 878 mutex_enter(vp->v_interlock); 879 vrelel(vp, VRELEL_ASYNC_RELE); 880 } 881 882 static void 883 vrele_thread(void *cookie) 884 { 885 vnodelst_t skip_list; 886 vnode_t *vp; 887 struct mount *mp; 888 889 TAILQ_INIT(&skip_list); 890 891 mutex_enter(&vrele_lock); 892 for (;;) { 893 while (TAILQ_EMPTY(&vrele_list)) { 894 vrele_gen++; 895 cv_broadcast(&vrele_cv); 896 cv_timedwait(&vrele_cv, &vrele_lock, hz); 897 TAILQ_CONCAT(&vrele_list, &skip_list, v_freelist); 898 } 899 vp = TAILQ_FIRST(&vrele_list); 900 mp = vp->v_mount; 901 TAILQ_REMOVE(&vrele_list, vp, v_freelist); 902 if (fstrans_start_nowait(mp, FSTRANS_LAZY) != 0) { 903 TAILQ_INSERT_TAIL(&skip_list, vp, v_freelist); 904 continue; 905 } 906 vrele_pending--; 907 mutex_exit(&vrele_lock); 908 909 /* 910 * If not the last reference, then ignore the vnode 911 * and look for more work. 912 */ 913 mutex_enter(vp->v_interlock); 914 vrelel(vp, 0); 915 fstrans_done(mp); 916 mutex_enter(&vrele_lock); 917 } 918 } 919 920 void 921 vrele_flush(void) 922 { 923 int gen; 924 925 mutex_enter(&vrele_lock); 926 gen = vrele_gen; 927 while (vrele_pending && gen == vrele_gen) { 928 cv_broadcast(&vrele_cv); 929 cv_wait(&vrele_cv, &vrele_lock); 930 } 931 mutex_exit(&vrele_lock); 932 } 933 934 /* 935 * Vnode reference, where a reference is already held by some other 936 * object (for example, a file structure). 937 */ 938 void 939 vref(vnode_t *vp) 940 { 941 942 KASSERT(vp->v_usecount != 0); 943 944 atomic_inc_uint(&vp->v_usecount); 945 } 946 947 /* 948 * Page or buffer structure gets a reference. 949 * Called with v_interlock held. 950 */ 951 void 952 vholdl(vnode_t *vp) 953 { 954 955 KASSERT(mutex_owned(vp->v_interlock)); 956 957 if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) { 958 mutex_enter(&vnode_free_list_lock); 959 KASSERT(vp->v_freelisthd == &vnode_free_list); 960 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist); 961 vp->v_freelisthd = &vnode_hold_list; 962 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist); 963 mutex_exit(&vnode_free_list_lock); 964 } 965 } 966 967 /* 968 * Page or buffer structure frees a reference. 969 * Called with v_interlock held. 970 */ 971 void 972 holdrelel(vnode_t *vp) 973 { 974 975 KASSERT(mutex_owned(vp->v_interlock)); 976 977 if (vp->v_holdcnt <= 0) { 978 vnpanic(vp, "%s: holdcnt vp %p", __func__, vp); 979 } 980 981 vp->v_holdcnt--; 982 if (vp->v_holdcnt == 0 && vp->v_usecount == 0) { 983 mutex_enter(&vnode_free_list_lock); 984 KASSERT(vp->v_freelisthd == &vnode_hold_list); 985 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist); 986 vp->v_freelisthd = &vnode_free_list; 987 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist); 988 mutex_exit(&vnode_free_list_lock); 989 } 990 } 991 992 /* 993 * Disassociate the underlying file system from a vnode. 994 * 995 * Must be called with vnode locked and will return unlocked. 996 * Must be called with the interlock held, and will return with it held. 997 */ 998 static void 999 vclean(vnode_t *vp) 1000 { 1001 lwp_t *l = curlwp; 1002 bool recycle, active; 1003 int error; 1004 1005 KASSERT((vp->v_vflag & VV_LOCKSWORK) == 0 || 1006 VOP_ISLOCKED(vp) == LK_EXCLUSIVE); 1007 KASSERT(mutex_owned(vp->v_interlock)); 1008 KASSERT(vp->v_usecount != 0); 1009 1010 active = (vp->v_usecount > 1); 1011 /* 1012 * Prevent the vnode from being recycled or brought into use 1013 * while we clean it out. 
1014 */ 1015 VSTATE_CHANGE(vp, VN_ACTIVE, VN_RECLAIMING); 1016 if (vp->v_iflag & VI_EXECMAP) { 1017 atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages); 1018 atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages); 1019 } 1020 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP); 1021 mutex_exit(vp->v_interlock); 1022 1023 /* 1024 * Clean out any cached data associated with the vnode. 1025 * If purging an active vnode, it must be closed and 1026 * deactivated before being reclaimed. Note that the 1027 * VOP_INACTIVE will unlock the vnode. 1028 */ 1029 error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0); 1030 if (error != 0) { 1031 if (wapbl_vphaswapbl(vp)) 1032 WAPBL_DISCARD(wapbl_vptomp(vp)); 1033 error = vinvalbuf(vp, 0, NOCRED, l, 0, 0); 1034 } 1035 KASSERTMSG((error == 0), "vinvalbuf failed: %d", error); 1036 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); 1037 if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) { 1038 spec_node_revoke(vp); 1039 } 1040 if (active) { 1041 VOP_INACTIVE(vp, &recycle); 1042 } else { 1043 /* 1044 * Any other processes trying to obtain this lock must first 1045 * wait for VN_RECLAIMED, then call the new lock operation. 1046 */ 1047 VOP_UNLOCK(vp); 1048 } 1049 1050 /* Disassociate the underlying file system from the vnode. */ 1051 if (VOP_RECLAIM(vp)) { 1052 vnpanic(vp, "%s: cannot reclaim", __func__); 1053 } 1054 1055 KASSERT(vp->v_data == NULL); 1056 KASSERT(vp->v_uobj.uo_npages == 0); 1057 1058 if (vp->v_type == VREG && vp->v_ractx != NULL) { 1059 uvm_ra_freectx(vp->v_ractx); 1060 vp->v_ractx = NULL; 1061 } 1062 1063 /* Purge name cache. */ 1064 cache_purge(vp); 1065 1066 /* Move to dead mount. */ 1067 vp->v_vflag &= ~VV_ROOT; 1068 atomic_inc_uint(&dead_rootmount->mnt_refcnt); 1069 vfs_insmntque(vp, dead_rootmount); 1070 1071 /* Done with purge, notify sleepers of the grim news. */ 1072 mutex_enter(vp->v_interlock); 1073 vp->v_op = dead_vnodeop_p; 1074 vp->v_vflag |= VV_LOCKSWORK; 1075 VSTATE_CHANGE(vp, VN_RECLAIMING, VN_RECLAIMED); 1076 vp->v_tag = VT_NON; 1077 KNOTE(&vp->v_klist, NOTE_REVOKE); 1078 1079 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); 1080 } 1081 1082 /* 1083 * Recycle an unused vnode if caller holds the last reference. 1084 */ 1085 bool 1086 vrecycle(vnode_t *vp) 1087 { 1088 1089 if (vn_lock(vp, LK_EXCLUSIVE) != 0) 1090 return false; 1091 1092 mutex_enter(vp->v_interlock); 1093 1094 if (vp->v_usecount != 1) { 1095 mutex_exit(vp->v_interlock); 1096 VOP_UNLOCK(vp); 1097 return false; 1098 } 1099 vclean(vp); 1100 vrelel(vp, 0); 1101 return true; 1102 } 1103 1104 /* 1105 * Eliminate all activity associated with the requested vnode 1106 * and with all vnodes aliased to the requested vnode. 1107 */ 1108 void 1109 vrevoke(vnode_t *vp) 1110 { 1111 vnode_t *vq; 1112 enum vtype type; 1113 dev_t dev; 1114 1115 KASSERT(vp->v_usecount > 0); 1116 1117 mutex_enter(vp->v_interlock); 1118 VSTATE_WAIT_STABLE(vp); 1119 if (VSTATE_GET(vp) == VN_RECLAIMED) { 1120 mutex_exit(vp->v_interlock); 1121 return; 1122 } else if (vp->v_type != VBLK && vp->v_type != VCHR) { 1123 atomic_inc_uint(&vp->v_usecount); 1124 mutex_exit(vp->v_interlock); 1125 vgone(vp); 1126 return; 1127 } else { 1128 dev = vp->v_rdev; 1129 type = vp->v_type; 1130 mutex_exit(vp->v_interlock); 1131 } 1132 1133 while (spec_node_lookup_by_dev(type, dev, &vq) == 0) { 1134 vgone(vq); 1135 } 1136 } 1137 1138 /* 1139 * Eliminate all activity associated with a vnode in preparation for 1140 * reuse. Drops a reference from the vnode. 
 */
void
vgone(vnode_t *vp)
{

	if (vn_lock(vp, LK_EXCLUSIVE) != 0) {
		/* Already reclaimed: just drop the reference and return. */
		VSTATE_ASSERT(vp, VN_RECLAIMED);
		vrele(vp);
		return;
	}

	mutex_enter(vp->v_interlock);
	vclean(vp);
	vrelel(vp, 0);
}

static inline uint32_t
vcache_hash(const struct vcache_key *key)
{
	uint32_t hash = HASH32_BUF_INIT;

	hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash);
	hash = hash32_buf(key->vk_key, key->vk_key_len, hash);
	return hash;
}

static void
vcache_init(void)
{

	vcache.pool = pool_cache_init(sizeof(struct vcache_node), 0, 0, 0,
	    "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL);
	KASSERT(vcache.pool != NULL);
	mutex_init(&vcache.lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&vcache.cv, "vcache");
	vcache.hashtab = hashinit(desiredvnodes, HASH_SLIST, true,
	    &vcache.hashmask);
}

static void
vcache_reinit(void)
{
	int i;
	uint32_t hash;
	u_long oldmask, newmask;
	struct hashhead *oldtab, *newtab;
	struct vcache_node *node;

	newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask);
	mutex_enter(&vcache.lock);
	oldtab = vcache.hashtab;
	oldmask = vcache.hashmask;
	vcache.hashtab = newtab;
	vcache.hashmask = newmask;
	for (i = 0; i <= oldmask; i++) {
		while ((node = SLIST_FIRST(&oldtab[i])) != NULL) {
			SLIST_REMOVE(&oldtab[i], node, vcache_node, vn_hash);
			hash = vcache_hash(&node->vn_key);
			SLIST_INSERT_HEAD(&newtab[hash & vcache.hashmask],
			    node, vn_hash);
		}
	}
	mutex_exit(&vcache.lock);
	hashdone(oldtab, HASH_SLIST, oldmask);
}

static inline struct vcache_node *
vcache_hash_lookup(const struct vcache_key *key, uint32_t hash)
{
	struct hashhead *hashp;
	struct vcache_node *node;

	KASSERT(mutex_owned(&vcache.lock));

	hashp = &vcache.hashtab[hash & vcache.hashmask];
	SLIST_FOREACH(node, hashp, vn_hash) {
		if (key->vk_mount != node->vn_key.vk_mount)
			continue;
		if (key->vk_key_len != node->vn_key.vk_key_len)
			continue;
		if (memcmp(key->vk_key, node->vn_key.vk_key, key->vk_key_len))
			continue;
		return node;
	}
	return NULL;
}

/*
 * Allocate a new, uninitialized vcache node.
 */
static struct vcache_node *
vcache_alloc(void)
{
	struct vcache_node *node;
	vnode_t *vp;

	node = pool_cache_get(vcache.pool, PR_WAITOK);
	memset(node, 0, sizeof(*node));

	/* SLIST_INIT(&node->vn_hash); */

	vp = VN_TO_VP(node);
	uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
	cv_init(&vp->v_cv, "vnode");
	/* LIST_INIT(&vp->v_nclist); */
	/* LIST_INIT(&vp->v_dnclist); */

	mutex_enter(&vnode_free_list_lock);
	numvnodes++;
	if (numvnodes > desiredvnodes + desiredvnodes / 10)
		cv_signal(&vdrain_cv);
	mutex_exit(&vnode_free_list_lock);

	rw_init(&vp->v_lock);
	vp->v_usecount = 1;
	vp->v_type = VNON;
	vp->v_size = vp->v_writesize = VSIZENOTSET;

	node->vn_state = VN_LOADING;

	return node;
}

/*
 * Free an unused, unreferenced vcache node.
1265 */ 1266 static void 1267 vcache_free(struct vcache_node *node) 1268 { 1269 vnode_t *vp; 1270 1271 vp = VN_TO_VP(node); 1272 1273 KASSERT(vp->v_usecount == 0); 1274 1275 rw_destroy(&vp->v_lock); 1276 mutex_enter(&vnode_free_list_lock); 1277 numvnodes--; 1278 mutex_exit(&vnode_free_list_lock); 1279 1280 uvm_obj_destroy(&vp->v_uobj, true); 1281 cv_destroy(&vp->v_cv); 1282 pool_cache_put(vcache.pool, node); 1283 } 1284 1285 /* 1286 * Get a vnode / fs node pair by key and return it referenced through vpp. 1287 */ 1288 int 1289 vcache_get(struct mount *mp, const void *key, size_t key_len, 1290 struct vnode **vpp) 1291 { 1292 int error; 1293 uint32_t hash; 1294 const void *new_key; 1295 struct vnode *vp; 1296 struct vcache_key vcache_key; 1297 struct vcache_node *node, *new_node; 1298 1299 new_key = NULL; 1300 *vpp = NULL; 1301 1302 vcache_key.vk_mount = mp; 1303 vcache_key.vk_key = key; 1304 vcache_key.vk_key_len = key_len; 1305 hash = vcache_hash(&vcache_key); 1306 1307 again: 1308 mutex_enter(&vcache.lock); 1309 node = vcache_hash_lookup(&vcache_key, hash); 1310 1311 /* If found, take a reference or retry. */ 1312 if (__predict_true(node != NULL)) { 1313 /* 1314 * If the vnode is loading we cannot take the v_interlock 1315 * here as it might change during load (see uvm_obj_setlock()). 1316 * As changing state from VN_LOADING requires both vcache.lock 1317 * and v_interlock it is safe to test with vcache.lock held. 1318 * 1319 * Wait for vnodes changing state from VN_LOADING and retry. 1320 */ 1321 if (__predict_false(node->vn_state == VN_LOADING)) { 1322 cv_wait(&vcache.cv, &vcache.lock); 1323 mutex_exit(&vcache.lock); 1324 goto again; 1325 } 1326 vp = VN_TO_VP(node); 1327 mutex_enter(vp->v_interlock); 1328 mutex_exit(&vcache.lock); 1329 error = vget(vp, 0, true /* wait */); 1330 if (error == ENOENT) 1331 goto again; 1332 if (error == 0) 1333 *vpp = vp; 1334 KASSERT((error != 0) == (*vpp == NULL)); 1335 return error; 1336 } 1337 mutex_exit(&vcache.lock); 1338 1339 /* Allocate and initialize a new vcache / vnode pair. */ 1340 error = vfs_busy(mp, NULL); 1341 if (error) 1342 return error; 1343 new_node = vcache_alloc(); 1344 new_node->vn_key = vcache_key; 1345 vp = VN_TO_VP(new_node); 1346 mutex_enter(&vcache.lock); 1347 node = vcache_hash_lookup(&vcache_key, hash); 1348 if (node == NULL) { 1349 SLIST_INSERT_HEAD(&vcache.hashtab[hash & vcache.hashmask], 1350 new_node, vn_hash); 1351 node = new_node; 1352 } 1353 1354 /* If another thread beat us inserting this node, retry. */ 1355 if (node != new_node) { 1356 mutex_enter(vp->v_interlock); 1357 VSTATE_CHANGE(vp, VN_LOADING, VN_RECLAIMED); 1358 mutex_exit(&vcache.lock); 1359 vrelel(vp, 0); 1360 vfs_unbusy(mp, false, NULL); 1361 goto again; 1362 } 1363 mutex_exit(&vcache.lock); 1364 1365 /* Load the fs node. Exclusive as new_node is VN_LOADING. 
*/ 1366 error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key); 1367 if (error) { 1368 mutex_enter(&vcache.lock); 1369 SLIST_REMOVE(&vcache.hashtab[hash & vcache.hashmask], 1370 new_node, vcache_node, vn_hash); 1371 mutex_enter(vp->v_interlock); 1372 VSTATE_CHANGE(vp, VN_LOADING, VN_RECLAIMED); 1373 mutex_exit(&vcache.lock); 1374 vrelel(vp, 0); 1375 vfs_unbusy(mp, false, NULL); 1376 KASSERT(*vpp == NULL); 1377 return error; 1378 } 1379 KASSERT(new_key != NULL); 1380 KASSERT(memcmp(key, new_key, key_len) == 0); 1381 KASSERT(vp->v_op != NULL); 1382 vfs_insmntque(vp, mp); 1383 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0) 1384 vp->v_vflag |= VV_MPSAFE; 1385 vfs_unbusy(mp, true, NULL); 1386 1387 /* Finished loading, finalize node. */ 1388 mutex_enter(&vcache.lock); 1389 new_node->vn_key.vk_key = new_key; 1390 mutex_enter(vp->v_interlock); 1391 VSTATE_CHANGE(vp, VN_LOADING, VN_ACTIVE); 1392 mutex_exit(vp->v_interlock); 1393 mutex_exit(&vcache.lock); 1394 *vpp = vp; 1395 return 0; 1396 } 1397 1398 /* 1399 * Create a new vnode / fs node pair and return it referenced through vpp. 1400 */ 1401 int 1402 vcache_new(struct mount *mp, struct vnode *dvp, struct vattr *vap, 1403 kauth_cred_t cred, struct vnode **vpp) 1404 { 1405 int error; 1406 uint32_t hash; 1407 struct vnode *ovp, *vp; 1408 struct vcache_node *new_node; 1409 struct vcache_node *old_node __diagused; 1410 1411 *vpp = NULL; 1412 1413 /* Allocate and initialize a new vcache / vnode pair. */ 1414 error = vfs_busy(mp, NULL); 1415 if (error) 1416 return error; 1417 new_node = vcache_alloc(); 1418 new_node->vn_key.vk_mount = mp; 1419 vp = VN_TO_VP(new_node); 1420 1421 /* Create and load the fs node. */ 1422 error = VFS_NEWVNODE(mp, dvp, vp, vap, cred, 1423 &new_node->vn_key.vk_key_len, &new_node->vn_key.vk_key); 1424 if (error) { 1425 mutex_enter(&vcache.lock); 1426 mutex_enter(vp->v_interlock); 1427 VSTATE_CHANGE(vp, VN_LOADING, VN_RECLAIMED); 1428 mutex_exit(&vcache.lock); 1429 vrelel(vp, 0); 1430 vfs_unbusy(mp, false, NULL); 1431 KASSERT(*vpp == NULL); 1432 return error; 1433 } 1434 KASSERT(new_node->vn_key.vk_key != NULL); 1435 KASSERT(vp->v_op != NULL); 1436 hash = vcache_hash(&new_node->vn_key); 1437 1438 /* Wait for previous instance to be reclaimed, then insert new node. */ 1439 mutex_enter(&vcache.lock); 1440 while ((old_node = vcache_hash_lookup(&new_node->vn_key, hash))) { 1441 ovp = VN_TO_VP(old_node); 1442 mutex_enter(ovp->v_interlock); 1443 mutex_exit(&vcache.lock); 1444 error = vget(ovp, 0, true /* wait */); 1445 KASSERT(error == ENOENT); 1446 mutex_enter(&vcache.lock); 1447 } 1448 SLIST_INSERT_HEAD(&vcache.hashtab[hash & vcache.hashmask], 1449 new_node, vn_hash); 1450 mutex_exit(&vcache.lock); 1451 vfs_insmntque(vp, mp); 1452 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0) 1453 vp->v_vflag |= VV_MPSAFE; 1454 vfs_unbusy(mp, true, NULL); 1455 1456 /* Finished loading, finalize node. */ 1457 mutex_enter(&vcache.lock); 1458 mutex_enter(vp->v_interlock); 1459 VSTATE_CHANGE(vp, VN_LOADING, VN_ACTIVE); 1460 mutex_exit(&vcache.lock); 1461 mutex_exit(vp->v_interlock); 1462 *vpp = vp; 1463 return 0; 1464 } 1465 1466 /* 1467 * Prepare key change: lock old and new cache node. 1468 * Return an error if the new node already exists. 
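 *
 * Expected call sequence, as a sketch ("old_ino"/"new_ino" stand for
 * hypothetical file system keys):
 *
 *	error = vcache_rekey_enter(mp, vp, &old_ino, sizeof(old_ino),
 *	    &new_ino, sizeof(new_ino));
 *	if (error)
 *		...EEXIST: the new key is already cached...
 *	...update the file system's own notion of the key...
 *	vcache_rekey_exit(mp, vp, &old_ino, sizeof(old_ino),
 *	    &new_ino, sizeof(new_ino));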
1469 */ 1470 int 1471 vcache_rekey_enter(struct mount *mp, struct vnode *vp, 1472 const void *old_key, size_t old_key_len, 1473 const void *new_key, size_t new_key_len) 1474 { 1475 uint32_t old_hash, new_hash; 1476 struct vcache_key old_vcache_key, new_vcache_key; 1477 struct vcache_node *node, *new_node; 1478 struct vnode *tvp; 1479 1480 old_vcache_key.vk_mount = mp; 1481 old_vcache_key.vk_key = old_key; 1482 old_vcache_key.vk_key_len = old_key_len; 1483 old_hash = vcache_hash(&old_vcache_key); 1484 1485 new_vcache_key.vk_mount = mp; 1486 new_vcache_key.vk_key = new_key; 1487 new_vcache_key.vk_key_len = new_key_len; 1488 new_hash = vcache_hash(&new_vcache_key); 1489 1490 new_node = vcache_alloc(); 1491 new_node->vn_key = new_vcache_key; 1492 tvp = VN_TO_VP(new_node); 1493 1494 /* Insert locked new node used as placeholder. */ 1495 mutex_enter(&vcache.lock); 1496 node = vcache_hash_lookup(&new_vcache_key, new_hash); 1497 if (node != NULL) { 1498 mutex_enter(tvp->v_interlock); 1499 VSTATE_CHANGE(tvp, VN_LOADING, VN_RECLAIMED); 1500 mutex_exit(&vcache.lock); 1501 vrelel(tvp, 0); 1502 return EEXIST; 1503 } 1504 SLIST_INSERT_HEAD(&vcache.hashtab[new_hash & vcache.hashmask], 1505 new_node, vn_hash); 1506 1507 /* Lock old node. */ 1508 node = vcache_hash_lookup(&old_vcache_key, old_hash); 1509 KASSERT(node != NULL); 1510 KASSERT(VN_TO_VP(node) == vp); 1511 mutex_enter(vp->v_interlock); 1512 VSTATE_CHANGE(vp, VN_ACTIVE, VN_BLOCKED); 1513 node->vn_key = old_vcache_key; 1514 mutex_exit(vp->v_interlock); 1515 mutex_exit(&vcache.lock); 1516 return 0; 1517 } 1518 1519 /* 1520 * Key change complete: remove old node and unlock new node. 1521 */ 1522 void 1523 vcache_rekey_exit(struct mount *mp, struct vnode *vp, 1524 const void *old_key, size_t old_key_len, 1525 const void *new_key, size_t new_key_len) 1526 { 1527 uint32_t old_hash, new_hash; 1528 struct vcache_key old_vcache_key, new_vcache_key; 1529 struct vcache_node *old_node, *new_node; 1530 struct vnode *tvp; 1531 1532 old_vcache_key.vk_mount = mp; 1533 old_vcache_key.vk_key = old_key; 1534 old_vcache_key.vk_key_len = old_key_len; 1535 old_hash = vcache_hash(&old_vcache_key); 1536 1537 new_vcache_key.vk_mount = mp; 1538 new_vcache_key.vk_key = new_key; 1539 new_vcache_key.vk_key_len = new_key_len; 1540 new_hash = vcache_hash(&new_vcache_key); 1541 1542 mutex_enter(&vcache.lock); 1543 1544 /* Lookup old and new node. */ 1545 old_node = vcache_hash_lookup(&old_vcache_key, old_hash); 1546 KASSERT(old_node != NULL); 1547 KASSERT(VN_TO_VP(old_node) == vp); 1548 mutex_enter(vp->v_interlock); 1549 VSTATE_ASSERT(vp, VN_BLOCKED); 1550 1551 new_node = vcache_hash_lookup(&new_vcache_key, new_hash); 1552 KASSERT(new_node != NULL); 1553 KASSERT(new_node->vn_key.vk_key_len == new_key_len); 1554 tvp = VN_TO_VP(new_node); 1555 mutex_enter(tvp->v_interlock); 1556 VSTATE_ASSERT(VN_TO_VP(new_node), VN_LOADING); 1557 1558 /* Rekey old node and put it onto its new hashlist. */ 1559 old_node->vn_key = new_vcache_key; 1560 if (old_hash != new_hash) { 1561 SLIST_REMOVE(&vcache.hashtab[old_hash & vcache.hashmask], 1562 old_node, vcache_node, vn_hash); 1563 SLIST_INSERT_HEAD(&vcache.hashtab[new_hash & vcache.hashmask], 1564 old_node, vn_hash); 1565 } 1566 VSTATE_CHANGE(vp, VN_BLOCKED, VN_ACTIVE); 1567 mutex_exit(vp->v_interlock); 1568 1569 /* Remove new node used as placeholder. 
*/ 1570 SLIST_REMOVE(&vcache.hashtab[new_hash & vcache.hashmask], 1571 new_node, vcache_node, vn_hash); 1572 VSTATE_CHANGE(tvp, VN_LOADING, VN_RECLAIMED); 1573 mutex_exit(&vcache.lock); 1574 vrelel(tvp, 0); 1575 } 1576 1577 /* 1578 * Remove a vnode / fs node pair from the cache. 1579 */ 1580 void 1581 vcache_remove(struct mount *mp, const void *key, size_t key_len) 1582 { 1583 uint32_t hash; 1584 struct vcache_key vcache_key; 1585 struct vcache_node *node; 1586 1587 vcache_key.vk_mount = mp; 1588 vcache_key.vk_key = key; 1589 vcache_key.vk_key_len = key_len; 1590 hash = vcache_hash(&vcache_key); 1591 1592 mutex_enter(&vcache.lock); 1593 node = vcache_hash_lookup(&vcache_key, hash); 1594 KASSERT(node != NULL); 1595 SLIST_REMOVE(&vcache.hashtab[hash & vcache.hashmask], 1596 node, vcache_node, vn_hash); 1597 mutex_exit(&vcache.lock); 1598 } 1599 1600 /* 1601 * Print a vcache node. 1602 */ 1603 void 1604 vcache_print(vnode_t *vp, const char *prefix, void (*pr)(const char *, ...)) 1605 { 1606 int n; 1607 const uint8_t *cp; 1608 struct vcache_node *node; 1609 1610 node = VP_TO_VN(vp); 1611 n = node->vn_key.vk_key_len; 1612 cp = node->vn_key.vk_key; 1613 1614 (*pr)("%sstate %s, key(%d)", prefix, vstate_name(node->vn_state), n); 1615 1616 while (n-- > 0) 1617 (*pr)(" %02x", *cp++); 1618 (*pr)("\n"); 1619 } 1620 1621 /* 1622 * Update outstanding I/O count and do wakeup if requested. 1623 */ 1624 void 1625 vwakeup(struct buf *bp) 1626 { 1627 vnode_t *vp; 1628 1629 if ((vp = bp->b_vp) == NULL) 1630 return; 1631 1632 KASSERT(bp->b_objlock == vp->v_interlock); 1633 KASSERT(mutex_owned(bp->b_objlock)); 1634 1635 if (--vp->v_numoutput < 0) 1636 vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp); 1637 if (vp->v_numoutput == 0) 1638 cv_broadcast(&vp->v_cv); 1639 } 1640 1641 /* 1642 * Test a vnode for being or becoming dead. Returns one of: 1643 * EBUSY: vnode is becoming dead, with "flags == VDEAD_NOWAIT" only. 1644 * ENOENT: vnode is dead. 1645 * 0: otherwise. 1646 * 1647 * Whenever this function returns a non-zero value all future 1648 * calls will also return a non-zero value. 1649 */ 1650 int 1651 vdead_check(struct vnode *vp, int flags) 1652 { 1653 1654 KASSERT(mutex_owned(vp->v_interlock)); 1655 1656 if (! ISSET(flags, VDEAD_NOWAIT)) 1657 VSTATE_WAIT_STABLE(vp); 1658 1659 if (VSTATE_GET(vp) == VN_RECLAIMING) { 1660 KASSERT(ISSET(flags, VDEAD_NOWAIT)); 1661 return EBUSY; 1662 } else if (VSTATE_GET(vp) == VN_RECLAIMED) { 1663 return ENOENT; 1664 } 1665 1666 return 0; 1667 } 1668 1669 int 1670 vfs_drainvnodes(long target) 1671 { 1672 int error; 1673 1674 mutex_enter(&vnode_free_list_lock); 1675 1676 while (numvnodes > target) { 1677 error = cleanvnode(); 1678 if (error != 0) 1679 return error; 1680 mutex_enter(&vnode_free_list_lock); 1681 } 1682 1683 mutex_exit(&vnode_free_list_lock); 1684 1685 vcache_reinit(); 1686 1687 return 0; 1688 } 1689 1690 void 1691 vnpanic(vnode_t *vp, const char *fmt, ...) 1692 { 1693 va_list ap; 1694 1695 #ifdef DIAGNOSTIC 1696 vprint(NULL, vp); 1697 #endif 1698 va_start(ap, fmt); 1699 vpanic(fmt, ap); 1700 va_end(ap); 1701 } 1702