/*	$NetBSD: vfs_vnode.c,v 1.107 2020/01/12 17:49:17 ad Exp $	*/

/*-
 * Copyright (c) 1997-2011, 2019 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * The vnode cache subsystem.
 *
 *	Life-cycle
 *
 *	Normally, there are two points where new vnodes are created:
 *	VOP_CREATE(9) and VOP_LOOKUP(9).  The life-cycle of a vnode
 *	starts in one of the following ways:
 *
 *	- Allocation, via vcache_get(9) or vcache_new(9).
 *	- Reclamation of an inactive vnode, via vcache_vget(9).
 *
 *	Recycling a vnode from a free list, via getnewvnode(9) ->
 *	getcleanvnode(9), was the traditional way.  Currently, only the
 *	draining thread recycles vnodes.  This behaviour might be revisited.
 *
 *	The life-cycle ends when the last reference is dropped, usually
 *	in VOP_REMOVE(9).  In that case, VOP_INACTIVE(9) is called to inform
 *	the file system that the vnode is inactive.  Via this call, the file
 *	system indicates whether the vnode can be recycled (usually, it checks
 *	its own references, e.g. the link count and whether the file was
 *	removed).
 *
 *	Depending on that indication, the vnode can be put onto a free list
 *	(cache), or cleaned via vcache_reclaim, which calls VOP_RECLAIM(9)
 *	to disassociate the underlying file system from the vnode, and
 *	finally destroyed.
 *
 *	Vnode state
 *
 *	A vnode is always in one of six states:
 *	- MARKER	This is a marker vnode to help list traversal.  It
 *			will never change its state.
 *	- LOADING	Vnode is associating with the underlying file system
 *			and is not yet ready to use.
 *	- LOADED	Vnode has associated with the underlying file system
 *			and is ready to use.
 *	- BLOCKED	Vnode is active but cannot get new references.
 *	- RECLAIMING	Vnode is disassociating from the underlying file
 *			system.
 *	- RECLAIMED	Vnode has disassociated from the underlying file
 *			system and is dead.
 *
 *	Valid state changes are:
 *	LOADING -> LOADED
 *			Vnode has been initialised in vcache_get() or
 *			vcache_new() and is ready to use.
 *	LOADED -> RECLAIMING
 *			Vnode starts disassociation from the underlying file
 *			system in vcache_reclaim().
 *	RECLAIMING -> RECLAIMED
 *			Vnode has finished disassociation from the underlying
 *			file system in vcache_reclaim().
 *	LOADED -> BLOCKED
 *			Either vcache_rekey*() is changing the vnode key or
 *			vrelel() is about to call VOP_INACTIVE().
 *	BLOCKED -> LOADED
 *			The block condition is over.
 *	LOADING -> RECLAIMED
 *			Either vcache_get() or vcache_new() failed to
 *			associate the underlying file system, or
 *			vcache_rekey*() drops a vnode used as a placeholder.
 *
 *	Of these states, LOADING, BLOCKED and RECLAIMING are intermediate,
 *	and it is possible to wait for a state change.
 *
 *	The state is protected with v_interlock, with one exception:
 *	to change from LOADING both v_interlock and vcache_lock must be
 *	held, so it is possible to check "state == LOADING" without holding
 *	v_interlock.  See vcache_get() for details.
 *
 *	Reference counting
 *
 *	A vnode is considered active if its reference count
 *	(vnode_t::v_usecount) is non-zero.  The count is maintained using
 *	the vref(9), vrele(9) and vput(9) routines.  Typical holders of
 *	references are, for example, open files, current working
 *	directories and mount points.
 *
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.107 2020/01/12 17:49:17 ad Exp $");

#include <sys/param.h>
#include <sys/kernel.h>

#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/hash.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vnode_impl.h>
#include <sys/wapbl.h>
#include <sys/fstrans.h>

#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>
#include <uvm/uvm_stat.h>

/* Flags to vrelel. */
#define	VRELEL_ASYNC	0x0001	/* Always defer to vrele thread. */

#define	LRU_VRELE	0
#define	LRU_FREE	1
#define	LRU_HOLD	2
#define	LRU_COUNT	3

/*
 * There are three lru lists: one holds vnodes waiting for async release,
 * one is for vnodes which have no buffer/page references and one for those
 * which do (i.e. v_holdcnt is non-zero).  We put the lists into a single,
 * private cache line as vnodes migrate between them while under the same
 * lock (vdrain_lock).
 */
u_int			numvnodes		__cacheline_aligned;
static vnodelst_t	lru_list[LRU_COUNT]	__cacheline_aligned;
static kmutex_t		vdrain_lock		__cacheline_aligned;
static kcondvar_t	vdrain_cv;
static int		vdrain_gen;
static kcondvar_t	vdrain_gen_cv;
static bool		vdrain_retry;
static lwp_t *		vdrain_lwp;
SLIST_HEAD(hashhead, vnode_impl);
static kmutex_t		vcache_lock		__cacheline_aligned;
static kcondvar_t	vcache_cv;
static u_int		vcache_hashsize;
static u_long		vcache_hashmask;
static struct hashhead	*vcache_hashtab;
static pool_cache_t	vcache_pool;
static void		lru_requeue(vnode_t *, vnodelst_t *);
static vnodelst_t *	lru_which(vnode_t *);
static vnode_impl_t *	vcache_alloc(void);
static void		vcache_dealloc(vnode_impl_t *);
static void		vcache_free(vnode_impl_t *);
static void		vcache_init(void);
static void		vcache_reinit(void);
static void		vcache_reclaim(vnode_t *);
static void		vrelel(vnode_t *, int, int);
static void		vdrain_thread(void *);
static void		vnpanic(vnode_t *, const char *, ...)
    __printflike(2, 3);

/* Routines having to do with the management of the vnode table. */
extern struct mount	*dead_rootmount;
extern int		(**dead_vnodeop_p)(void *);
extern int		(**spec_vnodeop_p)(void *);
extern struct vfsops	dead_vfsops;

/* Vnode state operations and diagnostics. */

#if defined(DIAGNOSTIC)

#define VSTATE_VALID(state) \
	((state) != VS_ACTIVE && (state) != VS_MARKER)
#define VSTATE_GET(vp) \
	vstate_assert_get((vp), __func__, __LINE__)
#define VSTATE_CHANGE(vp, from, to) \
	vstate_assert_change((vp), (from), (to), __func__, __LINE__)
#define VSTATE_WAIT_STABLE(vp) \
	vstate_assert_wait_stable((vp), __func__, __LINE__)

void
_vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line,
    bool has_lock)
{
	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);

	if (!has_lock) {
		/*
		 * Prevent predictive loads from the CPU, but check the state
		 * without locking first.
		 */
		membar_enter();
		if (state == VS_ACTIVE && vp->v_usecount > 0 &&
		    (vip->vi_state == VS_LOADED || vip->vi_state == VS_BLOCKED))
			return;
		if (vip->vi_state == state)
			return;
		mutex_enter((vp)->v_interlock);
	}

	KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);

	if ((state == VS_ACTIVE && vp->v_usecount > 0 &&
	    (vip->vi_state == VS_LOADED || vip->vi_state == VS_BLOCKED)) ||
	    vip->vi_state == state) {
		if (!has_lock)
			mutex_exit((vp)->v_interlock);
		return;
	}
	vnpanic(vp, "state is %s, usecount %d, expected %s at %s:%d",
	    vstate_name(vip->vi_state), vp->v_usecount,
	    vstate_name(state), func, line);
}

static enum vnode_state
vstate_assert_get(vnode_t *vp, const char *func, int line)
{
	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);

	KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
	if (! VSTATE_VALID(vip->vi_state))
		vnpanic(vp, "state is %s at %s:%d",
		    vstate_name(vip->vi_state), func, line);

	return vip->vi_state;
}

static void
vstate_assert_wait_stable(vnode_t *vp, const char *func, int line)
{
	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);

	KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
	if (! VSTATE_VALID(vip->vi_state))
		vnpanic(vp, "state is %s at %s:%d",
		    vstate_name(vip->vi_state), func, line);

	while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED)
		cv_wait(&vp->v_cv, vp->v_interlock);

	if (! VSTATE_VALID(vip->vi_state))
		vnpanic(vp, "state is %s at %s:%d",
		    vstate_name(vip->vi_state), func, line);
}

static void
vstate_assert_change(vnode_t *vp, enum vnode_state from, enum vnode_state to,
    const char *func, int line)
{
	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);

	KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
	if (from == VS_LOADING)
		KASSERTMSG(mutex_owned(&vcache_lock), "at %s:%d", func, line);

	if (! VSTATE_VALID(from))
		vnpanic(vp, "from is %s at %s:%d",
		    vstate_name(from), func, line);
	if (!
VSTATE_VALID(to)) 317 vnpanic(vp, "to is %s at %s:%d", 318 vstate_name(to), func, line); 319 if (vip->vi_state != from) 320 vnpanic(vp, "from is %s, expected %s at %s:%d\n", 321 vstate_name(vip->vi_state), vstate_name(from), func, line); 322 if ((from == VS_BLOCKED || to == VS_BLOCKED) && vp->v_usecount != 1) 323 vnpanic(vp, "%s to %s with usecount %d at %s:%d", 324 vstate_name(from), vstate_name(to), vp->v_usecount, 325 func, line); 326 327 vip->vi_state = to; 328 if (from == VS_LOADING) 329 cv_broadcast(&vcache_cv); 330 if (to == VS_LOADED || to == VS_RECLAIMED) 331 cv_broadcast(&vp->v_cv); 332 } 333 334 #else /* defined(DIAGNOSTIC) */ 335 336 #define VSTATE_GET(vp) \ 337 (VNODE_TO_VIMPL((vp))->vi_state) 338 #define VSTATE_CHANGE(vp, from, to) \ 339 vstate_change((vp), (from), (to)) 340 #define VSTATE_WAIT_STABLE(vp) \ 341 vstate_wait_stable((vp)) 342 void 343 _vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line, 344 bool has_lock) 345 { 346 347 } 348 349 static void 350 vstate_wait_stable(vnode_t *vp) 351 { 352 vnode_impl_t *vip = VNODE_TO_VIMPL(vp); 353 354 while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED) 355 cv_wait(&vp->v_cv, vp->v_interlock); 356 } 357 358 static void 359 vstate_change(vnode_t *vp, enum vnode_state from, enum vnode_state to) 360 { 361 vnode_impl_t *vip = VNODE_TO_VIMPL(vp); 362 363 vip->vi_state = to; 364 if (from == VS_LOADING) 365 cv_broadcast(&vcache_cv); 366 if (to == VS_LOADED || to == VS_RECLAIMED) 367 cv_broadcast(&vp->v_cv); 368 } 369 370 #endif /* defined(DIAGNOSTIC) */ 371 372 void 373 vfs_vnode_sysinit(void) 374 { 375 int error __diagused, i; 376 377 dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL); 378 KASSERT(dead_rootmount != NULL); 379 dead_rootmount->mnt_iflag |= IMNT_MPSAFE; 380 381 mutex_init(&vdrain_lock, MUTEX_DEFAULT, IPL_NONE); 382 for (i = 0; i < LRU_COUNT; i++) { 383 TAILQ_INIT(&lru_list[i]); 384 } 385 vcache_init(); 386 387 cv_init(&vdrain_cv, "vdrain"); 388 cv_init(&vdrain_gen_cv, "vdrainwt"); 389 error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread, 390 NULL, &vdrain_lwp, "vdrain"); 391 KASSERTMSG((error == 0), "kthread_create(vdrain) failed: %d", error); 392 } 393 394 /* 395 * Allocate a new marker vnode. 396 */ 397 vnode_t * 398 vnalloc_marker(struct mount *mp) 399 { 400 vnode_impl_t *vip; 401 vnode_t *vp; 402 403 vip = pool_cache_get(vcache_pool, PR_WAITOK); 404 memset(vip, 0, sizeof(*vip)); 405 vp = VIMPL_TO_VNODE(vip); 406 uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0); 407 vp->v_mount = mp; 408 vp->v_type = VBAD; 409 vip->vi_state = VS_MARKER; 410 411 return vp; 412 } 413 414 /* 415 * Free a marker vnode. 416 */ 417 void 418 vnfree_marker(vnode_t *vp) 419 { 420 vnode_impl_t *vip; 421 422 vip = VNODE_TO_VIMPL(vp); 423 KASSERT(vip->vi_state == VS_MARKER); 424 uvm_obj_destroy(&vp->v_uobj, true); 425 pool_cache_put(vcache_pool, vip); 426 } 427 428 /* 429 * Test a vnode for being a marker vnode. 430 */ 431 bool 432 vnis_marker(vnode_t *vp) 433 { 434 435 return (VNODE_TO_VIMPL(vp)->vi_state == VS_MARKER); 436 } 437 438 /* 439 * Return the lru list this node should be on. 440 */ 441 static vnodelst_t * 442 lru_which(vnode_t *vp) 443 { 444 445 KASSERT(mutex_owned(vp->v_interlock)); 446 447 if (vp->v_holdcnt > 0) 448 return &lru_list[LRU_HOLD]; 449 else 450 return &lru_list[LRU_FREE]; 451 } 452 453 /* 454 * Put vnode to end of given list. 455 * Both the current and the new list may be NULL, used on vnode alloc/free. 
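 * As a descriptive note: lru_requeue(vp, NULL), as done from vcache_free(),
 * takes the vnode off the lists entirely, while vcache_alloc() uses
 * lru_requeue(vp, &lru_list[LRU_FREE]) to give a new vnode its first list;
 * numvnodes is adjusted on exactly those transitions.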
456 * Adjust numvnodes and signal vdrain thread if there is work. 457 */ 458 static void 459 lru_requeue(vnode_t *vp, vnodelst_t *listhd) 460 { 461 vnode_impl_t *vip; 462 int d; 463 464 /* 465 * If the vnode is on the correct list, and was put there recently, 466 * then leave it be, thus avoiding huge cache and lock contention. 467 */ 468 vip = VNODE_TO_VIMPL(vp); 469 if (listhd == vip->vi_lrulisthd && 470 (hardclock_ticks - vip->vi_lrulisttm) < hz) { 471 return; 472 } 473 474 mutex_enter(&vdrain_lock); 475 d = 0; 476 if (vip->vi_lrulisthd != NULL) 477 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist); 478 else 479 d++; 480 vip->vi_lrulisthd = listhd; 481 vip->vi_lrulisttm = hardclock_ticks; 482 if (vip->vi_lrulisthd != NULL) 483 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist); 484 else 485 d--; 486 if (d != 0) { 487 /* 488 * Looks strange? This is not a bug. Don't store 489 * numvnodes unless there is a change - avoid false 490 * sharing on MP. 491 */ 492 numvnodes += d; 493 } 494 if (numvnodes > desiredvnodes || listhd == &lru_list[LRU_VRELE]) 495 cv_broadcast(&vdrain_cv); 496 mutex_exit(&vdrain_lock); 497 } 498 499 /* 500 * Release deferred vrele vnodes for this mount. 501 * Called with file system suspended. 502 */ 503 void 504 vrele_flush(struct mount *mp) 505 { 506 vnode_impl_t *vip, *marker; 507 vnode_t *vp; 508 509 KASSERT(fstrans_is_owner(mp)); 510 511 marker = VNODE_TO_VIMPL(vnalloc_marker(NULL)); 512 513 mutex_enter(&vdrain_lock); 514 TAILQ_INSERT_HEAD(&lru_list[LRU_VRELE], marker, vi_lrulist); 515 516 while ((vip = TAILQ_NEXT(marker, vi_lrulist))) { 517 TAILQ_REMOVE(&lru_list[LRU_VRELE], marker, vi_lrulist); 518 TAILQ_INSERT_AFTER(&lru_list[LRU_VRELE], vip, marker, 519 vi_lrulist); 520 vp = VIMPL_TO_VNODE(vip); 521 if (vnis_marker(vp)) 522 continue; 523 524 KASSERT(vip->vi_lrulisthd == &lru_list[LRU_VRELE]); 525 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist); 526 vip->vi_lrulisthd = &lru_list[LRU_HOLD]; 527 vip->vi_lrulisttm = hardclock_ticks; 528 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist); 529 mutex_exit(&vdrain_lock); 530 531 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 532 mutex_enter(vp->v_interlock); 533 vrelel(vp, 0, LK_EXCLUSIVE); 534 535 mutex_enter(&vdrain_lock); 536 } 537 538 TAILQ_REMOVE(&lru_list[LRU_VRELE], marker, vi_lrulist); 539 mutex_exit(&vdrain_lock); 540 541 vnfree_marker(VIMPL_TO_VNODE(marker)); 542 } 543 544 /* 545 * Reclaim a cached vnode. Used from vdrain_thread only. 546 */ 547 static __inline void 548 vdrain_remove(vnode_t *vp) 549 { 550 struct mount *mp; 551 552 KASSERT(mutex_owned(&vdrain_lock)); 553 554 /* Probe usecount (unlocked). */ 555 if (vp->v_usecount > 0) 556 return; 557 /* Try v_interlock -- we lock the wrong direction! */ 558 if (!mutex_tryenter(vp->v_interlock)) 559 return; 560 /* Probe usecount and state. */ 561 if (vp->v_usecount > 0 || VSTATE_GET(vp) != VS_LOADED) { 562 mutex_exit(vp->v_interlock); 563 return; 564 } 565 mp = vp->v_mount; 566 if (fstrans_start_nowait(mp) != 0) { 567 mutex_exit(vp->v_interlock); 568 return; 569 } 570 vdrain_retry = true; 571 mutex_exit(&vdrain_lock); 572 573 if (vcache_vget(vp) == 0) { 574 if (!vrecycle(vp)) { 575 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 576 mutex_enter(vp->v_interlock); 577 vrelel(vp, 0, LK_EXCLUSIVE); 578 } 579 } 580 fstrans_done(mp); 581 582 mutex_enter(&vdrain_lock); 583 } 584 585 /* 586 * Release a cached vnode. Used from vdrain_thread only. 
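 * As a descriptive note: vnodes arrive on lru_list[LRU_VRELE] when vrelel()
 * defers a release, e.g. from vrele_async() or because it could not take
 * the vnode lock; this function takes them off that list and completes
 * the deferred release.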
587 */ 588 static __inline void 589 vdrain_vrele(vnode_t *vp) 590 { 591 vnode_impl_t *vip = VNODE_TO_VIMPL(vp); 592 struct mount *mp; 593 594 KASSERT(mutex_owned(&vdrain_lock)); 595 596 mp = vp->v_mount; 597 if (fstrans_start_nowait(mp) != 0) 598 return; 599 600 /* 601 * First remove the vnode from the vrele list. 602 * Put it on the last lru list, the last vrele() 603 * will put it back onto the right list before 604 * its v_usecount reaches zero. 605 */ 606 KASSERT(vip->vi_lrulisthd == &lru_list[LRU_VRELE]); 607 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist); 608 vip->vi_lrulisthd = &lru_list[LRU_HOLD]; 609 vip->vi_lrulisttm = hardclock_ticks; 610 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist); 611 612 vdrain_retry = true; 613 mutex_exit(&vdrain_lock); 614 615 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 616 mutex_enter(vp->v_interlock); 617 vrelel(vp, 0, LK_EXCLUSIVE); 618 fstrans_done(mp); 619 620 mutex_enter(&vdrain_lock); 621 } 622 623 /* 624 * Helper thread to keep the number of vnodes below desiredvnodes 625 * and release vnodes from asynchronous vrele. 626 */ 627 static void 628 vdrain_thread(void *cookie) 629 { 630 int i; 631 u_int target; 632 vnode_impl_t *vip, *marker; 633 634 marker = VNODE_TO_VIMPL(vnalloc_marker(NULL)); 635 636 mutex_enter(&vdrain_lock); 637 638 for (;;) { 639 vdrain_retry = false; 640 target = desiredvnodes - desiredvnodes/10; 641 642 for (i = 0; i < LRU_COUNT; i++) { 643 TAILQ_INSERT_HEAD(&lru_list[i], marker, vi_lrulist); 644 while ((vip = TAILQ_NEXT(marker, vi_lrulist))) { 645 TAILQ_REMOVE(&lru_list[i], marker, vi_lrulist); 646 TAILQ_INSERT_AFTER(&lru_list[i], vip, marker, 647 vi_lrulist); 648 if (vnis_marker(VIMPL_TO_VNODE(vip))) 649 continue; 650 if (i == LRU_VRELE) 651 vdrain_vrele(VIMPL_TO_VNODE(vip)); 652 else if (numvnodes < target) 653 break; 654 else 655 vdrain_remove(VIMPL_TO_VNODE(vip)); 656 } 657 TAILQ_REMOVE(&lru_list[i], marker, vi_lrulist); 658 } 659 660 if (vdrain_retry) { 661 mutex_exit(&vdrain_lock); 662 yield(); 663 mutex_enter(&vdrain_lock); 664 } else { 665 vdrain_gen++; 666 cv_broadcast(&vdrain_gen_cv); 667 cv_wait(&vdrain_cv, &vdrain_lock); 668 } 669 } 670 } 671 672 /* 673 * vput: unlock and release the reference. 674 */ 675 void 676 vput(vnode_t *vp) 677 { 678 int lktype; 679 680 if ((vp->v_vflag & VV_LOCKSWORK) == 0) { 681 lktype = LK_EXCLUSIVE; 682 } else { 683 lktype = VOP_ISLOCKED(vp); 684 KASSERT(lktype != LK_NONE); 685 } 686 mutex_enter(vp->v_interlock); 687 vrelel(vp, 0, lktype); 688 } 689 690 /* 691 * Vnode release. If reference count drops to zero, call inactive 692 * routine and either return to freelist or free to the pool. 693 */ 694 static void 695 vrelel(vnode_t *vp, int flags, int lktype) 696 { 697 const bool async = ((flags & VRELEL_ASYNC) != 0); 698 bool recycle, defer; 699 int error; 700 701 KASSERT(mutex_owned(vp->v_interlock)); 702 703 if (__predict_false(vp->v_op == dead_vnodeop_p && 704 VSTATE_GET(vp) != VS_RECLAIMED)) { 705 vnpanic(vp, "dead but not clean"); 706 } 707 708 /* 709 * If not the last reference, just drop the reference count 710 * and unlock. 
711 */ 712 if (vp->v_usecount > 1) { 713 if (lktype != LK_NONE) { 714 VOP_UNLOCK(vp); 715 } 716 vp->v_usecount--; 717 mutex_exit(vp->v_interlock); 718 return; 719 } 720 if (vp->v_usecount <= 0 || vp->v_writecount != 0) { 721 vnpanic(vp, "%s: bad ref count", __func__); 722 } 723 724 #ifdef DIAGNOSTIC 725 if ((vp->v_type == VBLK || vp->v_type == VCHR) && 726 vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) { 727 vprint("vrelel: missing VOP_CLOSE()", vp); 728 } 729 #endif 730 731 /* 732 * First try to get the vnode locked for VOP_INACTIVE(). 733 * Defer vnode release to vdrain_thread if caller requests 734 * it explicitly, is the pagedaemon or the lock failed. 735 */ 736 defer = false; 737 if ((curlwp == uvm.pagedaemon_lwp) || async) { 738 defer = true; 739 } else if (lktype == LK_SHARED) { 740 /* Excellent chance of getting, if the last ref. */ 741 error = vn_lock(vp, LK_UPGRADE | LK_RETRY | 742 LK_NOWAIT); 743 if (error != 0) { 744 defer = true; 745 } else { 746 lktype = LK_EXCLUSIVE; 747 } 748 } else if (lktype == LK_NONE) { 749 /* Excellent chance of getting, if the last ref. */ 750 error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | 751 LK_NOWAIT); 752 if (error != 0) { 753 defer = true; 754 } else { 755 lktype = LK_EXCLUSIVE; 756 } 757 } 758 KASSERT(mutex_owned(vp->v_interlock)); 759 if (defer) { 760 /* 761 * Defer reclaim to the kthread; it's not safe to 762 * clean it here. We donate it our last reference. 763 */ 764 if (lktype != LK_NONE) { 765 VOP_UNLOCK(vp); 766 } 767 lru_requeue(vp, &lru_list[LRU_VRELE]); 768 mutex_exit(vp->v_interlock); 769 return; 770 } 771 KASSERT(lktype == LK_EXCLUSIVE); 772 773 /* 774 * If not clean, deactivate the vnode, but preserve 775 * our reference across the call to VOP_INACTIVE(). 776 */ 777 if (VSTATE_GET(vp) == VS_RECLAIMED) { 778 VOP_UNLOCK(vp); 779 } else { 780 VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED); 781 mutex_exit(vp->v_interlock); 782 783 /* 784 * The vnode must not gain another reference while being 785 * deactivated. If VOP_INACTIVE() indicates that 786 * the described file has been deleted, then recycle 787 * the vnode. 788 * 789 * Note that VOP_INACTIVE() will not drop the vnode lock. 790 */ 791 recycle = false; 792 VOP_INACTIVE(vp, &recycle); 793 if (!recycle) 794 VOP_UNLOCK(vp); 795 mutex_enter(vp->v_interlock); 796 VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED); 797 if (!recycle) { 798 if (vp->v_usecount > 1) { 799 vp->v_usecount--; 800 mutex_exit(vp->v_interlock); 801 return; 802 } 803 } 804 805 /* Take care of space accounting. */ 806 if ((vp->v_iflag & VI_EXECMAP) != 0 && 807 vp->v_uobj.uo_npages != 0) { 808 cpu_count(CPU_COUNT_EXECPAGES, -vp->v_uobj.uo_npages); 809 cpu_count(CPU_COUNT_FILEPAGES, vp->v_uobj.uo_npages); 810 } 811 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP); 812 vp->v_vflag &= ~VV_MAPPED; 813 814 /* 815 * Recycle the vnode if the file is now unused (unlinked), 816 * otherwise just free it. 817 */ 818 if (recycle) { 819 VSTATE_ASSERT(vp, VS_LOADED); 820 /* vcache_reclaim drops the lock. */ 821 vcache_reclaim(vp); 822 } 823 KASSERT(vp->v_usecount > 0); 824 } 825 826 vp->v_usecount--; 827 if (vp->v_usecount != 0) { 828 /* Gained another reference while being reclaimed. */ 829 mutex_exit(vp->v_interlock); 830 return; 831 } 832 833 if (VSTATE_GET(vp) == VS_RECLAIMED && vp->v_holdcnt == 0) { 834 /* 835 * It's clean so destroy it. It isn't referenced 836 * anywhere since it has been reclaimed. 837 */ 838 vcache_free(VNODE_TO_VIMPL(vp)); 839 } else { 840 /* 841 * Otherwise, put it back onto the freelist. 
It 842 * can't be destroyed while still associated with 843 * a file system. 844 */ 845 lru_requeue(vp, lru_which(vp)); 846 mutex_exit(vp->v_interlock); 847 } 848 } 849 850 void 851 vrele(vnode_t *vp) 852 { 853 854 mutex_enter(vp->v_interlock); 855 vrelel(vp, 0, LK_NONE); 856 } 857 858 /* 859 * Asynchronous vnode release, vnode is released in different context. 860 */ 861 void 862 vrele_async(vnode_t *vp) 863 { 864 865 mutex_enter(vp->v_interlock); 866 vrelel(vp, VRELEL_ASYNC, LK_NONE); 867 } 868 869 /* 870 * Vnode reference, where a reference is already held by some other 871 * object (for example, a file structure). 872 */ 873 void 874 vref(vnode_t *vp) 875 { 876 877 KASSERT(vp->v_usecount != 0); 878 879 mutex_enter(vp->v_interlock); 880 vp->v_usecount++; 881 mutex_exit(vp->v_interlock); 882 } 883 884 /* 885 * Page or buffer structure gets a reference. 886 * Called with v_interlock held. 887 */ 888 void 889 vholdl(vnode_t *vp) 890 { 891 892 KASSERT(mutex_owned(vp->v_interlock)); 893 894 if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) 895 lru_requeue(vp, lru_which(vp)); 896 } 897 898 /* 899 * Page or buffer structure frees a reference. 900 * Called with v_interlock held. 901 */ 902 void 903 holdrelel(vnode_t *vp) 904 { 905 906 KASSERT(mutex_owned(vp->v_interlock)); 907 908 if (vp->v_holdcnt <= 0) { 909 vnpanic(vp, "%s: holdcnt vp %p", __func__, vp); 910 } 911 912 vp->v_holdcnt--; 913 if (vp->v_holdcnt == 0 && vp->v_usecount == 0) 914 lru_requeue(vp, lru_which(vp)); 915 } 916 917 /* 918 * Recycle an unused vnode if caller holds the last reference. 919 */ 920 bool 921 vrecycle(vnode_t *vp) 922 { 923 int error __diagused; 924 925 mutex_enter(vp->v_interlock); 926 927 /* Make sure we hold the last reference. */ 928 VSTATE_WAIT_STABLE(vp); 929 if (vp->v_usecount != 1) { 930 mutex_exit(vp->v_interlock); 931 return false; 932 } 933 934 /* If the vnode is already clean we're done. */ 935 if (VSTATE_GET(vp) != VS_LOADED) { 936 VSTATE_ASSERT(vp, VS_RECLAIMED); 937 vrelel(vp, 0, LK_NONE); 938 return true; 939 } 940 941 /* Prevent further references until the vnode is locked. */ 942 VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED); 943 mutex_exit(vp->v_interlock); 944 945 /* 946 * On a leaf file system this lock will always succeed as we hold 947 * the last reference and prevent further references. 948 * On layered file systems waiting for the lock would open a can of 949 * deadlocks as the lower vnodes may have other active references. 950 */ 951 error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT); 952 953 mutex_enter(vp->v_interlock); 954 VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED); 955 956 if (error) { 957 mutex_exit(vp->v_interlock); 958 return false; 959 } 960 961 KASSERT(vp->v_usecount == 1); 962 vcache_reclaim(vp); 963 vrelel(vp, 0, LK_NONE); 964 965 return true; 966 } 967 968 /* 969 * Helper for vrevoke() to propagate suspension from lastmp 970 * to thismp. Both args may be NULL. 971 * Returns the currently suspended file system or NULL. 
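 * Illustrative sketch of the intended calling pattern (this mirrors
 * vrevoke() below and is not a separate interface):
 *
 *	mp = vrevoke_suspend_next(NULL, vp->v_mount);	suspend the first fs
 *	...
 *	mp = vrevoke_suspend_next(mp, vq->v_mount);	hand the suspension on
 *	...
 *	vrevoke_suspend_next(mp, NULL);			resume the last one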
972 */ 973 static struct mount * 974 vrevoke_suspend_next(struct mount *lastmp, struct mount *thismp) 975 { 976 int error; 977 978 if (lastmp == thismp) 979 return thismp; 980 981 if (lastmp != NULL) 982 vfs_resume(lastmp); 983 984 if (thismp == NULL) 985 return NULL; 986 987 do { 988 error = vfs_suspend(thismp, 0); 989 } while (error == EINTR || error == ERESTART); 990 991 if (error == 0) 992 return thismp; 993 994 KASSERT(error == EOPNOTSUPP); 995 return NULL; 996 } 997 998 /* 999 * Eliminate all activity associated with the requested vnode 1000 * and with all vnodes aliased to the requested vnode. 1001 */ 1002 void 1003 vrevoke(vnode_t *vp) 1004 { 1005 struct mount *mp; 1006 vnode_t *vq; 1007 enum vtype type; 1008 dev_t dev; 1009 1010 KASSERT(vp->v_usecount > 0); 1011 1012 mp = vrevoke_suspend_next(NULL, vp->v_mount); 1013 1014 mutex_enter(vp->v_interlock); 1015 VSTATE_WAIT_STABLE(vp); 1016 if (VSTATE_GET(vp) == VS_RECLAIMED) { 1017 mutex_exit(vp->v_interlock); 1018 } else if (vp->v_type != VBLK && vp->v_type != VCHR) { 1019 vp->v_usecount++; 1020 mutex_exit(vp->v_interlock); 1021 vgone(vp); 1022 } else { 1023 dev = vp->v_rdev; 1024 type = vp->v_type; 1025 mutex_exit(vp->v_interlock); 1026 1027 while (spec_node_lookup_by_dev(type, dev, &vq) == 0) { 1028 mp = vrevoke_suspend_next(mp, vq->v_mount); 1029 vgone(vq); 1030 } 1031 } 1032 vrevoke_suspend_next(mp, NULL); 1033 } 1034 1035 /* 1036 * Eliminate all activity associated with a vnode in preparation for 1037 * reuse. Drops a reference from the vnode. 1038 */ 1039 void 1040 vgone(vnode_t *vp) 1041 { 1042 int lktype; 1043 1044 KASSERT(vp->v_mount == dead_rootmount || fstrans_is_owner(vp->v_mount)); 1045 1046 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 1047 lktype = LK_EXCLUSIVE; 1048 mutex_enter(vp->v_interlock); 1049 VSTATE_WAIT_STABLE(vp); 1050 if (VSTATE_GET(vp) == VS_LOADED) { 1051 vcache_reclaim(vp); 1052 lktype = LK_NONE; 1053 } 1054 VSTATE_ASSERT(vp, VS_RECLAIMED); 1055 vrelel(vp, 0, lktype); 1056 } 1057 1058 static inline uint32_t 1059 vcache_hash(const struct vcache_key *key) 1060 { 1061 uint32_t hash = HASH32_BUF_INIT; 1062 1063 KASSERT(key->vk_key_len > 0); 1064 1065 hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash); 1066 hash = hash32_buf(key->vk_key, key->vk_key_len, hash); 1067 return hash; 1068 } 1069 1070 static void 1071 vcache_init(void) 1072 { 1073 1074 vcache_pool = pool_cache_init(sizeof(vnode_impl_t), 0, 0, 0, 1075 "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL); 1076 KASSERT(vcache_pool != NULL); 1077 mutex_init(&vcache_lock, MUTEX_DEFAULT, IPL_NONE); 1078 cv_init(&vcache_cv, "vcache"); 1079 vcache_hashsize = desiredvnodes; 1080 vcache_hashtab = hashinit(desiredvnodes, HASH_SLIST, true, 1081 &vcache_hashmask); 1082 } 1083 1084 static void 1085 vcache_reinit(void) 1086 { 1087 int i; 1088 uint32_t hash; 1089 u_long oldmask, newmask; 1090 struct hashhead *oldtab, *newtab; 1091 vnode_impl_t *vip; 1092 1093 newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask); 1094 mutex_enter(&vcache_lock); 1095 oldtab = vcache_hashtab; 1096 oldmask = vcache_hashmask; 1097 vcache_hashsize = desiredvnodes; 1098 vcache_hashtab = newtab; 1099 vcache_hashmask = newmask; 1100 for (i = 0; i <= oldmask; i++) { 1101 while ((vip = SLIST_FIRST(&oldtab[i])) != NULL) { 1102 SLIST_REMOVE(&oldtab[i], vip, vnode_impl, vi_hash); 1103 hash = vcache_hash(&vip->vi_key); 1104 SLIST_INSERT_HEAD(&newtab[hash & vcache_hashmask], 1105 vip, vi_hash); 1106 } 1107 } 1108 mutex_exit(&vcache_lock); 1109 hashdone(oldtab, HASH_SLIST, oldmask); 1110 } 
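/*
 * Illustrative sketch of how the cache key and hash are used.  This is
 * hypothetical file system code ("ino" and its type are assumptions, not
 * part of this file); real file systems go through vcache_get() below,
 * which performs the same steps:
 *
 *	struct vcache_key key;
 *	vnode_impl_t *vip;
 *	uint32_t hash;
 *	ino_t ino = ...;
 *
 *	key.vk_mount = mp;
 *	key.vk_key = &ino;
 *	key.vk_key_len = sizeof(ino);
 *	hash = vcache_hash(&key);
 *
 *	mutex_enter(&vcache_lock);
 *	vip = vcache_hash_lookup(&key, hash);
 *	mutex_exit(&vcache_lock);
 */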
1111 1112 static inline vnode_impl_t * 1113 vcache_hash_lookup(const struct vcache_key *key, uint32_t hash) 1114 { 1115 struct hashhead *hashp; 1116 vnode_impl_t *vip; 1117 1118 KASSERT(mutex_owned(&vcache_lock)); 1119 1120 hashp = &vcache_hashtab[hash & vcache_hashmask]; 1121 SLIST_FOREACH(vip, hashp, vi_hash) { 1122 if (key->vk_mount != vip->vi_key.vk_mount) 1123 continue; 1124 if (key->vk_key_len != vip->vi_key.vk_key_len) 1125 continue; 1126 if (memcmp(key->vk_key, vip->vi_key.vk_key, key->vk_key_len)) 1127 continue; 1128 return vip; 1129 } 1130 return NULL; 1131 } 1132 1133 /* 1134 * Allocate a new, uninitialized vcache node. 1135 */ 1136 static vnode_impl_t * 1137 vcache_alloc(void) 1138 { 1139 vnode_impl_t *vip; 1140 vnode_t *vp; 1141 1142 vip = pool_cache_get(vcache_pool, PR_WAITOK); 1143 memset(vip, 0, sizeof(*vip)); 1144 1145 vip->vi_lock = rw_obj_alloc(); 1146 /* SLIST_INIT(&vip->vi_hash); */ 1147 TAILQ_INIT(&vip->vi_nclist); 1148 /* LIST_INIT(&vip->vi_dnclist); */ 1149 1150 vp = VIMPL_TO_VNODE(vip); 1151 uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0); 1152 cv_init(&vp->v_cv, "vnode"); 1153 1154 vp->v_usecount = 1; 1155 vp->v_type = VNON; 1156 vp->v_size = vp->v_writesize = VSIZENOTSET; 1157 1158 vip->vi_state = VS_LOADING; 1159 1160 lru_requeue(vp, &lru_list[LRU_FREE]); 1161 1162 return vip; 1163 } 1164 1165 /* 1166 * Deallocate a vcache node in state VS_LOADING. 1167 * 1168 * vcache_lock held on entry and released on return. 1169 */ 1170 static void 1171 vcache_dealloc(vnode_impl_t *vip) 1172 { 1173 vnode_t *vp; 1174 1175 KASSERT(mutex_owned(&vcache_lock)); 1176 1177 vp = VIMPL_TO_VNODE(vip); 1178 vfs_ref(dead_rootmount); 1179 vfs_insmntque(vp, dead_rootmount); 1180 mutex_enter(vp->v_interlock); 1181 vp->v_op = dead_vnodeop_p; 1182 VSTATE_CHANGE(vp, VS_LOADING, VS_RECLAIMED); 1183 mutex_exit(&vcache_lock); 1184 vrelel(vp, 0, LK_NONE); 1185 } 1186 1187 /* 1188 * Free an unused, unreferenced vcache node. 1189 * v_interlock locked on entry. 1190 */ 1191 static void 1192 vcache_free(vnode_impl_t *vip) 1193 { 1194 vnode_t *vp; 1195 1196 vp = VIMPL_TO_VNODE(vip); 1197 KASSERT(mutex_owned(vp->v_interlock)); 1198 1199 KASSERT(vp->v_usecount == 0); 1200 KASSERT(vp->v_holdcnt == 0); 1201 KASSERT(vp->v_writecount == 0); 1202 lru_requeue(vp, NULL); 1203 mutex_exit(vp->v_interlock); 1204 1205 vfs_insmntque(vp, NULL); 1206 if (vp->v_type == VBLK || vp->v_type == VCHR) 1207 spec_node_destroy(vp); 1208 1209 rw_obj_free(vip->vi_lock); 1210 uvm_obj_destroy(&vp->v_uobj, true); 1211 cv_destroy(&vp->v_cv); 1212 pool_cache_put(vcache_pool, vip); 1213 } 1214 1215 /* 1216 * Try to get an initial reference on this cached vnode. 1217 * Returns zero on success, ENOENT if the vnode has been reclaimed and 1218 * EBUSY if the vnode state is unstable. 1219 * 1220 * v_interlock locked on entry and unlocked on exit. 1221 */ 1222 int 1223 vcache_tryvget(vnode_t *vp) 1224 { 1225 int error = 0; 1226 1227 KASSERT(mutex_owned(vp->v_interlock)); 1228 1229 if (__predict_false(VSTATE_GET(vp) == VS_RECLAIMED)) 1230 error = ENOENT; 1231 else if (__predict_false(VSTATE_GET(vp) != VS_LOADED)) 1232 error = EBUSY; 1233 else 1234 vp->v_usecount++; 1235 1236 mutex_exit(vp->v_interlock); 1237 1238 return error; 1239 } 1240 1241 /* 1242 * Try to get an initial reference on this cached vnode. 1243 * Returns zero on success and ENOENT if the vnode has been reclaimed. 1244 * Will wait for the vnode state to be stable. 1245 * 1246 * v_interlock locked on entry and unlocked on exit. 
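 * An illustrative caller pattern (this mirrors the use in vcache_get()
 * below; it is a sketch, not an additional interface):
 *
 *	mutex_enter(vp->v_interlock);
 *	error = vcache_vget(vp);	releases v_interlock either way
 *	if (error == ENOENT)
 *		the vnode was reclaimed, redo the lookup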
1247 */ 1248 int 1249 vcache_vget(vnode_t *vp) 1250 { 1251 1252 KASSERT(mutex_owned(vp->v_interlock)); 1253 1254 /* Increment hold count to prevent vnode from disappearing. */ 1255 vp->v_holdcnt++; 1256 VSTATE_WAIT_STABLE(vp); 1257 vp->v_holdcnt--; 1258 1259 /* If this was the last reference to a reclaimed vnode free it now. */ 1260 if (__predict_false(VSTATE_GET(vp) == VS_RECLAIMED)) { 1261 if (vp->v_holdcnt == 0 && vp->v_usecount == 0) 1262 vcache_free(VNODE_TO_VIMPL(vp)); 1263 else 1264 mutex_exit(vp->v_interlock); 1265 return ENOENT; 1266 } 1267 VSTATE_ASSERT(vp, VS_LOADED); 1268 vp->v_usecount++; 1269 mutex_exit(vp->v_interlock); 1270 1271 return 0; 1272 } 1273 1274 /* 1275 * Get a vnode / fs node pair by key and return it referenced through vpp. 1276 */ 1277 int 1278 vcache_get(struct mount *mp, const void *key, size_t key_len, 1279 struct vnode **vpp) 1280 { 1281 int error; 1282 uint32_t hash; 1283 const void *new_key; 1284 struct vnode *vp; 1285 struct vcache_key vcache_key; 1286 vnode_impl_t *vip, *new_vip; 1287 1288 new_key = NULL; 1289 *vpp = NULL; 1290 1291 vcache_key.vk_mount = mp; 1292 vcache_key.vk_key = key; 1293 vcache_key.vk_key_len = key_len; 1294 hash = vcache_hash(&vcache_key); 1295 1296 again: 1297 mutex_enter(&vcache_lock); 1298 vip = vcache_hash_lookup(&vcache_key, hash); 1299 1300 /* If found, take a reference or retry. */ 1301 if (__predict_true(vip != NULL)) { 1302 /* 1303 * If the vnode is loading we cannot take the v_interlock 1304 * here as it might change during load (see uvm_obj_setlock()). 1305 * As changing state from VS_LOADING requires both vcache_lock 1306 * and v_interlock it is safe to test with vcache_lock held. 1307 * 1308 * Wait for vnodes changing state from VS_LOADING and retry. 1309 */ 1310 if (__predict_false(vip->vi_state == VS_LOADING)) { 1311 cv_wait(&vcache_cv, &vcache_lock); 1312 mutex_exit(&vcache_lock); 1313 goto again; 1314 } 1315 vp = VIMPL_TO_VNODE(vip); 1316 mutex_enter(vp->v_interlock); 1317 mutex_exit(&vcache_lock); 1318 error = vcache_vget(vp); 1319 if (error == ENOENT) 1320 goto again; 1321 if (error == 0) 1322 *vpp = vp; 1323 KASSERT((error != 0) == (*vpp == NULL)); 1324 return error; 1325 } 1326 mutex_exit(&vcache_lock); 1327 1328 /* Allocate and initialize a new vcache / vnode pair. */ 1329 error = vfs_busy(mp); 1330 if (error) 1331 return error; 1332 new_vip = vcache_alloc(); 1333 new_vip->vi_key = vcache_key; 1334 vp = VIMPL_TO_VNODE(new_vip); 1335 mutex_enter(&vcache_lock); 1336 vip = vcache_hash_lookup(&vcache_key, hash); 1337 if (vip == NULL) { 1338 SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask], 1339 new_vip, vi_hash); 1340 vip = new_vip; 1341 } 1342 1343 /* If another thread beat us inserting this node, retry. */ 1344 if (vip != new_vip) { 1345 vcache_dealloc(new_vip); 1346 vfs_unbusy(mp); 1347 goto again; 1348 } 1349 mutex_exit(&vcache_lock); 1350 1351 /* Load the fs node. Exclusive as new_node is VS_LOADING. 
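 * Other threads that find this node in the hash table will see
 * VS_LOADING and wait on vcache_cv (see the lookup loop above), so the
 * file system can initialise the vnode without further locking here.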
*/ 1352 error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key); 1353 if (error) { 1354 mutex_enter(&vcache_lock); 1355 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask], 1356 new_vip, vnode_impl, vi_hash); 1357 vcache_dealloc(new_vip); 1358 vfs_unbusy(mp); 1359 KASSERT(*vpp == NULL); 1360 return error; 1361 } 1362 KASSERT(new_key != NULL); 1363 KASSERT(memcmp(key, new_key, key_len) == 0); 1364 KASSERT(vp->v_op != NULL); 1365 vfs_insmntque(vp, mp); 1366 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0) 1367 vp->v_vflag |= VV_MPSAFE; 1368 vfs_ref(mp); 1369 vfs_unbusy(mp); 1370 1371 /* Finished loading, finalize node. */ 1372 mutex_enter(&vcache_lock); 1373 new_vip->vi_key.vk_key = new_key; 1374 mutex_enter(vp->v_interlock); 1375 VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED); 1376 mutex_exit(vp->v_interlock); 1377 mutex_exit(&vcache_lock); 1378 *vpp = vp; 1379 return 0; 1380 } 1381 1382 /* 1383 * Create a new vnode / fs node pair and return it referenced through vpp. 1384 */ 1385 int 1386 vcache_new(struct mount *mp, struct vnode *dvp, struct vattr *vap, 1387 kauth_cred_t cred, void *extra, struct vnode **vpp) 1388 { 1389 int error; 1390 uint32_t hash; 1391 struct vnode *vp, *ovp; 1392 vnode_impl_t *vip, *ovip; 1393 1394 *vpp = NULL; 1395 1396 /* Allocate and initialize a new vcache / vnode pair. */ 1397 error = vfs_busy(mp); 1398 if (error) 1399 return error; 1400 vip = vcache_alloc(); 1401 vip->vi_key.vk_mount = mp; 1402 vp = VIMPL_TO_VNODE(vip); 1403 1404 /* Create and load the fs node. */ 1405 error = VFS_NEWVNODE(mp, dvp, vp, vap, cred, extra, 1406 &vip->vi_key.vk_key_len, &vip->vi_key.vk_key); 1407 if (error) { 1408 mutex_enter(&vcache_lock); 1409 vcache_dealloc(vip); 1410 vfs_unbusy(mp); 1411 KASSERT(*vpp == NULL); 1412 return error; 1413 } 1414 KASSERT(vp->v_op != NULL); 1415 KASSERT((vip->vi_key.vk_key_len == 0) == (mp == dead_rootmount)); 1416 if (vip->vi_key.vk_key_len > 0) { 1417 KASSERT(vip->vi_key.vk_key != NULL); 1418 hash = vcache_hash(&vip->vi_key); 1419 1420 /* 1421 * Wait for previous instance to be reclaimed, 1422 * then insert new node. 1423 */ 1424 mutex_enter(&vcache_lock); 1425 while ((ovip = vcache_hash_lookup(&vip->vi_key, hash))) { 1426 ovp = VIMPL_TO_VNODE(ovip); 1427 mutex_enter(ovp->v_interlock); 1428 mutex_exit(&vcache_lock); 1429 error = vcache_vget(ovp); 1430 KASSERT(error == ENOENT); 1431 mutex_enter(&vcache_lock); 1432 } 1433 SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask], 1434 vip, vi_hash); 1435 mutex_exit(&vcache_lock); 1436 } 1437 vfs_insmntque(vp, mp); 1438 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0) 1439 vp->v_vflag |= VV_MPSAFE; 1440 vfs_ref(mp); 1441 vfs_unbusy(mp); 1442 1443 /* Finished loading, finalize node. */ 1444 mutex_enter(&vcache_lock); 1445 mutex_enter(vp->v_interlock); 1446 VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED); 1447 mutex_exit(&vcache_lock); 1448 mutex_exit(vp->v_interlock); 1449 *vpp = vp; 1450 return 0; 1451 } 1452 1453 /* 1454 * Prepare key change: update old cache nodes key and lock new cache node. 1455 * Return an error if the new node already exists. 
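 * Illustrative sketch of the rekey protocol as a file system might drive
 * it (the caller and its error handling are assumptions, not part of
 * this file):
 *
 *	error = vcache_rekey_enter(mp, vp, &old, sizeof(old),
 *	    &new, sizeof(new));
 *	if (error)
 *		return error;		the new key is already cached
 *	...update the key in the file system's own metadata...
 *	vcache_rekey_exit(mp, vp, &old, sizeof(old), &new, sizeof(new));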
1456 */ 1457 int 1458 vcache_rekey_enter(struct mount *mp, struct vnode *vp, 1459 const void *old_key, size_t old_key_len, 1460 const void *new_key, size_t new_key_len) 1461 { 1462 uint32_t old_hash, new_hash; 1463 struct vcache_key old_vcache_key, new_vcache_key; 1464 vnode_impl_t *vip, *new_vip; 1465 1466 old_vcache_key.vk_mount = mp; 1467 old_vcache_key.vk_key = old_key; 1468 old_vcache_key.vk_key_len = old_key_len; 1469 old_hash = vcache_hash(&old_vcache_key); 1470 1471 new_vcache_key.vk_mount = mp; 1472 new_vcache_key.vk_key = new_key; 1473 new_vcache_key.vk_key_len = new_key_len; 1474 new_hash = vcache_hash(&new_vcache_key); 1475 1476 new_vip = vcache_alloc(); 1477 new_vip->vi_key = new_vcache_key; 1478 1479 /* Insert locked new node used as placeholder. */ 1480 mutex_enter(&vcache_lock); 1481 vip = vcache_hash_lookup(&new_vcache_key, new_hash); 1482 if (vip != NULL) { 1483 vcache_dealloc(new_vip); 1484 return EEXIST; 1485 } 1486 SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask], 1487 new_vip, vi_hash); 1488 1489 /* Replace old nodes key with the temporary copy. */ 1490 vip = vcache_hash_lookup(&old_vcache_key, old_hash); 1491 KASSERT(vip != NULL); 1492 KASSERT(VIMPL_TO_VNODE(vip) == vp); 1493 KASSERT(vip->vi_key.vk_key != old_vcache_key.vk_key); 1494 vip->vi_key = old_vcache_key; 1495 mutex_exit(&vcache_lock); 1496 return 0; 1497 } 1498 1499 /* 1500 * Key change complete: update old node and remove placeholder. 1501 */ 1502 void 1503 vcache_rekey_exit(struct mount *mp, struct vnode *vp, 1504 const void *old_key, size_t old_key_len, 1505 const void *new_key, size_t new_key_len) 1506 { 1507 uint32_t old_hash, new_hash; 1508 struct vcache_key old_vcache_key, new_vcache_key; 1509 vnode_impl_t *vip, *new_vip; 1510 struct vnode *new_vp; 1511 1512 old_vcache_key.vk_mount = mp; 1513 old_vcache_key.vk_key = old_key; 1514 old_vcache_key.vk_key_len = old_key_len; 1515 old_hash = vcache_hash(&old_vcache_key); 1516 1517 new_vcache_key.vk_mount = mp; 1518 new_vcache_key.vk_key = new_key; 1519 new_vcache_key.vk_key_len = new_key_len; 1520 new_hash = vcache_hash(&new_vcache_key); 1521 1522 mutex_enter(&vcache_lock); 1523 1524 /* Lookup old and new node. */ 1525 vip = vcache_hash_lookup(&old_vcache_key, old_hash); 1526 KASSERT(vip != NULL); 1527 KASSERT(VIMPL_TO_VNODE(vip) == vp); 1528 1529 new_vip = vcache_hash_lookup(&new_vcache_key, new_hash); 1530 KASSERT(new_vip != NULL); 1531 KASSERT(new_vip->vi_key.vk_key_len == new_key_len); 1532 new_vp = VIMPL_TO_VNODE(new_vip); 1533 mutex_enter(new_vp->v_interlock); 1534 VSTATE_ASSERT(VIMPL_TO_VNODE(new_vip), VS_LOADING); 1535 mutex_exit(new_vp->v_interlock); 1536 1537 /* Rekey old node and put it onto its new hashlist. */ 1538 vip->vi_key = new_vcache_key; 1539 if (old_hash != new_hash) { 1540 SLIST_REMOVE(&vcache_hashtab[old_hash & vcache_hashmask], 1541 vip, vnode_impl, vi_hash); 1542 SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask], 1543 vip, vi_hash); 1544 } 1545 1546 /* Remove new node used as placeholder. */ 1547 SLIST_REMOVE(&vcache_hashtab[new_hash & vcache_hashmask], 1548 new_vip, vnode_impl, vi_hash); 1549 vcache_dealloc(new_vip); 1550 } 1551 1552 /* 1553 * Disassociate the underlying file system from a vnode. 1554 * 1555 * Must be called with vnode locked and will return unlocked. 1556 * Must be called with the interlock held, and will return with it held. 
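 * An illustrative sketch of a typical caller (this mirrors vgone()
 * above; it is not an additional interface):
 *
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	mutex_enter(vp->v_interlock);
 *	VSTATE_WAIT_STABLE(vp);
 *	if (VSTATE_GET(vp) == VS_LOADED)
 *		vcache_reclaim(vp);	vnode lock is released in here
 *	VSTATE_ASSERT(vp, VS_RECLAIMED);
 *	vrelel(vp, 0, ...);		LK_NONE if reclaimed above,
 *					LK_EXCLUSIVE otherwise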
1557 */ 1558 static void 1559 vcache_reclaim(vnode_t *vp) 1560 { 1561 lwp_t *l = curlwp; 1562 vnode_impl_t *vip = VNODE_TO_VIMPL(vp); 1563 struct mount *mp = vp->v_mount; 1564 uint32_t hash; 1565 uint8_t temp_buf[64], *temp_key; 1566 size_t temp_key_len; 1567 bool recycle, active; 1568 int error; 1569 1570 KASSERT((vp->v_vflag & VV_LOCKSWORK) == 0 || 1571 VOP_ISLOCKED(vp) == LK_EXCLUSIVE); 1572 KASSERT(mutex_owned(vp->v_interlock)); 1573 KASSERT(vp->v_usecount != 0); 1574 1575 active = (vp->v_usecount > 1); 1576 temp_key_len = vip->vi_key.vk_key_len; 1577 /* 1578 * Prevent the vnode from being recycled or brought into use 1579 * while we clean it out. 1580 */ 1581 VSTATE_CHANGE(vp, VS_LOADED, VS_RECLAIMING); 1582 if ((vp->v_iflag & VI_EXECMAP) != 0 && vp->v_uobj.uo_npages != 0) { 1583 cpu_count(CPU_COUNT_EXECPAGES, -vp->v_uobj.uo_npages); 1584 cpu_count(CPU_COUNT_FILEPAGES, vp->v_uobj.uo_npages); 1585 } 1586 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP); 1587 mutex_exit(vp->v_interlock); 1588 1589 /* Replace the vnode key with a temporary copy. */ 1590 if (vip->vi_key.vk_key_len > sizeof(temp_buf)) { 1591 temp_key = kmem_alloc(temp_key_len, KM_SLEEP); 1592 } else { 1593 temp_key = temp_buf; 1594 } 1595 if (vip->vi_key.vk_key_len > 0) { 1596 mutex_enter(&vcache_lock); 1597 memcpy(temp_key, vip->vi_key.vk_key, temp_key_len); 1598 vip->vi_key.vk_key = temp_key; 1599 mutex_exit(&vcache_lock); 1600 } 1601 1602 fstrans_start(mp); 1603 1604 /* 1605 * Clean out any cached data associated with the vnode. 1606 * If purging an active vnode, it must be closed and 1607 * deactivated before being reclaimed. 1608 */ 1609 error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0); 1610 if (error != 0) { 1611 if (wapbl_vphaswapbl(vp)) 1612 WAPBL_DISCARD(wapbl_vptomp(vp)); 1613 error = vinvalbuf(vp, 0, NOCRED, l, 0, 0); 1614 } 1615 KASSERTMSG((error == 0), "vinvalbuf failed: %d", error); 1616 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); 1617 if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) { 1618 spec_node_revoke(vp); 1619 } 1620 1621 /* 1622 * Disassociate the underlying file system from the vnode. 1623 * VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks 1624 * the vnode, and may destroy the vnode so that VOP_UNLOCK 1625 * would no longer function. 1626 */ 1627 VOP_INACTIVE(vp, &recycle); 1628 KASSERT((vp->v_vflag & VV_LOCKSWORK) == 0 || 1629 VOP_ISLOCKED(vp) == LK_EXCLUSIVE); 1630 if (VOP_RECLAIM(vp)) { 1631 vnpanic(vp, "%s: cannot reclaim", __func__); 1632 } 1633 1634 KASSERT(vp->v_data == NULL); 1635 KASSERT(vp->v_uobj.uo_npages == 0); 1636 1637 if (vp->v_type == VREG && vp->v_ractx != NULL) { 1638 uvm_ra_freectx(vp->v_ractx); 1639 vp->v_ractx = NULL; 1640 } 1641 1642 /* Purge name cache. */ 1643 cache_purge(vp); 1644 1645 if (vip->vi_key.vk_key_len > 0) { 1646 /* Remove from vnode cache. */ 1647 hash = vcache_hash(&vip->vi_key); 1648 mutex_enter(&vcache_lock); 1649 KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash)); 1650 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask], 1651 vip, vnode_impl, vi_hash); 1652 mutex_exit(&vcache_lock); 1653 } 1654 if (temp_key != temp_buf) 1655 kmem_free(temp_key, temp_key_len); 1656 1657 /* Done with purge, notify sleepers of the grim news. */ 1658 mutex_enter(vp->v_interlock); 1659 vp->v_op = dead_vnodeop_p; 1660 vp->v_vflag |= VV_LOCKSWORK; 1661 VSTATE_CHANGE(vp, VS_RECLAIMING, VS_RECLAIMED); 1662 vp->v_tag = VT_NON; 1663 KNOTE(&vp->v_klist, NOTE_REVOKE); 1664 mutex_exit(vp->v_interlock); 1665 1666 /* 1667 * Move to dead mount. 
Must be after changing the operations 1668 * vector as vnode operations enter the mount before using the 1669 * operations vector. See sys/kern/vnode_if.c. 1670 */ 1671 vp->v_vflag &= ~VV_ROOT; 1672 vfs_ref(dead_rootmount); 1673 vfs_insmntque(vp, dead_rootmount); 1674 1675 mutex_enter(vp->v_interlock); 1676 fstrans_done(mp); 1677 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); 1678 } 1679 1680 /* 1681 * Disassociate the underlying file system from an open device vnode 1682 * and make it anonymous. 1683 * 1684 * Vnode unlocked on entry, drops a reference to the vnode. 1685 */ 1686 void 1687 vcache_make_anon(vnode_t *vp) 1688 { 1689 vnode_impl_t *vip = VNODE_TO_VIMPL(vp); 1690 uint32_t hash; 1691 bool recycle; 1692 1693 KASSERT(vp->v_type == VBLK || vp->v_type == VCHR); 1694 KASSERT(vp->v_mount == dead_rootmount || fstrans_is_owner(vp->v_mount)); 1695 VSTATE_ASSERT_UNLOCKED(vp, VS_ACTIVE); 1696 1697 /* Remove from vnode cache. */ 1698 hash = vcache_hash(&vip->vi_key); 1699 mutex_enter(&vcache_lock); 1700 KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash)); 1701 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask], 1702 vip, vnode_impl, vi_hash); 1703 vip->vi_key.vk_mount = dead_rootmount; 1704 vip->vi_key.vk_key_len = 0; 1705 vip->vi_key.vk_key = NULL; 1706 mutex_exit(&vcache_lock); 1707 1708 /* 1709 * Disassociate the underlying file system from the vnode. 1710 * VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks 1711 * the vnode, and may destroy the vnode so that VOP_UNLOCK 1712 * would no longer function. 1713 */ 1714 if (vn_lock(vp, LK_EXCLUSIVE)) { 1715 vnpanic(vp, "%s: cannot lock", __func__); 1716 } 1717 VOP_INACTIVE(vp, &recycle); 1718 KASSERT((vp->v_vflag & VV_LOCKSWORK) == 0 || 1719 VOP_ISLOCKED(vp) == LK_EXCLUSIVE); 1720 if (VOP_RECLAIM(vp)) { 1721 vnpanic(vp, "%s: cannot reclaim", __func__); 1722 } 1723 1724 /* Purge name cache. */ 1725 cache_purge(vp); 1726 1727 /* Done with purge, change operations vector. */ 1728 mutex_enter(vp->v_interlock); 1729 vp->v_op = spec_vnodeop_p; 1730 vp->v_vflag |= VV_MPSAFE; 1731 vp->v_vflag &= ~VV_LOCKSWORK; 1732 mutex_exit(vp->v_interlock); 1733 1734 /* 1735 * Move to dead mount. Must be after changing the operations 1736 * vector as vnode operations enter the mount before using the 1737 * operations vector. See sys/kern/vnode_if.c. 1738 */ 1739 vfs_ref(dead_rootmount); 1740 vfs_insmntque(vp, dead_rootmount); 1741 1742 vrele(vp); 1743 } 1744 1745 /* 1746 * Update outstanding I/O count and do wakeup if requested. 1747 */ 1748 void 1749 vwakeup(struct buf *bp) 1750 { 1751 vnode_t *vp; 1752 1753 if ((vp = bp->b_vp) == NULL) 1754 return; 1755 1756 KASSERT(bp->b_objlock == vp->v_interlock); 1757 KASSERT(mutex_owned(bp->b_objlock)); 1758 1759 if (--vp->v_numoutput < 0) 1760 vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp); 1761 if (vp->v_numoutput == 0) 1762 cv_broadcast(&vp->v_cv); 1763 } 1764 1765 /* 1766 * Test a vnode for being or becoming dead. Returns one of: 1767 * EBUSY: vnode is becoming dead, with "flags == VDEAD_NOWAIT" only. 1768 * ENOENT: vnode is dead. 1769 * 0: otherwise. 1770 * 1771 * Whenever this function returns a non-zero value all future 1772 * calls will also return a non-zero value. 1773 */ 1774 int 1775 vdead_check(struct vnode *vp, int flags) 1776 { 1777 1778 KASSERT(mutex_owned(vp->v_interlock)); 1779 1780 if (! 
ISSET(flags, VDEAD_NOWAIT)) 1781 VSTATE_WAIT_STABLE(vp); 1782 1783 if (VSTATE_GET(vp) == VS_RECLAIMING) { 1784 KASSERT(ISSET(flags, VDEAD_NOWAIT)); 1785 return EBUSY; 1786 } else if (VSTATE_GET(vp) == VS_RECLAIMED) { 1787 return ENOENT; 1788 } 1789 1790 return 0; 1791 } 1792 1793 int 1794 vfs_drainvnodes(void) 1795 { 1796 int i, gen; 1797 1798 mutex_enter(&vdrain_lock); 1799 for (i = 0; i < 2; i++) { 1800 gen = vdrain_gen; 1801 while (gen == vdrain_gen) { 1802 cv_broadcast(&vdrain_cv); 1803 cv_wait(&vdrain_gen_cv, &vdrain_lock); 1804 } 1805 } 1806 mutex_exit(&vdrain_lock); 1807 1808 if (numvnodes >= desiredvnodes) 1809 return EBUSY; 1810 1811 if (vcache_hashsize != desiredvnodes) 1812 vcache_reinit(); 1813 1814 return 0; 1815 } 1816 1817 void 1818 vnpanic(vnode_t *vp, const char *fmt, ...) 1819 { 1820 va_list ap; 1821 1822 #ifdef DIAGNOSTIC 1823 vprint(NULL, vp); 1824 #endif 1825 va_start(ap, fmt); 1826 vpanic(fmt, ap); 1827 va_end(ap); 1828 } 1829
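/*
 * Illustrative end-to-end sketch of how a file system typically obtains
 * and releases a vnode through this subsystem.  This is hypothetical
 * code ("example_fs_lookup_ino", "ino" and its type are assumptions,
 * not part of this file); VFS_LOADVNODE() is the hook through which
 * vcache_get() asks the file system to initialise the new vnode:
 *
 *	int
 *	example_fs_lookup_ino(struct mount *mp, ino_t ino, struct vnode **vpp)
 *	{
 *		int error;
 *
 *		error = vcache_get(mp, &ino, sizeof(ino), vpp);
 *		if (error)
 *			return error;
 *		...VOP_*() operations on *vpp, taking the vnode lock
 *		   with vn_lock() as needed...
 *		vrele(*vpp);
 *		return 0;
 *	}
 *
 * Newly created files go through vcache_new() instead, which calls the
 * file system's VFS_NEWVNODE() hook.
 */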