1 /* $NetBSD: vfs_vnode.c,v 1.156 2024/12/07 02:27:38 riastradh Exp $ */ 2 3 /*- 4 * Copyright (c) 1997-2011, 2019, 2020 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, 9 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33 /* 34 * Copyright (c) 1989, 1993 35 * The Regents of the University of California. All rights reserved. 36 * (c) UNIX System Laboratories, Inc. 37 * All or some portions of this file are derived from material licensed 38 * to the University of California by American Telephone and Telegraph 39 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 40 * the permission of UNIX System Laboratories, Inc. 41 * 42 * Redistribution and use in source and binary forms, with or without 43 * modification, are permitted provided that the following conditions 44 * are met: 45 * 1. Redistributions of source code must retain the above copyright 46 * notice, this list of conditions and the following disclaimer. 47 * 2. Redistributions in binary form must reproduce the above copyright 48 * notice, this list of conditions and the following disclaimer in the 49 * documentation and/or other materials provided with the distribution. 50 * 3. Neither the name of the University nor the names of its contributors 51 * may be used to endorse or promote products derived from this software 52 * without specific prior written permission. 53 * 54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 57 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
67 */
68
69 /*
70 * The vnode cache subsystem.
71 *
72 * Life-cycle
73 *
74 * Normally, there are two points where new vnodes are created:
75 * VOP_CREATE(9) and VOP_LOOKUP(9). The life-cycle of a vnode
76 * starts in one of the following ways:
77 *
78 * - Allocation, via vcache_get(9) or vcache_new(9).
79 * - Reclamation of an inactive vnode, via vcache_vget(9).
80 *
81 * Recycling from a free list, via getnewvnode(9) -> getcleanvnode(9),
82 * used to be another, traditional way. Currently, only the draining
83 * thread recycles vnodes. This behaviour might be revisited.
84 *
85 * The life-cycle ends when the last reference is dropped, usually
86 * in VOP_REMOVE(9). In that case, VOP_INACTIVE(9) is called to inform
87 * the file system that the vnode is inactive. Via this call, the file
88 * system indicates whether the vnode can be recycled (usually by checking
89 * its own references, e.g. the link count, or whether the file was removed).
90 *
91 * Depending on that indication, the vnode is either put onto a free list
92 * (cached), or cleaned via vcache_reclaim, which calls VOP_RECLAIM(9) to
93 * disassociate the underlying file system from the vnode, and finally
94 * destroyed.
95 *
96 * Vnode state
97 *
98 * A vnode is always in one of six states:
99 * - MARKER This is a marker vnode to help list traversal. It
100 * will never change its state.
101 * - LOADING Vnode is associating with the underlying file system
102 * and is not yet ready to use.
103 * - LOADED Vnode is associated with the underlying file system
104 * and is ready to use.
105 * - BLOCKED Vnode is active but cannot get new references.
106 * - RECLAIMING Vnode is disassociating from the underlying file
107 * system.
108 * - RECLAIMED Vnode has disassociated from the underlying file
109 * system and is dead.
110 *
111 * Valid state changes are:
112 * LOADING -> LOADED
113 * Vnode has been initialised in vcache_get() or
114 * vcache_new() and is ready to use.
115 * BLOCKED -> RECLAIMING
116 * Vnode starts disassociation from the underlying file
117 * system in vcache_reclaim().
118 * RECLAIMING -> RECLAIMED
119 * Vnode has finished disassociation from the underlying
120 * file system in vcache_reclaim().
121 * LOADED -> BLOCKED
122 * Either vcache_rekey*() is changing the vnode key or
123 * vrelel() is about to call VOP_INACTIVE().
124 * BLOCKED -> LOADED
125 * The block condition is over.
126 * LOADING -> RECLAIMED
127 * Either vcache_get() or vcache_new() failed to
128 * associate the underlying file system, or vcache_rekey*()
129 * drops a vnode used as a placeholder.
130 *
131 * Of these states, LOADING, BLOCKED and RECLAIMING are intermediate,
132 * and it is possible to wait for a state change.
133 *
134 * The state is protected with v_interlock, with one exception:
135 * to change from LOADING both v_interlock and vcache_lock must be held,
136 * so it is possible to check "state == LOADING" without holding
137 * v_interlock. See vcache_get() for details.
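 *
 *	As an illustration only (this mirrors the pattern used by
 *	vdead_check() and vcache_vget() below, it is not a separate
 *	interface): a caller that must not operate on a dying vnode
 *	first waits for a stable state:
 *
 *		mutex_enter(vp->v_interlock);
 *		VSTATE_WAIT_STABLE(vp);
 *		if (VSTATE_GET(vp) == VS_RECLAIMED) {
 *			mutex_exit(vp->v_interlock);
 *			return SET_ERROR(ENOENT);	// vnode is dead
 *		}
 *		// ... vnode is LOADED, v_interlock still held ...
 *		mutex_exit(vp->v_interlock);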
138 * 139 * Reference counting 140 * 141 * Vnode is considered active, if reference count (vnode_t::v_usecount) 142 * is non-zero. It is maintained using: vref(9) and vrele(9), as well 143 * as vput(9), routines. Common points holding references are e.g. 144 * file openings, current working directory, mount points, etc. 145 * 146 * v_usecount is adjusted with atomic operations, however to change 147 * from a non-zero value to zero the interlock must also be held. 148 */ 149 150 #include <sys/cdefs.h> 151 __KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.156 2024/12/07 02:27:38 riastradh Exp $"); 152 153 #ifdef _KERNEL_OPT 154 #include "opt_pax.h" 155 #endif 156 157 #include <sys/param.h> 158 #include <sys/types.h> 159 160 #include <sys/atomic.h> 161 #include <sys/buf.h> 162 #include <sys/conf.h> 163 #include <sys/device.h> 164 #include <sys/fstrans.h> 165 #include <sys/hash.h> 166 #include <sys/kauth.h> 167 #include <sys/kernel.h> 168 #include <sys/kmem.h> 169 #include <sys/module.h> 170 #include <sys/mount.h> 171 #include <sys/namei.h> 172 #include <sys/pax.h> 173 #include <sys/sdt.h> 174 #include <sys/syscallargs.h> 175 #include <sys/sysctl.h> 176 #include <sys/systm.h> 177 #include <sys/threadpool.h> 178 #include <sys/vnode_impl.h> 179 #include <sys/wapbl.h> 180 181 #include <miscfs/deadfs/deadfs.h> 182 #include <miscfs/specfs/specdev.h> 183 184 #include <uvm/uvm.h> 185 #include <uvm/uvm_readahead.h> 186 #include <uvm/uvm_stat.h> 187 188 /* Flags to vrelel. */ 189 #define VRELEL_ASYNC 0x0001 /* Always defer to vrele thread. */ 190 191 #define LRU_VRELE 0 192 #define LRU_FREE 1 193 #define LRU_HOLD 2 194 #define LRU_COUNT 3 195 196 /* 197 * There are three lru lists: one holds vnodes waiting for async release, 198 * one is for vnodes which have no buffer/page references and one for those 199 * which do (i.e. v_holdcnt is non-zero). We put the lists into a single, 200 * private cache line as vnodes migrate between them while under the same 201 * lock (vdrain_lock). 202 */ 203 204 typedef struct { 205 vnode_impl_t *li_marker; 206 } lru_iter_t; 207 208 u_int numvnodes __cacheline_aligned; 209 static vnodelst_t lru_list[LRU_COUNT] __cacheline_aligned; 210 static struct threadpool *threadpool; 211 static struct threadpool_job vdrain_job; 212 static struct threadpool_job vrele_job; 213 static kmutex_t vdrain_lock __cacheline_aligned; 214 SLIST_HEAD(hashhead, vnode_impl); 215 static kmutex_t vcache_lock __cacheline_aligned; 216 static kcondvar_t vcache_cv; 217 static u_int vcache_hashsize; 218 static u_long vcache_hashmask; 219 static struct hashhead *vcache_hashtab; 220 static pool_cache_t vcache_pool; 221 static void lru_requeue(vnode_t *, vnodelst_t *); 222 static vnodelst_t * lru_which(vnode_t *); 223 static vnode_impl_t * lru_iter_first(int, lru_iter_t *); 224 static vnode_impl_t * lru_iter_next(lru_iter_t *); 225 static void lru_iter_release(lru_iter_t *); 226 static vnode_impl_t * vcache_alloc(void); 227 static void vcache_dealloc(vnode_impl_t *); 228 static void vcache_free(vnode_impl_t *); 229 static void vcache_init(void); 230 static void vcache_reinit(void); 231 static void vcache_reclaim(vnode_t *); 232 static void vrele_deferred(vnode_impl_t *); 233 static void vrelel(vnode_t *, int, int); 234 static void vnpanic(vnode_t *, const char *, ...) 235 __printflike(2, 3); 236 static bool vdrain_one(u_int); 237 static void vdrain_task(struct threadpool_job *); 238 static void vrele_task(struct threadpool_job *); 239 240 /* Routines having to do with the management of the vnode table. 
*/ 241 242 /* 243 * The high bit of v_usecount is a gate for vcache_tryvget(). It's set 244 * only when the vnode state is LOADED. 245 * The next bit of v_usecount is a flag for vrelel(). It's set 246 * from vcache_vget() and vcache_tryvget() whenever the operation succeeds. 247 */ 248 #define VUSECOUNT_MASK 0x3fffffff 249 #define VUSECOUNT_GATE 0x80000000 250 #define VUSECOUNT_VGET 0x40000000 251 252 /* 253 * Return the current usecount of a vnode. 254 */ 255 inline int 256 vrefcnt(struct vnode *vp) 257 { 258 259 return atomic_load_relaxed(&vp->v_usecount) & VUSECOUNT_MASK; 260 } 261 262 /* Vnode state operations and diagnostics. */ 263 264 #if defined(DIAGNOSTIC) 265 266 #define VSTATE_VALID(state) \ 267 ((state) != VS_ACTIVE && (state) != VS_MARKER) 268 #define VSTATE_GET(vp) \ 269 vstate_assert_get((vp), __func__, __LINE__) 270 #define VSTATE_CHANGE(vp, from, to) \ 271 vstate_assert_change((vp), (from), (to), __func__, __LINE__) 272 #define VSTATE_WAIT_STABLE(vp) \ 273 vstate_assert_wait_stable((vp), __func__, __LINE__) 274 275 void 276 _vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line, 277 bool has_lock) 278 { 279 vnode_impl_t *vip = VNODE_TO_VIMPL(vp); 280 int refcnt = vrefcnt(vp); 281 282 if (!has_lock) { 283 enum vnode_state vstate = atomic_load_relaxed(&vip->vi_state); 284 285 if (state == VS_ACTIVE && refcnt > 0 && 286 (vstate == VS_LOADED || vstate == VS_BLOCKED)) 287 return; 288 if (vstate == state) 289 return; 290 mutex_enter((vp)->v_interlock); 291 } 292 293 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line); 294 295 if ((state == VS_ACTIVE && refcnt > 0 && 296 (vip->vi_state == VS_LOADED || vip->vi_state == VS_BLOCKED)) || 297 vip->vi_state == state) { 298 if (!has_lock) 299 mutex_exit((vp)->v_interlock); 300 return; 301 } 302 vnpanic(vp, "state is %s, usecount %d, expected %s at %s:%d", 303 vstate_name(vip->vi_state), refcnt, 304 vstate_name(state), func, line); 305 } 306 307 static enum vnode_state 308 vstate_assert_get(vnode_t *vp, const char *func, int line) 309 { 310 vnode_impl_t *vip = VNODE_TO_VIMPL(vp); 311 312 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line); 313 if (! VSTATE_VALID(vip->vi_state)) 314 vnpanic(vp, "state is %s at %s:%d", 315 vstate_name(vip->vi_state), func, line); 316 317 return vip->vi_state; 318 } 319 320 static void 321 vstate_assert_wait_stable(vnode_t *vp, const char *func, int line) 322 { 323 vnode_impl_t *vip = VNODE_TO_VIMPL(vp); 324 325 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line); 326 if (! VSTATE_VALID(vip->vi_state)) 327 vnpanic(vp, "state is %s at %s:%d", 328 vstate_name(vip->vi_state), func, line); 329 330 while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED) 331 cv_wait(&vp->v_cv, vp->v_interlock); 332 333 if (! VSTATE_VALID(vip->vi_state)) 334 vnpanic(vp, "state is %s at %s:%d", 335 vstate_name(vip->vi_state), func, line); 336 } 337 338 static void 339 vstate_assert_change(vnode_t *vp, enum vnode_state from, enum vnode_state to, 340 const char *func, int line) 341 { 342 bool gated = (atomic_load_relaxed(&vp->v_usecount) & VUSECOUNT_GATE); 343 vnode_impl_t *vip = VNODE_TO_VIMPL(vp); 344 345 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line); 346 if (from == VS_LOADING) 347 KASSERTMSG(mutex_owned(&vcache_lock), "at %s:%d", func, line); 348 349 if (! VSTATE_VALID(from)) 350 vnpanic(vp, "from is %s at %s:%d", 351 vstate_name(from), func, line); 352 if (! 
VSTATE_VALID(to)) 353 vnpanic(vp, "to is %s at %s:%d", 354 vstate_name(to), func, line); 355 if (vip->vi_state != from) 356 vnpanic(vp, "from is %s, expected %s at %s:%d\n", 357 vstate_name(vip->vi_state), vstate_name(from), func, line); 358 if ((from == VS_LOADED) != gated) 359 vnpanic(vp, "state is %s, gate %d does not match at %s:%d\n", 360 vstate_name(vip->vi_state), gated, func, line); 361 362 /* Open/close the gate for vcache_tryvget(). */ 363 if (to == VS_LOADED) { 364 membar_release(); 365 atomic_or_uint(&vp->v_usecount, VUSECOUNT_GATE); 366 } else { 367 atomic_and_uint(&vp->v_usecount, ~VUSECOUNT_GATE); 368 } 369 370 atomic_store_relaxed(&vip->vi_state, to); 371 if (from == VS_LOADING) 372 cv_broadcast(&vcache_cv); 373 if (to == VS_LOADED || to == VS_RECLAIMED) 374 cv_broadcast(&vp->v_cv); 375 } 376 377 #else /* defined(DIAGNOSTIC) */ 378 379 #define VSTATE_GET(vp) \ 380 (VNODE_TO_VIMPL((vp))->vi_state) 381 #define VSTATE_CHANGE(vp, from, to) \ 382 vstate_change((vp), (from), (to)) 383 #define VSTATE_WAIT_STABLE(vp) \ 384 vstate_wait_stable((vp)) 385 void 386 _vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line, 387 bool has_lock) 388 { 389 390 } 391 392 static void 393 vstate_wait_stable(vnode_t *vp) 394 { 395 vnode_impl_t *vip = VNODE_TO_VIMPL(vp); 396 397 while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED) 398 cv_wait(&vp->v_cv, vp->v_interlock); 399 } 400 401 static void 402 vstate_change(vnode_t *vp, enum vnode_state from, enum vnode_state to) 403 { 404 vnode_impl_t *vip = VNODE_TO_VIMPL(vp); 405 406 /* Open/close the gate for vcache_tryvget(). */ 407 if (to == VS_LOADED) { 408 membar_release(); 409 atomic_or_uint(&vp->v_usecount, VUSECOUNT_GATE); 410 } else { 411 atomic_and_uint(&vp->v_usecount, ~VUSECOUNT_GATE); 412 } 413 414 atomic_store_relaxed(&vip->vi_state, to); 415 if (from == VS_LOADING) 416 cv_broadcast(&vcache_cv); 417 if (to == VS_LOADED || to == VS_RECLAIMED) 418 cv_broadcast(&vp->v_cv); 419 } 420 421 #endif /* defined(DIAGNOSTIC) */ 422 423 void 424 vfs_vnode_sysinit(void) 425 { 426 int error __diagused, i; 427 428 dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL); 429 KASSERT(dead_rootmount != NULL); 430 dead_rootmount->mnt_iflag |= IMNT_MPSAFE; 431 432 mutex_init(&vdrain_lock, MUTEX_DEFAULT, IPL_NONE); 433 for (i = 0; i < LRU_COUNT; i++) { 434 TAILQ_INIT(&lru_list[i]); 435 } 436 vcache_init(); 437 438 error = threadpool_get(&threadpool, PRI_NONE); 439 KASSERTMSG((error == 0), "threadpool_get failed: %d", error); 440 threadpool_job_init(&vdrain_job, vdrain_task, &vdrain_lock, "vdrain"); 441 threadpool_job_init(&vrele_job, vrele_task, &vdrain_lock, "vrele"); 442 } 443 444 /* 445 * Allocate a new marker vnode. 446 */ 447 vnode_t * 448 vnalloc_marker(struct mount *mp) 449 { 450 vnode_impl_t *vip; 451 vnode_t *vp; 452 453 vip = pool_cache_get(vcache_pool, PR_WAITOK); 454 memset(vip, 0, sizeof(*vip)); 455 vp = VIMPL_TO_VNODE(vip); 456 uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1); 457 vp->v_mount = mp; 458 vp->v_type = VBAD; 459 vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); 460 klist_init(&vip->vi_klist.vk_klist); 461 vp->v_klist = &vip->vi_klist; 462 vip->vi_state = VS_MARKER; 463 464 return vp; 465 } 466 467 /* 468 * Free a marker vnode. 
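 *
 * Marker vnodes (state VS_MARKER) carry no file system state; they are
 * only used to keep a stable position in a vnode list while the list
 * lock is dropped, see the lru_iter_*() routines below for the pattern.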
469 */ 470 void 471 vnfree_marker(vnode_t *vp) 472 { 473 vnode_impl_t *vip; 474 475 vip = VNODE_TO_VIMPL(vp); 476 KASSERT(vip->vi_state == VS_MARKER); 477 mutex_obj_free(vp->v_interlock); 478 uvm_obj_destroy(&vp->v_uobj, true); 479 klist_fini(&vip->vi_klist.vk_klist); 480 pool_cache_put(vcache_pool, vip); 481 } 482 483 /* 484 * Test a vnode for being a marker vnode. 485 */ 486 bool 487 vnis_marker(vnode_t *vp) 488 { 489 490 return (VNODE_TO_VIMPL(vp)->vi_state == VS_MARKER); 491 } 492 493 /* 494 * Return the lru list this node should be on. 495 */ 496 static vnodelst_t * 497 lru_which(vnode_t *vp) 498 { 499 500 KASSERT(mutex_owned(vp->v_interlock)); 501 502 if (vp->v_holdcnt > 0) 503 return &lru_list[LRU_HOLD]; 504 else 505 return &lru_list[LRU_FREE]; 506 } 507 508 /* 509 * Put vnode to end of given list. 510 * Both the current and the new list may be NULL, used on vnode alloc/free. 511 * Adjust numvnodes and signal vdrain thread if there is work. 512 */ 513 static void 514 lru_requeue(vnode_t *vp, vnodelst_t *listhd) 515 { 516 vnode_impl_t *vip; 517 int d; 518 519 /* 520 * If the vnode is on the correct list, and was put there recently, 521 * then leave it be, thus avoiding huge cache and lock contention. 522 */ 523 vip = VNODE_TO_VIMPL(vp); 524 if (listhd == vip->vi_lrulisthd && 525 (getticks() - vip->vi_lrulisttm) < hz) { 526 return; 527 } 528 529 mutex_enter(&vdrain_lock); 530 d = 0; 531 if (vip->vi_lrulisthd != NULL) 532 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist); 533 else 534 d++; 535 vip->vi_lrulisthd = listhd; 536 vip->vi_lrulisttm = getticks(); 537 if (vip->vi_lrulisthd != NULL) 538 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist); 539 else 540 d--; 541 if (d != 0) { 542 /* 543 * Looks strange? This is not a bug. Don't store 544 * numvnodes unless there is a change - avoid false 545 * sharing on MP. 546 */ 547 numvnodes += d; 548 } 549 if (listhd == &lru_list[LRU_VRELE]) 550 threadpool_schedule_job(threadpool, &vrele_job); 551 if (d > 0 && numvnodes > desiredvnodes) 552 threadpool_schedule_job(threadpool, &vdrain_job); 553 if (d > 0 && numvnodes > desiredvnodes + desiredvnodes / 16) 554 kpause("vnfull", false, MAX(1, mstohz(10)), &vdrain_lock); 555 mutex_exit(&vdrain_lock); 556 } 557 558 /* 559 * LRU list iterator. 560 * Caller holds vdrain_lock. 
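 *
 * A minimal sketch of the intended usage (this is the pattern followed
 * by vrele_flush() and vdrain_one() below); "idx" is one of the LRU_*
 * list indices:
 *
 *	mutex_enter(&vdrain_lock);
 *	for (vip = lru_iter_first(idx, &iter); vip != NULL;
 *	    vip = lru_iter_next(&iter)) {
 *		// ... examine VIMPL_TO_VNODE(vip), may drop and
 *		// re-take vdrain_lock ...
 *	}
 *	lru_iter_release(&iter);
 *	mutex_exit(&vdrain_lock);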
561 */ 562 static vnode_impl_t * 563 lru_iter_first(int idx, lru_iter_t *iterp) 564 { 565 vnode_impl_t *marker; 566 567 KASSERT(mutex_owned(&vdrain_lock)); 568 569 mutex_exit(&vdrain_lock); 570 marker = VNODE_TO_VIMPL(vnalloc_marker(NULL)); 571 mutex_enter(&vdrain_lock); 572 marker->vi_lrulisthd = &lru_list[idx]; 573 iterp->li_marker = marker; 574 575 TAILQ_INSERT_HEAD(marker->vi_lrulisthd, marker, vi_lrulist); 576 577 return lru_iter_next(iterp); 578 } 579 580 static vnode_impl_t * 581 lru_iter_next(lru_iter_t *iter) 582 { 583 vnode_impl_t *vip, *marker; 584 vnodelst_t *listhd; 585 586 KASSERT(mutex_owned(&vdrain_lock)); 587 588 marker = iter->li_marker; 589 listhd = marker->vi_lrulisthd; 590 591 while ((vip = TAILQ_NEXT(marker, vi_lrulist))) { 592 TAILQ_REMOVE(listhd, marker, vi_lrulist); 593 TAILQ_INSERT_AFTER(listhd, vip, marker, vi_lrulist); 594 if (!vnis_marker(VIMPL_TO_VNODE(vip))) 595 break; 596 } 597 598 return vip; 599 } 600 601 static void 602 lru_iter_release(lru_iter_t *iter) 603 { 604 vnode_impl_t *marker; 605 606 KASSERT(mutex_owned(&vdrain_lock)); 607 608 marker = iter->li_marker; 609 TAILQ_REMOVE(marker->vi_lrulisthd, marker, vi_lrulist); 610 611 mutex_exit(&vdrain_lock); 612 vnfree_marker(VIMPL_TO_VNODE(marker)); 613 mutex_enter(&vdrain_lock); 614 } 615 616 /* 617 * Release deferred vrele vnodes for this mount. 618 * Called with file system suspended. 619 */ 620 void 621 vrele_flush(struct mount *mp) 622 { 623 lru_iter_t iter; 624 vnode_impl_t *vip; 625 626 KASSERT(fstrans_is_owner(mp)); 627 628 mutex_enter(&vdrain_lock); 629 for (vip = lru_iter_first(LRU_VRELE, &iter); vip != NULL; 630 vip = lru_iter_next(&iter)) { 631 if (VIMPL_TO_VNODE(vip)->v_mount != mp) 632 continue; 633 vrele_deferred(vip); 634 } 635 lru_iter_release(&iter); 636 mutex_exit(&vdrain_lock); 637 } 638 639 /* 640 * One pass through the LRU lists to keep the number of allocated 641 * vnodes below target. Returns true if target met. 642 */ 643 static bool 644 vdrain_one(u_int target) 645 { 646 int ix, lists[] = { LRU_FREE, LRU_HOLD }; 647 lru_iter_t iter; 648 vnode_impl_t *vip; 649 vnode_t *vp; 650 struct mount *mp; 651 652 KASSERT(mutex_owned(&vdrain_lock)); 653 654 for (ix = 0; ix < __arraycount(lists); ix++) { 655 for (vip = lru_iter_first(lists[ix], &iter); vip != NULL; 656 vip = lru_iter_next(&iter)) { 657 if (numvnodes < target) { 658 lru_iter_release(&iter); 659 return true; 660 } 661 662 vp = VIMPL_TO_VNODE(vip); 663 664 /* Probe usecount (unlocked). */ 665 if (vrefcnt(vp) > 0) 666 continue; 667 /* Try v_interlock -- we lock the wrong direction! */ 668 if (!mutex_tryenter(vp->v_interlock)) 669 continue; 670 /* Probe usecount and state. */ 671 if (vrefcnt(vp) > 0 || VSTATE_GET(vp) != VS_LOADED) { 672 mutex_exit(vp->v_interlock); 673 continue; 674 } 675 mutex_exit(&vdrain_lock); 676 677 mp = vp->v_mount; 678 if (fstrans_start_nowait(mp) != 0) { 679 mutex_exit(vp->v_interlock); 680 mutex_enter(&vdrain_lock); 681 continue; 682 } 683 684 if (vcache_vget(vp) == 0) { 685 if (!vrecycle(vp)) { 686 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 687 mutex_enter(vp->v_interlock); 688 vrelel(vp, 0, LK_EXCLUSIVE); 689 } 690 } 691 fstrans_done(mp); 692 693 mutex_enter(&vdrain_lock); 694 } 695 lru_iter_release(&iter); 696 } 697 698 return false; 699 } 700 701 /* 702 * threadpool task to keep the number of vnodes below desiredvnodes. 
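 *
 * The thresholds form a simple hysteresis.  For example, with
 * desiredvnodes == 16384: lru_requeue() schedules this job once
 * numvnodes exceeds 16384, allocating threads start to stall above
 * 16384 + 16384/16 == 17408, and the job keeps draining until
 * numvnodes drops below the target of 16384 - 16384/16 == 15360.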
703 */ 704 static void 705 vdrain_task(struct threadpool_job *job) 706 { 707 u_int target; 708 709 target = desiredvnodes - desiredvnodes / 16; 710 711 mutex_enter(&vdrain_lock); 712 713 while (!vdrain_one(target)) 714 kpause("vdrain", false, 1, &vdrain_lock); 715 716 threadpool_job_done(job); 717 mutex_exit(&vdrain_lock); 718 } 719 720 /* 721 * threadpool task to process asynchronous vrele. 722 */ 723 static void 724 vrele_task(struct threadpool_job *job) 725 { 726 int skipped; 727 lru_iter_t iter; 728 vnode_impl_t *vip; 729 struct mount *mp; 730 731 mutex_enter(&vdrain_lock); 732 while ((vip = lru_iter_first(LRU_VRELE, &iter)) != NULL) { 733 for (skipped = 0; vip != NULL; vip = lru_iter_next(&iter)) { 734 mp = VIMPL_TO_VNODE(vip)->v_mount; 735 if (fstrans_start_nowait(mp) == 0) { 736 vrele_deferred(vip); 737 fstrans_done(mp); 738 } else { 739 skipped++; 740 } 741 } 742 743 lru_iter_release(&iter); 744 if (skipped) { 745 kpause("vrele", false, MAX(1, mstohz(10)), 746 &vdrain_lock); 747 } 748 } 749 750 threadpool_job_done(job); 751 lru_iter_release(&iter); 752 mutex_exit(&vdrain_lock); 753 } 754 755 /* 756 * Try to drop reference on a vnode. Abort if we are releasing the 757 * last reference. Note: this _must_ succeed if not the last reference. 758 */ 759 static bool 760 vtryrele(vnode_t *vp) 761 { 762 u_int use, next; 763 764 membar_release(); 765 for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) { 766 if (__predict_false((use & VUSECOUNT_MASK) == 1)) { 767 return false; 768 } 769 KASSERT((use & VUSECOUNT_MASK) > 1); 770 next = atomic_cas_uint(&vp->v_usecount, use, use - 1); 771 if (__predict_true(next == use)) { 772 return true; 773 } 774 } 775 } 776 777 /* 778 * vput: unlock and release the reference. 779 */ 780 void 781 vput(vnode_t *vp) 782 { 783 int lktype; 784 785 /* 786 * Do an unlocked check of the usecount. If it looks like we're not 787 * about to drop the last reference, then unlock the vnode and try 788 * to drop the reference. If it ends up being the last reference 789 * after all, vrelel() can fix it all up. Most of the time this 790 * will all go to plan. 791 */ 792 if (vrefcnt(vp) > 1) { 793 VOP_UNLOCK(vp); 794 if (vtryrele(vp)) { 795 return; 796 } 797 lktype = LK_NONE; 798 } else { 799 lktype = VOP_ISLOCKED(vp); 800 KASSERT(lktype != LK_NONE); 801 } 802 mutex_enter(vp->v_interlock); 803 vrelel(vp, 0, lktype); 804 } 805 806 /* 807 * Release a vnode from the deferred list. 808 */ 809 static void 810 vrele_deferred(vnode_impl_t *vip) 811 { 812 vnode_t *vp; 813 814 KASSERT(mutex_owned(&vdrain_lock)); 815 KASSERT(vip->vi_lrulisthd == &lru_list[LRU_VRELE]); 816 817 vp = VIMPL_TO_VNODE(vip); 818 819 /* 820 * First remove the vnode from the vrele list. 821 * Put it on the last lru list, the last vrele() 822 * will put it back onto the right list before 823 * its usecount reaches zero. 824 */ 825 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist); 826 vip->vi_lrulisthd = &lru_list[LRU_HOLD]; 827 vip->vi_lrulisttm = getticks(); 828 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist); 829 830 mutex_exit(&vdrain_lock); 831 832 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 833 mutex_enter(vp->v_interlock); 834 vrelel(vp, 0, LK_EXCLUSIVE); 835 836 mutex_enter(&vdrain_lock); 837 } 838 839 /* 840 * Vnode release. If reference count drops to zero, call inactive 841 * routine and either return to freelist or free to the pool. 
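 *
 * This is the common tail of vrele(), vrele_async() and vput(); code
 * outside this file uses those wrappers instead of calling vrelel()
 * directly.  A typical reference pair (a sketch only) looks like:
 *
 *	vref(vp);		// take an additional reference
 *	// ... use vp ...
 *	vrele(vp);		// drop it; may end up in VOP_INACTIVE()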
842 */ 843 static void 844 vrelel(vnode_t *vp, int flags, int lktype) 845 { 846 const bool async = ((flags & VRELEL_ASYNC) != 0); 847 bool recycle, defer, objlock_held; 848 u_int use, next; 849 int error; 850 851 objlock_held = false; 852 853 retry: 854 KASSERT(mutex_owned(vp->v_interlock)); 855 856 if (__predict_false(vp->v_op == dead_vnodeop_p && 857 VSTATE_GET(vp) != VS_RECLAIMED)) { 858 vnpanic(vp, "dead but not clean"); 859 } 860 861 /* 862 * If not the last reference, just unlock and drop the reference count. 863 * 864 * Otherwise make sure we pass a point in time where we hold the 865 * last reference with VGET flag unset. 866 */ 867 for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) { 868 if (__predict_false((use & VUSECOUNT_MASK) > 1)) { 869 if (objlock_held) { 870 objlock_held = false; 871 rw_exit(vp->v_uobj.vmobjlock); 872 } 873 if (lktype != LK_NONE) { 874 mutex_exit(vp->v_interlock); 875 lktype = LK_NONE; 876 VOP_UNLOCK(vp); 877 mutex_enter(vp->v_interlock); 878 } 879 if (vtryrele(vp)) { 880 mutex_exit(vp->v_interlock); 881 return; 882 } 883 next = atomic_load_relaxed(&vp->v_usecount); 884 continue; 885 } 886 KASSERT((use & VUSECOUNT_MASK) == 1); 887 next = use & ~VUSECOUNT_VGET; 888 if (next != use) { 889 next = atomic_cas_uint(&vp->v_usecount, use, next); 890 } 891 if (__predict_true(next == use)) { 892 break; 893 } 894 } 895 membar_acquire(); 896 if (vrefcnt(vp) <= 0 || vp->v_writecount != 0) { 897 vnpanic(vp, "%s: bad ref count", __func__); 898 } 899 900 #ifdef DIAGNOSTIC 901 if ((vp->v_type == VBLK || vp->v_type == VCHR) && 902 vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) { 903 vprint("vrelel: missing VOP_CLOSE()", vp); 904 } 905 #endif 906 907 /* 908 * If already clean there is no need to lock, defer or 909 * deactivate this node. 910 */ 911 if (VSTATE_GET(vp) == VS_RECLAIMED) { 912 if (objlock_held) { 913 objlock_held = false; 914 rw_exit(vp->v_uobj.vmobjlock); 915 } 916 if (lktype != LK_NONE) { 917 mutex_exit(vp->v_interlock); 918 lktype = LK_NONE; 919 VOP_UNLOCK(vp); 920 mutex_enter(vp->v_interlock); 921 } 922 goto out; 923 } 924 925 /* 926 * First try to get the vnode locked for VOP_INACTIVE(). 927 * Defer vnode release to vrele task if caller requests 928 * it explicitly, is the pagedaemon or the lock failed. 929 */ 930 defer = false; 931 if ((curlwp == uvm.pagedaemon_lwp) || async) { 932 defer = true; 933 } else if (lktype == LK_SHARED) { 934 /* Excellent chance of getting, if the last ref. */ 935 error = vn_lock(vp, LK_UPGRADE | LK_RETRY | LK_NOWAIT); 936 if (error != 0) { 937 defer = true; 938 } else { 939 lktype = LK_EXCLUSIVE; 940 } 941 } else if (lktype == LK_NONE) { 942 /* Excellent chance of getting, if the last ref. */ 943 error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT); 944 if (error != 0) { 945 defer = true; 946 } else { 947 lktype = LK_EXCLUSIVE; 948 } 949 } 950 KASSERT(mutex_owned(vp->v_interlock)); 951 if (defer) { 952 /* 953 * Defer reclaim to the vrele task; it's not safe to 954 * clean it here. We donate it our last reference. 955 */ 956 if (lktype != LK_NONE) { 957 mutex_exit(vp->v_interlock); 958 VOP_UNLOCK(vp); 959 mutex_enter(vp->v_interlock); 960 } 961 lru_requeue(vp, &lru_list[LRU_VRELE]); 962 mutex_exit(vp->v_interlock); 963 return; 964 } 965 KASSERT(lktype == LK_EXCLUSIVE); 966 967 /* If the node gained another reference, retry. 
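   The VUSECOUNT_VGET bit is set by vcache_tryvget() and vcache_vget()
   whenever they succeed in taking a reference, so seeing it here means a
   new reference was granted after our earlier checks; the release has to
   start over.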
*/ 968 use = atomic_load_relaxed(&vp->v_usecount); 969 if ((use & VUSECOUNT_VGET) != 0) { 970 goto retry; 971 } 972 KASSERT((use & VUSECOUNT_MASK) == 1); 973 974 if ((vp->v_iflag & (VI_TEXT|VI_EXECMAP|VI_WRMAP)) != 0 || 975 (vp->v_vflag & VV_MAPPED) != 0) { 976 /* Take care of space accounting. */ 977 if (!objlock_held) { 978 objlock_held = true; 979 if (!rw_tryenter(vp->v_uobj.vmobjlock, RW_WRITER)) { 980 mutex_exit(vp->v_interlock); 981 rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); 982 mutex_enter(vp->v_interlock); 983 goto retry; 984 } 985 } 986 if ((vp->v_iflag & VI_EXECMAP) != 0) { 987 cpu_count(CPU_COUNT_EXECPAGES, -vp->v_uobj.uo_npages); 988 } 989 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP); 990 vp->v_vflag &= ~VV_MAPPED; 991 } 992 if (objlock_held) { 993 objlock_held = false; 994 rw_exit(vp->v_uobj.vmobjlock); 995 } 996 997 /* 998 * Deactivate the vnode, but preserve our reference across 999 * the call to VOP_INACTIVE(). 1000 * 1001 * If VOP_INACTIVE() indicates that the file has been 1002 * deleted, then recycle the vnode. 1003 * 1004 * Note that VOP_INACTIVE() will not drop the vnode lock. 1005 */ 1006 mutex_exit(vp->v_interlock); 1007 recycle = false; 1008 VOP_INACTIVE(vp, &recycle); 1009 if (!recycle) { 1010 lktype = LK_NONE; 1011 VOP_UNLOCK(vp); 1012 } 1013 mutex_enter(vp->v_interlock); 1014 1015 /* 1016 * Block new references then check again to see if a 1017 * new reference was acquired in the meantime. If 1018 * it was, restore the vnode state and try again. 1019 */ 1020 if (recycle) { 1021 VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED); 1022 use = atomic_load_relaxed(&vp->v_usecount); 1023 if ((use & VUSECOUNT_VGET) != 0) { 1024 VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED); 1025 goto retry; 1026 } 1027 KASSERT((use & VUSECOUNT_MASK) == 1); 1028 } 1029 1030 /* 1031 * Recycle the vnode if the file is now unused (unlinked). 1032 */ 1033 if (recycle) { 1034 VSTATE_ASSERT(vp, VS_BLOCKED); 1035 KASSERT(lktype == LK_EXCLUSIVE); 1036 /* vcache_reclaim drops the lock. */ 1037 lktype = LK_NONE; 1038 vcache_reclaim(vp); 1039 } 1040 KASSERT(vrefcnt(vp) > 0); 1041 KASSERT(lktype == LK_NONE); 1042 1043 out: 1044 for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) { 1045 if (__predict_false((use & VUSECOUNT_VGET) != 0 && 1046 (use & VUSECOUNT_MASK) == 1)) { 1047 /* Gained and released another reference, retry. */ 1048 goto retry; 1049 } 1050 next = atomic_cas_uint(&vp->v_usecount, use, use - 1); 1051 if (__predict_true(next == use)) { 1052 if (__predict_false((use & VUSECOUNT_MASK) != 1)) { 1053 /* Gained another reference. */ 1054 mutex_exit(vp->v_interlock); 1055 return; 1056 } 1057 break; 1058 } 1059 } 1060 membar_acquire(); 1061 1062 if (VSTATE_GET(vp) == VS_RECLAIMED && vp->v_holdcnt == 0) { 1063 /* 1064 * It's clean so destroy it. It isn't referenced 1065 * anywhere since it has been reclaimed. 1066 */ 1067 vcache_free(VNODE_TO_VIMPL(vp)); 1068 } else { 1069 /* 1070 * Otherwise, put it back onto the freelist. It 1071 * can't be destroyed while still associated with 1072 * a file system. 1073 */ 1074 lru_requeue(vp, lru_which(vp)); 1075 mutex_exit(vp->v_interlock); 1076 } 1077 } 1078 1079 void 1080 vrele(vnode_t *vp) 1081 { 1082 1083 if (vtryrele(vp)) { 1084 return; 1085 } 1086 mutex_enter(vp->v_interlock); 1087 vrelel(vp, 0, LK_NONE); 1088 } 1089 1090 /* 1091 * Asynchronous vnode release, vnode is released in different context. 
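 *
 * If this drops the last reference the vnode is queued on the LRU_VRELE
 * list and the final release (including VOP_INACTIVE() and a possible
 * reclaim) is performed later by the vrele threadpool task, so the
 * caller never has to take the vnode lock itself.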
1092 */ 1093 void 1094 vrele_async(vnode_t *vp) 1095 { 1096 1097 if (vtryrele(vp)) { 1098 return; 1099 } 1100 mutex_enter(vp->v_interlock); 1101 vrelel(vp, VRELEL_ASYNC, LK_NONE); 1102 } 1103 1104 /* 1105 * Vnode reference, where a reference is already held by some other 1106 * object (for example, a file structure). 1107 * 1108 * NB: lockless code sequences may rely on this not blocking. 1109 */ 1110 void 1111 vref(vnode_t *vp) 1112 { 1113 1114 KASSERT(vrefcnt(vp) > 0); 1115 1116 atomic_inc_uint(&vp->v_usecount); 1117 } 1118 1119 /* 1120 * Page or buffer structure gets a reference. 1121 * Called with v_interlock held. 1122 */ 1123 void 1124 vholdl(vnode_t *vp) 1125 { 1126 1127 KASSERT(mutex_owned(vp->v_interlock)); 1128 1129 if (vp->v_holdcnt++ == 0 && vrefcnt(vp) == 0) 1130 lru_requeue(vp, lru_which(vp)); 1131 } 1132 1133 /* 1134 * Page or buffer structure gets a reference. 1135 */ 1136 void 1137 vhold(vnode_t *vp) 1138 { 1139 1140 mutex_enter(vp->v_interlock); 1141 vholdl(vp); 1142 mutex_exit(vp->v_interlock); 1143 } 1144 1145 /* 1146 * Page or buffer structure frees a reference. 1147 * Called with v_interlock held. 1148 */ 1149 void 1150 holdrelel(vnode_t *vp) 1151 { 1152 1153 KASSERT(mutex_owned(vp->v_interlock)); 1154 1155 if (vp->v_holdcnt <= 0) { 1156 vnpanic(vp, "%s: holdcnt vp %p", __func__, vp); 1157 } 1158 1159 vp->v_holdcnt--; 1160 if (vp->v_holdcnt == 0 && vrefcnt(vp) == 0) 1161 lru_requeue(vp, lru_which(vp)); 1162 } 1163 1164 /* 1165 * Page or buffer structure frees a reference. 1166 */ 1167 void 1168 holdrele(vnode_t *vp) 1169 { 1170 1171 mutex_enter(vp->v_interlock); 1172 holdrelel(vp); 1173 mutex_exit(vp->v_interlock); 1174 } 1175 1176 /* 1177 * Recycle an unused vnode if caller holds the last reference. 1178 */ 1179 bool 1180 vrecycle(vnode_t *vp) 1181 { 1182 int error __diagused; 1183 1184 mutex_enter(vp->v_interlock); 1185 1186 /* If the vnode is already clean we're done. */ 1187 VSTATE_WAIT_STABLE(vp); 1188 if (VSTATE_GET(vp) != VS_LOADED) { 1189 VSTATE_ASSERT(vp, VS_RECLAIMED); 1190 vrelel(vp, 0, LK_NONE); 1191 return true; 1192 } 1193 1194 /* Prevent further references until the vnode is locked. */ 1195 VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED); 1196 1197 /* Make sure we hold the last reference. */ 1198 if (vrefcnt(vp) != 1) { 1199 VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED); 1200 mutex_exit(vp->v_interlock); 1201 return false; 1202 } 1203 1204 mutex_exit(vp->v_interlock); 1205 1206 /* 1207 * On a leaf file system this lock will always succeed as we hold 1208 * the last reference and prevent further references. 1209 * On layered file systems waiting for the lock would open a can of 1210 * deadlocks as the lower vnodes may have other active references. 1211 */ 1212 error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT); 1213 1214 mutex_enter(vp->v_interlock); 1215 if (error) { 1216 VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED); 1217 mutex_exit(vp->v_interlock); 1218 return false; 1219 } 1220 1221 KASSERT(vrefcnt(vp) == 1); 1222 vcache_reclaim(vp); 1223 vrelel(vp, 0, LK_NONE); 1224 1225 return true; 1226 } 1227 1228 /* 1229 * Helper for vrevoke() to propagate suspension from lastmp 1230 * to thismp. Both args may be NULL. 1231 * Returns the currently suspended file system or NULL. 
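 *
 * vrevoke() below drives this as a hand-over-hand pattern:
 * vrevoke_suspend_next(NULL, mp) suspends the first file system,
 * vrevoke_suspend_next(lastmp, nextmp) moves the suspension along as
 * the device aliases are walked, and vrevoke_suspend_next(mp, NULL)
 * finally resumes the last one.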
1232 */ 1233 static struct mount * 1234 vrevoke_suspend_next(struct mount *lastmp, struct mount *thismp) 1235 { 1236 int error; 1237 1238 if (lastmp == thismp) 1239 return thismp; 1240 1241 if (lastmp != NULL) 1242 vfs_resume(lastmp); 1243 1244 if (thismp == NULL) 1245 return NULL; 1246 1247 do { 1248 error = vfs_suspend(thismp, 0); 1249 } while (error == EINTR || error == ERESTART); 1250 1251 if (error == 0) 1252 return thismp; 1253 1254 KASSERT(error == EOPNOTSUPP || error == ENOENT); 1255 return NULL; 1256 } 1257 1258 /* 1259 * Eliminate all activity associated with the requested vnode 1260 * and with all vnodes aliased to the requested vnode. 1261 */ 1262 void 1263 vrevoke(vnode_t *vp) 1264 { 1265 struct mount *mp; 1266 vnode_t *vq; 1267 enum vtype type; 1268 dev_t dev; 1269 1270 KASSERT(vrefcnt(vp) > 0); 1271 1272 mp = vrevoke_suspend_next(NULL, vp->v_mount); 1273 1274 mutex_enter(vp->v_interlock); 1275 VSTATE_WAIT_STABLE(vp); 1276 if (VSTATE_GET(vp) == VS_RECLAIMED) { 1277 mutex_exit(vp->v_interlock); 1278 } else if (vp->v_type != VBLK && vp->v_type != VCHR) { 1279 atomic_inc_uint(&vp->v_usecount); 1280 mutex_exit(vp->v_interlock); 1281 vgone(vp); 1282 } else { 1283 dev = vp->v_rdev; 1284 type = vp->v_type; 1285 mutex_exit(vp->v_interlock); 1286 1287 while (spec_node_lookup_by_dev(type, dev, VDEAD_NOWAIT, &vq) 1288 == 0) { 1289 mp = vrevoke_suspend_next(mp, vq->v_mount); 1290 vgone(vq); 1291 } 1292 } 1293 vrevoke_suspend_next(mp, NULL); 1294 } 1295 1296 /* 1297 * Eliminate all activity associated with a vnode in preparation for 1298 * reuse. Drops a reference from the vnode. 1299 */ 1300 void 1301 vgone(vnode_t *vp) 1302 { 1303 int lktype; 1304 1305 KASSERT(vp->v_mount == dead_rootmount || 1306 fstrans_is_owner(vp->v_mount)); 1307 1308 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 1309 lktype = LK_EXCLUSIVE; 1310 mutex_enter(vp->v_interlock); 1311 VSTATE_WAIT_STABLE(vp); 1312 if (VSTATE_GET(vp) == VS_LOADED) { 1313 VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED); 1314 vcache_reclaim(vp); 1315 lktype = LK_NONE; 1316 } 1317 VSTATE_ASSERT(vp, VS_RECLAIMED); 1318 vrelel(vp, 0, lktype); 1319 } 1320 1321 static inline uint32_t 1322 vcache_hash(const struct vcache_key *key) 1323 { 1324 uint32_t hash = HASH32_BUF_INIT; 1325 1326 KASSERT(key->vk_key_len > 0); 1327 1328 hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash); 1329 hash = hash32_buf(key->vk_key, key->vk_key_len, hash); 1330 return hash; 1331 } 1332 1333 static int 1334 vcache_stats(struct hashstat_sysctl *hs, bool fill) 1335 { 1336 vnode_impl_t *vip; 1337 uint64_t chain; 1338 1339 strlcpy(hs->hash_name, "vcache", sizeof(hs->hash_name)); 1340 strlcpy(hs->hash_desc, "vnode cache hash", sizeof(hs->hash_desc)); 1341 if (!fill) 1342 return 0; 1343 1344 hs->hash_size = vcache_hashmask + 1; 1345 1346 for (size_t i = 0; i < hs->hash_size; i++) { 1347 chain = 0; 1348 mutex_enter(&vcache_lock); 1349 SLIST_FOREACH(vip, &vcache_hashtab[i], vi_hash) { 1350 chain++; 1351 } 1352 mutex_exit(&vcache_lock); 1353 if (chain > 0) { 1354 hs->hash_used++; 1355 hs->hash_items += chain; 1356 if (chain > hs->hash_maxchain) 1357 hs->hash_maxchain = chain; 1358 } 1359 preempt_point(); 1360 } 1361 1362 return 0; 1363 } 1364 1365 static void 1366 vcache_init(void) 1367 { 1368 1369 vcache_pool = pool_cache_init(sizeof(vnode_impl_t), coherency_unit, 1370 0, 0, "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL); 1371 KASSERT(vcache_pool != NULL); 1372 mutex_init(&vcache_lock, MUTEX_DEFAULT, IPL_NONE); 1373 cv_init(&vcache_cv, "vcache"); 1374 vcache_hashsize = 
desiredvnodes; 1375 vcache_hashtab = hashinit(desiredvnodes, HASH_SLIST, true, 1376 &vcache_hashmask); 1377 hashstat_register("vcache", vcache_stats); 1378 } 1379 1380 static void 1381 vcache_reinit(void) 1382 { 1383 int i; 1384 uint32_t hash; 1385 u_long oldmask, newmask; 1386 struct hashhead *oldtab, *newtab; 1387 vnode_impl_t *vip; 1388 1389 newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask); 1390 mutex_enter(&vcache_lock); 1391 oldtab = vcache_hashtab; 1392 oldmask = vcache_hashmask; 1393 vcache_hashsize = desiredvnodes; 1394 vcache_hashtab = newtab; 1395 vcache_hashmask = newmask; 1396 for (i = 0; i <= oldmask; i++) { 1397 while ((vip = SLIST_FIRST(&oldtab[i])) != NULL) { 1398 SLIST_REMOVE(&oldtab[i], vip, vnode_impl, vi_hash); 1399 hash = vcache_hash(&vip->vi_key); 1400 SLIST_INSERT_HEAD(&newtab[hash & vcache_hashmask], 1401 vip, vi_hash); 1402 } 1403 } 1404 mutex_exit(&vcache_lock); 1405 hashdone(oldtab, HASH_SLIST, oldmask); 1406 } 1407 1408 static inline vnode_impl_t * 1409 vcache_hash_lookup(const struct vcache_key *key, uint32_t hash) 1410 { 1411 struct hashhead *hashp; 1412 vnode_impl_t *vip; 1413 1414 KASSERT(mutex_owned(&vcache_lock)); 1415 1416 hashp = &vcache_hashtab[hash & vcache_hashmask]; 1417 SLIST_FOREACH(vip, hashp, vi_hash) { 1418 if (key->vk_mount != vip->vi_key.vk_mount) 1419 continue; 1420 if (key->vk_key_len != vip->vi_key.vk_key_len) 1421 continue; 1422 if (memcmp(key->vk_key, vip->vi_key.vk_key, key->vk_key_len)) 1423 continue; 1424 return vip; 1425 } 1426 return NULL; 1427 } 1428 1429 /* 1430 * Allocate a new, uninitialized vcache node. 1431 */ 1432 static vnode_impl_t * 1433 vcache_alloc(void) 1434 { 1435 vnode_impl_t *vip; 1436 vnode_t *vp; 1437 1438 vip = pool_cache_get(vcache_pool, PR_WAITOK); 1439 vp = VIMPL_TO_VNODE(vip); 1440 memset(vip, 0, sizeof(*vip)); 1441 1442 rw_init(&vip->vi_lock); 1443 vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); 1444 1445 uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1); 1446 klist_init(&vip->vi_klist.vk_klist); 1447 vp->v_klist = &vip->vi_klist; 1448 cv_init(&vp->v_cv, "vnode"); 1449 cache_vnode_init(vp); 1450 1451 vp->v_usecount = 1; 1452 vp->v_type = VNON; 1453 vp->v_size = vp->v_writesize = VSIZENOTSET; 1454 1455 vip->vi_state = VS_LOADING; 1456 1457 lru_requeue(vp, &lru_list[LRU_FREE]); 1458 1459 return vip; 1460 } 1461 1462 /* 1463 * Deallocate a vcache node in state VS_LOADING. 1464 * 1465 * vcache_lock held on entry and released on return. 1466 */ 1467 static void 1468 vcache_dealloc(vnode_impl_t *vip) 1469 { 1470 vnode_t *vp; 1471 1472 KASSERT(mutex_owned(&vcache_lock)); 1473 1474 vp = VIMPL_TO_VNODE(vip); 1475 vfs_ref(dead_rootmount); 1476 vfs_insmntque(vp, dead_rootmount); 1477 mutex_enter(vp->v_interlock); 1478 vp->v_op = dead_vnodeop_p; 1479 VSTATE_CHANGE(vp, VS_LOADING, VS_RECLAIMED); 1480 mutex_exit(&vcache_lock); 1481 vrelel(vp, 0, LK_NONE); 1482 } 1483 1484 /* 1485 * Free an unused, unreferenced vcache node. 1486 * v_interlock locked on entry. 
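 *
 * This is the end of the vnode life-cycle: it is reached from vrelel()
 * when the last reference to a VS_RECLAIMED vnode is dropped with
 * v_holdcnt zero, and from vcache_vget() when the last holder of a
 * reclaimed vnode gives up waiting.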
1487 */ 1488 static void 1489 vcache_free(vnode_impl_t *vip) 1490 { 1491 vnode_t *vp; 1492 1493 vp = VIMPL_TO_VNODE(vip); 1494 KASSERT(mutex_owned(vp->v_interlock)); 1495 1496 KASSERT(vrefcnt(vp) == 0); 1497 KASSERT(vp->v_holdcnt == 0); 1498 KASSERT(vp->v_writecount == 0); 1499 lru_requeue(vp, NULL); 1500 mutex_exit(vp->v_interlock); 1501 1502 vfs_insmntque(vp, NULL); 1503 if (vp->v_type == VBLK || vp->v_type == VCHR) 1504 spec_node_destroy(vp); 1505 1506 mutex_obj_free(vp->v_interlock); 1507 rw_destroy(&vip->vi_lock); 1508 uvm_obj_destroy(&vp->v_uobj, true); 1509 KASSERT(vp->v_klist == &vip->vi_klist); 1510 klist_fini(&vip->vi_klist.vk_klist); 1511 cv_destroy(&vp->v_cv); 1512 cache_vnode_fini(vp); 1513 pool_cache_put(vcache_pool, vip); 1514 } 1515 1516 /* 1517 * Try to get an initial reference on this cached vnode. 1518 * Returns zero on success or EBUSY if the vnode state is not LOADED. 1519 * 1520 * NB: lockless code sequences may rely on this not blocking. 1521 */ 1522 int 1523 vcache_tryvget(vnode_t *vp) 1524 { 1525 u_int use, next; 1526 1527 for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) { 1528 if (__predict_false((use & VUSECOUNT_GATE) == 0)) { 1529 return SET_ERROR(EBUSY); 1530 } 1531 next = atomic_cas_uint(&vp->v_usecount, 1532 use, (use + 1) | VUSECOUNT_VGET); 1533 if (__predict_true(next == use)) { 1534 membar_acquire(); 1535 return 0; 1536 } 1537 } 1538 } 1539 1540 /* 1541 * Try to get an initial reference on this cached vnode. 1542 * Returns zero on success and ENOENT if the vnode has been reclaimed. 1543 * Will wait for the vnode state to be stable. 1544 * 1545 * v_interlock locked on entry and unlocked on exit. 1546 */ 1547 int 1548 vcache_vget(vnode_t *vp) 1549 { 1550 int error; 1551 1552 KASSERT(mutex_owned(vp->v_interlock)); 1553 1554 /* Increment hold count to prevent vnode from disappearing. */ 1555 vp->v_holdcnt++; 1556 VSTATE_WAIT_STABLE(vp); 1557 vp->v_holdcnt--; 1558 1559 /* If this was the last reference to a reclaimed vnode free it now. */ 1560 if (__predict_false(VSTATE_GET(vp) == VS_RECLAIMED)) { 1561 if (vp->v_holdcnt == 0 && vrefcnt(vp) == 0) 1562 vcache_free(VNODE_TO_VIMPL(vp)); 1563 else 1564 mutex_exit(vp->v_interlock); 1565 return SET_ERROR(ENOENT); 1566 } 1567 VSTATE_ASSERT(vp, VS_LOADED); 1568 error = vcache_tryvget(vp); 1569 KASSERT(error == 0); 1570 mutex_exit(vp->v_interlock); 1571 1572 return 0; 1573 } 1574 1575 /* 1576 * Get a vnode / fs node pair by key and return it referenced through vpp. 1577 */ 1578 int 1579 vcache_get(struct mount *mp, const void *key, size_t key_len, 1580 struct vnode **vpp) 1581 { 1582 int error; 1583 uint32_t hash; 1584 const void *new_key; 1585 struct vnode *vp; 1586 struct vcache_key vcache_key; 1587 vnode_impl_t *vip, *new_vip; 1588 1589 new_key = NULL; 1590 *vpp = NULL; 1591 1592 vcache_key.vk_mount = mp; 1593 vcache_key.vk_key = key; 1594 vcache_key.vk_key_len = key_len; 1595 hash = vcache_hash(&vcache_key); 1596 1597 again: 1598 mutex_enter(&vcache_lock); 1599 vip = vcache_hash_lookup(&vcache_key, hash); 1600 1601 /* If found, take a reference or retry. */ 1602 if (__predict_true(vip != NULL)) { 1603 /* 1604 * If the vnode is loading we cannot take the v_interlock 1605 * here as it might change during load (see uvm_obj_setlock()). 1606 * As changing state from VS_LOADING requires both vcache_lock 1607 * and v_interlock it is safe to test with vcache_lock held. 1608 * 1609 * Wait for vnodes changing state from VS_LOADING and retry. 
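		 *
		 * Note that cv_wait() releases vcache_lock while it
		 * sleeps, so the hash chain may have changed by the time
		 * we wake up; the lookup is therefore restarted from
		 * scratch.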
1610 */ 1611 if (__predict_false(vip->vi_state == VS_LOADING)) { 1612 cv_wait(&vcache_cv, &vcache_lock); 1613 mutex_exit(&vcache_lock); 1614 goto again; 1615 } 1616 vp = VIMPL_TO_VNODE(vip); 1617 mutex_enter(vp->v_interlock); 1618 mutex_exit(&vcache_lock); 1619 error = vcache_vget(vp); 1620 if (error == ENOENT) 1621 goto again; 1622 if (error == 0) 1623 *vpp = vp; 1624 KASSERT((error != 0) == (*vpp == NULL)); 1625 return error; 1626 } 1627 mutex_exit(&vcache_lock); 1628 1629 /* Allocate and initialize a new vcache / vnode pair. */ 1630 error = vfs_busy(mp); 1631 if (error) 1632 return error; 1633 new_vip = vcache_alloc(); 1634 new_vip->vi_key = vcache_key; 1635 vp = VIMPL_TO_VNODE(new_vip); 1636 mutex_enter(&vcache_lock); 1637 vip = vcache_hash_lookup(&vcache_key, hash); 1638 if (vip == NULL) { 1639 SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask], 1640 new_vip, vi_hash); 1641 vip = new_vip; 1642 } 1643 1644 /* If another thread beat us inserting this node, retry. */ 1645 if (vip != new_vip) { 1646 vcache_dealloc(new_vip); 1647 vfs_unbusy(mp); 1648 goto again; 1649 } 1650 mutex_exit(&vcache_lock); 1651 1652 /* Load the fs node. Exclusive as new_node is VS_LOADING. */ 1653 error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key); 1654 if (error) { 1655 mutex_enter(&vcache_lock); 1656 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask], 1657 new_vip, vnode_impl, vi_hash); 1658 vcache_dealloc(new_vip); 1659 vfs_unbusy(mp); 1660 KASSERT(*vpp == NULL); 1661 return error; 1662 } 1663 KASSERT(new_key != NULL); 1664 KASSERT(memcmp(key, new_key, key_len) == 0); 1665 KASSERT(vp->v_op != NULL); 1666 vfs_insmntque(vp, mp); 1667 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0) 1668 vp->v_vflag |= VV_MPSAFE; 1669 vfs_ref(mp); 1670 vfs_unbusy(mp); 1671 1672 /* Finished loading, finalize node. */ 1673 mutex_enter(&vcache_lock); 1674 new_vip->vi_key.vk_key = new_key; 1675 mutex_enter(vp->v_interlock); 1676 VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED); 1677 mutex_exit(vp->v_interlock); 1678 mutex_exit(&vcache_lock); 1679 *vpp = vp; 1680 return 0; 1681 } 1682 1683 /* 1684 * Create a new vnode / fs node pair and return it referenced through vpp. 1685 */ 1686 int 1687 vcache_new(struct mount *mp, struct vnode *dvp, struct vattr *vap, 1688 kauth_cred_t cred, void *extra, struct vnode **vpp) 1689 { 1690 int error; 1691 uint32_t hash; 1692 struct vnode *vp, *ovp; 1693 vnode_impl_t *vip, *ovip; 1694 1695 *vpp = NULL; 1696 1697 /* Allocate and initialize a new vcache / vnode pair. */ 1698 error = vfs_busy(mp); 1699 if (error) 1700 return error; 1701 vip = vcache_alloc(); 1702 vip->vi_key.vk_mount = mp; 1703 vp = VIMPL_TO_VNODE(vip); 1704 1705 /* Create and load the fs node. */ 1706 error = VFS_NEWVNODE(mp, dvp, vp, vap, cred, extra, 1707 &vip->vi_key.vk_key_len, &vip->vi_key.vk_key); 1708 if (error) { 1709 mutex_enter(&vcache_lock); 1710 vcache_dealloc(vip); 1711 vfs_unbusy(mp); 1712 KASSERT(*vpp == NULL); 1713 return error; 1714 } 1715 KASSERT(vp->v_op != NULL); 1716 KASSERT((vip->vi_key.vk_key_len == 0) == (mp == dead_rootmount)); 1717 if (vip->vi_key.vk_key_len > 0) { 1718 KASSERT(vip->vi_key.vk_key != NULL); 1719 hash = vcache_hash(&vip->vi_key); 1720 1721 /* 1722 * Wait for previous instance to be reclaimed, 1723 * then insert new node. 
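		 *
		 * A node with the same key can still be in the hash table
		 * only if it is on its way out (the file system would not
		 * have handed the key out again otherwise), so the
		 * vcache_vget() below is expected to fail with ENOENT once
		 * the old node is gone; the KASSERT relies on this.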
1724 */ 1725 mutex_enter(&vcache_lock); 1726 while ((ovip = vcache_hash_lookup(&vip->vi_key, hash))) { 1727 ovp = VIMPL_TO_VNODE(ovip); 1728 mutex_enter(ovp->v_interlock); 1729 mutex_exit(&vcache_lock); 1730 error = vcache_vget(ovp); 1731 KASSERT(error == ENOENT); 1732 mutex_enter(&vcache_lock); 1733 } 1734 SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask], 1735 vip, vi_hash); 1736 mutex_exit(&vcache_lock); 1737 } 1738 vfs_insmntque(vp, mp); 1739 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0) 1740 vp->v_vflag |= VV_MPSAFE; 1741 vfs_ref(mp); 1742 vfs_unbusy(mp); 1743 1744 /* Finished loading, finalize node. */ 1745 mutex_enter(&vcache_lock); 1746 mutex_enter(vp->v_interlock); 1747 VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED); 1748 mutex_exit(&vcache_lock); 1749 mutex_exit(vp->v_interlock); 1750 *vpp = vp; 1751 return 0; 1752 } 1753 1754 /* 1755 * Prepare key change: update old cache nodes key and lock new cache node. 1756 * Return an error if the new node already exists. 1757 */ 1758 int 1759 vcache_rekey_enter(struct mount *mp, struct vnode *vp, 1760 const void *old_key, size_t old_key_len, 1761 const void *new_key, size_t new_key_len) 1762 { 1763 uint32_t old_hash, new_hash; 1764 struct vcache_key old_vcache_key, new_vcache_key; 1765 vnode_impl_t *vip, *new_vip; 1766 1767 old_vcache_key.vk_mount = mp; 1768 old_vcache_key.vk_key = old_key; 1769 old_vcache_key.vk_key_len = old_key_len; 1770 old_hash = vcache_hash(&old_vcache_key); 1771 1772 new_vcache_key.vk_mount = mp; 1773 new_vcache_key.vk_key = new_key; 1774 new_vcache_key.vk_key_len = new_key_len; 1775 new_hash = vcache_hash(&new_vcache_key); 1776 1777 new_vip = vcache_alloc(); 1778 new_vip->vi_key = new_vcache_key; 1779 1780 /* Insert locked new node used as placeholder. */ 1781 mutex_enter(&vcache_lock); 1782 vip = vcache_hash_lookup(&new_vcache_key, new_hash); 1783 if (vip != NULL) { 1784 vcache_dealloc(new_vip); 1785 return SET_ERROR(EEXIST); 1786 } 1787 SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask], 1788 new_vip, vi_hash); 1789 1790 /* Replace old nodes key with the temporary copy. */ 1791 vip = vcache_hash_lookup(&old_vcache_key, old_hash); 1792 KASSERT(vip != NULL); 1793 KASSERT(VIMPL_TO_VNODE(vip) == vp); 1794 KASSERT(vip->vi_key.vk_key != old_vcache_key.vk_key); 1795 vip->vi_key = old_vcache_key; 1796 mutex_exit(&vcache_lock); 1797 return 0; 1798 } 1799 1800 /* 1801 * Key change complete: update old node and remove placeholder. 1802 */ 1803 void 1804 vcache_rekey_exit(struct mount *mp, struct vnode *vp, 1805 const void *old_key, size_t old_key_len, 1806 const void *new_key, size_t new_key_len) 1807 { 1808 uint32_t old_hash, new_hash; 1809 struct vcache_key old_vcache_key, new_vcache_key; 1810 vnode_impl_t *vip, *new_vip; 1811 struct vnode *new_vp; 1812 1813 old_vcache_key.vk_mount = mp; 1814 old_vcache_key.vk_key = old_key; 1815 old_vcache_key.vk_key_len = old_key_len; 1816 old_hash = vcache_hash(&old_vcache_key); 1817 1818 new_vcache_key.vk_mount = mp; 1819 new_vcache_key.vk_key = new_key; 1820 new_vcache_key.vk_key_len = new_key_len; 1821 new_hash = vcache_hash(&new_vcache_key); 1822 1823 mutex_enter(&vcache_lock); 1824 1825 /* Lookup old and new node. 
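	   The old node is still filed under the temporary key copy
	   installed by vcache_rekey_enter(); the new node is the
	   VS_LOADING placeholder that was inserted under the new key.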
*/ 1826 vip = vcache_hash_lookup(&old_vcache_key, old_hash); 1827 KASSERT(vip != NULL); 1828 KASSERT(VIMPL_TO_VNODE(vip) == vp); 1829 1830 new_vip = vcache_hash_lookup(&new_vcache_key, new_hash); 1831 KASSERT(new_vip != NULL); 1832 KASSERT(new_vip->vi_key.vk_key_len == new_key_len); 1833 new_vp = VIMPL_TO_VNODE(new_vip); 1834 mutex_enter(new_vp->v_interlock); 1835 VSTATE_ASSERT(VIMPL_TO_VNODE(new_vip), VS_LOADING); 1836 mutex_exit(new_vp->v_interlock); 1837 1838 /* Rekey old node and put it onto its new hashlist. */ 1839 vip->vi_key = new_vcache_key; 1840 if (old_hash != new_hash) { 1841 SLIST_REMOVE(&vcache_hashtab[old_hash & vcache_hashmask], 1842 vip, vnode_impl, vi_hash); 1843 SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask], 1844 vip, vi_hash); 1845 } 1846 1847 /* Remove new node used as placeholder. */ 1848 SLIST_REMOVE(&vcache_hashtab[new_hash & vcache_hashmask], 1849 new_vip, vnode_impl, vi_hash); 1850 vcache_dealloc(new_vip); 1851 } 1852 1853 /* 1854 * Disassociate the underlying file system from a vnode. 1855 * 1856 * Must be called with vnode locked and will return unlocked. 1857 * Must be called with the interlock held, and will return with it held. 1858 */ 1859 static void 1860 vcache_reclaim(vnode_t *vp) 1861 { 1862 lwp_t *l = curlwp; 1863 vnode_impl_t *vip = VNODE_TO_VIMPL(vp); 1864 struct mount *mp = vp->v_mount; 1865 uint32_t hash; 1866 uint8_t temp_buf[64], *temp_key; 1867 size_t temp_key_len; 1868 bool recycle; 1869 int error; 1870 1871 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); 1872 KASSERT(mutex_owned(vp->v_interlock)); 1873 KASSERT(vrefcnt(vp) != 0); 1874 1875 temp_key_len = vip->vi_key.vk_key_len; 1876 /* 1877 * Prevent the vnode from being recycled or brought into use 1878 * while we clean it out. 1879 */ 1880 VSTATE_CHANGE(vp, VS_BLOCKED, VS_RECLAIMING); 1881 1882 /* 1883 * Send NOTE_REVOKE now, before we call VOP_RECLAIM(), 1884 * because VOP_RECLAIM() could cause vp->v_klist to 1885 * become invalid. Don't check for interest in NOTE_REVOKE 1886 * here; it's always posted because it sets EV_EOF. 1887 * 1888 * Once it's been posted, reset vp->v_klist to point to 1889 * our own local storage, in case we were sharing with 1890 * someone else. 1891 */ 1892 KNOTE(&vp->v_klist->vk_klist, NOTE_REVOKE); 1893 vp->v_klist = &vip->vi_klist; 1894 mutex_exit(vp->v_interlock); 1895 1896 rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); 1897 mutex_enter(vp->v_interlock); 1898 if ((vp->v_iflag & VI_EXECMAP) != 0) { 1899 cpu_count(CPU_COUNT_EXECPAGES, -vp->v_uobj.uo_npages); 1900 } 1901 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP); 1902 vp->v_iflag |= VI_DEADCHECK; /* for genfs_getpages() */ 1903 mutex_exit(vp->v_interlock); 1904 rw_exit(vp->v_uobj.vmobjlock); 1905 1906 /* 1907 * With vnode state set to reclaiming, purge name cache immediately 1908 * to prevent new handles on vnode, and wait for existing threads 1909 * trying to get a handle to notice VS_RECLAIMED status and abort. 1910 */ 1911 cache_purge(vp); 1912 1913 /* Replace the vnode key with a temporary copy. */ 1914 if (vip->vi_key.vk_key_len > sizeof(temp_buf)) { 1915 temp_key = kmem_alloc(temp_key_len, KM_SLEEP); 1916 } else { 1917 temp_key = temp_buf; 1918 } 1919 if (vip->vi_key.vk_key_len > 0) { 1920 mutex_enter(&vcache_lock); 1921 memcpy(temp_key, vip->vi_key.vk_key, temp_key_len); 1922 vip->vi_key.vk_key = temp_key; 1923 mutex_exit(&vcache_lock); 1924 } 1925 1926 fstrans_start(mp); 1927 1928 /* 1929 * Clean out any cached data associated with the vnode. 
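	 *
	 * Dirty buffers are flushed and saved first (V_SAVE); if that
	 * fails and the vnode carries a WAPBL journal, the journal is
	 * discarded and the buffers are thrown away instead.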
1930 */ 1931 error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0); 1932 if (error != 0) { 1933 if (wapbl_vphaswapbl(vp)) 1934 WAPBL_DISCARD(wapbl_vptomp(vp)); 1935 error = vinvalbuf(vp, 0, NOCRED, l, 0, 0); 1936 } 1937 KASSERTMSG((error == 0), "vinvalbuf failed: %d", error); 1938 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); 1939 if (vp->v_type == VBLK || vp->v_type == VCHR) { 1940 spec_node_revoke(vp); 1941 } 1942 1943 /* 1944 * Disassociate the underlying file system from the vnode. 1945 * VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks 1946 * the vnode, and may destroy the vnode so that VOP_UNLOCK 1947 * would no longer function. 1948 */ 1949 VOP_INACTIVE(vp, &recycle); 1950 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); 1951 if (VOP_RECLAIM(vp)) { 1952 vnpanic(vp, "%s: cannot reclaim", __func__); 1953 } 1954 1955 KASSERT(vp->v_data == NULL); 1956 KASSERT((vp->v_iflag & VI_PAGES) == 0); 1957 1958 if (vp->v_type == VREG && vp->v_ractx != NULL) { 1959 uvm_ra_freectx(vp->v_ractx); 1960 vp->v_ractx = NULL; 1961 } 1962 1963 if (vip->vi_key.vk_key_len > 0) { 1964 /* Remove from vnode cache. */ 1965 hash = vcache_hash(&vip->vi_key); 1966 mutex_enter(&vcache_lock); 1967 KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash)); 1968 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask], 1969 vip, vnode_impl, vi_hash); 1970 mutex_exit(&vcache_lock); 1971 } 1972 if (temp_key != temp_buf) 1973 kmem_free(temp_key, temp_key_len); 1974 1975 /* Done with purge, notify sleepers of the grim news. */ 1976 mutex_enter(vp->v_interlock); 1977 vp->v_op = dead_vnodeop_p; 1978 VSTATE_CHANGE(vp, VS_RECLAIMING, VS_RECLAIMED); 1979 vp->v_tag = VT_NON; 1980 mutex_exit(vp->v_interlock); 1981 1982 /* 1983 * Move to dead mount. Must be after changing the operations 1984 * vector as vnode operations enter the mount before using the 1985 * operations vector. See sys/kern/vnode_if.c. 1986 */ 1987 vp->v_vflag &= ~VV_ROOT; 1988 vfs_ref(dead_rootmount); 1989 vfs_insmntque(vp, dead_rootmount); 1990 1991 #ifdef PAX_SEGVGUARD 1992 pax_segvguard_cleanup(vp); 1993 #endif /* PAX_SEGVGUARD */ 1994 1995 mutex_enter(vp->v_interlock); 1996 fstrans_done(mp); 1997 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); 1998 } 1999 2000 /* 2001 * Disassociate the underlying file system from an open device vnode 2002 * and make it anonymous. 2003 * 2004 * Vnode unlocked on entry, drops a reference to the vnode. 2005 */ 2006 void 2007 vcache_make_anon(vnode_t *vp) 2008 { 2009 vnode_impl_t *vip = VNODE_TO_VIMPL(vp); 2010 uint32_t hash; 2011 bool recycle; 2012 2013 KASSERT(vp->v_type == VBLK || vp->v_type == VCHR); 2014 KASSERT(vp->v_mount == dead_rootmount || 2015 fstrans_is_owner(vp->v_mount)); 2016 VSTATE_ASSERT_UNLOCKED(vp, VS_ACTIVE); 2017 2018 /* Remove from vnode cache. */ 2019 hash = vcache_hash(&vip->vi_key); 2020 mutex_enter(&vcache_lock); 2021 KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash)); 2022 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask], 2023 vip, vnode_impl, vi_hash); 2024 vip->vi_key.vk_mount = dead_rootmount; 2025 vip->vi_key.vk_key_len = 0; 2026 vip->vi_key.vk_key = NULL; 2027 mutex_exit(&vcache_lock); 2028 2029 /* 2030 * Disassociate the underlying file system from the vnode. 2031 * VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks 2032 * the vnode, and may destroy the vnode so that VOP_UNLOCK 2033 * would no longer function. 
2034 */ 2035 if (vn_lock(vp, LK_EXCLUSIVE)) { 2036 vnpanic(vp, "%s: cannot lock", __func__); 2037 } 2038 VOP_INACTIVE(vp, &recycle); 2039 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); 2040 if (VOP_RECLAIM(vp)) { 2041 vnpanic(vp, "%s: cannot reclaim", __func__); 2042 } 2043 2044 /* Purge name cache. */ 2045 cache_purge(vp); 2046 2047 /* Done with purge, change operations vector. */ 2048 mutex_enter(vp->v_interlock); 2049 vp->v_op = spec_vnodeop_p; 2050 vp->v_vflag |= VV_MPSAFE; 2051 mutex_exit(vp->v_interlock); 2052 2053 /* 2054 * Move to dead mount. Must be after changing the operations 2055 * vector as vnode operations enter the mount before using the 2056 * operations vector. See sys/kern/vnode_if.c. 2057 */ 2058 vfs_ref(dead_rootmount); 2059 vfs_insmntque(vp, dead_rootmount); 2060 2061 vrele(vp); 2062 } 2063 2064 /* 2065 * Update outstanding I/O count and do wakeup if requested. 2066 */ 2067 void 2068 vwakeup(struct buf *bp) 2069 { 2070 vnode_t *vp; 2071 2072 if ((vp = bp->b_vp) == NULL) 2073 return; 2074 2075 KASSERT(bp->b_objlock == vp->v_interlock); 2076 KASSERT(mutex_owned(bp->b_objlock)); 2077 2078 if (--vp->v_numoutput < 0) 2079 vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp); 2080 if (vp->v_numoutput == 0) 2081 cv_broadcast(&vp->v_cv); 2082 } 2083 2084 /* 2085 * Test a vnode for being or becoming dead. Returns one of: 2086 * EBUSY: vnode is becoming dead, with "flags == VDEAD_NOWAIT" only. 2087 * ENOENT: vnode is dead. 2088 * 0: otherwise. 2089 * 2090 * Whenever this function returns a non-zero value all future 2091 * calls will also return a non-zero value. 2092 */ 2093 int 2094 vdead_check(struct vnode *vp, int flags) 2095 { 2096 2097 KASSERT(mutex_owned(vp->v_interlock)); 2098 2099 if (! ISSET(flags, VDEAD_NOWAIT)) 2100 VSTATE_WAIT_STABLE(vp); 2101 2102 if (VSTATE_GET(vp) == VS_RECLAIMING) { 2103 KASSERT(ISSET(flags, VDEAD_NOWAIT)); 2104 return SET_ERROR(EBUSY); 2105 } else if (VSTATE_GET(vp) == VS_RECLAIMED) { 2106 return SET_ERROR(ENOENT); 2107 } 2108 2109 return 0; 2110 } 2111 2112 int 2113 vfs_drainvnodes(void) 2114 { 2115 2116 mutex_enter(&vdrain_lock); 2117 2118 if (!vdrain_one(desiredvnodes)) { 2119 mutex_exit(&vdrain_lock); 2120 return SET_ERROR(EBUSY); 2121 } 2122 2123 mutex_exit(&vdrain_lock); 2124 2125 if (vcache_hashsize != desiredvnodes) 2126 vcache_reinit(); 2127 2128 return 0; 2129 } 2130 2131 void 2132 vnpanic(vnode_t *vp, const char *fmt, ...) 2133 { 2134 va_list ap; 2135 2136 #ifdef DIAGNOSTIC 2137 vprint(NULL, vp); 2138 #endif 2139 va_start(ap, fmt); 2140 vpanic(fmt, ap); 2141 va_end(ap); 2142 } 2143 2144 void 2145 vshareilock(vnode_t *tvp, vnode_t *fvp) 2146 { 2147 kmutex_t *oldlock; 2148 2149 oldlock = tvp->v_interlock; 2150 mutex_obj_hold(fvp->v_interlock); 2151 tvp->v_interlock = fvp->v_interlock; 2152 mutex_obj_free(oldlock); 2153 } 2154 2155 void 2156 vshareklist(vnode_t *tvp, vnode_t *fvp) 2157 { 2158 /* 2159 * If two vnodes share klist state, they must also share 2160 * an interlock. 2161 */ 2162 KASSERT(tvp->v_interlock == fvp->v_interlock); 2163 2164 /* 2165 * We make the following assumptions: 2166 * 2167 * ==> Some other synchronization is happening outside of 2168 * our view to make this safe. 2169 * 2170 * ==> That the "to" vnode will have the necessary references 2171 * on the "from" vnode so that the storage for the klist 2172 * won't be yanked out from beneath us (the vnode_impl). 2173 * 2174 * ==> If "from" is also sharing, we then assume that "from" 2175 * has the necessary references, and so on. 
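	 *
	 * After the assignment below both vnodes point at the same
	 * underlying klist, so events posted with KNOTE() on either
	 * vnode reach the same set of registered knotes.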
2176 */ 2177 tvp->v_klist = fvp->v_klist; 2178 } 2179