1 /* $NetBSD: vfs_vnode.c,v 1.96 2017/06/04 08:05:42 hannken Exp $ */ 2 3 /*- 4 * Copyright (c) 1997-2011 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, 9 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33 /* 34 * Copyright (c) 1989, 1993 35 * The Regents of the University of California. All rights reserved. 36 * (c) UNIX System Laboratories, Inc. 37 * All or some portions of this file are derived from material licensed 38 * to the University of California by American Telephone and Telegraph 39 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 40 * the permission of UNIX System Laboratories, Inc. 41 * 42 * Redistribution and use in source and binary forms, with or without 43 * modification, are permitted provided that the following conditions 44 * are met: 45 * 1. Redistributions of source code must retain the above copyright 46 * notice, this list of conditions and the following disclaimer. 47 * 2. Redistributions in binary form must reproduce the above copyright 48 * notice, this list of conditions and the following disclaimer in the 49 * documentation and/or other materials provided with the distribution. 50 * 3. Neither the name of the University nor the names of its contributors 51 * may be used to endorse or promote products derived from this software 52 * without specific prior written permission. 53 * 54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 57 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 64 * SUCH DAMAGE. 65 * 66 * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94 67 */ 68 69 /* 70 * The vnode cache subsystem. 71 * 72 * Life-cycle 73 * 74 * Normally, there are two points where new vnodes are created: 75 * VOP_CREATE(9) and VOP_LOOKUP(9). The life-cycle of a vnode 76 * starts in one of the following ways: 77 * 78 * - Allocation, via vcache_get(9) or vcache_new(9). 79 * - Reclamation of inactive vnode, via vcache_vget(9). 80 * 81 * Recycle from a free list, via getnewvnode(9) -> getcleanvnode(9) 82 * was another, traditional way. Currently, only the draining thread 83 * recycles the vnodes. This behaviour might be revisited. 84 * 85 * The life-cycle ends when the last reference is dropped, usually 86 * in VOP_REMOVE(9). In such case, VOP_INACTIVE(9) is called to inform 87 * the file system that vnode is inactive. Via this call, file system 88 * indicates whether vnode can be recycled (usually, it checks its own 89 * references, e.g. count of links, whether the file was removed). 90 * 91 * Depending on indication, vnode can be put into a free list (cache), 92 * or cleaned via vcache_reclaim, which calls VOP_RECLAIM(9) to 93 * disassociate underlying file system from the vnode, and finally 94 * destroyed. 95 * 96 * Vnode state 97 * 98 * Vnode is always in one of six states: 99 * - MARKER This is a marker vnode to help list traversal. It 100 * will never change its state. 101 * - LOADING Vnode is associating underlying file system and not 102 * yet ready to use. 103 * - LOADED Vnode has associated underlying file system and is 104 * ready to use. 105 * - BLOCKED Vnode is active but cannot get new references. 106 * - RECLAIMING Vnode is disassociating from the underlying file 107 * system. 108 * - RECLAIMED Vnode has disassociated from underlying file system 109 * and is dead. 110 * 111 * Valid state changes are: 112 * LOADING -> LOADED 113 * Vnode has been initialised in vcache_get() or 114 * vcache_new() and is ready to use. 115 * LOADED -> RECLAIMING 116 * Vnode starts disassociation from underlying file 117 * system in vcache_reclaim(). 118 * RECLAIMING -> RECLAIMED 119 * Vnode finished disassociation from underlying file 120 * system in vcache_reclaim(). 121 * LOADED -> BLOCKED 122 * Either vcache_rekey*() is changing the vnode key or 123 * vrelel() is about to call VOP_INACTIVE(). 124 * BLOCKED -> LOADED 125 * The block condition is over. 126 * LOADING -> RECLAIMED 127 * Either vcache_get() or vcache_new() failed to 128 * associate the underlying file system or vcache_rekey*() 129 * drops a vnode used as placeholder. 130 * 131 * Of these states LOADING, BLOCKED and RECLAIMING are intermediate 132 * and it is possible to wait for state change. 133 * 134 * State is protected with v_interlock with one exception: 135 * to change from LOADING both v_interlock and vcache_lock must be held 136 * so it is possible to check "state == LOADING" without holding 137 * v_interlock. See vcache_get() for details. 
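 *
 *	As an illustrative sketch only (not part of this file), a file
 *	system lookup typically drives the LOADING -> LOADED transition
 *	through vcache_get(), which hides the state handling from the
 *	caller:
 *
 *		struct vnode *vp;
 *		int error;
 *
 *		error = vcache_get(mp, &ino, sizeof(ino), &vp);
 *		if (error == 0) {
 *			error = vn_lock(vp, LK_SHARED);
 *			if (error != 0)
 *				vrele(vp);
 *		}
 *
 *	Here "ino" stands for whatever key the file system uses.
 *	vcache_get() allocates the node in state LOADING, calls
 *	VFS_LOADVNODE() and finally marks it LOADED, or RECLAIMED on
 *	failure.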
138 * 139 * Reference counting 140 * 141 * A vnode is considered active if its reference count (vnode_t::v_usecount) 142 * is non-zero. The count is maintained using the vref(9), vrele(9) and 143 * vput(9) routines. Common points holding references are e.g. 144 * file openings, current working directory, mount points, etc. 145 * 146 * Note on v_usecount and its locking 147 * 148 * At nearly all points where it is known that v_usecount could be zero, 149 * vnode_t::v_interlock will be held. To change v_usecount away 150 * from zero, the interlock must be held. To change from a non-zero 151 * value to zero, again the interlock must be held. 152 * 153 * Changing the usecount from a non-zero value to a non-zero value can 154 * safely be done using atomic operations, without the interlock held. 155 * 156 */ 157 158 #include <sys/cdefs.h> 159 __KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.96 2017/06/04 08:05:42 hannken Exp $"); 160 161 #include <sys/param.h> 162 #include <sys/kernel.h> 163 164 #include <sys/atomic.h> 165 #include <sys/buf.h> 166 #include <sys/conf.h> 167 #include <sys/device.h> 168 #include <sys/hash.h> 169 #include <sys/kauth.h> 170 #include <sys/kmem.h> 171 #include <sys/kthread.h> 172 #include <sys/module.h> 173 #include <sys/mount.h> 174 #include <sys/namei.h> 175 #include <sys/syscallargs.h> 176 #include <sys/sysctl.h> 177 #include <sys/systm.h> 178 #include <sys/vnode_impl.h> 179 #include <sys/wapbl.h> 180 #include <sys/fstrans.h> 181 182 #include <uvm/uvm.h> 183 #include <uvm/uvm_readahead.h> 184 185 /* Flags to vrelel. */ 186 #define VRELEL_ASYNC_RELE 0x0001 /* Always defer to vrele thread. */ 187 #define VRELEL_FORCE_RELE 0x0002 /* Must always succeed. */ 188 189 u_int numvnodes __cacheline_aligned; 190 191 /* 192 * There are three lru lists: one holds vnodes waiting for async release, 193 * one is for vnodes which have no buffer/page references and 194 * one for those which do (i.e. v_holdcnt is non-zero). 195 */ 196 static vnodelst_t lru_vrele_list __cacheline_aligned; 197 static vnodelst_t lru_free_list __cacheline_aligned; 198 static vnodelst_t lru_hold_list __cacheline_aligned; 199 static kmutex_t vdrain_lock __cacheline_aligned; 200 static kcondvar_t vdrain_cv __cacheline_aligned; 201 static int vdrain_gen; 202 static kcondvar_t vdrain_gen_cv; 203 static bool vdrain_retry; 204 static lwp_t * vdrain_lwp; 205 SLIST_HEAD(hashhead, vnode_impl); 206 static kmutex_t vcache_lock __cacheline_aligned; 207 static kcondvar_t vcache_cv __cacheline_aligned; 208 static u_int vcache_hashsize; 209 static u_long vcache_hashmask; 210 static struct hashhead *vcache_hashtab __cacheline_aligned; 211 static pool_cache_t vcache_pool; 212 static void lru_requeue(vnode_t *, vnodelst_t *); 213 static vnodelst_t * lru_which(vnode_t *); 214 static vnode_impl_t * vcache_alloc(void); 215 static void vcache_dealloc(vnode_impl_t *); 216 static void vcache_free(vnode_impl_t *); 217 static void vcache_init(void); 218 static void vcache_reinit(void); 219 static void vcache_reclaim(vnode_t *); 220 static void vrelel(vnode_t *, int); 221 static void vdrain_thread(void *); 222 static void vnpanic(vnode_t *, const char *, ...) 223 __printflike(2, 3); 224 225 /* Routines having to do with the management of the vnode table. */ 226 extern struct mount *dead_rootmount; 227 extern int (**dead_vnodeop_p)(void *); 228 extern struct vfsops dead_vfsops; 229 230 /* Vnode state operations and diagnostics.
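 * In a DIAGNOSTIC kernel the VSTATE_* macros below additionally assert
 * that v_interlock is held and that the vnode is in the state the
 * caller expects; without DIAGNOSTIC they reduce to the unchecked
 * vstate_change() and vstate_wait_stable() helpers.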
*/ 231 232 #if defined(DIAGNOSTIC) 233 234 #define VSTATE_VALID(state) \ 235 ((state) != VS_ACTIVE && (state) != VS_MARKER) 236 #define VSTATE_GET(vp) \ 237 vstate_assert_get((vp), __func__, __LINE__) 238 #define VSTATE_CHANGE(vp, from, to) \ 239 vstate_assert_change((vp), (from), (to), __func__, __LINE__) 240 #define VSTATE_WAIT_STABLE(vp) \ 241 vstate_assert_wait_stable((vp), __func__, __LINE__) 242 243 void 244 _vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line) 245 { 246 vnode_impl_t *vip = VNODE_TO_VIMPL(vp); 247 248 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line); 249 250 if (state == VS_ACTIVE && vp->v_usecount > 0 && 251 (vip->vi_state == VS_LOADED || vip->vi_state == VS_BLOCKED)) 252 return; 253 if (vip->vi_state == state) 254 return; 255 vnpanic(vp, "state is %s, usecount %d, expected %s at %s:%d", 256 vstate_name(vip->vi_state), vp->v_usecount, 257 vstate_name(state), func, line); 258 } 259 260 static enum vnode_state 261 vstate_assert_get(vnode_t *vp, const char *func, int line) 262 { 263 vnode_impl_t *vip = VNODE_TO_VIMPL(vp); 264 265 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line); 266 if (! VSTATE_VALID(vip->vi_state)) 267 vnpanic(vp, "state is %s at %s:%d", 268 vstate_name(vip->vi_state), func, line); 269 270 return vip->vi_state; 271 } 272 273 static void 274 vstate_assert_wait_stable(vnode_t *vp, const char *func, int line) 275 { 276 vnode_impl_t *vip = VNODE_TO_VIMPL(vp); 277 278 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line); 279 if (! VSTATE_VALID(vip->vi_state)) 280 vnpanic(vp, "state is %s at %s:%d", 281 vstate_name(vip->vi_state), func, line); 282 283 while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED) 284 cv_wait(&vp->v_cv, vp->v_interlock); 285 286 if (! VSTATE_VALID(vip->vi_state)) 287 vnpanic(vp, "state is %s at %s:%d", 288 vstate_name(vip->vi_state), func, line); 289 } 290 291 static void 292 vstate_assert_change(vnode_t *vp, enum vnode_state from, enum vnode_state to, 293 const char *func, int line) 294 { 295 vnode_impl_t *vip = VNODE_TO_VIMPL(vp); 296 297 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line); 298 if (from == VS_LOADING) 299 KASSERTMSG(mutex_owned(&vcache_lock), "at %s:%d", func, line); 300 301 if (! VSTATE_VALID(from)) 302 vnpanic(vp, "from is %s at %s:%d", 303 vstate_name(from), func, line); 304 if (! 
VSTATE_VALID(to)) 305 vnpanic(vp, "to is %s at %s:%d", 306 vstate_name(to), func, line); 307 if (vip->vi_state != from) 308 vnpanic(vp, "from is %s, expected %s at %s:%d\n", 309 vstate_name(vip->vi_state), vstate_name(from), func, line); 310 if ((from == VS_BLOCKED || to == VS_BLOCKED) && vp->v_usecount != 1) 311 vnpanic(vp, "%s to %s with usecount %d at %s:%d", 312 vstate_name(from), vstate_name(to), vp->v_usecount, 313 func, line); 314 315 vip->vi_state = to; 316 if (from == VS_LOADING) 317 cv_broadcast(&vcache_cv); 318 if (to == VS_LOADED || to == VS_RECLAIMED) 319 cv_broadcast(&vp->v_cv); 320 } 321 322 #else /* defined(DIAGNOSTIC) */ 323 324 #define VSTATE_GET(vp) \ 325 (VNODE_TO_VIMPL((vp))->vi_state) 326 #define VSTATE_CHANGE(vp, from, to) \ 327 vstate_change((vp), (from), (to)) 328 #define VSTATE_WAIT_STABLE(vp) \ 329 vstate_wait_stable((vp)) 330 void 331 _vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line) 332 { 333 334 } 335 336 static void 337 vstate_wait_stable(vnode_t *vp) 338 { 339 vnode_impl_t *vip = VNODE_TO_VIMPL(vp); 340 341 while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED) 342 cv_wait(&vp->v_cv, vp->v_interlock); 343 } 344 345 static void 346 vstate_change(vnode_t *vp, enum vnode_state from, enum vnode_state to) 347 { 348 vnode_impl_t *vip = VNODE_TO_VIMPL(vp); 349 350 vip->vi_state = to; 351 if (from == VS_LOADING) 352 cv_broadcast(&vcache_cv); 353 if (to == VS_LOADED || to == VS_RECLAIMED) 354 cv_broadcast(&vp->v_cv); 355 } 356 357 #endif /* defined(DIAGNOSTIC) */ 358 359 void 360 vfs_vnode_sysinit(void) 361 { 362 int error __diagused; 363 364 dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL); 365 KASSERT(dead_rootmount != NULL); 366 dead_rootmount->mnt_iflag = IMNT_MPSAFE; 367 368 mutex_init(&vdrain_lock, MUTEX_DEFAULT, IPL_NONE); 369 TAILQ_INIT(&lru_free_list); 370 TAILQ_INIT(&lru_hold_list); 371 TAILQ_INIT(&lru_vrele_list); 372 373 vcache_init(); 374 375 cv_init(&vdrain_cv, "vdrain"); 376 cv_init(&vdrain_gen_cv, "vdrainwt"); 377 error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread, 378 NULL, &vdrain_lwp, "vdrain"); 379 KASSERTMSG((error == 0), "kthread_create(vdrain) failed: %d", error); 380 } 381 382 /* 383 * Allocate a new marker vnode. 384 */ 385 vnode_t * 386 vnalloc_marker(struct mount *mp) 387 { 388 vnode_impl_t *vip; 389 vnode_t *vp; 390 391 vip = pool_cache_get(vcache_pool, PR_WAITOK); 392 memset(vip, 0, sizeof(*vip)); 393 vp = VIMPL_TO_VNODE(vip); 394 uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0); 395 vp->v_mount = mp; 396 vp->v_type = VBAD; 397 vip->vi_state = VS_MARKER; 398 399 return vp; 400 } 401 402 /* 403 * Free a marker vnode. 404 */ 405 void 406 vnfree_marker(vnode_t *vp) 407 { 408 vnode_impl_t *vip; 409 410 vip = VNODE_TO_VIMPL(vp); 411 KASSERT(vip->vi_state == VS_MARKER); 412 uvm_obj_destroy(&vp->v_uobj, true); 413 pool_cache_put(vcache_pool, vip); 414 } 415 416 /* 417 * Test a vnode for being a marker vnode. 418 */ 419 bool 420 vnis_marker(vnode_t *vp) 421 { 422 423 return (VNODE_TO_VIMPL(vp)->vi_state == VS_MARKER); 424 } 425 426 /* 427 * Return the lru list this node should be on. 428 */ 429 static vnodelst_t * 430 lru_which(vnode_t *vp) 431 { 432 433 KASSERT(mutex_owned(vp->v_interlock)); 434 435 if (vp->v_holdcnt > 0) 436 return &lru_hold_list; 437 else 438 return &lru_free_list; 439 } 440 441 /* 442 * Put vnode to end of given list. 443 * Both the current and the new list may be NULL, used on vnode alloc/free. 444 * Adjust numvnodes and signal vdrain thread if there is work. 
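 *
 * For example (taken from the callers below): vcache_alloc() enters a
 * fresh vnode with lru_requeue(vp, &lru_free_list), which bumps
 * numvnodes, and vcache_free() drops it again with
 * lru_requeue(vp, NULL), which decrements numvnodes.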
445 */ 446 static void 447 lru_requeue(vnode_t *vp, vnodelst_t *listhd) 448 { 449 vnode_impl_t *vip; 450 451 mutex_enter(&vdrain_lock); 452 vip = VNODE_TO_VIMPL(vp); 453 if (vip->vi_lrulisthd != NULL) 454 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist); 455 else 456 numvnodes++; 457 vip->vi_lrulisthd = listhd; 458 if (vip->vi_lrulisthd != NULL) 459 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist); 460 else 461 numvnodes--; 462 if (numvnodes > desiredvnodes || listhd == &lru_vrele_list) 463 cv_broadcast(&vdrain_cv); 464 mutex_exit(&vdrain_lock); 465 } 466 467 /* 468 * Release deferred vrele vnodes for this mount. 469 * Called with file system suspended. 470 */ 471 void 472 vrele_flush(struct mount *mp) 473 { 474 vnode_impl_t *vip, *marker; 475 476 KASSERT(fstrans_is_owner(mp)); 477 478 marker = VNODE_TO_VIMPL(vnalloc_marker(NULL)); 479 480 mutex_enter(&vdrain_lock); 481 TAILQ_INSERT_HEAD(&lru_vrele_list, marker, vi_lrulist); 482 483 while ((vip = TAILQ_NEXT(marker, vi_lrulist))) { 484 TAILQ_REMOVE(&lru_vrele_list, marker, vi_lrulist); 485 TAILQ_INSERT_AFTER(&lru_vrele_list, vip, marker, vi_lrulist); 486 if (vnis_marker(VIMPL_TO_VNODE(vip))) 487 continue; 488 489 KASSERT(vip->vi_lrulisthd == &lru_vrele_list); 490 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist); 491 vip->vi_lrulisthd = &lru_hold_list; 492 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist); 493 mutex_exit(&vdrain_lock); 494 495 mutex_enter(VIMPL_TO_VNODE(vip)->v_interlock); 496 vrelel(VIMPL_TO_VNODE(vip), VRELEL_FORCE_RELE); 497 498 mutex_enter(&vdrain_lock); 499 } 500 501 TAILQ_REMOVE(&lru_vrele_list, marker, vi_lrulist); 502 mutex_exit(&vdrain_lock); 503 504 vnfree_marker(VIMPL_TO_VNODE(marker)); 505 } 506 507 /* 508 * Reclaim a cached vnode. Used from vdrain_thread only. 509 */ 510 static __inline void 511 vdrain_remove(vnode_t *vp) 512 { 513 struct mount *mp; 514 515 KASSERT(mutex_owned(&vdrain_lock)); 516 517 /* Probe usecount (unlocked). */ 518 if (vp->v_usecount > 0) 519 return; 520 /* Try v_interlock -- we lock the wrong direction! */ 521 if (!mutex_tryenter(vp->v_interlock)) 522 return; 523 /* Probe usecount and state. */ 524 if (vp->v_usecount > 0 || VSTATE_GET(vp) != VS_LOADED) { 525 mutex_exit(vp->v_interlock); 526 return; 527 } 528 mp = vp->v_mount; 529 if (fstrans_start_nowait(mp) != 0) { 530 mutex_exit(vp->v_interlock); 531 return; 532 } 533 vdrain_retry = true; 534 mutex_exit(&vdrain_lock); 535 536 if (vcache_vget(vp) == 0) { 537 if (!vrecycle(vp)) { 538 mutex_enter(vp->v_interlock); 539 vrelel(vp, VRELEL_FORCE_RELE); 540 } 541 } 542 fstrans_done(mp); 543 544 mutex_enter(&vdrain_lock); 545 } 546 547 /* 548 * Release a cached vnode. Used from vdrain_thread only. 549 */ 550 static __inline void 551 vdrain_vrele(vnode_t *vp) 552 { 553 vnode_impl_t *vip = VNODE_TO_VIMPL(vp); 554 struct mount *mp; 555 556 KASSERT(mutex_owned(&vdrain_lock)); 557 558 mp = vp->v_mount; 559 if (fstrans_start_nowait(mp) != 0) 560 return; 561 562 /* 563 * First remove the vnode from the vrele list. 564 * Put it on the last lru list, the last vrele() 565 * will put it back onto the right list before 566 * its v_usecount reaches zero. 
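 *
 * Vnodes land on the vrele list through vrelel() when the release has
 * to be deferred (VRELEL_ASYNC_RELE, the pagedaemon, or a failed lock
 * attempt); this is where that deferred release is finally performed.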
567 */ 568 KASSERT(vip->vi_lrulisthd == &lru_vrele_list); 569 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist); 570 vip->vi_lrulisthd = &lru_hold_list; 571 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist); 572 573 vdrain_retry = true; 574 mutex_exit(&vdrain_lock); 575 576 mutex_enter(vp->v_interlock); 577 vrelel(vp, VRELEL_FORCE_RELE); 578 fstrans_done(mp); 579 580 mutex_enter(&vdrain_lock); 581 } 582 583 /* 584 * Helper thread to keep the number of vnodes below desiredvnodes 585 * and release vnodes from asynchronous vrele. 586 */ 587 static void 588 vdrain_thread(void *cookie) 589 { 590 vnodelst_t *listhd[] = { 591 &lru_vrele_list, &lru_free_list, &lru_hold_list 592 }; 593 int i; 594 u_int target; 595 vnode_impl_t *vip, *marker; 596 597 marker = VNODE_TO_VIMPL(vnalloc_marker(NULL)); 598 599 mutex_enter(&vdrain_lock); 600 601 for (;;) { 602 vdrain_retry = false; 603 target = desiredvnodes - desiredvnodes/10; 604 605 for (i = 0; i < __arraycount(listhd); i++) { 606 TAILQ_INSERT_HEAD(listhd[i], marker, vi_lrulist); 607 while ((vip = TAILQ_NEXT(marker, vi_lrulist))) { 608 TAILQ_REMOVE(listhd[i], marker, vi_lrulist); 609 TAILQ_INSERT_AFTER(listhd[i], vip, marker, 610 vi_lrulist); 611 if (vnis_marker(VIMPL_TO_VNODE(vip))) 612 continue; 613 if (listhd[i] == &lru_vrele_list) 614 vdrain_vrele(VIMPL_TO_VNODE(vip)); 615 else if (numvnodes < target) 616 break; 617 else 618 vdrain_remove(VIMPL_TO_VNODE(vip)); 619 } 620 TAILQ_REMOVE(listhd[i], marker, vi_lrulist); 621 } 622 623 if (vdrain_retry) { 624 mutex_exit(&vdrain_lock); 625 yield(); 626 mutex_enter(&vdrain_lock); 627 } else { 628 vdrain_gen++; 629 cv_broadcast(&vdrain_gen_cv); 630 cv_wait(&vdrain_cv, &vdrain_lock); 631 } 632 } 633 } 634 635 /* 636 * vput: unlock and release the reference. 637 */ 638 void 639 vput(vnode_t *vp) 640 { 641 642 VOP_UNLOCK(vp); 643 vrele(vp); 644 } 645 646 /* 647 * Try to drop reference on a vnode. Abort if we are releasing the 648 * last reference. Note: this _must_ succeed if not the last reference. 649 */ 650 static inline bool 651 vtryrele(vnode_t *vp) 652 { 653 u_int use, next; 654 655 for (use = vp->v_usecount;; use = next) { 656 if (use == 1) { 657 return false; 658 } 659 KASSERT(use > 1); 660 next = atomic_cas_uint(&vp->v_usecount, use, use - 1); 661 if (__predict_true(next == use)) { 662 return true; 663 } 664 } 665 } 666 667 /* 668 * Vnode release. If reference count drops to zero, call inactive 669 * routine and either return to freelist or free to the pool. 670 */ 671 static void 672 vrelel(vnode_t *vp, int flags) 673 { 674 const bool async = ((flags & VRELEL_ASYNC_RELE) != 0); 675 const bool force = ((flags & VRELEL_FORCE_RELE) != 0); 676 bool recycle, defer; 677 int error; 678 679 KASSERT(mutex_owned(vp->v_interlock)); 680 681 if (__predict_false(vp->v_op == dead_vnodeop_p && 682 VSTATE_GET(vp) != VS_RECLAIMED)) { 683 vnpanic(vp, "dead but not clean"); 684 } 685 686 /* 687 * If not the last reference, just drop the reference count 688 * and unlock. 689 */ 690 if (vtryrele(vp)) { 691 mutex_exit(vp->v_interlock); 692 return; 693 } 694 if (vp->v_usecount <= 0 || vp->v_writecount != 0) { 695 vnpanic(vp, "%s: bad ref count", __func__); 696 } 697 698 #ifdef DIAGNOSTIC 699 if ((vp->v_type == VBLK || vp->v_type == VCHR) && 700 vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) { 701 vprint("vrelel: missing VOP_CLOSE()", vp); 702 } 703 #endif 704 705 /* 706 * First try to get the vnode locked for VOP_INACTIVE(). 
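 * The vnode lock may sleep and is taken before v_interlock in the
 * locking order, so the interlock is dropped around vn_lock() below
 * and re-acquired afterwards.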
707 * Defer vnode release to vdrain_thread if caller requests 708 * it explicitly, is the pagedaemon or the lock failed. 709 */ 710 if ((curlwp == uvm.pagedaemon_lwp) || async) { 711 defer = true; 712 } else { 713 mutex_exit(vp->v_interlock); 714 error = vn_lock(vp, 715 LK_EXCLUSIVE | LK_RETRY | (force ? 0 : LK_NOWAIT)); 716 defer = (error != 0); 717 mutex_enter(vp->v_interlock); 718 } 719 KASSERT(mutex_owned(vp->v_interlock)); 720 KASSERT(! (force && defer)); 721 if (defer) { 722 /* 723 * Defer reclaim to the kthread; it's not safe to 724 * clean it here. We donate it our last reference. 725 */ 726 lru_requeue(vp, &lru_vrele_list); 727 mutex_exit(vp->v_interlock); 728 return; 729 } 730 731 /* 732 * If the node got another reference while we 733 * released the interlock, don't try to inactivate it yet. 734 */ 735 if (__predict_false(vtryrele(vp))) { 736 VOP_UNLOCK(vp); 737 mutex_exit(vp->v_interlock); 738 return; 739 } 740 741 /* 742 * If not clean, deactivate the vnode, but preserve 743 * our reference across the call to VOP_INACTIVE(). 744 */ 745 if (VSTATE_GET(vp) == VS_RECLAIMED) { 746 VOP_UNLOCK(vp); 747 } else { 748 VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED); 749 mutex_exit(vp->v_interlock); 750 751 /* 752 * The vnode must not gain another reference while being 753 * deactivated. If VOP_INACTIVE() indicates that 754 * the described file has been deleted, then recycle 755 * the vnode. 756 * 757 * Note that VOP_INACTIVE() will not drop the vnode lock. 758 */ 759 recycle = false; 760 VOP_INACTIVE(vp, &recycle); 761 if (!recycle) 762 VOP_UNLOCK(vp); 763 mutex_enter(vp->v_interlock); 764 VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED); 765 if (!recycle) { 766 if (vtryrele(vp)) { 767 mutex_exit(vp->v_interlock); 768 return; 769 } 770 } 771 772 /* Take care of space accounting. */ 773 if (vp->v_iflag & VI_EXECMAP) { 774 atomic_add_int(&uvmexp.execpages, 775 -vp->v_uobj.uo_npages); 776 atomic_add_int(&uvmexp.filepages, 777 vp->v_uobj.uo_npages); 778 } 779 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP); 780 vp->v_vflag &= ~VV_MAPPED; 781 782 /* 783 * Recycle the vnode if the file is now unused (unlinked), 784 * otherwise just free it. 785 */ 786 if (recycle) { 787 VSTATE_ASSERT(vp, VS_LOADED); 788 /* vcache_reclaim drops the lock. */ 789 vcache_reclaim(vp); 790 } 791 KASSERT(vp->v_usecount > 0); 792 } 793 794 if (atomic_dec_uint_nv(&vp->v_usecount) != 0) { 795 /* Gained another reference while being reclaimed. */ 796 mutex_exit(vp->v_interlock); 797 return; 798 } 799 800 if (VSTATE_GET(vp) == VS_RECLAIMED && vp->v_holdcnt == 0) { 801 /* 802 * It's clean so destroy it. It isn't referenced 803 * anywhere since it has been reclaimed. 804 */ 805 vcache_free(VNODE_TO_VIMPL(vp)); 806 } else { 807 /* 808 * Otherwise, put it back onto the freelist. It 809 * can't be destroyed while still associated with 810 * a file system. 811 */ 812 lru_requeue(vp, lru_which(vp)); 813 mutex_exit(vp->v_interlock); 814 } 815 } 816 817 void 818 vrele(vnode_t *vp) 819 { 820 821 if (vtryrele(vp)) { 822 return; 823 } 824 mutex_enter(vp->v_interlock); 825 vrelel(vp, 0); 826 } 827 828 /* 829 * Asynchronous vnode release, vnode is released in different context. 830 */ 831 void 832 vrele_async(vnode_t *vp) 833 { 834 835 if (vtryrele(vp)) { 836 return; 837 } 838 mutex_enter(vp->v_interlock); 839 vrelel(vp, VRELEL_ASYNC_RELE); 840 } 841 842 /* 843 * Vnode reference, where a reference is already held by some other 844 * object (for example, a file structure). 
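 *
 * A hedged usage sketch (illustrative only, not part of this file;
 * "obj" is a hypothetical holder object):
 *
 *	obj->o_vnode = vp;
 *	vref(vp);
 *
 * The caller must already hold a reference of its own: vref() does not
 * take v_interlock and asserts that v_usecount is non-zero.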
845 */ 846 void 847 vref(vnode_t *vp) 848 { 849 850 KASSERT(vp->v_usecount != 0); 851 852 atomic_inc_uint(&vp->v_usecount); 853 } 854 855 /* 856 * Page or buffer structure gets a reference. 857 * Called with v_interlock held. 858 */ 859 void 860 vholdl(vnode_t *vp) 861 { 862 863 KASSERT(mutex_owned(vp->v_interlock)); 864 865 if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) 866 lru_requeue(vp, lru_which(vp)); 867 } 868 869 /* 870 * Page or buffer structure frees a reference. 871 * Called with v_interlock held. 872 */ 873 void 874 holdrelel(vnode_t *vp) 875 { 876 877 KASSERT(mutex_owned(vp->v_interlock)); 878 879 if (vp->v_holdcnt <= 0) { 880 vnpanic(vp, "%s: holdcnt vp %p", __func__, vp); 881 } 882 883 vp->v_holdcnt--; 884 if (vp->v_holdcnt == 0 && vp->v_usecount == 0) 885 lru_requeue(vp, lru_which(vp)); 886 } 887 888 /* 889 * Recycle an unused vnode if caller holds the last reference. 890 */ 891 bool 892 vrecycle(vnode_t *vp) 893 { 894 int error __diagused; 895 896 mutex_enter(vp->v_interlock); 897 898 /* Make sure we hold the last reference. */ 899 VSTATE_WAIT_STABLE(vp); 900 if (vp->v_usecount != 1) { 901 mutex_exit(vp->v_interlock); 902 return false; 903 } 904 905 /* If the vnode is already clean we're done. */ 906 if (VSTATE_GET(vp) != VS_LOADED) { 907 VSTATE_ASSERT(vp, VS_RECLAIMED); 908 vrelel(vp, 0); 909 return true; 910 } 911 912 /* Prevent further references until the vnode is locked. */ 913 VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED); 914 mutex_exit(vp->v_interlock); 915 916 /* 917 * On a leaf file system this lock will always succeed as we hold 918 * the last reference and prevent further references. 919 * On layered file systems waiting for the lock would open a can of 920 * deadlocks as the lower vnodes may have other active references. 921 */ 922 error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT); 923 924 mutex_enter(vp->v_interlock); 925 VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED); 926 927 if (error) { 928 mutex_exit(vp->v_interlock); 929 return false; 930 } 931 932 KASSERT(vp->v_usecount == 1); 933 vcache_reclaim(vp); 934 vrelel(vp, 0); 935 936 return true; 937 } 938 939 /* 940 * Helper for vrevoke() to propagate suspension from lastmp 941 * to thismp. Both args may be NULL. 942 * Returns the currently suspended file system or NULL. 943 */ 944 static struct mount * 945 vrevoke_suspend_next(struct mount *lastmp, struct mount *thismp) 946 { 947 int error; 948 949 if (lastmp == thismp) 950 return thismp; 951 952 if (lastmp != NULL) 953 vfs_resume(lastmp); 954 955 if (thismp == NULL) 956 return NULL; 957 958 do { 959 error = vfs_suspend(thismp, 0); 960 } while (error == EINTR || error == ERESTART); 961 962 if (error == 0) 963 return thismp; 964 965 KASSERT(error == EOPNOTSUPP); 966 return NULL; 967 } 968 969 /* 970 * Eliminate all activity associated with the requested vnode 971 * and with all vnodes aliased to the requested vnode. 
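 *
 * For a VBLK or VCHR vnode "aliased" means every vnode with the same
 * (type, dev) pair found via spec_node_lookup_by_dev(); each alias is
 * vgone()'d with its file system suspended via vrevoke_suspend_next().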
972 */ 973 void 974 vrevoke(vnode_t *vp) 975 { 976 struct mount *mp; 977 vnode_t *vq; 978 enum vtype type; 979 dev_t dev; 980 981 KASSERT(vp->v_usecount > 0); 982 983 mp = vrevoke_suspend_next(NULL, vp->v_mount); 984 985 mutex_enter(vp->v_interlock); 986 VSTATE_WAIT_STABLE(vp); 987 if (VSTATE_GET(vp) == VS_RECLAIMED) { 988 mutex_exit(vp->v_interlock); 989 } else if (vp->v_type != VBLK && vp->v_type != VCHR) { 990 atomic_inc_uint(&vp->v_usecount); 991 mutex_exit(vp->v_interlock); 992 vgone(vp); 993 } else { 994 dev = vp->v_rdev; 995 type = vp->v_type; 996 mutex_exit(vp->v_interlock); 997 998 while (spec_node_lookup_by_dev(type, dev, &vq) == 0) { 999 mp = vrevoke_suspend_next(mp, vq->v_mount); 1000 vgone(vq); 1001 } 1002 } 1003 vrevoke_suspend_next(mp, NULL); 1004 } 1005 1006 /* 1007 * Eliminate all activity associated with a vnode in preparation for 1008 * reuse. Drops a reference from the vnode. 1009 */ 1010 void 1011 vgone(vnode_t *vp) 1012 { 1013 1014 KASSERT((vp->v_mount->mnt_iflag & IMNT_HAS_TRANS) == 0 || 1015 fstrans_is_owner(vp->v_mount)); 1016 1017 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 1018 mutex_enter(vp->v_interlock); 1019 VSTATE_WAIT_STABLE(vp); 1020 if (VSTATE_GET(vp) == VS_LOADED) 1021 vcache_reclaim(vp); 1022 VSTATE_ASSERT(vp, VS_RECLAIMED); 1023 vrelel(vp, 0); 1024 } 1025 1026 static inline uint32_t 1027 vcache_hash(const struct vcache_key *key) 1028 { 1029 uint32_t hash = HASH32_BUF_INIT; 1030 1031 hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash); 1032 hash = hash32_buf(key->vk_key, key->vk_key_len, hash); 1033 return hash; 1034 } 1035 1036 static void 1037 vcache_init(void) 1038 { 1039 1040 vcache_pool = pool_cache_init(sizeof(vnode_impl_t), 0, 0, 0, 1041 "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL); 1042 KASSERT(vcache_pool != NULL); 1043 mutex_init(&vcache_lock, MUTEX_DEFAULT, IPL_NONE); 1044 cv_init(&vcache_cv, "vcache"); 1045 vcache_hashsize = desiredvnodes; 1046 vcache_hashtab = hashinit(desiredvnodes, HASH_SLIST, true, 1047 &vcache_hashmask); 1048 } 1049 1050 static void 1051 vcache_reinit(void) 1052 { 1053 int i; 1054 uint32_t hash; 1055 u_long oldmask, newmask; 1056 struct hashhead *oldtab, *newtab; 1057 vnode_impl_t *vip; 1058 1059 newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask); 1060 mutex_enter(&vcache_lock); 1061 oldtab = vcache_hashtab; 1062 oldmask = vcache_hashmask; 1063 vcache_hashsize = desiredvnodes; 1064 vcache_hashtab = newtab; 1065 vcache_hashmask = newmask; 1066 for (i = 0; i <= oldmask; i++) { 1067 while ((vip = SLIST_FIRST(&oldtab[i])) != NULL) { 1068 SLIST_REMOVE(&oldtab[i], vip, vnode_impl, vi_hash); 1069 hash = vcache_hash(&vip->vi_key); 1070 SLIST_INSERT_HEAD(&newtab[hash & vcache_hashmask], 1071 vip, vi_hash); 1072 } 1073 } 1074 mutex_exit(&vcache_lock); 1075 hashdone(oldtab, HASH_SLIST, oldmask); 1076 } 1077 1078 static inline vnode_impl_t * 1079 vcache_hash_lookup(const struct vcache_key *key, uint32_t hash) 1080 { 1081 struct hashhead *hashp; 1082 vnode_impl_t *vip; 1083 1084 KASSERT(mutex_owned(&vcache_lock)); 1085 1086 hashp = &vcache_hashtab[hash & vcache_hashmask]; 1087 SLIST_FOREACH(vip, hashp, vi_hash) { 1088 if (key->vk_mount != vip->vi_key.vk_mount) 1089 continue; 1090 if (key->vk_key_len != vip->vi_key.vk_key_len) 1091 continue; 1092 if (memcmp(key->vk_key, vip->vi_key.vk_key, key->vk_key_len)) 1093 continue; 1094 return vip; 1095 } 1096 return NULL; 1097 } 1098 1099 /* 1100 * Allocate a new, uninitialized vcache node. 
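 * The node is returned in state VS_LOADING, holding one reference
 * (v_usecount == 1), and is already accounted for on lru_free_list.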
1101 */ 1102 static vnode_impl_t * 1103 vcache_alloc(void) 1104 { 1105 vnode_impl_t *vip; 1106 vnode_t *vp; 1107 1108 vip = pool_cache_get(vcache_pool, PR_WAITOK); 1109 memset(vip, 0, sizeof(*vip)); 1110 1111 rw_init(&vip->vi_lock); 1112 /* SLIST_INIT(&vip->vi_hash); */ 1113 /* LIST_INIT(&vip->vi_nclist); */ 1114 /* LIST_INIT(&vip->vi_dnclist); */ 1115 1116 vp = VIMPL_TO_VNODE(vip); 1117 uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0); 1118 cv_init(&vp->v_cv, "vnode"); 1119 1120 vp->v_usecount = 1; 1121 vp->v_type = VNON; 1122 vp->v_size = vp->v_writesize = VSIZENOTSET; 1123 1124 vip->vi_state = VS_LOADING; 1125 1126 lru_requeue(vp, &lru_free_list); 1127 1128 return vip; 1129 } 1130 1131 /* 1132 * Deallocate a vcache node in state VS_LOADING. 1133 * 1134 * vcache_lock held on entry and released on return. 1135 */ 1136 static void 1137 vcache_dealloc(vnode_impl_t *vip) 1138 { 1139 vnode_t *vp; 1140 1141 KASSERT(mutex_owned(&vcache_lock)); 1142 1143 vp = VIMPL_TO_VNODE(vip); 1144 mutex_enter(vp->v_interlock); 1145 vp->v_op = dead_vnodeop_p; 1146 VSTATE_CHANGE(vp, VS_LOADING, VS_RECLAIMED); 1147 mutex_exit(&vcache_lock); 1148 vrelel(vp, 0); 1149 } 1150 1151 /* 1152 * Free an unused, unreferenced vcache node. 1153 * v_interlock locked on entry. 1154 */ 1155 static void 1156 vcache_free(vnode_impl_t *vip) 1157 { 1158 vnode_t *vp; 1159 1160 vp = VIMPL_TO_VNODE(vip); 1161 KASSERT(mutex_owned(vp->v_interlock)); 1162 1163 KASSERT(vp->v_usecount == 0); 1164 KASSERT(vp->v_holdcnt == 0); 1165 KASSERT(vp->v_writecount == 0); 1166 lru_requeue(vp, NULL); 1167 mutex_exit(vp->v_interlock); 1168 1169 vfs_insmntque(vp, NULL); 1170 if (vp->v_type == VBLK || vp->v_type == VCHR) 1171 spec_node_destroy(vp); 1172 1173 rw_destroy(&vip->vi_lock); 1174 uvm_obj_destroy(&vp->v_uobj, true); 1175 cv_destroy(&vp->v_cv); 1176 pool_cache_put(vcache_pool, vip); 1177 } 1178 1179 /* 1180 * Try to get an initial reference on this cached vnode. 1181 * Returns zero on success, ENOENT if the vnode has been reclaimed and 1182 * EBUSY if the vnode state is unstable. 1183 * 1184 * v_interlock locked on entry and unlocked on exit. 1185 */ 1186 int 1187 vcache_tryvget(vnode_t *vp) 1188 { 1189 int error = 0; 1190 1191 KASSERT(mutex_owned(vp->v_interlock)); 1192 1193 if (__predict_false(VSTATE_GET(vp) == VS_RECLAIMED)) 1194 error = ENOENT; 1195 else if (__predict_false(VSTATE_GET(vp) != VS_LOADED)) 1196 error = EBUSY; 1197 else if (vp->v_usecount == 0) 1198 vp->v_usecount = 1; 1199 else 1200 atomic_inc_uint(&vp->v_usecount); 1201 1202 mutex_exit(vp->v_interlock); 1203 1204 return error; 1205 } 1206 1207 /* 1208 * Try to get an initial reference on this cached vnode. 1209 * Returns zero on success and ENOENT if the vnode has been reclaimed. 1210 * Will wait for the vnode state to be stable. 1211 * 1212 * v_interlock locked on entry and unlocked on exit. 1213 */ 1214 int 1215 vcache_vget(vnode_t *vp) 1216 { 1217 1218 KASSERT(mutex_owned(vp->v_interlock)); 1219 1220 /* Increment hold count to prevent vnode from disappearing. */ 1221 vp->v_holdcnt++; 1222 VSTATE_WAIT_STABLE(vp); 1223 vp->v_holdcnt--; 1224 1225 /* If this was the last reference to a reclaimed vnode free it now. 
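 * The hold taken above may have been the only thing keeping the vnode
 * around, so both v_holdcnt and v_usecount are checked before it is
 * freed.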
*/ 1226 if (__predict_false(VSTATE_GET(vp) == VS_RECLAIMED)) { 1227 if (vp->v_holdcnt == 0 && vp->v_usecount == 0) 1228 vcache_free(VNODE_TO_VIMPL(vp)); 1229 else 1230 mutex_exit(vp->v_interlock); 1231 return ENOENT; 1232 } 1233 VSTATE_ASSERT(vp, VS_LOADED); 1234 if (vp->v_usecount == 0) 1235 vp->v_usecount = 1; 1236 else 1237 atomic_inc_uint(&vp->v_usecount); 1238 1239 mutex_exit(vp->v_interlock); 1240 1241 return 0; 1242 } 1243 1244 /* 1245 * Get a vnode / fs node pair by key and return it referenced through vpp. 1246 */ 1247 int 1248 vcache_get(struct mount *mp, const void *key, size_t key_len, 1249 struct vnode **vpp) 1250 { 1251 int error; 1252 uint32_t hash; 1253 const void *new_key; 1254 struct vnode *vp; 1255 struct vcache_key vcache_key; 1256 vnode_impl_t *vip, *new_vip; 1257 1258 new_key = NULL; 1259 *vpp = NULL; 1260 1261 vcache_key.vk_mount = mp; 1262 vcache_key.vk_key = key; 1263 vcache_key.vk_key_len = key_len; 1264 hash = vcache_hash(&vcache_key); 1265 1266 again: 1267 mutex_enter(&vcache_lock); 1268 vip = vcache_hash_lookup(&vcache_key, hash); 1269 1270 /* If found, take a reference or retry. */ 1271 if (__predict_true(vip != NULL)) { 1272 /* 1273 * If the vnode is loading we cannot take the v_interlock 1274 * here as it might change during load (see uvm_obj_setlock()). 1275 * As changing state from VS_LOADING requires both vcache_lock 1276 * and v_interlock it is safe to test with vcache_lock held. 1277 * 1278 * Wait for vnodes changing state from VS_LOADING and retry. 1279 */ 1280 if (__predict_false(vip->vi_state == VS_LOADING)) { 1281 cv_wait(&vcache_cv, &vcache_lock); 1282 mutex_exit(&vcache_lock); 1283 goto again; 1284 } 1285 vp = VIMPL_TO_VNODE(vip); 1286 mutex_enter(vp->v_interlock); 1287 mutex_exit(&vcache_lock); 1288 error = vcache_vget(vp); 1289 if (error == ENOENT) 1290 goto again; 1291 if (error == 0) 1292 *vpp = vp; 1293 KASSERT((error != 0) == (*vpp == NULL)); 1294 return error; 1295 } 1296 mutex_exit(&vcache_lock); 1297 1298 /* Allocate and initialize a new vcache / vnode pair. */ 1299 error = vfs_busy(mp); 1300 if (error) 1301 return error; 1302 new_vip = vcache_alloc(); 1303 new_vip->vi_key = vcache_key; 1304 vp = VIMPL_TO_VNODE(new_vip); 1305 mutex_enter(&vcache_lock); 1306 vip = vcache_hash_lookup(&vcache_key, hash); 1307 if (vip == NULL) { 1308 SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask], 1309 new_vip, vi_hash); 1310 vip = new_vip; 1311 } 1312 1313 /* If another thread beat us inserting this node, retry. */ 1314 if (vip != new_vip) { 1315 vcache_dealloc(new_vip); 1316 vfs_unbusy(mp); 1317 goto again; 1318 } 1319 mutex_exit(&vcache_lock); 1320 1321 /* Load the fs node. Exclusive as new_node is VS_LOADING. */ 1322 error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key); 1323 if (error) { 1324 mutex_enter(&vcache_lock); 1325 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask], 1326 new_vip, vnode_impl, vi_hash); 1327 vcache_dealloc(new_vip); 1328 vfs_unbusy(mp); 1329 KASSERT(*vpp == NULL); 1330 return error; 1331 } 1332 KASSERT(new_key != NULL); 1333 KASSERT(memcmp(key, new_key, key_len) == 0); 1334 KASSERT(vp->v_op != NULL); 1335 vfs_insmntque(vp, mp); 1336 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0) 1337 vp->v_vflag |= VV_MPSAFE; 1338 vfs_ref(mp); 1339 vfs_unbusy(mp); 1340 1341 /* Finished loading, finalize node. 
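 * Leaving state VS_LOADING requires both vcache_lock and v_interlock
 * (see the state notes at the top of this file), hence both are taken
 * here.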
*/ 1342 mutex_enter(&vcache_lock); 1343 new_vip->vi_key.vk_key = new_key; 1344 mutex_enter(vp->v_interlock); 1345 VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED); 1346 mutex_exit(vp->v_interlock); 1347 mutex_exit(&vcache_lock); 1348 *vpp = vp; 1349 return 0; 1350 } 1351 1352 /* 1353 * Create a new vnode / fs node pair and return it referenced through vpp. 1354 */ 1355 int 1356 vcache_new(struct mount *mp, struct vnode *dvp, struct vattr *vap, 1357 kauth_cred_t cred, struct vnode **vpp) 1358 { 1359 int error; 1360 uint32_t hash; 1361 struct vnode *vp, *ovp; 1362 vnode_impl_t *vip, *ovip; 1363 1364 *vpp = NULL; 1365 1366 /* Allocate and initialize a new vcache / vnode pair. */ 1367 error = vfs_busy(mp); 1368 if (error) 1369 return error; 1370 vip = vcache_alloc(); 1371 vip->vi_key.vk_mount = mp; 1372 vp = VIMPL_TO_VNODE(vip); 1373 1374 /* Create and load the fs node. */ 1375 error = VFS_NEWVNODE(mp, dvp, vp, vap, cred, 1376 &vip->vi_key.vk_key_len, &vip->vi_key.vk_key); 1377 if (error) { 1378 mutex_enter(&vcache_lock); 1379 vcache_dealloc(vip); 1380 vfs_unbusy(mp); 1381 KASSERT(*vpp == NULL); 1382 return error; 1383 } 1384 KASSERT(vip->vi_key.vk_key != NULL); 1385 KASSERT(vp->v_op != NULL); 1386 hash = vcache_hash(&vip->vi_key); 1387 1388 /* Wait for previous instance to be reclaimed, then insert new node. */ 1389 mutex_enter(&vcache_lock); 1390 while ((ovip = vcache_hash_lookup(&vip->vi_key, hash))) { 1391 ovp = VIMPL_TO_VNODE(ovip); 1392 mutex_enter(ovp->v_interlock); 1393 mutex_exit(&vcache_lock); 1394 error = vcache_vget(ovp); 1395 KASSERT(error == ENOENT); 1396 mutex_enter(&vcache_lock); 1397 } 1398 SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask], 1399 vip, vi_hash); 1400 mutex_exit(&vcache_lock); 1401 vfs_insmntque(vp, mp); 1402 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0) 1403 vp->v_vflag |= VV_MPSAFE; 1404 vfs_ref(mp); 1405 vfs_unbusy(mp); 1406 1407 /* Finished loading, finalize node. */ 1408 mutex_enter(&vcache_lock); 1409 mutex_enter(vp->v_interlock); 1410 VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED); 1411 mutex_exit(&vcache_lock); 1412 mutex_exit(vp->v_interlock); 1413 *vpp = vp; 1414 return 0; 1415 } 1416 1417 /* 1418 * Prepare key change: update old cache nodes key and lock new cache node. 1419 * Return an error if the new node already exists. 1420 */ 1421 int 1422 vcache_rekey_enter(struct mount *mp, struct vnode *vp, 1423 const void *old_key, size_t old_key_len, 1424 const void *new_key, size_t new_key_len) 1425 { 1426 uint32_t old_hash, new_hash; 1427 struct vcache_key old_vcache_key, new_vcache_key; 1428 vnode_impl_t *vip, *new_vip; 1429 1430 old_vcache_key.vk_mount = mp; 1431 old_vcache_key.vk_key = old_key; 1432 old_vcache_key.vk_key_len = old_key_len; 1433 old_hash = vcache_hash(&old_vcache_key); 1434 1435 new_vcache_key.vk_mount = mp; 1436 new_vcache_key.vk_key = new_key; 1437 new_vcache_key.vk_key_len = new_key_len; 1438 new_hash = vcache_hash(&new_vcache_key); 1439 1440 new_vip = vcache_alloc(); 1441 new_vip->vi_key = new_vcache_key; 1442 1443 /* Insert locked new node used as placeholder. */ 1444 mutex_enter(&vcache_lock); 1445 vip = vcache_hash_lookup(&new_vcache_key, new_hash); 1446 if (vip != NULL) { 1447 vcache_dealloc(new_vip); 1448 return EEXIST; 1449 } 1450 SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask], 1451 new_vip, vi_hash); 1452 1453 /* Replace old nodes key with the temporary copy. 
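 * The node now points at the caller's old_key copy, which therefore
 * has to stay valid until vcache_rekey_exit() installs the final new
 * key.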
*/ 1454 vip = vcache_hash_lookup(&old_vcache_key, old_hash); 1455 KASSERT(vip != NULL); 1456 KASSERT(VIMPL_TO_VNODE(vip) == vp); 1457 KASSERT(vip->vi_key.vk_key != old_vcache_key.vk_key); 1458 vip->vi_key = old_vcache_key; 1459 mutex_exit(&vcache_lock); 1460 return 0; 1461 } 1462 1463 /* 1464 * Key change complete: update old node and remove placeholder. 1465 */ 1466 void 1467 vcache_rekey_exit(struct mount *mp, struct vnode *vp, 1468 const void *old_key, size_t old_key_len, 1469 const void *new_key, size_t new_key_len) 1470 { 1471 uint32_t old_hash, new_hash; 1472 struct vcache_key old_vcache_key, new_vcache_key; 1473 vnode_impl_t *vip, *new_vip; 1474 struct vnode *new_vp; 1475 1476 old_vcache_key.vk_mount = mp; 1477 old_vcache_key.vk_key = old_key; 1478 old_vcache_key.vk_key_len = old_key_len; 1479 old_hash = vcache_hash(&old_vcache_key); 1480 1481 new_vcache_key.vk_mount = mp; 1482 new_vcache_key.vk_key = new_key; 1483 new_vcache_key.vk_key_len = new_key_len; 1484 new_hash = vcache_hash(&new_vcache_key); 1485 1486 mutex_enter(&vcache_lock); 1487 1488 /* Lookup old and new node. */ 1489 vip = vcache_hash_lookup(&old_vcache_key, old_hash); 1490 KASSERT(vip != NULL); 1491 KASSERT(VIMPL_TO_VNODE(vip) == vp); 1492 1493 new_vip = vcache_hash_lookup(&new_vcache_key, new_hash); 1494 KASSERT(new_vip != NULL); 1495 KASSERT(new_vip->vi_key.vk_key_len == new_key_len); 1496 new_vp = VIMPL_TO_VNODE(new_vip); 1497 mutex_enter(new_vp->v_interlock); 1498 VSTATE_ASSERT(VIMPL_TO_VNODE(new_vip), VS_LOADING); 1499 mutex_exit(new_vp->v_interlock); 1500 1501 /* Rekey old node and put it onto its new hashlist. */ 1502 vip->vi_key = new_vcache_key; 1503 if (old_hash != new_hash) { 1504 SLIST_REMOVE(&vcache_hashtab[old_hash & vcache_hashmask], 1505 vip, vnode_impl, vi_hash); 1506 SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask], 1507 vip, vi_hash); 1508 } 1509 1510 /* Remove new node used as placeholder. */ 1511 SLIST_REMOVE(&vcache_hashtab[new_hash & vcache_hashmask], 1512 new_vip, vnode_impl, vi_hash); 1513 vcache_dealloc(new_vip); 1514 } 1515 1516 /* 1517 * Disassociate the underlying file system from a vnode. 1518 * 1519 * Must be called with vnode locked and will return unlocked. 1520 * Must be called with the interlock held, and will return with it held. 1521 */ 1522 static void 1523 vcache_reclaim(vnode_t *vp) 1524 { 1525 lwp_t *l = curlwp; 1526 vnode_impl_t *vip = VNODE_TO_VIMPL(vp); 1527 struct mount *mp = vp->v_mount; 1528 uint32_t hash; 1529 uint8_t temp_buf[64], *temp_key; 1530 size_t temp_key_len; 1531 bool recycle, active; 1532 int error; 1533 1534 KASSERT((vp->v_vflag & VV_LOCKSWORK) == 0 || 1535 VOP_ISLOCKED(vp) == LK_EXCLUSIVE); 1536 KASSERT(mutex_owned(vp->v_interlock)); 1537 KASSERT(vp->v_usecount != 0); 1538 1539 active = (vp->v_usecount > 1); 1540 temp_key_len = vip->vi_key.vk_key_len; 1541 /* 1542 * Prevent the vnode from being recycled or brought into use 1543 * while we clean it out. 1544 */ 1545 VSTATE_CHANGE(vp, VS_LOADED, VS_RECLAIMING); 1546 if (vp->v_iflag & VI_EXECMAP) { 1547 atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages); 1548 atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages); 1549 } 1550 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP); 1551 mutex_exit(vp->v_interlock); 1552 1553 /* Replace the vnode key with a temporary copy. 
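 * The key memory is owned by the file system node and will go away in
 * VOP_RECLAIM(), but the vnode stays on the hash list until after
 * reclaim, so the key has to point at storage owned here.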
*/ 1554 if (vip->vi_key.vk_key_len > sizeof(temp_buf)) { 1555 temp_key = kmem_alloc(temp_key_len, KM_SLEEP); 1556 } else { 1557 temp_key = temp_buf; 1558 } 1559 mutex_enter(&vcache_lock); 1560 memcpy(temp_key, vip->vi_key.vk_key, temp_key_len); 1561 vip->vi_key.vk_key = temp_key; 1562 mutex_exit(&vcache_lock); 1563 1564 fstrans_start(mp); 1565 1566 /* 1567 * Clean out any cached data associated with the vnode. 1568 * If purging an active vnode, it must be closed and 1569 * deactivated before being reclaimed. 1570 */ 1571 error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0); 1572 if (error != 0) { 1573 if (wapbl_vphaswapbl(vp)) 1574 WAPBL_DISCARD(wapbl_vptomp(vp)); 1575 error = vinvalbuf(vp, 0, NOCRED, l, 0, 0); 1576 } 1577 KASSERTMSG((error == 0), "vinvalbuf failed: %d", error); 1578 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); 1579 if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) { 1580 spec_node_revoke(vp); 1581 } 1582 1583 /* 1584 * Disassociate the underlying file system from the vnode. 1585 * VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks 1586 * the vnode, and may destroy the vnode so that VOP_UNLOCK 1587 * would no longer function. 1588 */ 1589 VOP_INACTIVE(vp, &recycle); 1590 KASSERT((vp->v_vflag & VV_LOCKSWORK) == 0 || 1591 VOP_ISLOCKED(vp) == LK_EXCLUSIVE); 1592 if (VOP_RECLAIM(vp)) { 1593 vnpanic(vp, "%s: cannot reclaim", __func__); 1594 } 1595 1596 KASSERT(vp->v_data == NULL); 1597 KASSERT(vp->v_uobj.uo_npages == 0); 1598 1599 if (vp->v_type == VREG && vp->v_ractx != NULL) { 1600 uvm_ra_freectx(vp->v_ractx); 1601 vp->v_ractx = NULL; 1602 } 1603 1604 /* Purge name cache. */ 1605 cache_purge(vp); 1606 1607 /* Remove from vnode cache. */ 1608 hash = vcache_hash(&vip->vi_key); 1609 mutex_enter(&vcache_lock); 1610 KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash)); 1611 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask], 1612 vip, vnode_impl, vi_hash); 1613 mutex_exit(&vcache_lock); 1614 if (temp_key != temp_buf) 1615 kmem_free(temp_key, temp_key_len); 1616 1617 /* Done with purge, notify sleepers of the grim news. */ 1618 mutex_enter(vp->v_interlock); 1619 vp->v_op = dead_vnodeop_p; 1620 vp->v_vflag |= VV_LOCKSWORK; 1621 VSTATE_CHANGE(vp, VS_RECLAIMING, VS_RECLAIMED); 1622 vp->v_tag = VT_NON; 1623 KNOTE(&vp->v_klist, NOTE_REVOKE); 1624 mutex_exit(vp->v_interlock); 1625 1626 /* 1627 * Move to dead mount. Must be after changing the operations 1628 * vector as vnode operations enter the mount before using the 1629 * operations vector. See sys/kern/vnode_if.c. 1630 */ 1631 vp->v_vflag &= ~VV_ROOT; 1632 vfs_ref(dead_rootmount); 1633 vfs_insmntque(vp, dead_rootmount); 1634 1635 mutex_enter(vp->v_interlock); 1636 fstrans_done(mp); 1637 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); 1638 } 1639 1640 /* 1641 * Update outstanding I/O count and do wakeup if requested. 1642 */ 1643 void 1644 vwakeup(struct buf *bp) 1645 { 1646 vnode_t *vp; 1647 1648 if ((vp = bp->b_vp) == NULL) 1649 return; 1650 1651 KASSERT(bp->b_objlock == vp->v_interlock); 1652 KASSERT(mutex_owned(bp->b_objlock)); 1653 1654 if (--vp->v_numoutput < 0) 1655 vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp); 1656 if (vp->v_numoutput == 0) 1657 cv_broadcast(&vp->v_cv); 1658 } 1659 1660 /* 1661 * Test a vnode for being or becoming dead. Returns one of: 1662 * EBUSY: vnode is becoming dead, with "flags == VDEAD_NOWAIT" only. 1663 * ENOENT: vnode is dead. 1664 * 0: otherwise. 1665 * 1666 * Whenever this function returns a non-zero value all future 1667 * calls will also return a non-zero value. 
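 *
 * A hedged usage sketch (illustrative only, not part of this file):
 *
 *	mutex_enter(vp->v_interlock);
 *	error = vdead_check(vp, VDEAD_NOWAIT);
 *	mutex_exit(vp->v_interlock);
 *	if (error != 0)
 *		return error;	-- vnode is dead or being revoked
 *
 * v_interlock must be held across the call; without VDEAD_NOWAIT the
 * check first waits for the vnode state to become stable.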
1668 */ 1669 int 1670 vdead_check(struct vnode *vp, int flags) 1671 { 1672 1673 KASSERT(mutex_owned(vp->v_interlock)); 1674 1675 if (! ISSET(flags, VDEAD_NOWAIT)) 1676 VSTATE_WAIT_STABLE(vp); 1677 1678 if (VSTATE_GET(vp) == VS_RECLAIMING) { 1679 KASSERT(ISSET(flags, VDEAD_NOWAIT)); 1680 return EBUSY; 1681 } else if (VSTATE_GET(vp) == VS_RECLAIMED) { 1682 return ENOENT; 1683 } 1684 1685 return 0; 1686 } 1687 1688 int 1689 vfs_drainvnodes(void) 1690 { 1691 int i, gen; 1692 1693 mutex_enter(&vdrain_lock); 1694 for (i = 0; i < 2; i++) { 1695 gen = vdrain_gen; 1696 while (gen == vdrain_gen) { 1697 cv_broadcast(&vdrain_cv); 1698 cv_wait(&vdrain_gen_cv, &vdrain_lock); 1699 } 1700 } 1701 mutex_exit(&vdrain_lock); 1702 1703 if (numvnodes >= desiredvnodes) 1704 return EBUSY; 1705 1706 if (vcache_hashsize != desiredvnodes) 1707 vcache_reinit(); 1708 1709 return 0; 1710 } 1711 1712 void 1713 vnpanic(vnode_t *vp, const char *fmt, ...) 1714 { 1715 va_list ap; 1716 1717 #ifdef DIAGNOSTIC 1718 vprint(NULL, vp); 1719 #endif 1720 va_start(ap, fmt); 1721 vpanic(fmt, ap); 1722 va_end(ap); 1723 } 1724