/*	$NetBSD: vfs_cache.c,v 1.156 2023/10/02 21:50:18 ad Exp $	*/

/*-
 * Copyright (c) 2008, 2019, 2020, 2023 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.3 (Berkeley) 8/22/94
 */

/*
 * Name caching:
 *
 *	Names found by directory scans are retained in a cache for future
 *	reference.  It is managed LRU, so frequently used names will hang
 *	around.  The cache is indexed by hash value obtained from the name.
 *
 *	The name cache is the brainchild of Robert Elz and was introduced in
 *	4.3BSD.  See "Using gprof to Tune the 4.2BSD Kernel", Marshall Kirk
 *	McKusick, May 21 1984.
 *
 * Data structures:
 *
 *	Most Unix namecaches very sensibly use a global hash table to index
 *	names.  The global hash table works well, but can cause concurrency
 *	headaches for the kernel hacker.  In the NetBSD 10.0 implementation
 *	we are not sensible, and use a per-directory data structure to index
 *	names, but the cache otherwise functions the same.
 *
 *	The index is a red-black tree.  It should not be difficult to
 *	experiment with other types of index, however note that a tree
 *	can trivially be made to support lockless lookup.
 *
 *	Each cached name is stored in a struct namecache, along with a
 *	pointer to the associated vnode (nc_vp).  Names longer than a
 *	maximum length of NCHNAMLEN are allocated with kmem_alloc(); they
 *	occur infrequently, and names of NCHNAMLEN or fewer characters are
 *	stored directly in struct namecache.  If it is a "negative" entry
 *	(i.e. for a name that is known NOT to exist), the vnode pointer
 *	will be NULL.
 *
 *	In practice this implementation is not any slower than the hash
 *	table that preceded it and in some cases it significantly
 *	outperforms the hash table.  Some reasons why this might be:
 *
 *	- natural partitioning provided by the file system structure, which
 *	  the prior implementation discarded (global hash table).
 *	- worst case tree traversal of O(log n); the hash table could have
 *	  many collisions.
 *	- minimized cache misses & total L2/L3 CPU cache footprint; struct
 *	  namecache and vnode_impl_t are laid out to keep cache footprint
 *	  minimal in the lookup path; no hash table buckets to cache.
 *	- minimized number of conditionals & string comparisons.
 *
 *	For a directory with 3 cached names for 3 distinct vnodes, the
 *	various vnodes and namecache structs would be connected like this
 *	(the root is at the bottom of the diagram):
 *
 *          ...
 *          ^
 *          |- vi_nc_tree
 *          |
 *      +----o----+               +---------+               +---------+
 *      |  VDIR   |               |  VCHR   |               |  VREG   |
 *      |  vnode  o-----+         |  vnode  o-----+         |  vnode  o------+
 *      +---------+     |         +---------+     |         +---------+      |
 *           ^          |              ^          |              ^           |
 *           |- nc_vp   |- vi_nc_list  |- nc_vp   |- vi_nc_list  |- nc_vp    |
 *           |          |              |          |              |           |
 *      +----o----+     |         +----o----+     |         +----o----+      |
 *  +---onamecache|<----+     +---onamecache|<----+     +---onamecache|<-----+
 *  |   +---------+           |   +---------+           |   +---------+
 *  |        ^                |        ^                |        ^
 *  |        |                |        |                |        |
 *  |        |  +----------------------+                |        |
 *  |-nc_dvp | +-------------------------------------------------+
 *  |        |/- vi_nc_tree   |                         |
 *  |        |                |- nc_dvp                 |- nc_dvp
 *  |   +----o----+           |                         |
 *  +-->|  VDIR   |<----------+                         |
 *      |  vnode  |<------------------------------------+
 *      +---------+
 *
 *      START HERE
 *
 * Replacement:
 *
 *	As the cache becomes full, old and unused entries are purged as new
 *	entries are added.  The synchronization overhead in maintaining a
 *	strict ordering would be prohibitive, so the VM system's "clock" or
 *	"second chance" page replacement algorithm is aped here.  New
 *	entries go to the tail of the active list.  After they age out and
 *	reach the head of the list, they are moved to the tail of the
 *	inactive list.
 *	Any use of the deactivated cache entry reactivates it, saving it
 *	from impending doom; if not reactivated, the entry eventually
 *	reaches the head of the inactive list and is purged.
 *
 * Concurrency:
 *
 *	From a performance perspective, cache_lookup(nameiop == LOOKUP) is
 *	what really matters; insertion of new entries with cache_enter() is
 *	comparatively infrequent, and overshadowed by the cost of expensive
 *	file system metadata operations (which may involve disk I/O).  We
 *	therefore want to make everything as simple as possible in the
 *	lookup path.
 *
 *	struct namecache is mostly stable except for list and tree related
 *	entries, changes to which don't affect the cached name or vnode.
 *	For changes to name+vnode, entries are purged in preference to
 *	modifying them.
 *
 *	Read access to namecache entries is made via tree, list, or LRU
 *	list.  A lock corresponding to the direction of access should be
 *	held.  See definition of "struct namecache" in src/sys/namei.src,
 *	and the definition of "struct vnode" for the particulars.
 *
 *	Per-CPU statistics and LRU list totals are read unlocked, since an
 *	approximate value is OK.  We maintain 32-bit sized per-CPU counters
 *	and 64-bit global counters since 32-bit sized counters can be
 *	observed locklessly while the global counters are protected by a
 *	mutex.
 *
 *	The lock order is:
 *
 *	1) vi->vi_nc_lock	(tree or parent -> child direction,
 *				 used during forward lookup)
 *
 *	2) vi->vi_nc_listlock	(list or child -> parent direction,
 *				 used during reverse lookup)
 *
 *	3) cache_lru_lock	(LRU list direction, used during reclaim)
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.156 2023/10/02 21:50:18 ad Exp $");

#define __NAMECACHE_PRIVATE
#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#include "opt_dtrace.h"
#endif

#include <sys/param.h>
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/callout.h>
#include <sys/cpu.h>
#include <sys/errno.h>
#include <sys/evcnt.h>
#include <sys/hash.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/pool.h>
#include <sys/sdt.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/vnode_impl.h>

#include <miscfs/genfs/genfs.h>

/*
 * Assert that data structure layout hasn't changed unintentionally.
 */
#ifdef _LP64
CTASSERT(sizeof(struct namecache) == 128);
#else
CTASSERT(sizeof(struct namecache) == 64);
#endif
CTASSERT(NC_NLEN_MASK >= MAXPATHLEN);

static void	cache_activate(struct namecache *);
static void	cache_update_stats(void *);
static int	cache_compare_nodes(void *, const void *, const void *);
static void	cache_deactivate(void);
static void	cache_reclaim(void);
static int	cache_stat_sysctl(SYSCTLFN_ARGS);

/*
 * Global pool cache.
 */
static pool_cache_t cache_pool __read_mostly;

/*
 * LRU replacement.
 */
enum cache_lru_id {
	LRU_ACTIVE,
	LRU_INACTIVE,
	LRU_COUNT
};

static struct {
	TAILQ_HEAD(, namecache) list[LRU_COUNT];
	u_int			count[LRU_COUNT];
} cache_lru __cacheline_aligned;

static kmutex_t cache_lru_lock __cacheline_aligned;

/*
 * Cache effectiveness statistics.  nchstats holds system-wide total.
 */
struct nchstats	nchstats;
struct nchstats_percpu _NAMEI_CACHE_STATS(uint32_t);
struct nchcpu {
	struct nchstats_percpu cur;
	struct nchstats_percpu last;
};
static callout_t cache_stat_callout;
static kmutex_t cache_stat_lock __cacheline_aligned;

#define	COUNT(f) do { \
	lwp_t *l = curlwp; \
	KPREEMPT_DISABLE(l); \
	struct nchcpu *nchcpu = curcpu()->ci_data.cpu_nch; \
	nchcpu->cur.f++; \
	KPREEMPT_ENABLE(l); \
} while (/* CONSTCOND */ 0)

#define	UPDATE(nchcpu, f) do { \
	uint32_t cur = atomic_load_relaxed(&nchcpu->cur.f); \
	nchstats.f += (uint32_t)(cur - nchcpu->last.f); \
	nchcpu->last.f = cur; \
} while (/* CONSTCOND */ 0)
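
/*
 * An illustrative note (values below are hypothetical): UPDATE() folds the
 * 32-bit per-CPU counters into the 64-bit globals by delta, and because the
 * subtraction is performed in uint32_t the delta is still correct when a
 * per-CPU counter has wrapped since the last update.  For example:
 *
 *	last = 0xfffffff0, cur = 0x00000010
 *	(uint32_t)(cur - last) == 0x20, and the 64-bit total advances by 32.
 *
 * This only holds as long as a counter cannot advance by 2^32 or more
 * between updates, which is why cache_update_stats() is driven by a timer.
 */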

/*
 * Tunables.  cache_maxlen replaces the historical doingcache:
 * set it to zero to disable caching for debugging purposes.
 */
int cache_lru_maxdeact __read_mostly = 2;	/* max # to deactivate */
int cache_lru_maxscan __read_mostly = 64;	/* max # to scan/reclaim */
int cache_maxlen __read_mostly = NC_NLEN_MASK;	/* max name length to cache */
int cache_stat_interval __read_mostly = 300;	/* in seconds */

/*
 * sysctl stuff.
 */
static struct sysctllog *cache_sysctllog;

/*
 * This is a dummy name that cannot usually occur anywhere in the cache nor
 * file system.  It's used when caching the root vnode of mounted file
 * systems.  The name is attached to the directory that the file system is
 * mounted on.
 */
static const char cache_mp_name[] = "";
static const int cache_mp_nlen = sizeof(cache_mp_name) - 1;

/*
 * Red-black tree stuff.
 */
static const rb_tree_ops_t cache_rbtree_ops = {
	.rbto_compare_nodes = cache_compare_nodes,
	.rbto_compare_key = cache_compare_nodes,
	.rbto_node_offset = offsetof(struct namecache, nc_tree),
	.rbto_context = NULL
};

/*
 * dtrace probes.
 */
SDT_PROBE_DEFINE1(vfs, namecache, invalidate, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, parents, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, children, "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, purge, name, "char *", "size_t");
SDT_PROBE_DEFINE1(vfs, namecache, purge, vfs, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *",
    "char *", "size_t");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, miss, "struct vnode *",
    "char *", "size_t");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, toolong, "struct vnode *",
    "char *", "size_t");
SDT_PROBE_DEFINE2(vfs, namecache, revlookup, success, "struct vnode *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, revlookup, fail, "struct vnode *",
    "int");
SDT_PROBE_DEFINE2(vfs, namecache, prune, done, "int", "int");
SDT_PROBE_DEFINE3(vfs, namecache, enter, toolong, "struct vnode *",
    "char *", "size_t");
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *",
    "char *", "size_t");

/*
 * rbtree: compare two nodes.
 */
static int
cache_compare_nodes(void *context, const void *n1, const void *n2)
{
	const struct namecache *nc1 = n1;
	const struct namecache *nc2 = n2;

	if (nc1->nc_key < nc2->nc_key) {
		return -1;
	}
	if (nc1->nc_key > nc2->nc_key) {
		return 1;
	}
	KASSERT(NC_NLEN(nc1) == NC_NLEN(nc2));
	return memcmp(nc1->nc_name, nc2->nc_name, NC_NLEN(nc1));
}
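
/*
 * A sketch of the key layout produced by cache_key() below (the hash value
 * here is hypothetical, for illustration only): the name hash is shifted up
 * and the name length is packed into the low NC_NLEN_BITS bits, e.g. for a
 * 3-character name with hash 0x1234abcd:
 *
 *	key = (0x1234abcd << NC_NLEN_BITS) | 3;
 *
 * Two entries with equal keys therefore always have equal lengths, which is
 * why cache_compare_nodes() can assert that and go straight to memcmp()
 * without a separate length comparison.
 */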

/*
 * Compute a key value for the given name.  The name length is encoded in
 * the key value to try to improve uniqueness, and so that the length
 * doesn't need to be compared separately for string comparisons.
 */
static uintptr_t
cache_key(const char *name, size_t nlen)
{
	uintptr_t key;

	KASSERT((nlen & ~NC_NLEN_MASK) == 0);

	key = hash32_buf(name, nlen, HASH32_STR_INIT);
	return (key << NC_NLEN_BITS) | (uintptr_t)nlen;
}

/*
 * Remove an entry from the cache.  vi_nc_lock must be held, and if dir2node
 * is true, then we're locking in the conventional direction and the list
 * lock will be acquired when removing the entry from the vnode list.
 */
static void
cache_remove(struct namecache *ncp, const bool dir2node)
{
	struct vnode *vp, *dvp = ncp->nc_dvp;
	vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
	size_t namelen = NC_NLEN(ncp);

	KASSERT(rw_write_held(&dvi->vi_nc_lock));
	KASSERT(cache_key(ncp->nc_name, namelen) == ncp->nc_key);
	KASSERT(rb_tree_find_node(&dvi->vi_nc_tree, ncp) == ncp);

	SDT_PROBE(vfs, namecache, invalidate, done, ncp, 0, 0, 0, 0);

	/*
	 * Remove from the vnode's list.  This excludes cache_revlookup(),
	 * and then it's safe to remove from the LRU lists.
	 */
	if ((vp = ncp->nc_vp) != NULL) {
		vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
		if (__predict_true(dir2node)) {
			rw_enter(&vi->vi_nc_listlock, RW_WRITER);
			TAILQ_REMOVE(&vi->vi_nc_list, ncp, nc_list);
			rw_exit(&vi->vi_nc_listlock);
		} else {
			TAILQ_REMOVE(&vi->vi_nc_list, ncp, nc_list);
		}
	}

	/* Remove from the directory's rbtree. */
	rb_tree_remove_node(&dvi->vi_nc_tree, ncp);

	/* Remove from the LRU lists. */
	mutex_enter(&cache_lru_lock);
	TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru);
	cache_lru.count[ncp->nc_lrulist]--;
	mutex_exit(&cache_lru_lock);

	/* Finally, free it. */
	if (namelen > NCHNAMLEN) {
		size_t sz = offsetof(struct namecache, nc_name[namelen]);
		kmem_free(ncp, sz);
	} else {
		pool_cache_put(cache_pool, ncp);
	}
}

/*
 * Find a single cache entry and return it.  vi_nc_lock must be held.
 */
static struct namecache * __noinline
cache_lookup_entry(struct vnode *dvp, const char *name, size_t namelen,
    uintptr_t key)
{
	vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
	struct rb_node *node = dvi->vi_nc_tree.rbt_root;
	struct namecache *ncp;
	enum cache_lru_id lrulist;
	int diff;

	KASSERT(namelen <= MAXPATHLEN);
	KASSERT(rw_lock_held(&dvi->vi_nc_lock));

	/*
	 * Search the RB tree for the key.  This is an inlined lookup
	 * tailored for exactly what's needed here that turns out to be
	 * quite a bit faster than using rb_tree_find_node().
	 *
	 * For a matching key memcmp() needs to be called once to confirm
	 * that the correct name has been found.  Very rarely there will be
	 * a key value collision and the search will continue.
	 */
	for (;;) {
		if (__predict_false(RB_SENTINEL_P(node))) {
			return NULL;
		}
		ncp = (struct namecache *)node;
		KASSERT((void *)&ncp->nc_tree == (void *)ncp);
		KASSERT(ncp->nc_dvp == dvp);
		if (ncp->nc_key == key) {
			KASSERT(NC_NLEN(ncp) == namelen);
			diff = memcmp(ncp->nc_name, name, namelen);
			if (__predict_true(diff == 0)) {
				break;
			}
			node = node->rb_nodes[diff < 0];
		} else {
			node = node->rb_nodes[ncp->nc_key < key];
		}
	}

	/*
	 * If the entry is on the wrong LRU list, requeue it.  This is an
	 * unlocked check, but it will rarely be wrong and even then there
	 * will be no harm caused.
	 */
	lrulist = atomic_load_relaxed(&ncp->nc_lrulist);
	if (__predict_false(lrulist != LRU_ACTIVE)) {
		cache_activate(ncp);
	}
	return ncp;
}

/*
 * Look for the name in the cache.  We don't do this if the segment name
 * is long, simply so the cache can avoid holding long names (which would
 * either waste space or add greatly to the complexity).
 *
 * Lookup is called with DVP pointing to the directory to search,
 * and CNP providing the name of the entry being sought: cn_nameptr
 * is the name, cn_namelen is its length, and cn_flags is the flags
 * word from the namei operation.
 *
 * DVP must be locked.
 *
 * There are three possible non-error return states:
 *    1. Nothing was found in the cache.  Nothing is known about
 *       the requested name.
 *    2. A negative entry was found in the cache, meaning that the
 *       requested name definitely does not exist.
 *    3. A positive entry was found in the cache, meaning that the
 *       requested name does exist and that we are providing the
 *       vnode.
 * In these cases the results are:
 *    1. 0 returned; VN is set to NULL.
 *    2. 1 returned; VN is set to NULL.
 *    3. 1 returned; VN is set to the vnode found.
 *
 * The additional result argument ISWHT is set to zero, unless a
 * negative entry is found that was entered as a whiteout, in which
 * case ISWHT is set to one.
 *
 * The ISWHT_RET argument pointer may be null.  In this case an
 * assertion is made that the whiteout flag is not set.  File systems
 * that do not support whiteouts can/should do this.
 *
 * Filesystems that do support whiteouts should add ISWHITEOUT to
 * cnp->cn_flags if ISWHT comes back nonzero.
 *
 * When a vnode is returned, it is locked, as per the vnode lookup
 * locking protocol.
 *
 * There is no way for this function to fail, in the sense of
 * generating an error that requires aborting the namei operation.
 *
 * (Prior to October 2012, this function returned an integer status,
 * and a vnode, and mucked with the flags word in CNP for whiteouts.
 * The integer status was -1 for "nothing found", ENOENT for "a
 * negative entry found", 0 for "a positive entry found", and possibly
 * other errors, and the value of VN might or might not have been set
 * depending on what error occurred.)
 */
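
/*
 * An illustrative sketch of a typical caller (simplified; real VOP_LOOKUP
 * implementations have more cases to handle, e.g. whiteouts and
 * create/rename of the last component):
 *
 *	struct vnode *vp;
 *	int iswht;
 *
 *	if (cache_lookup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
 *	    cnp->cn_nameiop, cnp->cn_flags, &iswht, &vp)) {
 *		if (vp == NULL)
 *			return ENOENT;	(negative entry: name doesn't exist)
 *		*vpp = vp;
 *		return 0;		(positive entry: vnode referenced)
 *	}
 *	(cache miss: fall through and scan the directory metadata)
 */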
bool
cache_lookup(struct vnode *dvp, const char *name, size_t namelen,
    uint32_t nameiop, uint32_t cnflags,
    int *iswht_ret, struct vnode **vn_ret)
{
	vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
	struct namecache *ncp;
	struct vnode *vp;
	uintptr_t key;
	int error;
	bool hit;
	krw_t op;

	KASSERT(namelen != cache_mp_nlen || name == cache_mp_name);

	/* Establish default result values */
	if (iswht_ret != NULL) {
		*iswht_ret = 0;
	}
	*vn_ret = NULL;

	if (__predict_false(namelen > cache_maxlen)) {
		SDT_PROBE(vfs, namecache, lookup, toolong, dvp,
		    name, namelen, 0, 0);
		COUNT(ncs_long);
		return false;
	}

	/* Compute the key up front - don't need the lock. */
	key = cache_key(name, namelen);

	/* Could the entry be purged below? */
	if ((cnflags & ISLASTCN) != 0 &&
	    ((cnflags & MAKEENTRY) == 0 || nameiop == CREATE)) {
		op = RW_WRITER;
	} else {
		op = RW_READER;
	}

	/* Now look for the name. */
	rw_enter(&dvi->vi_nc_lock, op);
	ncp = cache_lookup_entry(dvp, name, namelen, key);
	if (__predict_false(ncp == NULL)) {
		rw_exit(&dvi->vi_nc_lock);
		COUNT(ncs_miss);
		SDT_PROBE(vfs, namecache, lookup, miss, dvp,
		    name, namelen, 0, 0);
		return false;
	}
	if (__predict_false((cnflags & MAKEENTRY) == 0)) {
		/*
		 * Last component and we are renaming or deleting,
		 * the cache entry is invalid, or otherwise don't
		 * want cache entry to exist.
		 */
		KASSERT((cnflags & ISLASTCN) != 0);
		cache_remove(ncp, true);
		rw_exit(&dvi->vi_nc_lock);
		COUNT(ncs_badhits);
		return false;
	}
	if ((vp = ncp->nc_vp) == NULL) {
		if (iswht_ret != NULL) {
			/*
			 * Restore the ISWHITEOUT flag saved earlier.
			 */
			*iswht_ret = ncp->nc_whiteout;
		} else {
			KASSERT(!ncp->nc_whiteout);
		}
		if (nameiop == CREATE && (cnflags & ISLASTCN) != 0) {
			/*
			 * Last component and we are preparing to create
			 * the named object, so flush the negative cache
			 * entry.
			 */
			COUNT(ncs_badhits);
			cache_remove(ncp, true);
			hit = false;
		} else {
			COUNT(ncs_neghits);
			SDT_PROBE(vfs, namecache, lookup, hit, dvp, name,
			    namelen, 0, 0);
			/* found neg entry; vn is already null from above */
			hit = true;
		}
		rw_exit(&dvi->vi_nc_lock);
		return hit;
	}
	error = vcache_tryvget(vp);
	rw_exit(&dvi->vi_nc_lock);
	if (error) {
		KASSERT(error == EBUSY);
		/*
		 * This vnode is being cleaned out.
		 * XXX badhits?
		 */
		COUNT(ncs_falsehits);
		return false;
	}

	COUNT(ncs_goodhits);
	SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0);
	/* found it */
	*vn_ret = vp;
	return true;
}

/*
 * Version of the above without the nameiop argument, for NFS.
 */
bool
cache_lookup_raw(struct vnode *dvp, const char *name, size_t namelen,
    uint32_t cnflags,
    int *iswht_ret, struct vnode **vn_ret)
{

	return cache_lookup(dvp, name, namelen, LOOKUP, cnflags | MAKEENTRY,
	    iswht_ret, vn_ret);
}

/*
 * Used by namei() to walk down a path, component by component, by looking
 * up names in the cache.  The node locks are chained along the way: a
 * parent's lock is not dropped until the child's is acquired.
 */
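
/*
 * An illustrative sketch of the chained walk (simplified; namei() itself
 * also handles mount points, "..", symlinks and the final component, and a
 * true return with vp == NULL means a negative entry was found):
 *
 *	struct vnode *dvp = startdir, *vp;
 *	krwlock_t *plock = NULL;
 *
 *	for each component "name" in the path {
 *		if (!cache_lookup_linked(dvp, name, namelen, &vp,
 *		    &plock, cred))
 *			break;		(fall back to the locked lookup path)
 *		dvp = vp;		(the searched directory's lock is
 *					 still held via plock)
 *	}
 *	if (plock != NULL)
 *		rw_exit(plock);		(release the last directory lock)
 */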
bool
cache_lookup_linked(struct vnode *dvp, const char *name, size_t namelen,
    struct vnode **vn_ret, krwlock_t **plock,
    kauth_cred_t cred)
{
	vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
	struct namecache *ncp;
	krwlock_t *oldlock, *newlock;
	struct vnode *vp;
	uintptr_t key;
	int error;

	KASSERT(namelen != cache_mp_nlen || name == cache_mp_name);

	/* If disabled, or file system doesn't support this, bail out. */
	if (__predict_false((dvp->v_mount->mnt_iflag & IMNT_NCLOOKUP) == 0)) {
		return false;
	}

	if (__predict_false(namelen > cache_maxlen)) {
		COUNT(ncs_long);
		return false;
	}

	/* Compute the key up front - don't need the lock. */
	key = cache_key(name, namelen);

	/*
	 * Acquire the directory lock.  Once we have that, we can drop the
	 * previous one (if any).
	 *
	 * The two lock holds mean that the directory can't go away while
	 * here: the directory must be purged with cache_purge() before
	 * being freed, and both parent & child's vi_nc_lock must be taken
	 * before that point is passed.
	 *
	 * However if there's no previous lock, like at the root of the
	 * chain, then "dvp" must be referenced to prevent dvp going away
	 * before we get its lock.
	 *
	 * Note that the two locks can be the same if looking up a dot, for
	 * example: /usr/bin/.  If looking up the parent (..) we can't wait
	 * on the lock as child -> parent is the wrong direction.
	 */
	if (*plock != &dvi->vi_nc_lock) {
		oldlock = *plock;
		newlock = &dvi->vi_nc_lock;
		if (!rw_tryenter(&dvi->vi_nc_lock, RW_READER)) {
			return false;
		}
	} else {
		oldlock = NULL;
		newlock = NULL;
		if (*plock == NULL) {
			KASSERT(vrefcnt(dvp) > 0);
		}
	}

	/*
	 * First up, check if the user is allowed to look up files in this
	 * directory.
	 */
	if (cred != FSCRED) {
		if (dvi->vi_nc_mode == VNOVAL) {
			if (newlock != NULL) {
				rw_exit(newlock);
			}
			return false;
		}
		KASSERT(dvi->vi_nc_uid != VNOVAL);
		KASSERT(dvi->vi_nc_gid != VNOVAL);
		error = kauth_authorize_vnode(cred,
		    KAUTH_ACCESS_ACTION(VEXEC,
		    dvp->v_type, dvi->vi_nc_mode & ALLPERMS), dvp, NULL,
		    genfs_can_access(dvp, cred, dvi->vi_nc_uid, dvi->vi_nc_gid,
		    dvi->vi_nc_mode & ALLPERMS, NULL, VEXEC));
		if (error != 0) {
			if (newlock != NULL) {
				rw_exit(newlock);
			}
			COUNT(ncs_denied);
			return false;
		}
	}

	/*
	 * Now look for a matching cache entry.
	 */
	ncp = cache_lookup_entry(dvp, name, namelen, key);
	if (__predict_false(ncp == NULL)) {
		if (newlock != NULL) {
			rw_exit(newlock);
		}
		COUNT(ncs_miss);
		SDT_PROBE(vfs, namecache, lookup, miss, dvp,
		    name, namelen, 0, 0);
		return false;
	}
	if ((vp = ncp->nc_vp) == NULL) {
		/* found negative entry; vn is already null from above */
		KASSERT(namelen != cache_mp_nlen);
		KASSERT(name != cache_mp_name);
		COUNT(ncs_neghits);
	} else {
		COUNT(ncs_goodhits);	/* XXX can be "badhits" */
	}
	SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0);

	/*
	 * Return with the directory lock still held.  It will either be
	 * returned to us with another call to cache_lookup_linked() when
	 * looking up the next component, or the caller will release it
	 * manually when finished.
	 */
	if (oldlock) {
		rw_exit(oldlock);
	}
	if (newlock) {
		*plock = newlock;
	}
	*vn_ret = vp;
	return true;
}

/*
 * Scan cache looking for name of directory entry pointing at vp.
 * Will not search for "." or "..".
 *
 * If the lookup succeeds the vnode is referenced and stored in dvpp.
 *
 * If bufp is non-NULL, also place the name in the buffer which starts
 * at bufp, immediately before *bpp, and move bpp backwards to point
 * at the start of it.  (Yes, this is a little baroque, but it's done
 * this way to cater to the whims of getcwd).
 *
 * Returns 0 on success, -1 on cache miss, positive errno on failure.
 */
int
cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp,
    bool checkaccess, accmode_t accmode)
{
	vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
	struct namecache *ncp;
	enum cache_lru_id lrulist;
	struct vnode *dvp;
	int error, nlen;
	char *bp;

	KASSERT(vp != NULL);

	if (cache_maxlen == 0)
		goto out;

	rw_enter(&vi->vi_nc_listlock, RW_READER);
	if (checkaccess) {
		/*
		 * Check if the user is allowed to see.  NOTE: this is
		 * checking for access on the "wrong" directory.  getcwd()
		 * wants to see that there is access on every component
		 * along the way, not that there is access to any individual
		 * component.  Don't use this to check you can look in vp.
		 *
		 * I don't like it, I didn't come up with it, don't blame me!
		 */
		if (vi->vi_nc_mode == VNOVAL) {
			rw_exit(&vi->vi_nc_listlock);
			return -1;
		}
		KASSERT(vi->vi_nc_uid != VNOVAL);
		KASSERT(vi->vi_nc_gid != VNOVAL);
		error = kauth_authorize_vnode(kauth_cred_get(),
		    KAUTH_ACCESS_ACTION(VEXEC, vp->v_type, vi->vi_nc_mode &
		    ALLPERMS), vp, NULL, genfs_can_access(vp, curlwp->l_cred,
		    vi->vi_nc_uid, vi->vi_nc_gid, vi->vi_nc_mode & ALLPERMS,
		    NULL, accmode));
		if (error != 0) {
			rw_exit(&vi->vi_nc_listlock);
			COUNT(ncs_denied);
			return EACCES;
		}
	}
	TAILQ_FOREACH(ncp, &vi->vi_nc_list, nc_list) {
		KASSERT(ncp->nc_vp == vp);
		KASSERT(ncp->nc_dvp != NULL);
		nlen = NC_NLEN(ncp);

		/*
		 * Ignore mountpoint entries.
		 */
		if (nlen == cache_mp_nlen) {
			continue;
		}

		/*
		 * The queue is partially sorted.  Once we hit dots, nothing
		 * else remains but dots and dotdots, so bail out.
		 */
		if (ncp->nc_name[0] == '.') {
			if (nlen == 1 ||
			    (nlen == 2 && ncp->nc_name[1] == '.')) {
				break;
			}
		}

		/*
		 * Record a hit on the entry.  This is an unlocked read but
		 * even if wrong it doesn't matter too much.
		 */
		lrulist = atomic_load_relaxed(&ncp->nc_lrulist);
		if (lrulist != LRU_ACTIVE) {
			cache_activate(ncp);
		}

		if (bufp) {
			bp = *bpp;
			bp -= nlen;
			if (bp <= bufp) {
				*dvpp = NULL;
				rw_exit(&vi->vi_nc_listlock);
				SDT_PROBE(vfs, namecache, revlookup,
				    fail, vp, ERANGE, 0, 0, 0);
				return (ERANGE);
			}
			memcpy(bp, ncp->nc_name, nlen);
			*bpp = bp;
		}

		dvp = ncp->nc_dvp;
		error = vcache_tryvget(dvp);
		rw_exit(&vi->vi_nc_listlock);
		if (error) {
			KASSERT(error == EBUSY);
			if (bufp)
				(*bpp) += nlen;
			*dvpp = NULL;
			SDT_PROBE(vfs, namecache, revlookup, fail, vp,
			    error, 0, 0, 0);
			return -1;
		}
		*dvpp = dvp;
		SDT_PROBE(vfs, namecache, revlookup, success, vp, dvp,
		    0, 0, 0);
		COUNT(ncs_revhits);
		return (0);
	}
	rw_exit(&vi->vi_nc_listlock);
	COUNT(ncs_revmiss);
 out:
	*dvpp = NULL;
	return (-1);
}

/*
 * Add an entry to the cache.
 */
void
cache_enter(struct vnode *dvp, struct vnode *vp,
    const char *name, size_t namelen, uint32_t cnflags)
{
	vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
	struct namecache *ncp, *oncp;
	int total;

	KASSERT(namelen != cache_mp_nlen || name == cache_mp_name);

	/* First, check whether we can/should add a cache entry. */
	if ((cnflags & MAKEENTRY) == 0 ||
	    __predict_false(namelen > cache_maxlen)) {
		SDT_PROBE(vfs, namecache, enter, toolong, vp, name, namelen,
		    0, 0);
		return;
	}

	SDT_PROBE(vfs, namecache, enter, done, vp, name, namelen, 0, 0);

	/*
	 * Reclaim some entries if over budget.  This is an unlocked check,
	 * but it doesn't matter.  Just need to catch up with things
	 * eventually: it doesn't matter if we go over temporarily.
	 */
	total = atomic_load_relaxed(&cache_lru.count[LRU_ACTIVE]);
	total += atomic_load_relaxed(&cache_lru.count[LRU_INACTIVE]);
	if (__predict_false(total > desiredvnodes)) {
		cache_reclaim();
	}

	/* Now allocate a fresh entry. */
	if (__predict_true(namelen <= NCHNAMLEN)) {
		ncp = pool_cache_get(cache_pool, PR_WAITOK);
	} else {
		size_t sz = offsetof(struct namecache, nc_name[namelen]);
		ncp = kmem_alloc(sz, KM_SLEEP);
	}

	/*
	 * Fill in cache info.  For negative hits, save the ISWHITEOUT flag
	 * so we can restore it later when the cache entry is used again.
	 */
	ncp->nc_vp = vp;
	ncp->nc_dvp = dvp;
	ncp->nc_key = cache_key(name, namelen);
	ncp->nc_whiteout = ((cnflags & ISWHITEOUT) != 0);
	memcpy(ncp->nc_name, name, namelen);

	/*
	 * Insert to the directory.  Concurrent lookups may race for a cache
	 * entry.  If there's an entry there already, purge it.
	 */
	rw_enter(&dvi->vi_nc_lock, RW_WRITER);
	oncp = rb_tree_insert_node(&dvi->vi_nc_tree, ncp);
	if (oncp != ncp) {
		KASSERT(oncp->nc_key == ncp->nc_key);
		KASSERT(NC_NLEN(oncp) == NC_NLEN(ncp));
		KASSERT(memcmp(oncp->nc_name, name, namelen) == 0);
		cache_remove(oncp, true);
		oncp = rb_tree_insert_node(&dvi->vi_nc_tree, ncp);
		KASSERT(oncp == ncp);
	}

	/*
	 * With the directory lock still held, insert to the tail of the
	 * ACTIVE LRU list (new) and take the opportunity to incrementally
	 * balance the lists.
	 */
	mutex_enter(&cache_lru_lock);
	ncp->nc_lrulist = LRU_ACTIVE;
	cache_lru.count[LRU_ACTIVE]++;
	TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru);
	cache_deactivate();
	mutex_exit(&cache_lru_lock);

	/*
	 * Finally, insert to the vnode and unlock.  With everything set up
	 * it's safe to let cache_revlookup() see the entry.  Partially sort
	 * the per-vnode list: dots go to back so cache_revlookup() doesn't
	 * have to consider them.
	 */
	if (vp != NULL) {
		vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
		rw_enter(&vi->vi_nc_listlock, RW_WRITER);
		if ((namelen == 1 && name[0] == '.') ||
		    (namelen == 2 && name[0] == '.' && name[1] == '.')) {
			TAILQ_INSERT_TAIL(&vi->vi_nc_list, ncp, nc_list);
		} else {
			TAILQ_INSERT_HEAD(&vi->vi_nc_list, ncp, nc_list);
		}
		rw_exit(&vi->vi_nc_listlock);
	}
	rw_exit(&dvi->vi_nc_lock);
}

/*
 * Set identity info in cache for a vnode.  We only care about directories
 * so ignore other updates.  The cached info may be marked invalid if the
 * inode has an ACL.
 */
void
cache_enter_id(struct vnode *vp, mode_t mode, uid_t uid, gid_t gid, bool valid)
{
	vnode_impl_t *vi = VNODE_TO_VIMPL(vp);

	if (vp->v_type == VDIR) {
		/* Grab both locks, for forward & reverse lookup. */
		rw_enter(&vi->vi_nc_lock, RW_WRITER);
		rw_enter(&vi->vi_nc_listlock, RW_WRITER);
		if (valid) {
			vi->vi_nc_mode = mode;
			vi->vi_nc_uid = uid;
			vi->vi_nc_gid = gid;
		} else {
			vi->vi_nc_mode = VNOVAL;
			vi->vi_nc_uid = VNOVAL;
			vi->vi_nc_gid = VNOVAL;
		}
		rw_exit(&vi->vi_nc_listlock);
		rw_exit(&vi->vi_nc_lock);
	}
}

/*
 * Return true if we have identity for the given vnode, and use as an
 * opportunity to confirm that everything squares up.
 *
 * Because of shared code, some file systems could provide partial
 * information, missing some updates, so check the mount flag too.
 */
bool
cache_have_id(struct vnode *vp)
{

	if (vp->v_type == VDIR &&
	    (vp->v_mount->mnt_iflag & IMNT_NCLOOKUP) != 0 &&
	    atomic_load_relaxed(&VNODE_TO_VIMPL(vp)->vi_nc_mode) != VNOVAL) {
		return true;
	} else {
		return false;
	}
}

/*
 * Enter a mount point.  cvp is the covered vnode, and rvp is the root of
 * the mounted file system.
 */
void
cache_enter_mount(struct vnode *cvp, struct vnode *rvp)
{

	KASSERT(vrefcnt(cvp) > 0);
	KASSERT(vrefcnt(rvp) > 0);
	KASSERT(cvp->v_type == VDIR);
	KASSERT((rvp->v_vflag & VV_ROOT) != 0);

	if (rvp->v_type == VDIR) {
		cache_enter(cvp, rvp, cache_mp_name, cache_mp_nlen, MAKEENTRY);
	}
}

/*
 * Look up a cached mount point.  Used in the strongly locked path.
 */
bool
cache_lookup_mount(struct vnode *dvp, struct vnode **vn_ret)
{
	bool ret;

	ret = cache_lookup(dvp, cache_mp_name, cache_mp_nlen, LOOKUP,
	    MAKEENTRY, NULL, vn_ret);
	KASSERT((*vn_ret != NULL) == ret);
	return ret;
}

/*
 * Try to cross a mount point.  For use with cache_lookup_linked().
 */
bool
cache_cross_mount(struct vnode **dvp, krwlock_t **plock)
{

	return cache_lookup_linked(*dvp, cache_mp_name, cache_mp_nlen,
	    dvp, plock, FSCRED);
}
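
/*
 * An illustrative sketch of how the dummy "" name ties mount points into
 * the lockless walk (simplified; namei() has more to do when crossing):
 *
 *	mount:	cache_enter_mount(cvp, rvp);
 *		(attaches rvp to the covered directory cvp under the
 *		 zero-length name cache_mp_name)
 *
 *	walk:	if (directory vp has a file system mounted on it &&
 *		    cache_cross_mount(&vp, &plock)) {
 *			(vp now points at the root of the mounted file
 *			 system; plock still refers to the covered
 *			 directory's lock and is handed on to the next
 *			 cache_lookup_linked() call as usual)
 *		}
 *
 * cache_revlookup() skips these entries, since the empty name must never
 * appear in a reconstructed path.
 */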

/*
 * Name cache initialization, from vfs_init() when the system is booting.
 */
void
nchinit(void)
{

	cache_pool = pool_cache_init(sizeof(struct namecache),
	    coherency_unit, 0, 0, "namecache", NULL, IPL_NONE, NULL,
	    NULL, NULL);
	KASSERT(cache_pool != NULL);

	mutex_init(&cache_lru_lock, MUTEX_DEFAULT, IPL_NONE);
	TAILQ_INIT(&cache_lru.list[LRU_ACTIVE]);
	TAILQ_INIT(&cache_lru.list[LRU_INACTIVE]);

	mutex_init(&cache_stat_lock, MUTEX_DEFAULT, IPL_NONE);
	callout_init(&cache_stat_callout, CALLOUT_MPSAFE);
	callout_setfunc(&cache_stat_callout, cache_update_stats, NULL);
	callout_schedule(&cache_stat_callout, cache_stat_interval * hz);

	KASSERT(cache_sysctllog == NULL);
	sysctl_createv(&cache_sysctllog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_STRUCT, "namecache_stats",
	    SYSCTL_DESCR("namecache statistics"),
	    cache_stat_sysctl, 0, NULL, 0,
	    CTL_VFS, CTL_CREATE, CTL_EOL);
}

/*
 * Called once for each CPU in the system as attached.
 */
void
cache_cpu_init(struct cpu_info *ci)
{
	size_t sz;

	sz = roundup2(sizeof(struct nchcpu), coherency_unit);
	ci->ci_data.cpu_nch = kmem_zalloc(sz, KM_SLEEP);
	KASSERT(((uintptr_t)ci->ci_data.cpu_nch & (coherency_unit - 1)) == 0);
}

/*
 * A vnode is being allocated: set up cache structures.
 */
void
cache_vnode_init(struct vnode *vp)
{
	vnode_impl_t *vi = VNODE_TO_VIMPL(vp);

	rw_init(&vi->vi_nc_lock);
	rw_init(&vi->vi_nc_listlock);
	rb_tree_init(&vi->vi_nc_tree, &cache_rbtree_ops);
	TAILQ_INIT(&vi->vi_nc_list);
	vi->vi_nc_mode = VNOVAL;
	vi->vi_nc_uid = VNOVAL;
	vi->vi_nc_gid = VNOVAL;
}

/*
 * A vnode is being freed: finish cache structures.
 */
void
cache_vnode_fini(struct vnode *vp)
{
	vnode_impl_t *vi = VNODE_TO_VIMPL(vp);

	KASSERT(RB_TREE_MIN(&vi->vi_nc_tree) == NULL);
	KASSERT(TAILQ_EMPTY(&vi->vi_nc_list));
	rw_destroy(&vi->vi_nc_lock);
	rw_destroy(&vi->vi_nc_listlock);
}

/*
 * Helper for cache_purge1(): purge cache entries for the given vnode from
 * all directories that the vnode is cached in.
 */
static void
cache_purge_parents(struct vnode *vp)
{
	vnode_impl_t *dvi, *vi = VNODE_TO_VIMPL(vp);
	struct vnode *dvp, *blocked;
	struct namecache *ncp;

	SDT_PROBE(vfs, namecache, purge, parents, vp, 0, 0, 0, 0);

	blocked = NULL;

	rw_enter(&vi->vi_nc_listlock, RW_WRITER);
	while ((ncp = TAILQ_FIRST(&vi->vi_nc_list)) != NULL) {
		/*
		 * Locking in the wrong direction.  Try for a hold on the
		 * directory node's lock, and if we get it then all good,
		 * nuke the entry and move on to the next.
		 */
		dvp = ncp->nc_dvp;
		dvi = VNODE_TO_VIMPL(dvp);
		if (rw_tryenter(&dvi->vi_nc_lock, RW_WRITER)) {
			cache_remove(ncp, false);
			rw_exit(&dvi->vi_nc_lock);
			blocked = NULL;
			continue;
		}

		/*
		 * We can't wait on the directory node's lock with our list
		 * lock held or the system could deadlock.
		 *
		 * Take a hold on the directory vnode to prevent it from
		 * being freed (taking the vnode & lock with it).  Then
		 * wait for the lock to become available with no other locks
		 * held, and retry.
		 *
		 * If this happens twice in a row, give the other side a
		 * breather; we can do nothing until it lets go.
		 */
		vhold(dvp);
		rw_exit(&vi->vi_nc_listlock);
		rw_enter(&dvi->vi_nc_lock, RW_WRITER);
		/* Do nothing. */
		rw_exit(&dvi->vi_nc_lock);
		holdrele(dvp);
		if (blocked == dvp) {
			kpause("ncpurge", false, 1, NULL);
		}
		rw_enter(&vi->vi_nc_listlock, RW_WRITER);
		blocked = dvp;
	}
	rw_exit(&vi->vi_nc_listlock);
}

/*
 * Helper for cache_purge1(): purge all cache entries hanging off the given
 * directory vnode.
 */
static void
cache_purge_children(struct vnode *dvp)
{
	vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
	struct namecache *ncp;

	SDT_PROBE(vfs, namecache, purge, children, dvp, 0, 0, 0, 0);

	rw_enter(&dvi->vi_nc_lock, RW_WRITER);
	while ((ncp = RB_TREE_MIN(&dvi->vi_nc_tree)) != NULL) {
		cache_remove(ncp, true);
	}
	rw_exit(&dvi->vi_nc_lock);
}

/*
 * Helper for cache_purge1(): purge cache entry from the given vnode,
 * finding it by name.
 */
static void
cache_purge_name(struct vnode *dvp, const char *name, size_t namelen)
{
	vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
	struct namecache *ncp;
	uintptr_t key;

	SDT_PROBE(vfs, namecache, purge, name, name, namelen, 0, 0, 0);

	key = cache_key(name, namelen);
	rw_enter(&dvi->vi_nc_lock, RW_WRITER);
	ncp = cache_lookup_entry(dvp, name, namelen, key);
	if (ncp) {
		cache_remove(ncp, true);
	}
	rw_exit(&dvi->vi_nc_lock);
}

/*
 * Cache flush, a particular vnode; called when a vnode is renamed to
 * hide entries that would now be invalid.
 */
void
cache_purge1(struct vnode *vp, const char *name, size_t namelen, int flags)
{

	if (flags & PURGE_PARENTS) {
		cache_purge_parents(vp);
	}
	if (flags & PURGE_CHILDREN) {
		cache_purge_children(vp);
	}
	if (name != NULL) {
		cache_purge_name(vp, name, namelen);
	}
}

/*
 * vnode filter for cache_purgevfs().
 */
static bool
cache_vdir_filter(void *cookie, vnode_t *vp)
{

	return vp->v_type == VDIR;
}

/*
 * Cache flush, a whole filesystem; called when filesys is umounted to
 * remove entries that would now be invalid.
 */
void
cache_purgevfs(struct mount *mp)
{
	struct vnode_iterator *iter;
	vnode_t *dvp;

	vfs_vnode_iterator_init(mp, &iter);
	for (;;) {
		dvp = vfs_vnode_iterator_next(iter, cache_vdir_filter, NULL);
		if (dvp == NULL) {
			break;
		}
		cache_purge_children(dvp);
		vrele(dvp);
	}
	vfs_vnode_iterator_destroy(iter);
}

/*
 * Re-queue an entry onto the tail of the active LRU list, after it has
 * scored a hit.
 */
static void
cache_activate(struct namecache *ncp)
{

	mutex_enter(&cache_lru_lock);
	TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru);
	TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru);
	cache_lru.count[ncp->nc_lrulist]--;
	cache_lru.count[LRU_ACTIVE]++;
	ncp->nc_lrulist = LRU_ACTIVE;
	mutex_exit(&cache_lru_lock);
}
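
/*
 * An illustrative summary of the clock-like scheme implemented by the
 * routines above and below (no new behaviour, just the transitions
 * spelled out):
 *
 *	cache_enter():       new entry -> tail of ACTIVE list
 *	cache_deactivate():  head of ACTIVE -> tail of INACTIVE (aged out)
 *	cache_activate():    any hit -> back to tail of ACTIVE
 *	cache_reclaim():     head of INACTIVE -> freed (the "second chance"
 *	                     was not used)
 */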

/*
 * Try to balance the LRU lists.  Pick some victim entries, and re-queue
 * them from the head of the active list to the tail of the inactive list.
 */
static void
cache_deactivate(void)
{
	struct namecache *ncp;
	int total, i;

	KASSERT(mutex_owned(&cache_lru_lock));

	/* If we're nowhere near budget yet, don't bother. */
	total = cache_lru.count[LRU_ACTIVE] + cache_lru.count[LRU_INACTIVE];
	if (total < (desiredvnodes >> 1)) {
		return;
	}

	/*
	 * Aim for a 1:1 ratio of active to inactive.  This is to allow each
	 * potential victim a reasonable amount of time to cycle through the
	 * inactive list in order to score a hit and be reactivated, while
	 * trying not to cause reactivations too frequently.
	 */
	if (cache_lru.count[LRU_ACTIVE] < cache_lru.count[LRU_INACTIVE]) {
		return;
	}

	/* Move only a few at a time; will catch up eventually. */
	for (i = 0; i < cache_lru_maxdeact; i++) {
		ncp = TAILQ_FIRST(&cache_lru.list[LRU_ACTIVE]);
		if (ncp == NULL) {
			break;
		}
		KASSERT(ncp->nc_lrulist == LRU_ACTIVE);
		ncp->nc_lrulist = LRU_INACTIVE;
		TAILQ_REMOVE(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru);
		TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE], ncp, nc_lru);
		cache_lru.count[LRU_ACTIVE]--;
		cache_lru.count[LRU_INACTIVE]++;
	}
}

/*
 * Free some entries from the cache, when we have gone over budget.
 *
 * We don't want to cause too much work for any individual caller, and it
 * doesn't matter if we temporarily go over budget.  This is also "just a
 * cache" so it's not a big deal if we screw up and throw out something we
 * shouldn't.  So we take a relaxed attitude to this process to reduce its
 * impact.
 */
static void
cache_reclaim(void)
{
	struct namecache *ncp;
	vnode_impl_t *dvi;
	int toscan;

	/*
	 * Scan up to a preset maximum number of entries, but no more than
	 * 0.8% of the total at once (to allow for very small systems).
	 *
	 * On bigger systems, do a larger chunk of work to reduce the number
	 * of times that cache_lru_lock is held for any length of time.
	 */
	mutex_enter(&cache_lru_lock);
	toscan = MIN(cache_lru_maxscan, desiredvnodes >> 7);
	toscan = MAX(toscan, 1);
	SDT_PROBE(vfs, namecache, prune, done, cache_lru.count[LRU_ACTIVE] +
	    cache_lru.count[LRU_INACTIVE], toscan, 0, 0, 0);
	while (toscan-- != 0) {
		/* First try to balance the lists. */
		cache_deactivate();

		/* Now look for a victim on head of inactive list (old). */
		ncp = TAILQ_FIRST(&cache_lru.list[LRU_INACTIVE]);
		if (ncp == NULL) {
			break;
		}
		dvi = VNODE_TO_VIMPL(ncp->nc_dvp);
		KASSERT(ncp->nc_lrulist == LRU_INACTIVE);
		KASSERT(dvi != NULL);

		/*
		 * Locking in the wrong direction.  If we can't get the
		 * lock, the directory is actively busy, and it could also
		 * cause problems for the next guy in here, so send the
		 * entry to the back of the list.
		 */
		if (!rw_tryenter(&dvi->vi_nc_lock, RW_WRITER)) {
			TAILQ_REMOVE(&cache_lru.list[LRU_INACTIVE],
			    ncp, nc_lru);
			TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE],
			    ncp, nc_lru);
			continue;
		}

		/*
		 * Now have the victim entry locked.  Drop the LRU list
		 * lock, purge the entry, and start over.  The hold on
		 * vi_nc_lock will prevent the vnode from vanishing until
		 * finished (cache_purge() will be called on dvp before it
		 * disappears, and that will wait on vi_nc_lock).
		 */
		mutex_exit(&cache_lru_lock);
		cache_remove(ncp, true);
		rw_exit(&dvi->vi_nc_lock);
		mutex_enter(&cache_lru_lock);
	}
	mutex_exit(&cache_lru_lock);
}

/*
 * For file system code: count a lookup that required a full re-scan of
 * directory metadata.
 */
void
namecache_count_pass2(void)
{

	COUNT(ncs_pass2);
}

/*
 * For file system code: count a lookup that scored a hit in the directory
 * metadata near the location of the last lookup.
 */
void
namecache_count_2passes(void)
{

	COUNT(ncs_2passes);
}

/*
 * Sum the stats from all CPUs into nchstats.  This needs to run at least
 * once within every window where a 32-bit counter could roll over.  It's
 * called regularly by timer to ensure this.
 */
static void
cache_update_stats(void *cookie)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	mutex_enter(&cache_stat_lock);
	for (CPU_INFO_FOREACH(cii, ci)) {
		struct nchcpu *nchcpu = ci->ci_data.cpu_nch;
		UPDATE(nchcpu, ncs_goodhits);
		UPDATE(nchcpu, ncs_neghits);
		UPDATE(nchcpu, ncs_badhits);
		UPDATE(nchcpu, ncs_falsehits);
		UPDATE(nchcpu, ncs_miss);
		UPDATE(nchcpu, ncs_long);
		UPDATE(nchcpu, ncs_pass2);
		UPDATE(nchcpu, ncs_2passes);
		UPDATE(nchcpu, ncs_revhits);
		UPDATE(nchcpu, ncs_revmiss);
		UPDATE(nchcpu, ncs_denied);
	}
	if (cookie != NULL) {
		memcpy(cookie, &nchstats, sizeof(nchstats));
	}
	/* Reset the timer; arrive back here in N minutes at latest. */
	callout_schedule(&cache_stat_callout, cache_stat_interval * hz);
	mutex_exit(&cache_stat_lock);
}

/*
 * Fetch the current values of the stats for sysctl.
 */
static int
cache_stat_sysctl(SYSCTLFN_ARGS)
{
	struct nchstats stats;

	if (oldp == NULL) {
		*oldlenp = sizeof(nchstats);
		return 0;
	}

	if (*oldlenp <= 0) {
		*oldlenp = 0;
		return 0;
	}

	/* Refresh the global stats. */
	sysctl_unlock();
	cache_update_stats(&stats);
	sysctl_relock();

	*oldlenp = MIN(sizeof(stats), *oldlenp);
	return sysctl_copyout(l, &stats, oldp, *oldlenp);
}

/*
 * For the debugger, given the address of a vnode, print all associated
 * names in the cache.
 */
#ifdef DDB
void
namecache_print(struct vnode *vp, void (*pr)(const char *, ...))
{
	struct vnode *dvp = NULL;
	struct namecache *ncp;
	enum cache_lru_id id;

	for (id = 0; id < LRU_COUNT; id++) {
		TAILQ_FOREACH(ncp, &cache_lru.list[id], nc_lru) {
			if (ncp->nc_vp == vp) {
				(*pr)("name %.*s\n", NC_NLEN(ncp),
				    ncp->nc_name);
				dvp = ncp->nc_dvp;
			}
		}
	}
	if (dvp == NULL) {
		(*pr)("name not found\n");
		return;
	}
	for (id = 0; id < LRU_COUNT; id++) {
		TAILQ_FOREACH(ncp, &cache_lru.list[id], nc_lru) {
			if (ncp->nc_vp == dvp) {
				(*pr)("parent %.*s\n", NC_NLEN(ncp),
				    ncp->nc_name);
			}
		}
	}
}
#endif