/*	$NetBSD: vfs_cache.c,v 1.107 2015/08/24 22:50:32 pooka Exp $	*/

/*-
 * Copyright (c) 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.3 (Berkeley) 8/22/94
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.107 2015/08/24 22:50:32 pooka Exp $");

#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#include "opt_revcache.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/time.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/errno.h>
#include <sys/pool.h>
#include <sys/mutex.h>
#include <sys/atomic.h>
#include <sys/kthread.h>
#include <sys/kernel.h>
#include <sys/cpu.h>
#include <sys/evcnt.h>

#define NAMECACHE_ENTER_REVERSE
/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  It is managed LRU, so frequently
 * used names will hang around.  The cache is indexed by hash value
 * obtained from (dvp, name) where dvp refers to the directory
 * containing name.
 *
 * For simplicity (and economy of storage), names longer than
 * a maximum length of NCHNAMLEN are not cached; they occur
 * infrequently in any case, and are almost never of interest.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 * The entry is also dropped when it was not possible to lock
 * the cached vnode, either because vget() failed or the generation
 * number changed while waiting for the lock.
 */

/*
 * The locking in this subsystem works as follows:
 *
 * When an entry is added to the cache, via cache_enter(),
 * namecache_lock is taken to exclude other writers.  The new
 * entry is added to the hash list in a way which permits
 * concurrent lookups and invalidations in the cache done on
 * other CPUs to continue in parallel.
 *
 * When a lookup is done in the cache, via cache_lookup() or
 * cache_lookup_raw(), the per-cpu lock below is taken.  This
 * protects calls to cache_lookup_entry() and cache_invalidate()
 * against cache_reclaim() but allows lookups to continue in
 * parallel with cache_enter().
 *
 * cache_revlookup() takes namecache_lock to exclude cache_enter()
 * and cache_reclaim() since the list it operates on is not
 * maintained to allow concurrent reads.
 *
 * When cache_reclaim() is called namecache_lock is held to hold
 * off calls to cache_enter()/cache_revlookup() and each of the
 * per-cpu locks is taken to hold off lookups.  Holding all these
 * locks essentially idles the subsystem, ensuring there are no
 * concurrent references to the cache entries being freed.
 *
 * 32 bit per-cpu statistic counters (struct nchstats_percpu) are
 * incremented when the operations they count are performed while
 * running on the corresponding CPU.  Frequently individual counters
 * are incremented while holding a lock (either a per-cpu lock or
 * namecache_lock) sufficient to preclude concurrent increments
 * being done to the same counter, so non-atomic increments are
 * done using the COUNT() macro.  Counters which are incremented
 * when one of these locks is not held use the COUNT_UNL() macro
 * instead.  COUNT_UNL() could be defined to do atomic increments
 * but currently just does what COUNT() does, on the theory that
 * it is unlikely the non-atomic increment will be interrupted
 * by something on the same CPU that increments the same counter,
 * but even if it does happen the consequences aren't serious.
 *
 * N.B.: Attempting to protect COUNT_UNL() increments by taking
 * a per-cpu lock in the namecache_count_*() functions causes
 * a deadlock.  Don't do that, use atomic increments instead if
 * the imperfections here bug you.
 *
 * The 64 bit system-wide statistic counts (struct nchstats) are
 * maintained by sampling the per-cpu counters periodically, adding
 * in the deltas since the last samples and recording the current
 * samples to use to compute the next delta.  The sampling is done
 * as a side effect of cache_reclaim() which is run periodically,
 * for its own purposes, often enough to avoid overflow of the 32
 * bit counters.  While sampling in this fashion requires no locking,
 * it is nevertheless done only after all locks have been taken by
 * cache_reclaim() to allow cache_stat_sysctl() to hold off
 * cache_reclaim() with minimal locking.
 *
 * cache_stat_sysctl() takes its CPU's per-cpu lock to hold off
 * cache_reclaim() so that it can copy the subsystem total stats
 * without them being concurrently modified.  If CACHE_STATS_CURRENT
 * is defined it also harvests the per-cpu increments into the total,
 * which again requires cache_reclaim() to be held off.
 *
 * The per-cpu data (a lock and the per-cpu stats structures)
 * are defined next.
 */
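
/*
 * Illustrative sketch, not part of the build: the delta-sampling
 * described above boils down to the following per-counter arithmetic,
 * written out here for a single counter "f" (the UPDATE() macro below
 * implements exactly this).  Per the comment above, sampling happens
 * often enough that the 32 bit per-cpu counters do not wrap between
 * two samples.
 *
 *	uint32_t cur  = cpup->cpu_stats.f;	// current per-cpu count
 *	uint32_t last = cpup->cpu_stats_last.f;	// value at previous sample
 *	nchstats.f += cur - last;		// fold delta into 64 bit total
 *	cpup->cpu_stats_last.f = cur;		// remember sample for next time
 *
 * For example, if a CPU's ncs_goodhits was 1000 at the previous sample
 * and is 1250 now, UPDATE() adds 250 to nchstats.ncs_goodhits and
 * records 1250 as the new sample.
 */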

struct nchstats_percpu _NAMEI_CACHE_STATS(uint32_t);

struct nchcpu {
	kmutex_t	cpu_lock;
	struct nchstats_percpu	cpu_stats;
	/* XXX maybe __cacheline_aligned would improve this? */
	struct nchstats_percpu	cpu_stats_last;	/* from last sample */
};

/*
 * The type for the hash code.  While the hash function generates a
 * u32, the hash code has historically been passed around as a u_long,
 * and the value is modified by xor'ing a uintptr_t, so it's not
 * entirely clear what the best type is.  For now I'll leave it
 * unchanged as u_long.
 */

typedef u_long nchash_t;

/*
 * Structures associated with name caching.
 */

static kmutex_t *namecache_lock __read_mostly;
static pool_cache_t namecache_cache __read_mostly;
static TAILQ_HEAD(, namecache) nclruhead __cacheline_aligned;

static LIST_HEAD(nchashhead, namecache) *nchashtbl __read_mostly;
static u_long	nchash __read_mostly;

#define	NCHASH2(hash, dvp)	\
	(((hash) ^ ((uintptr_t)(dvp) >> 3)) & nchash)

static LIST_HEAD(ncvhashhead, namecache) *ncvhashtbl __read_mostly;
static u_long	ncvhash __read_mostly;

#define	NCVHASH(vp)		(((uintptr_t)(vp) >> 3) & ncvhash)

/* Number of cache entries allocated. */
static long	numcache __cacheline_aligned;

/* Garbage collection queue and number of entries pending in it. */
static void	*cache_gcqueue;
static u_int	cache_gcpend;

/* Cache effectiveness statistics.  This holds total from per-cpu stats */
struct nchstats	nchstats __cacheline_aligned;

/*
 * Macros to count an event, update the central stats with per-cpu
 * values and add current per-cpu increments to the subsystem total
 * last collected by cache_reclaim().
 */
#define	CACHE_STATS_CURRENT	/* nothing */

#define	COUNT(cpup, f)	((cpup)->cpu_stats.f++)

#define	UPDATE(cpup, f) do { \
	struct nchcpu *Xcpup = (cpup); \
	uint32_t Xcnt = (volatile uint32_t) Xcpup->cpu_stats.f; \
	nchstats.f += Xcnt - Xcpup->cpu_stats_last.f; \
	Xcpup->cpu_stats_last.f = Xcnt; \
} while (/* CONSTCOND */ 0)

#define	ADD(stats, cpup, f) do { \
	struct nchcpu *Xcpup = (cpup); \
	stats.f += Xcpup->cpu_stats.f - Xcpup->cpu_stats_last.f; \
} while (/* CONSTCOND */ 0)

/* Do unlocked stats the same way.  Use a different name to allow mind changes */
#define	COUNT_UNL(cpup, f)	COUNT((cpup), f)

static const int cache_lowat = 95;
static const int cache_hiwat = 98;
static const int cache_hottime = 5;	/* number of seconds */
static int doingcache = 1;		/* 1 => enable the cache */

static struct evcnt cache_ev_scan;
static struct evcnt cache_ev_gc;
static struct evcnt cache_ev_over;
static struct evcnt cache_ev_under;
static struct evcnt cache_ev_forced;

static void	cache_invalidate(struct namecache *);
static struct namecache *cache_lookup_entry(
    const struct vnode *, const char *, size_t);
static void	cache_thread(void *);
static void	cache_disassociate(struct namecache *);
static void	cache_reclaim(void);
static int	cache_ctor(void *, void *, int);
static void	cache_dtor(void *, void *);

static struct sysctllog *sysctllog;
static void	sysctl_cache_stat_setup(void);

/*
 * Compute the hash for an entry.
 *
 * (This is for now a wrapper around namei_hash, whose interface is
 * for the time being slightly inconvenient.)
 */
static nchash_t
cache_hash(const char *name, size_t namelen)
{
	const char *endptr;

	endptr = name + namelen;
	return namei_hash(name, &endptr);
}

/*
 * Invalidate a cache entry and enqueue it for garbage collection.
 * The caller needs to hold namecache_lock or a per-cpu lock to hold
 * off cache_reclaim().
 */
static void
cache_invalidate(struct namecache *ncp)
{
	void *head;

	KASSERT(mutex_owned(&ncp->nc_lock));

	if (ncp->nc_dvp != NULL) {
		ncp->nc_vp = NULL;
		ncp->nc_dvp = NULL;
		do {
			head = cache_gcqueue;
			ncp->nc_gcqueue = head;
		} while (atomic_cas_ptr(&cache_gcqueue, head, ncp) != head);
		atomic_inc_uint(&cache_gcpend);
	}
}

/*
 * Disassociate a namecache entry from any vnodes it is attached to,
 * and remove from the global LRU list.
 */
static void
cache_disassociate(struct namecache *ncp)
{

	KASSERT(mutex_owned(namecache_lock));
	KASSERT(ncp->nc_dvp == NULL);

	if (ncp->nc_lru.tqe_prev != NULL) {
		TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
		ncp->nc_lru.tqe_prev = NULL;
	}
	if (ncp->nc_vhash.le_prev != NULL) {
		LIST_REMOVE(ncp, nc_vhash);
		ncp->nc_vhash.le_prev = NULL;
	}
	if (ncp->nc_vlist.le_prev != NULL) {
		LIST_REMOVE(ncp, nc_vlist);
		ncp->nc_vlist.le_prev = NULL;
	}
	if (ncp->nc_dvlist.le_prev != NULL) {
		LIST_REMOVE(ncp, nc_dvlist);
		ncp->nc_dvlist.le_prev = NULL;
	}
}

/*
 * Lock all CPUs to prevent any cache lookup activity.  Conceptually,
 * this locks out all "readers".
 */
static void
cache_lock_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct nchcpu *cpup;

	/*
	 * Lock out all CPUs first, then harvest per-cpu stats.  This
	 * is probably not quite as cache-efficient as doing the lock
	 * and harvest at the same time, but allows cache_stat_sysctl()
	 * to make do with a per-cpu lock.
	 */
	for (CPU_INFO_FOREACH(cii, ci)) {
		cpup = ci->ci_data.cpu_nch;
		mutex_enter(&cpup->cpu_lock);
	}
	for (CPU_INFO_FOREACH(cii, ci)) {
		cpup = ci->ci_data.cpu_nch;
		UPDATE(cpup, ncs_goodhits);
		UPDATE(cpup, ncs_neghits);
		UPDATE(cpup, ncs_badhits);
		UPDATE(cpup, ncs_falsehits);
		UPDATE(cpup, ncs_miss);
		UPDATE(cpup, ncs_long);
		UPDATE(cpup, ncs_pass2);
		UPDATE(cpup, ncs_2passes);
		UPDATE(cpup, ncs_revhits);
		UPDATE(cpup, ncs_revmiss);
	}
}

/*
 * Release all CPU locks.
 */
static void
cache_unlock_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct nchcpu *cpup;

	for (CPU_INFO_FOREACH(cii, ci)) {
		cpup = ci->ci_data.cpu_nch;
		mutex_exit(&cpup->cpu_lock);
	}
}

/*
 * Find a single cache entry and return it locked.
 * The caller needs to hold namecache_lock or a per-cpu lock to hold
 * off cache_reclaim().
 */
static struct namecache *
cache_lookup_entry(const struct vnode *dvp, const char *name, size_t namelen)
{
	struct nchashhead *ncpp;
	struct namecache *ncp;
	nchash_t hash;

	KASSERT(dvp != NULL);
	hash = cache_hash(name, namelen);
	ncpp = &nchashtbl[NCHASH2(hash, dvp)];

	LIST_FOREACH(ncp, ncpp, nc_hash) {
		membar_datadep_consumer();	/* for Alpha... */
		if (ncp->nc_dvp != dvp ||
		    ncp->nc_nlen != namelen ||
		    memcmp(ncp->nc_name, name, (u_int)ncp->nc_nlen))
			continue;
		mutex_enter(&ncp->nc_lock);
		if (__predict_true(ncp->nc_dvp == dvp)) {
			ncp->nc_hittime = hardclock_ticks;
			return ncp;
		}
		/* Raced: entry has been nullified. */
		mutex_exit(&ncp->nc_lock);
	}

	return NULL;
}

/*
 * Look for the name in the cache.  We don't do this
 * if the segment name is long, simply so the cache can avoid
 * holding long names (which would either waste space, or
 * add greatly to the complexity).
 *
 * Lookup is called with DVP pointing to the directory to search,
 * and CNP providing the name of the entry being sought: cn_nameptr
 * is the name, cn_namelen is its length, and cn_flags is the flags
 * word from the namei operation.
 *
 * DVP must be locked.
 *
 * There are three possible non-error return states:
 *    1. Nothing was found in the cache.  Nothing is known about
 *       the requested name.
 *    2. A negative entry was found in the cache, meaning that the
 *       requested name definitely does not exist.
 *    3. A positive entry was found in the cache, meaning that the
 *       requested name does exist and that we are providing the
 *       vnode.
 * In these cases the results are:
 *    1. 0 returned; VN is set to NULL.
 *    2. 1 returned; VN is set to NULL.
 *    3. 1 returned; VN is set to the vnode found.
 *
 * The additional result argument ISWHT is set to zero, unless a
 * negative entry is found that was entered as a whiteout, in which
 * case ISWHT is set to one.
 *
 * The ISWHT_RET argument pointer may be null.  In this case an
 * assertion is made that the whiteout flag is not set.  File systems
 * that do not support whiteouts can/should do this.
 *
 * Filesystems that do support whiteouts should add ISWHITEOUT to
 * cnp->cn_flags if ISWHT comes back nonzero.
 *
 * When a vnode is returned, it is locked, as per the vnode lookup
 * locking protocol.
 *
 * There is no way for this function to fail, in the sense of
 * generating an error that requires aborting the namei operation.
 *
 * (Prior to October 2012, this function returned an integer status,
 * and a vnode, and mucked with the flags word in CNP for whiteouts.
 * The integer status was -1 for "nothing found", ENOENT for "a
 * negative entry found", 0 for "a positive entry found", and possibly
 * other errors, and the value of VN might or might not have been set
 * depending on what error occurred.)
 */
int
cache_lookup(struct vnode *dvp, const char *name, size_t namelen,
	     uint32_t nameiop, uint32_t cnflags,
	     int *iswht_ret, struct vnode **vn_ret)
{
	struct namecache *ncp;
	struct vnode *vp;
	struct nchcpu *cpup;
	int error, ret_value;

	/* Establish default result values */
	if (iswht_ret != NULL) {
		*iswht_ret = 0;
	}
	*vn_ret = NULL;

	if (__predict_false(!doingcache)) {
		return 0;
	}

	cpup = curcpu()->ci_data.cpu_nch;
	mutex_enter(&cpup->cpu_lock);
	if (__predict_false(namelen > NCHNAMLEN)) {
		COUNT(cpup, ncs_long);
		mutex_exit(&cpup->cpu_lock);
		/* found nothing */
		return 0;
	}

	ncp = cache_lookup_entry(dvp, name, namelen);
	if (__predict_false(ncp == NULL)) {
		COUNT(cpup, ncs_miss);
		mutex_exit(&cpup->cpu_lock);
		/* found nothing */
		return 0;
	}
	if ((cnflags & MAKEENTRY) == 0) {
		COUNT(cpup, ncs_badhits);
		/*
		 * Last component and we are renaming or deleting,
		 * the cache entry is invalid, or otherwise don't
		 * want cache entry to exist.
		 */
		cache_invalidate(ncp);
		mutex_exit(&ncp->nc_lock);
		mutex_exit(&cpup->cpu_lock);
		/* found nothing */
		return 0;
	}
	if (ncp->nc_vp == NULL) {
		if (iswht_ret != NULL) {
			/*
			 * Restore the ISWHITEOUT flag saved earlier.
			 */
			KASSERT((ncp->nc_flags & ~ISWHITEOUT) == 0);
			*iswht_ret = (ncp->nc_flags & ISWHITEOUT) != 0;
		} else {
			KASSERT(ncp->nc_flags == 0);
		}

		if (__predict_true(nameiop != CREATE ||
		    (cnflags & ISLASTCN) == 0)) {
			COUNT(cpup, ncs_neghits);
			/* found neg entry; vn is already null from above */
			ret_value = 1;
		} else {
			COUNT(cpup, ncs_badhits);
			/*
			 * Last component and we are renaming or
			 * deleting, the cache entry is invalid,
			 * or otherwise don't want cache entry to
			 * exist.
			 */
			cache_invalidate(ncp);
			/* found nothing */
			ret_value = 0;
		}
		mutex_exit(&ncp->nc_lock);
		mutex_exit(&cpup->cpu_lock);
		return ret_value;
	}

	vp = ncp->nc_vp;
	mutex_enter(vp->v_interlock);
	mutex_exit(&ncp->nc_lock);
	mutex_exit(&cpup->cpu_lock);

	/*
	 * Unlocked except for the vnode interlock.  Call vget().
	 */
	error = vget(vp, LK_NOWAIT, false /* !wait */);
	if (error) {
		KASSERT(error == EBUSY);
		/*
		 * This vnode is being cleaned out.
		 * XXX badhits?
		 */
		COUNT_UNL(cpup, ncs_falsehits);
		/* found nothing */
		return 0;
	}

	COUNT_UNL(cpup, ncs_goodhits);
	/* found it */
	*vn_ret = vp;
	return 1;
}
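
/*
 * Illustrative sketch, not part of the build: how a caller is expected
 * to interpret the results of cache_lookup().  The surrounding code is
 * a hypothetical, simplified file system lookup routine; the variable
 * names (vpp, cnp) are assumptions for illustration only.
 *
 *	struct vnode *vp;
 *	int iswht, found;
 *
 *	found = cache_lookup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
 *	    cnp->cn_nameiop, cnp->cn_flags, &iswht, &vp);
 *	if (found) {
 *		if (vp == NULL) {
 *			// Negative entry: the name definitely does not
 *			// exist.  A file system that supports whiteouts
 *			// propagates the flag as described above.
 *			if (iswht)
 *				cnp->cn_flags |= ISWHITEOUT;
 *			return ENOENT;
 *		}
 *		// Positive entry: hand back the vnode found in the cache.
 *		*vpp = vp;
 *		return 0;
 *	}
 *	// found == 0: nothing known; fall through to scan the directory
 *	// and call cache_enter() with whatever is discovered.
 */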

/*
 * Cut-'n-pasted version of the above without the nameiop argument.
 */
int
cache_lookup_raw(struct vnode *dvp, const char *name, size_t namelen,
		 uint32_t cnflags,
		 int *iswht_ret, struct vnode **vn_ret)
{
	struct namecache *ncp;
	struct vnode *vp;
	struct nchcpu *cpup;
	int error;

	/* Establish default results. */
	if (iswht_ret != NULL) {
		*iswht_ret = 0;
	}
	*vn_ret = NULL;

	if (__predict_false(!doingcache)) {
		/* found nothing */
		return 0;
	}

	cpup = curcpu()->ci_data.cpu_nch;
	mutex_enter(&cpup->cpu_lock);
	if (__predict_false(namelen > NCHNAMLEN)) {
		COUNT(cpup, ncs_long);
		mutex_exit(&cpup->cpu_lock);
		/* found nothing */
		return 0;
	}
	ncp = cache_lookup_entry(dvp, name, namelen);
	if (__predict_false(ncp == NULL)) {
		COUNT(cpup, ncs_miss);
		mutex_exit(&cpup->cpu_lock);
		/* found nothing */
		return 0;
	}
	vp = ncp->nc_vp;
	if (vp == NULL) {
		/*
		 * Restore the ISWHITEOUT flag saved earlier.
		 */
		if (iswht_ret != NULL) {
			KASSERT((ncp->nc_flags & ~ISWHITEOUT) == 0);
			/*cnp->cn_flags |= ncp->nc_flags;*/
			*iswht_ret = (ncp->nc_flags & ISWHITEOUT) != 0;
		}
		COUNT(cpup, ncs_neghits);
		mutex_exit(&ncp->nc_lock);
		mutex_exit(&cpup->cpu_lock);
		/* found negative entry; vn is already null from above */
		return 1;
	}
	mutex_enter(vp->v_interlock);
	mutex_exit(&ncp->nc_lock);
	mutex_exit(&cpup->cpu_lock);

	/*
	 * Unlocked except for the vnode interlock.  Call vget().
	 */
	error = vget(vp, LK_NOWAIT, false /* !wait */);
	if (error) {
		KASSERT(error == EBUSY);
		/*
		 * This vnode is being cleaned out.
		 * XXX badhits?
		 */
		COUNT_UNL(cpup, ncs_falsehits);
		/* found nothing */
		return 0;
	}

	COUNT_UNL(cpup, ncs_goodhits);	/* XXX can be "badhits" */
	/* found it */
	*vn_ret = vp;
	return 1;
}

/*
 * Scan cache looking for name of directory entry pointing at vp.
 *
 * If the lookup succeeds the vnode is referenced and stored in dvpp.
 *
 * If bufp is non-NULL, also place the name in the buffer which starts
 * at bufp, immediately before *bpp, and move bpp backwards to point
 * at the start of it.  (Yes, this is a little baroque, but it's done
 * this way to cater to the whims of getcwd).
 *
 * Returns 0 on success, -1 on cache miss, positive errno on failure.
 */
int
cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp)
{
	struct namecache *ncp;
	struct vnode *dvp;
	struct ncvhashhead *nvcpp;
	struct nchcpu *cpup;
	char *bp;
	int error, nlen;

	if (!doingcache)
		goto out;

	nvcpp = &ncvhashtbl[NCVHASH(vp)];

	/*
	 * We increment counters in the local CPU's per-cpu stats.
	 * We don't take the per-cpu lock, however, since this function
	 * is the only place these counters are incremented so no one
	 * will be racing with us to increment them.
	 */
	cpup = curcpu()->ci_data.cpu_nch;
	mutex_enter(namecache_lock);
	LIST_FOREACH(ncp, nvcpp, nc_vhash) {
		mutex_enter(&ncp->nc_lock);
		if (ncp->nc_vp == vp &&
		    (dvp = ncp->nc_dvp) != NULL &&
		    dvp != vp) {		/* avoid pesky . entries.. */

#ifdef DIAGNOSTIC
			if (ncp->nc_nlen == 1 &&
			    ncp->nc_name[0] == '.')
				panic("cache_revlookup: found entry for .");

			if (ncp->nc_nlen == 2 &&
			    ncp->nc_name[0] == '.' &&
			    ncp->nc_name[1] == '.')
				panic("cache_revlookup: found entry for ..");
#endif
			COUNT(cpup, ncs_revhits);
			nlen = ncp->nc_nlen;

			if (bufp) {
				bp = *bpp;
				bp -= nlen;
				if (bp <= bufp) {
					*dvpp = NULL;
					mutex_exit(&ncp->nc_lock);
					mutex_exit(namecache_lock);
					return (ERANGE);
				}
				memcpy(bp, ncp->nc_name, nlen);
				*bpp = bp;
			}

			mutex_enter(dvp->v_interlock);
			mutex_exit(&ncp->nc_lock);
			mutex_exit(namecache_lock);
			error = vget(dvp, LK_NOWAIT, false /* !wait */);
			if (error) {
				KASSERT(error == EBUSY);
				if (bufp)
					(*bpp) += nlen;
				*dvpp = NULL;
				return -1;
			}
			*dvpp = dvp;
			return (0);
		}
		mutex_exit(&ncp->nc_lock);
	}
	COUNT(cpup, ncs_revmiss);
	mutex_exit(namecache_lock);
 out:
	*dvpp = NULL;
	return (-1);
}
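
/*
 * Illustrative sketch, not part of the build: the backwards-filling
 * buffer convention used by cache_revlookup() above, as a getcwd-style
 * caller might use it.  The variable names here are made up for the
 * example.
 *
 *	char buf[MAXPATHLEN];
 *	char *bp = buf + sizeof(buf);	// start at the end of the buffer
 *	struct vnode *dvp;
 *	int error;
 *
 *	*--bp = '\0';			// terminate the path being built
 *	error = cache_revlookup(vp, &dvp, &bp, buf);
 *	if (error == 0) {
 *		// bp now points at the component name, which was copied
 *		// immediately before the previous *bp; dvp is the
 *		// referenced parent.  Prepend a '/' and continue
 *		// upwards from dvp.
 *	} else if (error == -1) {
 *		// Cache miss: fall back to scanning the parent directory.
 *	} else {
 *		// Positive errno, e.g. ERANGE: the name did not fit
 *		// between buf and *bp.
 *	}
 */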

/*
 * Add an entry to the cache
 */
void
cache_enter(struct vnode *dvp, struct vnode *vp,
	    const char *name, size_t namelen, uint32_t cnflags)
{
	struct namecache *ncp;
	struct namecache *oncp;
	struct nchashhead *ncpp;
	struct ncvhashhead *nvcpp;
	nchash_t hash;

	/* First, check whether we can/should add a cache entry. */
	if ((cnflags & MAKEENTRY) == 0 ||
	    __predict_false(namelen > NCHNAMLEN || !doingcache)) {
		return;
	}

	if (numcache > desiredvnodes) {
		mutex_enter(namecache_lock);
		cache_ev_forced.ev_count++;
		cache_reclaim();
		mutex_exit(namecache_lock);
	}

	ncp = pool_cache_get(namecache_cache, PR_WAITOK);
	mutex_enter(namecache_lock);
	numcache++;

	/*
	 * Concurrent lookups in the same directory may race for a
	 * cache entry.  If there's a duplicated entry, free it.
	 */
	oncp = cache_lookup_entry(dvp, name, namelen);
	if (oncp) {
		cache_invalidate(oncp);
		mutex_exit(&oncp->nc_lock);
	}

	/* Grab the vnode we just found. */
	mutex_enter(&ncp->nc_lock);
	ncp->nc_vp = vp;
	ncp->nc_flags = 0;
	ncp->nc_hittime = 0;
	ncp->nc_gcqueue = NULL;
	if (vp == NULL) {
		/*
		 * For negative hits, save the ISWHITEOUT flag so we can
		 * restore it later when the cache entry is used again.
		 */
		ncp->nc_flags = cnflags & ISWHITEOUT;
	}

	/* Fill in cache info. */
	ncp->nc_dvp = dvp;
	LIST_INSERT_HEAD(&dvp->v_dnclist, ncp, nc_dvlist);
	if (vp)
		LIST_INSERT_HEAD(&vp->v_nclist, ncp, nc_vlist);
	else {
		ncp->nc_vlist.le_prev = NULL;
		ncp->nc_vlist.le_next = NULL;
	}
	KASSERT(namelen <= NCHNAMLEN);
	ncp->nc_nlen = namelen;
	memcpy(ncp->nc_name, name, (unsigned)ncp->nc_nlen);
	TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru);
	hash = cache_hash(name, namelen);
	ncpp = &nchashtbl[NCHASH2(hash, dvp)];

	/*
	 * Flush updates before making visible in table.  No need for a
	 * memory barrier on the other side: to see modifications the
	 * list must be followed, meaning a dependent pointer load.
	 * The below is LIST_INSERT_HEAD() inlined, with the memory
	 * barrier included in the correct place.
	 */
	if ((ncp->nc_hash.le_next = ncpp->lh_first) != NULL)
		ncpp->lh_first->nc_hash.le_prev = &ncp->nc_hash.le_next;
	ncp->nc_hash.le_prev = &ncpp->lh_first;
	membar_producer();
	ncpp->lh_first = ncp;

	ncp->nc_vhash.le_prev = NULL;
	ncp->nc_vhash.le_next = NULL;

	/*
	 * Create reverse-cache entries (used in getcwd) for directories
	 * (and in linux procfs exe node).
	 */
	if (vp != NULL &&
	    vp != dvp &&
#ifndef NAMECACHE_ENTER_REVERSE
	    vp->v_type == VDIR &&
#endif
	    (ncp->nc_nlen > 2 ||
	    (ncp->nc_nlen > 1 && ncp->nc_name[1] != '.') ||
	    (/* ncp->nc_nlen > 0 && */ ncp->nc_name[0] != '.'))) {
		nvcpp = &ncvhashtbl[NCVHASH(vp)];
		LIST_INSERT_HEAD(nvcpp, ncp, nc_vhash);
	}
	mutex_exit(&ncp->nc_lock);
	mutex_exit(namecache_lock);
}

/*
 * Name cache initialization, from vfs_init() when we are booting
 */
void
nchinit(void)
{
	int error;

	TAILQ_INIT(&nclruhead);
	namecache_cache = pool_cache_init(sizeof(struct namecache),
	    coherency_unit, 0, 0, "ncache", NULL, IPL_NONE, cache_ctor,
	    cache_dtor, NULL);
	KASSERT(namecache_cache != NULL);

	namecache_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);

	nchashtbl = hashinit(desiredvnodes, HASH_LIST, true, &nchash);
	ncvhashtbl =
#ifdef NAMECACHE_ENTER_REVERSE
	    hashinit(desiredvnodes, HASH_LIST, true, &ncvhash);
#else
	    hashinit(desiredvnodes/8, HASH_LIST, true, &ncvhash);
#endif

	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, cache_thread,
	    NULL, NULL, "cachegc");
	if (error != 0)
		panic("nchinit %d", error);

	evcnt_attach_dynamic(&cache_ev_scan, EVCNT_TYPE_MISC, NULL,
	   "namecache", "entries scanned");
	evcnt_attach_dynamic(&cache_ev_gc, EVCNT_TYPE_MISC, NULL,
	   "namecache", "entries collected");
	evcnt_attach_dynamic(&cache_ev_over, EVCNT_TYPE_MISC, NULL,
	   "namecache", "over scan target");
	evcnt_attach_dynamic(&cache_ev_under, EVCNT_TYPE_MISC, NULL,
	   "namecache", "under scan target");
	evcnt_attach_dynamic(&cache_ev_forced, EVCNT_TYPE_MISC, NULL,
	   "namecache", "forced reclaims");

	sysctl_cache_stat_setup();
}

static int
cache_ctor(void *arg, void *obj, int flag)
{
	struct namecache *ncp;

	ncp = obj;
	mutex_init(&ncp->nc_lock, MUTEX_DEFAULT, IPL_NONE);

	return 0;
}

static void
cache_dtor(void *arg, void *obj)
{
	struct namecache *ncp;

	ncp = obj;
	mutex_destroy(&ncp->nc_lock);
}

/*
 * Called once for each CPU in the system as attached.
 */
void
cache_cpu_init(struct cpu_info *ci)
{
	struct nchcpu *cpup;
	size_t sz;

	sz = roundup2(sizeof(*cpup), coherency_unit) + coherency_unit;
	cpup = kmem_zalloc(sz, KM_SLEEP);
	cpup = (void *)roundup2((uintptr_t)cpup, coherency_unit);
	mutex_init(&cpup->cpu_lock, MUTEX_DEFAULT, IPL_NONE);
	ci->ci_data.cpu_nch = cpup;
}

/*
 * Name cache reinitialization, for when the maximum number of vnodes increases.
 */
void
nchreinit(void)
{
	struct namecache *ncp;
	struct nchashhead *oldhash1, *hash1;
	struct ncvhashhead *oldhash2, *hash2;
	u_long i, oldmask1, oldmask2, mask1, mask2;

	hash1 = hashinit(desiredvnodes, HASH_LIST, true, &mask1);
	hash2 =
#ifdef NAMECACHE_ENTER_REVERSE
	    hashinit(desiredvnodes, HASH_LIST, true, &mask2);
#else
	    hashinit(desiredvnodes/8, HASH_LIST, true, &mask2);
#endif
	mutex_enter(namecache_lock);
	cache_lock_cpus();
	oldhash1 = nchashtbl;
	oldmask1 = nchash;
	nchashtbl = hash1;
	nchash = mask1;
	oldhash2 = ncvhashtbl;
	oldmask2 = ncvhash;
	ncvhashtbl = hash2;
	ncvhash = mask2;
	for (i = 0; i <= oldmask1; i++) {
		while ((ncp = LIST_FIRST(&oldhash1[i])) != NULL) {
			LIST_REMOVE(ncp, nc_hash);
			ncp->nc_hash.le_prev = NULL;
		}
	}
	for (i = 0; i <= oldmask2; i++) {
		while ((ncp = LIST_FIRST(&oldhash2[i])) != NULL) {
			LIST_REMOVE(ncp, nc_vhash);
			ncp->nc_vhash.le_prev = NULL;
		}
	}
	cache_unlock_cpus();
	mutex_exit(namecache_lock);
	hashdone(oldhash1, HASH_LIST, oldmask1);
	hashdone(oldhash2, HASH_LIST, oldmask2);
}

/*
 * Cache flush, a particular vnode; called when a vnode is renamed to
 * hide entries that would now be invalid.
 */
void
cache_purge1(struct vnode *vp, const char *name, size_t namelen, int flags)
{
	struct namecache *ncp, *ncnext;

	mutex_enter(namecache_lock);
	if (flags & PURGE_PARENTS) {
		for (ncp = LIST_FIRST(&vp->v_nclist); ncp != NULL;
		    ncp = ncnext) {
			ncnext = LIST_NEXT(ncp, nc_vlist);
			mutex_enter(&ncp->nc_lock);
			cache_invalidate(ncp);
			mutex_exit(&ncp->nc_lock);
			cache_disassociate(ncp);
		}
	}
	if (flags & PURGE_CHILDREN) {
		for (ncp = LIST_FIRST(&vp->v_dnclist); ncp != NULL;
		    ncp = ncnext) {
			ncnext = LIST_NEXT(ncp, nc_dvlist);
			mutex_enter(&ncp->nc_lock);
			cache_invalidate(ncp);
			mutex_exit(&ncp->nc_lock);
			cache_disassociate(ncp);
		}
	}
	if (name != NULL) {
		ncp = cache_lookup_entry(vp, name, namelen);
		if (ncp) {
			cache_invalidate(ncp);
			mutex_exit(&ncp->nc_lock);
			cache_disassociate(ncp);
		}
	}
	mutex_exit(namecache_lock);
}

/*
 * Cache flush, a whole filesystem; called when filesys is umounted to
 * remove entries that would now be invalid.
 */
void
cache_purgevfs(struct mount *mp)
{
	struct namecache *ncp, *nxtcp;

	mutex_enter(namecache_lock);
	for (ncp = TAILQ_FIRST(&nclruhead); ncp != NULL; ncp = nxtcp) {
		nxtcp = TAILQ_NEXT(ncp, nc_lru);
		mutex_enter(&ncp->nc_lock);
		if (ncp->nc_dvp != NULL && ncp->nc_dvp->v_mount == mp) {
			/* Free the resources we had. */
			cache_invalidate(ncp);
			cache_disassociate(ncp);
		}
		mutex_exit(&ncp->nc_lock);
	}
	cache_reclaim();
	mutex_exit(namecache_lock);
}

/*
 * Scan global list invalidating entries until we meet a preset target.
 * Prefer to invalidate entries that have not scored a hit within
 * cache_hottime seconds.  We sort the LRU list only for this routine's
 * benefit.
 */
static void
cache_prune(int incache, int target)
{
	struct namecache *ncp, *nxtcp, *sentinel;
	int items, recent, tryharder;

	KASSERT(mutex_owned(namecache_lock));

	items = 0;
	tryharder = 0;
	recent = hardclock_ticks - hz * cache_hottime;
	sentinel = NULL;
	for (ncp = TAILQ_FIRST(&nclruhead); ncp != NULL; ncp = nxtcp) {
		if (incache <= target)
			break;
		items++;
		nxtcp = TAILQ_NEXT(ncp, nc_lru);
		if (ncp == sentinel) {
			/*
			 * If we looped back on ourself, then ignore
			 * recent entries and purge whatever we find.
			 */
			tryharder = 1;
		}
		if (ncp->nc_dvp == NULL)
			continue;
		if (!tryharder && (ncp->nc_hittime - recent) > 0) {
			if (sentinel == NULL)
				sentinel = ncp;
			TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
			TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru);
			continue;
		}
		mutex_enter(&ncp->nc_lock);
		if (ncp->nc_dvp != NULL) {
			cache_invalidate(ncp);
			cache_disassociate(ncp);
			incache--;
		}
		mutex_exit(&ncp->nc_lock);
	}
	cache_ev_scan.ev_count += items;
}
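
/*
 * Illustrative note, not part of the build: the water marks used by
 * cache_reclaim() below are percentages of desiredvnodes.  With the
 * defaults (cache_hiwat = 98, cache_lowat = 95) and, for example,
 * desiredvnodes = 100000, pruning starts once more than 98000 entries
 * are live (not counting those already queued for garbage collection)
 * and cache_prune() keeps going until the count drops to 95000:
 *
 *	hiwat threshold = desiredvnodes * cache_hiwat / 100 = 98000
 *	lowat target    = desiredvnodes * cache_lowat / 100 = 95000
 */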

/*
 * Collect dead cache entries from all CPUs and garbage collect.
 */
static void
cache_reclaim(void)
{
	struct namecache *ncp, *next;
	int items;

	KASSERT(mutex_owned(namecache_lock));

	/*
	 * If the number of extant entries not awaiting garbage collection
	 * exceeds the high water mark, then reclaim stale entries until we
	 * reach our low water mark.
	 */
	items = numcache - cache_gcpend;
	if (items > (uint64_t)desiredvnodes * cache_hiwat / 100) {
		cache_prune(items, (int)((uint64_t)desiredvnodes *
		    cache_lowat / 100));
		cache_ev_over.ev_count++;
	} else
		cache_ev_under.ev_count++;

	/*
	 * Stop forward lookup activity on all CPUs and garbage collect dead
	 * entries.
	 */
	cache_lock_cpus();
	ncp = cache_gcqueue;
	cache_gcqueue = NULL;
	items = cache_gcpend;
	cache_gcpend = 0;
	while (ncp != NULL) {
		next = ncp->nc_gcqueue;
		cache_disassociate(ncp);
		KASSERT(ncp->nc_dvp == NULL);
		if (ncp->nc_hash.le_prev != NULL) {
			LIST_REMOVE(ncp, nc_hash);
			ncp->nc_hash.le_prev = NULL;
		}
		pool_cache_put(namecache_cache, ncp);
		ncp = next;
	}
	cache_unlock_cpus();
	numcache -= items;
	cache_ev_gc.ev_count += items;
}

/*
 * Cache maintenance thread, awakening once per second to:
 *
 * => keep number of entries below the high water mark
 * => sort pseudo-LRU list
 * => garbage collect dead entries
 */
static void
cache_thread(void *arg)
{

	mutex_enter(namecache_lock);
	for (;;) {
		cache_reclaim();
		kpause("cachegc", false, hz, namecache_lock);
	}
}

#ifdef DDB
void
namecache_print(struct vnode *vp, void (*pr)(const char *, ...))
{
	struct vnode *dvp = NULL;
	struct namecache *ncp;

	TAILQ_FOREACH(ncp, &nclruhead, nc_lru) {
		if (ncp->nc_vp == vp && ncp->nc_dvp != NULL) {
			(*pr)("name %.*s\n", ncp->nc_nlen, ncp->nc_name);
			dvp = ncp->nc_dvp;
		}
	}
	if (dvp == NULL) {
		(*pr)("name not found\n");
		return;
	}
	vp = dvp;
	TAILQ_FOREACH(ncp, &nclruhead, nc_lru) {
		if (ncp->nc_vp == vp) {
			(*pr)("parent %.*s\n", ncp->nc_nlen, ncp->nc_name);
		}
	}
}
#endif

void
namecache_count_pass2(void)
{
	struct nchcpu *cpup = curcpu()->ci_data.cpu_nch;

	COUNT_UNL(cpup, ncs_pass2);
}

void
namecache_count_2passes(void)
{
	struct nchcpu *cpup = curcpu()->ci_data.cpu_nch;

	COUNT_UNL(cpup, ncs_2passes);
}

/*
 * Fetch the current values of the stats.  We return the most
 * recent values harvested into nchstats by cache_reclaim(), which
 * will be less than a second old.
 */
static int
cache_stat_sysctl(SYSCTLFN_ARGS)
{
	struct nchstats stats;
	struct nchcpu *my_cpup;
#ifdef CACHE_STATS_CURRENT
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
#endif	/* CACHE_STATS_CURRENT */

	if (oldp == NULL) {
		*oldlenp = sizeof(stats);
		return 0;
	}

	if (*oldlenp < sizeof(stats)) {
		*oldlenp = 0;
		return 0;
	}

	/*
	 * Take this CPU's per-cpu lock to hold off cache_reclaim()
	 * from doing a stats update while doing minimal damage to
	 * concurrent operations.
	 */
	sysctl_unlock();
	my_cpup = curcpu()->ci_data.cpu_nch;
	mutex_enter(&my_cpup->cpu_lock);
	stats = nchstats;
#ifdef CACHE_STATS_CURRENT
	for (CPU_INFO_FOREACH(cii, ci)) {
		struct nchcpu *cpup = ci->ci_data.cpu_nch;

		ADD(stats, cpup, ncs_goodhits);
		ADD(stats, cpup, ncs_neghits);
		ADD(stats, cpup, ncs_badhits);
		ADD(stats, cpup, ncs_falsehits);
		ADD(stats, cpup, ncs_miss);
		ADD(stats, cpup, ncs_long);
		ADD(stats, cpup, ncs_pass2);
		ADD(stats, cpup, ncs_2passes);
		ADD(stats, cpup, ncs_revhits);
		ADD(stats, cpup, ncs_revmiss);
	}
#endif	/* CACHE_STATS_CURRENT */
	mutex_exit(&my_cpup->cpu_lock);
	sysctl_relock();

	*oldlenp = sizeof(stats);
	return sysctl_copyout(l, &stats, oldp, sizeof(stats));
}

static void
sysctl_cache_stat_setup(void)
{

	KASSERT(sysctllog == NULL);
	sysctl_createv(&sysctllog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_STRUCT, "namecache_stats",
		       SYSCTL_DESCR("namecache statistics"),
		       cache_stat_sysctl, 0, NULL, 0,
		       CTL_VFS, CTL_CREATE, CTL_EOL);
}
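
/*
 * Illustrative userland sketch, not part of this file's build: reading
 * the statistics exported by sysctl_cache_stat_setup() above.  The node
 * is created under CTL_VFS with the name "namecache_stats", so the
 * string form of the MIB is presumably "vfs.namecache_stats"; that
 * name, and the visibility of struct nchstats to userland via
 * <sys/namei.h>, are assumptions rather than guarantees.
 *
 *	#include <sys/param.h>
 *	#include <sys/sysctl.h>
 *	#include <sys/namei.h>
 *	#include <inttypes.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		struct nchstats stats;
 *		size_t len = sizeof(stats);
 *
 *		if (sysctlbyname("vfs.namecache_stats", &stats, &len,
 *		    NULL, 0) == -1)
 *			return 1;
 *		printf("good hits: %" PRIu64 "\n", stats.ncs_goodhits);
 *		return 0;
 *	}
 */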