/*	$NetBSD: vfs_cache.c,v 1.105 2015/01/16 20:10:25 dennis Exp $	*/

/*-
 * Copyright (c) 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.3 (Berkeley) 8/22/94
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.105 2015/01/16 20:10:25 dennis Exp $");

#include "opt_ddb.h"
#include "opt_revcache.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/time.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/errno.h>
#include <sys/pool.h>
#include <sys/mutex.h>
#include <sys/atomic.h>
#include <sys/kthread.h>
#include <sys/kernel.h>
#include <sys/cpu.h>
#include <sys/evcnt.h>

#define NAMECACHE_ENTER_REVERSE
/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  It is managed LRU, so frequently
 * used names will hang around.  Cache is indexed by hash value
 * obtained from (dvp, name) where dvp refers to the directory
 * containing name.
 *
 * For simplicity (and economy of storage), names longer than
 * a maximum length of NCHNAMLEN are not cached; they occur
 * infrequently in any case, and are almost never of interest.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 * The entry is dropped also when it was not possible to lock
 * the cached vnode, either because vget() failed or the generation
 * number has changed while waiting for the lock.
 */

/*
 * The locking in this subsystem works as follows:
 *
 * When an entry is added to the cache, via cache_enter(),
 * namecache_lock is taken to exclude other writers.  The new
 * entry is added to the hash list in a way which permits
 * concurrent lookups and invalidations in the cache done on
 * other CPUs to continue in parallel.
 *
 * When a lookup is done in the cache, via cache_lookup() or
 * cache_lookup_raw(), the per-cpu lock below is taken.  This
 * protects calls to cache_lookup_entry() and cache_invalidate()
 * against cache_reclaim() but allows lookups to continue in
 * parallel with cache_enter().
 *
 * cache_revlookup() takes namecache_lock to exclude cache_enter()
 * and cache_reclaim() since the list it operates on is not
 * maintained to allow concurrent reads.
 *
 * When cache_reclaim() is called namecache_lock is held to hold
 * off calls to cache_enter()/cache_revlookup() and each of the
 * per-cpu locks is taken to hold off lookups.  Holding all these
 * locks essentially idles the subsystem, ensuring there are no
 * concurrent references to the cache entries being freed.
 *
 * 32 bit per-cpu statistic counters (struct nchstats_percpu) are
 * incremented when the operations they count are performed while
 * running on the corresponding CPU.  Frequently individual counters
 * are incremented while holding a lock (either a per-cpu lock or
 * namecache_lock) sufficient to preclude concurrent increments
 * being done to the same counter, so non-atomic increments are
 * done using the COUNT() macro.  Counters which are incremented
 * when one of these locks is not held use the COUNT_UNL() macro
 * instead.  COUNT_UNL() could be defined to do atomic increments
 * but currently just does what COUNT() does, on the theory that
 * it is unlikely the non-atomic increment will be interrupted
 * by something on the same CPU that increments the same counter,
 * but even if it does happen the consequences aren't serious.
 *
 * N.B.: Attempting to protect COUNT_UNL() increments by taking
 * a per-cpu lock in the namecache_count_*() functions causes
 * a deadlock.  Don't do that, use atomic increments instead if
 * the imperfections here bug you.
 *
 * The 64 bit system-wide statistic counts (struct nchstats) are
 * maintained by sampling the per-cpu counters periodically, adding
 * in the deltas since the last samples and recording the current
 * samples to use to compute the next delta.  The sampling is done
 * as a side effect of cache_reclaim() which is run periodically,
 * for its own purposes, often enough to avoid overflow of the 32
 * bit counters.  While sampling in this fashion requires no locking
 * it is nevertheless done only after all locks have been taken by
 * cache_reclaim() to allow cache_stat_sysctl() to hold off
 * cache_reclaim() with minimal locking.
 *
 * cache_stat_sysctl() takes its CPU's per-cpu lock to hold off
 * cache_reclaim() so that it can copy the subsystem total stats
 * without them being concurrently modified.  If CACHE_STATS_CURRENT
 * is defined it also harvests the per-cpu increments into the total,
 * which again requires cache_reclaim() to be held off.
 *
 * The per-cpu data (a lock and the per-cpu stats structures)
 * are defined next.
 */
struct nchstats_percpu _NAMEI_CACHE_STATS(uint32_t);

struct nchcpu {
	kmutex_t	cpu_lock;
	struct nchstats_percpu	cpu_stats;
	/* XXX maybe __cacheline_aligned would improve this? */
	struct nchstats_percpu	cpu_stats_last;	/* from last sample */
};

/*
 * The type for the hash code.  While the hash function generates a
 * u32, the hash code has historically been passed around as a u_long,
 * and the value is modified by xor'ing a uintptr_t, so it's not
 * entirely clear what the best type is.  For now I'll leave it
 * unchanged as u_long.
 */

typedef u_long nchash_t;

/*
 * Structures associated with name caching.
 */

static kmutex_t *namecache_lock __read_mostly;
static pool_cache_t namecache_cache __read_mostly;
static TAILQ_HEAD(, namecache) nclruhead __cacheline_aligned;

static LIST_HEAD(nchashhead, namecache) *nchashtbl __read_mostly;
static u_long	nchash __read_mostly;

#define	NCHASH2(hash, dvp)	\
	(((hash) ^ ((uintptr_t)(dvp) >> 3)) & nchash)

static LIST_HEAD(ncvhashhead, namecache) *ncvhashtbl __read_mostly;
static u_long	ncvhash __read_mostly;

#define	NCVHASH(vp)	(((uintptr_t)(vp) >> 3) & ncvhash)

/* Number of cache entries allocated. */
static long	numcache __cacheline_aligned;

/* Garbage collection queue and number of entries pending in it. */
static void	*cache_gcqueue;
static u_int	cache_gcpend;

/* Cache effectiveness statistics.  This holds total from per-cpu stats */
struct nchstats	nchstats __cacheline_aligned;

/*
 * Macros to count an event, update the central stats with per-cpu
 * values and add current per-cpu increments to the subsystem total
 * last collected by cache_reclaim().
 */
#define	CACHE_STATS_CURRENT	/* nothing */

#define	COUNT(cpup, f)	((cpup)->cpu_stats.f++)

#define	UPDATE(cpup, f) do { \
	struct nchcpu *Xcpup = (cpup); \
	uint32_t Xcnt = (volatile uint32_t) Xcpup->cpu_stats.f; \
	nchstats.f += Xcnt - Xcpup->cpu_stats_last.f; \
	Xcpup->cpu_stats_last.f = Xcnt; \
} while (/* CONSTCOND */ 0)

#define	ADD(stats, cpup, f) do { \
	struct nchcpu *Xcpup = (cpup); \
	stats.f += Xcpup->cpu_stats.f - Xcpup->cpu_stats_last.f; \
} while (/* CONSTCOND */ 0)

/* Do unlocked stats the same way.  Use a different name to allow mind changes. */
#define	COUNT_UNL(cpup, f)	COUNT((cpup), f)

static const int cache_lowat = 95;
static const int cache_hiwat = 98;
static const int cache_hottime = 5;	/* number of seconds */
static int doingcache = 1;		/* 1 => enable the cache */

static struct evcnt cache_ev_scan;
static struct evcnt cache_ev_gc;
static struct evcnt cache_ev_over;
static struct evcnt cache_ev_under;
static struct evcnt cache_ev_forced;

static struct namecache *cache_lookup_entry(
    const struct vnode *, const char *, size_t);
static void	cache_thread(void *);
static void	cache_invalidate(struct namecache *);
static void	cache_disassociate(struct namecache *);
static void	cache_reclaim(void);
static int	cache_ctor(void *, void *, int);
static void	cache_dtor(void *, void *);

static struct sysctllog *sysctllog;
static void	sysctl_cache_stat_setup(void);

/*
 * Compute the hash for an entry.
 *
 * (This is for now a wrapper around namei_hash, whose interface is
 * for the time being slightly inconvenient.)
 */
static nchash_t
cache_hash(const char *name, size_t namelen)
{
	const char *endptr;

	endptr = name + namelen;
	return namei_hash(name, &endptr);
}

/*
 * Invalidate a cache entry and enqueue it for garbage collection.
 * The caller needs to hold namecache_lock or a per-cpu lock to hold
 * off cache_reclaim().
 */
static void
cache_invalidate(struct namecache *ncp)
{
	void *head;

	KASSERT(mutex_owned(&ncp->nc_lock));

	if (ncp->nc_dvp != NULL) {
		ncp->nc_vp = NULL;
		ncp->nc_dvp = NULL;
		do {
			head = cache_gcqueue;
			ncp->nc_gcqueue = head;
		} while (atomic_cas_ptr(&cache_gcqueue, head, ncp) != head);
		atomic_inc_uint(&cache_gcpend);
	}
}

/*
 * Disassociate a namecache entry from any vnodes it is attached to,
 * and remove from the global LRU list.
 */
static void
cache_disassociate(struct namecache *ncp)
{

	KASSERT(mutex_owned(namecache_lock));
	KASSERT(ncp->nc_dvp == NULL);

	if (ncp->nc_lru.tqe_prev != NULL) {
		TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
		ncp->nc_lru.tqe_prev = NULL;
	}
	if (ncp->nc_vhash.le_prev != NULL) {
		LIST_REMOVE(ncp, nc_vhash);
		ncp->nc_vhash.le_prev = NULL;
	}
	if (ncp->nc_vlist.le_prev != NULL) {
		LIST_REMOVE(ncp, nc_vlist);
		ncp->nc_vlist.le_prev = NULL;
	}
	if (ncp->nc_dvlist.le_prev != NULL) {
		LIST_REMOVE(ncp, nc_dvlist);
		ncp->nc_dvlist.le_prev = NULL;
	}
}

/*
 * Lock all CPUs to prevent any cache lookup activity.  Conceptually,
 * this locks out all "readers".
 */
static void
cache_lock_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct nchcpu *cpup;

	/*
	 * Lock out all CPUs first, then harvest per-cpu stats.  This
	 * is probably not quite as cache-efficient as doing the lock
	 * and harvest at the same time, but allows cache_stat_sysctl()
	 * to make do with a per-cpu lock.
	 */
	for (CPU_INFO_FOREACH(cii, ci)) {
		cpup = ci->ci_data.cpu_nch;
		mutex_enter(&cpup->cpu_lock);
	}
	for (CPU_INFO_FOREACH(cii, ci)) {
		cpup = ci->ci_data.cpu_nch;
		UPDATE(cpup, ncs_goodhits);
		UPDATE(cpup, ncs_neghits);
		UPDATE(cpup, ncs_badhits);
		UPDATE(cpup, ncs_falsehits);
		UPDATE(cpup, ncs_miss);
		UPDATE(cpup, ncs_long);
		UPDATE(cpup, ncs_pass2);
		UPDATE(cpup, ncs_2passes);
		UPDATE(cpup, ncs_revhits);
		UPDATE(cpup, ncs_revmiss);
	}
}

/*
 * Release all CPU locks.
 */
static void
cache_unlock_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct nchcpu *cpup;

	for (CPU_INFO_FOREACH(cii, ci)) {
		cpup = ci->ci_data.cpu_nch;
		mutex_exit(&cpup->cpu_lock);
	}
}

/*
 * Find a single cache entry and return it locked.
 * The caller needs to hold namecache_lock or a per-cpu lock to hold
 * off cache_reclaim().
 */
static struct namecache *
cache_lookup_entry(const struct vnode *dvp, const char *name, size_t namelen)
{
	struct nchashhead *ncpp;
	struct namecache *ncp;
	nchash_t hash;

	KASSERT(dvp != NULL);
	hash = cache_hash(name, namelen);
	ncpp = &nchashtbl[NCHASH2(hash, dvp)];

	LIST_FOREACH(ncp, ncpp, nc_hash) {
		membar_datadep_consumer();	/* for Alpha... */
		if (ncp->nc_dvp != dvp ||
		    ncp->nc_nlen != namelen ||
		    memcmp(ncp->nc_name, name, (u_int)ncp->nc_nlen))
			continue;
		mutex_enter(&ncp->nc_lock);
		if (__predict_true(ncp->nc_dvp == dvp)) {
			ncp->nc_hittime = hardclock_ticks;
			return ncp;
		}
		/* Raced: entry has been nullified. */
		mutex_exit(&ncp->nc_lock);
	}

	return NULL;
}

/*
 * Look for the name in the cache.  We don't do this
 * if the segment name is long, simply so the cache can avoid
 * holding long names (which would either waste space, or
 * add greatly to the complexity).
 *
 * Lookup is called with DVP pointing to the directory to search,
 * and CNP providing the name of the entry being sought: cn_nameptr
 * is the name, cn_namelen is its length, and cn_flags is the flags
 * word from the namei operation.
 *
 * DVP must be locked.
 *
 * There are three possible non-error return states:
 *    1. Nothing was found in the cache.  Nothing is known about
 *       the requested name.
 *    2. A negative entry was found in the cache, meaning that the
 *       requested name definitely does not exist.
 *    3. A positive entry was found in the cache, meaning that the
 *       requested name does exist and that we are providing the
 *       vnode.
 * In these cases the results are:
 *    1. 0 returned; VN is set to NULL.
 *    2. 1 returned; VN is set to NULL.
 *    3. 1 returned; VN is set to the vnode found.
 *
 * The additional result argument ISWHT is set to zero, unless a
 * negative entry is found that was entered as a whiteout, in which
 * case ISWHT is set to one.
 *
 * The ISWHT_RET argument pointer may be null.  In this case an
 * assertion is made that the whiteout flag is not set.  File systems
 * that do not support whiteouts can/should do this.
 *
 * Filesystems that do support whiteouts should add ISWHITEOUT to
 * cnp->cn_flags if ISWHT comes back nonzero.
 *
 * When a vnode is returned, it is locked, as per the vnode lookup
 * locking protocol.
 *
 * There is no way for this function to fail, in the sense of
 * generating an error that requires aborting the namei operation.
 *
 * (Prior to October 2012, this function returned an integer status,
 * and a vnode, and mucked with the flags word in CNP for whiteouts.
 * The integer status was -1 for "nothing found", ENOENT for "a
 * negative entry found", 0 for "a positive entry found", and possibly
 * other errors, and the value of VN might or might not have been set
 * depending on what error occurred.)
 */
int
cache_lookup(struct vnode *dvp, const char *name, size_t namelen,
	     uint32_t nameiop, uint32_t cnflags,
	     int *iswht_ret, struct vnode **vn_ret)
{
	struct namecache *ncp;
	struct vnode *vp;
	struct nchcpu *cpup;
	int error, ret_value;

	/* Establish default result values */
	if (iswht_ret != NULL) {
		*iswht_ret = 0;
	}
	*vn_ret = NULL;

	if (__predict_false(!doingcache)) {
		return 0;
	}

	cpup = curcpu()->ci_data.cpu_nch;
	mutex_enter(&cpup->cpu_lock);
	if (__predict_false(namelen > NCHNAMLEN)) {
		COUNT(cpup, ncs_long);
		mutex_exit(&cpup->cpu_lock);
		/* found nothing */
		return 0;
	}

	ncp = cache_lookup_entry(dvp, name, namelen);
	if (__predict_false(ncp == NULL)) {
		COUNT(cpup, ncs_miss);
		mutex_exit(&cpup->cpu_lock);
		/* found nothing */
		return 0;
	}
	if ((cnflags & MAKEENTRY) == 0) {
		COUNT(cpup, ncs_badhits);
		/*
		 * Last component and we are renaming or deleting,
		 * the cache entry is invalid, or otherwise don't
		 * want cache entry to exist.
		 */
		cache_invalidate(ncp);
		mutex_exit(&ncp->nc_lock);
		mutex_exit(&cpup->cpu_lock);
		/* found nothing */
		return 0;
	}
	if (ncp->nc_vp == NULL) {
		if (iswht_ret != NULL) {
			/*
			 * Restore the ISWHITEOUT flag saved earlier.
			 */
			KASSERT((ncp->nc_flags & ~ISWHITEOUT) == 0);
			*iswht_ret = (ncp->nc_flags & ISWHITEOUT) != 0;
		} else {
			KASSERT(ncp->nc_flags == 0);
		}

		if (__predict_true(nameiop != CREATE ||
		    (cnflags & ISLASTCN) == 0)) {
			COUNT(cpup, ncs_neghits);
			/* found neg entry; vn is already null from above */
			ret_value = 1;
		} else {
			COUNT(cpup, ncs_badhits);
			/*
			 * Last component and we are renaming or
			 * deleting, the cache entry is invalid,
			 * or otherwise don't want cache entry to
			 * exist.
			 */
			cache_invalidate(ncp);
			/* found nothing */
			ret_value = 0;
		}
		mutex_exit(&ncp->nc_lock);
		mutex_exit(&cpup->cpu_lock);
		return ret_value;
	}

	vp = ncp->nc_vp;
	mutex_enter(vp->v_interlock);
	mutex_exit(&ncp->nc_lock);
	mutex_exit(&cpup->cpu_lock);

	/*
	 * Unlocked except for the vnode interlock.  Call vget().
	 */
	error = vget(vp, LK_NOWAIT);
	if (error) {
		KASSERT(error == EBUSY);
		/*
		 * This vnode is being cleaned out.
		 * XXX badhits?
		 */
		COUNT_UNL(cpup, ncs_falsehits);
		/* found nothing */
		return 0;
	}

	COUNT_UNL(cpup, ncs_goodhits);
	/* found it */
	*vn_ret = vp;
	return 1;
}

/*
 * Cut-'n-pasted version of the above without the nameiop argument.
 */
int
cache_lookup_raw(struct vnode *dvp, const char *name, size_t namelen,
		 uint32_t cnflags,
		 int *iswht_ret, struct vnode **vn_ret)
{
	struct namecache *ncp;
	struct vnode *vp;
	struct nchcpu *cpup;
	int error;

	/* Establish default results. */
	if (iswht_ret != NULL) {
		*iswht_ret = 0;
	}
	*vn_ret = NULL;

	if (__predict_false(!doingcache)) {
		/* found nothing */
		return 0;
	}

	cpup = curcpu()->ci_data.cpu_nch;
	mutex_enter(&cpup->cpu_lock);
	if (__predict_false(namelen > NCHNAMLEN)) {
		COUNT(cpup, ncs_long);
		mutex_exit(&cpup->cpu_lock);
		/* found nothing */
		return 0;
	}
	ncp = cache_lookup_entry(dvp, name, namelen);
	if (__predict_false(ncp == NULL)) {
		COUNT(cpup, ncs_miss);
		mutex_exit(&cpup->cpu_lock);
		/* found nothing */
		return 0;
	}
	vp = ncp->nc_vp;
	if (vp == NULL) {
		/*
		 * Restore the ISWHITEOUT flag saved earlier.
		 */
		if (iswht_ret != NULL) {
			KASSERT((ncp->nc_flags & ~ISWHITEOUT) == 0);
			/*cnp->cn_flags |= ncp->nc_flags;*/
			*iswht_ret = (ncp->nc_flags & ISWHITEOUT) != 0;
		}
		COUNT(cpup, ncs_neghits);
		mutex_exit(&ncp->nc_lock);
		mutex_exit(&cpup->cpu_lock);
		/* found negative entry; vn is already null from above */
		return 1;
	}
	mutex_enter(vp->v_interlock);
	mutex_exit(&ncp->nc_lock);
	mutex_exit(&cpup->cpu_lock);

	/*
	 * Unlocked except for the vnode interlock.  Call vget().
	 */
	error = vget(vp, LK_NOWAIT);
	if (error) {
		KASSERT(error == EBUSY);
		/*
		 * This vnode is being cleaned out.
		 * XXX badhits?
		 */
		COUNT_UNL(cpup, ncs_falsehits);
		/* found nothing */
		return 0;
	}

	COUNT_UNL(cpup, ncs_goodhits);	/* XXX can be "badhits" */
	/* found it */
	*vn_ret = vp;
	return 1;
}

/*
 * Scan cache looking for name of directory entry pointing at vp.
 *
 * If the lookup succeeds the vnode is referenced and stored in dvpp.
 *
 * If bufp is non-NULL, also place the name in the buffer which starts
 * at bufp, immediately before *bpp, and move bpp backwards to point
 * at the start of it.  (Yes, this is a little baroque, but it's done
 * this way to cater to the whims of getcwd).
 *
 * Returns 0 on success, -1 on cache miss, positive errno on failure.
 */
int
cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp)
{
	struct namecache *ncp;
	struct vnode *dvp;
	struct ncvhashhead *nvcpp;
	struct nchcpu *cpup;
	char *bp;
	int error, nlen;

	if (!doingcache)
		goto out;

	nvcpp = &ncvhashtbl[NCVHASH(vp)];

	/*
	 * We increment counters in the local CPU's per-cpu stats.
	 * We don't take the per-cpu lock, however, since this function
	 * is the only place these counters are incremented so no one
	 * will be racing with us to increment them.
	 */
	cpup = curcpu()->ci_data.cpu_nch;
	mutex_enter(namecache_lock);
	LIST_FOREACH(ncp, nvcpp, nc_vhash) {
		mutex_enter(&ncp->nc_lock);
		if (ncp->nc_vp == vp &&
		    (dvp = ncp->nc_dvp) != NULL &&
		    dvp != vp) {		/* avoid pesky . entries.. */

#ifdef DIAGNOSTIC
			if (ncp->nc_nlen == 1 &&
			    ncp->nc_name[0] == '.')
				panic("cache_revlookup: found entry for .");

			if (ncp->nc_nlen == 2 &&
			    ncp->nc_name[0] == '.' &&
			    ncp->nc_name[1] == '.')
				panic("cache_revlookup: found entry for ..");
#endif
			COUNT(cpup, ncs_revhits);
			nlen = ncp->nc_nlen;

			if (bufp) {
				bp = *bpp;
				bp -= nlen;
				if (bp <= bufp) {
					*dvpp = NULL;
					mutex_exit(&ncp->nc_lock);
					mutex_exit(namecache_lock);
					return (ERANGE);
				}
				memcpy(bp, ncp->nc_name, nlen);
				*bpp = bp;
			}

			mutex_enter(dvp->v_interlock);
			mutex_exit(&ncp->nc_lock);
			mutex_exit(namecache_lock);
			error = vget(dvp, LK_NOWAIT);
			if (error) {
				KASSERT(error == EBUSY);
				if (bufp)
					(*bpp) += nlen;
				*dvpp = NULL;
				return -1;
			}
			*dvpp = dvp;
			return (0);
		}
		mutex_exit(&ncp->nc_lock);
	}
	COUNT(cpup, ncs_revmiss);
	mutex_exit(namecache_lock);
 out:
	*dvpp = NULL;
	return (-1);
}

/*
 * Add an entry to the cache.
 */
void
cache_enter(struct vnode *dvp, struct vnode *vp,
	    const char *name, size_t namelen, uint32_t cnflags)
{
	struct namecache *ncp;
	struct namecache *oncp;
	struct nchashhead *ncpp;
	struct ncvhashhead *nvcpp;
	nchash_t hash;

	/* First, check whether we can/should add a cache entry. */
	if ((cnflags & MAKEENTRY) == 0 ||
	    __predict_false(namelen > NCHNAMLEN || !doingcache)) {
		return;
	}

	if (numcache > desiredvnodes) {
		mutex_enter(namecache_lock);
		cache_ev_forced.ev_count++;
		cache_reclaim();
		mutex_exit(namecache_lock);
	}

	ncp = pool_cache_get(namecache_cache, PR_WAITOK);
	mutex_enter(namecache_lock);
	numcache++;

	/*
	 * Concurrent lookups in the same directory may race for a
	 * cache entry.  If there's a duplicated entry, free it.
	 */
	oncp = cache_lookup_entry(dvp, name, namelen);
	if (oncp) {
		cache_invalidate(oncp);
		mutex_exit(&oncp->nc_lock);
	}

	/* Grab the vnode we just found. */
	mutex_enter(&ncp->nc_lock);
	ncp->nc_vp = vp;
	ncp->nc_flags = 0;
	ncp->nc_hittime = 0;
	ncp->nc_gcqueue = NULL;
	if (vp == NULL) {
		/*
		 * For negative hits, save the ISWHITEOUT flag so we can
		 * restore it later when the cache entry is used again.
		 */
		ncp->nc_flags = cnflags & ISWHITEOUT;
	}

	/* Fill in cache info. */
	ncp->nc_dvp = dvp;
	LIST_INSERT_HEAD(&dvp->v_dnclist, ncp, nc_dvlist);
	if (vp)
		LIST_INSERT_HEAD(&vp->v_nclist, ncp, nc_vlist);
	else {
		ncp->nc_vlist.le_prev = NULL;
		ncp->nc_vlist.le_next = NULL;
	}
	KASSERT(namelen <= NCHNAMLEN);
	ncp->nc_nlen = namelen;
	memcpy(ncp->nc_name, name, (unsigned)ncp->nc_nlen);
	TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru);
	hash = cache_hash(name, namelen);
	ncpp = &nchashtbl[NCHASH2(hash, dvp)];

	/*
	 * Flush updates before making visible in table.  No need for a
	 * memory barrier on the other side: to see modifications the
	 * list must be followed, meaning a dependent pointer load.
	 * The below is LIST_INSERT_HEAD() inlined, with the memory
	 * barrier included in the correct place.
	 */
	if ((ncp->nc_hash.le_next = ncpp->lh_first) != NULL)
		ncpp->lh_first->nc_hash.le_prev = &ncp->nc_hash.le_next;
	ncp->nc_hash.le_prev = &ncpp->lh_first;
	membar_producer();
	ncpp->lh_first = ncp;

	ncp->nc_vhash.le_prev = NULL;
	ncp->nc_vhash.le_next = NULL;

	/*
	 * Create reverse-cache entries (used in getcwd) for directories.
	 * (and in linux procfs exe node)
	 */
	if (vp != NULL &&
	    vp != dvp &&
#ifndef NAMECACHE_ENTER_REVERSE
	    vp->v_type == VDIR &&
#endif
	    (ncp->nc_nlen > 2 ||
	    (ncp->nc_nlen > 1 && ncp->nc_name[1] != '.') ||
	    (/* ncp->nc_nlen > 0 && */ ncp->nc_name[0] != '.'))) {
		nvcpp = &ncvhashtbl[NCVHASH(vp)];
		LIST_INSERT_HEAD(nvcpp, ncp, nc_vhash);
	}
	mutex_exit(&ncp->nc_lock);
	mutex_exit(namecache_lock);
}

/*
 * Name cache initialization, from vfs_init() when we are booting.
 */
void
nchinit(void)
{
	int error;

	TAILQ_INIT(&nclruhead);
	namecache_cache = pool_cache_init(sizeof(struct namecache),
	    coherency_unit, 0, 0, "ncache", NULL, IPL_NONE, cache_ctor,
	    cache_dtor, NULL);
	KASSERT(namecache_cache != NULL);

	namecache_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);

	nchashtbl = hashinit(desiredvnodes, HASH_LIST, true, &nchash);
	ncvhashtbl =
#ifdef NAMECACHE_ENTER_REVERSE
	    hashinit(desiredvnodes, HASH_LIST, true, &ncvhash);
#else
	    hashinit(desiredvnodes/8, HASH_LIST, true, &ncvhash);
#endif

	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, cache_thread,
	    NULL, NULL, "cachegc");
	if (error != 0)
		panic("nchinit %d", error);

	evcnt_attach_dynamic(&cache_ev_scan, EVCNT_TYPE_MISC, NULL,
	    "namecache", "entries scanned");
	evcnt_attach_dynamic(&cache_ev_gc, EVCNT_TYPE_MISC, NULL,
	    "namecache", "entries collected");
	evcnt_attach_dynamic(&cache_ev_over, EVCNT_TYPE_MISC, NULL,
	    "namecache", "over scan target");
	evcnt_attach_dynamic(&cache_ev_under, EVCNT_TYPE_MISC, NULL,
	    "namecache", "under scan target");
	evcnt_attach_dynamic(&cache_ev_forced, EVCNT_TYPE_MISC, NULL,
	    "namecache", "forced reclaims");

	sysctl_cache_stat_setup();
}

static int
cache_ctor(void *arg, void *obj, int flag)
{
	struct namecache *ncp;

	ncp = obj;
	mutex_init(&ncp->nc_lock, MUTEX_DEFAULT, IPL_NONE);

	return 0;
}

static void
cache_dtor(void *arg, void *obj)
{
	struct namecache *ncp;

	ncp = obj;
	mutex_destroy(&ncp->nc_lock);
}

/*
 * Called once for each CPU in the system as attached.
 */
void
cache_cpu_init(struct cpu_info *ci)
{
	struct nchcpu *cpup;
	size_t sz;

	sz = roundup2(sizeof(*cpup), coherency_unit) + coherency_unit;
	cpup = kmem_zalloc(sz, KM_SLEEP);
	cpup = (void *)roundup2((uintptr_t)cpup, coherency_unit);
	mutex_init(&cpup->cpu_lock, MUTEX_DEFAULT, IPL_NONE);
	ci->ci_data.cpu_nch = cpup;
}

/*
 * Name cache reinitialization, for when the maximum number of vnodes
 * increases.
 */
void
nchreinit(void)
{
	struct namecache *ncp;
	struct nchashhead *oldhash1, *hash1;
	struct ncvhashhead *oldhash2, *hash2;
	u_long i, oldmask1, oldmask2, mask1, mask2;

	hash1 = hashinit(desiredvnodes, HASH_LIST, true, &mask1);
	hash2 =
#ifdef NAMECACHE_ENTER_REVERSE
	    hashinit(desiredvnodes, HASH_LIST, true, &mask2);
#else
	    hashinit(desiredvnodes/8, HASH_LIST, true, &mask2);
#endif
	mutex_enter(namecache_lock);
	cache_lock_cpus();
	oldhash1 = nchashtbl;
	oldmask1 = nchash;
	nchashtbl = hash1;
	nchash = mask1;
	oldhash2 = ncvhashtbl;
	oldmask2 = ncvhash;
	ncvhashtbl = hash2;
	ncvhash = mask2;
	for (i = 0; i <= oldmask1; i++) {
		while ((ncp = LIST_FIRST(&oldhash1[i])) != NULL) {
			LIST_REMOVE(ncp, nc_hash);
			ncp->nc_hash.le_prev = NULL;
		}
	}
	for (i = 0; i <= oldmask2; i++) {
		while ((ncp = LIST_FIRST(&oldhash2[i])) != NULL) {
			LIST_REMOVE(ncp, nc_vhash);
			ncp->nc_vhash.le_prev = NULL;
		}
	}
	cache_unlock_cpus();
	mutex_exit(namecache_lock);
	hashdone(oldhash1, HASH_LIST, oldmask1);
	hashdone(oldhash2, HASH_LIST, oldmask2);
}

/*
 * Cache flush, a particular vnode; called when a vnode is renamed to
 * hide entries that would now be invalid.
 */
void
cache_purge1(struct vnode *vp, const char *name, size_t namelen, int flags)
{
	struct namecache *ncp, *ncnext;

	mutex_enter(namecache_lock);
	if (flags & PURGE_PARENTS) {
		for (ncp = LIST_FIRST(&vp->v_nclist); ncp != NULL;
		    ncp = ncnext) {
			ncnext = LIST_NEXT(ncp, nc_vlist);
			mutex_enter(&ncp->nc_lock);
			cache_invalidate(ncp);
			mutex_exit(&ncp->nc_lock);
			cache_disassociate(ncp);
		}
	}
	if (flags & PURGE_CHILDREN) {
		for (ncp = LIST_FIRST(&vp->v_dnclist); ncp != NULL;
		    ncp = ncnext) {
			ncnext = LIST_NEXT(ncp, nc_dvlist);
			mutex_enter(&ncp->nc_lock);
			cache_invalidate(ncp);
			mutex_exit(&ncp->nc_lock);
			cache_disassociate(ncp);
		}
	}
	if (name != NULL) {
		ncp = cache_lookup_entry(vp, name, namelen);
		if (ncp) {
			cache_invalidate(ncp);
			mutex_exit(&ncp->nc_lock);
			cache_disassociate(ncp);
		}
	}
	mutex_exit(namecache_lock);
}

/*
 * Cache flush, a whole filesystem; called when filesys is umounted to
 * remove entries that would now be invalid.
 */
void
cache_purgevfs(struct mount *mp)
{
	struct namecache *ncp, *nxtcp;

	mutex_enter(namecache_lock);
	for (ncp = TAILQ_FIRST(&nclruhead); ncp != NULL; ncp = nxtcp) {
		nxtcp = TAILQ_NEXT(ncp, nc_lru);
		mutex_enter(&ncp->nc_lock);
		if (ncp->nc_dvp != NULL && ncp->nc_dvp->v_mount == mp) {
			/* Free the resources we had. */
			cache_invalidate(ncp);
			cache_disassociate(ncp);
		}
		mutex_exit(&ncp->nc_lock);
	}
	cache_reclaim();
	mutex_exit(namecache_lock);
}

/*
 * Scan global list invalidating entries until we meet a preset target.
 * Prefer to invalidate entries that have not scored a hit within
 * cache_hottime seconds.  We sort the LRU list only for this routine's
 * benefit.
 */
static void
cache_prune(int incache, int target)
{
	struct namecache *ncp, *nxtcp, *sentinel;
	int items, recent, tryharder;

	KASSERT(mutex_owned(namecache_lock));

	items = 0;
	tryharder = 0;
	recent = hardclock_ticks - hz * cache_hottime;
	sentinel = NULL;
	for (ncp = TAILQ_FIRST(&nclruhead); ncp != NULL; ncp = nxtcp) {
		if (incache <= target)
			break;
		items++;
		nxtcp = TAILQ_NEXT(ncp, nc_lru);
		if (ncp == sentinel) {
			/*
			 * If we looped back on ourself, then ignore
			 * recent entries and purge whatever we find.
			 */
			tryharder = 1;
		}
		if (ncp->nc_dvp == NULL)
			continue;
		if (!tryharder && (ncp->nc_hittime - recent) > 0) {
			if (sentinel == NULL)
				sentinel = ncp;
			TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
			TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru);
			continue;
		}
		mutex_enter(&ncp->nc_lock);
		if (ncp->nc_dvp != NULL) {
			cache_invalidate(ncp);
			cache_disassociate(ncp);
			incache--;
		}
		mutex_exit(&ncp->nc_lock);
	}
	cache_ev_scan.ev_count += items;
}

/*
 * Collect dead cache entries from all CPUs and garbage collect.
 */
static void
cache_reclaim(void)
{
	struct namecache *ncp, *next;
	int items;

	KASSERT(mutex_owned(namecache_lock));

	/*
	 * If the number of extant entries not awaiting garbage collection
	 * exceeds the high water mark, then reclaim stale entries until we
	 * reach our low water mark.
	 */
	items = numcache - cache_gcpend;
	if (items > (uint64_t)desiredvnodes * cache_hiwat / 100) {
		cache_prune(items, (int)((uint64_t)desiredvnodes *
		    cache_lowat / 100));
		cache_ev_over.ev_count++;
	} else
		cache_ev_under.ev_count++;

	/*
	 * Stop forward lookup activity on all CPUs and garbage collect dead
	 * entries.
	 */
	cache_lock_cpus();
	ncp = cache_gcqueue;
	cache_gcqueue = NULL;
	items = cache_gcpend;
	cache_gcpend = 0;
	while (ncp != NULL) {
		next = ncp->nc_gcqueue;
		cache_disassociate(ncp);
		KASSERT(ncp->nc_dvp == NULL);
		if (ncp->nc_hash.le_prev != NULL) {
			LIST_REMOVE(ncp, nc_hash);
			ncp->nc_hash.le_prev = NULL;
		}
		pool_cache_put(namecache_cache, ncp);
		ncp = next;
	}
	cache_unlock_cpus();
	numcache -= items;
	cache_ev_gc.ev_count += items;
}

/*
 * Cache maintenance thread, awakening once per second to:
 *
 *	=> keep number of entries below the high water mark
 *	=> sort pseudo-LRU list
 *	=> garbage collect dead entries
 */
static void
cache_thread(void *arg)
{

	mutex_enter(namecache_lock);
	for (;;) {
		cache_reclaim();
		kpause("cachegc", false, hz, namecache_lock);
	}
}

#ifdef DDB
void
namecache_print(struct vnode *vp, void (*pr)(const char *, ...))
{
	struct vnode *dvp = NULL;
	struct namecache *ncp;

	TAILQ_FOREACH(ncp, &nclruhead, nc_lru) {
		if (ncp->nc_vp == vp && ncp->nc_dvp != NULL) {
			(*pr)("name %.*s\n", ncp->nc_nlen, ncp->nc_name);
			dvp = ncp->nc_dvp;
		}
	}
	if (dvp == NULL) {
		(*pr)("name not found\n");
		return;
	}
	vp = dvp;
	TAILQ_FOREACH(ncp, &nclruhead, nc_lru) {
		if (ncp->nc_vp == vp) {
			(*pr)("parent %.*s\n", ncp->nc_nlen, ncp->nc_name);
		}
	}
}
#endif

void
namecache_count_pass2(void)
{
	struct nchcpu *cpup = curcpu()->ci_data.cpu_nch;

	COUNT_UNL(cpup, ncs_pass2);
}

void
namecache_count_2passes(void)
{
	struct nchcpu *cpup = curcpu()->ci_data.cpu_nch;

	COUNT_UNL(cpup, ncs_2passes);
}

/*
 * Fetch the current values of the stats.  We return the most
 * recent values harvested into nchstats by cache_reclaim(), which
 * will be less than a second old.
 */
static int
cache_stat_sysctl(SYSCTLFN_ARGS)
{
	struct nchstats stats;
	struct nchcpu *my_cpup;
#ifdef CACHE_STATS_CURRENT
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
#endif	/* CACHE_STATS_CURRENT */

	if (oldp == NULL) {
		*oldlenp = sizeof(stats);
		return 0;
	}

	if (*oldlenp < sizeof(stats)) {
		*oldlenp = 0;
		return 0;
	}

	/*
	 * Take this CPU's per-cpu lock to hold off cache_reclaim()
	 * from doing a stats update while doing minimal damage to
	 * concurrent operations.
	 */
	sysctl_unlock();
	my_cpup = curcpu()->ci_data.cpu_nch;
	mutex_enter(&my_cpup->cpu_lock);
	stats = nchstats;
#ifdef CACHE_STATS_CURRENT
	for (CPU_INFO_FOREACH(cii, ci)) {
		struct nchcpu *cpup = ci->ci_data.cpu_nch;

		ADD(stats, cpup, ncs_goodhits);
		ADD(stats, cpup, ncs_neghits);
		ADD(stats, cpup, ncs_badhits);
		ADD(stats, cpup, ncs_falsehits);
		ADD(stats, cpup, ncs_miss);
		ADD(stats, cpup, ncs_long);
		ADD(stats, cpup, ncs_pass2);
		ADD(stats, cpup, ncs_2passes);
		ADD(stats, cpup, ncs_revhits);
		ADD(stats, cpup, ncs_revmiss);
	}
#endif	/* CACHE_STATS_CURRENT */
	mutex_exit(&my_cpup->cpu_lock);
	sysctl_relock();

	*oldlenp = sizeof(stats);
	return sysctl_copyout(l, &stats, oldp, sizeof(stats));
}

static void
sysctl_cache_stat_setup(void)
{

	KASSERT(sysctllog == NULL);
	sysctl_createv(&sysctllog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_STRUCT, "namecache_stats",
	    SYSCTL_DESCR("namecache statistics"),
	    cache_stat_sysctl, 0, NULL, 0,
	    CTL_VFS, CTL_CREATE, CTL_EOL);
}
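
/*
 * Illustrative sketch, not part of vfs_cache.c: how a file system's
 * lookup routine is expected to consume the cache_lookup()/cache_enter()
 * interface documented above.  The names xxx_lookup() and xxx_findentry()
 * are hypothetical stand-ins for a file system's own code, the example
 * assumes a file system without whiteout support, and the whole block is
 * kept under "#if 0" so it is never compiled.
 */
#if 0
static int
xxx_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
{
	int error, found;

	/*
	 * Consult the name cache first.  Pass NULL for iswht_ret since
	 * this hypothetical file system does not support whiteouts.
	 */
	found = cache_lookup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
	    cnp->cn_nameiop, cnp->cn_flags, NULL, vpp);
	if (found) {
		/* Positive hit: *vpp is the vnode.  Negative hit: NULL. */
		return (*vpp != NULL) ? 0 : ENOENT;
	}

	/* Cache miss: scan the on-disk directory (hypothetical helper). */
	error = xxx_findentry(dvp, cnp, vpp);

	/* Record the result, positive or negative, for the next lookup. */
	if (error == 0 || error == ENOENT) {
		cache_enter(dvp, (error == 0) ? *vpp : NULL,
		    cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_flags);
	}
	return error;
}
#endif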