/*	$NetBSD: vfs_cache.c,v 1.99 2014/06/16 12:28:10 joerg Exp $	*/

/*-
 * Copyright (c) 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.3 (Berkeley) 8/22/94
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.99 2014/06/16 12:28:10 joerg Exp $");

#include "opt_ddb.h"
#include "opt_revcache.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/time.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/errno.h>
#include <sys/pool.h>
#include <sys/mutex.h>
#include <sys/atomic.h>
#include <sys/kthread.h>
#include <sys/kernel.h>
#include <sys/cpu.h>
#include <sys/evcnt.h>

#define NAMECACHE_ENTER_REVERSE
/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  It is managed LRU, so frequently
 * used names will hang around.  Cache is indexed by hash value
 * obtained from (dvp, name) where dvp refers to the directory
 * containing name.
 *
 * For simplicity (and economy of storage), names longer than
 * a maximum length of NCHNAMLEN are not cached; they occur
 * infrequently in any case, and are almost never of interest.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 * The entry is dropped also when it was not possible to lock
 * the cached vnode, either because vget() failed or the generation
 * number has changed while waiting for the lock.
 */
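
/*
 * Illustrative sketch only (not part of this file's interfaces): a
 * directory lookup path would typically consult the cache first and fall
 * back to a real directory scan on a miss, entering the result afterwards.
 * The helper scan_directory() below is hypothetical; only cache_lookup()
 * and cache_enter() are real.
 *
 *	if (cache_lookup(dvp, name, namelen, nameiop, cnflags,
 *	    &iswht, &vp)) {
 *		// hit: vp != NULL for a positive entry, NULL for a
 *		// negative one (the name is known not to exist)
 *	} else {
 *		error = scan_directory(dvp, name, namelen, &vp);
 *		cache_enter(dvp, vp, name, namelen, cnflags);
 *	}
 */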

/*
 * Per-cpu namecache data.
 */
struct nchcpu {
	kmutex_t	cpu_lock;
	struct nchstats	cpu_stats;
};

/*
 * The type for the hash code. While the hash function generates a
 * u32, the hash code has historically been passed around as a u_long,
 * and the value is modified by xor'ing a uintptr_t, so it's not
 * entirely clear what the best type is. For now I'll leave it
 * unchanged as u_long.
 */

typedef u_long nchash_t;

/*
 * Structures associated with name caching.
 */

static kmutex_t *namecache_lock __read_mostly;
static pool_cache_t namecache_cache __read_mostly;
static TAILQ_HEAD(, namecache) nclruhead __cacheline_aligned;

static LIST_HEAD(nchashhead, namecache) *nchashtbl __read_mostly;
static u_long	nchash __read_mostly;

#define	NCHASH2(hash, dvp)	\
	(((hash) ^ ((uintptr_t)(dvp) >> 3)) & nchash)

static LIST_HEAD(ncvhashhead, namecache) *ncvhashtbl __read_mostly;
static u_long	ncvhash __read_mostly;

#define	NCVHASH(vp)	(((uintptr_t)(vp) >> 3) & ncvhash)

/* Number of cache entries allocated. */
static long	numcache __cacheline_aligned;

/* Garbage collection queue and number of entries pending in it. */
static void	*cache_gcqueue;
static u_int	cache_gcpend;

/* Cache effectiveness statistics. */
struct nchstats	nchstats __cacheline_aligned;
#define	COUNT(c,x)	(c.x++)

static const int cache_lowat = 95;
static const int cache_hiwat = 98;
static const int cache_hottime = 5;	/* number of seconds */
static int doingcache = 1;		/* 1 => enable the cache */

static struct evcnt cache_ev_scan;
static struct evcnt cache_ev_gc;
static struct evcnt cache_ev_over;
static struct evcnt cache_ev_under;
static struct evcnt cache_ev_forced;

static void cache_invalidate(struct namecache *);
static struct namecache *cache_lookup_entry(
    const struct vnode *, const char *, size_t);
static void cache_thread(void *);
static void cache_invalidate(struct namecache *);
static void cache_disassociate(struct namecache *);
static void cache_reclaim(void);
static int cache_ctor(void *, void *, int);
static void cache_dtor(void *, void *);

/*
 * Compute the hash for an entry.
 *
 * (This is for now a wrapper around namei_hash, whose interface is
 * for the time being slightly inconvenient.)
 */
static nchash_t
cache_hash(const char *name, size_t namelen)
{
	const char *endptr;

	endptr = name + namelen;
	return namei_hash(name, &endptr);
}

/*
 * Invalidate a cache entry and enqueue it for garbage collection.
 */
static void
cache_invalidate(struct namecache *ncp)
{
	void *head;

	KASSERT(mutex_owned(&ncp->nc_lock));

	if (ncp->nc_dvp != NULL) {
		ncp->nc_vp = NULL;
		ncp->nc_dvp = NULL;
		do {
			head = cache_gcqueue;
			ncp->nc_gcqueue = head;
		} while (atomic_cas_ptr(&cache_gcqueue, head, ncp) != head);
		atomic_inc_uint(&cache_gcpend);
	}
}

/*
 * Disassociate a namecache entry from any vnodes it is attached to,
 * and remove from the global LRU list.
 */
static void
cache_disassociate(struct namecache *ncp)
{

	KASSERT(mutex_owned(namecache_lock));
	KASSERT(ncp->nc_dvp == NULL);

	if (ncp->nc_lru.tqe_prev != NULL) {
		TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
		ncp->nc_lru.tqe_prev = NULL;
	}
	if (ncp->nc_vhash.le_prev != NULL) {
		LIST_REMOVE(ncp, nc_vhash);
		ncp->nc_vhash.le_prev = NULL;
	}
	if (ncp->nc_vlist.le_prev != NULL) {
		LIST_REMOVE(ncp, nc_vlist);
		ncp->nc_vlist.le_prev = NULL;
	}
	if (ncp->nc_dvlist.le_prev != NULL) {
		LIST_REMOVE(ncp, nc_dvlist);
		ncp->nc_dvlist.le_prev = NULL;
	}
}

/*
 * Lock all CPUs to prevent any cache lookup activity.  Conceptually,
 * this locks out all "readers".
 */
#define	UPDATE(f) do { \
	nchstats.f += cpup->cpu_stats.f; \
	cpup->cpu_stats.f = 0; \
} while (/* CONSTCOND */ 0)

static void
cache_lock_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct nchcpu *cpup;

	for (CPU_INFO_FOREACH(cii, ci)) {
		cpup = ci->ci_data.cpu_nch;
		mutex_enter(&cpup->cpu_lock);
		UPDATE(ncs_goodhits);
		UPDATE(ncs_neghits);
		UPDATE(ncs_badhits);
		UPDATE(ncs_falsehits);
		UPDATE(ncs_miss);
		UPDATE(ncs_long);
		UPDATE(ncs_pass2);
		UPDATE(ncs_2passes);
		UPDATE(ncs_revhits);
		UPDATE(ncs_revmiss);
	}
}

#undef UPDATE

/*
 * Release all CPU locks.
 */
static void
cache_unlock_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct nchcpu *cpup;

	for (CPU_INFO_FOREACH(cii, ci)) {
		cpup = ci->ci_data.cpu_nch;
		mutex_exit(&cpup->cpu_lock);
	}
}

/*
 * Find a single cache entry and return it locked.
 */
static struct namecache *
cache_lookup_entry(const struct vnode *dvp, const char *name, size_t namelen)
{
	struct nchashhead *ncpp;
	struct namecache *ncp;
	nchash_t hash;

	KASSERT(dvp != NULL);
	KASSERT(mutex_owned(namecache_lock));
	hash = cache_hash(name, namelen);
	ncpp = &nchashtbl[NCHASH2(hash, dvp)];

	LIST_FOREACH(ncp, ncpp, nc_hash) {
		if (ncp->nc_dvp != dvp ||
		    ncp->nc_nlen != namelen ||
		    memcmp(ncp->nc_name, name, (u_int)ncp->nc_nlen))
			continue;
		mutex_enter(&ncp->nc_lock);
		if (__predict_true(ncp->nc_dvp == dvp)) {
			ncp->nc_hittime = hardclock_ticks;
			return ncp;
		}
		/* Raced: entry has been nullified. */
		mutex_exit(&ncp->nc_lock);
	}

	return NULL;
}

/*
 * Look for the name in the cache.  We don't do this
 * if the segment name is long, simply so the cache can avoid
 * holding long names (which would either waste space, or
 * add greatly to the complexity).
 *
 * Lookup is called with DVP pointing to the directory to search,
 * and CNP providing the name of the entry being sought: cn_nameptr
 * is the name, cn_namelen is its length, and cn_flags is the flags
 * word from the namei operation.
 *
 * DVP must be locked.
 *
 * There are three possible non-error return states:
 *    1. Nothing was found in the cache. Nothing is known about
 *       the requested name.
 *    2. A negative entry was found in the cache, meaning that the
 *       requested name definitely does not exist.
 *    3. A positive entry was found in the cache, meaning that the
 *       requested name does exist and that we are providing the
 *       vnode.
 * In these cases the results are:
 *    1. 0 returned; VN is set to NULL.
 *    2. 1 returned; VN is set to NULL.
 *    3. 1 returned; VN is set to the vnode found.
 *
 * The additional result argument ISWHT is set to zero, unless a
 * negative entry is found that was entered as a whiteout, in which
 * case ISWHT is set to one.
 *
 * The ISWHT_RET argument pointer may be null. In this case an
 * assertion is made that the whiteout flag is not set. File systems
 * that do not support whiteouts can/should do this.
 *
 * Filesystems that do support whiteouts should add ISWHITEOUT to
 * cnp->cn_flags if ISWHT comes back nonzero.
 *
 * When a vnode is returned, it is locked, as per the vnode lookup
 * locking protocol.
 *
 * There is no way for this function to fail, in the sense of
 * generating an error that requires aborting the namei operation.
 *
 * (Prior to October 2012, this function returned an integer status,
 * and a vnode, and mucked with the flags word in CNP for whiteouts.
 * The integer status was -1 for "nothing found", ENOENT for "a
 * negative entry found", 0 for "a positive entry found", and possibly
 * other errors, and the value of VN might or might not have been set
 * depending on what error occurred.)
 */
int
cache_lookup(struct vnode *dvp, const char *name, size_t namelen,
	     uint32_t nameiop, uint32_t cnflags,
	     int *iswht_ret, struct vnode **vn_ret)
{
	struct namecache *ncp;
	struct vnode *vp;
	struct nchcpu *cpup;
	int error;

	/* Establish default result values */
	if (iswht_ret != NULL) {
		*iswht_ret = 0;
	}
	*vn_ret = NULL;

	if (__predict_false(!doingcache)) {
		return 0;
	}

	cpup = curcpu()->ci_data.cpu_nch;
	if (__predict_false(namelen > NCHNAMLEN)) {
		mutex_enter(&cpup->cpu_lock);
		COUNT(cpup->cpu_stats, ncs_long);
		mutex_exit(&cpup->cpu_lock);
		/* found nothing */
		return 0;
	}
	mutex_enter(namecache_lock);
	ncp = cache_lookup_entry(dvp, name, namelen);
	mutex_exit(namecache_lock);
	if (__predict_false(ncp == NULL)) {
		mutex_enter(&cpup->cpu_lock);
		COUNT(cpup->cpu_stats, ncs_miss);
		mutex_exit(&cpup->cpu_lock);
		/* found nothing */
		return 0;
	}
	if ((cnflags & MAKEENTRY) == 0) {
		mutex_enter(&cpup->cpu_lock);
		COUNT(cpup->cpu_stats, ncs_badhits);
		mutex_exit(&cpup->cpu_lock);
		/*
		 * Last component and we are renaming or deleting,
		 * the cache entry is invalid, or otherwise don't
		 * want cache entry to exist.
		 */
		cache_invalidate(ncp);
		mutex_exit(&ncp->nc_lock);
		/* found nothing */
		return 0;
	}
	if (ncp->nc_vp == NULL) {
		if (iswht_ret != NULL) {
			/*
			 * Restore the ISWHITEOUT flag saved earlier.
			 */
			KASSERT((ncp->nc_flags & ~ISWHITEOUT) == 0);
			*iswht_ret = (ncp->nc_flags & ISWHITEOUT) != 0;
		} else {
			KASSERT(ncp->nc_flags == 0);
		}

		if (__predict_true(nameiop != CREATE ||
		    (cnflags & ISLASTCN) == 0)) {
			mutex_enter(&cpup->cpu_lock);
			COUNT(cpup->cpu_stats, ncs_neghits);
			mutex_exit(&cpup->cpu_lock);
			mutex_exit(&ncp->nc_lock);
			/* found neg entry; vn is already null from above */
			return 1;
		} else {
			mutex_enter(&cpup->cpu_lock);
			COUNT(cpup->cpu_stats, ncs_badhits);
			mutex_exit(&cpup->cpu_lock);
			/*
			 * Last component and we are renaming or
			 * deleting, the cache entry is invalid,
			 * or otherwise don't want cache entry to
			 * exist.
			 */
			cache_invalidate(ncp);
			mutex_exit(&ncp->nc_lock);
			/* found nothing */
			return 0;
		}
	}

	vp = ncp->nc_vp;
	mutex_enter(vp->v_interlock);
	mutex_exit(&ncp->nc_lock);
	error = vget(vp, LK_NOWAIT);
	if (error) {
		KASSERT(error == EBUSY);
		/*
		 * This vnode is being cleaned out.
		 * XXX badhits?
		 */
		mutex_enter(&cpup->cpu_lock);
		COUNT(cpup->cpu_stats, ncs_falsehits);
		mutex_exit(&cpup->cpu_lock);
		/* found nothing */
		return 0;
	}

#ifdef DEBUG
	/*
	 * Since we released ncp->nc_lock,
	 * we can't use this pointer any more.
	 */
	ncp = NULL;
#endif /* DEBUG */

	/* We don't have the right lock, but this is only for stats. */
	mutex_enter(&cpup->cpu_lock);
	COUNT(cpup->cpu_stats, ncs_goodhits);
	mutex_exit(&cpup->cpu_lock);

	/* found it */
	*vn_ret = vp;
	return 1;
}
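
/*
 * Illustrative sketch only (not taken from namei): interpreting the
 * result of cache_lookup() above.  The return value distinguishes
 * "nothing known" from a hit; *vn_ret distinguishes positive from
 * negative hits; *iswht_ret marks whiteout negative entries.
 *
 *	if (cache_lookup(dvp, name, namelen, nameiop, cnflags,
 *	    &iswht, &vp) == 0) {
 *		// case 1: cache miss, fall back to VOP_LOOKUP()
 *	} else if (vp == NULL) {
 *		// case 2: negative hit, the name does not exist;
 *		// iswht != 0 means the entry was a whiteout
 *	} else {
 *		// case 3: positive hit, vp is the vnode for the name
 *	}
 */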

int
cache_lookup_raw(struct vnode *dvp, const char *name, size_t namelen,
		 uint32_t cnflags,
		 int *iswht_ret, struct vnode **vn_ret)
{
	struct namecache *ncp;
	struct vnode *vp;
	struct nchcpu *cpup;
	int error;

	/* Establish default results. */
	if (iswht_ret != NULL) {
		*iswht_ret = 0;
	}
	*vn_ret = NULL;

	if (__predict_false(!doingcache)) {
		/* found nothing */
		return 0;
	}

	cpup = curcpu()->ci_data.cpu_nch;
	if (__predict_false(namelen > NCHNAMLEN)) {
		mutex_enter(&cpup->cpu_lock);
		COUNT(cpup->cpu_stats, ncs_long);
		mutex_exit(&cpup->cpu_lock);
		/* found nothing */
		return 0;
	}
	mutex_enter(namecache_lock);
	ncp = cache_lookup_entry(dvp, name, namelen);
	mutex_exit(namecache_lock);
	if (__predict_false(ncp == NULL)) {
		mutex_enter(&cpup->cpu_lock);
		COUNT(cpup->cpu_stats, ncs_miss);
		mutex_exit(&cpup->cpu_lock);
		/* found nothing */
		return 0;
	}
	vp = ncp->nc_vp;
	if (vp == NULL) {
		/*
		 * Restore the ISWHITEOUT flag saved earlier.
		 */
		if (iswht_ret != NULL) {
			KASSERT((ncp->nc_flags & ~ISWHITEOUT) == 0);
			/*cnp->cn_flags |= ncp->nc_flags;*/
			*iswht_ret = (ncp->nc_flags & ISWHITEOUT) != 0;
		}
		mutex_enter(&cpup->cpu_lock);
		COUNT(cpup->cpu_stats, ncs_neghits);
		mutex_exit(&cpup->cpu_lock);
		mutex_exit(&ncp->nc_lock);
		/* found negative entry; vn is already null from above */
		return 1;
	}
	mutex_enter(vp->v_interlock);
	mutex_exit(&ncp->nc_lock);
	error = vget(vp, LK_NOWAIT);
	if (error) {
		KASSERT(error == EBUSY);
		/*
		 * This vnode is being cleaned out.
		 * XXX badhits?
		 */
		mutex_enter(&cpup->cpu_lock);
		COUNT(cpup->cpu_stats, ncs_falsehits);
		mutex_exit(&cpup->cpu_lock);
		/* found nothing */
		return 0;
	}

	/* Unlocked, but only for stats. */
	mutex_enter(&cpup->cpu_lock);
	COUNT(cpup->cpu_stats, ncs_goodhits); /* XXX can be "badhits" */
	mutex_exit(&cpup->cpu_lock);

	/* found it */
	*vn_ret = vp;
	return 1;
}

/*
 * Scan cache looking for name of directory entry pointing at vp.
 *
 * If the lookup succeeds the vnode is referenced and stored in dvpp.
 *
 * If bufp is non-NULL, also place the name in the buffer which starts
 * at bufp, immediately before *bpp, and move bpp backwards to point
 * at the start of it.  (Yes, this is a little baroque, but it's done
 * this way to cater to the whims of getcwd).
 *
 * Returns 0 on success, -1 on cache miss, positive errno on failure.
 */
int
cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp)
{
	struct namecache *ncp;
	struct vnode *dvp;
	struct nchcpu *cpup;
	struct ncvhashhead *nvcpp;
	char *bp;
	int error, nlen;

	if (!doingcache)
		goto out;

	nvcpp = &ncvhashtbl[NCVHASH(vp)];
	cpup = curcpu()->ci_data.cpu_nch;

	mutex_enter(namecache_lock);
	LIST_FOREACH(ncp, nvcpp, nc_vhash) {
		mutex_enter(&ncp->nc_lock);
		if (ncp->nc_vp == vp &&
		    (dvp = ncp->nc_dvp) != NULL &&
		    dvp != vp) {	/* avoid pesky . entries.. */

#ifdef DIAGNOSTIC
			if (ncp->nc_nlen == 1 &&
			    ncp->nc_name[0] == '.')
				panic("cache_revlookup: found entry for .");

			if (ncp->nc_nlen == 2 &&
			    ncp->nc_name[0] == '.' &&
			    ncp->nc_name[1] == '.')
				panic("cache_revlookup: found entry for ..");
#endif
			mutex_enter(&cpup->cpu_lock);
			COUNT(cpup->cpu_stats, ncs_revhits);
			mutex_exit(&cpup->cpu_lock);
			nlen = ncp->nc_nlen;

			if (bufp) {
				bp = *bpp;
				bp -= nlen;
				if (bp <= bufp) {
					*dvpp = NULL;
					mutex_exit(&ncp->nc_lock);
					mutex_exit(namecache_lock);
					return (ERANGE);
				}
				memcpy(bp, ncp->nc_name, nlen);
				*bpp = bp;
			}

			mutex_enter(dvp->v_interlock);
			mutex_exit(&ncp->nc_lock);
			mutex_exit(namecache_lock);
			error = vget(dvp, LK_NOWAIT);
			if (error) {
				KASSERT(error == EBUSY);
				if (bufp)
					(*bpp) += nlen;
				*dvpp = NULL;
				return -1;
			}
			*dvpp = dvp;
			return (0);
		}
		mutex_exit(&ncp->nc_lock);
	}
	mutex_enter(&cpup->cpu_lock);
	COUNT(cpup->cpu_stats, ncs_revmiss);
	mutex_exit(&cpup->cpu_lock);
	mutex_exit(namecache_lock);
 out:
	*dvpp = NULL;
	return (-1);
}
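
/*
 * Illustrative sketch only: how a getcwd-style caller is expected to
 * drive cache_revlookup() above.  The name is prepended immediately
 * before *bpp, so the caller starts bp at the end of its buffer and
 * walks toward bufp as components accumulate.  The buffer sizing,
 * termination, and separator handling below are assumptions, not part
 * of this API.
 *
 *	char buf[MAXPATHLEN];
 *	char *bp = buf + sizeof(buf);	// names grow downwards
 *
 *	*--bp = '\0';
 *	while (vp is not the root) {
 *		if (cache_revlookup(vp, &dvp, &bp, buf) != 0)
 *			break;		// miss or error: fall back to a scan
 *		*--bp = '/';
 *		vp = dvp;
 *	}
 */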

/*
 * Add an entry to the cache
 */
void
cache_enter(struct vnode *dvp, struct vnode *vp,
	    const char *name, size_t namelen, uint32_t cnflags)
{
	struct namecache *ncp;
	struct namecache *oncp;
	struct nchashhead *ncpp;
	struct ncvhashhead *nvcpp;
	nchash_t hash;

	/* First, check whether we can/should add a cache entry. */
	if ((cnflags & MAKEENTRY) == 0 ||
	    __predict_false(namelen > NCHNAMLEN || !doingcache)) {
		return;
	}

	if (numcache > desiredvnodes) {
		mutex_enter(namecache_lock);
		cache_ev_forced.ev_count++;
		cache_reclaim();
		mutex_exit(namecache_lock);
	}

	ncp = pool_cache_get(namecache_cache, PR_WAITOK);
	mutex_enter(namecache_lock);
	numcache++;

	/*
	 * Concurrent lookups in the same directory may race for a
	 * cache entry.  If there's a duplicated entry, free it.
	 */
	oncp = cache_lookup_entry(dvp, name, namelen);
	if (oncp) {
		cache_invalidate(oncp);
		mutex_exit(&oncp->nc_lock);
	}

	/* Grab the vnode we just found. */
	mutex_enter(&ncp->nc_lock);
	ncp->nc_vp = vp;
	ncp->nc_flags = 0;
	ncp->nc_hittime = 0;
	ncp->nc_gcqueue = NULL;
	if (vp == NULL) {
		/*
		 * For negative hits, save the ISWHITEOUT flag so we can
		 * restore it later when the cache entry is used again.
		 */
		ncp->nc_flags = cnflags & ISWHITEOUT;
	}

	/* Fill in cache info. */
	ncp->nc_dvp = dvp;
	LIST_INSERT_HEAD(&dvp->v_dnclist, ncp, nc_dvlist);
	if (vp)
		LIST_INSERT_HEAD(&vp->v_nclist, ncp, nc_vlist);
	else {
		ncp->nc_vlist.le_prev = NULL;
		ncp->nc_vlist.le_next = NULL;
	}
	KASSERT(namelen <= NCHNAMLEN);
	ncp->nc_nlen = namelen;
	memcpy(ncp->nc_name, name, (unsigned)ncp->nc_nlen);
	TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru);
	hash = cache_hash(name, namelen);
	ncpp = &nchashtbl[NCHASH2(hash, dvp)];

	/*
	 * Flush updates before making visible in table.  No need for a
	 * memory barrier on the other side: to see modifications the
	 * list must be followed, meaning a dependent pointer load.
	 * The below is LIST_INSERT_HEAD() inlined, with the memory
	 * barrier included in the correct place.
	 */
	if ((ncp->nc_hash.le_next = ncpp->lh_first) != NULL)
		ncpp->lh_first->nc_hash.le_prev = &ncp->nc_hash.le_next;
	ncp->nc_hash.le_prev = &ncpp->lh_first;
	membar_producer();
	ncpp->lh_first = ncp;

	ncp->nc_vhash.le_prev = NULL;
	ncp->nc_vhash.le_next = NULL;

	/*
	 * Create reverse-cache entries (used in getcwd) for directories.
	 * (and in linux procfs exe node)
	 */
	if (vp != NULL &&
	    vp != dvp &&
#ifndef NAMECACHE_ENTER_REVERSE
	    vp->v_type == VDIR &&
#endif
	    (ncp->nc_nlen > 2 ||
	    (ncp->nc_nlen > 1 && ncp->nc_name[1] != '.') ||
	    (/* ncp->nc_nlen > 0 && */ ncp->nc_name[0] != '.'))) {
		nvcpp = &ncvhashtbl[NCVHASH(vp)];
		LIST_INSERT_HEAD(nvcpp, ncp, nc_vhash);
	}
	mutex_exit(&ncp->nc_lock);
	mutex_exit(namecache_lock);
}

/*
 * Name cache initialization, from vfs_init() when we are booting
 */
void
nchinit(void)
{
	int error;

	TAILQ_INIT(&nclruhead);
	namecache_cache = pool_cache_init(sizeof(struct namecache),
	    coherency_unit, 0, 0, "ncache", NULL, IPL_NONE, cache_ctor,
	    cache_dtor, NULL);
	KASSERT(namecache_cache != NULL);

	namecache_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);

	nchashtbl = hashinit(desiredvnodes, HASH_LIST, true, &nchash);
	ncvhashtbl =
#ifdef NAMECACHE_ENTER_REVERSE
	    hashinit(desiredvnodes, HASH_LIST, true, &ncvhash);
#else
	    hashinit(desiredvnodes/8, HASH_LIST, true, &ncvhash);
#endif

	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, cache_thread,
	    NULL, NULL, "cachegc");
	if (error != 0)
		panic("nchinit %d", error);

	evcnt_attach_dynamic(&cache_ev_scan, EVCNT_TYPE_MISC, NULL,
	    "namecache", "entries scanned");
	evcnt_attach_dynamic(&cache_ev_gc, EVCNT_TYPE_MISC, NULL,
	    "namecache", "entries collected");
	evcnt_attach_dynamic(&cache_ev_over, EVCNT_TYPE_MISC, NULL,
	    "namecache", "over scan target");
	evcnt_attach_dynamic(&cache_ev_under, EVCNT_TYPE_MISC, NULL,
	    "namecache", "under scan target");
	evcnt_attach_dynamic(&cache_ev_forced, EVCNT_TYPE_MISC, NULL,
	    "namecache", "forced reclaims");
}

static int
cache_ctor(void *arg, void *obj, int flag)
{
	struct namecache *ncp;

	ncp = obj;
	mutex_init(&ncp->nc_lock, MUTEX_DEFAULT, IPL_NONE);

	return 0;
}

static void
cache_dtor(void *arg, void *obj)
{
	struct namecache *ncp;

	ncp = obj;
	mutex_destroy(&ncp->nc_lock);
}

/*
 * Called once for each CPU in the system as attached.
 */
void
cache_cpu_init(struct cpu_info *ci)
{
	struct nchcpu *cpup;
	size_t sz;

	sz = roundup2(sizeof(*cpup), coherency_unit) + coherency_unit;
	cpup = kmem_zalloc(sz, KM_SLEEP);
	cpup = (void *)roundup2((uintptr_t)cpup, coherency_unit);
	mutex_init(&cpup->cpu_lock, MUTEX_DEFAULT, IPL_NONE);
	ci->ci_data.cpu_nch = cpup;
}

/*
 * Name cache reinitialization, for when the maximum number of vnodes increases.
 */
void
nchreinit(void)
{
	struct namecache *ncp;
	struct nchashhead *oldhash1, *hash1;
	struct ncvhashhead *oldhash2, *hash2;
	u_long i, oldmask1, oldmask2, mask1, mask2;

	hash1 = hashinit(desiredvnodes, HASH_LIST, true, &mask1);
	hash2 =
#ifdef NAMECACHE_ENTER_REVERSE
	    hashinit(desiredvnodes, HASH_LIST, true, &mask2);
#else
	    hashinit(desiredvnodes/8, HASH_LIST, true, &mask2);
#endif
	mutex_enter(namecache_lock);
	cache_lock_cpus();
	oldhash1 = nchashtbl;
	oldmask1 = nchash;
	nchashtbl = hash1;
	nchash = mask1;
	oldhash2 = ncvhashtbl;
	oldmask2 = ncvhash;
	ncvhashtbl = hash2;
	ncvhash = mask2;
	for (i = 0; i <= oldmask1; i++) {
		while ((ncp = LIST_FIRST(&oldhash1[i])) != NULL) {
			LIST_REMOVE(ncp, nc_hash);
			ncp->nc_hash.le_prev = NULL;
		}
	}
	for (i = 0; i <= oldmask2; i++) {
		while ((ncp = LIST_FIRST(&oldhash2[i])) != NULL) {
			LIST_REMOVE(ncp, nc_vhash);
			ncp->nc_vhash.le_prev = NULL;
		}
	}
	cache_unlock_cpus();
	mutex_exit(namecache_lock);
	hashdone(oldhash1, HASH_LIST, oldmask1);
	hashdone(oldhash2, HASH_LIST, oldmask2);
}

/*
 * Cache flush, a particular vnode; called when a vnode is renamed to
 * hide entries that would now be invalid
 */
void
cache_purge1(struct vnode *vp, const char *name, size_t namelen, int flags)
{
	struct namecache *ncp, *ncnext;

	mutex_enter(namecache_lock);
	if (flags & PURGE_PARENTS) {
		for (ncp = LIST_FIRST(&vp->v_nclist); ncp != NULL;
		    ncp = ncnext) {
			ncnext = LIST_NEXT(ncp, nc_vlist);
			mutex_enter(&ncp->nc_lock);
			cache_invalidate(ncp);
			mutex_exit(&ncp->nc_lock);
			cache_disassociate(ncp);
		}
	}
	if (flags & PURGE_CHILDREN) {
		for (ncp = LIST_FIRST(&vp->v_dnclist); ncp != NULL;
		    ncp = ncnext) {
			ncnext = LIST_NEXT(ncp, nc_dvlist);
			mutex_enter(&ncp->nc_lock);
			cache_invalidate(ncp);
			mutex_exit(&ncp->nc_lock);
			cache_disassociate(ncp);
		}
	}
	if (name != NULL) {
		ncp = cache_lookup_entry(vp, name, namelen);
		if (ncp) {
			cache_invalidate(ncp);
			mutex_exit(&ncp->nc_lock);
			cache_disassociate(ncp);
		}
	}
	mutex_exit(namecache_lock);
}

/*
 * Cache flush, a whole filesystem; called when filesys is umounted to
 * remove entries that would now be invalid.
 */
void
cache_purgevfs(struct mount *mp)
{
	struct namecache *ncp, *nxtcp;

	mutex_enter(namecache_lock);
	for (ncp = TAILQ_FIRST(&nclruhead); ncp != NULL; ncp = nxtcp) {
		nxtcp = TAILQ_NEXT(ncp, nc_lru);
		mutex_enter(&ncp->nc_lock);
		if (ncp->nc_dvp != NULL && ncp->nc_dvp->v_mount == mp) {
			/* Free the resources we had. */
			cache_invalidate(ncp);
			cache_disassociate(ncp);
		}
		mutex_exit(&ncp->nc_lock);
	}
	cache_reclaim();
	mutex_exit(namecache_lock);
}

/*
 * Scan global list invalidating entries until we meet a preset target.
 * Prefer to invalidate entries that have not scored a hit within
 * cache_hottime seconds.  We sort the LRU list only for this routine's
 * benefit.
 */
static void
cache_prune(int incache, int target)
{
	struct namecache *ncp, *nxtcp, *sentinel;
	int items, recent, tryharder;

	KASSERT(mutex_owned(namecache_lock));

	items = 0;
	tryharder = 0;
	recent = hardclock_ticks - hz * cache_hottime;
	sentinel = NULL;
	for (ncp = TAILQ_FIRST(&nclruhead); ncp != NULL; ncp = nxtcp) {
		if (incache <= target)
			break;
		items++;
		nxtcp = TAILQ_NEXT(ncp, nc_lru);
		if (ncp == sentinel) {
			/*
			 * If we looped back on ourself, then ignore
			 * recent entries and purge whatever we find.
			 */
			tryharder = 1;
		}
		if (ncp->nc_dvp == NULL)
			continue;
		if (!tryharder && (ncp->nc_hittime - recent) > 0) {
			if (sentinel == NULL)
				sentinel = ncp;
			TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
			TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru);
			continue;
		}
		mutex_enter(&ncp->nc_lock);
		if (ncp->nc_dvp != NULL) {
			cache_invalidate(ncp);
			cache_disassociate(ncp);
			incache--;
		}
		mutex_exit(&ncp->nc_lock);
	}
	cache_ev_scan.ev_count += items;
}
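
/*
 * Worked example for illustration (the vnode limit below is an assumed
 * configuration, not anything this file sets): with desiredvnodes ==
 * 100000 and the watermarks above (cache_hiwat == 98, cache_lowat == 95),
 * cache_reclaim() below starts pruning once more than 98000 entries are
 * live outside the GC queue, and cache_prune() is asked to bring that
 * count back down to 95000, preferring entries that have not scored a
 * hit within the last cache_hottime (5) seconds.
 */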

/*
 * Collect dead cache entries from all CPUs and garbage collect.
 */
static void
cache_reclaim(void)
{
	struct namecache *ncp, *next;
	int items;

	KASSERT(mutex_owned(namecache_lock));

	/*
	 * If the number of extant entries not awaiting garbage collection
	 * exceeds the high water mark, then reclaim stale entries until we
	 * reach our low water mark.
	 */
	items = numcache - cache_gcpend;
	if (items > (uint64_t)desiredvnodes * cache_hiwat / 100) {
		cache_prune(items, (int)((uint64_t)desiredvnodes *
		    cache_lowat / 100));
		cache_ev_over.ev_count++;
	} else
		cache_ev_under.ev_count++;

	/*
	 * Stop forward lookup activity on all CPUs and garbage collect dead
	 * entries.
	 */
	cache_lock_cpus();
	ncp = cache_gcqueue;
	cache_gcqueue = NULL;
	items = cache_gcpend;
	cache_gcpend = 0;
	while (ncp != NULL) {
		next = ncp->nc_gcqueue;
		cache_disassociate(ncp);
		KASSERT(ncp->nc_dvp == NULL);
		if (ncp->nc_hash.le_prev != NULL) {
			LIST_REMOVE(ncp, nc_hash);
			ncp->nc_hash.le_prev = NULL;
		}
		pool_cache_put(namecache_cache, ncp);
		ncp = next;
	}
	cache_unlock_cpus();
	numcache -= items;
	cache_ev_gc.ev_count += items;
}

/*
 * Cache maintenance thread, awakening once per second to:
 *
 *	=> keep number of entries below the high water mark
 *	=> sort pseudo-LRU list
 *	=> garbage collect dead entries
 */
static void
cache_thread(void *arg)
{

	mutex_enter(namecache_lock);
	for (;;) {
		cache_reclaim();
		kpause("cachegc", false, hz, namecache_lock);
	}
}

#ifdef DDB
void
namecache_print(struct vnode *vp, void (*pr)(const char *, ...))
{
	struct vnode *dvp = NULL;
	struct namecache *ncp;

	TAILQ_FOREACH(ncp, &nclruhead, nc_lru) {
		if (ncp->nc_vp == vp && ncp->nc_dvp != NULL) {
			(*pr)("name %.*s\n", ncp->nc_nlen, ncp->nc_name);
			dvp = ncp->nc_dvp;
		}
	}
	if (dvp == NULL) {
		(*pr)("name not found\n");
		return;
	}
	vp = dvp;
	TAILQ_FOREACH(ncp, &nclruhead, nc_lru) {
		if (ncp->nc_vp == vp) {
			(*pr)("parent %.*s\n", ncp->nc_nlen, ncp->nc_name);
		}
	}
}
#endif

void
namecache_count_pass2(void)
{
	struct nchcpu *cpup = curcpu()->ci_data.cpu_nch;

	mutex_enter(&cpup->cpu_lock);
	COUNT(cpup->cpu_stats, ncs_pass2);
	mutex_exit(&cpup->cpu_lock);
}

void
namecache_count_2passes(void)
{
	struct nchcpu *cpup = curcpu()->ci_data.cpu_nch;

	mutex_enter(&cpup->cpu_lock);
	COUNT(cpup->cpu_stats, ncs_2passes);
	mutex_exit(&cpup->cpu_lock);
}

static int
cache_stat_sysctl(SYSCTLFN_ARGS)
{
	struct nchstats_sysctl stats;

	if (oldp == NULL) {
		*oldlenp = sizeof(stats);
		return 0;
	}

	if (*oldlenp < sizeof(stats)) {
		*oldlenp = 0;
		return 0;
	}

	memset(&stats, 0, sizeof(stats));

	sysctl_unlock();
	cache_lock_cpus();
	stats.ncs_goodhits = nchstats.ncs_goodhits;
	stats.ncs_neghits = nchstats.ncs_neghits;
	stats.ncs_badhits = nchstats.ncs_badhits;
	stats.ncs_falsehits = nchstats.ncs_falsehits;
	stats.ncs_miss = nchstats.ncs_miss;
	stats.ncs_long = nchstats.ncs_long;
	stats.ncs_pass2 = nchstats.ncs_pass2;
	stats.ncs_2passes = nchstats.ncs_2passes;
	stats.ncs_revhits = nchstats.ncs_revhits;
	stats.ncs_revmiss = nchstats.ncs_revmiss;
	cache_unlock_cpus();
	sysctl_relock();

	*oldlenp = sizeof(stats);
	return sysctl_copyout(l, &stats, oldp, sizeof(stats));
}

SYSCTL_SETUP(sysctl_cache_stat_setup, "vfs.namecache_stats subtree setup")
{
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_STRUCT, "namecache_stats",
		       SYSCTL_DESCR("namecache statistics"),
		       cache_stat_sysctl, 0, NULL, 0,
		       CTL_VFS, CTL_CREATE, CTL_EOL);
}