1 /* $NetBSD: subr_pool.c,v 1.250 2019/05/09 08:16:14 skrll Exp $ */ 2 3 /* 4 * Copyright (c) 1997, 1999, 2000, 2002, 2007, 2008, 2010, 2014, 2015, 2018 5 * The NetBSD Foundation, Inc. 6 * All rights reserved. 7 * 8 * This code is derived from software contributed to The NetBSD Foundation 9 * by Paul Kranenburg; by Jason R. Thorpe of the Numerical Aerospace 10 * Simulation Facility, NASA Ames Research Center; by Andrew Doran, and by 11 * Maxime Villard. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 24 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 26 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 29 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 30 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 32 * POSSIBILITY OF SUCH DAMAGE. 33 */ 34 35 #include <sys/cdefs.h> 36 __KERNEL_RCSID(0, "$NetBSD: subr_pool.c,v 1.250 2019/05/09 08:16:14 skrll Exp $"); 37 38 #ifdef _KERNEL_OPT 39 #include "opt_ddb.h" 40 #include "opt_lockdebug.h" 41 #include "opt_pool.h" 42 #include "opt_kleak.h" 43 #endif 44 45 #include <sys/param.h> 46 #include <sys/systm.h> 47 #include <sys/sysctl.h> 48 #include <sys/bitops.h> 49 #include <sys/proc.h> 50 #include <sys/errno.h> 51 #include <sys/kernel.h> 52 #include <sys/vmem.h> 53 #include <sys/pool.h> 54 #include <sys/syslog.h> 55 #include <sys/debug.h> 56 #include <sys/lockdebug.h> 57 #include <sys/xcall.h> 58 #include <sys/cpu.h> 59 #include <sys/atomic.h> 60 #include <sys/asan.h> 61 62 #include <uvm/uvm_extern.h> 63 64 /* 65 * Pool resource management utility. 66 * 67 * Memory is allocated in pages which are split into pieces according to 68 * the pool item size. Each page is kept on one of three lists in the 69 * pool structure: `pr_emptypages', `pr_fullpages' and `pr_partpages', 70 * for empty, full and partially-full pages respectively. The individual 71 * pool items are on a linked list headed by `ph_itemlist' in each page 72 * header. The memory for building the page list is either taken from 73 * the allocated pages themselves (for small pool items) or taken from 74 * an internal pool of page headers (`phpool'). 75 */ 76 77 /* List of all pools. Non static as needed by 'vmstat -m' */ 78 TAILQ_HEAD(, pool) pool_head = TAILQ_HEAD_INITIALIZER(pool_head); 79 80 /* Private pool for page header structures */ 81 #define PHPOOL_MAX 8 82 static struct pool phpool[PHPOOL_MAX]; 83 #define PHPOOL_FREELIST_NELEM(idx) \ 84 (((idx) == 0) ? 
0 : BITMAP_SIZE * (1 << (idx))) 85 86 #if defined(KASAN) 87 #define POOL_REDZONE 88 #endif 89 90 #ifdef POOL_REDZONE 91 # ifdef KASAN 92 # define POOL_REDZONE_SIZE 8 93 # else 94 # define POOL_REDZONE_SIZE 2 95 # endif 96 static void pool_redzone_init(struct pool *, size_t); 97 static void pool_redzone_fill(struct pool *, void *); 98 static void pool_redzone_check(struct pool *, void *); 99 static void pool_cache_redzone_check(pool_cache_t, void *); 100 #else 101 # define pool_redzone_init(pp, sz) __nothing 102 # define pool_redzone_fill(pp, ptr) __nothing 103 # define pool_redzone_check(pp, ptr) __nothing 104 # define pool_cache_redzone_check(pc, ptr) __nothing 105 #endif 106 107 #ifdef KLEAK 108 static void pool_kleak_fill(struct pool *, void *); 109 static void pool_cache_kleak_fill(pool_cache_t, void *); 110 #else 111 #define pool_kleak_fill(pp, ptr) __nothing 112 #define pool_cache_kleak_fill(pc, ptr) __nothing 113 #endif 114 115 #ifdef POOL_QUARANTINE 116 static void pool_quarantine_init(struct pool *); 117 static void pool_quarantine_flush(struct pool *); 118 static bool pool_put_quarantine(struct pool *, void *, 119 struct pool_pagelist *); 120 static bool pool_cache_put_quarantine(pool_cache_t, void *, paddr_t); 121 #else 122 #define pool_quarantine_init(a) __nothing 123 #define pool_quarantine_flush(a) __nothing 124 #define pool_put_quarantine(a, b, c) false 125 #define pool_cache_put_quarantine(a, b, c) false 126 #endif 127 128 #define pc_has_ctor(pc) \ 129 (pc->pc_ctor != (int (*)(void *, void *, int))nullop) 130 #define pc_has_dtor(pc) \ 131 (pc->pc_dtor != (void (*)(void *, void *))nullop) 132 133 static void *pool_page_alloc_meta(struct pool *, int); 134 static void pool_page_free_meta(struct pool *, void *); 135 136 /* allocator for pool metadata */ 137 struct pool_allocator pool_allocator_meta = { 138 .pa_alloc = pool_page_alloc_meta, 139 .pa_free = pool_page_free_meta, 140 .pa_pagesz = 0 141 }; 142 143 #define POOL_ALLOCATOR_BIG_BASE 13 144 extern struct pool_allocator pool_allocator_big[]; 145 static int pool_bigidx(size_t); 146 147 /* # of seconds to retain page after last use */ 148 int pool_inactive_time = 10; 149 150 /* Next candidate for drainage (see pool_drain()) */ 151 static struct pool *drainpp; 152 153 /* This lock protects both pool_head and drainpp. 
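 * It is also held while manipulating the pr_refcnt/pc_refcnt reference
 * counts and while sleeping on the pool_busy condvar; see pool_drain(),
 * pool_destroy() and pool_cache_cpu_init() below.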
*/ 154 static kmutex_t pool_head_lock; 155 static kcondvar_t pool_busy; 156 157 /* This lock protects initialization of a potentially shared pool allocator */ 158 static kmutex_t pool_allocator_lock; 159 160 static unsigned int poolid_counter = 0; 161 162 typedef uint32_t pool_item_bitmap_t; 163 #define BITMAP_SIZE (CHAR_BIT * sizeof(pool_item_bitmap_t)) 164 #define BITMAP_MASK (BITMAP_SIZE - 1) 165 166 struct pool_item_header { 167 /* Page headers */ 168 LIST_ENTRY(pool_item_header) 169 ph_pagelist; /* pool page list */ 170 union { 171 /* !PR_PHINPAGE */ 172 struct { 173 SPLAY_ENTRY(pool_item_header) 174 phu_node; /* off-page page headers */ 175 } phu_offpage; 176 /* PR_PHINPAGE */ 177 struct { 178 unsigned int phu_poolid; 179 } phu_onpage; 180 } ph_u1; 181 void * ph_page; /* this page's address */ 182 uint32_t ph_time; /* last referenced */ 183 uint16_t ph_nmissing; /* # of chunks in use */ 184 uint16_t ph_off; /* start offset in page */ 185 union { 186 /* !PR_USEBMAP */ 187 struct { 188 LIST_HEAD(, pool_item) 189 phu_itemlist; /* chunk list for this page */ 190 } phu_normal; 191 /* PR_USEBMAP */ 192 struct { 193 pool_item_bitmap_t phu_bitmap[1]; 194 } phu_notouch; 195 } ph_u2; 196 }; 197 #define ph_node ph_u1.phu_offpage.phu_node 198 #define ph_poolid ph_u1.phu_onpage.phu_poolid 199 #define ph_itemlist ph_u2.phu_normal.phu_itemlist 200 #define ph_bitmap ph_u2.phu_notouch.phu_bitmap 201 202 #define PHSIZE ALIGN(sizeof(struct pool_item_header)) 203 204 #if defined(DIAGNOSTIC) && !defined(KASAN) 205 #define POOL_CHECK_MAGIC 206 #endif 207 208 struct pool_item { 209 #ifdef POOL_CHECK_MAGIC 210 u_int pi_magic; 211 #endif 212 #define PI_MAGIC 0xdeaddeadU 213 /* Other entries use only this list entry */ 214 LIST_ENTRY(pool_item) pi_list; 215 }; 216 217 #define POOL_NEEDS_CATCHUP(pp) \ 218 ((pp)->pr_nitems < (pp)->pr_minitems) 219 220 /* 221 * Pool cache management. 222 * 223 * Pool caches provide a way for constructed objects to be cached by the 224 * pool subsystem. This can lead to performance improvements by avoiding 225 * needless object construction/destruction; it is deferred until absolutely 226 * necessary. 227 * 228 * Caches are grouped into cache groups. Each cache group references up 229 * to PCG_NUMOBJECTS constructed objects. When a cache allocates an 230 * object from the pool, it calls the object's constructor and places it 231 * into a cache group. When a cache group frees an object back to the 232 * pool, it first calls the object's destructor. This allows the object 233 * to persist in constructed form while freed to the cache. 234 * 235 * The pool references each cache, so that when a pool is drained by the 236 * pagedaemon, it can drain each individual cache as well. Each time a 237 * cache is drained, the most idle cache group is freed to the pool in 238 * its entirety. 239 * 240 * Pool caches are layed on top of pools. By layering them, we can avoid 241 * the complexity of cache management for pools which would not benefit 242 * from it. 243 */ 244 245 static struct pool pcg_normal_pool; 246 static struct pool pcg_large_pool; 247 static struct pool cache_pool; 248 static struct pool cache_cpu_pool; 249 250 /* List of all caches. 
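 * Kept sorted by name (pr_wchan), like pool_head above; entries are added
 * by pool_cache_bootstrap() and removed by pool_cache_bootstrap_destroy().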
*/ 251 TAILQ_HEAD(,pool_cache) pool_cache_head = 252 TAILQ_HEAD_INITIALIZER(pool_cache_head); 253 254 int pool_cache_disable; /* global disable for caching */ 255 static const pcg_t pcg_dummy; /* zero sized: always empty, yet always full */ 256 257 static bool pool_cache_put_slow(pool_cache_cpu_t *, int, 258 void *); 259 static bool pool_cache_get_slow(pool_cache_cpu_t *, int, 260 void **, paddr_t *, int); 261 static void pool_cache_cpu_init1(struct cpu_info *, pool_cache_t); 262 static void pool_cache_invalidate_groups(pool_cache_t, pcg_t *); 263 static void pool_cache_invalidate_cpu(pool_cache_t, u_int); 264 static void pool_cache_transfer(pool_cache_t); 265 266 static int pool_catchup(struct pool *); 267 static void pool_prime_page(struct pool *, void *, 268 struct pool_item_header *); 269 static void pool_update_curpage(struct pool *); 270 271 static int pool_grow(struct pool *, int); 272 static void *pool_allocator_alloc(struct pool *, int); 273 static void pool_allocator_free(struct pool *, void *); 274 275 static void pool_print_pagelist(struct pool *, struct pool_pagelist *, 276 void (*)(const char *, ...) __printflike(1, 2)); 277 static void pool_print1(struct pool *, const char *, 278 void (*)(const char *, ...) __printflike(1, 2)); 279 280 static int pool_chk_page(struct pool *, const char *, 281 struct pool_item_header *); 282 283 /* -------------------------------------------------------------------------- */ 284 285 static inline unsigned int 286 pr_item_bitmap_index(const struct pool *pp, const struct pool_item_header *ph, 287 const void *v) 288 { 289 const char *cp = v; 290 unsigned int idx; 291 292 KASSERT(pp->pr_roflags & PR_USEBMAP); 293 idx = (cp - (char *)ph->ph_page - ph->ph_off) / pp->pr_size; 294 295 if (__predict_false(idx >= pp->pr_itemsperpage)) { 296 panic("%s: [%s] %u >= %u", __func__, pp->pr_wchan, idx, 297 pp->pr_itemsperpage); 298 } 299 300 return idx; 301 } 302 303 static inline void 304 pr_item_bitmap_put(const struct pool *pp, struct pool_item_header *ph, 305 void *obj) 306 { 307 unsigned int idx = pr_item_bitmap_index(pp, ph, obj); 308 pool_item_bitmap_t *bitmap = ph->ph_bitmap + (idx / BITMAP_SIZE); 309 pool_item_bitmap_t mask = 1U << (idx & BITMAP_MASK); 310 311 if (__predict_false((*bitmap & mask) != 0)) { 312 panic("%s: [%s] %p already freed", __func__, pp->pr_wchan, obj); 313 } 314 315 *bitmap |= mask; 316 } 317 318 static inline void * 319 pr_item_bitmap_get(const struct pool *pp, struct pool_item_header *ph) 320 { 321 pool_item_bitmap_t *bitmap = ph->ph_bitmap; 322 unsigned int idx; 323 int i; 324 325 for (i = 0; ; i++) { 326 int bit; 327 328 KASSERT((i * BITMAP_SIZE) < pp->pr_itemsperpage); 329 bit = ffs32(bitmap[i]); 330 if (bit) { 331 pool_item_bitmap_t mask; 332 333 bit--; 334 idx = (i * BITMAP_SIZE) + bit; 335 mask = 1U << bit; 336 KASSERT((bitmap[i] & mask) != 0); 337 bitmap[i] &= ~mask; 338 break; 339 } 340 } 341 KASSERT(idx < pp->pr_itemsperpage); 342 return (char *)ph->ph_page + ph->ph_off + idx * pp->pr_size; 343 } 344 345 static inline void 346 pr_item_bitmap_init(const struct pool *pp, struct pool_item_header *ph) 347 { 348 pool_item_bitmap_t *bitmap = ph->ph_bitmap; 349 const int n = howmany(pp->pr_itemsperpage, BITMAP_SIZE); 350 int i; 351 352 for (i = 0; i < n; i++) { 353 bitmap[i] = (pool_item_bitmap_t)-1; 354 } 355 } 356 357 /* -------------------------------------------------------------------------- */ 358 359 static inline void 360 pr_item_linkedlist_put(const struct pool *pp, struct pool_item_header *ph, 361 void *obj) 362 { 
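	/*
	 * Put the item back on the page's free list.  Used only for pools
	 * without PR_USEBMAP; bitmap pools go through pr_item_bitmap_put().
	 */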
363 struct pool_item *pi = obj; 364 365 #ifdef POOL_CHECK_MAGIC 366 pi->pi_magic = PI_MAGIC; 367 #endif 368 369 if (pp->pr_redzone) { 370 /* 371 * Mark the pool_item as valid. The rest is already 372 * invalid. 373 */ 374 kasan_mark(pi, sizeof(*pi), sizeof(*pi), 0); 375 } 376 377 LIST_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list); 378 } 379 380 static inline void * 381 pr_item_linkedlist_get(struct pool *pp, struct pool_item_header *ph) 382 { 383 struct pool_item *pi; 384 void *v; 385 386 v = pi = LIST_FIRST(&ph->ph_itemlist); 387 if (__predict_false(v == NULL)) { 388 mutex_exit(&pp->pr_lock); 389 panic("%s: [%s] page empty", __func__, pp->pr_wchan); 390 } 391 KASSERTMSG((pp->pr_nitems > 0), 392 "%s: [%s] nitems %u inconsistent on itemlist", 393 __func__, pp->pr_wchan, pp->pr_nitems); 394 #ifdef POOL_CHECK_MAGIC 395 KASSERTMSG((pi->pi_magic == PI_MAGIC), 396 "%s: [%s] free list modified: " 397 "magic=%x; page %p; item addr %p", __func__, 398 pp->pr_wchan, pi->pi_magic, ph->ph_page, pi); 399 #endif 400 401 /* 402 * Remove from item list. 403 */ 404 LIST_REMOVE(pi, pi_list); 405 406 return v; 407 } 408 409 /* -------------------------------------------------------------------------- */ 410 411 static inline int 412 phtree_compare(struct pool_item_header *a, struct pool_item_header *b) 413 { 414 415 /* 416 * We consider pool_item_header with smaller ph_page bigger. This 417 * unnatural ordering is for the benefit of pr_find_pagehead. 418 */ 419 if (a->ph_page < b->ph_page) 420 return 1; 421 else if (a->ph_page > b->ph_page) 422 return -1; 423 else 424 return 0; 425 } 426 427 SPLAY_PROTOTYPE(phtree, pool_item_header, ph_node, phtree_compare); 428 SPLAY_GENERATE(phtree, pool_item_header, ph_node, phtree_compare); 429 430 static inline struct pool_item_header * 431 pr_find_pagehead_noalign(struct pool *pp, void *v) 432 { 433 struct pool_item_header *ph, tmp; 434 435 tmp.ph_page = (void *)(uintptr_t)v; 436 ph = SPLAY_FIND(phtree, &pp->pr_phtree, &tmp); 437 if (ph == NULL) { 438 ph = SPLAY_ROOT(&pp->pr_phtree); 439 if (ph != NULL && phtree_compare(&tmp, ph) >= 0) { 440 ph = SPLAY_NEXT(phtree, &pp->pr_phtree, ph); 441 } 442 KASSERT(ph == NULL || phtree_compare(&tmp, ph) < 0); 443 } 444 445 return ph; 446 } 447 448 /* 449 * Return the pool page header based on item address. 
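 *
 * For PR_PHINPAGE pools the header sits at the start of the item's page
 * and is found by masking the item address with pa_pagemask (e.g. with a
 * hypothetical 4 KiB page size, an item at 0xdead1c40 yields the header
 * at 0xdead1000), then sanity-checked against the pool id.  Off-page
 * headers are looked up in the pool's splay tree instead.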
450 */ 451 static inline struct pool_item_header * 452 pr_find_pagehead(struct pool *pp, void *v) 453 { 454 struct pool_item_header *ph, tmp; 455 456 if ((pp->pr_roflags & PR_NOALIGN) != 0) { 457 ph = pr_find_pagehead_noalign(pp, v); 458 } else { 459 void *page = 460 (void *)((uintptr_t)v & pp->pr_alloc->pa_pagemask); 461 462 if ((pp->pr_roflags & PR_PHINPAGE) != 0) { 463 ph = (struct pool_item_header *)page; 464 if (__predict_false((void *)ph->ph_page != page)) { 465 panic("%s: [%s] item %p not part of pool", 466 __func__, pp->pr_wchan, v); 467 } 468 if (__predict_false((char *)v < (char *)page + 469 ph->ph_off)) { 470 panic("%s: [%s] item %p below item space", 471 __func__, pp->pr_wchan, v); 472 } 473 if (__predict_false(ph->ph_poolid != pp->pr_poolid)) { 474 panic("%s: [%s] item %p poolid %u != %u", 475 __func__, pp->pr_wchan, v, ph->ph_poolid, 476 pp->pr_poolid); 477 } 478 } else { 479 tmp.ph_page = page; 480 ph = SPLAY_FIND(phtree, &pp->pr_phtree, &tmp); 481 } 482 } 483 484 KASSERT(ph == NULL || ((pp->pr_roflags & PR_PHINPAGE) != 0) || 485 ((char *)ph->ph_page <= (char *)v && 486 (char *)v < (char *)ph->ph_page + pp->pr_alloc->pa_pagesz)); 487 return ph; 488 } 489 490 static void 491 pr_pagelist_free(struct pool *pp, struct pool_pagelist *pq) 492 { 493 struct pool_item_header *ph; 494 495 while ((ph = LIST_FIRST(pq)) != NULL) { 496 LIST_REMOVE(ph, ph_pagelist); 497 pool_allocator_free(pp, ph->ph_page); 498 if ((pp->pr_roflags & PR_PHINPAGE) == 0) 499 pool_put(pp->pr_phpool, ph); 500 } 501 } 502 503 /* 504 * Remove a page from the pool. 505 */ 506 static inline void 507 pr_rmpage(struct pool *pp, struct pool_item_header *ph, 508 struct pool_pagelist *pq) 509 { 510 511 KASSERT(mutex_owned(&pp->pr_lock)); 512 513 /* 514 * If the page was idle, decrement the idle page count. 515 */ 516 if (ph->ph_nmissing == 0) { 517 KASSERT(pp->pr_nidle != 0); 518 KASSERTMSG((pp->pr_nitems >= pp->pr_itemsperpage), 519 "nitems=%u < itemsperpage=%u", 520 pp->pr_nitems, pp->pr_itemsperpage); 521 pp->pr_nidle--; 522 } 523 524 pp->pr_nitems -= pp->pr_itemsperpage; 525 526 /* 527 * Unlink the page from the pool and queue it for release. 528 */ 529 LIST_REMOVE(ph, ph_pagelist); 530 if (pp->pr_roflags & PR_PHINPAGE) { 531 if (__predict_false(ph->ph_poolid != pp->pr_poolid)) { 532 panic("%s: [%s] ph %p poolid %u != %u", 533 __func__, pp->pr_wchan, ph, ph->ph_poolid, 534 pp->pr_poolid); 535 } 536 } else { 537 SPLAY_REMOVE(phtree, &pp->pr_phtree, ph); 538 } 539 LIST_INSERT_HEAD(pq, ph, ph_pagelist); 540 541 pp->pr_npages--; 542 pp->pr_npagefree++; 543 544 pool_update_curpage(pp); 545 } 546 547 /* 548 * Initialize all the pools listed in the "pools" link set. 549 */ 550 void 551 pool_subsystem_init(void) 552 { 553 size_t size; 554 int idx; 555 556 mutex_init(&pool_head_lock, MUTEX_DEFAULT, IPL_NONE); 557 mutex_init(&pool_allocator_lock, MUTEX_DEFAULT, IPL_NONE); 558 cv_init(&pool_busy, "poolbusy"); 559 560 /* 561 * Initialize private page header pool and cache magazine pool if we 562 * haven't done so yet. 
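 *
 * With the current PHPOOL_MAX of 8 and 32-bit bitmap words this creates
 * phpool-0 (plain header, no item bitmap, used by off-page pools that
 * keep a linked free list) and phpool-64 through phpool-4096, each able
 * to hold a free-item bitmap for that many items.  The magazine pools
 * (pcgnormal/pcglarge) and the pool_cache bookkeeping pools
 * (pcache/pcachecpu) are likewise set up below.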
563 */ 564 for (idx = 0; idx < PHPOOL_MAX; idx++) { 565 static char phpool_names[PHPOOL_MAX][6+1+6+1]; 566 int nelem; 567 size_t sz; 568 569 nelem = PHPOOL_FREELIST_NELEM(idx); 570 snprintf(phpool_names[idx], sizeof(phpool_names[idx]), 571 "phpool-%d", nelem); 572 sz = sizeof(struct pool_item_header); 573 if (nelem) { 574 sz = offsetof(struct pool_item_header, 575 ph_bitmap[howmany(nelem, BITMAP_SIZE)]); 576 } 577 pool_init(&phpool[idx], sz, 0, 0, 0, 578 phpool_names[idx], &pool_allocator_meta, IPL_VM); 579 } 580 581 size = sizeof(pcg_t) + 582 (PCG_NOBJECTS_NORMAL - 1) * sizeof(pcgpair_t); 583 pool_init(&pcg_normal_pool, size, coherency_unit, 0, 0, 584 "pcgnormal", &pool_allocator_meta, IPL_VM); 585 586 size = sizeof(pcg_t) + 587 (PCG_NOBJECTS_LARGE - 1) * sizeof(pcgpair_t); 588 pool_init(&pcg_large_pool, size, coherency_unit, 0, 0, 589 "pcglarge", &pool_allocator_meta, IPL_VM); 590 591 pool_init(&cache_pool, sizeof(struct pool_cache), coherency_unit, 592 0, 0, "pcache", &pool_allocator_meta, IPL_NONE); 593 594 pool_init(&cache_cpu_pool, sizeof(pool_cache_cpu_t), coherency_unit, 595 0, 0, "pcachecpu", &pool_allocator_meta, IPL_NONE); 596 } 597 598 static inline bool 599 pool_init_is_phinpage(const struct pool *pp) 600 { 601 size_t pagesize; 602 603 if (pp->pr_roflags & PR_PHINPAGE) { 604 return true; 605 } 606 if (pp->pr_roflags & (PR_NOTOUCH | PR_NOALIGN)) { 607 return false; 608 } 609 610 pagesize = pp->pr_alloc->pa_pagesz; 611 612 /* 613 * Threshold: the item size is below 1/16 of a page size, and below 614 * 8 times the page header size. The latter ensures we go off-page 615 * if the page header would make us waste a rather big item. 616 */ 617 if (pp->pr_size < MIN(pagesize / 16, PHSIZE * 8)) { 618 return true; 619 } 620 621 /* Put the header into the page if it doesn't waste any items. */ 622 if (pagesize / pp->pr_size == (pagesize - PHSIZE) / pp->pr_size) { 623 return true; 624 } 625 626 return false; 627 } 628 629 static inline bool 630 pool_init_is_usebmap(const struct pool *pp) 631 { 632 size_t bmapsize; 633 634 if (pp->pr_roflags & PR_NOTOUCH) { 635 return true; 636 } 637 638 /* 639 * If we're on-page, and the page header can already contain a bitmap 640 * big enough to cover all the items of the page, go with a bitmap. 641 */ 642 if (!(pp->pr_roflags & PR_PHINPAGE)) { 643 return false; 644 } 645 bmapsize = roundup(PHSIZE, pp->pr_align) - 646 offsetof(struct pool_item_header, ph_bitmap[0]); 647 KASSERT(bmapsize % sizeof(pool_item_bitmap_t) == 0); 648 if (pp->pr_itemsperpage <= bmapsize * CHAR_BIT) { 649 return true; 650 } 651 652 return false; 653 } 654 655 /* 656 * Initialize the given pool resource structure. 657 * 658 * We export this routine to allow other kernel parts to declare 659 * static pools that must be initialized before kmem(9) is available. 660 */ 661 void 662 pool_init(struct pool *pp, size_t size, u_int align, u_int ioff, int flags, 663 const char *wchan, struct pool_allocator *palloc, int ipl) 664 { 665 struct pool *pp1; 666 size_t prsize; 667 int itemspace, slack; 668 669 /* XXX ioff will be removed. */ 670 KASSERT(ioff == 0); 671 672 #ifdef DEBUG 673 if (__predict_true(!cold)) 674 mutex_enter(&pool_head_lock); 675 /* 676 * Check that the pool hasn't already been initialised and 677 * added to the list of all pools. 
678 */ 679 TAILQ_FOREACH(pp1, &pool_head, pr_poollist) { 680 if (pp == pp1) 681 panic("%s: [%s] already initialised", __func__, 682 wchan); 683 } 684 if (__predict_true(!cold)) 685 mutex_exit(&pool_head_lock); 686 #endif 687 688 if (palloc == NULL) 689 palloc = &pool_allocator_kmem; 690 691 if (!cold) 692 mutex_enter(&pool_allocator_lock); 693 if (palloc->pa_refcnt++ == 0) { 694 if (palloc->pa_pagesz == 0) 695 palloc->pa_pagesz = PAGE_SIZE; 696 697 TAILQ_INIT(&palloc->pa_list); 698 699 mutex_init(&palloc->pa_lock, MUTEX_DEFAULT, IPL_VM); 700 palloc->pa_pagemask = ~(palloc->pa_pagesz - 1); 701 palloc->pa_pageshift = ffs(palloc->pa_pagesz) - 1; 702 } 703 if (!cold) 704 mutex_exit(&pool_allocator_lock); 705 706 if (align == 0) 707 align = ALIGN(1); 708 709 prsize = size; 710 if ((flags & PR_NOTOUCH) == 0 && prsize < sizeof(struct pool_item)) 711 prsize = sizeof(struct pool_item); 712 713 prsize = roundup(prsize, align); 714 KASSERTMSG((prsize <= palloc->pa_pagesz), 715 "%s: [%s] pool item size (%zu) larger than page size (%u)", 716 __func__, wchan, prsize, palloc->pa_pagesz); 717 718 /* 719 * Initialize the pool structure. 720 */ 721 LIST_INIT(&pp->pr_emptypages); 722 LIST_INIT(&pp->pr_fullpages); 723 LIST_INIT(&pp->pr_partpages); 724 pp->pr_cache = NULL; 725 pp->pr_curpage = NULL; 726 pp->pr_npages = 0; 727 pp->pr_minitems = 0; 728 pp->pr_minpages = 0; 729 pp->pr_maxpages = UINT_MAX; 730 pp->pr_roflags = flags; 731 pp->pr_flags = 0; 732 pp->pr_size = prsize; 733 pp->pr_reqsize = size; 734 pp->pr_align = align; 735 pp->pr_wchan = wchan; 736 pp->pr_alloc = palloc; 737 pp->pr_poolid = atomic_inc_uint_nv(&poolid_counter); 738 pp->pr_nitems = 0; 739 pp->pr_nout = 0; 740 pp->pr_hardlimit = UINT_MAX; 741 pp->pr_hardlimit_warning = NULL; 742 pp->pr_hardlimit_ratecap.tv_sec = 0; 743 pp->pr_hardlimit_ratecap.tv_usec = 0; 744 pp->pr_hardlimit_warning_last.tv_sec = 0; 745 pp->pr_hardlimit_warning_last.tv_usec = 0; 746 pp->pr_drain_hook = NULL; 747 pp->pr_drain_hook_arg = NULL; 748 pp->pr_freecheck = NULL; 749 pool_redzone_init(pp, size); 750 pool_quarantine_init(pp); 751 752 /* 753 * Decide whether to put the page header off-page to avoid wasting too 754 * large a part of the page or too big an item. Off-page page headers 755 * go on a hash table, so we can match a returned item with its header 756 * based on the page address. 757 */ 758 if (pool_init_is_phinpage(pp)) { 759 /* Use the beginning of the page for the page header */ 760 itemspace = palloc->pa_pagesz - roundup(PHSIZE, align); 761 pp->pr_itemoffset = roundup(PHSIZE, align); 762 pp->pr_roflags |= PR_PHINPAGE; 763 } else { 764 /* The page header will be taken from our page header pool */ 765 itemspace = palloc->pa_pagesz; 766 pp->pr_itemoffset = 0; 767 SPLAY_INIT(&pp->pr_phtree); 768 } 769 770 pp->pr_itemsperpage = itemspace / pp->pr_size; 771 KASSERT(pp->pr_itemsperpage != 0); 772 773 /* 774 * Decide whether to use a bitmap or a linked list to manage freed 775 * items. 776 */ 777 if (pool_init_is_usebmap(pp)) { 778 pp->pr_roflags |= PR_USEBMAP; 779 } 780 781 /* 782 * If we're off-page and use a bitmap, choose the appropriate pool to 783 * allocate page headers, whose size varies depending on the bitmap. If 784 * we're just off-page, take the first pool, no extra size. If we're 785 * on-page, nothing to do. 
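 *
 * For example, with 32-bit bitmap words an off-page bitmap pool with
 * pr_itemsperpage = 100 scans idx 0 (0 items) and idx 1 (64), then
 * settles on idx 2 (128), i.e. phpool-128.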
786 */ 787 if (!(pp->pr_roflags & PR_PHINPAGE) && (pp->pr_roflags & PR_USEBMAP)) { 788 int idx; 789 790 for (idx = 0; pp->pr_itemsperpage > PHPOOL_FREELIST_NELEM(idx); 791 idx++) { 792 /* nothing */ 793 } 794 if (idx >= PHPOOL_MAX) { 795 /* 796 * if you see this panic, consider tweaking 797 * PHPOOL_MAX and PHPOOL_FREELIST_NELEM. 798 */ 799 panic("%s: [%s] too large itemsperpage(%d) for " 800 "PR_USEBMAP", __func__, 801 pp->pr_wchan, pp->pr_itemsperpage); 802 } 803 pp->pr_phpool = &phpool[idx]; 804 } else if (!(pp->pr_roflags & PR_PHINPAGE)) { 805 pp->pr_phpool = &phpool[0]; 806 } else { 807 pp->pr_phpool = NULL; 808 } 809 810 /* 811 * Use the slack between the chunks and the page header 812 * for "cache coloring". 813 */ 814 slack = itemspace - pp->pr_itemsperpage * pp->pr_size; 815 pp->pr_maxcolor = rounddown(slack, align); 816 pp->pr_curcolor = 0; 817 818 pp->pr_nget = 0; 819 pp->pr_nfail = 0; 820 pp->pr_nput = 0; 821 pp->pr_npagealloc = 0; 822 pp->pr_npagefree = 0; 823 pp->pr_hiwat = 0; 824 pp->pr_nidle = 0; 825 pp->pr_refcnt = 0; 826 827 mutex_init(&pp->pr_lock, MUTEX_DEFAULT, ipl); 828 cv_init(&pp->pr_cv, wchan); 829 pp->pr_ipl = ipl; 830 831 /* Insert into the list of all pools. */ 832 if (!cold) 833 mutex_enter(&pool_head_lock); 834 TAILQ_FOREACH(pp1, &pool_head, pr_poollist) { 835 if (strcmp(pp1->pr_wchan, pp->pr_wchan) > 0) 836 break; 837 } 838 if (pp1 == NULL) 839 TAILQ_INSERT_TAIL(&pool_head, pp, pr_poollist); 840 else 841 TAILQ_INSERT_BEFORE(pp1, pp, pr_poollist); 842 if (!cold) 843 mutex_exit(&pool_head_lock); 844 845 /* Insert this into the list of pools using this allocator. */ 846 if (!cold) 847 mutex_enter(&palloc->pa_lock); 848 TAILQ_INSERT_TAIL(&palloc->pa_list, pp, pr_alloc_list); 849 if (!cold) 850 mutex_exit(&palloc->pa_lock); 851 } 852 853 /* 854 * De-commission a pool resource. 855 */ 856 void 857 pool_destroy(struct pool *pp) 858 { 859 struct pool_pagelist pq; 860 struct pool_item_header *ph; 861 862 pool_quarantine_flush(pp); 863 864 /* Remove from global pool list */ 865 mutex_enter(&pool_head_lock); 866 while (pp->pr_refcnt != 0) 867 cv_wait(&pool_busy, &pool_head_lock); 868 TAILQ_REMOVE(&pool_head, pp, pr_poollist); 869 if (drainpp == pp) 870 drainpp = NULL; 871 mutex_exit(&pool_head_lock); 872 873 /* Remove this pool from its allocator's list of pools. 
*/ 874 mutex_enter(&pp->pr_alloc->pa_lock); 875 TAILQ_REMOVE(&pp->pr_alloc->pa_list, pp, pr_alloc_list); 876 mutex_exit(&pp->pr_alloc->pa_lock); 877 878 mutex_enter(&pool_allocator_lock); 879 if (--pp->pr_alloc->pa_refcnt == 0) 880 mutex_destroy(&pp->pr_alloc->pa_lock); 881 mutex_exit(&pool_allocator_lock); 882 883 mutex_enter(&pp->pr_lock); 884 885 KASSERT(pp->pr_cache == NULL); 886 KASSERTMSG((pp->pr_nout == 0), 887 "%s: pool busy: still out: %u", __func__, pp->pr_nout); 888 KASSERT(LIST_EMPTY(&pp->pr_fullpages)); 889 KASSERT(LIST_EMPTY(&pp->pr_partpages)); 890 891 /* Remove all pages */ 892 LIST_INIT(&pq); 893 while ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL) 894 pr_rmpage(pp, ph, &pq); 895 896 mutex_exit(&pp->pr_lock); 897 898 pr_pagelist_free(pp, &pq); 899 cv_destroy(&pp->pr_cv); 900 mutex_destroy(&pp->pr_lock); 901 } 902 903 void 904 pool_set_drain_hook(struct pool *pp, void (*fn)(void *, int), void *arg) 905 { 906 907 /* XXX no locking -- must be used just after pool_init() */ 908 KASSERTMSG((pp->pr_drain_hook == NULL), 909 "%s: [%s] already set", __func__, pp->pr_wchan); 910 pp->pr_drain_hook = fn; 911 pp->pr_drain_hook_arg = arg; 912 } 913 914 static struct pool_item_header * 915 pool_alloc_item_header(struct pool *pp, void *storage, int flags) 916 { 917 struct pool_item_header *ph; 918 919 if ((pp->pr_roflags & PR_PHINPAGE) != 0) 920 ph = storage; 921 else 922 ph = pool_get(pp->pr_phpool, flags); 923 924 return ph; 925 } 926 927 /* 928 * Grab an item from the pool. 929 */ 930 void * 931 pool_get(struct pool *pp, int flags) 932 { 933 struct pool_item_header *ph; 934 void *v; 935 936 KASSERT(!(flags & PR_NOWAIT) != !(flags & PR_WAITOK)); 937 KASSERTMSG((pp->pr_itemsperpage != 0), 938 "%s: [%s] pr_itemsperpage is zero, " 939 "pool not initialized?", __func__, pp->pr_wchan); 940 KASSERTMSG((!(cpu_intr_p() || cpu_softintr_p()) 941 || pp->pr_ipl != IPL_NONE || cold || panicstr != NULL), 942 "%s: [%s] is IPL_NONE, but called from interrupt context", 943 __func__, pp->pr_wchan); 944 if (flags & PR_WAITOK) { 945 ASSERT_SLEEPABLE(); 946 } 947 948 mutex_enter(&pp->pr_lock); 949 startover: 950 /* 951 * Check to see if we've reached the hard limit. If we have, 952 * and we can wait, then wait until an item has been returned to 953 * the pool. 954 */ 955 KASSERTMSG((pp->pr_nout <= pp->pr_hardlimit), 956 "%s: %s: crossed hard limit", __func__, pp->pr_wchan); 957 if (__predict_false(pp->pr_nout == pp->pr_hardlimit)) { 958 if (pp->pr_drain_hook != NULL) { 959 /* 960 * Since the drain hook is going to free things 961 * back to the pool, unlock, call the hook, re-lock, 962 * and check the hardlimit condition again. 963 */ 964 mutex_exit(&pp->pr_lock); 965 (*pp->pr_drain_hook)(pp->pr_drain_hook_arg, flags); 966 mutex_enter(&pp->pr_lock); 967 if (pp->pr_nout < pp->pr_hardlimit) 968 goto startover; 969 } 970 971 if ((flags & PR_WAITOK) && !(flags & PR_LIMITFAIL)) { 972 /* 973 * XXX: A warning isn't logged in this case. Should 974 * it be? 975 */ 976 pp->pr_flags |= PR_WANTED; 977 do { 978 cv_wait(&pp->pr_cv, &pp->pr_lock); 979 } while (pp->pr_flags & PR_WANTED); 980 goto startover; 981 } 982 983 /* 984 * Log a message that the hard limit has been hit. 
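 *
 * The message and the rate limit are whatever the pool's owner
 * registered with pool_sethardlimit().  A sketch, with a hypothetical
 * pool and numbers:
 *
 *	pool_sethardlimit(&foo_pool, 1024,
 *	    "WARNING: foo_pool limit reached", 60);
 *
 * caps the pool at 1024 outstanding items and logs the warning at most
 * once every 60 seconds.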
985 */ 986 if (pp->pr_hardlimit_warning != NULL && 987 ratecheck(&pp->pr_hardlimit_warning_last, 988 &pp->pr_hardlimit_ratecap)) 989 log(LOG_ERR, "%s\n", pp->pr_hardlimit_warning); 990 991 pp->pr_nfail++; 992 993 mutex_exit(&pp->pr_lock); 994 KASSERT((flags & (PR_NOWAIT|PR_LIMITFAIL)) != 0); 995 return NULL; 996 } 997 998 /* 999 * The convention we use is that if `curpage' is not NULL, then 1000 * it points at a non-empty bucket. In particular, `curpage' 1001 * never points at a page header which has PR_PHINPAGE set and 1002 * has no items in its bucket. 1003 */ 1004 if ((ph = pp->pr_curpage) == NULL) { 1005 int error; 1006 1007 KASSERTMSG((pp->pr_nitems == 0), 1008 "%s: [%s] curpage NULL, inconsistent nitems %u", 1009 __func__, pp->pr_wchan, pp->pr_nitems); 1010 1011 /* 1012 * Call the back-end page allocator for more memory. 1013 * Release the pool lock, as the back-end page allocator 1014 * may block. 1015 */ 1016 error = pool_grow(pp, flags); 1017 if (error != 0) { 1018 /* 1019 * pool_grow aborts when another thread 1020 * is allocating a new page. Retry if it 1021 * waited for it. 1022 */ 1023 if (error == ERESTART) 1024 goto startover; 1025 1026 /* 1027 * We were unable to allocate a page or item 1028 * header, but we released the lock during 1029 * allocation, so perhaps items were freed 1030 * back to the pool. Check for this case. 1031 */ 1032 if (pp->pr_curpage != NULL) 1033 goto startover; 1034 1035 pp->pr_nfail++; 1036 mutex_exit(&pp->pr_lock); 1037 KASSERT((flags & (PR_WAITOK|PR_NOWAIT)) == PR_NOWAIT); 1038 return NULL; 1039 } 1040 1041 /* Start the allocation process over. */ 1042 goto startover; 1043 } 1044 if (pp->pr_roflags & PR_USEBMAP) { 1045 KASSERTMSG((ph->ph_nmissing < pp->pr_itemsperpage), 1046 "%s: %s: page empty", __func__, pp->pr_wchan); 1047 v = pr_item_bitmap_get(pp, ph); 1048 } else { 1049 v = pr_item_linkedlist_get(pp, ph); 1050 } 1051 pp->pr_nitems--; 1052 pp->pr_nout++; 1053 if (ph->ph_nmissing == 0) { 1054 KASSERT(pp->pr_nidle > 0); 1055 pp->pr_nidle--; 1056 1057 /* 1058 * This page was previously empty. Move it to the list of 1059 * partially-full pages. This page is already curpage. 1060 */ 1061 LIST_REMOVE(ph, ph_pagelist); 1062 LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist); 1063 } 1064 ph->ph_nmissing++; 1065 if (ph->ph_nmissing == pp->pr_itemsperpage) { 1066 KASSERTMSG(((pp->pr_roflags & PR_USEBMAP) || 1067 LIST_EMPTY(&ph->ph_itemlist)), 1068 "%s: [%s] nmissing (%u) inconsistent", __func__, 1069 pp->pr_wchan, ph->ph_nmissing); 1070 /* 1071 * This page is now full. Move it to the full list 1072 * and select a new current page. 1073 */ 1074 LIST_REMOVE(ph, ph_pagelist); 1075 LIST_INSERT_HEAD(&pp->pr_fullpages, ph, ph_pagelist); 1076 pool_update_curpage(pp); 1077 } 1078 1079 pp->pr_nget++; 1080 1081 /* 1082 * If we have a low water mark and we are now below that low 1083 * water mark, add more items to the pool. 1084 */ 1085 if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) { 1086 /* 1087 * XXX: Should we log a warning? Should we set up a timeout 1088 * to try again in a second or so? The latter could break 1089 * a caller's assumptions about interrupt protection, etc. 1090 */ 1091 } 1092 1093 mutex_exit(&pp->pr_lock); 1094 KASSERT((((vaddr_t)v) & (pp->pr_align - 1)) == 0); 1095 FREECHECK_OUT(&pp->pr_freecheck, v); 1096 pool_redzone_fill(pp, v); 1097 if (flags & PR_ZERO) 1098 memset(v, 0, pp->pr_reqsize); 1099 else 1100 pool_kleak_fill(pp, v); 1101 return v; 1102 } 1103 1104 /* 1105 * Internal version of pool_put(). 
Pool is already locked/entered. 1106 */ 1107 static void 1108 pool_do_put(struct pool *pp, void *v, struct pool_pagelist *pq) 1109 { 1110 struct pool_item_header *ph; 1111 1112 KASSERT(mutex_owned(&pp->pr_lock)); 1113 pool_redzone_check(pp, v); 1114 FREECHECK_IN(&pp->pr_freecheck, v); 1115 LOCKDEBUG_MEM_CHECK(v, pp->pr_size); 1116 1117 KASSERTMSG((pp->pr_nout > 0), 1118 "%s: [%s] putting with none out", __func__, pp->pr_wchan); 1119 1120 if (__predict_false((ph = pr_find_pagehead(pp, v)) == NULL)) { 1121 panic("%s: [%s] page header missing", __func__, pp->pr_wchan); 1122 } 1123 1124 /* 1125 * Return to item list. 1126 */ 1127 if (pp->pr_roflags & PR_USEBMAP) { 1128 pr_item_bitmap_put(pp, ph, v); 1129 } else { 1130 pr_item_linkedlist_put(pp, ph, v); 1131 } 1132 KDASSERT(ph->ph_nmissing != 0); 1133 ph->ph_nmissing--; 1134 pp->pr_nput++; 1135 pp->pr_nitems++; 1136 pp->pr_nout--; 1137 1138 /* Cancel "pool empty" condition if it exists */ 1139 if (pp->pr_curpage == NULL) 1140 pp->pr_curpage = ph; 1141 1142 if (pp->pr_flags & PR_WANTED) { 1143 pp->pr_flags &= ~PR_WANTED; 1144 cv_broadcast(&pp->pr_cv); 1145 } 1146 1147 /* 1148 * If this page is now empty, do one of two things: 1149 * 1150 * (1) If we have more pages than the page high water mark, 1151 * free the page back to the system. ONLY CONSIDER 1152 * FREEING BACK A PAGE IF WE HAVE MORE THAN OUR MINIMUM PAGE 1153 * CLAIM. 1154 * 1155 * (2) Otherwise, move the page to the empty page list. 1156 * 1157 * Either way, select a new current page (so we use a partially-full 1158 * page if one is available). 1159 */ 1160 if (ph->ph_nmissing == 0) { 1161 pp->pr_nidle++; 1162 if (pp->pr_npages > pp->pr_minpages && 1163 pp->pr_npages > pp->pr_maxpages) { 1164 pr_rmpage(pp, ph, pq); 1165 } else { 1166 LIST_REMOVE(ph, ph_pagelist); 1167 LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist); 1168 1169 /* 1170 * Update the timestamp on the page. A page must 1171 * be idle for some period of time before it can 1172 * be reclaimed by the pagedaemon. This minimizes 1173 * ping-pong'ing for memory. 1174 * 1175 * note for 64-bit time_t: truncating to 32-bit is not 1176 * a problem for our usage. 1177 */ 1178 ph->ph_time = time_uptime; 1179 } 1180 pool_update_curpage(pp); 1181 } 1182 1183 /* 1184 * If the page was previously completely full, move it to the 1185 * partially-full list and make it the current page. The next 1186 * allocation will get the item from this page, instead of 1187 * further fragmenting the pool. 1188 */ 1189 else if (ph->ph_nmissing == (pp->pr_itemsperpage - 1)) { 1190 LIST_REMOVE(ph, ph_pagelist); 1191 LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist); 1192 pp->pr_curpage = ph; 1193 } 1194 } 1195 1196 void 1197 pool_put(struct pool *pp, void *v) 1198 { 1199 struct pool_pagelist pq; 1200 1201 LIST_INIT(&pq); 1202 1203 mutex_enter(&pp->pr_lock); 1204 if (!pool_put_quarantine(pp, v, &pq)) { 1205 pool_do_put(pp, v, &pq); 1206 } 1207 mutex_exit(&pp->pr_lock); 1208 1209 pr_pagelist_free(pp, &pq); 1210 } 1211 1212 /* 1213 * pool_grow: grow a pool by a page. 1214 * 1215 * => called with pool locked. 1216 * => unlock and relock the pool. 1217 * => return with pool locked. 1218 */ 1219 1220 static int 1221 pool_grow(struct pool *pp, int flags) 1222 { 1223 struct pool_item_header *ph; 1224 char *storage; 1225 1226 /* 1227 * If there's a pool_grow in progress, wait for it to complete 1228 * and try again from the top. 
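 *
 * ERESTART here simply means "lost the race, try again"; the callers
 * (pool_get(), pool_prime() and pool_catchup()) retry on it.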
1229 */ 1230 if (pp->pr_flags & PR_GROWING) { 1231 if (flags & PR_WAITOK) { 1232 do { 1233 cv_wait(&pp->pr_cv, &pp->pr_lock); 1234 } while (pp->pr_flags & PR_GROWING); 1235 return ERESTART; 1236 } else { 1237 if (pp->pr_flags & PR_GROWINGNOWAIT) { 1238 /* 1239 * This needs an unlock/relock dance so 1240 * that the other caller has a chance to 1241 * run and actually do the thing. Note 1242 * that this is effectively a busy-wait. 1243 */ 1244 mutex_exit(&pp->pr_lock); 1245 mutex_enter(&pp->pr_lock); 1246 return ERESTART; 1247 } 1248 return EWOULDBLOCK; 1249 } 1250 } 1251 pp->pr_flags |= PR_GROWING; 1252 if (flags & PR_WAITOK) 1253 mutex_exit(&pp->pr_lock); 1254 else 1255 pp->pr_flags |= PR_GROWINGNOWAIT; 1256 1257 storage = pool_allocator_alloc(pp, flags); 1258 if (__predict_false(storage == NULL)) 1259 goto out; 1260 1261 ph = pool_alloc_item_header(pp, storage, flags); 1262 if (__predict_false(ph == NULL)) { 1263 pool_allocator_free(pp, storage); 1264 goto out; 1265 } 1266 1267 if (flags & PR_WAITOK) 1268 mutex_enter(&pp->pr_lock); 1269 pool_prime_page(pp, storage, ph); 1270 pp->pr_npagealloc++; 1271 KASSERT(pp->pr_flags & PR_GROWING); 1272 pp->pr_flags &= ~(PR_GROWING|PR_GROWINGNOWAIT); 1273 /* 1274 * If anyone was waiting for pool_grow, notify them that we 1275 * may have just done it. 1276 */ 1277 cv_broadcast(&pp->pr_cv); 1278 return 0; 1279 out: 1280 if (flags & PR_WAITOK) 1281 mutex_enter(&pp->pr_lock); 1282 KASSERT(pp->pr_flags & PR_GROWING); 1283 pp->pr_flags &= ~(PR_GROWING|PR_GROWINGNOWAIT); 1284 return ENOMEM; 1285 } 1286 1287 /* 1288 * Add N items to the pool. 1289 */ 1290 int 1291 pool_prime(struct pool *pp, int n) 1292 { 1293 int newpages; 1294 int error = 0; 1295 1296 mutex_enter(&pp->pr_lock); 1297 1298 newpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage; 1299 1300 while (newpages > 0) { 1301 error = pool_grow(pp, PR_NOWAIT); 1302 if (error) { 1303 if (error == ERESTART) 1304 continue; 1305 break; 1306 } 1307 pp->pr_minpages++; 1308 newpages--; 1309 } 1310 1311 if (pp->pr_minpages >= pp->pr_maxpages) 1312 pp->pr_maxpages = pp->pr_minpages + 1; /* XXX */ 1313 1314 mutex_exit(&pp->pr_lock); 1315 return error; 1316 } 1317 1318 /* 1319 * Add a page worth of items to the pool. 1320 * 1321 * Note, we must be called with the pool descriptor LOCKED. 1322 */ 1323 static void 1324 pool_prime_page(struct pool *pp, void *storage, struct pool_item_header *ph) 1325 { 1326 const unsigned int align = pp->pr_align; 1327 struct pool_item *pi; 1328 void *cp = storage; 1329 int n; 1330 1331 KASSERT(mutex_owned(&pp->pr_lock)); 1332 KASSERTMSG(((pp->pr_roflags & PR_NOALIGN) || 1333 (((uintptr_t)cp & (pp->pr_alloc->pa_pagesz - 1)) == 0)), 1334 "%s: [%s] unaligned page: %p", __func__, pp->pr_wchan, cp); 1335 1336 /* 1337 * Insert page header. 1338 */ 1339 LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist); 1340 LIST_INIT(&ph->ph_itemlist); 1341 ph->ph_page = storage; 1342 ph->ph_nmissing = 0; 1343 ph->ph_time = time_uptime; 1344 if (pp->pr_roflags & PR_PHINPAGE) 1345 ph->ph_poolid = pp->pr_poolid; 1346 else 1347 SPLAY_INSERT(phtree, &pp->pr_phtree, ph); 1348 1349 pp->pr_nidle++; 1350 1351 /* 1352 * The item space starts after the on-page header, if any. 1353 */ 1354 ph->ph_off = pp->pr_itemoffset; 1355 1356 /* 1357 * Color this page. 
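 *
 * The item area of successive pages is shifted by pr_curcolor, which
 * advances in pr_align steps and wraps after pr_maxcolor.  With a
 * hypothetical alignment of 32 and 96 bytes of slack the offsets cycle
 * 0, 32, 64, 96, 0, ... so equal item indexes on different pages do not
 * all compete for the same cache lines.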
1358 */ 1359 ph->ph_off += pp->pr_curcolor; 1360 cp = (char *)cp + ph->ph_off; 1361 if ((pp->pr_curcolor += align) > pp->pr_maxcolor) 1362 pp->pr_curcolor = 0; 1363 1364 KASSERT((((vaddr_t)cp) & (align - 1)) == 0); 1365 1366 /* 1367 * Insert remaining chunks on the bucket list. 1368 */ 1369 n = pp->pr_itemsperpage; 1370 pp->pr_nitems += n; 1371 1372 if (pp->pr_roflags & PR_USEBMAP) { 1373 pr_item_bitmap_init(pp, ph); 1374 } else { 1375 while (n--) { 1376 pi = (struct pool_item *)cp; 1377 1378 KASSERT((((vaddr_t)pi) & (align - 1)) == 0); 1379 1380 /* Insert on page list */ 1381 LIST_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list); 1382 #ifdef POOL_CHECK_MAGIC 1383 pi->pi_magic = PI_MAGIC; 1384 #endif 1385 cp = (char *)cp + pp->pr_size; 1386 1387 KASSERT((((vaddr_t)cp) & (align - 1)) == 0); 1388 } 1389 } 1390 1391 /* 1392 * If the pool was depleted, point at the new page. 1393 */ 1394 if (pp->pr_curpage == NULL) 1395 pp->pr_curpage = ph; 1396 1397 if (++pp->pr_npages > pp->pr_hiwat) 1398 pp->pr_hiwat = pp->pr_npages; 1399 } 1400 1401 /* 1402 * Used by pool_get() when nitems drops below the low water mark. This 1403 * is used to catch up pr_nitems with the low water mark. 1404 * 1405 * Note 1, we never wait for memory here, we let the caller decide what to do. 1406 * 1407 * Note 2, we must be called with the pool already locked, and we return 1408 * with it locked. 1409 */ 1410 static int 1411 pool_catchup(struct pool *pp) 1412 { 1413 int error = 0; 1414 1415 while (POOL_NEEDS_CATCHUP(pp)) { 1416 error = pool_grow(pp, PR_NOWAIT); 1417 if (error) { 1418 if (error == ERESTART) 1419 continue; 1420 break; 1421 } 1422 } 1423 return error; 1424 } 1425 1426 static void 1427 pool_update_curpage(struct pool *pp) 1428 { 1429 1430 pp->pr_curpage = LIST_FIRST(&pp->pr_partpages); 1431 if (pp->pr_curpage == NULL) { 1432 pp->pr_curpage = LIST_FIRST(&pp->pr_emptypages); 1433 } 1434 KASSERT((pp->pr_curpage == NULL && pp->pr_nitems == 0) || 1435 (pp->pr_curpage != NULL && pp->pr_nitems > 0)); 1436 } 1437 1438 void 1439 pool_setlowat(struct pool *pp, int n) 1440 { 1441 1442 mutex_enter(&pp->pr_lock); 1443 1444 pp->pr_minitems = n; 1445 pp->pr_minpages = (n == 0) 1446 ? 0 1447 : roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage; 1448 1449 /* Make sure we're caught up with the newly-set low water mark. */ 1450 if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) { 1451 /* 1452 * XXX: Should we log a warning? Should we set up a timeout 1453 * to try again in a second or so? The latter could break 1454 * a caller's assumptions about interrupt protection, etc. 1455 */ 1456 } 1457 1458 mutex_exit(&pp->pr_lock); 1459 } 1460 1461 void 1462 pool_sethiwat(struct pool *pp, int n) 1463 { 1464 1465 mutex_enter(&pp->pr_lock); 1466 1467 pp->pr_maxpages = (n == 0) 1468 ? 0 1469 : roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage; 1470 1471 mutex_exit(&pp->pr_lock); 1472 } 1473 1474 void 1475 pool_sethardlimit(struct pool *pp, int n, const char *warnmess, int ratecap) 1476 { 1477 1478 mutex_enter(&pp->pr_lock); 1479 1480 pp->pr_hardlimit = n; 1481 pp->pr_hardlimit_warning = warnmess; 1482 pp->pr_hardlimit_ratecap.tv_sec = ratecap; 1483 pp->pr_hardlimit_warning_last.tv_sec = 0; 1484 pp->pr_hardlimit_warning_last.tv_usec = 0; 1485 1486 /* 1487 * In-line version of pool_sethiwat(), because we don't want to 1488 * release the lock. 1489 */ 1490 pp->pr_maxpages = (n == 0) 1491 ? 
0 1492 : roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage; 1493 1494 mutex_exit(&pp->pr_lock); 1495 } 1496 1497 /* 1498 * Release all complete pages that have not been used recently. 1499 * 1500 * Must not be called from interrupt context. 1501 */ 1502 int 1503 pool_reclaim(struct pool *pp) 1504 { 1505 struct pool_item_header *ph, *phnext; 1506 struct pool_pagelist pq; 1507 uint32_t curtime; 1508 bool klock; 1509 int rv; 1510 1511 KASSERT(!cpu_intr_p() && !cpu_softintr_p()); 1512 1513 if (pp->pr_drain_hook != NULL) { 1514 /* 1515 * The drain hook must be called with the pool unlocked. 1516 */ 1517 (*pp->pr_drain_hook)(pp->pr_drain_hook_arg, PR_NOWAIT); 1518 } 1519 1520 /* 1521 * XXXSMP Because we do not want to cause non-MPSAFE code 1522 * to block. 1523 */ 1524 if (pp->pr_ipl == IPL_SOFTNET || pp->pr_ipl == IPL_SOFTCLOCK || 1525 pp->pr_ipl == IPL_SOFTSERIAL) { 1526 KERNEL_LOCK(1, NULL); 1527 klock = true; 1528 } else 1529 klock = false; 1530 1531 /* Reclaim items from the pool's cache (if any). */ 1532 if (pp->pr_cache != NULL) 1533 pool_cache_invalidate(pp->pr_cache); 1534 1535 if (mutex_tryenter(&pp->pr_lock) == 0) { 1536 if (klock) { 1537 KERNEL_UNLOCK_ONE(NULL); 1538 } 1539 return 0; 1540 } 1541 1542 LIST_INIT(&pq); 1543 1544 curtime = time_uptime; 1545 1546 for (ph = LIST_FIRST(&pp->pr_emptypages); ph != NULL; ph = phnext) { 1547 phnext = LIST_NEXT(ph, ph_pagelist); 1548 1549 /* Check our minimum page claim */ 1550 if (pp->pr_npages <= pp->pr_minpages) 1551 break; 1552 1553 KASSERT(ph->ph_nmissing == 0); 1554 if (curtime - ph->ph_time < pool_inactive_time) 1555 continue; 1556 1557 /* 1558 * If freeing this page would put us below 1559 * the low water mark, stop now. 1560 */ 1561 if ((pp->pr_nitems - pp->pr_itemsperpage) < 1562 pp->pr_minitems) 1563 break; 1564 1565 pr_rmpage(pp, ph, &pq); 1566 } 1567 1568 mutex_exit(&pp->pr_lock); 1569 1570 if (LIST_EMPTY(&pq)) 1571 rv = 0; 1572 else { 1573 pr_pagelist_free(pp, &pq); 1574 rv = 1; 1575 } 1576 1577 if (klock) { 1578 KERNEL_UNLOCK_ONE(NULL); 1579 } 1580 1581 return rv; 1582 } 1583 1584 /* 1585 * Drain pools, one at a time. The drained pool is returned within ppp. 1586 * 1587 * Note, must never be called from interrupt context. 1588 */ 1589 bool 1590 pool_drain(struct pool **ppp) 1591 { 1592 bool reclaimed; 1593 struct pool *pp; 1594 1595 KASSERT(!TAILQ_EMPTY(&pool_head)); 1596 1597 pp = NULL; 1598 1599 /* Find next pool to drain, and add a reference. */ 1600 mutex_enter(&pool_head_lock); 1601 do { 1602 if (drainpp == NULL) { 1603 drainpp = TAILQ_FIRST(&pool_head); 1604 } 1605 if (drainpp != NULL) { 1606 pp = drainpp; 1607 drainpp = TAILQ_NEXT(pp, pr_poollist); 1608 } 1609 /* 1610 * Skip completely idle pools. We depend on at least 1611 * one pool in the system being active. 1612 */ 1613 } while (pp == NULL || pp->pr_npages == 0); 1614 pp->pr_refcnt++; 1615 mutex_exit(&pool_head_lock); 1616 1617 /* Drain the cache (if any) and pool.. */ 1618 reclaimed = pool_reclaim(pp); 1619 1620 /* Finally, unlock the pool. */ 1621 mutex_enter(&pool_head_lock); 1622 pp->pr_refcnt--; 1623 cv_broadcast(&pool_busy); 1624 mutex_exit(&pool_head_lock); 1625 1626 if (ppp != NULL) 1627 *ppp = pp; 1628 1629 return reclaimed; 1630 } 1631 1632 /* 1633 * Calculate the total number of pages consumed by pools. 
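 *
 * Each pool contributes pr_npages * pa_pagesz bytes, minus the bytes of
 * its outstanding items if it is flagged PR_RECURSIVE; the sum is
 * converted to a page count with atop().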
1634 */ 1635 int 1636 pool_totalpages(void) 1637 { 1638 1639 mutex_enter(&pool_head_lock); 1640 int pages = pool_totalpages_locked(); 1641 mutex_exit(&pool_head_lock); 1642 1643 return pages; 1644 } 1645 1646 int 1647 pool_totalpages_locked(void) 1648 { 1649 struct pool *pp; 1650 uint64_t total = 0; 1651 1652 TAILQ_FOREACH(pp, &pool_head, pr_poollist) { 1653 uint64_t bytes = pp->pr_npages * pp->pr_alloc->pa_pagesz; 1654 1655 if ((pp->pr_roflags & PR_RECURSIVE) != 0) 1656 bytes -= (pp->pr_nout * pp->pr_size); 1657 total += bytes; 1658 } 1659 1660 return atop(total); 1661 } 1662 1663 /* 1664 * Diagnostic helpers. 1665 */ 1666 1667 void 1668 pool_printall(const char *modif, void (*pr)(const char *, ...)) 1669 { 1670 struct pool *pp; 1671 1672 TAILQ_FOREACH(pp, &pool_head, pr_poollist) { 1673 pool_printit(pp, modif, pr); 1674 } 1675 } 1676 1677 void 1678 pool_printit(struct pool *pp, const char *modif, void (*pr)(const char *, ...)) 1679 { 1680 1681 if (pp == NULL) { 1682 (*pr)("Must specify a pool to print.\n"); 1683 return; 1684 } 1685 1686 pool_print1(pp, modif, pr); 1687 } 1688 1689 static void 1690 pool_print_pagelist(struct pool *pp, struct pool_pagelist *pl, 1691 void (*pr)(const char *, ...)) 1692 { 1693 struct pool_item_header *ph; 1694 1695 LIST_FOREACH(ph, pl, ph_pagelist) { 1696 (*pr)("\t\tpage %p, nmissing %d, time %" PRIu32 "\n", 1697 ph->ph_page, ph->ph_nmissing, ph->ph_time); 1698 #ifdef POOL_CHECK_MAGIC 1699 struct pool_item *pi; 1700 if (!(pp->pr_roflags & PR_USEBMAP)) { 1701 LIST_FOREACH(pi, &ph->ph_itemlist, pi_list) { 1702 if (pi->pi_magic != PI_MAGIC) { 1703 (*pr)("\t\t\titem %p, magic 0x%x\n", 1704 pi, pi->pi_magic); 1705 } 1706 } 1707 } 1708 #endif 1709 } 1710 } 1711 1712 static void 1713 pool_print1(struct pool *pp, const char *modif, void (*pr)(const char *, ...)) 1714 { 1715 struct pool_item_header *ph; 1716 pool_cache_t pc; 1717 pcg_t *pcg; 1718 pool_cache_cpu_t *cc; 1719 uint64_t cpuhit, cpumiss; 1720 int i, print_log = 0, print_pagelist = 0, print_cache = 0; 1721 char c; 1722 1723 while ((c = *modif++) != '\0') { 1724 if (c == 'l') 1725 print_log = 1; 1726 if (c == 'p') 1727 print_pagelist = 1; 1728 if (c == 'c') 1729 print_cache = 1; 1730 } 1731 1732 if ((pc = pp->pr_cache) != NULL) { 1733 (*pr)("POOL CACHE"); 1734 } else { 1735 (*pr)("POOL"); 1736 } 1737 1738 (*pr)(" %s: size %u, align %u, ioff %u, roflags 0x%08x\n", 1739 pp->pr_wchan, pp->pr_size, pp->pr_align, pp->pr_itemoffset, 1740 pp->pr_roflags); 1741 (*pr)("\talloc %p\n", pp->pr_alloc); 1742 (*pr)("\tminitems %u, minpages %u, maxpages %u, npages %u\n", 1743 pp->pr_minitems, pp->pr_minpages, pp->pr_maxpages, pp->pr_npages); 1744 (*pr)("\titemsperpage %u, nitems %u, nout %u, hardlimit %u\n", 1745 pp->pr_itemsperpage, pp->pr_nitems, pp->pr_nout, pp->pr_hardlimit); 1746 1747 (*pr)("\tnget %lu, nfail %lu, nput %lu\n", 1748 pp->pr_nget, pp->pr_nfail, pp->pr_nput); 1749 (*pr)("\tnpagealloc %lu, npagefree %lu, hiwat %u, nidle %lu\n", 1750 pp->pr_npagealloc, pp->pr_npagefree, pp->pr_hiwat, pp->pr_nidle); 1751 1752 if (print_pagelist == 0) 1753 goto skip_pagelist; 1754 1755 if ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL) 1756 (*pr)("\n\tempty page list:\n"); 1757 pool_print_pagelist(pp, &pp->pr_emptypages, pr); 1758 if ((ph = LIST_FIRST(&pp->pr_fullpages)) != NULL) 1759 (*pr)("\n\tfull page list:\n"); 1760 pool_print_pagelist(pp, &pp->pr_fullpages, pr); 1761 if ((ph = LIST_FIRST(&pp->pr_partpages)) != NULL) 1762 (*pr)("\n\tpartial-page list:\n"); 1763 pool_print_pagelist(pp, &pp->pr_partpages, pr); 1764 1765 if 
(pp->pr_curpage == NULL) 1766 (*pr)("\tno current page\n"); 1767 else 1768 (*pr)("\tcurpage %p\n", pp->pr_curpage->ph_page); 1769 1770 skip_pagelist: 1771 if (print_log == 0) 1772 goto skip_log; 1773 1774 (*pr)("\n"); 1775 1776 skip_log: 1777 1778 #define PR_GROUPLIST(pcg) \ 1779 (*pr)("\t\tgroup %p: avail %d\n", pcg, pcg->pcg_avail); \ 1780 for (i = 0; i < pcg->pcg_size; i++) { \ 1781 if (pcg->pcg_objects[i].pcgo_pa != \ 1782 POOL_PADDR_INVALID) { \ 1783 (*pr)("\t\t\t%p, 0x%llx\n", \ 1784 pcg->pcg_objects[i].pcgo_va, \ 1785 (unsigned long long) \ 1786 pcg->pcg_objects[i].pcgo_pa); \ 1787 } else { \ 1788 (*pr)("\t\t\t%p\n", \ 1789 pcg->pcg_objects[i].pcgo_va); \ 1790 } \ 1791 } 1792 1793 if (pc != NULL) { 1794 cpuhit = 0; 1795 cpumiss = 0; 1796 for (i = 0; i < __arraycount(pc->pc_cpus); i++) { 1797 if ((cc = pc->pc_cpus[i]) == NULL) 1798 continue; 1799 cpuhit += cc->cc_hits; 1800 cpumiss += cc->cc_misses; 1801 } 1802 (*pr)("\tcpu layer hits %llu misses %llu\n", cpuhit, cpumiss); 1803 (*pr)("\tcache layer hits %llu misses %llu\n", 1804 pc->pc_hits, pc->pc_misses); 1805 (*pr)("\tcache layer entry uncontended %llu contended %llu\n", 1806 pc->pc_hits + pc->pc_misses - pc->pc_contended, 1807 pc->pc_contended); 1808 (*pr)("\tcache layer empty groups %u full groups %u\n", 1809 pc->pc_nempty, pc->pc_nfull); 1810 if (print_cache) { 1811 (*pr)("\tfull cache groups:\n"); 1812 for (pcg = pc->pc_fullgroups; pcg != NULL; 1813 pcg = pcg->pcg_next) { 1814 PR_GROUPLIST(pcg); 1815 } 1816 (*pr)("\tempty cache groups:\n"); 1817 for (pcg = pc->pc_emptygroups; pcg != NULL; 1818 pcg = pcg->pcg_next) { 1819 PR_GROUPLIST(pcg); 1820 } 1821 } 1822 } 1823 #undef PR_GROUPLIST 1824 } 1825 1826 static int 1827 pool_chk_page(struct pool *pp, const char *label, struct pool_item_header *ph) 1828 { 1829 struct pool_item *pi; 1830 void *page; 1831 int n; 1832 1833 if ((pp->pr_roflags & PR_NOALIGN) == 0) { 1834 page = (void *)((uintptr_t)ph & pp->pr_alloc->pa_pagemask); 1835 if (page != ph->ph_page && 1836 (pp->pr_roflags & PR_PHINPAGE) != 0) { 1837 if (label != NULL) 1838 printf("%s: ", label); 1839 printf("pool(%p:%s): page inconsistency: page %p;" 1840 " at page head addr %p (p %p)\n", pp, 1841 pp->pr_wchan, ph->ph_page, 1842 ph, page); 1843 return 1; 1844 } 1845 } 1846 1847 if ((pp->pr_roflags & PR_USEBMAP) != 0) 1848 return 0; 1849 1850 for (pi = LIST_FIRST(&ph->ph_itemlist), n = 0; 1851 pi != NULL; 1852 pi = LIST_NEXT(pi,pi_list), n++) { 1853 1854 #ifdef POOL_CHECK_MAGIC 1855 if (pi->pi_magic != PI_MAGIC) { 1856 if (label != NULL) 1857 printf("%s: ", label); 1858 printf("pool(%s): free list modified: magic=%x;" 1859 " page %p; item ordinal %d; addr %p\n", 1860 pp->pr_wchan, pi->pi_magic, ph->ph_page, 1861 n, pi); 1862 panic("pool"); 1863 } 1864 #endif 1865 if ((pp->pr_roflags & PR_NOALIGN) != 0) { 1866 continue; 1867 } 1868 page = (void *)((uintptr_t)pi & pp->pr_alloc->pa_pagemask); 1869 if (page == ph->ph_page) 1870 continue; 1871 1872 if (label != NULL) 1873 printf("%s: ", label); 1874 printf("pool(%p:%s): page inconsistency: page %p;" 1875 " item ordinal %d; addr %p (p %p)\n", pp, 1876 pp->pr_wchan, ph->ph_page, 1877 n, pi, page); 1878 return 1; 1879 } 1880 return 0; 1881 } 1882 1883 1884 int 1885 pool_chk(struct pool *pp, const char *label) 1886 { 1887 struct pool_item_header *ph; 1888 int r = 0; 1889 1890 mutex_enter(&pp->pr_lock); 1891 LIST_FOREACH(ph, &pp->pr_emptypages, ph_pagelist) { 1892 r = pool_chk_page(pp, label, ph); 1893 if (r) { 1894 goto out; 1895 } 1896 } 1897 LIST_FOREACH(ph, &pp->pr_fullpages, 
ph_pagelist) { 1898 r = pool_chk_page(pp, label, ph); 1899 if (r) { 1900 goto out; 1901 } 1902 } 1903 LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist) { 1904 r = pool_chk_page(pp, label, ph); 1905 if (r) { 1906 goto out; 1907 } 1908 } 1909 1910 out: 1911 mutex_exit(&pp->pr_lock); 1912 return r; 1913 } 1914 1915 /* 1916 * pool_cache_init: 1917 * 1918 * Initialize a pool cache. 1919 */ 1920 pool_cache_t 1921 pool_cache_init(size_t size, u_int align, u_int align_offset, u_int flags, 1922 const char *wchan, struct pool_allocator *palloc, int ipl, 1923 int (*ctor)(void *, void *, int), void (*dtor)(void *, void *), void *arg) 1924 { 1925 pool_cache_t pc; 1926 1927 pc = pool_get(&cache_pool, PR_WAITOK); 1928 if (pc == NULL) 1929 return NULL; 1930 1931 pool_cache_bootstrap(pc, size, align, align_offset, flags, wchan, 1932 palloc, ipl, ctor, dtor, arg); 1933 1934 return pc; 1935 } 1936 1937 /* 1938 * pool_cache_bootstrap: 1939 * 1940 * Kernel-private version of pool_cache_init(). The caller 1941 * provides initial storage. 1942 */ 1943 void 1944 pool_cache_bootstrap(pool_cache_t pc, size_t size, u_int align, 1945 u_int align_offset, u_int flags, const char *wchan, 1946 struct pool_allocator *palloc, int ipl, 1947 int (*ctor)(void *, void *, int), void (*dtor)(void *, void *), 1948 void *arg) 1949 { 1950 CPU_INFO_ITERATOR cii; 1951 pool_cache_t pc1; 1952 struct cpu_info *ci; 1953 struct pool *pp; 1954 1955 pp = &pc->pc_pool; 1956 if (palloc == NULL && ipl == IPL_NONE) { 1957 if (size > PAGE_SIZE) { 1958 int bigidx = pool_bigidx(size); 1959 1960 palloc = &pool_allocator_big[bigidx]; 1961 } else 1962 palloc = &pool_allocator_nointr; 1963 } 1964 pool_init(pp, size, align, align_offset, flags, wchan, palloc, ipl); 1965 mutex_init(&pc->pc_lock, MUTEX_DEFAULT, ipl); 1966 1967 if (ctor == NULL) { 1968 ctor = (int (*)(void *, void *, int))nullop; 1969 } 1970 if (dtor == NULL) { 1971 dtor = (void (*)(void *, void *))nullop; 1972 } 1973 1974 pc->pc_emptygroups = NULL; 1975 pc->pc_fullgroups = NULL; 1976 pc->pc_partgroups = NULL; 1977 pc->pc_ctor = ctor; 1978 pc->pc_dtor = dtor; 1979 pc->pc_arg = arg; 1980 pc->pc_hits = 0; 1981 pc->pc_misses = 0; 1982 pc->pc_nempty = 0; 1983 pc->pc_npart = 0; 1984 pc->pc_nfull = 0; 1985 pc->pc_contended = 0; 1986 pc->pc_refcnt = 0; 1987 pc->pc_freecheck = NULL; 1988 1989 if ((flags & PR_LARGECACHE) != 0) { 1990 pc->pc_pcgsize = PCG_NOBJECTS_LARGE; 1991 pc->pc_pcgpool = &pcg_large_pool; 1992 } else { 1993 pc->pc_pcgsize = PCG_NOBJECTS_NORMAL; 1994 pc->pc_pcgpool = &pcg_normal_pool; 1995 } 1996 1997 /* Allocate per-CPU caches. */ 1998 memset(pc->pc_cpus, 0, sizeof(pc->pc_cpus)); 1999 pc->pc_ncpu = 0; 2000 if (ncpu < 2) { 2001 /* XXX For sparc: boot CPU is not attached yet. */ 2002 pool_cache_cpu_init1(curcpu(), pc); 2003 } else { 2004 for (CPU_INFO_FOREACH(cii, ci)) { 2005 pool_cache_cpu_init1(ci, pc); 2006 } 2007 } 2008 2009 /* Add to list of all pools. */ 2010 if (__predict_true(!cold)) 2011 mutex_enter(&pool_head_lock); 2012 TAILQ_FOREACH(pc1, &pool_cache_head, pc_cachelist) { 2013 if (strcmp(pc1->pc_pool.pr_wchan, pc->pc_pool.pr_wchan) > 0) 2014 break; 2015 } 2016 if (pc1 == NULL) 2017 TAILQ_INSERT_TAIL(&pool_cache_head, pc, pc_cachelist); 2018 else 2019 TAILQ_INSERT_BEFORE(pc1, pc, pc_cachelist); 2020 if (__predict_true(!cold)) 2021 mutex_exit(&pool_head_lock); 2022 2023 membar_sync(); 2024 pp->pr_cache = pc; 2025 } 2026 2027 /* 2028 * pool_cache_destroy: 2029 * 2030 * Destroy a pool cache. 
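 *
 * Counterpart of pool_cache_init().  A minimal lifecycle sketch (the
 * "foo" names are hypothetical):
 *
 *	pool_cache_t foo_cache;
 *	struct foo *fp;
 *
 *	foo_cache = pool_cache_init(sizeof(struct foo), coherency_unit,
 *	    0, 0, "foocache", NULL, IPL_NONE, foo_ctor, foo_dtor, NULL);
 *	fp = pool_cache_get(foo_cache, PR_WAITOK);
 *	...
 *	pool_cache_put(foo_cache, fp);
 *	...
 *	pool_cache_destroy(foo_cache);
 *
 * All objects must have been returned before the cache is destroyed;
 * the underlying pool_destroy() asserts pr_nout == 0.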
2031 */ 2032 void 2033 pool_cache_destroy(pool_cache_t pc) 2034 { 2035 2036 pool_cache_bootstrap_destroy(pc); 2037 pool_put(&cache_pool, pc); 2038 } 2039 2040 /* 2041 * pool_cache_bootstrap_destroy: 2042 * 2043 * Destroy a pool cache. 2044 */ 2045 void 2046 pool_cache_bootstrap_destroy(pool_cache_t pc) 2047 { 2048 struct pool *pp = &pc->pc_pool; 2049 u_int i; 2050 2051 /* Remove it from the global list. */ 2052 mutex_enter(&pool_head_lock); 2053 while (pc->pc_refcnt != 0) 2054 cv_wait(&pool_busy, &pool_head_lock); 2055 TAILQ_REMOVE(&pool_cache_head, pc, pc_cachelist); 2056 mutex_exit(&pool_head_lock); 2057 2058 /* First, invalidate the entire cache. */ 2059 pool_cache_invalidate(pc); 2060 2061 /* Disassociate it from the pool. */ 2062 mutex_enter(&pp->pr_lock); 2063 pp->pr_cache = NULL; 2064 mutex_exit(&pp->pr_lock); 2065 2066 /* Destroy per-CPU data */ 2067 for (i = 0; i < __arraycount(pc->pc_cpus); i++) 2068 pool_cache_invalidate_cpu(pc, i); 2069 2070 /* Finally, destroy it. */ 2071 mutex_destroy(&pc->pc_lock); 2072 pool_destroy(pp); 2073 } 2074 2075 /* 2076 * pool_cache_cpu_init1: 2077 * 2078 * Called for each pool_cache whenever a new CPU is attached. 2079 */ 2080 static void 2081 pool_cache_cpu_init1(struct cpu_info *ci, pool_cache_t pc) 2082 { 2083 pool_cache_cpu_t *cc; 2084 int index; 2085 2086 index = ci->ci_index; 2087 2088 KASSERT(index < __arraycount(pc->pc_cpus)); 2089 2090 if ((cc = pc->pc_cpus[index]) != NULL) { 2091 KASSERT(cc->cc_cpuindex == index); 2092 return; 2093 } 2094 2095 /* 2096 * The first CPU is 'free'. This needs to be the case for 2097 * bootstrap - we may not be able to allocate yet. 2098 */ 2099 if (pc->pc_ncpu == 0) { 2100 cc = &pc->pc_cpu0; 2101 pc->pc_ncpu = 1; 2102 } else { 2103 mutex_enter(&pc->pc_lock); 2104 pc->pc_ncpu++; 2105 mutex_exit(&pc->pc_lock); 2106 cc = pool_get(&cache_cpu_pool, PR_WAITOK); 2107 } 2108 2109 cc->cc_ipl = pc->pc_pool.pr_ipl; 2110 cc->cc_iplcookie = makeiplcookie(cc->cc_ipl); 2111 cc->cc_cache = pc; 2112 cc->cc_cpuindex = index; 2113 cc->cc_hits = 0; 2114 cc->cc_misses = 0; 2115 cc->cc_current = __UNCONST(&pcg_dummy); 2116 cc->cc_previous = __UNCONST(&pcg_dummy); 2117 2118 pc->pc_cpus[index] = cc; 2119 } 2120 2121 /* 2122 * pool_cache_cpu_init: 2123 * 2124 * Called whenever a new CPU is attached. 2125 */ 2126 void 2127 pool_cache_cpu_init(struct cpu_info *ci) 2128 { 2129 pool_cache_t pc; 2130 2131 mutex_enter(&pool_head_lock); 2132 TAILQ_FOREACH(pc, &pool_cache_head, pc_cachelist) { 2133 pc->pc_refcnt++; 2134 mutex_exit(&pool_head_lock); 2135 2136 pool_cache_cpu_init1(ci, pc); 2137 2138 mutex_enter(&pool_head_lock); 2139 pc->pc_refcnt--; 2140 cv_broadcast(&pool_busy); 2141 } 2142 mutex_exit(&pool_head_lock); 2143 } 2144 2145 /* 2146 * pool_cache_reclaim: 2147 * 2148 * Reclaim memory from a pool cache. 2149 */ 2150 bool 2151 pool_cache_reclaim(pool_cache_t pc) 2152 { 2153 2154 return pool_reclaim(&pc->pc_pool); 2155 } 2156 2157 static void 2158 pool_cache_destruct_object1(pool_cache_t pc, void *object) 2159 { 2160 (*pc->pc_dtor)(pc->pc_arg, object); 2161 pool_put(&pc->pc_pool, object); 2162 } 2163 2164 /* 2165 * pool_cache_destruct_object: 2166 * 2167 * Force destruction of an object and its release back into 2168 * the pool. 
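 *
 * Unlike pool_cache_put(), the object is not returned to a per-CPU
 * group: the destructor runs immediately and the storage goes back to
 * the underlying pool. An illustrative (hypothetical) caller:
 *
 *	if (frob_is_damaged(f))
 *		pool_cache_destruct_object(frob_cache, f);
 *	else
 *		pool_cache_put(frob_cache, f);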
2169 */ 2170 void 2171 pool_cache_destruct_object(pool_cache_t pc, void *object) 2172 { 2173 2174 FREECHECK_IN(&pc->pc_freecheck, object); 2175 2176 pool_cache_destruct_object1(pc, object); 2177 } 2178 2179 /* 2180 * pool_cache_invalidate_groups: 2181 * 2182 * Invalidate a chain of groups and destruct all objects. 2183 */ 2184 static void 2185 pool_cache_invalidate_groups(pool_cache_t pc, pcg_t *pcg) 2186 { 2187 void *object; 2188 pcg_t *next; 2189 int i; 2190 2191 for (; pcg != NULL; pcg = next) { 2192 next = pcg->pcg_next; 2193 2194 for (i = 0; i < pcg->pcg_avail; i++) { 2195 object = pcg->pcg_objects[i].pcgo_va; 2196 pool_cache_destruct_object1(pc, object); 2197 } 2198 2199 if (pcg->pcg_size == PCG_NOBJECTS_LARGE) { 2200 pool_put(&pcg_large_pool, pcg); 2201 } else { 2202 KASSERT(pcg->pcg_size == PCG_NOBJECTS_NORMAL); 2203 pool_put(&pcg_normal_pool, pcg); 2204 } 2205 } 2206 } 2207 2208 /* 2209 * pool_cache_invalidate: 2210 * 2211 * Invalidate a pool cache (destruct and release all of the 2212 * cached objects). Does not reclaim objects from the pool. 2213 * 2214 * Note: For pool caches that provide constructed objects, there 2215 * is an assumption that another level of synchronization is occurring 2216 * between the input to the constructor and the cache invalidation. 2217 * 2218 * Invalidation is a costly process and should not be called from 2219 * interrupt context. 2220 */ 2221 void 2222 pool_cache_invalidate(pool_cache_t pc) 2223 { 2224 uint64_t where; 2225 pcg_t *full, *empty, *part; 2226 2227 KASSERT(!cpu_intr_p() && !cpu_softintr_p()); 2228 2229 if (ncpu < 2 || !mp_online) { 2230 /* 2231 * We might be called early enough in the boot process 2232 * for the CPU data structures to not be fully initialized. 2233 * In this case, transfer the content of the local CPU's 2234 * cache back into global cache as only this CPU is currently 2235 * running. 2236 */ 2237 pool_cache_transfer(pc); 2238 } else { 2239 /* 2240 * Signal all CPUs that they must transfer their local 2241 * cache back to the global pool then wait for the xcall to 2242 * complete. 2243 */ 2244 where = xc_broadcast(0, (xcfunc_t)pool_cache_transfer, 2245 pc, NULL); 2246 xc_wait(where); 2247 } 2248 2249 /* Empty pool caches, then invalidate objects */ 2250 mutex_enter(&pc->pc_lock); 2251 full = pc->pc_fullgroups; 2252 empty = pc->pc_emptygroups; 2253 part = pc->pc_partgroups; 2254 pc->pc_fullgroups = NULL; 2255 pc->pc_emptygroups = NULL; 2256 pc->pc_partgroups = NULL; 2257 pc->pc_nfull = 0; 2258 pc->pc_nempty = 0; 2259 pc->pc_npart = 0; 2260 mutex_exit(&pc->pc_lock); 2261 2262 pool_cache_invalidate_groups(pc, full); 2263 pool_cache_invalidate_groups(pc, empty); 2264 pool_cache_invalidate_groups(pc, part); 2265 } 2266 2267 /* 2268 * pool_cache_invalidate_cpu: 2269 * 2270 * Invalidate all CPU-bound cached objects in pool cache, the CPU being 2271 * identified by its associated index. 2272 * It is caller's responsibility to ensure that no operation is 2273 * taking place on this pool cache while doing this invalidation. 2274 * WARNING: as no inter-CPU locking is enforced, trying to invalidate 2275 * pool cached objects from a CPU different from the one currently running 2276 * may result in an undefined behaviour. 
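 *
 * Within this file it is called only from pool_cache_bootstrap_destroy(),
 * after the cache has been unlinked from pool_cache_head and
 * pool_cache_invalidate() has run, so no other CPU should still be
 * touching the per-CPU data being torn down.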
2277 */ 2278 static void 2279 pool_cache_invalidate_cpu(pool_cache_t pc, u_int index) 2280 { 2281 pool_cache_cpu_t *cc; 2282 pcg_t *pcg; 2283 2284 if ((cc = pc->pc_cpus[index]) == NULL) 2285 return; 2286 2287 if ((pcg = cc->cc_current) != &pcg_dummy) { 2288 pcg->pcg_next = NULL; 2289 pool_cache_invalidate_groups(pc, pcg); 2290 } 2291 if ((pcg = cc->cc_previous) != &pcg_dummy) { 2292 pcg->pcg_next = NULL; 2293 pool_cache_invalidate_groups(pc, pcg); 2294 } 2295 if (cc != &pc->pc_cpu0) 2296 pool_put(&cache_cpu_pool, cc); 2297 2298 } 2299 2300 void 2301 pool_cache_set_drain_hook(pool_cache_t pc, void (*fn)(void *, int), void *arg) 2302 { 2303 2304 pool_set_drain_hook(&pc->pc_pool, fn, arg); 2305 } 2306 2307 void 2308 pool_cache_setlowat(pool_cache_t pc, int n) 2309 { 2310 2311 pool_setlowat(&pc->pc_pool, n); 2312 } 2313 2314 void 2315 pool_cache_sethiwat(pool_cache_t pc, int n) 2316 { 2317 2318 pool_sethiwat(&pc->pc_pool, n); 2319 } 2320 2321 void 2322 pool_cache_sethardlimit(pool_cache_t pc, int n, const char *warnmess, int ratecap) 2323 { 2324 2325 pool_sethardlimit(&pc->pc_pool, n, warnmess, ratecap); 2326 } 2327 2328 static bool __noinline 2329 pool_cache_get_slow(pool_cache_cpu_t *cc, int s, void **objectp, 2330 paddr_t *pap, int flags) 2331 { 2332 pcg_t *pcg, *cur; 2333 uint64_t ncsw; 2334 pool_cache_t pc; 2335 void *object; 2336 2337 KASSERT(cc->cc_current->pcg_avail == 0); 2338 KASSERT(cc->cc_previous->pcg_avail == 0); 2339 2340 pc = cc->cc_cache; 2341 cc->cc_misses++; 2342 2343 /* 2344 * Nothing was available locally. Try and grab a group 2345 * from the cache. 2346 */ 2347 if (__predict_false(!mutex_tryenter(&pc->pc_lock))) { 2348 ncsw = curlwp->l_ncsw; 2349 mutex_enter(&pc->pc_lock); 2350 pc->pc_contended++; 2351 2352 /* 2353 * If we context switched while locking, then 2354 * our view of the per-CPU data is invalid: 2355 * retry. 2356 */ 2357 if (curlwp->l_ncsw != ncsw) { 2358 mutex_exit(&pc->pc_lock); 2359 return true; 2360 } 2361 } 2362 2363 if (__predict_true((pcg = pc->pc_fullgroups) != NULL)) { 2364 /* 2365 * If there's a full group, release our empty 2366 * group back to the cache. Install the full 2367 * group as cc_current and return. 2368 */ 2369 if (__predict_true((cur = cc->cc_current) != &pcg_dummy)) { 2370 KASSERT(cur->pcg_avail == 0); 2371 cur->pcg_next = pc->pc_emptygroups; 2372 pc->pc_emptygroups = cur; 2373 pc->pc_nempty++; 2374 } 2375 KASSERT(pcg->pcg_avail == pcg->pcg_size); 2376 cc->cc_current = pcg; 2377 pc->pc_fullgroups = pcg->pcg_next; 2378 pc->pc_hits++; 2379 pc->pc_nfull--; 2380 mutex_exit(&pc->pc_lock); 2381 return true; 2382 } 2383 2384 /* 2385 * Nothing available locally or in cache. Take the slow 2386 * path: fetch a new object from the pool and construct 2387 * it. 
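 *
 * The constructor runs below with pc_lock released and the spl
 * restored. Returning false tells pool_cache_get_paddr() that the
 * operation is complete, with *objectp holding either the new
 * object or NULL; returning true asks it to retry the per-CPU
 * fast path.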
2388 */ 2389 pc->pc_misses++; 2390 mutex_exit(&pc->pc_lock); 2391 splx(s); 2392 2393 object = pool_get(&pc->pc_pool, flags); 2394 *objectp = object; 2395 if (__predict_false(object == NULL)) { 2396 KASSERT((flags & (PR_WAITOK|PR_NOWAIT)) == PR_NOWAIT); 2397 return false; 2398 } 2399 2400 if (__predict_false((*pc->pc_ctor)(pc->pc_arg, object, flags) != 0)) { 2401 pool_put(&pc->pc_pool, object); 2402 *objectp = NULL; 2403 return false; 2404 } 2405 2406 KASSERT((((vaddr_t)object) & (pc->pc_pool.pr_align - 1)) == 0); 2407 2408 if (pap != NULL) { 2409 #ifdef POOL_VTOPHYS 2410 *pap = POOL_VTOPHYS(object); 2411 #else 2412 *pap = POOL_PADDR_INVALID; 2413 #endif 2414 } 2415 2416 FREECHECK_OUT(&pc->pc_freecheck, object); 2417 pool_cache_kleak_fill(pc, object); 2418 return false; 2419 } 2420 2421 /* 2422 * pool_cache_get{,_paddr}: 2423 * 2424 * Get an object from a pool cache (optionally returning 2425 * the physical address of the object). 2426 */ 2427 void * 2428 pool_cache_get_paddr(pool_cache_t pc, int flags, paddr_t *pap) 2429 { 2430 pool_cache_cpu_t *cc; 2431 pcg_t *pcg; 2432 void *object; 2433 int s; 2434 2435 KASSERT(!(flags & PR_NOWAIT) != !(flags & PR_WAITOK)); 2436 KASSERTMSG((!cpu_intr_p() && !cpu_softintr_p()) || 2437 (pc->pc_pool.pr_ipl != IPL_NONE || cold || panicstr != NULL), 2438 "%s: [%s] is IPL_NONE, but called from interrupt context", 2439 __func__, pc->pc_pool.pr_wchan); 2440 2441 if (flags & PR_WAITOK) { 2442 ASSERT_SLEEPABLE(); 2443 } 2444 2445 /* Lock out interrupts and disable preemption. */ 2446 s = splvm(); 2447 while (/* CONSTCOND */ true) { 2448 /* Try and allocate an object from the current group. */ 2449 cc = pc->pc_cpus[curcpu()->ci_index]; 2450 KASSERT(cc->cc_cache == pc); 2451 pcg = cc->cc_current; 2452 if (__predict_true(pcg->pcg_avail > 0)) { 2453 object = pcg->pcg_objects[--pcg->pcg_avail].pcgo_va; 2454 if (__predict_false(pap != NULL)) 2455 *pap = pcg->pcg_objects[pcg->pcg_avail].pcgo_pa; 2456 #if defined(DIAGNOSTIC) 2457 pcg->pcg_objects[pcg->pcg_avail].pcgo_va = NULL; 2458 KASSERT(pcg->pcg_avail < pcg->pcg_size); 2459 KASSERT(object != NULL); 2460 #endif 2461 cc->cc_hits++; 2462 splx(s); 2463 FREECHECK_OUT(&pc->pc_freecheck, object); 2464 pool_redzone_fill(&pc->pc_pool, object); 2465 pool_cache_kleak_fill(pc, object); 2466 return object; 2467 } 2468 2469 /* 2470 * That failed. If the previous group isn't empty, swap 2471 * it with the current group and allocate from there. 2472 */ 2473 pcg = cc->cc_previous; 2474 if (__predict_true(pcg->pcg_avail > 0)) { 2475 cc->cc_previous = cc->cc_current; 2476 cc->cc_current = pcg; 2477 continue; 2478 } 2479 2480 /* 2481 * Can't allocate from either group: try the slow path. 2482 * If get_slow() allocated an object for us, or if 2483 * no more objects are available, it will return false. 2484 * Otherwise, we need to retry. 2485 */ 2486 if (!pool_cache_get_slow(cc, s, &object, pap, flags)) 2487 break; 2488 } 2489 2490 /* 2491 * We would like to KASSERT(object || (flags & PR_NOWAIT)), but 2492 * pool_cache_get can fail even in the PR_WAITOK case, if the 2493 * constructor fails. 
2494 */ 2495 return object; 2496 } 2497 2498 static bool __noinline 2499 pool_cache_put_slow(pool_cache_cpu_t *cc, int s, void *object) 2500 { 2501 struct lwp *l = curlwp; 2502 pcg_t *pcg, *cur; 2503 uint64_t ncsw; 2504 pool_cache_t pc; 2505 2506 KASSERT(cc->cc_current->pcg_avail == cc->cc_current->pcg_size); 2507 KASSERT(cc->cc_previous->pcg_avail == cc->cc_previous->pcg_size); 2508 2509 pc = cc->cc_cache; 2510 pcg = NULL; 2511 cc->cc_misses++; 2512 ncsw = l->l_ncsw; 2513 2514 /* 2515 * If there are no empty groups in the cache then allocate one 2516 * while still unlocked. 2517 */ 2518 if (__predict_false(pc->pc_emptygroups == NULL)) { 2519 if (__predict_true(!pool_cache_disable)) { 2520 pcg = pool_get(pc->pc_pcgpool, PR_NOWAIT); 2521 } 2522 /* 2523 * If pool_get() blocked, then our view of 2524 * the per-CPU data is invalid: retry. 2525 */ 2526 if (__predict_false(l->l_ncsw != ncsw)) { 2527 if (pcg != NULL) { 2528 pool_put(pc->pc_pcgpool, pcg); 2529 } 2530 return true; 2531 } 2532 if (__predict_true(pcg != NULL)) { 2533 pcg->pcg_avail = 0; 2534 pcg->pcg_size = pc->pc_pcgsize; 2535 } 2536 } 2537 2538 /* Lock the cache. */ 2539 if (__predict_false(!mutex_tryenter(&pc->pc_lock))) { 2540 mutex_enter(&pc->pc_lock); 2541 pc->pc_contended++; 2542 2543 /* 2544 * If we context switched while locking, then our view of 2545 * the per-CPU data is invalid: retry. 2546 */ 2547 if (__predict_false(l->l_ncsw != ncsw)) { 2548 mutex_exit(&pc->pc_lock); 2549 if (pcg != NULL) { 2550 pool_put(pc->pc_pcgpool, pcg); 2551 } 2552 return true; 2553 } 2554 } 2555 2556 /* If there are no empty groups in the cache then allocate one. */ 2557 if (pcg == NULL && pc->pc_emptygroups != NULL) { 2558 pcg = pc->pc_emptygroups; 2559 pc->pc_emptygroups = pcg->pcg_next; 2560 pc->pc_nempty--; 2561 } 2562 2563 /* 2564 * If there's a empty group, release our full group back 2565 * to the cache. Install the empty group to the local CPU 2566 * and return. 2567 */ 2568 if (pcg != NULL) { 2569 KASSERT(pcg->pcg_avail == 0); 2570 if (__predict_false(cc->cc_previous == &pcg_dummy)) { 2571 cc->cc_previous = pcg; 2572 } else { 2573 cur = cc->cc_current; 2574 if (__predict_true(cur != &pcg_dummy)) { 2575 KASSERT(cur->pcg_avail == cur->pcg_size); 2576 cur->pcg_next = pc->pc_fullgroups; 2577 pc->pc_fullgroups = cur; 2578 pc->pc_nfull++; 2579 } 2580 cc->cc_current = pcg; 2581 } 2582 pc->pc_hits++; 2583 mutex_exit(&pc->pc_lock); 2584 return true; 2585 } 2586 2587 /* 2588 * Nothing available locally or in cache, and we didn't 2589 * allocate an empty group. Take the slow path and destroy 2590 * the object here and now. 2591 */ 2592 pc->pc_misses++; 2593 mutex_exit(&pc->pc_lock); 2594 splx(s); 2595 pool_cache_destruct_object(pc, object); 2596 2597 return false; 2598 } 2599 2600 /* 2601 * pool_cache_put{,_paddr}: 2602 * 2603 * Put an object back to the pool cache (optionally caching the 2604 * physical address of the object). 2605 */ 2606 void 2607 pool_cache_put_paddr(pool_cache_t pc, void *object, paddr_t pa) 2608 { 2609 pool_cache_cpu_t *cc; 2610 pcg_t *pcg; 2611 int s; 2612 2613 KASSERT(object != NULL); 2614 pool_cache_redzone_check(pc, object); 2615 FREECHECK_IN(&pc->pc_freecheck, object); 2616 2617 if (pool_cache_put_quarantine(pc, object, pa)) { 2618 return; 2619 } 2620 2621 /* Lock out interrupts and disable preemption. */ 2622 s = splvm(); 2623 while (/* CONSTCOND */ true) { 2624 /* If the current group isn't full, release it there. 
*/ 2625 cc = pc->pc_cpus[curcpu()->ci_index]; 2626 KASSERT(cc->cc_cache == pc); 2627 pcg = cc->cc_current; 2628 if (__predict_true(pcg->pcg_avail < pcg->pcg_size)) { 2629 pcg->pcg_objects[pcg->pcg_avail].pcgo_va = object; 2630 pcg->pcg_objects[pcg->pcg_avail].pcgo_pa = pa; 2631 pcg->pcg_avail++; 2632 cc->cc_hits++; 2633 splx(s); 2634 return; 2635 } 2636 2637 /* 2638 * That failed. If the previous group isn't full, swap 2639 * it with the current group and try again. 2640 */ 2641 pcg = cc->cc_previous; 2642 if (__predict_true(pcg->pcg_avail < pcg->pcg_size)) { 2643 cc->cc_previous = cc->cc_current; 2644 cc->cc_current = pcg; 2645 continue; 2646 } 2647 2648 /* 2649 * Can't free to either group: try the slow path. 2650 * If put_slow() releases the object for us, it 2651 * will return false. Otherwise we need to retry. 2652 */ 2653 if (!pool_cache_put_slow(cc, s, object)) 2654 break; 2655 } 2656 } 2657 2658 /* 2659 * pool_cache_transfer: 2660 * 2661 * Transfer objects from the per-CPU cache to the global cache. 2662 * Run within a cross-call thread. 2663 */ 2664 static void 2665 pool_cache_transfer(pool_cache_t pc) 2666 { 2667 pool_cache_cpu_t *cc; 2668 pcg_t *prev, *cur, **list; 2669 int s; 2670 2671 s = splvm(); 2672 mutex_enter(&pc->pc_lock); 2673 cc = pc->pc_cpus[curcpu()->ci_index]; 2674 cur = cc->cc_current; 2675 cc->cc_current = __UNCONST(&pcg_dummy); 2676 prev = cc->cc_previous; 2677 cc->cc_previous = __UNCONST(&pcg_dummy); 2678 if (cur != &pcg_dummy) { 2679 if (cur->pcg_avail == cur->pcg_size) { 2680 list = &pc->pc_fullgroups; 2681 pc->pc_nfull++; 2682 } else if (cur->pcg_avail == 0) { 2683 list = &pc->pc_emptygroups; 2684 pc->pc_nempty++; 2685 } else { 2686 list = &pc->pc_partgroups; 2687 pc->pc_npart++; 2688 } 2689 cur->pcg_next = *list; 2690 *list = cur; 2691 } 2692 if (prev != &pcg_dummy) { 2693 if (prev->pcg_avail == prev->pcg_size) { 2694 list = &pc->pc_fullgroups; 2695 pc->pc_nfull++; 2696 } else if (prev->pcg_avail == 0) { 2697 list = &pc->pc_emptygroups; 2698 pc->pc_nempty++; 2699 } else { 2700 list = &pc->pc_partgroups; 2701 pc->pc_npart++; 2702 } 2703 prev->pcg_next = *list; 2704 *list = prev; 2705 } 2706 mutex_exit(&pc->pc_lock); 2707 splx(s); 2708 } 2709 2710 /* 2711 * Pool backend allocators. 2712 * 2713 * Each pool has a backend allocator that handles allocation, deallocation, 2714 * and any additional draining that might be needed. 2715 * 2716 * We provide two standard allocators: 2717 * 2718 * pool_allocator_kmem - the default when no allocator is specified 2719 * 2720 * pool_allocator_nointr - used for pools that will not be accessed 2721 * in interrupt context. 
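 *
 * A pool may also provide its own allocator. A minimal sketch, with
 * hypothetical "my_" names, mirrors the initializers below:
 *
 *	static void *my_page_alloc(struct pool *, int);
 *	static void my_page_free(struct pool *, void *);
 *
 *	struct pool_allocator my_allocator = {
 *		.pa_alloc = my_page_alloc,
 *		.pa_free = my_page_free,
 *		.pa_pagesz = 0,
 *	};
 *
 * A pa_pagesz of zero selects the default page size; the structure is
 * passed as the 'palloc' argument to pool_init() or pool_cache_init().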
2722 */ 2723 void *pool_page_alloc(struct pool *, int); 2724 void pool_page_free(struct pool *, void *); 2725 2726 struct pool_allocator pool_allocator_kmem = { 2727 .pa_alloc = pool_page_alloc, 2728 .pa_free = pool_page_free, 2729 .pa_pagesz = 0 2730 }; 2731 2732 struct pool_allocator pool_allocator_nointr = { 2733 .pa_alloc = pool_page_alloc, 2734 .pa_free = pool_page_free, 2735 .pa_pagesz = 0 2736 }; 2737 2738 struct pool_allocator pool_allocator_big[] = { 2739 { 2740 .pa_alloc = pool_page_alloc, 2741 .pa_free = pool_page_free, 2742 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 0), 2743 }, 2744 { 2745 .pa_alloc = pool_page_alloc, 2746 .pa_free = pool_page_free, 2747 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 1), 2748 }, 2749 { 2750 .pa_alloc = pool_page_alloc, 2751 .pa_free = pool_page_free, 2752 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 2), 2753 }, 2754 { 2755 .pa_alloc = pool_page_alloc, 2756 .pa_free = pool_page_free, 2757 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 3), 2758 }, 2759 { 2760 .pa_alloc = pool_page_alloc, 2761 .pa_free = pool_page_free, 2762 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 4), 2763 }, 2764 { 2765 .pa_alloc = pool_page_alloc, 2766 .pa_free = pool_page_free, 2767 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 5), 2768 }, 2769 { 2770 .pa_alloc = pool_page_alloc, 2771 .pa_free = pool_page_free, 2772 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 6), 2773 }, 2774 { 2775 .pa_alloc = pool_page_alloc, 2776 .pa_free = pool_page_free, 2777 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 7), 2778 } 2779 }; 2780 2781 static int 2782 pool_bigidx(size_t size) 2783 { 2784 int i; 2785 2786 for (i = 0; i < __arraycount(pool_allocator_big); i++) { 2787 if (1 << (i + POOL_ALLOCATOR_BIG_BASE) >= size) 2788 return i; 2789 } 2790 panic("pool item size %zu too large, use a custom allocator", size); 2791 } 2792 2793 static void * 2794 pool_allocator_alloc(struct pool *pp, int flags) 2795 { 2796 struct pool_allocator *pa = pp->pr_alloc; 2797 void *res; 2798 2799 res = (*pa->pa_alloc)(pp, flags); 2800 if (res == NULL && (flags & PR_WAITOK) == 0) { 2801 /* 2802 * We only run the drain hook here if PR_NOWAIT. 2803 * In other cases, the hook will be run in 2804 * pool_reclaim(). 2805 */ 2806 if (pp->pr_drain_hook != NULL) { 2807 (*pp->pr_drain_hook)(pp->pr_drain_hook_arg, flags); 2808 res = (*pa->pa_alloc)(pp, flags); 2809 } 2810 } 2811 return res; 2812 } 2813 2814 static void 2815 pool_allocator_free(struct pool *pp, void *v) 2816 { 2817 struct pool_allocator *pa = pp->pr_alloc; 2818 2819 if (pp->pr_redzone) { 2820 kasan_mark(v, pa->pa_pagesz, pa->pa_pagesz, 0); 2821 } 2822 (*pa->pa_free)(pp, v); 2823 } 2824 2825 void * 2826 pool_page_alloc(struct pool *pp, int flags) 2827 { 2828 const vm_flag_t vflags = (flags & PR_WAITOK) ? VM_SLEEP: VM_NOSLEEP; 2829 vmem_addr_t va; 2830 int ret; 2831 2832 ret = uvm_km_kmem_alloc(kmem_va_arena, pp->pr_alloc->pa_pagesz, 2833 vflags | VM_INSTANTFIT, &va); 2834 2835 return ret ? NULL : (void *)va; 2836 } 2837 2838 void 2839 pool_page_free(struct pool *pp, void *v) 2840 { 2841 2842 uvm_km_kmem_free(kmem_va_arena, (vaddr_t)v, pp->pr_alloc->pa_pagesz); 2843 } 2844 2845 static void * 2846 pool_page_alloc_meta(struct pool *pp, int flags) 2847 { 2848 const vm_flag_t vflags = (flags & PR_WAITOK) ? VM_SLEEP: VM_NOSLEEP; 2849 vmem_addr_t va; 2850 int ret; 2851 2852 ret = vmem_alloc(kmem_meta_arena, pp->pr_alloc->pa_pagesz, 2853 vflags | VM_INSTANTFIT, &va); 2854 2855 return ret ? 
NULL : (void *)va; 2856 } 2857 2858 static void 2859 pool_page_free_meta(struct pool *pp, void *v) 2860 { 2861 2862 vmem_free(kmem_meta_arena, (vmem_addr_t)v, pp->pr_alloc->pa_pagesz); 2863 } 2864 2865 #ifdef KLEAK 2866 static void 2867 pool_kleak_fill(struct pool *pp, void *p) 2868 { 2869 if (__predict_false(pp->pr_roflags & PR_NOTOUCH)) { 2870 return; 2871 } 2872 kleak_fill_area(p, pp->pr_size); 2873 } 2874 2875 static void 2876 pool_cache_kleak_fill(pool_cache_t pc, void *p) 2877 { 2878 if (__predict_false(pc_has_ctor(pc) || pc_has_dtor(pc))) { 2879 return; 2880 } 2881 pool_kleak_fill(&pc->pc_pool, p); 2882 } 2883 #endif 2884 2885 #ifdef POOL_QUARANTINE 2886 static void 2887 pool_quarantine_init(struct pool *pp) 2888 { 2889 pp->pr_quar.rotor = 0; 2890 memset(&pp->pr_quar, 0, sizeof(pp->pr_quar)); 2891 } 2892 2893 static void 2894 pool_quarantine_flush(struct pool *pp) 2895 { 2896 pool_quar_t *quar = &pp->pr_quar; 2897 struct pool_pagelist pq; 2898 size_t i; 2899 2900 LIST_INIT(&pq); 2901 2902 mutex_enter(&pp->pr_lock); 2903 for (i = 0; i < POOL_QUARANTINE_DEPTH; i++) { 2904 if (quar->list[i] == 0) 2905 continue; 2906 pool_do_put(pp, (void *)quar->list[i], &pq); 2907 } 2908 mutex_exit(&pp->pr_lock); 2909 2910 pr_pagelist_free(pp, &pq); 2911 } 2912 2913 static bool 2914 pool_put_quarantine(struct pool *pp, void *v, struct pool_pagelist *pq) 2915 { 2916 pool_quar_t *quar = &pp->pr_quar; 2917 uintptr_t old; 2918 2919 if (pp->pr_roflags & PR_NOTOUCH) { 2920 return false; 2921 } 2922 2923 pool_redzone_check(pp, v); 2924 2925 old = quar->list[quar->rotor]; 2926 quar->list[quar->rotor] = (uintptr_t)v; 2927 quar->rotor = (quar->rotor + 1) % POOL_QUARANTINE_DEPTH; 2928 if (old != 0) { 2929 pool_do_put(pp, (void *)old, pq); 2930 } 2931 2932 return true; 2933 } 2934 2935 static bool 2936 pool_cache_put_quarantine(pool_cache_t pc, void *p, paddr_t pa) 2937 { 2938 pool_cache_destruct_object(pc, p); 2939 return true; 2940 } 2941 #endif 2942 2943 #ifdef POOL_REDZONE 2944 #if defined(_LP64) 2945 # define PRIME 0x9e37fffffffc0000UL 2946 #else /* defined(_LP64) */ 2947 # define PRIME 0x9e3779b1 2948 #endif /* defined(_LP64) */ 2949 #define STATIC_BYTE 0xFE 2950 CTASSERT(POOL_REDZONE_SIZE > 1); 2951 2952 #ifndef KASAN 2953 static inline uint8_t 2954 pool_pattern_generate(const void *p) 2955 { 2956 return (uint8_t)(((uintptr_t)p) * PRIME 2957 >> ((sizeof(uintptr_t) - sizeof(uint8_t))) * CHAR_BIT); 2958 } 2959 #endif 2960 2961 static void 2962 pool_redzone_init(struct pool *pp, size_t requested_size) 2963 { 2964 size_t redzsz; 2965 size_t nsz; 2966 2967 #ifdef KASAN 2968 redzsz = requested_size; 2969 kasan_add_redzone(&redzsz); 2970 redzsz -= requested_size; 2971 #else 2972 redzsz = POOL_REDZONE_SIZE; 2973 #endif 2974 2975 if (pp->pr_roflags & PR_NOTOUCH) { 2976 pp->pr_redzone = false; 2977 return; 2978 } 2979 2980 /* 2981 * We may have extended the requested size earlier; check if 2982 * there's naturally space in the padding for a red zone. 2983 */ 2984 if (pp->pr_size - requested_size >= redzsz) { 2985 pp->pr_reqsize_with_redzone = requested_size + redzsz; 2986 pp->pr_redzone = true; 2987 return; 2988 } 2989 2990 /* 2991 * No space in the natural padding; check if we can extend a 2992 * bit the size of the pool. 2993 */ 2994 nsz = roundup(pp->pr_size + redzsz, pp->pr_align); 2995 if (nsz <= pp->pr_alloc->pa_pagesz) { 2996 /* Ok, we can */ 2997 pp->pr_size = nsz; 2998 pp->pr_reqsize_with_redzone = requested_size + redzsz; 2999 pp->pr_redzone = true; 3000 } else { 3001 /* No space for a red zone... 
snif :'( */ 3002 pp->pr_redzone = false; 3003 printf("pool redzone disabled for '%s'\n", pp->pr_wchan); 3004 } 3005 } 3006 3007 static void 3008 pool_redzone_fill(struct pool *pp, void *p) 3009 { 3010 if (!pp->pr_redzone) 3011 return; 3012 #ifdef KASAN 3013 kasan_mark(p, pp->pr_reqsize, pp->pr_reqsize_with_redzone, 3014 KASAN_POOL_REDZONE); 3015 #else 3016 uint8_t *cp, pat; 3017 const uint8_t *ep; 3018 3019 cp = (uint8_t *)p + pp->pr_reqsize; 3020 ep = cp + POOL_REDZONE_SIZE; 3021 3022 /* 3023 * We really don't want the first byte of the red zone to be '\0'; 3024 * an off-by-one in a string may not be properly detected. 3025 */ 3026 pat = pool_pattern_generate(cp); 3027 *cp = (pat == '\0') ? STATIC_BYTE: pat; 3028 cp++; 3029 3030 while (cp < ep) { 3031 *cp = pool_pattern_generate(cp); 3032 cp++; 3033 } 3034 #endif 3035 } 3036 3037 static void 3038 pool_redzone_check(struct pool *pp, void *p) 3039 { 3040 if (!pp->pr_redzone) 3041 return; 3042 #ifdef KASAN 3043 kasan_mark(p, 0, pp->pr_reqsize_with_redzone, KASAN_POOL_FREED); 3044 #else 3045 uint8_t *cp, pat, expected; 3046 const uint8_t *ep; 3047 3048 cp = (uint8_t *)p + pp->pr_reqsize; 3049 ep = cp + POOL_REDZONE_SIZE; 3050 3051 pat = pool_pattern_generate(cp); 3052 expected = (pat == '\0') ? STATIC_BYTE: pat; 3053 if (__predict_false(expected != *cp)) { 3054 printf("%s: %p: 0x%02x != 0x%02x\n", 3055 __func__, cp, *cp, expected); 3056 } 3057 cp++; 3058 3059 while (cp < ep) { 3060 expected = pool_pattern_generate(cp); 3061 if (__predict_false(*cp != expected)) { 3062 printf("%s: %p: 0x%02x != 0x%02x\n", 3063 __func__, cp, *cp, expected); 3064 } 3065 cp++; 3066 } 3067 #endif 3068 } 3069 3070 static void 3071 pool_cache_redzone_check(pool_cache_t pc, void *p) 3072 { 3073 #ifdef KASAN 3074 /* If there is a ctor/dtor, leave the data as valid. 
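 * Objects from caches with a ctor/dtor stay in constructed form
 * while cached and the destructor may still run on them later
 * (e.g. via pool_cache_invalidate()), so they must not be
 * poisoned here.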
*/ 3075 if (__predict_false(pc_has_ctor(pc) || pc_has_dtor(pc))) { 3076 return; 3077 } 3078 #endif 3079 pool_redzone_check(&pc->pc_pool, p); 3080 } 3081 3082 #endif /* POOL_REDZONE */ 3083 3084 #if defined(DDB) 3085 static bool 3086 pool_in_page(struct pool *pp, struct pool_item_header *ph, uintptr_t addr) 3087 { 3088 3089 return (uintptr_t)ph->ph_page <= addr && 3090 addr < (uintptr_t)ph->ph_page + pp->pr_alloc->pa_pagesz; 3091 } 3092 3093 static bool 3094 pool_in_item(struct pool *pp, void *item, uintptr_t addr) 3095 { 3096 3097 return (uintptr_t)item <= addr && addr < (uintptr_t)item + pp->pr_size; 3098 } 3099 3100 static bool 3101 pool_in_cg(struct pool *pp, struct pool_cache_group *pcg, uintptr_t addr) 3102 { 3103 int i; 3104 3105 if (pcg == NULL) { 3106 return false; 3107 } 3108 for (i = 0; i < pcg->pcg_avail; i++) { 3109 if (pool_in_item(pp, pcg->pcg_objects[i].pcgo_va, addr)) { 3110 return true; 3111 } 3112 } 3113 return false; 3114 } 3115 3116 static bool 3117 pool_allocated(struct pool *pp, struct pool_item_header *ph, uintptr_t addr) 3118 { 3119 3120 if ((pp->pr_roflags & PR_USEBMAP) != 0) { 3121 unsigned int idx = pr_item_bitmap_index(pp, ph, (void *)addr); 3122 pool_item_bitmap_t *bitmap = 3123 ph->ph_bitmap + (idx / BITMAP_SIZE); 3124 pool_item_bitmap_t mask = 1 << (idx & BITMAP_MASK); 3125 3126 return (*bitmap & mask) == 0; 3127 } else { 3128 struct pool_item *pi; 3129 3130 LIST_FOREACH(pi, &ph->ph_itemlist, pi_list) { 3131 if (pool_in_item(pp, pi, addr)) { 3132 return false; 3133 } 3134 } 3135 return true; 3136 } 3137 } 3138 3139 void 3140 pool_whatis(uintptr_t addr, void (*pr)(const char *, ...)) 3141 { 3142 struct pool *pp; 3143 3144 TAILQ_FOREACH(pp, &pool_head, pr_poollist) { 3145 struct pool_item_header *ph; 3146 uintptr_t item; 3147 bool allocated = true; 3148 bool incache = false; 3149 bool incpucache = false; 3150 char cpucachestr[32]; 3151 3152 if ((pp->pr_roflags & PR_PHINPAGE) != 0) { 3153 LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) { 3154 if (pool_in_page(pp, ph, addr)) { 3155 goto found; 3156 } 3157 } 3158 LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist) { 3159 if (pool_in_page(pp, ph, addr)) { 3160 allocated = 3161 pool_allocated(pp, ph, addr); 3162 goto found; 3163 } 3164 } 3165 LIST_FOREACH(ph, &pp->pr_emptypages, ph_pagelist) { 3166 if (pool_in_page(pp, ph, addr)) { 3167 allocated = false; 3168 goto found; 3169 } 3170 } 3171 continue; 3172 } else { 3173 ph = pr_find_pagehead_noalign(pp, (void *)addr); 3174 if (ph == NULL || !pool_in_page(pp, ph, addr)) { 3175 continue; 3176 } 3177 allocated = pool_allocated(pp, ph, addr); 3178 } 3179 found: 3180 if (allocated && pp->pr_cache) { 3181 pool_cache_t pc = pp->pr_cache; 3182 struct pool_cache_group *pcg; 3183 int i; 3184 3185 for (pcg = pc->pc_fullgroups; pcg != NULL; 3186 pcg = pcg->pcg_next) { 3187 if (pool_in_cg(pp, pcg, addr)) { 3188 incache = true; 3189 goto print; 3190 } 3191 } 3192 for (i = 0; i < __arraycount(pc->pc_cpus); i++) { 3193 pool_cache_cpu_t *cc; 3194 3195 if ((cc = pc->pc_cpus[i]) == NULL) { 3196 continue; 3197 } 3198 if (pool_in_cg(pp, cc->cc_current, addr) || 3199 pool_in_cg(pp, cc->cc_previous, addr)) { 3200 struct cpu_info *ci = 3201 cpu_lookup(i); 3202 3203 incpucache = true; 3204 snprintf(cpucachestr, 3205 sizeof(cpucachestr), 3206 "cached by CPU %u", 3207 ci->ci_index); 3208 goto print; 3209 } 3210 } 3211 } 3212 print: 3213 item = (uintptr_t)ph->ph_page + ph->ph_off; 3214 item = item + rounddown(addr - item, pp->pr_size); 3215 (*pr)("%p is %p+%zu in POOL '%s' (%s)\n", 3216 (void 
*)addr, item, (size_t)(addr - item), 3217 pp->pr_wchan, 3218 incpucache ? cpucachestr : 3219 incache ? "cached" : allocated ? "allocated" : "free"); 3220 } 3221 } 3222 #endif /* defined(DDB) */ 3223 3224 static int 3225 pool_sysctl(SYSCTLFN_ARGS) 3226 { 3227 struct pool_sysctl data; 3228 struct pool *pp; 3229 struct pool_cache *pc; 3230 pool_cache_cpu_t *cc; 3231 int error; 3232 size_t i, written; 3233 3234 if (oldp == NULL) { 3235 *oldlenp = 0; 3236 TAILQ_FOREACH(pp, &pool_head, pr_poollist) 3237 *oldlenp += sizeof(data); 3238 return 0; 3239 } 3240 3241 memset(&data, 0, sizeof(data)); 3242 error = 0; 3243 written = 0; 3244 TAILQ_FOREACH(pp, &pool_head, pr_poollist) { 3245 if (written + sizeof(data) > *oldlenp) 3246 break; 3247 strlcpy(data.pr_wchan, pp->pr_wchan, sizeof(data.pr_wchan)); 3248 data.pr_pagesize = pp->pr_alloc->pa_pagesz; 3249 data.pr_flags = pp->pr_roflags | pp->pr_flags; 3250 #define COPY(field) data.field = pp->field 3251 COPY(pr_size); 3252 3253 COPY(pr_itemsperpage); 3254 COPY(pr_nitems); 3255 COPY(pr_nout); 3256 COPY(pr_hardlimit); 3257 COPY(pr_npages); 3258 COPY(pr_minpages); 3259 COPY(pr_maxpages); 3260 3261 COPY(pr_nget); 3262 COPY(pr_nfail); 3263 COPY(pr_nput); 3264 COPY(pr_npagealloc); 3265 COPY(pr_npagefree); 3266 COPY(pr_hiwat); 3267 COPY(pr_nidle); 3268 #undef COPY 3269 3270 data.pr_cache_nmiss_pcpu = 0; 3271 data.pr_cache_nhit_pcpu = 0; 3272 if (pp->pr_cache) { 3273 pc = pp->pr_cache; 3274 data.pr_cache_meta_size = pc->pc_pcgsize; 3275 data.pr_cache_nfull = pc->pc_nfull; 3276 data.pr_cache_npartial = pc->pc_npart; 3277 data.pr_cache_nempty = pc->pc_nempty; 3278 data.pr_cache_ncontended = pc->pc_contended; 3279 data.pr_cache_nmiss_global = pc->pc_misses; 3280 data.pr_cache_nhit_global = pc->pc_hits; 3281 for (i = 0; i < pc->pc_ncpu; ++i) { 3282 cc = pc->pc_cpus[i]; 3283 if (cc == NULL) 3284 continue; 3285 data.pr_cache_nmiss_pcpu += cc->cc_misses; 3286 data.pr_cache_nhit_pcpu += cc->cc_hits; 3287 } 3288 } else { 3289 data.pr_cache_meta_size = 0; 3290 data.pr_cache_nfull = 0; 3291 data.pr_cache_npartial = 0; 3292 data.pr_cache_nempty = 0; 3293 data.pr_cache_ncontended = 0; 3294 data.pr_cache_nmiss_global = 0; 3295 data.pr_cache_nhit_global = 0; 3296 } 3297 3298 error = sysctl_copyout(l, &data, oldp, sizeof(data)); 3299 if (error) 3300 break; 3301 written += sizeof(data); 3302 oldp = (char *)oldp + sizeof(data); 3303 } 3304 3305 *oldlenp = written; 3306 return error; 3307 } 3308 3309 SYSCTL_SETUP(sysctl_pool_setup, "sysctl kern.pool setup") 3310 { 3311 const struct sysctlnode *rnode = NULL; 3312 3313 sysctl_createv(clog, 0, NULL, &rnode, 3314 CTLFLAG_PERMANENT, 3315 CTLTYPE_STRUCT, "pool", 3316 SYSCTL_DESCR("Get pool statistics"), 3317 pool_sysctl, 0, NULL, 0, 3318 CTL_KERN, CTL_CREATE, CTL_EOL); 3319 } 3320
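/*
 * The kern.pool node created above returns one struct pool_sysctl
 * record per pool. A userland consumer could read the records back
 * roughly as follows (sketch only; error handling elided):
 *
 *	size_t len;
 *	sysctlbyname("kern.pool", NULL, &len, NULL, 0);
 *	struct pool_sysctl *buf = malloc(len);
 *	sysctlbyname("kern.pool", buf, &len, NULL, 0);
 *	for (size_t i = 0; i < len / sizeof(*buf); i++)
 *		printf("%s: %" PRIu64 " gets\n", buf[i].pr_wchan,
 *		    buf[i].pr_nget);
 */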