/*	$NetBSD: subr_pool.c,v 1.233 2019/02/11 11:12:58 maxv Exp $	*/

/*
 * Copyright (c) 1997, 1999, 2000, 2002, 2007, 2008, 2010, 2014, 2015, 2018
 *     The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Paul Kranenburg; by Jason R. Thorpe of the Numerical Aerospace
 * Simulation Facility, NASA Ames Research Center; by Andrew Doran, and by
 * Maxime Villard.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_pool.c,v 1.233 2019/02/11 11:12:58 maxv Exp $");

#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#include "opt_lockdebug.h"
#include "opt_kleak.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/bitops.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/vmem.h>
#include <sys/pool.h>
#include <sys/syslog.h>
#include <sys/debug.h>
#include <sys/lockdebug.h>
#include <sys/xcall.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#include <sys/asan.h>

#include <uvm/uvm_extern.h>

/*
 * Pool resource management utility.
 *
 * Memory is allocated in pages which are split into pieces according to
 * the pool item size. Each page is kept on one of three lists in the
 * pool structure: `pr_emptypages', `pr_fullpages' and `pr_partpages',
 * for empty, full and partially-full pages respectively. The individual
 * pool items are on a linked list headed by `ph_itemlist' in each page
 * header. The memory for building the page list is either taken from
 * the allocated pages themselves (for small pool items) or taken from
 * an internal pool of page headers (`phpool').
 */
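
/*
 * Illustrative sketch of typical pool(9) usage: a consumer declares a
 * pool, initializes it once, and then gets/puts fixed-size items.  The
 * names "foo_pool" and "struct foo" are hypothetical.
 *
 *	static struct pool foo_pool;
 *
 *	pool_init(&foo_pool, sizeof(struct foo), 0, 0, 0, "foopl",
 *	    NULL, IPL_NONE);
 *	...
 *	struct foo *f = pool_get(&foo_pool, PR_WAITOK);
 *	...
 *	pool_put(&foo_pool, f);
 */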

/* List of all pools. Non static as needed by 'vmstat -m' */
TAILQ_HEAD(, pool) pool_head = TAILQ_HEAD_INITIALIZER(pool_head);

/* Private pool for page header structures */
#define PHPOOL_MAX		8
static struct pool phpool[PHPOOL_MAX];
#define PHPOOL_FREELIST_NELEM(idx) \
	(((idx) == 0) ? 0 : BITMAP_SIZE * (1 << (idx)))

#ifdef POOL_SUBPAGE
/* Pool of subpages for use by normal pools. */
static struct pool psppool;
#endif

#if defined(KASAN)
#define POOL_REDZONE
#endif

#ifdef POOL_REDZONE
# ifdef KASAN
#  define POOL_REDZONE_SIZE 8
# else
#  define POOL_REDZONE_SIZE 2
# endif
static void pool_redzone_init(struct pool *, size_t);
static void pool_redzone_fill(struct pool *, void *);
static void pool_redzone_check(struct pool *, void *);
static void pool_cache_redzone_check(pool_cache_t, void *);
#else
# define pool_redzone_init(pp, sz)		__nothing
# define pool_redzone_fill(pp, ptr)		__nothing
# define pool_redzone_check(pp, ptr)		__nothing
# define pool_cache_redzone_check(pc, ptr)	__nothing
#endif

#ifdef KLEAK
static void pool_kleak_fill(struct pool *, void *);
static void pool_cache_kleak_fill(pool_cache_t, void *);
#else
#define pool_kleak_fill(pp, ptr)	__nothing
#define pool_cache_kleak_fill(pc, ptr)	__nothing
#endif

#define pc_has_ctor(pc) \
	(pc->pc_ctor != (int (*)(void *, void *, int))nullop)
#define pc_has_dtor(pc) \
	(pc->pc_dtor != (void (*)(void *, void *))nullop)

static void *pool_page_alloc_meta(struct pool *, int);
static void pool_page_free_meta(struct pool *, void *);

/* allocator for pool metadata */
struct pool_allocator pool_allocator_meta = {
	.pa_alloc = pool_page_alloc_meta,
	.pa_free = pool_page_free_meta,
	.pa_pagesz = 0
};

#define POOL_ALLOCATOR_BIG_BASE 13
extern struct pool_allocator pool_allocator_big[];
static int pool_bigidx(size_t);

/* # of seconds to retain page after last use */
int pool_inactive_time = 10;

/* Next candidate for drainage (see pool_drain()) */
static struct pool *drainpp;

/* This lock protects both pool_head and drainpp. */
static kmutex_t pool_head_lock;
static kcondvar_t pool_busy;

/* This lock protects initialization of a potentially shared pool allocator */
static kmutex_t pool_allocator_lock;

typedef uint32_t pool_item_bitmap_t;
#define	BITMAP_SIZE	(CHAR_BIT * sizeof(pool_item_bitmap_t))
#define	BITMAP_MASK	(BITMAP_SIZE - 1)

struct pool_item_header {
	/* Page headers */
	LIST_ENTRY(pool_item_header)
				ph_pagelist;	/* pool page list */
	SPLAY_ENTRY(pool_item_header)
				ph_node;	/* Off-page page headers */
	void *			ph_page;	/* this page's address */
	uint32_t		ph_time;	/* last referenced */
	uint16_t		ph_nmissing;	/* # of chunks in use */
	uint16_t		ph_off;		/* start offset in page */
	union {
		/* !PR_NOTOUCH */
		struct {
			LIST_HEAD(, pool_item)
				phu_itemlist;	/* chunk list for this page */
		} phu_normal;
		/* PR_NOTOUCH */
		struct {
			pool_item_bitmap_t phu_bitmap[1];
		} phu_notouch;
	} ph_u;
};
#define	ph_itemlist	ph_u.phu_normal.phu_itemlist
#define	ph_bitmap	ph_u.phu_notouch.phu_bitmap

#if defined(DIAGNOSTIC) && !defined(KASAN)
#define POOL_CHECK_MAGIC
#endif

struct pool_item {
#ifdef POOL_CHECK_MAGIC
	u_int pi_magic;
#endif
#define	PI_MAGIC 0xdeaddeadU
	/* Other entries use only this list entry */
	LIST_ENTRY(pool_item)	pi_list;
};

#define	POOL_NEEDS_CATCHUP(pp)						\
	((pp)->pr_nitems < (pp)->pr_minitems)

/*
 * Pool cache management.
 *
 * Pool caches provide a way for constructed objects to be cached by the
 * pool subsystem.  This can lead to performance improvements by avoiding
 * needless object construction/destruction; it is deferred until absolutely
 * necessary.
 *
 * Caches are grouped into cache groups.  Each cache group references up
 * to PCG_NUMOBJECTS constructed objects.  When a cache allocates an
 * object from the pool, it calls the object's constructor and places it
 * into a cache group.  When a cache group frees an object back to the
 * pool, it first calls the object's destructor.  This allows the object
 * to persist in constructed form while freed to the cache.
 *
 * The pool references each cache, so that when a pool is drained by the
 * pagedaemon, it can drain each individual cache as well.  Each time a
 * cache is drained, the most idle cache group is freed to the pool in
 * its entirety.
 *
 * Pool caches are laid on top of pools.  By layering them, we can avoid
 * the complexity of cache management for pools which would not benefit
 * from it.
 */
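
/*
 * Illustrative sketch: a consumer that benefits from keeping objects in
 * constructed form creates a cache instead of a bare pool.  The names
 * "foo_cache", "foo_ctor" and "foo_dtor" are hypothetical.
 *
 *	static pool_cache_t foo_cache;
 *
 *	foo_cache = pool_cache_init(sizeof(struct foo), 0, 0, 0,
 *	    "foocache", NULL, IPL_NONE, foo_ctor, foo_dtor, NULL);
 *	...
 *	struct foo *f = pool_cache_get(foo_cache, PR_WAITOK);
 *	...
 *	pool_cache_put(foo_cache, f);
 */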

static struct pool pcg_normal_pool;
static struct pool pcg_large_pool;
static struct pool cache_pool;
static struct pool cache_cpu_pool;

pool_cache_t pnbuf_cache;	/* pathname buffer cache */

/* List of all caches. */
TAILQ_HEAD(,pool_cache) pool_cache_head =
    TAILQ_HEAD_INITIALIZER(pool_cache_head);

int pool_cache_disable;		/* global disable for caching */
static const pcg_t pcg_dummy;	/* zero sized: always empty, yet always full */

static bool	pool_cache_put_slow(pool_cache_cpu_t *, int,
		    void *);
static bool	pool_cache_get_slow(pool_cache_cpu_t *, int,
		    void **, paddr_t *, int);
static void	pool_cache_cpu_init1(struct cpu_info *, pool_cache_t);
static void	pool_cache_invalidate_groups(pool_cache_t, pcg_t *);
static void	pool_cache_invalidate_cpu(pool_cache_t, u_int);
static void	pool_cache_transfer(pool_cache_t);

static int	pool_catchup(struct pool *);
static void	pool_prime_page(struct pool *, void *,
		    struct pool_item_header *);
static void	pool_update_curpage(struct pool *);

static int	pool_grow(struct pool *, int);
static void	*pool_allocator_alloc(struct pool *, int);
static void	pool_allocator_free(struct pool *, void *);

static void pool_print_pagelist(struct pool *, struct pool_pagelist *,
	void (*)(const char *, ...) __printflike(1, 2));
static void pool_print1(struct pool *, const char *,
	void (*)(const char *, ...) __printflike(1, 2));

static int pool_chk_page(struct pool *, const char *,
	struct pool_item_header *);

static inline unsigned int
pr_item_notouch_index(const struct pool *pp, const struct pool_item_header *ph,
    const void *v)
{
	const char *cp = v;
	unsigned int idx;

	KASSERT(pp->pr_roflags & PR_NOTOUCH);
	idx = (cp - (char *)ph->ph_page - ph->ph_off) / pp->pr_size;
	KASSERT(idx < pp->pr_itemsperpage);
	return idx;
}

static inline void
pr_item_notouch_put(const struct pool *pp, struct pool_item_header *ph,
    void *obj)
{
	unsigned int idx = pr_item_notouch_index(pp, ph, obj);
	pool_item_bitmap_t *bitmap = ph->ph_bitmap + (idx / BITMAP_SIZE);
	pool_item_bitmap_t mask = 1U << (idx & BITMAP_MASK);

	KASSERT((*bitmap & mask) == 0);
	*bitmap |= mask;
}

static inline void *
pr_item_notouch_get(const struct pool *pp, struct pool_item_header *ph)
{
	pool_item_bitmap_t *bitmap = ph->ph_bitmap;
	unsigned int idx;
	int i;

	for (i = 0; ; i++) {
		int bit;

		KASSERT((i * BITMAP_SIZE) < pp->pr_itemsperpage);
		bit = ffs32(bitmap[i]);
		if (bit) {
			pool_item_bitmap_t mask;

			bit--;
			idx = (i * BITMAP_SIZE) + bit;
			mask = 1U << bit;
			KASSERT((bitmap[i] & mask) != 0);
			bitmap[i] &= ~mask;
			break;
		}
	}
	KASSERT(idx < pp->pr_itemsperpage);
	return (char *)ph->ph_page + ph->ph_off + idx * pp->pr_size;
}

static inline void
pr_item_notouch_init(const struct pool *pp, struct pool_item_header *ph)
{
	pool_item_bitmap_t *bitmap = ph->ph_bitmap;
	const int n = howmany(pp->pr_itemsperpage, BITMAP_SIZE);
	int i;

	for (i = 0; i < n; i++) {
		bitmap[i] = (pool_item_bitmap_t)-1;
	}
}

static inline int
phtree_compare(struct pool_item_header *a, struct pool_item_header *b)
{

	/*
	 * we consider pool_item_header with smaller ph_page bigger.
	 * (this unnatural ordering is for the benefit of pr_find_pagehead.)
	 */

	if (a->ph_page < b->ph_page)
		return (1);
	else if (a->ph_page > b->ph_page)
		return (-1);
	else
		return (0);
}

SPLAY_PROTOTYPE(phtree, pool_item_header, ph_node, phtree_compare);
SPLAY_GENERATE(phtree, pool_item_header, ph_node, phtree_compare);

static inline struct pool_item_header *
pr_find_pagehead_noalign(struct pool *pp, void *v)
{
	struct pool_item_header *ph, tmp;

	tmp.ph_page = (void *)(uintptr_t)v;
	ph = SPLAY_FIND(phtree, &pp->pr_phtree, &tmp);
	if (ph == NULL) {
		ph = SPLAY_ROOT(&pp->pr_phtree);
		if (ph != NULL && phtree_compare(&tmp, ph) >= 0) {
			ph = SPLAY_NEXT(phtree, &pp->pr_phtree, ph);
		}
		KASSERT(ph == NULL || phtree_compare(&tmp, ph) < 0);
	}

	return ph;
}

/*
 * Return the pool page header based on item address.
 */
static inline struct pool_item_header *
pr_find_pagehead(struct pool *pp, void *v)
{
	struct pool_item_header *ph, tmp;

	if ((pp->pr_roflags & PR_NOALIGN) != 0) {
		ph = pr_find_pagehead_noalign(pp, v);
	} else {
		void *page =
		    (void *)((uintptr_t)v & pp->pr_alloc->pa_pagemask);

		if ((pp->pr_roflags & PR_PHINPAGE) != 0) {
			ph = (struct pool_item_header *)((char *)page + pp->pr_phoffset);
		} else {
			tmp.ph_page = page;
			ph = SPLAY_FIND(phtree, &pp->pr_phtree, &tmp);
		}
	}

	KASSERT(ph == NULL || ((pp->pr_roflags & PR_PHINPAGE) != 0) ||
	    ((char *)ph->ph_page <= (char *)v &&
	    (char *)v < (char *)ph->ph_page + pp->pr_alloc->pa_pagesz));
	return ph;
}

static void
pr_pagelist_free(struct pool *pp, struct pool_pagelist *pq)
{
	struct pool_item_header *ph;

	while ((ph = LIST_FIRST(pq)) != NULL) {
		LIST_REMOVE(ph, ph_pagelist);
		pool_allocator_free(pp, ph->ph_page);
		if ((pp->pr_roflags & PR_PHINPAGE) == 0)
			pool_put(pp->pr_phpool, ph);
	}
}

/*
 * Remove a page from the pool.
 */
static inline void
pr_rmpage(struct pool *pp, struct pool_item_header *ph,
     struct pool_pagelist *pq)
{

	KASSERT(mutex_owned(&pp->pr_lock));

	/*
	 * If the page was idle, decrement the idle page count.
	 */
	if (ph->ph_nmissing == 0) {
		KASSERT(pp->pr_nidle != 0);
		KASSERTMSG((pp->pr_nitems >= pp->pr_itemsperpage),
		    "nitems=%u < itemsperpage=%u",
		    pp->pr_nitems, pp->pr_itemsperpage);
		pp->pr_nidle--;
	}

	pp->pr_nitems -= pp->pr_itemsperpage;

	/*
	 * Unlink the page from the pool and queue it for release.
	 */
	LIST_REMOVE(ph, ph_pagelist);
	if ((pp->pr_roflags & PR_PHINPAGE) == 0)
		SPLAY_REMOVE(phtree, &pp->pr_phtree, ph);
	LIST_INSERT_HEAD(pq, ph, ph_pagelist);

	pp->pr_npages--;
	pp->pr_npagefree++;

	pool_update_curpage(pp);
}

/*
 * Initialize all the pools listed in the "pools" link set.
 */
void
pool_subsystem_init(void)
{
	size_t size;
	int idx;

	mutex_init(&pool_head_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&pool_allocator_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&pool_busy, "poolbusy");

	/*
	 * Initialize private page header pool and cache magazine pool if we
	 * haven't done so yet.
	 */
	for (idx = 0; idx < PHPOOL_MAX; idx++) {
		static char phpool_names[PHPOOL_MAX][6+1+6+1];
		int nelem;
		size_t sz;

		nelem = PHPOOL_FREELIST_NELEM(idx);
		snprintf(phpool_names[idx], sizeof(phpool_names[idx]),
		    "phpool-%d", nelem);
		sz = sizeof(struct pool_item_header);
		if (nelem) {
			sz = offsetof(struct pool_item_header,
			    ph_bitmap[howmany(nelem, BITMAP_SIZE)]);
		}
		pool_init(&phpool[idx], sz, 0, 0, 0,
		    phpool_names[idx], &pool_allocator_meta, IPL_VM);
	}
#ifdef POOL_SUBPAGE
	pool_init(&psppool, POOL_SUBPAGE, POOL_SUBPAGE, 0,
	    PR_RECURSIVE, "psppool", &pool_allocator_meta, IPL_VM);
#endif

	size = sizeof(pcg_t) +
	    (PCG_NOBJECTS_NORMAL - 1) * sizeof(pcgpair_t);
	pool_init(&pcg_normal_pool, size, coherency_unit, 0, 0,
	    "pcgnormal", &pool_allocator_meta, IPL_VM);

	size = sizeof(pcg_t) +
	    (PCG_NOBJECTS_LARGE - 1) * sizeof(pcgpair_t);
	pool_init(&pcg_large_pool, size, coherency_unit, 0, 0,
	    "pcglarge", &pool_allocator_meta, IPL_VM);

	pool_init(&cache_pool, sizeof(struct pool_cache), coherency_unit,
	    0, 0, "pcache", &pool_allocator_meta, IPL_NONE);

	pool_init(&cache_cpu_pool, sizeof(pool_cache_cpu_t), coherency_unit,
	    0, 0, "pcachecpu", &pool_allocator_meta, IPL_NONE);
}

/*
 * Initialize the given pool resource structure.
 *
 * We export this routine to allow other kernel parts to declare
 * static pools that must be initialized before kmem(9) is available.
 */
void
pool_init(struct pool *pp, size_t size, u_int align, u_int ioff, int flags,
    const char *wchan, struct pool_allocator *palloc, int ipl)
{
	struct pool *pp1;
	size_t trysize, phsize, prsize;
	int off, slack;

#ifdef DEBUG
	if (__predict_true(!cold))
		mutex_enter(&pool_head_lock);
	/*
	 * Check that the pool hasn't already been initialised and
	 * added to the list of all pools.
	 */
	TAILQ_FOREACH(pp1, &pool_head, pr_poollist) {
		if (pp == pp1)
			panic("%s: [%s] already initialised", __func__,
			    wchan);
	}
	if (__predict_true(!cold))
		mutex_exit(&pool_head_lock);
#endif

	if (palloc == NULL)
		palloc = &pool_allocator_kmem;
#ifdef POOL_SUBPAGE
	if (size > palloc->pa_pagesz) {
		if (palloc == &pool_allocator_kmem)
			palloc = &pool_allocator_kmem_fullpage;
		else if (palloc == &pool_allocator_nointr)
			palloc = &pool_allocator_nointr_fullpage;
	}
#endif /* POOL_SUBPAGE */
	if (!cold)
		mutex_enter(&pool_allocator_lock);
	if (palloc->pa_refcnt++ == 0) {
		if (palloc->pa_pagesz == 0)
			palloc->pa_pagesz = PAGE_SIZE;

		TAILQ_INIT(&palloc->pa_list);

		mutex_init(&palloc->pa_lock, MUTEX_DEFAULT, IPL_VM);
		palloc->pa_pagemask = ~(palloc->pa_pagesz - 1);
		palloc->pa_pageshift = ffs(palloc->pa_pagesz) - 1;
	}
	if (!cold)
		mutex_exit(&pool_allocator_lock);

	if (align == 0)
		align = ALIGN(1);

	prsize = size;
	if ((flags & PR_NOTOUCH) == 0 && prsize < sizeof(struct pool_item))
		prsize = sizeof(struct pool_item);

	prsize = roundup(prsize, align);
	KASSERTMSG((prsize <= palloc->pa_pagesz),
	    "%s: [%s] pool item size (%zu) larger than page size (%u)",
	    __func__, wchan, prsize, palloc->pa_pagesz);

	/*
	 * Initialize the pool structure.
	 */
	LIST_INIT(&pp->pr_emptypages);
	LIST_INIT(&pp->pr_fullpages);
	LIST_INIT(&pp->pr_partpages);
	pp->pr_cache = NULL;
	pp->pr_curpage = NULL;
	pp->pr_npages = 0;
	pp->pr_minitems = 0;
	pp->pr_minpages = 0;
	pp->pr_maxpages = UINT_MAX;
	pp->pr_roflags = flags;
	pp->pr_flags = 0;
	pp->pr_size = prsize;
	pp->pr_reqsize = size;
	pp->pr_align = align;
	pp->pr_wchan = wchan;
	pp->pr_alloc = palloc;
	pp->pr_nitems = 0;
	pp->pr_nout = 0;
	pp->pr_hardlimit = UINT_MAX;
	pp->pr_hardlimit_warning = NULL;
	pp->pr_hardlimit_ratecap.tv_sec = 0;
	pp->pr_hardlimit_ratecap.tv_usec = 0;
	pp->pr_hardlimit_warning_last.tv_sec = 0;
	pp->pr_hardlimit_warning_last.tv_usec = 0;
	pp->pr_drain_hook = NULL;
	pp->pr_drain_hook_arg = NULL;
	pp->pr_freecheck = NULL;
	pool_redzone_init(pp, size);

	/*
	 * Decide whether to put the page header off-page, to avoid wasting
	 * too large a part of the page or too big an item.  Off-page page
	 * headers go on a splay tree, so we can match a returned item with
	 * its header based on the page address.  We use 1/16 of the page
	 * size and about 8 times the item size as the threshold (XXX: tune)
	 *
	 * However, we'll put the header into the page if we can put
	 * it without wasting any items.
	 *
	 * Silently enforce `0 <= ioff < align'.
	 */
	pp->pr_itemoffset = ioff %= align;
	/* See the comment below about reserved bytes. */
	trysize = palloc->pa_pagesz - ((align - ioff) % align);
	phsize = ALIGN(sizeof(struct pool_item_header));
	if (pp->pr_roflags & PR_PHINPAGE ||
	    ((pp->pr_roflags & (PR_NOTOUCH | PR_NOALIGN)) == 0 &&
	    (pp->pr_size < MIN(palloc->pa_pagesz / 16, phsize << 3) ||
	    trysize / pp->pr_size == (trysize - phsize) / pp->pr_size))) {
		/* Use the end of the page for the page header */
		pp->pr_roflags |= PR_PHINPAGE;
		pp->pr_phoffset = off = palloc->pa_pagesz - phsize;
	} else {
		/* The page header will be taken from our page header pool */
		pp->pr_phoffset = 0;
		off = palloc->pa_pagesz;
		SPLAY_INIT(&pp->pr_phtree);
	}

	/*
	 * Alignment is to take place at `ioff' within the item. This means
	 * we must reserve up to `align - 1' bytes on the page to allow
	 * appropriate positioning of each item.
	 */
	pp->pr_itemsperpage = (off - ((align - ioff) % align)) / pp->pr_size;
	KASSERT(pp->pr_itemsperpage != 0);
	if ((pp->pr_roflags & PR_NOTOUCH)) {
		int idx;

		for (idx = 0; pp->pr_itemsperpage > PHPOOL_FREELIST_NELEM(idx);
		    idx++) {
			/* nothing */
		}
		if (idx >= PHPOOL_MAX) {
			/*
			 * if you see this panic, consider to tweak
			 * PHPOOL_MAX and PHPOOL_FREELIST_NELEM.
			 */
			panic("%s: [%s] too large itemsperpage(%d) for "
			    "PR_NOTOUCH", __func__,
			    pp->pr_wchan, pp->pr_itemsperpage);
		}
		pp->pr_phpool = &phpool[idx];
	} else if ((pp->pr_roflags & PR_PHINPAGE) == 0) {
		pp->pr_phpool = &phpool[0];
	}
#if defined(DIAGNOSTIC)
	else {
		pp->pr_phpool = NULL;
	}
#endif

	/*
	 * Use the slack between the chunks and the page header
	 * for "cache coloring".
	 */
	slack = off - pp->pr_itemsperpage * pp->pr_size;
	pp->pr_maxcolor = (slack / align) * align;
	pp->pr_curcolor = 0;

	pp->pr_nget = 0;
	pp->pr_nfail = 0;
	pp->pr_nput = 0;
	pp->pr_npagealloc = 0;
	pp->pr_npagefree = 0;
	pp->pr_hiwat = 0;
	pp->pr_nidle = 0;
	pp->pr_refcnt = 0;

	mutex_init(&pp->pr_lock, MUTEX_DEFAULT, ipl);
	cv_init(&pp->pr_cv, wchan);
	pp->pr_ipl = ipl;

	/* Insert into the list of all pools. */
	if (!cold)
		mutex_enter(&pool_head_lock);
	TAILQ_FOREACH(pp1, &pool_head, pr_poollist) {
		if (strcmp(pp1->pr_wchan, pp->pr_wchan) > 0)
			break;
	}
	if (pp1 == NULL)
		TAILQ_INSERT_TAIL(&pool_head, pp, pr_poollist);
	else
		TAILQ_INSERT_BEFORE(pp1, pp, pr_poollist);
	if (!cold)
		mutex_exit(&pool_head_lock);

	/* Insert this into the list of pools using this allocator. */
	if (!cold)
		mutex_enter(&palloc->pa_lock);
	TAILQ_INSERT_TAIL(&palloc->pa_list, pp, pr_alloc_list);
	if (!cold)
		mutex_exit(&palloc->pa_lock);
}
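
/*
 * Worked example of the layout computed above (illustrative figures,
 * assuming a 4096-byte page, 64-byte items, 8-byte alignment and, say, a
 * 56-byte aligned page header): the item size is below both pagesz/16
 * (256) and 8 * phsize (448), so the header goes in the page
 * (PR_PHINPAGE) and off = 4096 - 56 = 4040.  That yields
 * pr_itemsperpage = 4040 / 64 = 63, slack = 4040 - 63 * 64 = 8 and
 * therefore pr_maxcolor = 8, i.e. successive pages start their items at
 * offsets 0 and 8 to spread cache-line usage.
 */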

/*
 * De-commission a pool resource.
 */
void
pool_destroy(struct pool *pp)
{
	struct pool_pagelist pq;
	struct pool_item_header *ph;

	/* Remove from global pool list */
	mutex_enter(&pool_head_lock);
	while (pp->pr_refcnt != 0)
		cv_wait(&pool_busy, &pool_head_lock);
	TAILQ_REMOVE(&pool_head, pp, pr_poollist);
	if (drainpp == pp)
		drainpp = NULL;
	mutex_exit(&pool_head_lock);

	/* Remove this pool from its allocator's list of pools. */
	mutex_enter(&pp->pr_alloc->pa_lock);
	TAILQ_REMOVE(&pp->pr_alloc->pa_list, pp, pr_alloc_list);
	mutex_exit(&pp->pr_alloc->pa_lock);

	mutex_enter(&pool_allocator_lock);
	if (--pp->pr_alloc->pa_refcnt == 0)
		mutex_destroy(&pp->pr_alloc->pa_lock);
	mutex_exit(&pool_allocator_lock);

	mutex_enter(&pp->pr_lock);

	KASSERT(pp->pr_cache == NULL);
	KASSERTMSG((pp->pr_nout == 0),
	    "%s: pool busy: still out: %u", __func__, pp->pr_nout);
	KASSERT(LIST_EMPTY(&pp->pr_fullpages));
	KASSERT(LIST_EMPTY(&pp->pr_partpages));

	/* Remove all pages */
	LIST_INIT(&pq);
	while ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL)
		pr_rmpage(pp, ph, &pq);

	mutex_exit(&pp->pr_lock);

	pr_pagelist_free(pp, &pq);
	cv_destroy(&pp->pr_cv);
	mutex_destroy(&pp->pr_lock);
}

void
pool_set_drain_hook(struct pool *pp, void (*fn)(void *, int), void *arg)
{

	/* XXX no locking -- must be used just after pool_init() */
	KASSERTMSG((pp->pr_drain_hook == NULL),
	    "%s: [%s] already set", __func__, pp->pr_wchan);
	pp->pr_drain_hook = fn;
	pp->pr_drain_hook_arg = arg;
}

static struct pool_item_header *
pool_alloc_item_header(struct pool *pp, void *storage, int flags)
{
	struct pool_item_header *ph;

	if ((pp->pr_roflags & PR_PHINPAGE) != 0)
		ph = (void *)((char *)storage + pp->pr_phoffset);
	else
		ph = pool_get(pp->pr_phpool, flags);

	return (ph);
}

/*
 * Grab an item from the pool.
 */
void *
pool_get(struct pool *pp, int flags)
{
	struct pool_item *pi;
	struct pool_item_header *ph;
	void *v;

	KASSERT(!(flags & PR_NOWAIT) != !(flags & PR_WAITOK));
	KASSERTMSG((pp->pr_itemsperpage != 0),
	    "%s: [%s] pr_itemsperpage is zero, "
	    "pool not initialized?", __func__, pp->pr_wchan);
	KASSERTMSG((!(cpu_intr_p() || cpu_softintr_p())
	    || pp->pr_ipl != IPL_NONE || cold || panicstr != NULL),
	    "%s: [%s] is IPL_NONE, but called from interrupt context",
	    __func__, pp->pr_wchan);
	if (flags & PR_WAITOK) {
		ASSERT_SLEEPABLE();
	}

	mutex_enter(&pp->pr_lock);
 startover:
	/*
	 * Check to see if we've reached the hard limit.  If we have,
	 * and we can wait, then wait until an item has been returned to
	 * the pool.
	 */
	KASSERTMSG((pp->pr_nout <= pp->pr_hardlimit),
	    "%s: %s: crossed hard limit", __func__, pp->pr_wchan);
	if (__predict_false(pp->pr_nout == pp->pr_hardlimit)) {
		if (pp->pr_drain_hook != NULL) {
			/*
			 * Since the drain hook is going to free things
			 * back to the pool, unlock, call the hook, re-lock,
			 * and check the hardlimit condition again.
			 */
			mutex_exit(&pp->pr_lock);
			(*pp->pr_drain_hook)(pp->pr_drain_hook_arg, flags);
			mutex_enter(&pp->pr_lock);
			if (pp->pr_nout < pp->pr_hardlimit)
				goto startover;
		}

		if ((flags & PR_WAITOK) && !(flags & PR_LIMITFAIL)) {
			/*
			 * XXX: A warning isn't logged in this case.  Should
			 * it be?
			 */
			pp->pr_flags |= PR_WANTED;
			do {
				cv_wait(&pp->pr_cv, &pp->pr_lock);
			} while (pp->pr_flags & PR_WANTED);
			goto startover;
		}

		/*
		 * Log a message that the hard limit has been hit.
		 */
		if (pp->pr_hardlimit_warning != NULL &&
		    ratecheck(&pp->pr_hardlimit_warning_last,
		    &pp->pr_hardlimit_ratecap))
			log(LOG_ERR, "%s\n", pp->pr_hardlimit_warning);

		pp->pr_nfail++;

		mutex_exit(&pp->pr_lock);
		KASSERT((flags & (PR_NOWAIT|PR_LIMITFAIL)) != 0);
		return (NULL);
	}

	/*
	 * The convention we use is that if `curpage' is not NULL, then
	 * it points at a non-empty bucket. In particular, `curpage'
	 * never points at a page header which has PR_PHINPAGE set and
	 * has no items in its bucket.
	 */
	if ((ph = pp->pr_curpage) == NULL) {
		int error;

		KASSERTMSG((pp->pr_nitems == 0),
		    "%s: [%s] curpage NULL, inconsistent nitems %u",
		    __func__, pp->pr_wchan, pp->pr_nitems);

		/*
		 * Call the back-end page allocator for more memory.
		 * Release the pool lock, as the back-end page allocator
		 * may block.
		 */
		error = pool_grow(pp, flags);
		if (error != 0) {
			/*
			 * pool_grow aborts when another thread
			 * is allocating a new page. Retry if it
			 * waited for it.
			 */
			if (error == ERESTART)
				goto startover;

			/*
			 * We were unable to allocate a page or item
			 * header, but we released the lock during
			 * allocation, so perhaps items were freed
			 * back to the pool.  Check for this case.
			 */
			if (pp->pr_curpage != NULL)
				goto startover;

			pp->pr_nfail++;
			mutex_exit(&pp->pr_lock);
			KASSERT((flags & (PR_WAITOK|PR_NOWAIT)) == PR_NOWAIT);
			return (NULL);
		}

		/* Start the allocation process over. */
		goto startover;
	}
	if (pp->pr_roflags & PR_NOTOUCH) {
		KASSERTMSG((ph->ph_nmissing < pp->pr_itemsperpage),
		    "%s: %s: page empty", __func__, pp->pr_wchan);
		v = pr_item_notouch_get(pp, ph);
	} else {
		v = pi = LIST_FIRST(&ph->ph_itemlist);
		if (__predict_false(v == NULL)) {
			mutex_exit(&pp->pr_lock);
			panic("%s: [%s] page empty", __func__, pp->pr_wchan);
		}
		KASSERTMSG((pp->pr_nitems > 0),
		    "%s: [%s] nitems %u inconsistent on itemlist",
		    __func__, pp->pr_wchan, pp->pr_nitems);
#ifdef POOL_CHECK_MAGIC
		KASSERTMSG((pi->pi_magic == PI_MAGIC),
		    "%s: [%s] free list modified: "
		    "magic=%x; page %p; item addr %p", __func__,
		    pp->pr_wchan, pi->pi_magic, ph->ph_page, pi);
#endif

		/*
		 * Remove from item list.
		 */
		LIST_REMOVE(pi, pi_list);
	}
	pp->pr_nitems--;
	pp->pr_nout++;
	if (ph->ph_nmissing == 0) {
		KASSERT(pp->pr_nidle > 0);
		pp->pr_nidle--;

		/*
		 * This page was previously empty.  Move it to the list of
		 * partially-full pages.  This page is already curpage.
		 */
		LIST_REMOVE(ph, ph_pagelist);
		LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist);
	}
	ph->ph_nmissing++;
	if (ph->ph_nmissing == pp->pr_itemsperpage) {
		KASSERTMSG(((pp->pr_roflags & PR_NOTOUCH) ||
		    LIST_EMPTY(&ph->ph_itemlist)),
		    "%s: [%s] nmissing (%u) inconsistent", __func__,
		    pp->pr_wchan, ph->ph_nmissing);
		/*
		 * This page is now full.  Move it to the full list
		 * and select a new current page.
		 */
		LIST_REMOVE(ph, ph_pagelist);
		LIST_INSERT_HEAD(&pp->pr_fullpages, ph, ph_pagelist);
		pool_update_curpage(pp);
	}

	pp->pr_nget++;

	/*
	 * If we have a low water mark and we are now below that low
	 * water mark, add more items to the pool.
	 */
	if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
		/*
		 * XXX: Should we log a warning?  Should we set up a timeout
		 * to try again in a second or so?  The latter could break
		 * a caller's assumptions about interrupt protection, etc.
		 */
	}

	mutex_exit(&pp->pr_lock);
	KASSERT((((vaddr_t)v + pp->pr_itemoffset) & (pp->pr_align - 1)) == 0);
	FREECHECK_OUT(&pp->pr_freecheck, v);
	pool_redzone_fill(pp, v);
	if (flags & PR_ZERO)
		memset(v, 0, pp->pr_reqsize);
	else
		pool_kleak_fill(pp, v);
	return v;
}

/*
 * Internal version of pool_put(). Pool is already locked/entered.
 */
static void
pool_do_put(struct pool *pp, void *v, struct pool_pagelist *pq)
{
	struct pool_item *pi = v;
	struct pool_item_header *ph;

	KASSERT(mutex_owned(&pp->pr_lock));
	pool_redzone_check(pp, v);
	FREECHECK_IN(&pp->pr_freecheck, v);
	LOCKDEBUG_MEM_CHECK(v, pp->pr_size);

	KASSERTMSG((pp->pr_nout > 0),
	    "%s: [%s] putting with none out", __func__, pp->pr_wchan);

	if (__predict_false((ph = pr_find_pagehead(pp, v)) == NULL)) {
		panic("%s: [%s] page header missing", __func__, pp->pr_wchan);
	}

	/*
	 * Return to item list.
	 */
	if (pp->pr_roflags & PR_NOTOUCH) {
		pr_item_notouch_put(pp, ph, v);
	} else {
#ifdef POOL_CHECK_MAGIC
		pi->pi_magic = PI_MAGIC;
#endif

		if (pp->pr_redzone) {
			/*
			 * Mark the pool_item as valid. The rest is already
			 * invalid.
			 */
			kasan_mark(pi, sizeof(*pi), sizeof(*pi));
		}

		LIST_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list);
	}
	KDASSERT(ph->ph_nmissing != 0);
	ph->ph_nmissing--;
	pp->pr_nput++;
	pp->pr_nitems++;
	pp->pr_nout--;

	/* Cancel "pool empty" condition if it exists */
	if (pp->pr_curpage == NULL)
		pp->pr_curpage = ph;

	if (pp->pr_flags & PR_WANTED) {
		pp->pr_flags &= ~PR_WANTED;
		cv_broadcast(&pp->pr_cv);
	}

	/*
	 * If this page is now empty, do one of two things:
	 *
	 *	(1) If we have more pages than the page high water mark,
	 *	    free the page back to the system.  ONLY CONSIDER
	 *	    FREEING BACK A PAGE IF WE HAVE MORE THAN OUR MINIMUM PAGE
	 *	    CLAIM.
	 *
	 *	(2) Otherwise, move the page to the empty page list.
	 *
	 * Either way, select a new current page (so we use a partially-full
	 * page if one is available).
	 */
	if (ph->ph_nmissing == 0) {
		pp->pr_nidle++;
		if (pp->pr_npages > pp->pr_minpages &&
		    pp->pr_npages > pp->pr_maxpages) {
			pr_rmpage(pp, ph, pq);
		} else {
			LIST_REMOVE(ph, ph_pagelist);
			LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist);

			/*
			 * Update the timestamp on the page.  A page must
			 * be idle for some period of time before it can
			 * be reclaimed by the pagedaemon.  This minimizes
			 * ping-pong'ing for memory.
			 *
			 * note for 64-bit time_t: truncating to 32-bit is not
			 * a problem for our usage.
			 */
			ph->ph_time = time_uptime;
		}
		pool_update_curpage(pp);
	}

	/*
	 * If the page was previously completely full, move it to the
	 * partially-full list and make it the current page.  The next
	 * allocation will get the item from this page, instead of
	 * further fragmenting the pool.
	 */
	else if (ph->ph_nmissing == (pp->pr_itemsperpage - 1)) {
		LIST_REMOVE(ph, ph_pagelist);
		LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist);
		pp->pr_curpage = ph;
	}
}

void
pool_put(struct pool *pp, void *v)
{
	struct pool_pagelist pq;

	LIST_INIT(&pq);

	mutex_enter(&pp->pr_lock);
	pool_do_put(pp, v, &pq);
	mutex_exit(&pp->pr_lock);

	pr_pagelist_free(pp, &pq);
}

/*
 * pool_grow: grow a pool by a page.
 *
 * => called with pool locked.
 * => unlock and relock the pool.
 * => return with pool locked.
 */

static int
pool_grow(struct pool *pp, int flags)
{
	/*
	 * If there's a pool_grow in progress, wait for it to complete
	 * and try again from the top.
	 */
	if (pp->pr_flags & PR_GROWING) {
		if (flags & PR_WAITOK) {
			do {
				cv_wait(&pp->pr_cv, &pp->pr_lock);
			} while (pp->pr_flags & PR_GROWING);
			return ERESTART;
		} else {
			if (pp->pr_flags & PR_GROWINGNOWAIT) {
				/*
				 * This needs an unlock/relock dance so
				 * that the other caller has a chance to
				 * run and actually do the thing.  Note
				 * that this is effectively a busy-wait.
				 */
				mutex_exit(&pp->pr_lock);
				mutex_enter(&pp->pr_lock);
				return ERESTART;
			}
			return EWOULDBLOCK;
		}
	}
	pp->pr_flags |= PR_GROWING;
	if (flags & PR_WAITOK)
		mutex_exit(&pp->pr_lock);
	else
		pp->pr_flags |= PR_GROWINGNOWAIT;

	char *cp = pool_allocator_alloc(pp, flags);
	if (__predict_false(cp == NULL))
		goto out;

	struct pool_item_header *ph = pool_alloc_item_header(pp, cp, flags);
	if (__predict_false(ph == NULL)) {
		pool_allocator_free(pp, cp);
		goto out;
	}

	if (flags & PR_WAITOK)
		mutex_enter(&pp->pr_lock);
	pool_prime_page(pp, cp, ph);
	pp->pr_npagealloc++;
	KASSERT(pp->pr_flags & PR_GROWING);
	pp->pr_flags &= ~(PR_GROWING|PR_GROWINGNOWAIT);
	/*
	 * If anyone was waiting for pool_grow, notify them that we
	 * may have just done it.
	 */
	cv_broadcast(&pp->pr_cv);
	return 0;
out:
	if (flags & PR_WAITOK)
		mutex_enter(&pp->pr_lock);
	KASSERT(pp->pr_flags & PR_GROWING);
	pp->pr_flags &= ~(PR_GROWING|PR_GROWINGNOWAIT);
	return ENOMEM;
}

/*
 * Add N items to the pool.
 */
int
pool_prime(struct pool *pp, int n)
{
	int newpages;
	int error = 0;

	mutex_enter(&pp->pr_lock);

	newpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;

	while (newpages > 0) {
		error = pool_grow(pp, PR_NOWAIT);
		if (error) {
			if (error == ERESTART)
				continue;
			break;
		}
		pp->pr_minpages++;
		newpages--;
	}

	if (pp->pr_minpages >= pp->pr_maxpages)
		pp->pr_maxpages = pp->pr_minpages + 1;	/* XXX */

	mutex_exit(&pp->pr_lock);
	return error;
}

/*
 * Add a page worth of items to the pool.
 *
 * Note, we must be called with the pool descriptor LOCKED.
 */
static void
pool_prime_page(struct pool *pp, void *storage, struct pool_item_header *ph)
{
	struct pool_item *pi;
	void *cp = storage;
	const unsigned int align = pp->pr_align;
	const unsigned int ioff = pp->pr_itemoffset;
	int n;

	KASSERT(mutex_owned(&pp->pr_lock));
	KASSERTMSG(((pp->pr_roflags & PR_NOALIGN) ||
	    (((uintptr_t)cp & (pp->pr_alloc->pa_pagesz - 1)) == 0)),
	    "%s: [%s] unaligned page: %p", __func__, pp->pr_wchan, cp);

	/*
	 * Insert page header.
	 */
	LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist);
	LIST_INIT(&ph->ph_itemlist);
	ph->ph_page = storage;
	ph->ph_nmissing = 0;
	ph->ph_time = time_uptime;
	if ((pp->pr_roflags & PR_PHINPAGE) == 0)
		SPLAY_INSERT(phtree, &pp->pr_phtree, ph);

	pp->pr_nidle++;

	/*
	 * Color this page.
	 */
	ph->ph_off = pp->pr_curcolor;
	cp = (char *)cp + ph->ph_off;
	if ((pp->pr_curcolor += align) > pp->pr_maxcolor)
		pp->pr_curcolor = 0;

	/*
	 * Adjust storage to apply alignment to `pr_itemoffset' in each item.
	 */
	if (ioff != 0)
		cp = (char *)cp + align - ioff;

	KASSERT((((vaddr_t)cp + ioff) & (align - 1)) == 0);

	/*
	 * Insert remaining chunks on the bucket list.
	 */
	n = pp->pr_itemsperpage;
	pp->pr_nitems += n;

	if (pp->pr_roflags & PR_NOTOUCH) {
		pr_item_notouch_init(pp, ph);
	} else {
		while (n--) {
			pi = (struct pool_item *)cp;

			KASSERT(((((vaddr_t)pi) + ioff) & (align - 1)) == 0);

			/* Insert on page list */
			LIST_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list);
#ifdef POOL_CHECK_MAGIC
			pi->pi_magic = PI_MAGIC;
#endif
			cp = (char *)cp + pp->pr_size;

			KASSERT((((vaddr_t)cp + ioff) & (align - 1)) == 0);
		}
	}

	/*
	 * If the pool was depleted, point at the new page.
	 */
	if (pp->pr_curpage == NULL)
		pp->pr_curpage = ph;

	if (++pp->pr_npages > pp->pr_hiwat)
		pp->pr_hiwat = pp->pr_npages;
}

/*
 * Used by pool_get() when nitems drops below the low water mark.  This
 * is used to catch up pr_nitems with the low water mark.
 *
 * Note 1, we never wait for memory here, we let the caller decide what to do.
 *
 * Note 2, we must be called with the pool already locked, and we return
 * with it locked.
 */
static int
pool_catchup(struct pool *pp)
{
	int error = 0;

	while (POOL_NEEDS_CATCHUP(pp)) {
		error = pool_grow(pp, PR_NOWAIT);
		if (error) {
			if (error == ERESTART)
				continue;
			break;
		}
	}
	return error;
}

static void
pool_update_curpage(struct pool *pp)
{

	pp->pr_curpage = LIST_FIRST(&pp->pr_partpages);
	if (pp->pr_curpage == NULL) {
		pp->pr_curpage = LIST_FIRST(&pp->pr_emptypages);
	}
	KASSERT((pp->pr_curpage == NULL && pp->pr_nitems == 0) ||
	    (pp->pr_curpage != NULL && pp->pr_nitems > 0));
}

void
pool_setlowat(struct pool *pp, int n)
{

	mutex_enter(&pp->pr_lock);

	pp->pr_minitems = n;
	pp->pr_minpages = (n == 0)
	    ? 0
	    : roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;

	/* Make sure we're caught up with the newly-set low water mark. */
	if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
		/*
		 * XXX: Should we log a warning?  Should we set up a timeout
		 * to try again in a second or so?  The latter could break
		 * a caller's assumptions about interrupt protection, etc.
		 */
	}

	mutex_exit(&pp->pr_lock);
}

void
pool_sethiwat(struct pool *pp, int n)
{

	mutex_enter(&pp->pr_lock);

	pp->pr_maxpages = (n == 0)
	    ? 0
	    : roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;

	mutex_exit(&pp->pr_lock);
}

void
pool_sethardlimit(struct pool *pp, int n, const char *warnmess, int ratecap)
{

	mutex_enter(&pp->pr_lock);

	pp->pr_hardlimit = n;
	pp->pr_hardlimit_warning = warnmess;
	pp->pr_hardlimit_ratecap.tv_sec = ratecap;
	pp->pr_hardlimit_warning_last.tv_sec = 0;
	pp->pr_hardlimit_warning_last.tv_usec = 0;

	/*
	 * In-line version of pool_sethiwat(), because we don't want to
	 * release the lock.
	 */
	pp->pr_maxpages = (n == 0)
	    ? 0
	    : roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;

	mutex_exit(&pp->pr_lock);
}
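
/*
 * Illustrative note on the watermark setters above: both take a count of
 * items but are stored in pages.  With, say, 63 items per page,
 * pool_setlowat(pp, 128) sets pr_minpages to roundup(128, 63) / 63 = 3,
 * so pool_reclaim() below will not shrink the pool beneath three pages,
 * while pool_sethiwat(pp, 128) caps pr_maxpages at 3 and lets
 * pool_do_put() release a newly idle page once the pool holds more pages
 * than both pr_minpages and pr_maxpages.
 */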

/*
 * Release all complete pages that have not been used recently.
 *
 * Must not be called from interrupt context.
 */
int
pool_reclaim(struct pool *pp)
{
	struct pool_item_header *ph, *phnext;
	struct pool_pagelist pq;
	uint32_t curtime;
	bool klock;
	int rv;

	KASSERT(!cpu_intr_p() && !cpu_softintr_p());

	if (pp->pr_drain_hook != NULL) {
		/*
		 * The drain hook must be called with the pool unlocked.
		 */
		(*pp->pr_drain_hook)(pp->pr_drain_hook_arg, PR_NOWAIT);
	}

	/*
	 * XXXSMP Because we do not want to cause non-MPSAFE code
	 * to block.
	 */
	if (pp->pr_ipl == IPL_SOFTNET || pp->pr_ipl == IPL_SOFTCLOCK ||
	    pp->pr_ipl == IPL_SOFTSERIAL) {
		KERNEL_LOCK(1, NULL);
		klock = true;
	} else
		klock = false;

	/* Reclaim items from the pool's cache (if any). */
	if (pp->pr_cache != NULL)
		pool_cache_invalidate(pp->pr_cache);

	if (mutex_tryenter(&pp->pr_lock) == 0) {
		if (klock) {
			KERNEL_UNLOCK_ONE(NULL);
		}
		return (0);
	}

	LIST_INIT(&pq);

	curtime = time_uptime;

	for (ph = LIST_FIRST(&pp->pr_emptypages); ph != NULL; ph = phnext) {
		phnext = LIST_NEXT(ph, ph_pagelist);

		/* Check our minimum page claim */
		if (pp->pr_npages <= pp->pr_minpages)
			break;

		KASSERT(ph->ph_nmissing == 0);
		if (curtime - ph->ph_time < pool_inactive_time)
			continue;

		/*
		 * If freeing this page would put us below
		 * the low water mark, stop now.
		 */
		if ((pp->pr_nitems - pp->pr_itemsperpage) <
		    pp->pr_minitems)
			break;

		pr_rmpage(pp, ph, &pq);
	}

	mutex_exit(&pp->pr_lock);

	if (LIST_EMPTY(&pq))
		rv = 0;
	else {
		pr_pagelist_free(pp, &pq);
		rv = 1;
	}

	if (klock) {
		KERNEL_UNLOCK_ONE(NULL);
	}

	return (rv);
}

/*
 * Drain pools, one at a time. The drained pool is returned within ppp.
 *
 * Note, must never be called from interrupt context.
 */
bool
pool_drain(struct pool **ppp)
{
	bool reclaimed;
	struct pool *pp;

	KASSERT(!TAILQ_EMPTY(&pool_head));

	pp = NULL;

	/* Find next pool to drain, and add a reference. */
	mutex_enter(&pool_head_lock);
	do {
		if (drainpp == NULL) {
			drainpp = TAILQ_FIRST(&pool_head);
		}
		if (drainpp != NULL) {
			pp = drainpp;
			drainpp = TAILQ_NEXT(pp, pr_poollist);
		}
		/*
		 * Skip completely idle pools.  We depend on at least
		 * one pool in the system being active.
		 */
	} while (pp == NULL || pp->pr_npages == 0);
	pp->pr_refcnt++;
	mutex_exit(&pool_head_lock);

	/* Drain the cache (if any) and pool. */
	reclaimed = pool_reclaim(pp);

	/* Finally, unlock the pool. */
	mutex_enter(&pool_head_lock);
	pp->pr_refcnt--;
	cv_broadcast(&pool_busy);
	mutex_exit(&pool_head_lock);

	if (ppp != NULL)
		*ppp = pp;

	return reclaimed;
}

/*
 * Calculate the total number of pages consumed by pools.
 */
int
pool_totalpages(void)
{
	struct pool *pp;
	uint64_t total = 0;

	mutex_enter(&pool_head_lock);
	TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
		uint64_t bytes = pp->pr_npages * pp->pr_alloc->pa_pagesz;

		if ((pp->pr_roflags & PR_RECURSIVE) != 0)
			bytes -= (pp->pr_nout * pp->pr_size);
		total += bytes;
	}
	mutex_exit(&pool_head_lock);

	return atop(total);
}

/*
 * Diagnostic helpers.
 */

void
pool_printall(const char *modif, void (*pr)(const char *, ...))
{
	struct pool *pp;

	TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
		pool_printit(pp, modif, pr);
	}
}

void
pool_printit(struct pool *pp, const char *modif, void (*pr)(const char *, ...))
{

	if (pp == NULL) {
		(*pr)("Must specify a pool to print.\n");
		return;
	}

	pool_print1(pp, modif, pr);
}

static void
pool_print_pagelist(struct pool *pp, struct pool_pagelist *pl,
    void (*pr)(const char *, ...))
{
	struct pool_item_header *ph;

	LIST_FOREACH(ph, pl, ph_pagelist) {
		(*pr)("\t\tpage %p, nmissing %d, time %" PRIu32 "\n",
		    ph->ph_page, ph->ph_nmissing, ph->ph_time);
#ifdef POOL_CHECK_MAGIC
		struct pool_item *pi;
		if (!(pp->pr_roflags & PR_NOTOUCH)) {
			LIST_FOREACH(pi, &ph->ph_itemlist, pi_list) {
				if (pi->pi_magic != PI_MAGIC) {
					(*pr)("\t\t\titem %p, magic 0x%x\n",
					    pi, pi->pi_magic);
				}
			}
		}
#endif
	}
}

static void
pool_print1(struct pool *pp, const char *modif, void (*pr)(const char *, ...))
{
	struct pool_item_header *ph;
	pool_cache_t pc;
	pcg_t *pcg;
	pool_cache_cpu_t *cc;
	uint64_t cpuhit, cpumiss;
	int i, print_log = 0, print_pagelist = 0, print_cache = 0;
	char c;

	while ((c = *modif++) != '\0') {
		if (c == 'l')
			print_log = 1;
		if (c == 'p')
			print_pagelist = 1;
		if (c == 'c')
			print_cache = 1;
	}

	if ((pc = pp->pr_cache) != NULL) {
		(*pr)("POOL CACHE");
	} else {
		(*pr)("POOL");
	}

	(*pr)(" %s: size %u, align %u, ioff %u, roflags 0x%08x\n",
	    pp->pr_wchan, pp->pr_size, pp->pr_align, pp->pr_itemoffset,
	    pp->pr_roflags);
	(*pr)("\talloc %p\n", pp->pr_alloc);
	(*pr)("\tminitems %u, minpages %u, maxpages %u, npages %u\n",
	    pp->pr_minitems, pp->pr_minpages, pp->pr_maxpages, pp->pr_npages);
	(*pr)("\titemsperpage %u, nitems %u, nout %u, hardlimit %u\n",
	    pp->pr_itemsperpage, pp->pr_nitems, pp->pr_nout, pp->pr_hardlimit);

	(*pr)("\tnget %lu, nfail %lu, nput %lu\n",
	    pp->pr_nget, pp->pr_nfail, pp->pr_nput);
	(*pr)("\tnpagealloc %lu, npagefree %lu, hiwat %u, nidle %lu\n",
	    pp->pr_npagealloc, pp->pr_npagefree, pp->pr_hiwat, pp->pr_nidle);

	if (print_pagelist == 0)
		goto skip_pagelist;

	if ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL)
		(*pr)("\n\tempty page list:\n");
	pool_print_pagelist(pp, &pp->pr_emptypages, pr);
	if ((ph = LIST_FIRST(&pp->pr_fullpages)) != NULL)
		(*pr)("\n\tfull page list:\n");
	pool_print_pagelist(pp, &pp->pr_fullpages, pr);
	if ((ph = LIST_FIRST(&pp->pr_partpages)) != NULL)
		(*pr)("\n\tpartial-page list:\n");
	pool_print_pagelist(pp, &pp->pr_partpages, pr);

	if (pp->pr_curpage == NULL)
		(*pr)("\tno current page\n");
	else
		(*pr)("\tcurpage %p\n", pp->pr_curpage->ph_page);

 skip_pagelist:
	if (print_log == 0)
		goto skip_log;

	(*pr)("\n");

 skip_log:

#define PR_GROUPLIST(pcg)						\
	(*pr)("\t\tgroup %p: avail %d\n", pcg, pcg->pcg_avail);		\
	for (i = 0; i < pcg->pcg_size; i++) {				\
		if (pcg->pcg_objects[i].pcgo_pa !=			\
		    POOL_PADDR_INVALID) {				\
			(*pr)("\t\t\t%p, 0x%llx\n",			\
			    pcg->pcg_objects[i].pcgo_va,		\
			    (unsigned long long)			\
			    pcg->pcg_objects[i].pcgo_pa);		\
		} else {						\
			(*pr)("\t\t\t%p\n",				\
			    pcg->pcg_objects[i].pcgo_va);		\
		}							\
	}

	if (pc != NULL) {
		cpuhit = 0;
		cpumiss = 0;
		for (i = 0; i < __arraycount(pc->pc_cpus); i++) {
			if ((cc = pc->pc_cpus[i]) == NULL)
				continue;
			cpuhit += cc->cc_hits;
			cpumiss += cc->cc_misses;
		}
		(*pr)("\tcpu layer hits %llu misses %llu\n", cpuhit, cpumiss);
		(*pr)("\tcache layer hits %llu misses %llu\n",
		    pc->pc_hits, pc->pc_misses);
		(*pr)("\tcache layer entry uncontended %llu contended %llu\n",
		    pc->pc_hits + pc->pc_misses - pc->pc_contended,
		    pc->pc_contended);
		(*pr)("\tcache layer empty groups %u full groups %u\n",
		    pc->pc_nempty, pc->pc_nfull);
		if (print_cache) {
			(*pr)("\tfull cache groups:\n");
			for (pcg = pc->pc_fullgroups; pcg != NULL;
			    pcg = pcg->pcg_next) {
				PR_GROUPLIST(pcg);
			}
			(*pr)("\tempty cache groups:\n");
			for (pcg = pc->pc_emptygroups; pcg != NULL;
			    pcg = pcg->pcg_next) {
				PR_GROUPLIST(pcg);
			}
		}
	}
#undef PR_GROUPLIST
}

static int
pool_chk_page(struct pool *pp, const char *label, struct pool_item_header *ph)
{
	struct pool_item *pi;
	void *page;
	int n;

	if ((pp->pr_roflags & PR_NOALIGN) == 0) {
		page = (void *)((uintptr_t)ph & pp->pr_alloc->pa_pagemask);
		if (page != ph->ph_page &&
		    (pp->pr_roflags & PR_PHINPAGE) != 0) {
			if (label != NULL)
				printf("%s: ", label);
			printf("pool(%p:%s): page inconsistency: page %p;"
			    " at page head addr %p (p %p)\n", pp,
			    pp->pr_wchan, ph->ph_page,
			    ph, page);
			return 1;
		}
	}

	if ((pp->pr_roflags & PR_NOTOUCH) != 0)
		return 0;

	for (pi = LIST_FIRST(&ph->ph_itemlist), n = 0;
	     pi != NULL;
	     pi = LIST_NEXT(pi,pi_list), n++) {

#ifdef POOL_CHECK_MAGIC
		if (pi->pi_magic != PI_MAGIC) {
			if (label != NULL)
				printf("%s: ", label);
			printf("pool(%s): free list modified: magic=%x;"
			    " page %p; item ordinal %d; addr %p\n",
			    pp->pr_wchan, pi->pi_magic, ph->ph_page,
			    n, pi);
			panic("pool");
		}
#endif
		if ((pp->pr_roflags & PR_NOALIGN) != 0) {
			continue;
		}
		page = (void *)((uintptr_t)pi & pp->pr_alloc->pa_pagemask);
		if (page == ph->ph_page)
			continue;

		if (label != NULL)
			printf("%s: ", label);
		printf("pool(%p:%s): page inconsistency: page %p;"
		    " item ordinal %d; addr %p (p %p)\n", pp,
		    pp->pr_wchan, ph->ph_page,
		    n, pi, page);
		return 1;
	}
	return 0;
}


int
pool_chk(struct pool *pp, const char *label)
{
	struct pool_item_header *ph;
	int r = 0;

	mutex_enter(&pp->pr_lock);
	LIST_FOREACH(ph, &pp->pr_emptypages, ph_pagelist) {
		r = pool_chk_page(pp, label, ph);
		if (r) {
			goto out;
		}
	}
	LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) {
		r = pool_chk_page(pp, label, ph);
		if (r) {
			goto out;
		}
	}
	LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist) {
		r = pool_chk_page(pp, label, ph);
		if (r) {
			goto out;
		}
	}

out:
	mutex_exit(&pp->pr_lock);
	return (r);
}

/*
 * pool_cache_init:
 *
 *	Initialize a pool cache.
 */
pool_cache_t
pool_cache_init(size_t size, u_int align, u_int align_offset, u_int flags,
    const char *wchan, struct pool_allocator *palloc, int ipl,
    int (*ctor)(void *, void *, int), void (*dtor)(void *, void *), void *arg)
{
	pool_cache_t pc;

	pc = pool_get(&cache_pool, PR_WAITOK);
	if (pc == NULL)
		return NULL;

	pool_cache_bootstrap(pc, size, align, align_offset, flags, wchan,
	    palloc, ipl, ctor, dtor, arg);

	return pc;
}

/*
 * pool_cache_bootstrap:
 *
 *	Kernel-private version of pool_cache_init().  The caller
 *	provides initial storage.
 */
void
pool_cache_bootstrap(pool_cache_t pc, size_t size, u_int align,
    u_int align_offset, u_int flags, const char *wchan,
    struct pool_allocator *palloc, int ipl,
    int (*ctor)(void *, void *, int), void (*dtor)(void *, void *),
    void *arg)
{
	CPU_INFO_ITERATOR cii;
	pool_cache_t pc1;
	struct cpu_info *ci;
	struct pool *pp;

	pp = &pc->pc_pool;
	if (palloc == NULL && ipl == IPL_NONE) {
		if (size > PAGE_SIZE) {
			int bigidx = pool_bigidx(size);

			palloc = &pool_allocator_big[bigidx];
		} else
			palloc = &pool_allocator_nointr;
	}
	pool_init(pp, size, align, align_offset, flags, wchan, palloc, ipl);
	mutex_init(&pc->pc_lock, MUTEX_DEFAULT, ipl);

	if (ctor == NULL) {
		ctor = (int (*)(void *, void *, int))nullop;
	}
	if (dtor == NULL) {
		dtor = (void (*)(void *, void *))nullop;
	}

	pc->pc_emptygroups = NULL;
	pc->pc_fullgroups = NULL;
	pc->pc_partgroups = NULL;
	pc->pc_ctor = ctor;
	pc->pc_dtor = dtor;
	pc->pc_arg = arg;
	pc->pc_hits = 0;
	pc->pc_misses = 0;
	pc->pc_nempty = 0;
	pc->pc_npart = 0;
	pc->pc_nfull = 0;
	pc->pc_contended = 0;
	pc->pc_refcnt = 0;
	pc->pc_freecheck = NULL;

	if ((flags & PR_LARGECACHE) != 0) {
		pc->pc_pcgsize = PCG_NOBJECTS_LARGE;
		pc->pc_pcgpool = &pcg_large_pool;
	} else {
		pc->pc_pcgsize = PCG_NOBJECTS_NORMAL;
		pc->pc_pcgpool = &pcg_normal_pool;
	}

	/* Allocate per-CPU caches. */
	memset(pc->pc_cpus, 0, sizeof(pc->pc_cpus));
	pc->pc_ncpu = 0;
	if (ncpu < 2) {
		/* XXX For sparc: boot CPU is not attached yet. */
		pool_cache_cpu_init1(curcpu(), pc);
	} else {
		for (CPU_INFO_FOREACH(cii, ci)) {
			pool_cache_cpu_init1(ci, pc);
		}
	}

	/* Add to list of all pools. */
	if (__predict_true(!cold))
		mutex_enter(&pool_head_lock);
	TAILQ_FOREACH(pc1, &pool_cache_head, pc_cachelist) {
		if (strcmp(pc1->pc_pool.pr_wchan, pc->pc_pool.pr_wchan) > 0)
			break;
	}
	if (pc1 == NULL)
		TAILQ_INSERT_TAIL(&pool_cache_head, pc, pc_cachelist);
	else
		TAILQ_INSERT_BEFORE(pc1, pc, pc_cachelist);
	if (__predict_true(!cold))
		mutex_exit(&pool_head_lock);

	membar_sync();
	pp->pr_cache = pc;
}

/*
 * pool_cache_destroy:
 *
 *	Destroy a pool cache.
 */
void
pool_cache_destroy(pool_cache_t pc)
{

	pool_cache_bootstrap_destroy(pc);
	pool_put(&cache_pool, pc);
}

/*
 * pool_cache_bootstrap_destroy:
 *
 *	Destroy a pool cache.
 */
void
pool_cache_bootstrap_destroy(pool_cache_t pc)
{
	struct pool *pp = &pc->pc_pool;
	u_int i;

	/* Remove it from the global list. */
	mutex_enter(&pool_head_lock);
	while (pc->pc_refcnt != 0)
		cv_wait(&pool_busy, &pool_head_lock);
	TAILQ_REMOVE(&pool_cache_head, pc, pc_cachelist);
	mutex_exit(&pool_head_lock);

	/* First, invalidate the entire cache. */
	pool_cache_invalidate(pc);

	/* Disassociate it from the pool. */
	mutex_enter(&pp->pr_lock);
	pp->pr_cache = NULL;
	mutex_exit(&pp->pr_lock);

	/* Destroy per-CPU data */
	for (i = 0; i < __arraycount(pc->pc_cpus); i++)
		pool_cache_invalidate_cpu(pc, i);

	/* Finally, destroy it. */
	mutex_destroy(&pc->pc_lock);
	pool_destroy(pp);
}

/*
 * pool_cache_cpu_init1:
 *
 *	Called for each pool_cache whenever a new CPU is attached.
 */
static void
pool_cache_cpu_init1(struct cpu_info *ci, pool_cache_t pc)
{
	pool_cache_cpu_t *cc;
	int index;

	index = ci->ci_index;

	KASSERT(index < __arraycount(pc->pc_cpus));

	if ((cc = pc->pc_cpus[index]) != NULL) {
		KASSERT(cc->cc_cpuindex == index);
		return;
	}

	/*
	 * The first CPU is 'free'.  This needs to be the case for
	 * bootstrap - we may not be able to allocate yet.
	 */
	if (pc->pc_ncpu == 0) {
		cc = &pc->pc_cpu0;
		pc->pc_ncpu = 1;
	} else {
		mutex_enter(&pc->pc_lock);
		pc->pc_ncpu++;
		mutex_exit(&pc->pc_lock);
		cc = pool_get(&cache_cpu_pool, PR_WAITOK);
	}

	cc->cc_ipl = pc->pc_pool.pr_ipl;
	cc->cc_iplcookie = makeiplcookie(cc->cc_ipl);
	cc->cc_cache = pc;
	cc->cc_cpuindex = index;
	cc->cc_hits = 0;
	cc->cc_misses = 0;
	cc->cc_current = __UNCONST(&pcg_dummy);
	cc->cc_previous = __UNCONST(&pcg_dummy);

	pc->pc_cpus[index] = cc;
}

/*
 * pool_cache_cpu_init:
 *
 *	Called whenever a new CPU is attached.
 */
void
pool_cache_cpu_init(struct cpu_info *ci)
{
	pool_cache_t pc;

	mutex_enter(&pool_head_lock);
	TAILQ_FOREACH(pc, &pool_cache_head, pc_cachelist) {
		pc->pc_refcnt++;
		mutex_exit(&pool_head_lock);

		pool_cache_cpu_init1(ci, pc);

		mutex_enter(&pool_head_lock);
		pc->pc_refcnt--;
		cv_broadcast(&pool_busy);
	}
	mutex_exit(&pool_head_lock);
}

/*
 * pool_cache_reclaim:
 *
 *	Reclaim memory from a pool cache.
 */
bool
pool_cache_reclaim(pool_cache_t pc)
{

	return pool_reclaim(&pc->pc_pool);
}

static void
pool_cache_destruct_object1(pool_cache_t pc, void *object)
{
	if (pc->pc_pool.pr_redzone) {
		/*
		 * The object is marked as invalid. Temporarily mark it as
		 * valid for the destructor. pool_put below will re-mark it
		 * as invalid.
		 */
		kasan_mark(object, pc->pc_pool.pr_reqsize,
		    pc->pc_pool.pr_reqsize_with_redzone);
	}

	(*pc->pc_dtor)(pc->pc_arg, object);
	pool_put(&pc->pc_pool, object);
}

/*
 * pool_cache_destruct_object:
 *
 *	Force destruction of an object and its release back into
 *	the pool.
 */
void
pool_cache_destruct_object(pool_cache_t pc, void *object)
{

	FREECHECK_IN(&pc->pc_freecheck, object);

	pool_cache_destruct_object1(pc, object);
}

/*
 * pool_cache_invalidate_groups:
 *
 *	Invalidate a chain of groups and destruct all objects.
2058 */ 2059 static void 2060 pool_cache_invalidate_groups(pool_cache_t pc, pcg_t *pcg) 2061 { 2062 void *object; 2063 pcg_t *next; 2064 int i; 2065 2066 for (; pcg != NULL; pcg = next) { 2067 next = pcg->pcg_next; 2068 2069 for (i = 0; i < pcg->pcg_avail; i++) { 2070 object = pcg->pcg_objects[i].pcgo_va; 2071 pool_cache_destruct_object1(pc, object); 2072 } 2073 2074 if (pcg->pcg_size == PCG_NOBJECTS_LARGE) { 2075 pool_put(&pcg_large_pool, pcg); 2076 } else { 2077 KASSERT(pcg->pcg_size == PCG_NOBJECTS_NORMAL); 2078 pool_put(&pcg_normal_pool, pcg); 2079 } 2080 } 2081 } 2082 2083 /* 2084 * pool_cache_invalidate: 2085 * 2086 * Invalidate a pool cache (destruct and release all of the 2087 * cached objects). Does not reclaim objects from the pool. 2088 * 2089 * Note: For pool caches that provide constructed objects, there 2090 * is an assumption that another level of synchronization is occurring 2091 * between the input to the constructor and the cache invalidation. 2092 * 2093 * Invalidation is a costly process and should not be called from 2094 * interrupt context. 2095 */ 2096 void 2097 pool_cache_invalidate(pool_cache_t pc) 2098 { 2099 uint64_t where; 2100 pcg_t *full, *empty, *part; 2101 2102 KASSERT(!cpu_intr_p() && !cpu_softintr_p()); 2103 2104 if (ncpu < 2 || !mp_online) { 2105 /* 2106 * We might be called early enough in the boot process 2107 * for the CPU data structures to not be fully initialized. 2108 * In this case, transfer the content of the local CPU's 2109 * cache back into global cache as only this CPU is currently 2110 * running. 2111 */ 2112 pool_cache_transfer(pc); 2113 } else { 2114 /* 2115 * Signal all CPUs that they must transfer their local 2116 * cache back to the global pool then wait for the xcall to 2117 * complete. 2118 */ 2119 where = xc_broadcast(0, (xcfunc_t)pool_cache_transfer, 2120 pc, NULL); 2121 xc_wait(where); 2122 } 2123 2124 /* Empty pool caches, then invalidate objects */ 2125 mutex_enter(&pc->pc_lock); 2126 full = pc->pc_fullgroups; 2127 empty = pc->pc_emptygroups; 2128 part = pc->pc_partgroups; 2129 pc->pc_fullgroups = NULL; 2130 pc->pc_emptygroups = NULL; 2131 pc->pc_partgroups = NULL; 2132 pc->pc_nfull = 0; 2133 pc->pc_nempty = 0; 2134 pc->pc_npart = 0; 2135 mutex_exit(&pc->pc_lock); 2136 2137 pool_cache_invalidate_groups(pc, full); 2138 pool_cache_invalidate_groups(pc, empty); 2139 pool_cache_invalidate_groups(pc, part); 2140 } 2141 2142 /* 2143 * pool_cache_invalidate_cpu: 2144 * 2145 * Invalidate all CPU-bound cached objects in pool cache, the CPU being 2146 * identified by its associated index. 2147 * It is caller's responsibility to ensure that no operation is 2148 * taking place on this pool cache while doing this invalidation. 2149 * WARNING: as no inter-CPU locking is enforced, trying to invalidate 2150 * pool cached objects from a CPU different from the one currently running 2151 * may result in an undefined behaviour. 
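 * Within this file it is called from pool_cache_bootstrap_destroy(),
 * once for each per-CPU slot, after the cache as a whole has been
 * invalidated.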
2152 */ 2153 static void 2154 pool_cache_invalidate_cpu(pool_cache_t pc, u_int index) 2155 { 2156 pool_cache_cpu_t *cc; 2157 pcg_t *pcg; 2158 2159 if ((cc = pc->pc_cpus[index]) == NULL) 2160 return; 2161 2162 if ((pcg = cc->cc_current) != &pcg_dummy) { 2163 pcg->pcg_next = NULL; 2164 pool_cache_invalidate_groups(pc, pcg); 2165 } 2166 if ((pcg = cc->cc_previous) != &pcg_dummy) { 2167 pcg->pcg_next = NULL; 2168 pool_cache_invalidate_groups(pc, pcg); 2169 } 2170 if (cc != &pc->pc_cpu0) 2171 pool_put(&cache_cpu_pool, cc); 2172 2173 } 2174 2175 void 2176 pool_cache_set_drain_hook(pool_cache_t pc, void (*fn)(void *, int), void *arg) 2177 { 2178 2179 pool_set_drain_hook(&pc->pc_pool, fn, arg); 2180 } 2181 2182 void 2183 pool_cache_setlowat(pool_cache_t pc, int n) 2184 { 2185 2186 pool_setlowat(&pc->pc_pool, n); 2187 } 2188 2189 void 2190 pool_cache_sethiwat(pool_cache_t pc, int n) 2191 { 2192 2193 pool_sethiwat(&pc->pc_pool, n); 2194 } 2195 2196 void 2197 pool_cache_sethardlimit(pool_cache_t pc, int n, const char *warnmess, int ratecap) 2198 { 2199 2200 pool_sethardlimit(&pc->pc_pool, n, warnmess, ratecap); 2201 } 2202 2203 static bool __noinline 2204 pool_cache_get_slow(pool_cache_cpu_t *cc, int s, void **objectp, 2205 paddr_t *pap, int flags) 2206 { 2207 pcg_t *pcg, *cur; 2208 uint64_t ncsw; 2209 pool_cache_t pc; 2210 void *object; 2211 2212 KASSERT(cc->cc_current->pcg_avail == 0); 2213 KASSERT(cc->cc_previous->pcg_avail == 0); 2214 2215 pc = cc->cc_cache; 2216 cc->cc_misses++; 2217 2218 /* 2219 * Nothing was available locally. Try and grab a group 2220 * from the cache. 2221 */ 2222 if (__predict_false(!mutex_tryenter(&pc->pc_lock))) { 2223 ncsw = curlwp->l_ncsw; 2224 mutex_enter(&pc->pc_lock); 2225 pc->pc_contended++; 2226 2227 /* 2228 * If we context switched while locking, then 2229 * our view of the per-CPU data is invalid: 2230 * retry. 2231 */ 2232 if (curlwp->l_ncsw != ncsw) { 2233 mutex_exit(&pc->pc_lock); 2234 return true; 2235 } 2236 } 2237 2238 if (__predict_true((pcg = pc->pc_fullgroups) != NULL)) { 2239 /* 2240 * If there's a full group, release our empty 2241 * group back to the cache. Install the full 2242 * group as cc_current and return. 2243 */ 2244 if (__predict_true((cur = cc->cc_current) != &pcg_dummy)) { 2245 KASSERT(cur->pcg_avail == 0); 2246 cur->pcg_next = pc->pc_emptygroups; 2247 pc->pc_emptygroups = cur; 2248 pc->pc_nempty++; 2249 } 2250 KASSERT(pcg->pcg_avail == pcg->pcg_size); 2251 cc->cc_current = pcg; 2252 pc->pc_fullgroups = pcg->pcg_next; 2253 pc->pc_hits++; 2254 pc->pc_nfull--; 2255 mutex_exit(&pc->pc_lock); 2256 return true; 2257 } 2258 2259 /* 2260 * Nothing available locally or in cache. Take the slow 2261 * path: fetch a new object from the pool and construct 2262 * it. 
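 * Every path below hands the result back through *objectp and returns
 * false, which tells pool_cache_get_paddr() to stop retrying.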
2263 */ 2264 pc->pc_misses++; 2265 mutex_exit(&pc->pc_lock); 2266 splx(s); 2267 2268 object = pool_get(&pc->pc_pool, flags); 2269 *objectp = object; 2270 if (__predict_false(object == NULL)) { 2271 KASSERT((flags & (PR_WAITOK|PR_NOWAIT)) == PR_NOWAIT); 2272 return false; 2273 } 2274 2275 if (__predict_false((*pc->pc_ctor)(pc->pc_arg, object, flags) != 0)) { 2276 pool_put(&pc->pc_pool, object); 2277 *objectp = NULL; 2278 return false; 2279 } 2280 2281 KASSERT((((vaddr_t)object + pc->pc_pool.pr_itemoffset) & 2282 (pc->pc_pool.pr_align - 1)) == 0); 2283 2284 if (pap != NULL) { 2285 #ifdef POOL_VTOPHYS 2286 *pap = POOL_VTOPHYS(object); 2287 #else 2288 *pap = POOL_PADDR_INVALID; 2289 #endif 2290 } 2291 2292 FREECHECK_OUT(&pc->pc_freecheck, object); 2293 pool_redzone_fill(&pc->pc_pool, object); 2294 pool_cache_kleak_fill(pc, object); 2295 return false; 2296 } 2297 2298 /* 2299 * pool_cache_get{,_paddr}: 2300 * 2301 * Get an object from a pool cache (optionally returning 2302 * the physical address of the object). 2303 */ 2304 void * 2305 pool_cache_get_paddr(pool_cache_t pc, int flags, paddr_t *pap) 2306 { 2307 pool_cache_cpu_t *cc; 2308 pcg_t *pcg; 2309 void *object; 2310 int s; 2311 2312 KASSERT(!(flags & PR_NOWAIT) != !(flags & PR_WAITOK)); 2313 KASSERTMSG((!cpu_intr_p() && !cpu_softintr_p()) || 2314 (pc->pc_pool.pr_ipl != IPL_NONE || cold || panicstr != NULL), 2315 "%s: [%s] is IPL_NONE, but called from interrupt context", 2316 __func__, pc->pc_pool.pr_wchan); 2317 2318 if (flags & PR_WAITOK) { 2319 ASSERT_SLEEPABLE(); 2320 } 2321 2322 /* Lock out interrupts and disable preemption. */ 2323 s = splvm(); 2324 while (/* CONSTCOND */ true) { 2325 /* Try and allocate an object from the current group. */ 2326 cc = pc->pc_cpus[curcpu()->ci_index]; 2327 KASSERT(cc->cc_cache == pc); 2328 pcg = cc->cc_current; 2329 if (__predict_true(pcg->pcg_avail > 0)) { 2330 object = pcg->pcg_objects[--pcg->pcg_avail].pcgo_va; 2331 if (__predict_false(pap != NULL)) 2332 *pap = pcg->pcg_objects[pcg->pcg_avail].pcgo_pa; 2333 #if defined(DIAGNOSTIC) 2334 pcg->pcg_objects[pcg->pcg_avail].pcgo_va = NULL; 2335 KASSERT(pcg->pcg_avail < pcg->pcg_size); 2336 KASSERT(object != NULL); 2337 #endif 2338 cc->cc_hits++; 2339 splx(s); 2340 FREECHECK_OUT(&pc->pc_freecheck, object); 2341 pool_redzone_fill(&pc->pc_pool, object); 2342 pool_cache_kleak_fill(pc, object); 2343 return object; 2344 } 2345 2346 /* 2347 * That failed. If the previous group isn't empty, swap 2348 * it with the current group and allocate from there. 2349 */ 2350 pcg = cc->cc_previous; 2351 if (__predict_true(pcg->pcg_avail > 0)) { 2352 cc->cc_previous = cc->cc_current; 2353 cc->cc_current = pcg; 2354 continue; 2355 } 2356 2357 /* 2358 * Can't allocate from either group: try the slow path. 2359 * If get_slow() allocated an object for us, or if 2360 * no more objects are available, it will return false. 2361 * Otherwise, we need to retry. 2362 */ 2363 if (!pool_cache_get_slow(cc, s, &object, pap, flags)) 2364 break; 2365 } 2366 2367 /* 2368 * We would like to KASSERT(object || (flags & PR_NOWAIT)), but 2369 * pool_cache_get can fail even in the PR_WAITOK case, if the 2370 * constructor fails. 
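 *
 * Callers must therefore check for a NULL return even when they pass
 * PR_WAITOK. A minimal illustrative sketch (the names foo and foo_cache
 * are hypothetical, not part of this file):
 *
 *	struct foo *f = pool_cache_get(foo_cache, PR_WAITOK);
 *	if (f == NULL)
 *		return ENOMEM;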
2371 */ 2372 return object; 2373 } 2374 2375 static bool __noinline 2376 pool_cache_put_slow(pool_cache_cpu_t *cc, int s, void *object) 2377 { 2378 struct lwp *l = curlwp; 2379 pcg_t *pcg, *cur; 2380 uint64_t ncsw; 2381 pool_cache_t pc; 2382 2383 KASSERT(cc->cc_current->pcg_avail == cc->cc_current->pcg_size); 2384 KASSERT(cc->cc_previous->pcg_avail == cc->cc_previous->pcg_size); 2385 2386 pc = cc->cc_cache; 2387 pcg = NULL; 2388 cc->cc_misses++; 2389 ncsw = l->l_ncsw; 2390 2391 /* 2392 * If there are no empty groups in the cache then allocate one 2393 * while still unlocked. 2394 */ 2395 if (__predict_false(pc->pc_emptygroups == NULL)) { 2396 if (__predict_true(!pool_cache_disable)) { 2397 pcg = pool_get(pc->pc_pcgpool, PR_NOWAIT); 2398 } 2399 /* 2400 * If pool_get() blocked, then our view of 2401 * the per-CPU data is invalid: retry. 2402 */ 2403 if (__predict_false(l->l_ncsw != ncsw)) { 2404 if (pcg != NULL) { 2405 pool_put(pc->pc_pcgpool, pcg); 2406 } 2407 return true; 2408 } 2409 if (__predict_true(pcg != NULL)) { 2410 pcg->pcg_avail = 0; 2411 pcg->pcg_size = pc->pc_pcgsize; 2412 } 2413 } 2414 2415 /* Lock the cache. */ 2416 if (__predict_false(!mutex_tryenter(&pc->pc_lock))) { 2417 mutex_enter(&pc->pc_lock); 2418 pc->pc_contended++; 2419 2420 /* 2421 * If we context switched while locking, then our view of 2422 * the per-CPU data is invalid: retry. 2423 */ 2424 if (__predict_false(l->l_ncsw != ncsw)) { 2425 mutex_exit(&pc->pc_lock); 2426 if (pcg != NULL) { 2427 pool_put(pc->pc_pcgpool, pcg); 2428 } 2429 return true; 2430 } 2431 } 2432 2433 /* If there are no empty groups in the cache then allocate one. */ 2434 if (pcg == NULL && pc->pc_emptygroups != NULL) { 2435 pcg = pc->pc_emptygroups; 2436 pc->pc_emptygroups = pcg->pcg_next; 2437 pc->pc_nempty--; 2438 } 2439 2440 /* 2441 * If there's a empty group, release our full group back 2442 * to the cache. Install the empty group to the local CPU 2443 * and return. 2444 */ 2445 if (pcg != NULL) { 2446 KASSERT(pcg->pcg_avail == 0); 2447 if (__predict_false(cc->cc_previous == &pcg_dummy)) { 2448 cc->cc_previous = pcg; 2449 } else { 2450 cur = cc->cc_current; 2451 if (__predict_true(cur != &pcg_dummy)) { 2452 KASSERT(cur->pcg_avail == cur->pcg_size); 2453 cur->pcg_next = pc->pc_fullgroups; 2454 pc->pc_fullgroups = cur; 2455 pc->pc_nfull++; 2456 } 2457 cc->cc_current = pcg; 2458 } 2459 pc->pc_hits++; 2460 mutex_exit(&pc->pc_lock); 2461 return true; 2462 } 2463 2464 /* 2465 * Nothing available locally or in cache, and we didn't 2466 * allocate an empty group. Take the slow path and destroy 2467 * the object here and now. 2468 */ 2469 pc->pc_misses++; 2470 mutex_exit(&pc->pc_lock); 2471 splx(s); 2472 pool_cache_destruct_object(pc, object); 2473 2474 return false; 2475 } 2476 2477 /* 2478 * pool_cache_put{,_paddr}: 2479 * 2480 * Put an object back to the pool cache (optionally caching the 2481 * physical address of the object). 2482 */ 2483 void 2484 pool_cache_put_paddr(pool_cache_t pc, void *object, paddr_t pa) 2485 { 2486 pool_cache_cpu_t *cc; 2487 pcg_t *pcg; 2488 int s; 2489 2490 KASSERT(object != NULL); 2491 pool_cache_redzone_check(pc, object); 2492 FREECHECK_IN(&pc->pc_freecheck, object); 2493 2494 /* Lock out interrupts and disable preemption. */ 2495 s = splvm(); 2496 while (/* CONSTCOND */ true) { 2497 /* If the current group isn't full, release it there. 
*/ 2498 cc = pc->pc_cpus[curcpu()->ci_index]; 2499 KASSERT(cc->cc_cache == pc); 2500 pcg = cc->cc_current; 2501 if (__predict_true(pcg->pcg_avail < pcg->pcg_size)) { 2502 pcg->pcg_objects[pcg->pcg_avail].pcgo_va = object; 2503 pcg->pcg_objects[pcg->pcg_avail].pcgo_pa = pa; 2504 pcg->pcg_avail++; 2505 cc->cc_hits++; 2506 splx(s); 2507 return; 2508 } 2509 2510 /* 2511 * That failed. If the previous group isn't full, swap 2512 * it with the current group and try again. 2513 */ 2514 pcg = cc->cc_previous; 2515 if (__predict_true(pcg->pcg_avail < pcg->pcg_size)) { 2516 cc->cc_previous = cc->cc_current; 2517 cc->cc_current = pcg; 2518 continue; 2519 } 2520 2521 /* 2522 * Can't free to either group: try the slow path. 2523 * If put_slow() releases the object for us, it 2524 * will return false. Otherwise we need to retry. 2525 */ 2526 if (!pool_cache_put_slow(cc, s, object)) 2527 break; 2528 } 2529 } 2530 2531 /* 2532 * pool_cache_transfer: 2533 * 2534 * Transfer objects from the per-CPU cache to the global cache. 2535 * Run within a cross-call thread. 2536 */ 2537 static void 2538 pool_cache_transfer(pool_cache_t pc) 2539 { 2540 pool_cache_cpu_t *cc; 2541 pcg_t *prev, *cur, **list; 2542 int s; 2543 2544 s = splvm(); 2545 mutex_enter(&pc->pc_lock); 2546 cc = pc->pc_cpus[curcpu()->ci_index]; 2547 cur = cc->cc_current; 2548 cc->cc_current = __UNCONST(&pcg_dummy); 2549 prev = cc->cc_previous; 2550 cc->cc_previous = __UNCONST(&pcg_dummy); 2551 if (cur != &pcg_dummy) { 2552 if (cur->pcg_avail == cur->pcg_size) { 2553 list = &pc->pc_fullgroups; 2554 pc->pc_nfull++; 2555 } else if (cur->pcg_avail == 0) { 2556 list = &pc->pc_emptygroups; 2557 pc->pc_nempty++; 2558 } else { 2559 list = &pc->pc_partgroups; 2560 pc->pc_npart++; 2561 } 2562 cur->pcg_next = *list; 2563 *list = cur; 2564 } 2565 if (prev != &pcg_dummy) { 2566 if (prev->pcg_avail == prev->pcg_size) { 2567 list = &pc->pc_fullgroups; 2568 pc->pc_nfull++; 2569 } else if (prev->pcg_avail == 0) { 2570 list = &pc->pc_emptygroups; 2571 pc->pc_nempty++; 2572 } else { 2573 list = &pc->pc_partgroups; 2574 pc->pc_npart++; 2575 } 2576 prev->pcg_next = *list; 2577 *list = prev; 2578 } 2579 mutex_exit(&pc->pc_lock); 2580 splx(s); 2581 } 2582 2583 /* 2584 * Pool backend allocators. 2585 * 2586 * Each pool has a backend allocator that handles allocation, deallocation, 2587 * and any additional draining that might be needed. 2588 * 2589 * We provide two standard allocators: 2590 * 2591 * pool_allocator_kmem - the default when no allocator is specified 2592 * 2593 * pool_allocator_nointr - used for pools that will not be accessed 2594 * in interrupt context. 
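 *
 * As an illustrative sketch (foo_pool and struct foo are hypothetical,
 * not part of this file), a pool that is never used from interrupt
 * context could request the nointr allocator explicitly:
 *
 *	pool_init(&foo_pool, sizeof(struct foo), 0, 0, 0, "foopl",
 *	    &pool_allocator_nointr, IPL_NONE);
 *
 * Passing a NULL allocator instead lets a default be chosen, much as
 * pool_cache_bootstrap() does above for IPL_NONE caches.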
2595 */ 2596 void *pool_page_alloc(struct pool *, int); 2597 void pool_page_free(struct pool *, void *); 2598 2599 #ifdef POOL_SUBPAGE 2600 struct pool_allocator pool_allocator_kmem_fullpage = { 2601 .pa_alloc = pool_page_alloc, 2602 .pa_free = pool_page_free, 2603 .pa_pagesz = 0 2604 }; 2605 #else 2606 struct pool_allocator pool_allocator_kmem = { 2607 .pa_alloc = pool_page_alloc, 2608 .pa_free = pool_page_free, 2609 .pa_pagesz = 0 2610 }; 2611 #endif 2612 2613 #ifdef POOL_SUBPAGE 2614 struct pool_allocator pool_allocator_nointr_fullpage = { 2615 .pa_alloc = pool_page_alloc, 2616 .pa_free = pool_page_free, 2617 .pa_pagesz = 0 2618 }; 2619 #else 2620 struct pool_allocator pool_allocator_nointr = { 2621 .pa_alloc = pool_page_alloc, 2622 .pa_free = pool_page_free, 2623 .pa_pagesz = 0 2624 }; 2625 #endif 2626 2627 #ifdef POOL_SUBPAGE 2628 void *pool_subpage_alloc(struct pool *, int); 2629 void pool_subpage_free(struct pool *, void *); 2630 2631 struct pool_allocator pool_allocator_kmem = { 2632 .pa_alloc = pool_subpage_alloc, 2633 .pa_free = pool_subpage_free, 2634 .pa_pagesz = POOL_SUBPAGE 2635 }; 2636 2637 struct pool_allocator pool_allocator_nointr = { 2638 .pa_alloc = pool_subpage_alloc, 2639 .pa_free = pool_subpage_free, 2640 .pa_pagesz = POOL_SUBPAGE 2641 }; 2642 #endif /* POOL_SUBPAGE */ 2643 2644 struct pool_allocator pool_allocator_big[] = { 2645 { 2646 .pa_alloc = pool_page_alloc, 2647 .pa_free = pool_page_free, 2648 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 0), 2649 }, 2650 { 2651 .pa_alloc = pool_page_alloc, 2652 .pa_free = pool_page_free, 2653 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 1), 2654 }, 2655 { 2656 .pa_alloc = pool_page_alloc, 2657 .pa_free = pool_page_free, 2658 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 2), 2659 }, 2660 { 2661 .pa_alloc = pool_page_alloc, 2662 .pa_free = pool_page_free, 2663 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 3), 2664 }, 2665 { 2666 .pa_alloc = pool_page_alloc, 2667 .pa_free = pool_page_free, 2668 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 4), 2669 }, 2670 { 2671 .pa_alloc = pool_page_alloc, 2672 .pa_free = pool_page_free, 2673 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 5), 2674 }, 2675 { 2676 .pa_alloc = pool_page_alloc, 2677 .pa_free = pool_page_free, 2678 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 6), 2679 }, 2680 { 2681 .pa_alloc = pool_page_alloc, 2682 .pa_free = pool_page_free, 2683 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 7), 2684 } 2685 }; 2686 2687 static int 2688 pool_bigidx(size_t size) 2689 { 2690 int i; 2691 2692 for (i = 0; i < __arraycount(pool_allocator_big); i++) { 2693 if (1 << (i + POOL_ALLOCATOR_BIG_BASE) >= size) 2694 return i; 2695 } 2696 panic("pool item size %zu too large, use a custom allocator", size); 2697 } 2698 2699 static void * 2700 pool_allocator_alloc(struct pool *pp, int flags) 2701 { 2702 struct pool_allocator *pa = pp->pr_alloc; 2703 void *res; 2704 2705 res = (*pa->pa_alloc)(pp, flags); 2706 if (res == NULL && (flags & PR_WAITOK) == 0) { 2707 /* 2708 * We only run the drain hook here if PR_NOWAIT. 2709 * In other cases, the hook will be run in 2710 * pool_reclaim(). 
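 * A drain hook, if any, was registered by the pool's owner via
 * pool_set_drain_hook() or pool_cache_set_drain_hook().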
2711 */ 2712 if (pp->pr_drain_hook != NULL) { 2713 (*pp->pr_drain_hook)(pp->pr_drain_hook_arg, flags); 2714 res = (*pa->pa_alloc)(pp, flags); 2715 } 2716 } 2717 return res; 2718 } 2719 2720 static void 2721 pool_allocator_free(struct pool *pp, void *v) 2722 { 2723 struct pool_allocator *pa = pp->pr_alloc; 2724 2725 if (pp->pr_redzone) { 2726 kasan_mark(v, pa->pa_pagesz, pa->pa_pagesz); 2727 } 2728 (*pa->pa_free)(pp, v); 2729 } 2730 2731 void * 2732 pool_page_alloc(struct pool *pp, int flags) 2733 { 2734 const vm_flag_t vflags = (flags & PR_WAITOK) ? VM_SLEEP: VM_NOSLEEP; 2735 vmem_addr_t va; 2736 int ret; 2737 2738 ret = uvm_km_kmem_alloc(kmem_va_arena, pp->pr_alloc->pa_pagesz, 2739 vflags | VM_INSTANTFIT, &va); 2740 2741 return ret ? NULL : (void *)va; 2742 } 2743 2744 void 2745 pool_page_free(struct pool *pp, void *v) 2746 { 2747 2748 uvm_km_kmem_free(kmem_va_arena, (vaddr_t)v, pp->pr_alloc->pa_pagesz); 2749 } 2750 2751 static void * 2752 pool_page_alloc_meta(struct pool *pp, int flags) 2753 { 2754 const vm_flag_t vflags = (flags & PR_WAITOK) ? VM_SLEEP: VM_NOSLEEP; 2755 vmem_addr_t va; 2756 int ret; 2757 2758 ret = vmem_alloc(kmem_meta_arena, pp->pr_alloc->pa_pagesz, 2759 vflags | VM_INSTANTFIT, &va); 2760 2761 return ret ? NULL : (void *)va; 2762 } 2763 2764 static void 2765 pool_page_free_meta(struct pool *pp, void *v) 2766 { 2767 2768 vmem_free(kmem_meta_arena, (vmem_addr_t)v, pp->pr_alloc->pa_pagesz); 2769 } 2770 2771 #ifdef KLEAK 2772 static void 2773 pool_kleak_fill(struct pool *pp, void *p) 2774 { 2775 if (__predict_false(pp->pr_roflags & PR_NOTOUCH)) { 2776 return; 2777 } 2778 kleak_fill_area(p, pp->pr_size); 2779 } 2780 2781 static void 2782 pool_cache_kleak_fill(pool_cache_t pc, void *p) 2783 { 2784 if (__predict_false(pc_has_ctor(pc) || pc_has_dtor(pc))) { 2785 return; 2786 } 2787 pool_kleak_fill(&pc->pc_pool, p); 2788 } 2789 #endif 2790 2791 #ifdef POOL_REDZONE 2792 #if defined(_LP64) 2793 # define PRIME 0x9e37fffffffc0000UL 2794 #else /* defined(_LP64) */ 2795 # define PRIME 0x9e3779b1 2796 #endif /* defined(_LP64) */ 2797 #define STATIC_BYTE 0xFE 2798 CTASSERT(POOL_REDZONE_SIZE > 1); 2799 2800 #ifndef KASAN 2801 static inline uint8_t 2802 pool_pattern_generate(const void *p) 2803 { 2804 return (uint8_t)(((uintptr_t)p) * PRIME 2805 >> ((sizeof(uintptr_t) - sizeof(uint8_t))) * CHAR_BIT); 2806 } 2807 #endif 2808 2809 static void 2810 pool_redzone_init(struct pool *pp, size_t requested_size) 2811 { 2812 size_t redzsz; 2813 size_t nsz; 2814 2815 #ifdef KASAN 2816 redzsz = requested_size; 2817 kasan_add_redzone(&redzsz); 2818 redzsz -= requested_size; 2819 #else 2820 redzsz = POOL_REDZONE_SIZE; 2821 #endif 2822 2823 if (pp->pr_roflags & PR_NOTOUCH) { 2824 pp->pr_redzone = false; 2825 return; 2826 } 2827 2828 /* 2829 * We may have extended the requested size earlier; check if 2830 * there's naturally space in the padding for a red zone. 2831 */ 2832 if (pp->pr_size - requested_size >= redzsz) { 2833 pp->pr_reqsize_with_redzone = requested_size + redzsz; 2834 pp->pr_redzone = true; 2835 return; 2836 } 2837 2838 /* 2839 * No space in the natural padding; check if we can extend a 2840 * bit the size of the pool. 2841 */ 2842 nsz = roundup(pp->pr_size + redzsz, pp->pr_align); 2843 if (nsz <= pp->pr_alloc->pa_pagesz) { 2844 /* Ok, we can */ 2845 pp->pr_size = nsz; 2846 pp->pr_reqsize_with_redzone = requested_size + redzsz; 2847 pp->pr_redzone = true; 2848 } else { 2849 /* No space for a red zone... 
snif :'( */ 2850 pp->pr_redzone = false; 2851 printf("pool redzone disabled for '%s'\n", pp->pr_wchan); 2852 } 2853 } 2854 2855 static void 2856 pool_redzone_fill(struct pool *pp, void *p) 2857 { 2858 if (!pp->pr_redzone) 2859 return; 2860 #ifdef KASAN 2861 kasan_mark(p, pp->pr_reqsize, pp->pr_reqsize_with_redzone); 2862 #else 2863 uint8_t *cp, pat; 2864 const uint8_t *ep; 2865 2866 cp = (uint8_t *)p + pp->pr_reqsize; 2867 ep = cp + POOL_REDZONE_SIZE; 2868 2869 /* 2870 * We really don't want the first byte of the red zone to be '\0'; 2871 * an off-by-one in a string may not be properly detected. 2872 */ 2873 pat = pool_pattern_generate(cp); 2874 *cp = (pat == '\0') ? STATIC_BYTE: pat; 2875 cp++; 2876 2877 while (cp < ep) { 2878 *cp = pool_pattern_generate(cp); 2879 cp++; 2880 } 2881 #endif 2882 } 2883 2884 static void 2885 pool_redzone_check(struct pool *pp, void *p) 2886 { 2887 if (!pp->pr_redzone) 2888 return; 2889 #ifdef KASAN 2890 kasan_mark(p, 0, pp->pr_reqsize_with_redzone); 2891 #else 2892 uint8_t *cp, pat, expected; 2893 const uint8_t *ep; 2894 2895 cp = (uint8_t *)p + pp->pr_reqsize; 2896 ep = cp + POOL_REDZONE_SIZE; 2897 2898 pat = pool_pattern_generate(cp); 2899 expected = (pat == '\0') ? STATIC_BYTE: pat; 2900 if (__predict_false(expected != *cp)) { 2901 printf("%s: %p: 0x%02x != 0x%02x\n", 2902 __func__, cp, *cp, expected); 2903 } 2904 cp++; 2905 2906 while (cp < ep) { 2907 expected = pool_pattern_generate(cp); 2908 if (__predict_false(*cp != expected)) { 2909 printf("%s: %p: 0x%02x != 0x%02x\n", 2910 __func__, cp, *cp, expected); 2911 } 2912 cp++; 2913 } 2914 #endif 2915 } 2916 2917 static void 2918 pool_cache_redzone_check(pool_cache_t pc, void *p) 2919 { 2920 #ifdef KASAN 2921 /* If there is a ctor/dtor, leave the data as valid. */ 2922 if (__predict_false(pc_has_ctor(pc) || pc_has_dtor(pc))) { 2923 return; 2924 } 2925 #endif 2926 pool_redzone_check(&pc->pc_pool, p); 2927 } 2928 2929 #endif /* POOL_REDZONE */ 2930 2931 2932 #ifdef POOL_SUBPAGE 2933 /* Sub-page allocator, for machines with large hardware pages. 
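 * It simply gets and puts entries of the psppool pool; see
 * pool_subpage_alloc() and pool_subpage_free() below.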
*/ 2934 void * 2935 pool_subpage_alloc(struct pool *pp, int flags) 2936 { 2937 return pool_get(&psppool, flags); 2938 } 2939 2940 void 2941 pool_subpage_free(struct pool *pp, void *v) 2942 { 2943 pool_put(&psppool, v); 2944 } 2945 2946 #endif /* POOL_SUBPAGE */ 2947 2948 #if defined(DDB) 2949 static bool 2950 pool_in_page(struct pool *pp, struct pool_item_header *ph, uintptr_t addr) 2951 { 2952 2953 return (uintptr_t)ph->ph_page <= addr && 2954 addr < (uintptr_t)ph->ph_page + pp->pr_alloc->pa_pagesz; 2955 } 2956 2957 static bool 2958 pool_in_item(struct pool *pp, void *item, uintptr_t addr) 2959 { 2960 2961 return (uintptr_t)item <= addr && addr < (uintptr_t)item + pp->pr_size; 2962 } 2963 2964 static bool 2965 pool_in_cg(struct pool *pp, struct pool_cache_group *pcg, uintptr_t addr) 2966 { 2967 int i; 2968 2969 if (pcg == NULL) { 2970 return false; 2971 } 2972 for (i = 0; i < pcg->pcg_avail; i++) { 2973 if (pool_in_item(pp, pcg->pcg_objects[i].pcgo_va, addr)) { 2974 return true; 2975 } 2976 } 2977 return false; 2978 } 2979 2980 static bool 2981 pool_allocated(struct pool *pp, struct pool_item_header *ph, uintptr_t addr) 2982 { 2983 2984 if ((pp->pr_roflags & PR_NOTOUCH) != 0) { 2985 unsigned int idx = pr_item_notouch_index(pp, ph, (void *)addr); 2986 pool_item_bitmap_t *bitmap = 2987 ph->ph_bitmap + (idx / BITMAP_SIZE); 2988 pool_item_bitmap_t mask = 1 << (idx & BITMAP_MASK); 2989 2990 return (*bitmap & mask) == 0; 2991 } else { 2992 struct pool_item *pi; 2993 2994 LIST_FOREACH(pi, &ph->ph_itemlist, pi_list) { 2995 if (pool_in_item(pp, pi, addr)) { 2996 return false; 2997 } 2998 } 2999 return true; 3000 } 3001 } 3002 3003 void 3004 pool_whatis(uintptr_t addr, void (*pr)(const char *, ...)) 3005 { 3006 struct pool *pp; 3007 3008 TAILQ_FOREACH(pp, &pool_head, pr_poollist) { 3009 struct pool_item_header *ph; 3010 uintptr_t item; 3011 bool allocated = true; 3012 bool incache = false; 3013 bool incpucache = false; 3014 char cpucachestr[32]; 3015 3016 if ((pp->pr_roflags & PR_PHINPAGE) != 0) { 3017 LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) { 3018 if (pool_in_page(pp, ph, addr)) { 3019 goto found; 3020 } 3021 } 3022 LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist) { 3023 if (pool_in_page(pp, ph, addr)) { 3024 allocated = 3025 pool_allocated(pp, ph, addr); 3026 goto found; 3027 } 3028 } 3029 LIST_FOREACH(ph, &pp->pr_emptypages, ph_pagelist) { 3030 if (pool_in_page(pp, ph, addr)) { 3031 allocated = false; 3032 goto found; 3033 } 3034 } 3035 continue; 3036 } else { 3037 ph = pr_find_pagehead_noalign(pp, (void *)addr); 3038 if (ph == NULL || !pool_in_page(pp, ph, addr)) { 3039 continue; 3040 } 3041 allocated = pool_allocated(pp, ph, addr); 3042 } 3043 found: 3044 if (allocated && pp->pr_cache) { 3045 pool_cache_t pc = pp->pr_cache; 3046 struct pool_cache_group *pcg; 3047 int i; 3048 3049 for (pcg = pc->pc_fullgroups; pcg != NULL; 3050 pcg = pcg->pcg_next) { 3051 if (pool_in_cg(pp, pcg, addr)) { 3052 incache = true; 3053 goto print; 3054 } 3055 } 3056 for (i = 0; i < __arraycount(pc->pc_cpus); i++) { 3057 pool_cache_cpu_t *cc; 3058 3059 if ((cc = pc->pc_cpus[i]) == NULL) { 3060 continue; 3061 } 3062 if (pool_in_cg(pp, cc->cc_current, addr) || 3063 pool_in_cg(pp, cc->cc_previous, addr)) { 3064 struct cpu_info *ci = 3065 cpu_lookup(i); 3066 3067 incpucache = true; 3068 snprintf(cpucachestr, 3069 sizeof(cpucachestr), 3070 "cached by CPU %u", 3071 ci->ci_index); 3072 goto print; 3073 } 3074 } 3075 } 3076 print: 3077 item = (uintptr_t)ph->ph_page + ph->ph_off; 3078 item = item + rounddown(addr - 
item, pp->pr_size); 3079 (*pr)("%p is %p+%zu in POOL '%s' (%s)\n", 3080 (void *)addr, item, (size_t)(addr - item), 3081 pp->pr_wchan, 3082 incpucache ? cpucachestr : 3083 incache ? "cached" : allocated ? "allocated" : "free"); 3084 } 3085 } 3086 #endif /* defined(DDB) */ 3087 3088 static int 3089 pool_sysctl(SYSCTLFN_ARGS) 3090 { 3091 struct pool_sysctl data; 3092 struct pool *pp; 3093 struct pool_cache *pc; 3094 pool_cache_cpu_t *cc; 3095 int error; 3096 size_t i, written; 3097 3098 if (oldp == NULL) { 3099 *oldlenp = 0; 3100 TAILQ_FOREACH(pp, &pool_head, pr_poollist) 3101 *oldlenp += sizeof(data); 3102 return 0; 3103 } 3104 3105 memset(&data, 0, sizeof(data)); 3106 error = 0; 3107 written = 0; 3108 TAILQ_FOREACH(pp, &pool_head, pr_poollist) { 3109 if (written + sizeof(data) > *oldlenp) 3110 break; 3111 strlcpy(data.pr_wchan, pp->pr_wchan, sizeof(data.pr_wchan)); 3112 data.pr_pagesize = pp->pr_alloc->pa_pagesz; 3113 data.pr_flags = pp->pr_roflags | pp->pr_flags; 3114 #define COPY(field) data.field = pp->field 3115 COPY(pr_size); 3116 3117 COPY(pr_itemsperpage); 3118 COPY(pr_nitems); 3119 COPY(pr_nout); 3120 COPY(pr_hardlimit); 3121 COPY(pr_npages); 3122 COPY(pr_minpages); 3123 COPY(pr_maxpages); 3124 3125 COPY(pr_nget); 3126 COPY(pr_nfail); 3127 COPY(pr_nput); 3128 COPY(pr_npagealloc); 3129 COPY(pr_npagefree); 3130 COPY(pr_hiwat); 3131 COPY(pr_nidle); 3132 #undef COPY 3133 3134 data.pr_cache_nmiss_pcpu = 0; 3135 data.pr_cache_nhit_pcpu = 0; 3136 if (pp->pr_cache) { 3137 pc = pp->pr_cache; 3138 data.pr_cache_meta_size = pc->pc_pcgsize; 3139 data.pr_cache_nfull = pc->pc_nfull; 3140 data.pr_cache_npartial = pc->pc_npart; 3141 data.pr_cache_nempty = pc->pc_nempty; 3142 data.pr_cache_ncontended = pc->pc_contended; 3143 data.pr_cache_nmiss_global = pc->pc_misses; 3144 data.pr_cache_nhit_global = pc->pc_hits; 3145 for (i = 0; i < pc->pc_ncpu; ++i) { 3146 cc = pc->pc_cpus[i]; 3147 if (cc == NULL) 3148 continue; 3149 data.pr_cache_nmiss_pcpu += cc->cc_misses; 3150 data.pr_cache_nhit_pcpu += cc->cc_hits; 3151 } 3152 } else { 3153 data.pr_cache_meta_size = 0; 3154 data.pr_cache_nfull = 0; 3155 data.pr_cache_npartial = 0; 3156 data.pr_cache_nempty = 0; 3157 data.pr_cache_ncontended = 0; 3158 data.pr_cache_nmiss_global = 0; 3159 data.pr_cache_nhit_global = 0; 3160 } 3161 3162 error = sysctl_copyout(l, &data, oldp, sizeof(data)); 3163 if (error) 3164 break; 3165 written += sizeof(data); 3166 oldp = (char *)oldp + sizeof(data); 3167 } 3168 3169 *oldlenp = written; 3170 return error; 3171 } 3172 3173 SYSCTL_SETUP(sysctl_pool_setup, "sysctl kern.pool setup") 3174 { 3175 const struct sysctlnode *rnode = NULL; 3176 3177 sysctl_createv(clog, 0, NULL, &rnode, 3178 CTLFLAG_PERMANENT, 3179 CTLTYPE_STRUCT, "pool", 3180 SYSCTL_DESCR("Get pool statistics"), 3181 pool_sysctl, 0, NULL, 0, 3182 CTL_KERN, CTL_CREATE, CTL_EOL); 3183 } 3184
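
/*
 * Illustrative usage sketch, not part of this file: the struct foo and
 * foo_cache names are hypothetical. A typical consumer creates a cache
 * once, gets and puts objects through it, and destroys it on teardown.
 *
 *	static pool_cache_t foo_cache;
 *
 *	foo_cache = pool_cache_init(sizeof(struct foo), 0, 0, 0,
 *	    "foocache", NULL, IPL_NONE, NULL, NULL, NULL);
 *
 *	struct foo *f = pool_cache_get(foo_cache, PR_WAITOK);
 *	if (f != NULL) {
 *		... initialize and use f ...
 *		pool_cache_put(foo_cache, f);
 *	}
 *
 *	pool_cache_destroy(foo_cache);
 */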