1 /* $NetBSD: subr_pool.c,v 1.252 2019/06/29 11:13:23 maxv Exp $ */ 2 3 /* 4 * Copyright (c) 1997, 1999, 2000, 2002, 2007, 2008, 2010, 2014, 2015, 2018 5 * The NetBSD Foundation, Inc. 6 * All rights reserved. 7 * 8 * This code is derived from software contributed to The NetBSD Foundation 9 * by Paul Kranenburg; by Jason R. Thorpe of the Numerical Aerospace 10 * Simulation Facility, NASA Ames Research Center; by Andrew Doran, and by 11 * Maxime Villard. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 24 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 26 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 29 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 30 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 32 * POSSIBILITY OF SUCH DAMAGE. 33 */ 34 35 #include <sys/cdefs.h> 36 __KERNEL_RCSID(0, "$NetBSD: subr_pool.c,v 1.252 2019/06/29 11:13:23 maxv Exp $"); 37 38 #ifdef _KERNEL_OPT 39 #include "opt_ddb.h" 40 #include "opt_lockdebug.h" 41 #include "opt_pool.h" 42 #include "opt_kleak.h" 43 #endif 44 45 #include <sys/param.h> 46 #include <sys/systm.h> 47 #include <sys/sysctl.h> 48 #include <sys/bitops.h> 49 #include <sys/proc.h> 50 #include <sys/errno.h> 51 #include <sys/kernel.h> 52 #include <sys/vmem.h> 53 #include <sys/pool.h> 54 #include <sys/syslog.h> 55 #include <sys/debug.h> 56 #include <sys/lockdebug.h> 57 #include <sys/xcall.h> 58 #include <sys/cpu.h> 59 #include <sys/atomic.h> 60 #include <sys/asan.h> 61 62 #include <uvm/uvm_extern.h> 63 64 /* 65 * Pool resource management utility. 66 * 67 * Memory is allocated in pages which are split into pieces according to 68 * the pool item size. Each page is kept on one of three lists in the 69 * pool structure: `pr_emptypages', `pr_fullpages' and `pr_partpages', 70 * for empty, full and partially-full pages respectively. The individual 71 * pool items are on a linked list headed by `ph_itemlist' in each page 72 * header. The memory for building the page list is either taken from 73 * the allocated pages themselves (for small pool items) or taken from 74 * an internal pool of page headers (`phpool'). 75 */ 76 77 /* List of all pools. Non static as needed by 'vmstat -m' */ 78 TAILQ_HEAD(, pool) pool_head = TAILQ_HEAD_INITIALIZER(pool_head); 79 80 /* Private pool for page header structures */ 81 #define PHPOOL_MAX 8 82 static struct pool phpool[PHPOOL_MAX]; 83 #define PHPOOL_FREELIST_NELEM(idx) \ 84 (((idx) == 0) ? 
0 : BITMAP_SIZE * (1 << (idx))) 85 86 #if defined(KASAN) 87 #define POOL_REDZONE 88 #endif 89 90 #ifdef POOL_REDZONE 91 # ifdef KASAN 92 # define POOL_REDZONE_SIZE 8 93 # else 94 # define POOL_REDZONE_SIZE 2 95 # endif 96 static void pool_redzone_init(struct pool *, size_t); 97 static void pool_redzone_fill(struct pool *, void *); 98 static void pool_redzone_check(struct pool *, void *); 99 static void pool_cache_redzone_check(pool_cache_t, void *); 100 #else 101 # define pool_redzone_init(pp, sz) __nothing 102 # define pool_redzone_fill(pp, ptr) __nothing 103 # define pool_redzone_check(pp, ptr) __nothing 104 # define pool_cache_redzone_check(pc, ptr) __nothing 105 #endif 106 107 #ifdef KLEAK 108 static void pool_kleak_fill(struct pool *, void *); 109 static void pool_cache_kleak_fill(pool_cache_t, void *); 110 #else 111 #define pool_kleak_fill(pp, ptr) __nothing 112 #define pool_cache_kleak_fill(pc, ptr) __nothing 113 #endif 114 115 #ifdef POOL_QUARANTINE 116 static void pool_quarantine_init(struct pool *); 117 static void pool_quarantine_flush(struct pool *); 118 static bool pool_put_quarantine(struct pool *, void *, 119 struct pool_pagelist *); 120 static bool pool_cache_put_quarantine(pool_cache_t, void *, paddr_t); 121 #else 122 #define pool_quarantine_init(a) __nothing 123 #define pool_quarantine_flush(a) __nothing 124 #define pool_put_quarantine(a, b, c) false 125 #define pool_cache_put_quarantine(a, b, c) false 126 #endif 127 128 #define pc_has_ctor(pc) \ 129 (pc->pc_ctor != (int (*)(void *, void *, int))nullop) 130 #define pc_has_dtor(pc) \ 131 (pc->pc_dtor != (void (*)(void *, void *))nullop) 132 133 static void *pool_page_alloc_meta(struct pool *, int); 134 static void pool_page_free_meta(struct pool *, void *); 135 136 /* allocator for pool metadata */ 137 struct pool_allocator pool_allocator_meta = { 138 .pa_alloc = pool_page_alloc_meta, 139 .pa_free = pool_page_free_meta, 140 .pa_pagesz = 0 141 }; 142 143 #define POOL_ALLOCATOR_BIG_BASE 13 144 extern struct pool_allocator pool_allocator_big[]; 145 static int pool_bigidx(size_t); 146 147 /* # of seconds to retain page after last use */ 148 int pool_inactive_time = 10; 149 150 /* Next candidate for drainage (see pool_drain()) */ 151 static struct pool *drainpp; 152 153 /* This lock protects both pool_head and drainpp. 
 */
static kmutex_t pool_head_lock;
static kcondvar_t pool_busy;

/* This lock protects initialization of a potentially shared pool allocator */
static kmutex_t pool_allocator_lock;

static unsigned int poolid_counter = 0;

typedef uint32_t pool_item_bitmap_t;
#define	BITMAP_SIZE	(CHAR_BIT * sizeof(pool_item_bitmap_t))
#define	BITMAP_MASK	(BITMAP_SIZE - 1)

struct pool_item_header {
	/* Page headers */
	LIST_ENTRY(pool_item_header)
				ph_pagelist;	/* pool page list */
	union {
		/* !PR_PHINPAGE */
		struct {
			SPLAY_ENTRY(pool_item_header)
				phu_node;	/* off-page page headers */
		} phu_offpage;
		/* PR_PHINPAGE */
		struct {
			unsigned int phu_poolid;
		} phu_onpage;
	} ph_u1;
	void *			ph_page;	/* this page's address */
	uint32_t		ph_time;	/* last referenced */
	uint16_t		ph_nmissing;	/* # of chunks in use */
	uint16_t		ph_off;		/* start offset in page */
	union {
		/* !PR_USEBMAP */
		struct {
			LIST_HEAD(, pool_item)
				phu_itemlist;	/* chunk list for this page */
		} phu_normal;
		/* PR_USEBMAP */
		struct {
			pool_item_bitmap_t phu_bitmap[1];
		} phu_notouch;
	} ph_u2;
};
#define	ph_node		ph_u1.phu_offpage.phu_node
#define	ph_poolid	ph_u1.phu_onpage.phu_poolid
#define	ph_itemlist	ph_u2.phu_normal.phu_itemlist
#define	ph_bitmap	ph_u2.phu_notouch.phu_bitmap

#define	PHSIZE	ALIGN(sizeof(struct pool_item_header))

#if defined(DIAGNOSTIC) && !defined(KASAN)
#define POOL_CHECK_MAGIC
#endif

struct pool_item {
#ifdef POOL_CHECK_MAGIC
	u_int pi_magic;
#endif
#define	PI_MAGIC 0xdeaddeadU
	/* Other entries use only this list entry */
	LIST_ENTRY(pool_item)	pi_list;
};

#define	POOL_NEEDS_CATCHUP(pp)						\
	((pp)->pr_nitems < (pp)->pr_minitems)

/*
 * Pool cache management.
 *
 * Pool caches provide a way for constructed objects to be cached by the
 * pool subsystem.  This can lead to performance improvements by avoiding
 * needless object construction/destruction; it is deferred until absolutely
 * necessary.
 *
 * Caches are grouped into cache groups.  Each cache group references up
 * to PCG_NUMOBJECTS constructed objects.  When a cache allocates an
 * object from the pool, it calls the object's constructor and places it
 * into a cache group.  When a cache group frees an object back to the
 * pool, it first calls the object's destructor.  This allows the object
 * to persist in constructed form while freed to the cache.
 *
 * The pool references each cache, so that when a pool is drained by the
 * pagedaemon, it can drain each individual cache as well.  Each time a
 * cache is drained, the most idle cache group is freed to the pool in
 * its entirety.
 *
 * Pool caches are laid on top of pools.  By layering them, we can avoid
 * the complexity of cache management for pools which would not benefit
 * from it.
 */

static struct pool pcg_normal_pool;
static struct pool pcg_large_pool;
static struct pool cache_pool;
static struct pool cache_cpu_pool;

/* List of all caches.
*/ 251 TAILQ_HEAD(,pool_cache) pool_cache_head = 252 TAILQ_HEAD_INITIALIZER(pool_cache_head); 253 254 int pool_cache_disable; /* global disable for caching */ 255 static const pcg_t pcg_dummy; /* zero sized: always empty, yet always full */ 256 257 static bool pool_cache_put_slow(pool_cache_cpu_t *, int, 258 void *); 259 static bool pool_cache_get_slow(pool_cache_cpu_t *, int, 260 void **, paddr_t *, int); 261 static void pool_cache_cpu_init1(struct cpu_info *, pool_cache_t); 262 static void pool_cache_invalidate_groups(pool_cache_t, pcg_t *); 263 static void pool_cache_invalidate_cpu(pool_cache_t, u_int); 264 static void pool_cache_transfer(pool_cache_t); 265 266 static int pool_catchup(struct pool *); 267 static void pool_prime_page(struct pool *, void *, 268 struct pool_item_header *); 269 static void pool_update_curpage(struct pool *); 270 271 static int pool_grow(struct pool *, int); 272 static void *pool_allocator_alloc(struct pool *, int); 273 static void pool_allocator_free(struct pool *, void *); 274 275 static void pool_print_pagelist(struct pool *, struct pool_pagelist *, 276 void (*)(const char *, ...) __printflike(1, 2)); 277 static void pool_print1(struct pool *, const char *, 278 void (*)(const char *, ...) __printflike(1, 2)); 279 280 static int pool_chk_page(struct pool *, const char *, 281 struct pool_item_header *); 282 283 /* -------------------------------------------------------------------------- */ 284 285 static inline unsigned int 286 pr_item_bitmap_index(const struct pool *pp, const struct pool_item_header *ph, 287 const void *v) 288 { 289 const char *cp = v; 290 unsigned int idx; 291 292 KASSERT(pp->pr_roflags & PR_USEBMAP); 293 idx = (cp - (char *)ph->ph_page - ph->ph_off) / pp->pr_size; 294 295 if (__predict_false(idx >= pp->pr_itemsperpage)) { 296 panic("%s: [%s] %u >= %u", __func__, pp->pr_wchan, idx, 297 pp->pr_itemsperpage); 298 } 299 300 return idx; 301 } 302 303 static inline void 304 pr_item_bitmap_put(const struct pool *pp, struct pool_item_header *ph, 305 void *obj) 306 { 307 unsigned int idx = pr_item_bitmap_index(pp, ph, obj); 308 pool_item_bitmap_t *bitmap = ph->ph_bitmap + (idx / BITMAP_SIZE); 309 pool_item_bitmap_t mask = 1U << (idx & BITMAP_MASK); 310 311 if (__predict_false((*bitmap & mask) != 0)) { 312 panic("%s: [%s] %p already freed", __func__, pp->pr_wchan, obj); 313 } 314 315 *bitmap |= mask; 316 } 317 318 static inline void * 319 pr_item_bitmap_get(const struct pool *pp, struct pool_item_header *ph) 320 { 321 pool_item_bitmap_t *bitmap = ph->ph_bitmap; 322 unsigned int idx; 323 int i; 324 325 for (i = 0; ; i++) { 326 int bit; 327 328 KASSERT((i * BITMAP_SIZE) < pp->pr_itemsperpage); 329 bit = ffs32(bitmap[i]); 330 if (bit) { 331 pool_item_bitmap_t mask; 332 333 bit--; 334 idx = (i * BITMAP_SIZE) + bit; 335 mask = 1U << bit; 336 KASSERT((bitmap[i] & mask) != 0); 337 bitmap[i] &= ~mask; 338 break; 339 } 340 } 341 KASSERT(idx < pp->pr_itemsperpage); 342 return (char *)ph->ph_page + ph->ph_off + idx * pp->pr_size; 343 } 344 345 static inline void 346 pr_item_bitmap_init(const struct pool *pp, struct pool_item_header *ph) 347 { 348 pool_item_bitmap_t *bitmap = ph->ph_bitmap; 349 const int n = howmany(pp->pr_itemsperpage, BITMAP_SIZE); 350 int i; 351 352 for (i = 0; i < n; i++) { 353 bitmap[i] = (pool_item_bitmap_t)-1; 354 } 355 } 356 357 /* -------------------------------------------------------------------------- */ 358 359 static inline void 360 pr_item_linkedlist_put(const struct pool *pp, struct pool_item_header *ph, 361 void *obj) 362 { 
363 struct pool_item *pi = obj; 364 365 #ifdef POOL_CHECK_MAGIC 366 pi->pi_magic = PI_MAGIC; 367 #endif 368 369 if (pp->pr_redzone) { 370 /* 371 * Mark the pool_item as valid. The rest is already 372 * invalid. 373 */ 374 kasan_mark(pi, sizeof(*pi), sizeof(*pi), 0); 375 } 376 377 LIST_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list); 378 } 379 380 static inline void * 381 pr_item_linkedlist_get(struct pool *pp, struct pool_item_header *ph) 382 { 383 struct pool_item *pi; 384 void *v; 385 386 v = pi = LIST_FIRST(&ph->ph_itemlist); 387 if (__predict_false(v == NULL)) { 388 mutex_exit(&pp->pr_lock); 389 panic("%s: [%s] page empty", __func__, pp->pr_wchan); 390 } 391 KASSERTMSG((pp->pr_nitems > 0), 392 "%s: [%s] nitems %u inconsistent on itemlist", 393 __func__, pp->pr_wchan, pp->pr_nitems); 394 #ifdef POOL_CHECK_MAGIC 395 KASSERTMSG((pi->pi_magic == PI_MAGIC), 396 "%s: [%s] free list modified: " 397 "magic=%x; page %p; item addr %p", __func__, 398 pp->pr_wchan, pi->pi_magic, ph->ph_page, pi); 399 #endif 400 401 /* 402 * Remove from item list. 403 */ 404 LIST_REMOVE(pi, pi_list); 405 406 return v; 407 } 408 409 /* -------------------------------------------------------------------------- */ 410 411 static inline int 412 phtree_compare(struct pool_item_header *a, struct pool_item_header *b) 413 { 414 415 /* 416 * We consider pool_item_header with smaller ph_page bigger. This 417 * unnatural ordering is for the benefit of pr_find_pagehead. 418 */ 419 if (a->ph_page < b->ph_page) 420 return 1; 421 else if (a->ph_page > b->ph_page) 422 return -1; 423 else 424 return 0; 425 } 426 427 SPLAY_PROTOTYPE(phtree, pool_item_header, ph_node, phtree_compare); 428 SPLAY_GENERATE(phtree, pool_item_header, ph_node, phtree_compare); 429 430 static inline struct pool_item_header * 431 pr_find_pagehead_noalign(struct pool *pp, void *v) 432 { 433 struct pool_item_header *ph, tmp; 434 435 tmp.ph_page = (void *)(uintptr_t)v; 436 ph = SPLAY_FIND(phtree, &pp->pr_phtree, &tmp); 437 if (ph == NULL) { 438 ph = SPLAY_ROOT(&pp->pr_phtree); 439 if (ph != NULL && phtree_compare(&tmp, ph) >= 0) { 440 ph = SPLAY_NEXT(phtree, &pp->pr_phtree, ph); 441 } 442 KASSERT(ph == NULL || phtree_compare(&tmp, ph) < 0); 443 } 444 445 return ph; 446 } 447 448 /* 449 * Return the pool page header based on item address. 
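 *
 * For PR_PHINPAGE pools the header lives at the start of the item's page,
 * so the lookup is plain address arithmetic.  Illustrative sketch, assuming
 * a 4 KiB page size (the addresses are made up):
 *
 *	v    = 0xffff800012345e70                   <- item address
 *	page = v & pa_pagemask = 0xffff800012345000
 *	ph   = (struct pool_item_header *)page
 *
 * Off-page pools (and PR_NOALIGN pools) instead look the header up in the
 * splay tree keyed on ph_page.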
450 */ 451 static inline struct pool_item_header * 452 pr_find_pagehead(struct pool *pp, void *v) 453 { 454 struct pool_item_header *ph, tmp; 455 456 if ((pp->pr_roflags & PR_NOALIGN) != 0) { 457 ph = pr_find_pagehead_noalign(pp, v); 458 } else { 459 void *page = 460 (void *)((uintptr_t)v & pp->pr_alloc->pa_pagemask); 461 462 if ((pp->pr_roflags & PR_PHINPAGE) != 0) { 463 ph = (struct pool_item_header *)page; 464 if (__predict_false((void *)ph->ph_page != page)) { 465 panic("%s: [%s] item %p not part of pool", 466 __func__, pp->pr_wchan, v); 467 } 468 if (__predict_false((char *)v < (char *)page + 469 ph->ph_off)) { 470 panic("%s: [%s] item %p below item space", 471 __func__, pp->pr_wchan, v); 472 } 473 if (__predict_false(ph->ph_poolid != pp->pr_poolid)) { 474 panic("%s: [%s] item %p poolid %u != %u", 475 __func__, pp->pr_wchan, v, ph->ph_poolid, 476 pp->pr_poolid); 477 } 478 } else { 479 tmp.ph_page = page; 480 ph = SPLAY_FIND(phtree, &pp->pr_phtree, &tmp); 481 } 482 } 483 484 KASSERT(ph == NULL || ((pp->pr_roflags & PR_PHINPAGE) != 0) || 485 ((char *)ph->ph_page <= (char *)v && 486 (char *)v < (char *)ph->ph_page + pp->pr_alloc->pa_pagesz)); 487 return ph; 488 } 489 490 static void 491 pr_pagelist_free(struct pool *pp, struct pool_pagelist *pq) 492 { 493 struct pool_item_header *ph; 494 495 while ((ph = LIST_FIRST(pq)) != NULL) { 496 LIST_REMOVE(ph, ph_pagelist); 497 pool_allocator_free(pp, ph->ph_page); 498 if ((pp->pr_roflags & PR_PHINPAGE) == 0) 499 pool_put(pp->pr_phpool, ph); 500 } 501 } 502 503 /* 504 * Remove a page from the pool. 505 */ 506 static inline void 507 pr_rmpage(struct pool *pp, struct pool_item_header *ph, 508 struct pool_pagelist *pq) 509 { 510 511 KASSERT(mutex_owned(&pp->pr_lock)); 512 513 /* 514 * If the page was idle, decrement the idle page count. 515 */ 516 if (ph->ph_nmissing == 0) { 517 KASSERT(pp->pr_nidle != 0); 518 KASSERTMSG((pp->pr_nitems >= pp->pr_itemsperpage), 519 "%s: [%s] nitems=%u < itemsperpage=%u", __func__, 520 pp->pr_wchan, pp->pr_nitems, pp->pr_itemsperpage); 521 pp->pr_nidle--; 522 } 523 524 pp->pr_nitems -= pp->pr_itemsperpage; 525 526 /* 527 * Unlink the page from the pool and queue it for release. 528 */ 529 LIST_REMOVE(ph, ph_pagelist); 530 if (pp->pr_roflags & PR_PHINPAGE) { 531 if (__predict_false(ph->ph_poolid != pp->pr_poolid)) { 532 panic("%s: [%s] ph %p poolid %u != %u", 533 __func__, pp->pr_wchan, ph, ph->ph_poolid, 534 pp->pr_poolid); 535 } 536 } else { 537 SPLAY_REMOVE(phtree, &pp->pr_phtree, ph); 538 } 539 LIST_INSERT_HEAD(pq, ph, ph_pagelist); 540 541 pp->pr_npages--; 542 pp->pr_npagefree++; 543 544 pool_update_curpage(pp); 545 } 546 547 /* 548 * Initialize all the pools listed in the "pools" link set. 549 */ 550 void 551 pool_subsystem_init(void) 552 { 553 size_t size; 554 int idx; 555 556 mutex_init(&pool_head_lock, MUTEX_DEFAULT, IPL_NONE); 557 mutex_init(&pool_allocator_lock, MUTEX_DEFAULT, IPL_NONE); 558 cv_init(&pool_busy, "poolbusy"); 559 560 /* 561 * Initialize private page header pool and cache magazine pool if we 562 * haven't done so yet. 
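	 *
	 * With BITMAP_SIZE == 32 and PHPOOL_MAX == 8, the loop below
	 * creates (illustrative summary):
	 *
	 *	phpool-0	plain headers, freed items on a linked list
	 *	phpool-64	headers with a bitmap covering up to 64 items
	 *	phpool-128	headers with a bitmap covering up to 128 items
	 *	...
	 *	phpool-4096	headers with a bitmap covering up to 4096 items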
563 */ 564 for (idx = 0; idx < PHPOOL_MAX; idx++) { 565 static char phpool_names[PHPOOL_MAX][6+1+6+1]; 566 int nelem; 567 size_t sz; 568 569 nelem = PHPOOL_FREELIST_NELEM(idx); 570 snprintf(phpool_names[idx], sizeof(phpool_names[idx]), 571 "phpool-%d", nelem); 572 sz = sizeof(struct pool_item_header); 573 if (nelem) { 574 sz = offsetof(struct pool_item_header, 575 ph_bitmap[howmany(nelem, BITMAP_SIZE)]); 576 } 577 pool_init(&phpool[idx], sz, 0, 0, 0, 578 phpool_names[idx], &pool_allocator_meta, IPL_VM); 579 } 580 581 size = sizeof(pcg_t) + 582 (PCG_NOBJECTS_NORMAL - 1) * sizeof(pcgpair_t); 583 pool_init(&pcg_normal_pool, size, coherency_unit, 0, 0, 584 "pcgnormal", &pool_allocator_meta, IPL_VM); 585 586 size = sizeof(pcg_t) + 587 (PCG_NOBJECTS_LARGE - 1) * sizeof(pcgpair_t); 588 pool_init(&pcg_large_pool, size, coherency_unit, 0, 0, 589 "pcglarge", &pool_allocator_meta, IPL_VM); 590 591 pool_init(&cache_pool, sizeof(struct pool_cache), coherency_unit, 592 0, 0, "pcache", &pool_allocator_meta, IPL_NONE); 593 594 pool_init(&cache_cpu_pool, sizeof(pool_cache_cpu_t), coherency_unit, 595 0, 0, "pcachecpu", &pool_allocator_meta, IPL_NONE); 596 } 597 598 static inline bool 599 pool_init_is_phinpage(const struct pool *pp) 600 { 601 size_t pagesize; 602 603 if (pp->pr_roflags & PR_PHINPAGE) { 604 return true; 605 } 606 if (pp->pr_roflags & (PR_NOTOUCH | PR_NOALIGN)) { 607 return false; 608 } 609 610 pagesize = pp->pr_alloc->pa_pagesz; 611 612 /* 613 * Threshold: the item size is below 1/16 of a page size, and below 614 * 8 times the page header size. The latter ensures we go off-page 615 * if the page header would make us waste a rather big item. 616 */ 617 if (pp->pr_size < MIN(pagesize / 16, PHSIZE * 8)) { 618 return true; 619 } 620 621 /* Put the header into the page if it doesn't waste any items. */ 622 if (pagesize / pp->pr_size == (pagesize - PHSIZE) / pp->pr_size) { 623 return true; 624 } 625 626 return false; 627 } 628 629 static inline bool 630 pool_init_is_usebmap(const struct pool *pp) 631 { 632 size_t bmapsize; 633 634 if (pp->pr_roflags & PR_NOTOUCH) { 635 return true; 636 } 637 638 /* 639 * If we're on-page, and the page header can already contain a bitmap 640 * big enough to cover all the items of the page, go with a bitmap. 641 */ 642 if (!(pp->pr_roflags & PR_PHINPAGE)) { 643 return false; 644 } 645 bmapsize = roundup(PHSIZE, pp->pr_align) - 646 offsetof(struct pool_item_header, ph_bitmap[0]); 647 KASSERT(bmapsize % sizeof(pool_item_bitmap_t) == 0); 648 if (pp->pr_itemsperpage <= bmapsize * CHAR_BIT) { 649 return true; 650 } 651 652 return false; 653 } 654 655 /* 656 * Initialize the given pool resource structure. 657 * 658 * We export this routine to allow other kernel parts to declare 659 * static pools that must be initialized before kmem(9) is available. 660 */ 661 void 662 pool_init(struct pool *pp, size_t size, u_int align, u_int ioff, int flags, 663 const char *wchan, struct pool_allocator *palloc, int ipl) 664 { 665 struct pool *pp1; 666 size_t prsize; 667 int itemspace, slack; 668 669 /* XXX ioff will be removed. */ 670 KASSERT(ioff == 0); 671 672 #ifdef DEBUG 673 if (__predict_true(!cold)) 674 mutex_enter(&pool_head_lock); 675 /* 676 * Check that the pool hasn't already been initialised and 677 * added to the list of all pools. 
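	 *
	 * For reference, a typical caller declares a static pool,
	 * initializes it once, and then allocates and frees items through
	 * it.  Minimal sketch, with hypothetical names ("foo", "foopl")
	 * that are not part of this file:
	 *
	 *	static struct pool foo_pool;
	 *
	 *	pool_init(&foo_pool, sizeof(struct foo), 0, 0, 0, "foopl",
	 *	    NULL, IPL_NONE);
	 *	...
	 *	struct foo *f = pool_get(&foo_pool, PR_WAITOK);
	 *	...
	 *	pool_put(&foo_pool, f);
	 *	...
	 *	pool_destroy(&foo_pool);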
678 */ 679 TAILQ_FOREACH(pp1, &pool_head, pr_poollist) { 680 if (pp == pp1) 681 panic("%s: [%s] already initialised", __func__, 682 wchan); 683 } 684 if (__predict_true(!cold)) 685 mutex_exit(&pool_head_lock); 686 #endif 687 688 if (palloc == NULL) 689 palloc = &pool_allocator_kmem; 690 691 if (!cold) 692 mutex_enter(&pool_allocator_lock); 693 if (palloc->pa_refcnt++ == 0) { 694 if (palloc->pa_pagesz == 0) 695 palloc->pa_pagesz = PAGE_SIZE; 696 697 TAILQ_INIT(&palloc->pa_list); 698 699 mutex_init(&palloc->pa_lock, MUTEX_DEFAULT, IPL_VM); 700 palloc->pa_pagemask = ~(palloc->pa_pagesz - 1); 701 palloc->pa_pageshift = ffs(palloc->pa_pagesz) - 1; 702 } 703 if (!cold) 704 mutex_exit(&pool_allocator_lock); 705 706 if (align == 0) 707 align = ALIGN(1); 708 709 prsize = size; 710 if ((flags & PR_NOTOUCH) == 0 && prsize < sizeof(struct pool_item)) 711 prsize = sizeof(struct pool_item); 712 713 prsize = roundup(prsize, align); 714 KASSERTMSG((prsize <= palloc->pa_pagesz), 715 "%s: [%s] pool item size (%zu) larger than page size (%u)", 716 __func__, wchan, prsize, palloc->pa_pagesz); 717 718 /* 719 * Initialize the pool structure. 720 */ 721 LIST_INIT(&pp->pr_emptypages); 722 LIST_INIT(&pp->pr_fullpages); 723 LIST_INIT(&pp->pr_partpages); 724 pp->pr_cache = NULL; 725 pp->pr_curpage = NULL; 726 pp->pr_npages = 0; 727 pp->pr_minitems = 0; 728 pp->pr_minpages = 0; 729 pp->pr_maxpages = UINT_MAX; 730 pp->pr_roflags = flags; 731 pp->pr_flags = 0; 732 pp->pr_size = prsize; 733 pp->pr_reqsize = size; 734 pp->pr_align = align; 735 pp->pr_wchan = wchan; 736 pp->pr_alloc = palloc; 737 pp->pr_poolid = atomic_inc_uint_nv(&poolid_counter); 738 pp->pr_nitems = 0; 739 pp->pr_nout = 0; 740 pp->pr_hardlimit = UINT_MAX; 741 pp->pr_hardlimit_warning = NULL; 742 pp->pr_hardlimit_ratecap.tv_sec = 0; 743 pp->pr_hardlimit_ratecap.tv_usec = 0; 744 pp->pr_hardlimit_warning_last.tv_sec = 0; 745 pp->pr_hardlimit_warning_last.tv_usec = 0; 746 pp->pr_drain_hook = NULL; 747 pp->pr_drain_hook_arg = NULL; 748 pp->pr_freecheck = NULL; 749 pool_redzone_init(pp, size); 750 pool_quarantine_init(pp); 751 752 /* 753 * Decide whether to put the page header off-page to avoid wasting too 754 * large a part of the page or too big an item. Off-page page headers 755 * go on a hash table, so we can match a returned item with its header 756 * based on the page address. 757 */ 758 if (pool_init_is_phinpage(pp)) { 759 /* Use the beginning of the page for the page header */ 760 itemspace = palloc->pa_pagesz - roundup(PHSIZE, align); 761 pp->pr_itemoffset = roundup(PHSIZE, align); 762 pp->pr_roflags |= PR_PHINPAGE; 763 } else { 764 /* The page header will be taken from our page header pool */ 765 itemspace = palloc->pa_pagesz; 766 pp->pr_itemoffset = 0; 767 SPLAY_INIT(&pp->pr_phtree); 768 } 769 770 pp->pr_itemsperpage = itemspace / pp->pr_size; 771 KASSERT(pp->pr_itemsperpage != 0); 772 773 /* 774 * Decide whether to use a bitmap or a linked list to manage freed 775 * items. 776 */ 777 if (pool_init_is_usebmap(pp)) { 778 pp->pr_roflags |= PR_USEBMAP; 779 } 780 781 /* 782 * If we're off-page and use a bitmap, choose the appropriate pool to 783 * allocate page headers, whose size varies depending on the bitmap. If 784 * we're just off-page, take the first pool, no extra size. If we're 785 * on-page, nothing to do. 
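	 *
	 * Worked example (illustrative): with BITMAP_SIZE == 32, a pool
	 * whose pr_itemsperpage is 100 scans idx 0 (0 items), idx 1 (64)
	 * and stops at idx 2 (128), so its off-page headers come from
	 * phpool[2], whose bitmap can track up to 128 items.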
 */
	if (!(pp->pr_roflags & PR_PHINPAGE) && (pp->pr_roflags & PR_USEBMAP)) {
		int idx;

		for (idx = 0; pp->pr_itemsperpage > PHPOOL_FREELIST_NELEM(idx);
		    idx++) {
			/* nothing */
		}
		if (idx >= PHPOOL_MAX) {
			/*
			 * If you see this panic, consider tweaking
			 * PHPOOL_MAX and PHPOOL_FREELIST_NELEM.
			 */
			panic("%s: [%s] too large itemsperpage(%d) for "
			    "PR_USEBMAP", __func__,
			    pp->pr_wchan, pp->pr_itemsperpage);
		}
		pp->pr_phpool = &phpool[idx];
	} else if (!(pp->pr_roflags & PR_PHINPAGE)) {
		pp->pr_phpool = &phpool[0];
	} else {
		pp->pr_phpool = NULL;
	}

	/*
	 * Use the slack between the chunks and the page header
	 * for "cache coloring".
	 */
	slack = itemspace - pp->pr_itemsperpage * pp->pr_size;
	pp->pr_maxcolor = rounddown(slack, align);
	pp->pr_curcolor = 0;

	pp->pr_nget = 0;
	pp->pr_nfail = 0;
	pp->pr_nput = 0;
	pp->pr_npagealloc = 0;
	pp->pr_npagefree = 0;
	pp->pr_hiwat = 0;
	pp->pr_nidle = 0;
	pp->pr_refcnt = 0;

	mutex_init(&pp->pr_lock, MUTEX_DEFAULT, ipl);
	cv_init(&pp->pr_cv, wchan);
	pp->pr_ipl = ipl;

	/* Insert into the list of all pools. */
	if (!cold)
		mutex_enter(&pool_head_lock);
	TAILQ_FOREACH(pp1, &pool_head, pr_poollist) {
		if (strcmp(pp1->pr_wchan, pp->pr_wchan) > 0)
			break;
	}
	if (pp1 == NULL)
		TAILQ_INSERT_TAIL(&pool_head, pp, pr_poollist);
	else
		TAILQ_INSERT_BEFORE(pp1, pp, pr_poollist);
	if (!cold)
		mutex_exit(&pool_head_lock);

	/* Insert this into the list of pools using this allocator. */
	if (!cold)
		mutex_enter(&palloc->pa_lock);
	TAILQ_INSERT_TAIL(&palloc->pa_list, pp, pr_alloc_list);
	if (!cold)
		mutex_exit(&palloc->pa_lock);
}

/*
 * De-commission a pool resource.
 */
void
pool_destroy(struct pool *pp)
{
	struct pool_pagelist pq;
	struct pool_item_header *ph;

	pool_quarantine_flush(pp);

	/* Remove from global pool list */
	mutex_enter(&pool_head_lock);
	while (pp->pr_refcnt != 0)
		cv_wait(&pool_busy, &pool_head_lock);
	TAILQ_REMOVE(&pool_head, pp, pr_poollist);
	if (drainpp == pp)
		drainpp = NULL;
	mutex_exit(&pool_head_lock);

	/* Remove this pool from its allocator's list of pools.
*/ 874 mutex_enter(&pp->pr_alloc->pa_lock); 875 TAILQ_REMOVE(&pp->pr_alloc->pa_list, pp, pr_alloc_list); 876 mutex_exit(&pp->pr_alloc->pa_lock); 877 878 mutex_enter(&pool_allocator_lock); 879 if (--pp->pr_alloc->pa_refcnt == 0) 880 mutex_destroy(&pp->pr_alloc->pa_lock); 881 mutex_exit(&pool_allocator_lock); 882 883 mutex_enter(&pp->pr_lock); 884 885 KASSERT(pp->pr_cache == NULL); 886 KASSERTMSG((pp->pr_nout == 0), 887 "%s: [%s] pool busy: still out: %u", __func__, pp->pr_wchan, 888 pp->pr_nout); 889 KASSERT(LIST_EMPTY(&pp->pr_fullpages)); 890 KASSERT(LIST_EMPTY(&pp->pr_partpages)); 891 892 /* Remove all pages */ 893 LIST_INIT(&pq); 894 while ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL) 895 pr_rmpage(pp, ph, &pq); 896 897 mutex_exit(&pp->pr_lock); 898 899 pr_pagelist_free(pp, &pq); 900 cv_destroy(&pp->pr_cv); 901 mutex_destroy(&pp->pr_lock); 902 } 903 904 void 905 pool_set_drain_hook(struct pool *pp, void (*fn)(void *, int), void *arg) 906 { 907 908 /* XXX no locking -- must be used just after pool_init() */ 909 KASSERTMSG((pp->pr_drain_hook == NULL), 910 "%s: [%s] already set", __func__, pp->pr_wchan); 911 pp->pr_drain_hook = fn; 912 pp->pr_drain_hook_arg = arg; 913 } 914 915 static struct pool_item_header * 916 pool_alloc_item_header(struct pool *pp, void *storage, int flags) 917 { 918 struct pool_item_header *ph; 919 920 if ((pp->pr_roflags & PR_PHINPAGE) != 0) 921 ph = storage; 922 else 923 ph = pool_get(pp->pr_phpool, flags); 924 925 return ph; 926 } 927 928 /* 929 * Grab an item from the pool. 930 */ 931 void * 932 pool_get(struct pool *pp, int flags) 933 { 934 struct pool_item_header *ph; 935 void *v; 936 937 KASSERT(!(flags & PR_NOWAIT) != !(flags & PR_WAITOK)); 938 KASSERTMSG((pp->pr_itemsperpage != 0), 939 "%s: [%s] pr_itemsperpage is zero, " 940 "pool not initialized?", __func__, pp->pr_wchan); 941 KASSERTMSG((!(cpu_intr_p() || cpu_softintr_p()) 942 || pp->pr_ipl != IPL_NONE || cold || panicstr != NULL), 943 "%s: [%s] is IPL_NONE, but called from interrupt context", 944 __func__, pp->pr_wchan); 945 if (flags & PR_WAITOK) { 946 ASSERT_SLEEPABLE(); 947 } 948 949 mutex_enter(&pp->pr_lock); 950 startover: 951 /* 952 * Check to see if we've reached the hard limit. If we have, 953 * and we can wait, then wait until an item has been returned to 954 * the pool. 955 */ 956 KASSERTMSG((pp->pr_nout <= pp->pr_hardlimit), 957 "%s: %s: crossed hard limit", __func__, pp->pr_wchan); 958 if (__predict_false(pp->pr_nout == pp->pr_hardlimit)) { 959 if (pp->pr_drain_hook != NULL) { 960 /* 961 * Since the drain hook is going to free things 962 * back to the pool, unlock, call the hook, re-lock, 963 * and check the hardlimit condition again. 964 */ 965 mutex_exit(&pp->pr_lock); 966 (*pp->pr_drain_hook)(pp->pr_drain_hook_arg, flags); 967 mutex_enter(&pp->pr_lock); 968 if (pp->pr_nout < pp->pr_hardlimit) 969 goto startover; 970 } 971 972 if ((flags & PR_WAITOK) && !(flags & PR_LIMITFAIL)) { 973 /* 974 * XXX: A warning isn't logged in this case. Should 975 * it be? 976 */ 977 pp->pr_flags |= PR_WANTED; 978 do { 979 cv_wait(&pp->pr_cv, &pp->pr_lock); 980 } while (pp->pr_flags & PR_WANTED); 981 goto startover; 982 } 983 984 /* 985 * Log a message that the hard limit has been hit. 
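	 * The message and its rate limit come from pool_sethardlimit();
	 * a hypothetical caller might configure, for example:
	 *
	 *	pool_sethardlimit(&foo_pool, 1024,
	 *	    "foo_pool: hard limit reached", 60);
	 *
	 * which would log the warning at most once per 60 seconds once
	 * 1024 items are outstanding.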
986 */ 987 if (pp->pr_hardlimit_warning != NULL && 988 ratecheck(&pp->pr_hardlimit_warning_last, 989 &pp->pr_hardlimit_ratecap)) 990 log(LOG_ERR, "%s\n", pp->pr_hardlimit_warning); 991 992 pp->pr_nfail++; 993 994 mutex_exit(&pp->pr_lock); 995 KASSERT((flags & (PR_NOWAIT|PR_LIMITFAIL)) != 0); 996 return NULL; 997 } 998 999 /* 1000 * The convention we use is that if `curpage' is not NULL, then 1001 * it points at a non-empty bucket. In particular, `curpage' 1002 * never points at a page header which has PR_PHINPAGE set and 1003 * has no items in its bucket. 1004 */ 1005 if ((ph = pp->pr_curpage) == NULL) { 1006 int error; 1007 1008 KASSERTMSG((pp->pr_nitems == 0), 1009 "%s: [%s] curpage NULL, inconsistent nitems %u", 1010 __func__, pp->pr_wchan, pp->pr_nitems); 1011 1012 /* 1013 * Call the back-end page allocator for more memory. 1014 * Release the pool lock, as the back-end page allocator 1015 * may block. 1016 */ 1017 error = pool_grow(pp, flags); 1018 if (error != 0) { 1019 /* 1020 * pool_grow aborts when another thread 1021 * is allocating a new page. Retry if it 1022 * waited for it. 1023 */ 1024 if (error == ERESTART) 1025 goto startover; 1026 1027 /* 1028 * We were unable to allocate a page or item 1029 * header, but we released the lock during 1030 * allocation, so perhaps items were freed 1031 * back to the pool. Check for this case. 1032 */ 1033 if (pp->pr_curpage != NULL) 1034 goto startover; 1035 1036 pp->pr_nfail++; 1037 mutex_exit(&pp->pr_lock); 1038 KASSERT((flags & (PR_WAITOK|PR_NOWAIT)) == PR_NOWAIT); 1039 return NULL; 1040 } 1041 1042 /* Start the allocation process over. */ 1043 goto startover; 1044 } 1045 if (pp->pr_roflags & PR_USEBMAP) { 1046 KASSERTMSG((ph->ph_nmissing < pp->pr_itemsperpage), 1047 "%s: [%s] pool page empty", __func__, pp->pr_wchan); 1048 v = pr_item_bitmap_get(pp, ph); 1049 } else { 1050 v = pr_item_linkedlist_get(pp, ph); 1051 } 1052 pp->pr_nitems--; 1053 pp->pr_nout++; 1054 if (ph->ph_nmissing == 0) { 1055 KASSERT(pp->pr_nidle > 0); 1056 pp->pr_nidle--; 1057 1058 /* 1059 * This page was previously empty. Move it to the list of 1060 * partially-full pages. This page is already curpage. 1061 */ 1062 LIST_REMOVE(ph, ph_pagelist); 1063 LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist); 1064 } 1065 ph->ph_nmissing++; 1066 if (ph->ph_nmissing == pp->pr_itemsperpage) { 1067 KASSERTMSG(((pp->pr_roflags & PR_USEBMAP) || 1068 LIST_EMPTY(&ph->ph_itemlist)), 1069 "%s: [%s] nmissing (%u) inconsistent", __func__, 1070 pp->pr_wchan, ph->ph_nmissing); 1071 /* 1072 * This page is now full. Move it to the full list 1073 * and select a new current page. 1074 */ 1075 LIST_REMOVE(ph, ph_pagelist); 1076 LIST_INSERT_HEAD(&pp->pr_fullpages, ph, ph_pagelist); 1077 pool_update_curpage(pp); 1078 } 1079 1080 pp->pr_nget++; 1081 1082 /* 1083 * If we have a low water mark and we are now below that low 1084 * water mark, add more items to the pool. 1085 */ 1086 if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) { 1087 /* 1088 * XXX: Should we log a warning? Should we set up a timeout 1089 * to try again in a second or so? The latter could break 1090 * a caller's assumptions about interrupt protection, etc. 1091 */ 1092 } 1093 1094 mutex_exit(&pp->pr_lock); 1095 KASSERT((((vaddr_t)v) & (pp->pr_align - 1)) == 0); 1096 FREECHECK_OUT(&pp->pr_freecheck, v); 1097 pool_redzone_fill(pp, v); 1098 if (flags & PR_ZERO) 1099 memset(v, 0, pp->pr_reqsize); 1100 else 1101 pool_kleak_fill(pp, v); 1102 return v; 1103 } 1104 1105 /* 1106 * Internal version of pool_put(). 
Pool is already locked/entered. 1107 */ 1108 static void 1109 pool_do_put(struct pool *pp, void *v, struct pool_pagelist *pq) 1110 { 1111 struct pool_item_header *ph; 1112 1113 KASSERT(mutex_owned(&pp->pr_lock)); 1114 pool_redzone_check(pp, v); 1115 FREECHECK_IN(&pp->pr_freecheck, v); 1116 LOCKDEBUG_MEM_CHECK(v, pp->pr_size); 1117 1118 KASSERTMSG((pp->pr_nout > 0), 1119 "%s: [%s] putting with none out", __func__, pp->pr_wchan); 1120 1121 if (__predict_false((ph = pr_find_pagehead(pp, v)) == NULL)) { 1122 panic("%s: [%s] page header missing", __func__, pp->pr_wchan); 1123 } 1124 1125 /* 1126 * Return to item list. 1127 */ 1128 if (pp->pr_roflags & PR_USEBMAP) { 1129 pr_item_bitmap_put(pp, ph, v); 1130 } else { 1131 pr_item_linkedlist_put(pp, ph, v); 1132 } 1133 KDASSERT(ph->ph_nmissing != 0); 1134 ph->ph_nmissing--; 1135 pp->pr_nput++; 1136 pp->pr_nitems++; 1137 pp->pr_nout--; 1138 1139 /* Cancel "pool empty" condition if it exists */ 1140 if (pp->pr_curpage == NULL) 1141 pp->pr_curpage = ph; 1142 1143 if (pp->pr_flags & PR_WANTED) { 1144 pp->pr_flags &= ~PR_WANTED; 1145 cv_broadcast(&pp->pr_cv); 1146 } 1147 1148 /* 1149 * If this page is now empty, do one of two things: 1150 * 1151 * (1) If we have more pages than the page high water mark, 1152 * free the page back to the system. ONLY CONSIDER 1153 * FREEING BACK A PAGE IF WE HAVE MORE THAN OUR MINIMUM PAGE 1154 * CLAIM. 1155 * 1156 * (2) Otherwise, move the page to the empty page list. 1157 * 1158 * Either way, select a new current page (so we use a partially-full 1159 * page if one is available). 1160 */ 1161 if (ph->ph_nmissing == 0) { 1162 pp->pr_nidle++; 1163 if (pp->pr_npages > pp->pr_minpages && 1164 pp->pr_npages > pp->pr_maxpages) { 1165 pr_rmpage(pp, ph, pq); 1166 } else { 1167 LIST_REMOVE(ph, ph_pagelist); 1168 LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist); 1169 1170 /* 1171 * Update the timestamp on the page. A page must 1172 * be idle for some period of time before it can 1173 * be reclaimed by the pagedaemon. This minimizes 1174 * ping-pong'ing for memory. 1175 * 1176 * note for 64-bit time_t: truncating to 32-bit is not 1177 * a problem for our usage. 1178 */ 1179 ph->ph_time = time_uptime; 1180 } 1181 pool_update_curpage(pp); 1182 } 1183 1184 /* 1185 * If the page was previously completely full, move it to the 1186 * partially-full list and make it the current page. The next 1187 * allocation will get the item from this page, instead of 1188 * further fragmenting the pool. 1189 */ 1190 else if (ph->ph_nmissing == (pp->pr_itemsperpage - 1)) { 1191 LIST_REMOVE(ph, ph_pagelist); 1192 LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist); 1193 pp->pr_curpage = ph; 1194 } 1195 } 1196 1197 void 1198 pool_put(struct pool *pp, void *v) 1199 { 1200 struct pool_pagelist pq; 1201 1202 LIST_INIT(&pq); 1203 1204 mutex_enter(&pp->pr_lock); 1205 if (!pool_put_quarantine(pp, v, &pq)) { 1206 pool_do_put(pp, v, &pq); 1207 } 1208 mutex_exit(&pp->pr_lock); 1209 1210 pr_pagelist_free(pp, &pq); 1211 } 1212 1213 /* 1214 * pool_grow: grow a pool by a page. 1215 * 1216 * => called with pool locked. 1217 * => unlock and relock the pool. 1218 * => return with pool locked. 1219 */ 1220 1221 static int 1222 pool_grow(struct pool *pp, int flags) 1223 { 1224 struct pool_item_header *ph; 1225 char *storage; 1226 1227 /* 1228 * If there's a pool_grow in progress, wait for it to complete 1229 * and try again from the top. 
1230 */ 1231 if (pp->pr_flags & PR_GROWING) { 1232 if (flags & PR_WAITOK) { 1233 do { 1234 cv_wait(&pp->pr_cv, &pp->pr_lock); 1235 } while (pp->pr_flags & PR_GROWING); 1236 return ERESTART; 1237 } else { 1238 if (pp->pr_flags & PR_GROWINGNOWAIT) { 1239 /* 1240 * This needs an unlock/relock dance so 1241 * that the other caller has a chance to 1242 * run and actually do the thing. Note 1243 * that this is effectively a busy-wait. 1244 */ 1245 mutex_exit(&pp->pr_lock); 1246 mutex_enter(&pp->pr_lock); 1247 return ERESTART; 1248 } 1249 return EWOULDBLOCK; 1250 } 1251 } 1252 pp->pr_flags |= PR_GROWING; 1253 if (flags & PR_WAITOK) 1254 mutex_exit(&pp->pr_lock); 1255 else 1256 pp->pr_flags |= PR_GROWINGNOWAIT; 1257 1258 storage = pool_allocator_alloc(pp, flags); 1259 if (__predict_false(storage == NULL)) 1260 goto out; 1261 1262 ph = pool_alloc_item_header(pp, storage, flags); 1263 if (__predict_false(ph == NULL)) { 1264 pool_allocator_free(pp, storage); 1265 goto out; 1266 } 1267 1268 if (flags & PR_WAITOK) 1269 mutex_enter(&pp->pr_lock); 1270 pool_prime_page(pp, storage, ph); 1271 pp->pr_npagealloc++; 1272 KASSERT(pp->pr_flags & PR_GROWING); 1273 pp->pr_flags &= ~(PR_GROWING|PR_GROWINGNOWAIT); 1274 /* 1275 * If anyone was waiting for pool_grow, notify them that we 1276 * may have just done it. 1277 */ 1278 cv_broadcast(&pp->pr_cv); 1279 return 0; 1280 out: 1281 if (flags & PR_WAITOK) 1282 mutex_enter(&pp->pr_lock); 1283 KASSERT(pp->pr_flags & PR_GROWING); 1284 pp->pr_flags &= ~(PR_GROWING|PR_GROWINGNOWAIT); 1285 return ENOMEM; 1286 } 1287 1288 /* 1289 * Add N items to the pool. 1290 */ 1291 int 1292 pool_prime(struct pool *pp, int n) 1293 { 1294 int newpages; 1295 int error = 0; 1296 1297 mutex_enter(&pp->pr_lock); 1298 1299 newpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage; 1300 1301 while (newpages > 0) { 1302 error = pool_grow(pp, PR_NOWAIT); 1303 if (error) { 1304 if (error == ERESTART) 1305 continue; 1306 break; 1307 } 1308 pp->pr_minpages++; 1309 newpages--; 1310 } 1311 1312 if (pp->pr_minpages >= pp->pr_maxpages) 1313 pp->pr_maxpages = pp->pr_minpages + 1; /* XXX */ 1314 1315 mutex_exit(&pp->pr_lock); 1316 return error; 1317 } 1318 1319 /* 1320 * Add a page worth of items to the pool. 1321 * 1322 * Note, we must be called with the pool descriptor LOCKED. 1323 */ 1324 static void 1325 pool_prime_page(struct pool *pp, void *storage, struct pool_item_header *ph) 1326 { 1327 const unsigned int align = pp->pr_align; 1328 struct pool_item *pi; 1329 void *cp = storage; 1330 int n; 1331 1332 KASSERT(mutex_owned(&pp->pr_lock)); 1333 KASSERTMSG(((pp->pr_roflags & PR_NOALIGN) || 1334 (((uintptr_t)cp & (pp->pr_alloc->pa_pagesz - 1)) == 0)), 1335 "%s: [%s] unaligned page: %p", __func__, pp->pr_wchan, cp); 1336 1337 /* 1338 * Insert page header. 1339 */ 1340 LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist); 1341 LIST_INIT(&ph->ph_itemlist); 1342 ph->ph_page = storage; 1343 ph->ph_nmissing = 0; 1344 ph->ph_time = time_uptime; 1345 if (pp->pr_roflags & PR_PHINPAGE) 1346 ph->ph_poolid = pp->pr_poolid; 1347 else 1348 SPLAY_INSERT(phtree, &pp->pr_phtree, ph); 1349 1350 pp->pr_nidle++; 1351 1352 /* 1353 * The item space starts after the on-page header, if any. 1354 */ 1355 ph->ph_off = pp->pr_itemoffset; 1356 1357 /* 1358 * Color this page. 
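	 *
	 * The offset advances by the pool's alignment on every new page and
	 * wraps at pr_maxcolor.  Illustrative example: with align == 64 and
	 * pr_maxcolor == 192, successive pages place their first item at
	 * offsets +0, +64, +128, +192, then wrap back to +0, spreading the
	 * items of different pages across cache lines.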
1359 */ 1360 ph->ph_off += pp->pr_curcolor; 1361 cp = (char *)cp + ph->ph_off; 1362 if ((pp->pr_curcolor += align) > pp->pr_maxcolor) 1363 pp->pr_curcolor = 0; 1364 1365 KASSERT((((vaddr_t)cp) & (align - 1)) == 0); 1366 1367 /* 1368 * Insert remaining chunks on the bucket list. 1369 */ 1370 n = pp->pr_itemsperpage; 1371 pp->pr_nitems += n; 1372 1373 if (pp->pr_roflags & PR_USEBMAP) { 1374 pr_item_bitmap_init(pp, ph); 1375 } else { 1376 while (n--) { 1377 pi = (struct pool_item *)cp; 1378 1379 KASSERT((((vaddr_t)pi) & (align - 1)) == 0); 1380 1381 /* Insert on page list */ 1382 LIST_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list); 1383 #ifdef POOL_CHECK_MAGIC 1384 pi->pi_magic = PI_MAGIC; 1385 #endif 1386 cp = (char *)cp + pp->pr_size; 1387 1388 KASSERT((((vaddr_t)cp) & (align - 1)) == 0); 1389 } 1390 } 1391 1392 /* 1393 * If the pool was depleted, point at the new page. 1394 */ 1395 if (pp->pr_curpage == NULL) 1396 pp->pr_curpage = ph; 1397 1398 if (++pp->pr_npages > pp->pr_hiwat) 1399 pp->pr_hiwat = pp->pr_npages; 1400 } 1401 1402 /* 1403 * Used by pool_get() when nitems drops below the low water mark. This 1404 * is used to catch up pr_nitems with the low water mark. 1405 * 1406 * Note 1, we never wait for memory here, we let the caller decide what to do. 1407 * 1408 * Note 2, we must be called with the pool already locked, and we return 1409 * with it locked. 1410 */ 1411 static int 1412 pool_catchup(struct pool *pp) 1413 { 1414 int error = 0; 1415 1416 while (POOL_NEEDS_CATCHUP(pp)) { 1417 error = pool_grow(pp, PR_NOWAIT); 1418 if (error) { 1419 if (error == ERESTART) 1420 continue; 1421 break; 1422 } 1423 } 1424 return error; 1425 } 1426 1427 static void 1428 pool_update_curpage(struct pool *pp) 1429 { 1430 1431 pp->pr_curpage = LIST_FIRST(&pp->pr_partpages); 1432 if (pp->pr_curpage == NULL) { 1433 pp->pr_curpage = LIST_FIRST(&pp->pr_emptypages); 1434 } 1435 KASSERT((pp->pr_curpage == NULL && pp->pr_nitems == 0) || 1436 (pp->pr_curpage != NULL && pp->pr_nitems > 0)); 1437 } 1438 1439 void 1440 pool_setlowat(struct pool *pp, int n) 1441 { 1442 1443 mutex_enter(&pp->pr_lock); 1444 1445 pp->pr_minitems = n; 1446 pp->pr_minpages = (n == 0) 1447 ? 0 1448 : roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage; 1449 1450 /* Make sure we're caught up with the newly-set low water mark. */ 1451 if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) { 1452 /* 1453 * XXX: Should we log a warning? Should we set up a timeout 1454 * to try again in a second or so? The latter could break 1455 * a caller's assumptions about interrupt protection, etc. 1456 */ 1457 } 1458 1459 mutex_exit(&pp->pr_lock); 1460 } 1461 1462 void 1463 pool_sethiwat(struct pool *pp, int n) 1464 { 1465 1466 mutex_enter(&pp->pr_lock); 1467 1468 pp->pr_maxpages = (n == 0) 1469 ? 0 1470 : roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage; 1471 1472 mutex_exit(&pp->pr_lock); 1473 } 1474 1475 void 1476 pool_sethardlimit(struct pool *pp, int n, const char *warnmess, int ratecap) 1477 { 1478 1479 mutex_enter(&pp->pr_lock); 1480 1481 pp->pr_hardlimit = n; 1482 pp->pr_hardlimit_warning = warnmess; 1483 pp->pr_hardlimit_ratecap.tv_sec = ratecap; 1484 pp->pr_hardlimit_warning_last.tv_sec = 0; 1485 pp->pr_hardlimit_warning_last.tv_usec = 0; 1486 1487 /* 1488 * In-line version of pool_sethiwat(), because we don't want to 1489 * release the lock. 1490 */ 1491 pp->pr_maxpages = (n == 0) 1492 ? 
0 1493 : roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage; 1494 1495 mutex_exit(&pp->pr_lock); 1496 } 1497 1498 /* 1499 * Release all complete pages that have not been used recently. 1500 * 1501 * Must not be called from interrupt context. 1502 */ 1503 int 1504 pool_reclaim(struct pool *pp) 1505 { 1506 struct pool_item_header *ph, *phnext; 1507 struct pool_pagelist pq; 1508 uint32_t curtime; 1509 bool klock; 1510 int rv; 1511 1512 KASSERT(!cpu_intr_p() && !cpu_softintr_p()); 1513 1514 if (pp->pr_drain_hook != NULL) { 1515 /* 1516 * The drain hook must be called with the pool unlocked. 1517 */ 1518 (*pp->pr_drain_hook)(pp->pr_drain_hook_arg, PR_NOWAIT); 1519 } 1520 1521 /* 1522 * XXXSMP Because we do not want to cause non-MPSAFE code 1523 * to block. 1524 */ 1525 if (pp->pr_ipl == IPL_SOFTNET || pp->pr_ipl == IPL_SOFTCLOCK || 1526 pp->pr_ipl == IPL_SOFTSERIAL) { 1527 KERNEL_LOCK(1, NULL); 1528 klock = true; 1529 } else 1530 klock = false; 1531 1532 /* Reclaim items from the pool's cache (if any). */ 1533 if (pp->pr_cache != NULL) 1534 pool_cache_invalidate(pp->pr_cache); 1535 1536 if (mutex_tryenter(&pp->pr_lock) == 0) { 1537 if (klock) { 1538 KERNEL_UNLOCK_ONE(NULL); 1539 } 1540 return 0; 1541 } 1542 1543 LIST_INIT(&pq); 1544 1545 curtime = time_uptime; 1546 1547 for (ph = LIST_FIRST(&pp->pr_emptypages); ph != NULL; ph = phnext) { 1548 phnext = LIST_NEXT(ph, ph_pagelist); 1549 1550 /* Check our minimum page claim */ 1551 if (pp->pr_npages <= pp->pr_minpages) 1552 break; 1553 1554 KASSERT(ph->ph_nmissing == 0); 1555 if (curtime - ph->ph_time < pool_inactive_time) 1556 continue; 1557 1558 /* 1559 * If freeing this page would put us below 1560 * the low water mark, stop now. 1561 */ 1562 if ((pp->pr_nitems - pp->pr_itemsperpage) < 1563 pp->pr_minitems) 1564 break; 1565 1566 pr_rmpage(pp, ph, &pq); 1567 } 1568 1569 mutex_exit(&pp->pr_lock); 1570 1571 if (LIST_EMPTY(&pq)) 1572 rv = 0; 1573 else { 1574 pr_pagelist_free(pp, &pq); 1575 rv = 1; 1576 } 1577 1578 if (klock) { 1579 KERNEL_UNLOCK_ONE(NULL); 1580 } 1581 1582 return rv; 1583 } 1584 1585 /* 1586 * Drain pools, one at a time. The drained pool is returned within ppp. 1587 * 1588 * Note, must never be called from interrupt context. 1589 */ 1590 bool 1591 pool_drain(struct pool **ppp) 1592 { 1593 bool reclaimed; 1594 struct pool *pp; 1595 1596 KASSERT(!TAILQ_EMPTY(&pool_head)); 1597 1598 pp = NULL; 1599 1600 /* Find next pool to drain, and add a reference. */ 1601 mutex_enter(&pool_head_lock); 1602 do { 1603 if (drainpp == NULL) { 1604 drainpp = TAILQ_FIRST(&pool_head); 1605 } 1606 if (drainpp != NULL) { 1607 pp = drainpp; 1608 drainpp = TAILQ_NEXT(pp, pr_poollist); 1609 } 1610 /* 1611 * Skip completely idle pools. We depend on at least 1612 * one pool in the system being active. 1613 */ 1614 } while (pp == NULL || pp->pr_npages == 0); 1615 pp->pr_refcnt++; 1616 mutex_exit(&pool_head_lock); 1617 1618 /* Drain the cache (if any) and pool.. */ 1619 reclaimed = pool_reclaim(pp); 1620 1621 /* Finally, unlock the pool. */ 1622 mutex_enter(&pool_head_lock); 1623 pp->pr_refcnt--; 1624 cv_broadcast(&pool_busy); 1625 mutex_exit(&pool_head_lock); 1626 1627 if (ppp != NULL) 1628 *ppp = pp; 1629 1630 return reclaimed; 1631 } 1632 1633 /* 1634 * Calculate the total number of pages consumed by pools. 
1635 */ 1636 int 1637 pool_totalpages(void) 1638 { 1639 1640 mutex_enter(&pool_head_lock); 1641 int pages = pool_totalpages_locked(); 1642 mutex_exit(&pool_head_lock); 1643 1644 return pages; 1645 } 1646 1647 int 1648 pool_totalpages_locked(void) 1649 { 1650 struct pool *pp; 1651 uint64_t total = 0; 1652 1653 TAILQ_FOREACH(pp, &pool_head, pr_poollist) { 1654 uint64_t bytes = pp->pr_npages * pp->pr_alloc->pa_pagesz; 1655 1656 if ((pp->pr_roflags & PR_RECURSIVE) != 0) 1657 bytes -= (pp->pr_nout * pp->pr_size); 1658 total += bytes; 1659 } 1660 1661 return atop(total); 1662 } 1663 1664 /* 1665 * Diagnostic helpers. 1666 */ 1667 1668 void 1669 pool_printall(const char *modif, void (*pr)(const char *, ...)) 1670 { 1671 struct pool *pp; 1672 1673 TAILQ_FOREACH(pp, &pool_head, pr_poollist) { 1674 pool_printit(pp, modif, pr); 1675 } 1676 } 1677 1678 void 1679 pool_printit(struct pool *pp, const char *modif, void (*pr)(const char *, ...)) 1680 { 1681 1682 if (pp == NULL) { 1683 (*pr)("Must specify a pool to print.\n"); 1684 return; 1685 } 1686 1687 pool_print1(pp, modif, pr); 1688 } 1689 1690 static void 1691 pool_print_pagelist(struct pool *pp, struct pool_pagelist *pl, 1692 void (*pr)(const char *, ...)) 1693 { 1694 struct pool_item_header *ph; 1695 1696 LIST_FOREACH(ph, pl, ph_pagelist) { 1697 (*pr)("\t\tpage %p, nmissing %d, time %" PRIu32 "\n", 1698 ph->ph_page, ph->ph_nmissing, ph->ph_time); 1699 #ifdef POOL_CHECK_MAGIC 1700 struct pool_item *pi; 1701 if (!(pp->pr_roflags & PR_USEBMAP)) { 1702 LIST_FOREACH(pi, &ph->ph_itemlist, pi_list) { 1703 if (pi->pi_magic != PI_MAGIC) { 1704 (*pr)("\t\t\titem %p, magic 0x%x\n", 1705 pi, pi->pi_magic); 1706 } 1707 } 1708 } 1709 #endif 1710 } 1711 } 1712 1713 static void 1714 pool_print1(struct pool *pp, const char *modif, void (*pr)(const char *, ...)) 1715 { 1716 struct pool_item_header *ph; 1717 pool_cache_t pc; 1718 pcg_t *pcg; 1719 pool_cache_cpu_t *cc; 1720 uint64_t cpuhit, cpumiss; 1721 int i, print_log = 0, print_pagelist = 0, print_cache = 0; 1722 char c; 1723 1724 while ((c = *modif++) != '\0') { 1725 if (c == 'l') 1726 print_log = 1; 1727 if (c == 'p') 1728 print_pagelist = 1; 1729 if (c == 'c') 1730 print_cache = 1; 1731 } 1732 1733 if ((pc = pp->pr_cache) != NULL) { 1734 (*pr)("POOL CACHE"); 1735 } else { 1736 (*pr)("POOL"); 1737 } 1738 1739 (*pr)(" %s: size %u, align %u, ioff %u, roflags 0x%08x\n", 1740 pp->pr_wchan, pp->pr_size, pp->pr_align, pp->pr_itemoffset, 1741 pp->pr_roflags); 1742 (*pr)("\talloc %p\n", pp->pr_alloc); 1743 (*pr)("\tminitems %u, minpages %u, maxpages %u, npages %u\n", 1744 pp->pr_minitems, pp->pr_minpages, pp->pr_maxpages, pp->pr_npages); 1745 (*pr)("\titemsperpage %u, nitems %u, nout %u, hardlimit %u\n", 1746 pp->pr_itemsperpage, pp->pr_nitems, pp->pr_nout, pp->pr_hardlimit); 1747 1748 (*pr)("\tnget %lu, nfail %lu, nput %lu\n", 1749 pp->pr_nget, pp->pr_nfail, pp->pr_nput); 1750 (*pr)("\tnpagealloc %lu, npagefree %lu, hiwat %u, nidle %lu\n", 1751 pp->pr_npagealloc, pp->pr_npagefree, pp->pr_hiwat, pp->pr_nidle); 1752 1753 if (print_pagelist == 0) 1754 goto skip_pagelist; 1755 1756 if ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL) 1757 (*pr)("\n\tempty page list:\n"); 1758 pool_print_pagelist(pp, &pp->pr_emptypages, pr); 1759 if ((ph = LIST_FIRST(&pp->pr_fullpages)) != NULL) 1760 (*pr)("\n\tfull page list:\n"); 1761 pool_print_pagelist(pp, &pp->pr_fullpages, pr); 1762 if ((ph = LIST_FIRST(&pp->pr_partpages)) != NULL) 1763 (*pr)("\n\tpartial-page list:\n"); 1764 pool_print_pagelist(pp, &pp->pr_partpages, pr); 1765 1766 if 
(pp->pr_curpage == NULL) 1767 (*pr)("\tno current page\n"); 1768 else 1769 (*pr)("\tcurpage %p\n", pp->pr_curpage->ph_page); 1770 1771 skip_pagelist: 1772 if (print_log == 0) 1773 goto skip_log; 1774 1775 (*pr)("\n"); 1776 1777 skip_log: 1778 1779 #define PR_GROUPLIST(pcg) \ 1780 (*pr)("\t\tgroup %p: avail %d\n", pcg, pcg->pcg_avail); \ 1781 for (i = 0; i < pcg->pcg_size; i++) { \ 1782 if (pcg->pcg_objects[i].pcgo_pa != \ 1783 POOL_PADDR_INVALID) { \ 1784 (*pr)("\t\t\t%p, 0x%llx\n", \ 1785 pcg->pcg_objects[i].pcgo_va, \ 1786 (unsigned long long) \ 1787 pcg->pcg_objects[i].pcgo_pa); \ 1788 } else { \ 1789 (*pr)("\t\t\t%p\n", \ 1790 pcg->pcg_objects[i].pcgo_va); \ 1791 } \ 1792 } 1793 1794 if (pc != NULL) { 1795 cpuhit = 0; 1796 cpumiss = 0; 1797 for (i = 0; i < __arraycount(pc->pc_cpus); i++) { 1798 if ((cc = pc->pc_cpus[i]) == NULL) 1799 continue; 1800 cpuhit += cc->cc_hits; 1801 cpumiss += cc->cc_misses; 1802 } 1803 (*pr)("\tcpu layer hits %llu misses %llu\n", cpuhit, cpumiss); 1804 (*pr)("\tcache layer hits %llu misses %llu\n", 1805 pc->pc_hits, pc->pc_misses); 1806 (*pr)("\tcache layer entry uncontended %llu contended %llu\n", 1807 pc->pc_hits + pc->pc_misses - pc->pc_contended, 1808 pc->pc_contended); 1809 (*pr)("\tcache layer empty groups %u full groups %u\n", 1810 pc->pc_nempty, pc->pc_nfull); 1811 if (print_cache) { 1812 (*pr)("\tfull cache groups:\n"); 1813 for (pcg = pc->pc_fullgroups; pcg != NULL; 1814 pcg = pcg->pcg_next) { 1815 PR_GROUPLIST(pcg); 1816 } 1817 (*pr)("\tempty cache groups:\n"); 1818 for (pcg = pc->pc_emptygroups; pcg != NULL; 1819 pcg = pcg->pcg_next) { 1820 PR_GROUPLIST(pcg); 1821 } 1822 } 1823 } 1824 #undef PR_GROUPLIST 1825 } 1826 1827 static int 1828 pool_chk_page(struct pool *pp, const char *label, struct pool_item_header *ph) 1829 { 1830 struct pool_item *pi; 1831 void *page; 1832 int n; 1833 1834 if ((pp->pr_roflags & PR_NOALIGN) == 0) { 1835 page = (void *)((uintptr_t)ph & pp->pr_alloc->pa_pagemask); 1836 if (page != ph->ph_page && 1837 (pp->pr_roflags & PR_PHINPAGE) != 0) { 1838 if (label != NULL) 1839 printf("%s: ", label); 1840 printf("pool(%p:%s): page inconsistency: page %p;" 1841 " at page head addr %p (p %p)\n", pp, 1842 pp->pr_wchan, ph->ph_page, 1843 ph, page); 1844 return 1; 1845 } 1846 } 1847 1848 if ((pp->pr_roflags & PR_USEBMAP) != 0) 1849 return 0; 1850 1851 for (pi = LIST_FIRST(&ph->ph_itemlist), n = 0; 1852 pi != NULL; 1853 pi = LIST_NEXT(pi,pi_list), n++) { 1854 1855 #ifdef POOL_CHECK_MAGIC 1856 if (pi->pi_magic != PI_MAGIC) { 1857 if (label != NULL) 1858 printf("%s: ", label); 1859 printf("pool(%s): free list modified: magic=%x;" 1860 " page %p; item ordinal %d; addr %p\n", 1861 pp->pr_wchan, pi->pi_magic, ph->ph_page, 1862 n, pi); 1863 panic("pool"); 1864 } 1865 #endif 1866 if ((pp->pr_roflags & PR_NOALIGN) != 0) { 1867 continue; 1868 } 1869 page = (void *)((uintptr_t)pi & pp->pr_alloc->pa_pagemask); 1870 if (page == ph->ph_page) 1871 continue; 1872 1873 if (label != NULL) 1874 printf("%s: ", label); 1875 printf("pool(%p:%s): page inconsistency: page %p;" 1876 " item ordinal %d; addr %p (p %p)\n", pp, 1877 pp->pr_wchan, ph->ph_page, 1878 n, pi, page); 1879 return 1; 1880 } 1881 return 0; 1882 } 1883 1884 1885 int 1886 pool_chk(struct pool *pp, const char *label) 1887 { 1888 struct pool_item_header *ph; 1889 int r = 0; 1890 1891 mutex_enter(&pp->pr_lock); 1892 LIST_FOREACH(ph, &pp->pr_emptypages, ph_pagelist) { 1893 r = pool_chk_page(pp, label, ph); 1894 if (r) { 1895 goto out; 1896 } 1897 } 1898 LIST_FOREACH(ph, &pp->pr_fullpages, 
ph_pagelist) { 1899 r = pool_chk_page(pp, label, ph); 1900 if (r) { 1901 goto out; 1902 } 1903 } 1904 LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist) { 1905 r = pool_chk_page(pp, label, ph); 1906 if (r) { 1907 goto out; 1908 } 1909 } 1910 1911 out: 1912 mutex_exit(&pp->pr_lock); 1913 return r; 1914 } 1915 1916 /* 1917 * pool_cache_init: 1918 * 1919 * Initialize a pool cache. 1920 */ 1921 pool_cache_t 1922 pool_cache_init(size_t size, u_int align, u_int align_offset, u_int flags, 1923 const char *wchan, struct pool_allocator *palloc, int ipl, 1924 int (*ctor)(void *, void *, int), void (*dtor)(void *, void *), void *arg) 1925 { 1926 pool_cache_t pc; 1927 1928 pc = pool_get(&cache_pool, PR_WAITOK); 1929 if (pc == NULL) 1930 return NULL; 1931 1932 pool_cache_bootstrap(pc, size, align, align_offset, flags, wchan, 1933 palloc, ipl, ctor, dtor, arg); 1934 1935 return pc; 1936 } 1937 1938 /* 1939 * pool_cache_bootstrap: 1940 * 1941 * Kernel-private version of pool_cache_init(). The caller 1942 * provides initial storage. 1943 */ 1944 void 1945 pool_cache_bootstrap(pool_cache_t pc, size_t size, u_int align, 1946 u_int align_offset, u_int flags, const char *wchan, 1947 struct pool_allocator *palloc, int ipl, 1948 int (*ctor)(void *, void *, int), void (*dtor)(void *, void *), 1949 void *arg) 1950 { 1951 CPU_INFO_ITERATOR cii; 1952 pool_cache_t pc1; 1953 struct cpu_info *ci; 1954 struct pool *pp; 1955 1956 pp = &pc->pc_pool; 1957 if (palloc == NULL && ipl == IPL_NONE) { 1958 if (size > PAGE_SIZE) { 1959 int bigidx = pool_bigidx(size); 1960 1961 palloc = &pool_allocator_big[bigidx]; 1962 flags |= PR_NOALIGN; 1963 } else 1964 palloc = &pool_allocator_nointr; 1965 } 1966 pool_init(pp, size, align, align_offset, flags, wchan, palloc, ipl); 1967 mutex_init(&pc->pc_lock, MUTEX_DEFAULT, ipl); 1968 1969 if (ctor == NULL) { 1970 ctor = (int (*)(void *, void *, int))nullop; 1971 } 1972 if (dtor == NULL) { 1973 dtor = (void (*)(void *, void *))nullop; 1974 } 1975 1976 pc->pc_emptygroups = NULL; 1977 pc->pc_fullgroups = NULL; 1978 pc->pc_partgroups = NULL; 1979 pc->pc_ctor = ctor; 1980 pc->pc_dtor = dtor; 1981 pc->pc_arg = arg; 1982 pc->pc_hits = 0; 1983 pc->pc_misses = 0; 1984 pc->pc_nempty = 0; 1985 pc->pc_npart = 0; 1986 pc->pc_nfull = 0; 1987 pc->pc_contended = 0; 1988 pc->pc_refcnt = 0; 1989 pc->pc_freecheck = NULL; 1990 1991 if ((flags & PR_LARGECACHE) != 0) { 1992 pc->pc_pcgsize = PCG_NOBJECTS_LARGE; 1993 pc->pc_pcgpool = &pcg_large_pool; 1994 } else { 1995 pc->pc_pcgsize = PCG_NOBJECTS_NORMAL; 1996 pc->pc_pcgpool = &pcg_normal_pool; 1997 } 1998 1999 /* Allocate per-CPU caches. */ 2000 memset(pc->pc_cpus, 0, sizeof(pc->pc_cpus)); 2001 pc->pc_ncpu = 0; 2002 if (ncpu < 2) { 2003 /* XXX For sparc: boot CPU is not attached yet. */ 2004 pool_cache_cpu_init1(curcpu(), pc); 2005 } else { 2006 for (CPU_INFO_FOREACH(cii, ci)) { 2007 pool_cache_cpu_init1(ci, pc); 2008 } 2009 } 2010 2011 /* Add to list of all pools. */ 2012 if (__predict_true(!cold)) 2013 mutex_enter(&pool_head_lock); 2014 TAILQ_FOREACH(pc1, &pool_cache_head, pc_cachelist) { 2015 if (strcmp(pc1->pc_pool.pr_wchan, pc->pc_pool.pr_wchan) > 0) 2016 break; 2017 } 2018 if (pc1 == NULL) 2019 TAILQ_INSERT_TAIL(&pool_cache_head, pc, pc_cachelist); 2020 else 2021 TAILQ_INSERT_BEFORE(pc1, pc, pc_cachelist); 2022 if (__predict_true(!cold)) 2023 mutex_exit(&pool_head_lock); 2024 2025 membar_sync(); 2026 pp->pr_cache = pc; 2027 } 2028 2029 /* 2030 * pool_cache_destroy: 2031 * 2032 * Destroy a pool cache. 
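 *
 * For reference, the matching lifecycle seen by a typical user looks
 * roughly like the sketch below (hypothetical names, not part of this
 * file; pool_cache_get/pool_cache_put are the usual accessors):
 *
 *	pool_cache_t foo_cache;
 *
 *	foo_cache = pool_cache_init(sizeof(struct foo), coherency_unit,
 *	    0, 0, "foocache", NULL, IPL_NONE, foo_ctor, foo_dtor, NULL);
 *	obj = pool_cache_get(foo_cache, PR_WAITOK);
 *	...
 *	pool_cache_put(foo_cache, obj);
 *	pool_cache_destroy(foo_cache);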
2033 */ 2034 void 2035 pool_cache_destroy(pool_cache_t pc) 2036 { 2037 2038 pool_cache_bootstrap_destroy(pc); 2039 pool_put(&cache_pool, pc); 2040 } 2041 2042 /* 2043 * pool_cache_bootstrap_destroy: 2044 * 2045 * Destroy a pool cache. 2046 */ 2047 void 2048 pool_cache_bootstrap_destroy(pool_cache_t pc) 2049 { 2050 struct pool *pp = &pc->pc_pool; 2051 u_int i; 2052 2053 /* Remove it from the global list. */ 2054 mutex_enter(&pool_head_lock); 2055 while (pc->pc_refcnt != 0) 2056 cv_wait(&pool_busy, &pool_head_lock); 2057 TAILQ_REMOVE(&pool_cache_head, pc, pc_cachelist); 2058 mutex_exit(&pool_head_lock); 2059 2060 /* First, invalidate the entire cache. */ 2061 pool_cache_invalidate(pc); 2062 2063 /* Disassociate it from the pool. */ 2064 mutex_enter(&pp->pr_lock); 2065 pp->pr_cache = NULL; 2066 mutex_exit(&pp->pr_lock); 2067 2068 /* Destroy per-CPU data */ 2069 for (i = 0; i < __arraycount(pc->pc_cpus); i++) 2070 pool_cache_invalidate_cpu(pc, i); 2071 2072 /* Finally, destroy it. */ 2073 mutex_destroy(&pc->pc_lock); 2074 pool_destroy(pp); 2075 } 2076 2077 /* 2078 * pool_cache_cpu_init1: 2079 * 2080 * Called for each pool_cache whenever a new CPU is attached. 2081 */ 2082 static void 2083 pool_cache_cpu_init1(struct cpu_info *ci, pool_cache_t pc) 2084 { 2085 pool_cache_cpu_t *cc; 2086 int index; 2087 2088 index = ci->ci_index; 2089 2090 KASSERT(index < __arraycount(pc->pc_cpus)); 2091 2092 if ((cc = pc->pc_cpus[index]) != NULL) { 2093 KASSERT(cc->cc_cpuindex == index); 2094 return; 2095 } 2096 2097 /* 2098 * The first CPU is 'free'. This needs to be the case for 2099 * bootstrap - we may not be able to allocate yet. 2100 */ 2101 if (pc->pc_ncpu == 0) { 2102 cc = &pc->pc_cpu0; 2103 pc->pc_ncpu = 1; 2104 } else { 2105 mutex_enter(&pc->pc_lock); 2106 pc->pc_ncpu++; 2107 mutex_exit(&pc->pc_lock); 2108 cc = pool_get(&cache_cpu_pool, PR_WAITOK); 2109 } 2110 2111 cc->cc_ipl = pc->pc_pool.pr_ipl; 2112 cc->cc_iplcookie = makeiplcookie(cc->cc_ipl); 2113 cc->cc_cache = pc; 2114 cc->cc_cpuindex = index; 2115 cc->cc_hits = 0; 2116 cc->cc_misses = 0; 2117 cc->cc_current = __UNCONST(&pcg_dummy); 2118 cc->cc_previous = __UNCONST(&pcg_dummy); 2119 2120 pc->pc_cpus[index] = cc; 2121 } 2122 2123 /* 2124 * pool_cache_cpu_init: 2125 * 2126 * Called whenever a new CPU is attached. 2127 */ 2128 void 2129 pool_cache_cpu_init(struct cpu_info *ci) 2130 { 2131 pool_cache_t pc; 2132 2133 mutex_enter(&pool_head_lock); 2134 TAILQ_FOREACH(pc, &pool_cache_head, pc_cachelist) { 2135 pc->pc_refcnt++; 2136 mutex_exit(&pool_head_lock); 2137 2138 pool_cache_cpu_init1(ci, pc); 2139 2140 mutex_enter(&pool_head_lock); 2141 pc->pc_refcnt--; 2142 cv_broadcast(&pool_busy); 2143 } 2144 mutex_exit(&pool_head_lock); 2145 } 2146 2147 /* 2148 * pool_cache_reclaim: 2149 * 2150 * Reclaim memory from a pool cache. 2151 */ 2152 bool 2153 pool_cache_reclaim(pool_cache_t pc) 2154 { 2155 2156 return pool_reclaim(&pc->pc_pool); 2157 } 2158 2159 static void 2160 pool_cache_destruct_object1(pool_cache_t pc, void *object) 2161 { 2162 (*pc->pc_dtor)(pc->pc_arg, object); 2163 pool_put(&pc->pc_pool, object); 2164 } 2165 2166 /* 2167 * pool_cache_destruct_object: 2168 * 2169 * Force destruction of an object and its release back into 2170 * the pool. 
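 *
 *	The object is passed to the cache's destructor and then returned
 *	directly to the underlying pool with pool_put(), bypassing the
 *	per-CPU and global cache groups.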
2171 */ 2172 void 2173 pool_cache_destruct_object(pool_cache_t pc, void *object) 2174 { 2175 2176 FREECHECK_IN(&pc->pc_freecheck, object); 2177 2178 pool_cache_destruct_object1(pc, object); 2179 } 2180 2181 /* 2182 * pool_cache_invalidate_groups: 2183 * 2184 * Invalidate a chain of groups and destruct all objects. 2185 */ 2186 static void 2187 pool_cache_invalidate_groups(pool_cache_t pc, pcg_t *pcg) 2188 { 2189 void *object; 2190 pcg_t *next; 2191 int i; 2192 2193 for (; pcg != NULL; pcg = next) { 2194 next = pcg->pcg_next; 2195 2196 for (i = 0; i < pcg->pcg_avail; i++) { 2197 object = pcg->pcg_objects[i].pcgo_va; 2198 pool_cache_destruct_object1(pc, object); 2199 } 2200 2201 if (pcg->pcg_size == PCG_NOBJECTS_LARGE) { 2202 pool_put(&pcg_large_pool, pcg); 2203 } else { 2204 KASSERT(pcg->pcg_size == PCG_NOBJECTS_NORMAL); 2205 pool_put(&pcg_normal_pool, pcg); 2206 } 2207 } 2208 } 2209 2210 /* 2211 * pool_cache_invalidate: 2212 * 2213 * Invalidate a pool cache (destruct and release all of the 2214 * cached objects). Does not reclaim objects from the pool. 2215 * 2216 * Note: For pool caches that provide constructed objects, there 2217 * is an assumption that another level of synchronization is occurring 2218 * between the input to the constructor and the cache invalidation. 2219 * 2220 * Invalidation is a costly process and should not be called from 2221 * interrupt context. 2222 */ 2223 void 2224 pool_cache_invalidate(pool_cache_t pc) 2225 { 2226 uint64_t where; 2227 pcg_t *full, *empty, *part; 2228 2229 KASSERT(!cpu_intr_p() && !cpu_softintr_p()); 2230 2231 if (ncpu < 2 || !mp_online) { 2232 /* 2233 * We might be called early enough in the boot process 2234 * for the CPU data structures to not be fully initialized. 2235 * In this case, transfer the content of the local CPU's 2236 * cache back into global cache as only this CPU is currently 2237 * running. 2238 */ 2239 pool_cache_transfer(pc); 2240 } else { 2241 /* 2242 * Signal all CPUs that they must transfer their local 2243 * cache back to the global pool then wait for the xcall to 2244 * complete. 2245 */ 2246 where = xc_broadcast(0, (xcfunc_t)pool_cache_transfer, 2247 pc, NULL); 2248 xc_wait(where); 2249 } 2250 2251 /* Empty pool caches, then invalidate objects */ 2252 mutex_enter(&pc->pc_lock); 2253 full = pc->pc_fullgroups; 2254 empty = pc->pc_emptygroups; 2255 part = pc->pc_partgroups; 2256 pc->pc_fullgroups = NULL; 2257 pc->pc_emptygroups = NULL; 2258 pc->pc_partgroups = NULL; 2259 pc->pc_nfull = 0; 2260 pc->pc_nempty = 0; 2261 pc->pc_npart = 0; 2262 mutex_exit(&pc->pc_lock); 2263 2264 pool_cache_invalidate_groups(pc, full); 2265 pool_cache_invalidate_groups(pc, empty); 2266 pool_cache_invalidate_groups(pc, part); 2267 } 2268 2269 /* 2270 * pool_cache_invalidate_cpu: 2271 * 2272 * Invalidate all CPU-bound cached objects in pool cache, the CPU being 2273 * identified by its associated index. 2274 * It is caller's responsibility to ensure that no operation is 2275 * taking place on this pool cache while doing this invalidation. 2276 * WARNING: as no inter-CPU locking is enforced, trying to invalidate 2277 * pool cached objects from a CPU different from the one currently running 2278 * may result in an undefined behaviour. 
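 *
 *	pool_cache_bootstrap_destroy() satisfies this by walking every
 *	slot of pc_cpus[] only after the cache has been removed from the
 *	global list and fully invalidated.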
2279 */ 2280 static void 2281 pool_cache_invalidate_cpu(pool_cache_t pc, u_int index) 2282 { 2283 pool_cache_cpu_t *cc; 2284 pcg_t *pcg; 2285 2286 if ((cc = pc->pc_cpus[index]) == NULL) 2287 return; 2288 2289 if ((pcg = cc->cc_current) != &pcg_dummy) { 2290 pcg->pcg_next = NULL; 2291 pool_cache_invalidate_groups(pc, pcg); 2292 } 2293 if ((pcg = cc->cc_previous) != &pcg_dummy) { 2294 pcg->pcg_next = NULL; 2295 pool_cache_invalidate_groups(pc, pcg); 2296 } 2297 if (cc != &pc->pc_cpu0) 2298 pool_put(&cache_cpu_pool, cc); 2299 2300 } 2301 2302 void 2303 pool_cache_set_drain_hook(pool_cache_t pc, void (*fn)(void *, int), void *arg) 2304 { 2305 2306 pool_set_drain_hook(&pc->pc_pool, fn, arg); 2307 } 2308 2309 void 2310 pool_cache_setlowat(pool_cache_t pc, int n) 2311 { 2312 2313 pool_setlowat(&pc->pc_pool, n); 2314 } 2315 2316 void 2317 pool_cache_sethiwat(pool_cache_t pc, int n) 2318 { 2319 2320 pool_sethiwat(&pc->pc_pool, n); 2321 } 2322 2323 void 2324 pool_cache_sethardlimit(pool_cache_t pc, int n, const char *warnmess, int ratecap) 2325 { 2326 2327 pool_sethardlimit(&pc->pc_pool, n, warnmess, ratecap); 2328 } 2329 2330 static bool __noinline 2331 pool_cache_get_slow(pool_cache_cpu_t *cc, int s, void **objectp, 2332 paddr_t *pap, int flags) 2333 { 2334 pcg_t *pcg, *cur; 2335 uint64_t ncsw; 2336 pool_cache_t pc; 2337 void *object; 2338 2339 KASSERT(cc->cc_current->pcg_avail == 0); 2340 KASSERT(cc->cc_previous->pcg_avail == 0); 2341 2342 pc = cc->cc_cache; 2343 cc->cc_misses++; 2344 2345 /* 2346 * Nothing was available locally. Try and grab a group 2347 * from the cache. 2348 */ 2349 if (__predict_false(!mutex_tryenter(&pc->pc_lock))) { 2350 ncsw = curlwp->l_ncsw; 2351 mutex_enter(&pc->pc_lock); 2352 pc->pc_contended++; 2353 2354 /* 2355 * If we context switched while locking, then 2356 * our view of the per-CPU data is invalid: 2357 * retry. 2358 */ 2359 if (curlwp->l_ncsw != ncsw) { 2360 mutex_exit(&pc->pc_lock); 2361 return true; 2362 } 2363 } 2364 2365 if (__predict_true((pcg = pc->pc_fullgroups) != NULL)) { 2366 /* 2367 * If there's a full group, release our empty 2368 * group back to the cache. Install the full 2369 * group as cc_current and return. 2370 */ 2371 if (__predict_true((cur = cc->cc_current) != &pcg_dummy)) { 2372 KASSERT(cur->pcg_avail == 0); 2373 cur->pcg_next = pc->pc_emptygroups; 2374 pc->pc_emptygroups = cur; 2375 pc->pc_nempty++; 2376 } 2377 KASSERT(pcg->pcg_avail == pcg->pcg_size); 2378 cc->cc_current = pcg; 2379 pc->pc_fullgroups = pcg->pcg_next; 2380 pc->pc_hits++; 2381 pc->pc_nfull--; 2382 mutex_exit(&pc->pc_lock); 2383 return true; 2384 } 2385 2386 /* 2387 * Nothing available locally or in cache. Take the slow 2388 * path: fetch a new object from the pool and construct 2389 * it. 
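	 * Whether or not that succeeds, false is returned below, which
	 * tells pool_cache_get_paddr() to stop retrying and hand back
	 * *objectp (left NULL if the allocation or constructor failed).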
2390 */ 2391 pc->pc_misses++; 2392 mutex_exit(&pc->pc_lock); 2393 splx(s); 2394 2395 object = pool_get(&pc->pc_pool, flags); 2396 *objectp = object; 2397 if (__predict_false(object == NULL)) { 2398 KASSERT((flags & (PR_WAITOK|PR_NOWAIT)) == PR_NOWAIT); 2399 return false; 2400 } 2401 2402 if (__predict_false((*pc->pc_ctor)(pc->pc_arg, object, flags) != 0)) { 2403 pool_put(&pc->pc_pool, object); 2404 *objectp = NULL; 2405 return false; 2406 } 2407 2408 KASSERT((((vaddr_t)object) & (pc->pc_pool.pr_align - 1)) == 0); 2409 2410 if (pap != NULL) { 2411 #ifdef POOL_VTOPHYS 2412 *pap = POOL_VTOPHYS(object); 2413 #else 2414 *pap = POOL_PADDR_INVALID; 2415 #endif 2416 } 2417 2418 FREECHECK_OUT(&pc->pc_freecheck, object); 2419 pool_cache_kleak_fill(pc, object); 2420 return false; 2421 } 2422 2423 /* 2424 * pool_cache_get{,_paddr}: 2425 * 2426 * Get an object from a pool cache (optionally returning 2427 * the physical address of the object). 2428 */ 2429 void * 2430 pool_cache_get_paddr(pool_cache_t pc, int flags, paddr_t *pap) 2431 { 2432 pool_cache_cpu_t *cc; 2433 pcg_t *pcg; 2434 void *object; 2435 int s; 2436 2437 KASSERT(!(flags & PR_NOWAIT) != !(flags & PR_WAITOK)); 2438 KASSERTMSG((!cpu_intr_p() && !cpu_softintr_p()) || 2439 (pc->pc_pool.pr_ipl != IPL_NONE || cold || panicstr != NULL), 2440 "%s: [%s] is IPL_NONE, but called from interrupt context", 2441 __func__, pc->pc_pool.pr_wchan); 2442 2443 if (flags & PR_WAITOK) { 2444 ASSERT_SLEEPABLE(); 2445 } 2446 2447 /* Lock out interrupts and disable preemption. */ 2448 s = splvm(); 2449 while (/* CONSTCOND */ true) { 2450 /* Try and allocate an object from the current group. */ 2451 cc = pc->pc_cpus[curcpu()->ci_index]; 2452 KASSERT(cc->cc_cache == pc); 2453 pcg = cc->cc_current; 2454 if (__predict_true(pcg->pcg_avail > 0)) { 2455 object = pcg->pcg_objects[--pcg->pcg_avail].pcgo_va; 2456 if (__predict_false(pap != NULL)) 2457 *pap = pcg->pcg_objects[pcg->pcg_avail].pcgo_pa; 2458 #if defined(DIAGNOSTIC) 2459 pcg->pcg_objects[pcg->pcg_avail].pcgo_va = NULL; 2460 KASSERT(pcg->pcg_avail < pcg->pcg_size); 2461 KASSERT(object != NULL); 2462 #endif 2463 cc->cc_hits++; 2464 splx(s); 2465 FREECHECK_OUT(&pc->pc_freecheck, object); 2466 pool_redzone_fill(&pc->pc_pool, object); 2467 pool_cache_kleak_fill(pc, object); 2468 return object; 2469 } 2470 2471 /* 2472 * That failed. If the previous group isn't empty, swap 2473 * it with the current group and allocate from there. 2474 */ 2475 pcg = cc->cc_previous; 2476 if (__predict_true(pcg->pcg_avail > 0)) { 2477 cc->cc_previous = cc->cc_current; 2478 cc->cc_current = pcg; 2479 continue; 2480 } 2481 2482 /* 2483 * Can't allocate from either group: try the slow path. 2484 * If get_slow() allocated an object for us, or if 2485 * no more objects are available, it will return false. 2486 * Otherwise, we need to retry. 2487 */ 2488 if (!pool_cache_get_slow(cc, s, &object, pap, flags)) 2489 break; 2490 } 2491 2492 /* 2493 * We would like to KASSERT(object || (flags & PR_NOWAIT)), but 2494 * pool_cache_get can fail even in the PR_WAITOK case, if the 2495 * constructor fails. 
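	 * Callers must therefore be prepared for a NULL return even when
	 * they pass PR_WAITOK.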
2496 */ 2497 return object; 2498 } 2499 2500 static bool __noinline 2501 pool_cache_put_slow(pool_cache_cpu_t *cc, int s, void *object) 2502 { 2503 struct lwp *l = curlwp; 2504 pcg_t *pcg, *cur; 2505 uint64_t ncsw; 2506 pool_cache_t pc; 2507 2508 KASSERT(cc->cc_current->pcg_avail == cc->cc_current->pcg_size); 2509 KASSERT(cc->cc_previous->pcg_avail == cc->cc_previous->pcg_size); 2510 2511 pc = cc->cc_cache; 2512 pcg = NULL; 2513 cc->cc_misses++; 2514 ncsw = l->l_ncsw; 2515 2516 /* 2517 * If there are no empty groups in the cache then allocate one 2518 * while still unlocked. 2519 */ 2520 if (__predict_false(pc->pc_emptygroups == NULL)) { 2521 if (__predict_true(!pool_cache_disable)) { 2522 pcg = pool_get(pc->pc_pcgpool, PR_NOWAIT); 2523 } 2524 /* 2525 * If pool_get() blocked, then our view of 2526 * the per-CPU data is invalid: retry. 2527 */ 2528 if (__predict_false(l->l_ncsw != ncsw)) { 2529 if (pcg != NULL) { 2530 pool_put(pc->pc_pcgpool, pcg); 2531 } 2532 return true; 2533 } 2534 if (__predict_true(pcg != NULL)) { 2535 pcg->pcg_avail = 0; 2536 pcg->pcg_size = pc->pc_pcgsize; 2537 } 2538 } 2539 2540 /* Lock the cache. */ 2541 if (__predict_false(!mutex_tryenter(&pc->pc_lock))) { 2542 mutex_enter(&pc->pc_lock); 2543 pc->pc_contended++; 2544 2545 /* 2546 * If we context switched while locking, then our view of 2547 * the per-CPU data is invalid: retry. 2548 */ 2549 if (__predict_false(l->l_ncsw != ncsw)) { 2550 mutex_exit(&pc->pc_lock); 2551 if (pcg != NULL) { 2552 pool_put(pc->pc_pcgpool, pcg); 2553 } 2554 return true; 2555 } 2556 } 2557 2558 /* If there are no empty groups in the cache then allocate one. */ 2559 if (pcg == NULL && pc->pc_emptygroups != NULL) { 2560 pcg = pc->pc_emptygroups; 2561 pc->pc_emptygroups = pcg->pcg_next; 2562 pc->pc_nempty--; 2563 } 2564 2565 /* 2566 * If there's a empty group, release our full group back 2567 * to the cache. Install the empty group to the local CPU 2568 * and return. 2569 */ 2570 if (pcg != NULL) { 2571 KASSERT(pcg->pcg_avail == 0); 2572 if (__predict_false(cc->cc_previous == &pcg_dummy)) { 2573 cc->cc_previous = pcg; 2574 } else { 2575 cur = cc->cc_current; 2576 if (__predict_true(cur != &pcg_dummy)) { 2577 KASSERT(cur->pcg_avail == cur->pcg_size); 2578 cur->pcg_next = pc->pc_fullgroups; 2579 pc->pc_fullgroups = cur; 2580 pc->pc_nfull++; 2581 } 2582 cc->cc_current = pcg; 2583 } 2584 pc->pc_hits++; 2585 mutex_exit(&pc->pc_lock); 2586 return true; 2587 } 2588 2589 /* 2590 * Nothing available locally or in cache, and we didn't 2591 * allocate an empty group. Take the slow path and destroy 2592 * the object here and now. 2593 */ 2594 pc->pc_misses++; 2595 mutex_exit(&pc->pc_lock); 2596 splx(s); 2597 pool_cache_destruct_object(pc, object); 2598 2599 return false; 2600 } 2601 2602 /* 2603 * pool_cache_put{,_paddr}: 2604 * 2605 * Put an object back to the pool cache (optionally caching the 2606 * physical address of the object). 2607 */ 2608 void 2609 pool_cache_put_paddr(pool_cache_t pc, void *object, paddr_t pa) 2610 { 2611 pool_cache_cpu_t *cc; 2612 pcg_t *pcg; 2613 int s; 2614 2615 KASSERT(object != NULL); 2616 pool_cache_redzone_check(pc, object); 2617 FREECHECK_IN(&pc->pc_freecheck, object); 2618 2619 if (pool_cache_put_quarantine(pc, object, pa)) { 2620 return; 2621 } 2622 2623 /* Lock out interrupts and disable preemption. */ 2624 s = splvm(); 2625 while (/* CONSTCOND */ true) { 2626 /* If the current group isn't full, release it there. 
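		 * This is the common case: the object and its physical
		 * address are recorded in the current per-CPU group
		 * without taking the cache lock.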
*/ 2627 cc = pc->pc_cpus[curcpu()->ci_index]; 2628 KASSERT(cc->cc_cache == pc); 2629 pcg = cc->cc_current; 2630 if (__predict_true(pcg->pcg_avail < pcg->pcg_size)) { 2631 pcg->pcg_objects[pcg->pcg_avail].pcgo_va = object; 2632 pcg->pcg_objects[pcg->pcg_avail].pcgo_pa = pa; 2633 pcg->pcg_avail++; 2634 cc->cc_hits++; 2635 splx(s); 2636 return; 2637 } 2638 2639 /* 2640 * That failed. If the previous group isn't full, swap 2641 * it with the current group and try again. 2642 */ 2643 pcg = cc->cc_previous; 2644 if (__predict_true(pcg->pcg_avail < pcg->pcg_size)) { 2645 cc->cc_previous = cc->cc_current; 2646 cc->cc_current = pcg; 2647 continue; 2648 } 2649 2650 /* 2651 * Can't free to either group: try the slow path. 2652 * If put_slow() releases the object for us, it 2653 * will return false. Otherwise we need to retry. 2654 */ 2655 if (!pool_cache_put_slow(cc, s, object)) 2656 break; 2657 } 2658 } 2659 2660 /* 2661 * pool_cache_transfer: 2662 * 2663 * Transfer objects from the per-CPU cache to the global cache. 2664 * Run within a cross-call thread. 2665 */ 2666 static void 2667 pool_cache_transfer(pool_cache_t pc) 2668 { 2669 pool_cache_cpu_t *cc; 2670 pcg_t *prev, *cur, **list; 2671 int s; 2672 2673 s = splvm(); 2674 mutex_enter(&pc->pc_lock); 2675 cc = pc->pc_cpus[curcpu()->ci_index]; 2676 cur = cc->cc_current; 2677 cc->cc_current = __UNCONST(&pcg_dummy); 2678 prev = cc->cc_previous; 2679 cc->cc_previous = __UNCONST(&pcg_dummy); 2680 if (cur != &pcg_dummy) { 2681 if (cur->pcg_avail == cur->pcg_size) { 2682 list = &pc->pc_fullgroups; 2683 pc->pc_nfull++; 2684 } else if (cur->pcg_avail == 0) { 2685 list = &pc->pc_emptygroups; 2686 pc->pc_nempty++; 2687 } else { 2688 list = &pc->pc_partgroups; 2689 pc->pc_npart++; 2690 } 2691 cur->pcg_next = *list; 2692 *list = cur; 2693 } 2694 if (prev != &pcg_dummy) { 2695 if (prev->pcg_avail == prev->pcg_size) { 2696 list = &pc->pc_fullgroups; 2697 pc->pc_nfull++; 2698 } else if (prev->pcg_avail == 0) { 2699 list = &pc->pc_emptygroups; 2700 pc->pc_nempty++; 2701 } else { 2702 list = &pc->pc_partgroups; 2703 pc->pc_npart++; 2704 } 2705 prev->pcg_next = *list; 2706 *list = prev; 2707 } 2708 mutex_exit(&pc->pc_lock); 2709 splx(s); 2710 } 2711 2712 /* 2713 * Pool backend allocators. 2714 * 2715 * Each pool has a backend allocator that handles allocation, deallocation, 2716 * and any additional draining that might be needed. 2717 * 2718 * We provide two standard allocators: 2719 * 2720 * pool_allocator_kmem - the default when no allocator is specified 2721 * 2722 * pool_allocator_nointr - used for pools that will not be accessed 2723 * in interrupt context. 
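 *
 * For illustration only (the pool, structure and wait-channel names here
 * are placeholders), a pool used purely from thread context could name
 * the allocator explicitly:
 *
 *	pool_init(&foo_pool, sizeof(struct foo), 0, 0, 0, "foopl",
 *	    &pool_allocator_nointr, IPL_NONE);
 *
 * Passing NULL instead selects one of the standard allocators above.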
2724 */ 2725 void *pool_page_alloc(struct pool *, int); 2726 void pool_page_free(struct pool *, void *); 2727 2728 struct pool_allocator pool_allocator_kmem = { 2729 .pa_alloc = pool_page_alloc, 2730 .pa_free = pool_page_free, 2731 .pa_pagesz = 0 2732 }; 2733 2734 struct pool_allocator pool_allocator_nointr = { 2735 .pa_alloc = pool_page_alloc, 2736 .pa_free = pool_page_free, 2737 .pa_pagesz = 0 2738 }; 2739 2740 struct pool_allocator pool_allocator_big[] = { 2741 { 2742 .pa_alloc = pool_page_alloc, 2743 .pa_free = pool_page_free, 2744 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 0), 2745 }, 2746 { 2747 .pa_alloc = pool_page_alloc, 2748 .pa_free = pool_page_free, 2749 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 1), 2750 }, 2751 { 2752 .pa_alloc = pool_page_alloc, 2753 .pa_free = pool_page_free, 2754 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 2), 2755 }, 2756 { 2757 .pa_alloc = pool_page_alloc, 2758 .pa_free = pool_page_free, 2759 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 3), 2760 }, 2761 { 2762 .pa_alloc = pool_page_alloc, 2763 .pa_free = pool_page_free, 2764 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 4), 2765 }, 2766 { 2767 .pa_alloc = pool_page_alloc, 2768 .pa_free = pool_page_free, 2769 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 5), 2770 }, 2771 { 2772 .pa_alloc = pool_page_alloc, 2773 .pa_free = pool_page_free, 2774 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 6), 2775 }, 2776 { 2777 .pa_alloc = pool_page_alloc, 2778 .pa_free = pool_page_free, 2779 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 7), 2780 } 2781 }; 2782 2783 static int 2784 pool_bigidx(size_t size) 2785 { 2786 int i; 2787 2788 for (i = 0; i < __arraycount(pool_allocator_big); i++) { 2789 if (1 << (i + POOL_ALLOCATOR_BIG_BASE) >= size) 2790 return i; 2791 } 2792 panic("pool item size %zu too large, use a custom allocator", size); 2793 } 2794 2795 static void * 2796 pool_allocator_alloc(struct pool *pp, int flags) 2797 { 2798 struct pool_allocator *pa = pp->pr_alloc; 2799 void *res; 2800 2801 res = (*pa->pa_alloc)(pp, flags); 2802 if (res == NULL && (flags & PR_WAITOK) == 0) { 2803 /* 2804 * We only run the drain hook here if PR_NOWAIT. 2805 * In other cases, the hook will be run in 2806 * pool_reclaim(). 2807 */ 2808 if (pp->pr_drain_hook != NULL) { 2809 (*pp->pr_drain_hook)(pp->pr_drain_hook_arg, flags); 2810 res = (*pa->pa_alloc)(pp, flags); 2811 } 2812 } 2813 return res; 2814 } 2815 2816 static void 2817 pool_allocator_free(struct pool *pp, void *v) 2818 { 2819 struct pool_allocator *pa = pp->pr_alloc; 2820 2821 if (pp->pr_redzone) { 2822 kasan_mark(v, pa->pa_pagesz, pa->pa_pagesz, 0); 2823 } 2824 (*pa->pa_free)(pp, v); 2825 } 2826 2827 void * 2828 pool_page_alloc(struct pool *pp, int flags) 2829 { 2830 const vm_flag_t vflags = (flags & PR_WAITOK) ? VM_SLEEP: VM_NOSLEEP; 2831 vmem_addr_t va; 2832 int ret; 2833 2834 ret = uvm_km_kmem_alloc(kmem_va_arena, pp->pr_alloc->pa_pagesz, 2835 vflags | VM_INSTANTFIT, &va); 2836 2837 return ret ? NULL : (void *)va; 2838 } 2839 2840 void 2841 pool_page_free(struct pool *pp, void *v) 2842 { 2843 2844 uvm_km_kmem_free(kmem_va_arena, (vaddr_t)v, pp->pr_alloc->pa_pagesz); 2845 } 2846 2847 static void * 2848 pool_page_alloc_meta(struct pool *pp, int flags) 2849 { 2850 const vm_flag_t vflags = (flags & PR_WAITOK) ? VM_SLEEP: VM_NOSLEEP; 2851 vmem_addr_t va; 2852 int ret; 2853 2854 ret = vmem_alloc(kmem_meta_arena, pp->pr_alloc->pa_pagesz, 2855 vflags | VM_INSTANTFIT, &va); 2856 2857 return ret ? 
NULL : (void *)va; 2858 } 2859 2860 static void 2861 pool_page_free_meta(struct pool *pp, void *v) 2862 { 2863 2864 vmem_free(kmem_meta_arena, (vmem_addr_t)v, pp->pr_alloc->pa_pagesz); 2865 } 2866 2867 #ifdef KLEAK 2868 static void 2869 pool_kleak_fill(struct pool *pp, void *p) 2870 { 2871 if (__predict_false(pp->pr_roflags & PR_NOTOUCH)) { 2872 return; 2873 } 2874 kleak_fill_area(p, pp->pr_size); 2875 } 2876 2877 static void 2878 pool_cache_kleak_fill(pool_cache_t pc, void *p) 2879 { 2880 if (__predict_false(pc_has_ctor(pc) || pc_has_dtor(pc))) { 2881 return; 2882 } 2883 pool_kleak_fill(&pc->pc_pool, p); 2884 } 2885 #endif 2886 2887 #ifdef POOL_QUARANTINE 2888 static void 2889 pool_quarantine_init(struct pool *pp) 2890 { 2891 pp->pr_quar.rotor = 0; 2892 memset(&pp->pr_quar, 0, sizeof(pp->pr_quar)); 2893 } 2894 2895 static void 2896 pool_quarantine_flush(struct pool *pp) 2897 { 2898 pool_quar_t *quar = &pp->pr_quar; 2899 struct pool_pagelist pq; 2900 size_t i; 2901 2902 LIST_INIT(&pq); 2903 2904 mutex_enter(&pp->pr_lock); 2905 for (i = 0; i < POOL_QUARANTINE_DEPTH; i++) { 2906 if (quar->list[i] == 0) 2907 continue; 2908 pool_do_put(pp, (void *)quar->list[i], &pq); 2909 } 2910 mutex_exit(&pp->pr_lock); 2911 2912 pr_pagelist_free(pp, &pq); 2913 } 2914 2915 static bool 2916 pool_put_quarantine(struct pool *pp, void *v, struct pool_pagelist *pq) 2917 { 2918 pool_quar_t *quar = &pp->pr_quar; 2919 uintptr_t old; 2920 2921 if (pp->pr_roflags & PR_NOTOUCH) { 2922 return false; 2923 } 2924 2925 pool_redzone_check(pp, v); 2926 2927 old = quar->list[quar->rotor]; 2928 quar->list[quar->rotor] = (uintptr_t)v; 2929 quar->rotor = (quar->rotor + 1) % POOL_QUARANTINE_DEPTH; 2930 if (old != 0) { 2931 pool_do_put(pp, (void *)old, pq); 2932 } 2933 2934 return true; 2935 } 2936 2937 static bool 2938 pool_cache_put_quarantine(pool_cache_t pc, void *p, paddr_t pa) 2939 { 2940 pool_cache_destruct_object(pc, p); 2941 return true; 2942 } 2943 #endif 2944 2945 #ifdef POOL_REDZONE 2946 #if defined(_LP64) 2947 # define PRIME 0x9e37fffffffc0000UL 2948 #else /* defined(_LP64) */ 2949 # define PRIME 0x9e3779b1 2950 #endif /* defined(_LP64) */ 2951 #define STATIC_BYTE 0xFE 2952 CTASSERT(POOL_REDZONE_SIZE > 1); 2953 2954 #ifndef KASAN 2955 static inline uint8_t 2956 pool_pattern_generate(const void *p) 2957 { 2958 return (uint8_t)(((uintptr_t)p) * PRIME 2959 >> ((sizeof(uintptr_t) - sizeof(uint8_t))) * CHAR_BIT); 2960 } 2961 #endif 2962 2963 static void 2964 pool_redzone_init(struct pool *pp, size_t requested_size) 2965 { 2966 size_t redzsz; 2967 size_t nsz; 2968 2969 #ifdef KASAN 2970 redzsz = requested_size; 2971 kasan_add_redzone(&redzsz); 2972 redzsz -= requested_size; 2973 #else 2974 redzsz = POOL_REDZONE_SIZE; 2975 #endif 2976 2977 if (pp->pr_roflags & PR_NOTOUCH) { 2978 pp->pr_redzone = false; 2979 return; 2980 } 2981 2982 /* 2983 * We may have extended the requested size earlier; check if 2984 * there's naturally space in the padding for a red zone. 2985 */ 2986 if (pp->pr_size - requested_size >= redzsz) { 2987 pp->pr_reqsize_with_redzone = requested_size + redzsz; 2988 pp->pr_redzone = true; 2989 return; 2990 } 2991 2992 /* 2993 * No space in the natural padding; check if we can extend a 2994 * bit the size of the pool. 2995 */ 2996 nsz = roundup(pp->pr_size + redzsz, pp->pr_align); 2997 if (nsz <= pp->pr_alloc->pa_pagesz) { 2998 /* Ok, we can */ 2999 pp->pr_size = nsz; 3000 pp->pr_reqsize_with_redzone = requested_size + redzsz; 3001 pp->pr_redzone = true; 3002 } else { 3003 /* No space for a red zone... 
snif :'( */ 3004 pp->pr_redzone = false; 3005 printf("pool redzone disabled for '%s'\n", pp->pr_wchan); 3006 } 3007 } 3008 3009 static void 3010 pool_redzone_fill(struct pool *pp, void *p) 3011 { 3012 if (!pp->pr_redzone) 3013 return; 3014 #ifdef KASAN 3015 kasan_mark(p, pp->pr_reqsize, pp->pr_reqsize_with_redzone, 3016 KASAN_POOL_REDZONE); 3017 #else 3018 uint8_t *cp, pat; 3019 const uint8_t *ep; 3020 3021 cp = (uint8_t *)p + pp->pr_reqsize; 3022 ep = cp + POOL_REDZONE_SIZE; 3023 3024 /* 3025 * We really don't want the first byte of the red zone to be '\0'; 3026 * an off-by-one in a string may not be properly detected. 3027 */ 3028 pat = pool_pattern_generate(cp); 3029 *cp = (pat == '\0') ? STATIC_BYTE: pat; 3030 cp++; 3031 3032 while (cp < ep) { 3033 *cp = pool_pattern_generate(cp); 3034 cp++; 3035 } 3036 #endif 3037 } 3038 3039 static void 3040 pool_redzone_check(struct pool *pp, void *p) 3041 { 3042 if (!pp->pr_redzone) 3043 return; 3044 #ifdef KASAN 3045 kasan_mark(p, 0, pp->pr_reqsize_with_redzone, KASAN_POOL_FREED); 3046 #else 3047 uint8_t *cp, pat, expected; 3048 const uint8_t *ep; 3049 3050 cp = (uint8_t *)p + pp->pr_reqsize; 3051 ep = cp + POOL_REDZONE_SIZE; 3052 3053 pat = pool_pattern_generate(cp); 3054 expected = (pat == '\0') ? STATIC_BYTE: pat; 3055 if (__predict_false(expected != *cp)) { 3056 printf("%s: %p: 0x%02x != 0x%02x\n", 3057 __func__, cp, *cp, expected); 3058 } 3059 cp++; 3060 3061 while (cp < ep) { 3062 expected = pool_pattern_generate(cp); 3063 if (__predict_false(*cp != expected)) { 3064 printf("%s: %p: 0x%02x != 0x%02x\n", 3065 __func__, cp, *cp, expected); 3066 } 3067 cp++; 3068 } 3069 #endif 3070 } 3071 3072 static void 3073 pool_cache_redzone_check(pool_cache_t pc, void *p) 3074 { 3075 #ifdef KASAN 3076 /* If there is a ctor/dtor, leave the data as valid. 
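	 * A constructed object may still be touched after this point (for
	 * example by the destructor when the cache is invalidated), so it
	 * is left accessible rather than being marked freed.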
*/ 3077 if (__predict_false(pc_has_ctor(pc) || pc_has_dtor(pc))) { 3078 return; 3079 } 3080 #endif 3081 pool_redzone_check(&pc->pc_pool, p); 3082 } 3083 3084 #endif /* POOL_REDZONE */ 3085 3086 #if defined(DDB) 3087 static bool 3088 pool_in_page(struct pool *pp, struct pool_item_header *ph, uintptr_t addr) 3089 { 3090 3091 return (uintptr_t)ph->ph_page <= addr && 3092 addr < (uintptr_t)ph->ph_page + pp->pr_alloc->pa_pagesz; 3093 } 3094 3095 static bool 3096 pool_in_item(struct pool *pp, void *item, uintptr_t addr) 3097 { 3098 3099 return (uintptr_t)item <= addr && addr < (uintptr_t)item + pp->pr_size; 3100 } 3101 3102 static bool 3103 pool_in_cg(struct pool *pp, struct pool_cache_group *pcg, uintptr_t addr) 3104 { 3105 int i; 3106 3107 if (pcg == NULL) { 3108 return false; 3109 } 3110 for (i = 0; i < pcg->pcg_avail; i++) { 3111 if (pool_in_item(pp, pcg->pcg_objects[i].pcgo_va, addr)) { 3112 return true; 3113 } 3114 } 3115 return false; 3116 } 3117 3118 static bool 3119 pool_allocated(struct pool *pp, struct pool_item_header *ph, uintptr_t addr) 3120 { 3121 3122 if ((pp->pr_roflags & PR_USEBMAP) != 0) { 3123 unsigned int idx = pr_item_bitmap_index(pp, ph, (void *)addr); 3124 pool_item_bitmap_t *bitmap = 3125 ph->ph_bitmap + (idx / BITMAP_SIZE); 3126 pool_item_bitmap_t mask = 1 << (idx & BITMAP_MASK); 3127 3128 return (*bitmap & mask) == 0; 3129 } else { 3130 struct pool_item *pi; 3131 3132 LIST_FOREACH(pi, &ph->ph_itemlist, pi_list) { 3133 if (pool_in_item(pp, pi, addr)) { 3134 return false; 3135 } 3136 } 3137 return true; 3138 } 3139 } 3140 3141 void 3142 pool_whatis(uintptr_t addr, void (*pr)(const char *, ...)) 3143 { 3144 struct pool *pp; 3145 3146 TAILQ_FOREACH(pp, &pool_head, pr_poollist) { 3147 struct pool_item_header *ph; 3148 uintptr_t item; 3149 bool allocated = true; 3150 bool incache = false; 3151 bool incpucache = false; 3152 char cpucachestr[32]; 3153 3154 if ((pp->pr_roflags & PR_PHINPAGE) != 0) { 3155 LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) { 3156 if (pool_in_page(pp, ph, addr)) { 3157 goto found; 3158 } 3159 } 3160 LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist) { 3161 if (pool_in_page(pp, ph, addr)) { 3162 allocated = 3163 pool_allocated(pp, ph, addr); 3164 goto found; 3165 } 3166 } 3167 LIST_FOREACH(ph, &pp->pr_emptypages, ph_pagelist) { 3168 if (pool_in_page(pp, ph, addr)) { 3169 allocated = false; 3170 goto found; 3171 } 3172 } 3173 continue; 3174 } else { 3175 ph = pr_find_pagehead_noalign(pp, (void *)addr); 3176 if (ph == NULL || !pool_in_page(pp, ph, addr)) { 3177 continue; 3178 } 3179 allocated = pool_allocated(pp, ph, addr); 3180 } 3181 found: 3182 if (allocated && pp->pr_cache) { 3183 pool_cache_t pc = pp->pr_cache; 3184 struct pool_cache_group *pcg; 3185 int i; 3186 3187 for (pcg = pc->pc_fullgroups; pcg != NULL; 3188 pcg = pcg->pcg_next) { 3189 if (pool_in_cg(pp, pcg, addr)) { 3190 incache = true; 3191 goto print; 3192 } 3193 } 3194 for (i = 0; i < __arraycount(pc->pc_cpus); i++) { 3195 pool_cache_cpu_t *cc; 3196 3197 if ((cc = pc->pc_cpus[i]) == NULL) { 3198 continue; 3199 } 3200 if (pool_in_cg(pp, cc->cc_current, addr) || 3201 pool_in_cg(pp, cc->cc_previous, addr)) { 3202 struct cpu_info *ci = 3203 cpu_lookup(i); 3204 3205 incpucache = true; 3206 snprintf(cpucachestr, 3207 sizeof(cpucachestr), 3208 "cached by CPU %u", 3209 ci->ci_index); 3210 goto print; 3211 } 3212 } 3213 } 3214 print: 3215 item = (uintptr_t)ph->ph_page + ph->ph_off; 3216 item = item + rounddown(addr - item, pp->pr_size); 3217 (*pr)("%p is %p+%zu in POOL '%s' (%s)\n", 3218 (void 
*)addr, item, (size_t)(addr - item), 3219 pp->pr_wchan, 3220 incpucache ? cpucachestr : 3221 incache ? "cached" : allocated ? "allocated" : "free"); 3222 } 3223 } 3224 #endif /* defined(DDB) */ 3225 3226 static int 3227 pool_sysctl(SYSCTLFN_ARGS) 3228 { 3229 struct pool_sysctl data; 3230 struct pool *pp; 3231 struct pool_cache *pc; 3232 pool_cache_cpu_t *cc; 3233 int error; 3234 size_t i, written; 3235 3236 if (oldp == NULL) { 3237 *oldlenp = 0; 3238 TAILQ_FOREACH(pp, &pool_head, pr_poollist) 3239 *oldlenp += sizeof(data); 3240 return 0; 3241 } 3242 3243 memset(&data, 0, sizeof(data)); 3244 error = 0; 3245 written = 0; 3246 TAILQ_FOREACH(pp, &pool_head, pr_poollist) { 3247 if (written + sizeof(data) > *oldlenp) 3248 break; 3249 strlcpy(data.pr_wchan, pp->pr_wchan, sizeof(data.pr_wchan)); 3250 data.pr_pagesize = pp->pr_alloc->pa_pagesz; 3251 data.pr_flags = pp->pr_roflags | pp->pr_flags; 3252 #define COPY(field) data.field = pp->field 3253 COPY(pr_size); 3254 3255 COPY(pr_itemsperpage); 3256 COPY(pr_nitems); 3257 COPY(pr_nout); 3258 COPY(pr_hardlimit); 3259 COPY(pr_npages); 3260 COPY(pr_minpages); 3261 COPY(pr_maxpages); 3262 3263 COPY(pr_nget); 3264 COPY(pr_nfail); 3265 COPY(pr_nput); 3266 COPY(pr_npagealloc); 3267 COPY(pr_npagefree); 3268 COPY(pr_hiwat); 3269 COPY(pr_nidle); 3270 #undef COPY 3271 3272 data.pr_cache_nmiss_pcpu = 0; 3273 data.pr_cache_nhit_pcpu = 0; 3274 if (pp->pr_cache) { 3275 pc = pp->pr_cache; 3276 data.pr_cache_meta_size = pc->pc_pcgsize; 3277 data.pr_cache_nfull = pc->pc_nfull; 3278 data.pr_cache_npartial = pc->pc_npart; 3279 data.pr_cache_nempty = pc->pc_nempty; 3280 data.pr_cache_ncontended = pc->pc_contended; 3281 data.pr_cache_nmiss_global = pc->pc_misses; 3282 data.pr_cache_nhit_global = pc->pc_hits; 3283 for (i = 0; i < pc->pc_ncpu; ++i) { 3284 cc = pc->pc_cpus[i]; 3285 if (cc == NULL) 3286 continue; 3287 data.pr_cache_nmiss_pcpu += cc->cc_misses; 3288 data.pr_cache_nhit_pcpu += cc->cc_hits; 3289 } 3290 } else { 3291 data.pr_cache_meta_size = 0; 3292 data.pr_cache_nfull = 0; 3293 data.pr_cache_npartial = 0; 3294 data.pr_cache_nempty = 0; 3295 data.pr_cache_ncontended = 0; 3296 data.pr_cache_nmiss_global = 0; 3297 data.pr_cache_nhit_global = 0; 3298 } 3299 3300 error = sysctl_copyout(l, &data, oldp, sizeof(data)); 3301 if (error) 3302 break; 3303 written += sizeof(data); 3304 oldp = (char *)oldp + sizeof(data); 3305 } 3306 3307 *oldlenp = written; 3308 return error; 3309 } 3310 3311 SYSCTL_SETUP(sysctl_pool_setup, "sysctl kern.pool setup") 3312 { 3313 const struct sysctlnode *rnode = NULL; 3314 3315 sysctl_createv(clog, 0, NULL, &rnode, 3316 CTLFLAG_PERMANENT, 3317 CTLTYPE_STRUCT, "pool", 3318 SYSCTL_DESCR("Get pool statistics"), 3319 pool_sysctl, 0, NULL, 0, 3320 CTL_KERN, CTL_CREATE, CTL_EOL); 3321 } 3322