/*	$OpenBSD: subr_pool.c,v 1.62 2008/06/26 05:42:20 ray Exp $	*/
/*	$NetBSD: subr_pool.c,v 1.61 2001/09/26 07:14:56 chs Exp $	*/

/*-
 * Copyright (c) 1997, 1999, 2000 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Paul Kranenburg; by Jason R. Thorpe of the Numerical Aerospace
 * Simulation Facility, NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>

#include <uvm/uvm.h>


/*
 * Pool resource management utility.
 *
 * Memory is allocated in pages which are split into pieces according to
 * the pool item size. Each page is kept on one of three lists in the
 * pool structure: `pr_emptypages', `pr_fullpages' and `pr_partpages',
 * for empty, full and partially-full pages respectively. The individual
 * pool items are on a linked list headed by `ph_itemlist' in each page
 * header. The memory for building the page list is either taken from
 * the allocated pages themselves (for small pool items) or taken from
 * an internal pool of page headers (`phpool').
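 *
 * A minimal usage sketch (illustrative only; `foo_pool', `struct foo' and
 * the "foopl" wait channel are hypothetical names, not part of this file):
 *
 *	struct pool foo_pool;
 *	struct foo *f;
 *
 *	pool_init(&foo_pool, sizeof(struct foo), 0, 0, 0, "foopl", NULL);
 *	...
 *	f = pool_get(&foo_pool, PR_WAITOK);
 *	...
 *	pool_put(&foo_pool, f);
 *
 * Passing a NULL allocator to pool_init() selects the default page
 * allocator defined at the bottom of this file.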
 */

/* List of all pools */
TAILQ_HEAD(,pool) pool_head = TAILQ_HEAD_INITIALIZER(pool_head);

/* Private pool for page header structures */
struct pool phpool;

struct pool_item_header {
	/* Page headers */
	LIST_ENTRY(pool_item_header)
		ph_pagelist;	/* pool page list */
	TAILQ_HEAD(,pool_item) ph_itemlist;	/* chunk list for this page */
	SPLAY_ENTRY(pool_item_header)
		ph_node;	/* Off-page page headers */
	int ph_nmissing;	/* # of chunks in use */
	caddr_t ph_page;	/* this page's address */
};

struct pool_item {
#ifdef DIAGNOSTIC
	int pi_magic;
#endif
#ifdef DEADBEEF1
#define PI_MAGIC DEADBEEF1
#else
#define PI_MAGIC 0xdeafbeef
#endif
	/* Other entries use only this list entry */
	TAILQ_ENTRY(pool_item) pi_list;
};

#define POOL_NEEDS_CATCHUP(pp)						\
	((pp)->pr_nitems < (pp)->pr_minitems)

/*
 * Every pool gets a unique serial number assigned to it. If this counter
 * wraps, we're screwed, but we shouldn't create so many pools anyway.
 */
unsigned int pool_serial;

int	 pool_catchup(struct pool *);
void	 pool_prime_page(struct pool *, caddr_t, struct pool_item_header *);
void	 pool_update_curpage(struct pool *);
void	*pool_do_get(struct pool *, int);
void	 pool_do_put(struct pool *, void *);
void	 pr_rmpage(struct pool *, struct pool_item_header *,
	    struct pool_pagelist *);
int	 pool_chk_page(struct pool *, const char *, struct pool_item_header *);
struct pool_item_header *pool_alloc_item_header(struct pool *, caddr_t, int);

void	*pool_allocator_alloc(struct pool *, int);
void	 pool_allocator_free(struct pool *, void *);

#ifdef DDB
void	 pool_print_pagelist(struct pool_pagelist *,
	    int (*)(const char *, ...));
void	 pool_print1(struct pool *, const char *, int (*)(const char *, ...));
#endif

#define pool_sleep(pl) msleep(pl, &pl->pr_mtx, PSWP, pl->pr_wchan, 0)

static __inline int
phtree_compare(struct pool_item_header *a, struct pool_item_header *b)
{
	if (a->ph_page < b->ph_page)
		return (-1);
	else if (a->ph_page > b->ph_page)
		return (1);
	else
		return (0);
}

SPLAY_PROTOTYPE(phtree, pool_item_header, ph_node, phtree_compare);
SPLAY_GENERATE(phtree, pool_item_header, ph_node, phtree_compare);

/*
 * Return the pool page header based on page address.
 */
static __inline struct pool_item_header *
pr_find_pagehead(struct pool *pp, caddr_t page)
{
	struct pool_item_header *ph, tmp;

	if ((pp->pr_roflags & PR_PHINPAGE) != 0)
		return ((struct pool_item_header *)(page + pp->pr_phoffset));

	tmp.ph_page = page;
	ph = SPLAY_FIND(phtree, &pp->pr_phtree, &tmp);
	return ph;
}

/*
 * Remove a page from the pool.
 */
void
pr_rmpage(struct pool *pp, struct pool_item_header *ph,
    struct pool_pagelist *pq)
{

	/*
	 * If the page was idle, decrement the idle page count.
	 */
	if (ph->ph_nmissing == 0) {
#ifdef DIAGNOSTIC
		if (pp->pr_nidle == 0)
			panic("pr_rmpage: nidle inconsistent");
		if (pp->pr_nitems < pp->pr_itemsperpage)
			panic("pr_rmpage: nitems inconsistent");
#endif
		pp->pr_nidle--;
	}

	pp->pr_nitems -= pp->pr_itemsperpage;

	/*
	 * Unlink a page from the pool and release it (or queue it for release).
	 */
	LIST_REMOVE(ph, ph_pagelist);
	if ((pp->pr_roflags & PR_PHINPAGE) == 0)
		SPLAY_REMOVE(phtree, &pp->pr_phtree, ph);
	if (pq) {
		LIST_INSERT_HEAD(pq, ph, ph_pagelist);
	} else {
		pool_allocator_free(pp, ph->ph_page);
		if ((pp->pr_roflags & PR_PHINPAGE) == 0)
			pool_put(&phpool, ph);
	}
	pp->pr_npages--;
	pp->pr_npagefree++;

	pool_update_curpage(pp);
}

/*
 * Initialize the given pool resource structure.
 *
 * We export this routine to allow other kernel parts to declare
 * static pools that must be initialized before malloc() is available.
 */
void
pool_init(struct pool *pp, size_t size, u_int align, u_int ioff, int flags,
    const char *wchan, struct pool_allocator *palloc)
{
	int off, slack;

#ifdef MALLOC_DEBUG
	if ((flags & PR_DEBUG) && (ioff != 0 || align != 0))
		flags &= ~PR_DEBUG;
#endif
	/*
	 * Check arguments and construct default values.
	 */
	if (palloc == NULL)
		palloc = &pool_allocator_nointr;
	if (palloc->pa_pagesz == 0) {
		palloc->pa_pagesz = PAGE_SIZE;
		palloc->pa_pagemask = ~(palloc->pa_pagesz - 1);
		palloc->pa_pageshift = ffs(palloc->pa_pagesz) - 1;
	}

	if (align == 0)
		align = ALIGN(1);

	if (size < sizeof(struct pool_item))
		size = sizeof(struct pool_item);

	size = roundup(size, align);
#ifdef DIAGNOSTIC
	if (size > palloc->pa_pagesz)
		panic("pool_init: pool item size (%lu) too large",
		    (u_long)size);
#endif

	/*
	 * Initialize the pool structure.
	 */
	LIST_INIT(&pp->pr_emptypages);
	LIST_INIT(&pp->pr_fullpages);
	LIST_INIT(&pp->pr_partpages);
	pp->pr_curpage = NULL;
	pp->pr_npages = 0;
	pp->pr_minitems = 0;
	pp->pr_minpages = 0;
	pp->pr_maxpages = 8;
	pp->pr_roflags = flags;
	pp->pr_flags = 0;
	pp->pr_size = size;
	pp->pr_align = align;
	pp->pr_wchan = wchan;
	pp->pr_alloc = palloc;
	pp->pr_nitems = 0;
	pp->pr_nout = 0;
	pp->pr_hardlimit = UINT_MAX;
	pp->pr_hardlimit_warning = NULL;
	pp->pr_hardlimit_ratecap.tv_sec = 0;
	pp->pr_hardlimit_ratecap.tv_usec = 0;
	pp->pr_hardlimit_warning_last.tv_sec = 0;
	pp->pr_hardlimit_warning_last.tv_usec = 0;
	pp->pr_serial = ++pool_serial;
	if (pool_serial == 0)
		panic("pool_init: too much uptime");

	/*
	 * Decide whether to put the page header off page to avoid
	 * wasting too large a part of the page. Off-page page headers
	 * go in a splay tree, so we can match a returned item with its
	 * header based on the page address.
	 * We use 1/16 of the page size as the threshold (XXX: tune)
	 */
	if (pp->pr_size < palloc->pa_pagesz/16) {
		/* Use the end of the page for the page header */
		pp->pr_roflags |= PR_PHINPAGE;
		pp->pr_phoffset = off = palloc->pa_pagesz -
		    ALIGN(sizeof(struct pool_item_header));
	} else {
		/* The page header will be taken from our page header pool */
		pp->pr_phoffset = 0;
		off = palloc->pa_pagesz;
		SPLAY_INIT(&pp->pr_phtree);
	}

	/*
	 * Alignment is to take place at `ioff' within the item. This means
	 * we must reserve up to `align - 1' bytes on the page to allow
	 * appropriate positioning of each item.
	 *
	 * Silently enforce `0 <= ioff < align'.
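	 *
	 * As a worked example (the numbers are purely illustrative): with a
	 * 4096-byte page, an in-page header assumed to occupy 64 bytes (so
	 * `off' is 4032), align = 8, ioff = 0 and pr_size = 104, we get
	 * pr_itemsperpage = 4032 / 104 = 38, leaving a slack of
	 * 4032 - 38 * 104 = 80 bytes for the cache coloring computed below.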
	 */
	pp->pr_itemoffset = ioff = ioff % align;
	pp->pr_itemsperpage = (off - ((align - ioff) % align)) / pp->pr_size;
	KASSERT(pp->pr_itemsperpage != 0);

	/*
	 * Use the slack between the chunks and the page header
	 * for "cache coloring".
	 */
	slack = off - pp->pr_itemsperpage * pp->pr_size;
	pp->pr_maxcolor = (slack / align) * align;
	pp->pr_curcolor = 0;

	pp->pr_nget = 0;
	pp->pr_nfail = 0;
	pp->pr_nput = 0;
	pp->pr_npagealloc = 0;
	pp->pr_npagefree = 0;
	pp->pr_hiwat = 0;
	pp->pr_nidle = 0;

	pp->pr_ipl = -1;
	mtx_init(&pp->pr_mtx, IPL_NONE);

	if (phpool.pr_size == 0) {
		pool_init(&phpool, sizeof(struct pool_item_header), 0, 0,
		    0, "phpool", NULL);
		pool_setipl(&phpool, IPL_HIGH);
	}

	/* Insert this into the list of all pools. */
	TAILQ_INSERT_TAIL(&pool_head, pp, pr_poollist);
}

void
pool_setipl(struct pool *pp, int ipl)
{
	pp->pr_ipl = ipl;
	mtx_init(&pp->pr_mtx, ipl);
}

/*
 * Decommission a pool resource.
 */
void
pool_destroy(struct pool *pp)
{
	struct pool_item_header *ph;

#ifdef DIAGNOSTIC
	if (pp->pr_nout != 0)
		panic("pool_destroy: pool busy: still out: %u", pp->pr_nout);
#endif

	/* Remove all pages */
	while ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL)
		pr_rmpage(pp, ph, NULL);
	KASSERT(LIST_EMPTY(&pp->pr_fullpages));
	KASSERT(LIST_EMPTY(&pp->pr_partpages));

	/* Remove from global pool list */
	TAILQ_REMOVE(&pool_head, pp, pr_poollist);
}

struct pool_item_header *
pool_alloc_item_header(struct pool *pp, caddr_t storage, int flags)
{
	struct pool_item_header *ph;

	if ((pp->pr_roflags & PR_PHINPAGE) != 0)
		ph = (struct pool_item_header *)(storage + pp->pr_phoffset);
	else {
		ph = pool_get(&phpool, flags);
	}

	return (ph);
}

/*
 * Grab an item from the pool; must be called at appropriate spl level
 */
void *
pool_get(struct pool *pp, int flags)
{
	void *v;

	mtx_enter(&pp->pr_mtx);
	v = pool_do_get(pp, flags);
	mtx_leave(&pp->pr_mtx);
	if (v && pp->pr_ctor && pp->pr_ctor(pp->pr_arg, v, flags)) {
		mtx_enter(&pp->pr_mtx);
		pool_do_put(pp, v);
		mtx_leave(&pp->pr_mtx);
		v = NULL;
	}
	if (v) {
		pp->pr_nget++;
		if (flags & PR_ZERO)
			memset(v, 0, pp->pr_size);
	}
	return (v);
}

void *
pool_do_get(struct pool *pp, int flags)
{
	struct pool_item *pi;
	struct pool_item_header *ph;
	void *v;

#ifdef DIAGNOSTIC
	if ((flags & PR_WAITOK) != 0)
		splassert(IPL_NONE);
	if (pp->pr_ipl != -1)
		splassert(pp->pr_ipl);
#endif /* DIAGNOSTIC */

#ifdef MALLOC_DEBUG
	if (pp->pr_roflags & PR_DEBUG) {
		void *addr;

		addr = NULL;
		debug_malloc(pp->pr_size, M_DEBUG,
		    (flags & PR_WAITOK) ? M_WAITOK : M_NOWAIT, &addr);
		return (addr);
	}
#endif

startover:
	/*
	 * Check to see if we've reached the hard limit. If we have,
	 * and we can wait, then wait until an item has been returned to
	 * the pool.
	 */
#ifdef DIAGNOSTIC
	if (__predict_false(pp->pr_nout > pp->pr_hardlimit))
		panic("pool_do_get: %s: crossed hard limit", pp->pr_wchan);
#endif
	if (__predict_false(pp->pr_nout == pp->pr_hardlimit)) {
		if ((flags & PR_WAITOK) && !(flags & PR_LIMITFAIL)) {
			/*
			 * XXX: A warning isn't logged in this case. Should
			 * it be?
			 */
			pp->pr_flags |= PR_WANTED;
			pool_sleep(pp);
			goto startover;
		}

		/*
		 * Log a message that the hard limit has been hit.
		 */
		if (pp->pr_hardlimit_warning != NULL &&
		    ratecheck(&pp->pr_hardlimit_warning_last,
		    &pp->pr_hardlimit_ratecap))
			log(LOG_ERR, "%s\n", pp->pr_hardlimit_warning);

		pp->pr_nfail++;
		return (NULL);
	}

	/*
	 * The convention we use is that if `curpage' is not NULL, then
	 * it points at a non-empty bucket. In particular, `curpage'
	 * never points at a page header which has PR_PHINPAGE set and
	 * has no items in its bucket.
	 */
	if ((ph = pp->pr_curpage) == NULL) {
#ifdef DIAGNOSTIC
		if (pp->pr_nitems != 0) {
			printf("pool_do_get: %s: curpage NULL, nitems %u\n",
			    pp->pr_wchan, pp->pr_nitems);
			panic("pool_do_get: nitems inconsistent");
		}
#endif

		/*
		 * Call the back-end page allocator for more memory.
		 */
		v = pool_allocator_alloc(pp, flags);
		if (__predict_true(v != NULL))
			ph = pool_alloc_item_header(pp, v, flags);

		if (__predict_false(v == NULL || ph == NULL)) {
			if (v != NULL)
				pool_allocator_free(pp, v);

			if ((flags & PR_WAITOK) == 0) {
				pp->pr_nfail++;
				return (NULL);
			}

			/*
			 * Wait for items to be returned to this pool.
			 *
			 * XXX: maybe we should wake up once a second and
			 * try again?
			 */
			pp->pr_flags |= PR_WANTED;
			pool_sleep(pp);
			goto startover;
		}

		/* We have more memory; add it to the pool */
		pool_prime_page(pp, v, ph);
		pp->pr_npagealloc++;

		/* Start the allocation process over. */
		goto startover;
	}
	if (__predict_false((v = pi = TAILQ_FIRST(&ph->ph_itemlist)) == NULL)) {
		panic("pool_do_get: %s: page empty", pp->pr_wchan);
	}
#ifdef DIAGNOSTIC
	if (__predict_false(pp->pr_nitems == 0)) {
		printf("pool_do_get: %s: items on itemlist, nitems %u\n",
		    pp->pr_wchan, pp->pr_nitems);
		panic("pool_do_get: nitems inconsistent");
	}
#endif

#ifdef DIAGNOSTIC
	if (__predict_false(pi->pi_magic != PI_MAGIC)) {
		panic("pool_do_get(%s): free list modified: magic=%x; page %p;"
		    " item addr %p",
		    pp->pr_wchan, pi->pi_magic, ph->ph_page, pi);
	}
#endif

	/*
	 * Remove from item list.
	 */
	TAILQ_REMOVE(&ph->ph_itemlist, pi, pi_list);
	pp->pr_nitems--;
	pp->pr_nout++;
	if (ph->ph_nmissing == 0) {
#ifdef DIAGNOSTIC
		if (__predict_false(pp->pr_nidle == 0))
			panic("pool_do_get: nidle inconsistent");
#endif
		pp->pr_nidle--;

		/*
		 * This page was previously empty. Move it to the list of
		 * partially-full pages. This page is already curpage.
		 */
		LIST_REMOVE(ph, ph_pagelist);
		LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist);
	}
	ph->ph_nmissing++;
	if (TAILQ_EMPTY(&ph->ph_itemlist)) {
#ifdef DIAGNOSTIC
		if (__predict_false(ph->ph_nmissing != pp->pr_itemsperpage)) {
			panic("pool_do_get: %s: nmissing inconsistent",
			    pp->pr_wchan);
		}
#endif
		/*
		 * This page is now full. Move it to the full list
		 * and select a new current page.
		 */
		LIST_REMOVE(ph, ph_pagelist);
		LIST_INSERT_HEAD(&pp->pr_fullpages, ph, ph_pagelist);
		pool_update_curpage(pp);
	}

	/*
	 * If we have a low water mark and we are now below that low
	 * water mark, add more items to the pool.
	 */
	if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
		/*
		 * XXX: Should we log a warning?  Should we set up a timeout
		 * to try again in a second or so?  The latter could break
		 * a caller's assumptions about interrupt protection, etc.
		 */
	}
	return (v);
}

/*
 * Return resource to the pool; must be called at appropriate spl level
 */
void
pool_put(struct pool *pp, void *v)
{
	if (pp->pr_dtor)
		pp->pr_dtor(pp->pr_arg, v);
	mtx_enter(&pp->pr_mtx);
	pool_do_put(pp, v);
	mtx_leave(&pp->pr_mtx);
	pp->pr_nput++;
}

/*
 * Internal version of pool_put().
 */
void
pool_do_put(struct pool *pp, void *v)
{
	struct pool_item *pi = v;
	struct pool_item_header *ph;
	caddr_t page;

#ifdef MALLOC_DEBUG
	if (pp->pr_roflags & PR_DEBUG) {
		debug_free(v, M_DEBUG);
		return;
	}
#endif

	page = (caddr_t)((vaddr_t)v & pp->pr_alloc->pa_pagemask);

#ifdef DIAGNOSTIC
	if (pp->pr_ipl != -1)
		splassert(pp->pr_ipl);

	if (__predict_false(pp->pr_nout == 0)) {
		printf("pool %s: putting with none out\n",
		    pp->pr_wchan);
		panic("pool_do_put");
	}
#endif

	if (__predict_false((ph = pr_find_pagehead(pp, page)) == NULL)) {
		panic("pool_do_put: %s: page header missing", pp->pr_wchan);
	}

	/*
	 * Return to item list.
	 */
#ifdef DIAGNOSTIC
	pi->pi_magic = PI_MAGIC;
#endif
#ifdef DEBUG
	{
		int i, *ip = v;

		for (i = 0; i < pp->pr_size / sizeof(int); i++) {
			*ip++ = PI_MAGIC;
		}
	}
#endif

	TAILQ_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list);
	ph->ph_nmissing--;
	pp->pr_nitems++;
	pp->pr_nout--;

	/* Cancel "pool empty" condition if it exists */
	if (pp->pr_curpage == NULL)
		pp->pr_curpage = ph;

	if (pp->pr_flags & PR_WANTED) {
		pp->pr_flags &= ~PR_WANTED;
		if (ph->ph_nmissing == 0)
			pp->pr_nidle++;
		wakeup(pp);
		return;
	}

	/*
	 * If this page is now empty, do one of two things:
	 *
	 * (1) If we have more pages than the page high water mark,
	 *     free the page back to the system.
	 *
	 * (2) Otherwise, move the page to the empty page list.
	 *
	 * Either way, select a new current page (so we use a partially-full
	 * page if one is available).
	 */
	if (ph->ph_nmissing == 0) {
		pp->pr_nidle++;
		if (pp->pr_nidle > pp->pr_maxpages) {
			pr_rmpage(pp, ph, NULL);
		} else {
			LIST_REMOVE(ph, ph_pagelist);
			LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist);
		}
		pool_update_curpage(pp);
	}

	/*
	 * If the page was previously completely full, move it to the
	 * partially-full list and make it the current page. The next
	 * allocation will get the item from this page, instead of
	 * further fragmenting the pool.
	 */
	else if (ph->ph_nmissing == (pp->pr_itemsperpage - 1)) {
		LIST_REMOVE(ph, ph_pagelist);
		LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist);
		pp->pr_curpage = ph;
	}
}

/*
 * Add N items to the pool.
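 *
 * Illustrative (hypothetical) use: a subsystem that must satisfy
 * allocations from interrupt context could call
 *
 *	pool_prime(&foo_pool, 32);
 *
 * at attach time to pre-allocate enough pages for 32 items, since the
 * pages are grabbed here with PR_NOWAIT and no sleeping is done.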
 */
int
pool_prime(struct pool *pp, int n)
{
	struct pool_item_header *ph;
	caddr_t cp;
	int newpages;

	mtx_enter(&pp->pr_mtx);
	newpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;

	while (newpages-- > 0) {
		cp = pool_allocator_alloc(pp, PR_NOWAIT);
		if (__predict_true(cp != NULL))
			ph = pool_alloc_item_header(pp, cp, PR_NOWAIT);
		if (__predict_false(cp == NULL || ph == NULL)) {
			if (cp != NULL)
				pool_allocator_free(pp, cp);
			break;
		}

		pool_prime_page(pp, cp, ph);
		pp->pr_npagealloc++;
		pp->pr_minpages++;
	}

	if (pp->pr_minpages >= pp->pr_maxpages)
		pp->pr_maxpages = pp->pr_minpages + 1;	/* XXX */

	mtx_leave(&pp->pr_mtx);
	return (0);
}

/*
 * Add a page worth of items to the pool.
 *
 * Note, we must be called with the pool descriptor LOCKED.
 */
void
pool_prime_page(struct pool *pp, caddr_t storage, struct pool_item_header *ph)
{
	struct pool_item *pi;
	caddr_t cp = storage;
	unsigned int align = pp->pr_align;
	unsigned int ioff = pp->pr_itemoffset;
	int n;

#ifdef DIAGNOSTIC
	if (((u_long)cp & (pp->pr_alloc->pa_pagesz - 1)) != 0)
		panic("pool_prime_page: %s: unaligned page", pp->pr_wchan);
#endif

	/*
	 * Insert page header.
	 */
	LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist);
	TAILQ_INIT(&ph->ph_itemlist);
	ph->ph_page = storage;
	ph->ph_nmissing = 0;
	if ((pp->pr_roflags & PR_PHINPAGE) == 0)
		SPLAY_INSERT(phtree, &pp->pr_phtree, ph);

	pp->pr_nidle++;

	/*
	 * Color this page.
	 */
	cp = (caddr_t)(cp + pp->pr_curcolor);
	if ((pp->pr_curcolor += align) > pp->pr_maxcolor)
		pp->pr_curcolor = 0;

	/*
	 * Adjust storage to apply alignment to `pr_itemoffset' in each item.
	 */
	if (ioff != 0)
		cp = (caddr_t)(cp + (align - ioff));

	/*
	 * Insert remaining chunks on the bucket list.
	 */
	n = pp->pr_itemsperpage;
	pp->pr_nitems += n;

	while (n--) {
		pi = (struct pool_item *)cp;

		KASSERT(((((vaddr_t)pi) + ioff) & (align - 1)) == 0);

		/* Insert on page list */
		TAILQ_INSERT_TAIL(&ph->ph_itemlist, pi, pi_list);
#ifdef DIAGNOSTIC
		pi->pi_magic = PI_MAGIC;
#endif
		cp = (caddr_t)(cp + pp->pr_size);
	}

	/*
	 * If the pool was depleted, point at the new page.
	 */
	if (pp->pr_curpage == NULL)
		pp->pr_curpage = ph;

	if (++pp->pr_npages > pp->pr_hiwat)
		pp->pr_hiwat = pp->pr_npages;
}

/*
 * Used by pool_get() when nitems drops below the low water mark; this
 * catches pr_nitems back up to the low water mark.
 *
 * Note we never wait for memory here, we let the caller decide what to do.
 */
int
pool_catchup(struct pool *pp)
{
	struct pool_item_header *ph;
	caddr_t cp;
	int error = 0;

	while (POOL_NEEDS_CATCHUP(pp)) {
		/*
		 * Call the page back-end allocator for more memory.
		 */
		cp = pool_allocator_alloc(pp, PR_NOWAIT);
		if (__predict_true(cp != NULL))
			ph = pool_alloc_item_header(pp, cp, PR_NOWAIT);
		if (__predict_false(cp == NULL || ph == NULL)) {
			if (cp != NULL)
				pool_allocator_free(pp, cp);
			error = ENOMEM;
			break;
		}
		pool_prime_page(pp, cp, ph);
		pp->pr_npagealloc++;
	}

	return (error);
}

void
pool_update_curpage(struct pool *pp)
{

	pp->pr_curpage = LIST_FIRST(&pp->pr_partpages);
	if (pp->pr_curpage == NULL) {
		pp->pr_curpage = LIST_FIRST(&pp->pr_emptypages);
	}
}

void
pool_setlowat(struct pool *pp, int n)
{

	pp->pr_minitems = n;
	pp->pr_minpages = (n == 0)
		? 0
		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;

	mtx_enter(&pp->pr_mtx);
	/* Make sure we're caught up with the newly-set low water mark. */
	if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
		/*
		 * XXX: Should we log a warning?  Should we set up a timeout
		 * to try again in a second or so?  The latter could break
		 * a caller's assumptions about interrupt protection, etc.
		 */
	}
	mtx_leave(&pp->pr_mtx);
}

void
pool_sethiwat(struct pool *pp, int n)
{

	pp->pr_maxpages = (n == 0)
		? 0
		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
}

int
pool_sethardlimit(struct pool *pp, u_int n, const char *warnmsg, int ratecap)
{
	int error = 0;

	if (n < pp->pr_nout) {
		error = EINVAL;
		goto done;
	}

	pp->pr_hardlimit = n;
	pp->pr_hardlimit_warning = warnmsg;
	pp->pr_hardlimit_ratecap.tv_sec = ratecap;
	pp->pr_hardlimit_warning_last.tv_sec = 0;
	pp->pr_hardlimit_warning_last.tv_usec = 0;

	/*
	 * In-line version of pool_sethiwat().
	 */
	pp->pr_maxpages = (n == 0 || n == UINT_MAX)
		? n
		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;

done:
	return (error);
}

void
pool_set_ctordtor(struct pool *pp, int (*ctor)(void *, void *, int),
    void (*dtor)(void *, void *), void *arg)
{
	pp->pr_ctor = ctor;
	pp->pr_dtor = dtor;
	pp->pr_arg = arg;
}

/*
 * Release all complete pages that have not been used recently.
 *
 * Returns non-zero if any pages have been reclaimed.
 */
int
pool_reclaim(struct pool *pp)
{
	struct pool_item_header *ph, *phnext;
	struct pool_pagelist pq;

	LIST_INIT(&pq);

	mtx_enter(&pp->pr_mtx);
	for (ph = LIST_FIRST(&pp->pr_emptypages); ph != NULL; ph = phnext) {
		phnext = LIST_NEXT(ph, ph_pagelist);

		/* Check our minimum page claim */
		if (pp->pr_npages <= pp->pr_minpages)
			break;

		KASSERT(ph->ph_nmissing == 0);

		/*
		 * If freeing this page would put us below
		 * the low water mark, stop now.
		 */
		if ((pp->pr_nitems - pp->pr_itemsperpage) <
		    pp->pr_minitems)
			break;

		pr_rmpage(pp, ph, &pq);
	}
	mtx_leave(&pp->pr_mtx);

	if (LIST_EMPTY(&pq))
		return (0);
	while ((ph = LIST_FIRST(&pq)) != NULL) {
		LIST_REMOVE(ph, ph_pagelist);
		pool_allocator_free(pp, ph->ph_page);
		if (pp->pr_roflags & PR_PHINPAGE)
			continue;
		pool_put(&phpool, ph);
	}

	return (1);
}

#ifdef DDB
#include <machine/db_machdep.h>
#include <ddb/db_interface.h>
#include <ddb/db_output.h>

/*
 * Diagnostic helpers.
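 *
 * These are only built with DDB. From the debugger, "show all pools"
 * prints the summary table produced by db_show_all_pools() below, and
 * "show all pools /a" lists each pool's address and backend allocator
 * instead; pool_print1() adds per-page detail when given the `p' modifier.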
 */
void
pool_printit(struct pool *pp, const char *modif, int (*pr)(const char *, ...))
{
	pool_print1(pp, modif, pr);
}

void
pool_print_pagelist(struct pool_pagelist *pl, int (*pr)(const char *, ...))
{
	struct pool_item_header *ph;
#ifdef DIAGNOSTIC
	struct pool_item *pi;
#endif

	LIST_FOREACH(ph, pl, ph_pagelist) {
		(*pr)("\t\tpage %p, nmissing %d\n",
		    ph->ph_page, ph->ph_nmissing);
#ifdef DIAGNOSTIC
		TAILQ_FOREACH(pi, &ph->ph_itemlist, pi_list) {
			if (pi->pi_magic != PI_MAGIC) {
				(*pr)("\t\t\titem %p, magic 0x%x\n",
				    pi, pi->pi_magic);
			}
		}
#endif
	}
}

void
pool_print1(struct pool *pp, const char *modif, int (*pr)(const char *, ...))
{
	struct pool_item_header *ph;
	int print_pagelist = 0;
	char c;

	while ((c = *modif++) != '\0') {
		if (c == 'p')
			print_pagelist = 1;
	}

	(*pr)("POOL %s: size %u, align %u, ioff %u, roflags 0x%08x\n",
	    pp->pr_wchan, pp->pr_size, pp->pr_align, pp->pr_itemoffset,
	    pp->pr_roflags);
	(*pr)("\talloc %p\n", pp->pr_alloc);
	(*pr)("\tminitems %u, minpages %u, maxpages %u, npages %u\n",
	    pp->pr_minitems, pp->pr_minpages, pp->pr_maxpages, pp->pr_npages);
	(*pr)("\titemsperpage %u, nitems %u, nout %u, hardlimit %u\n",
	    pp->pr_itemsperpage, pp->pr_nitems, pp->pr_nout, pp->pr_hardlimit);

	(*pr)("\n\tnget %lu, nfail %lu, nput %lu\n",
	    pp->pr_nget, pp->pr_nfail, pp->pr_nput);
	(*pr)("\tnpagealloc %lu, npagefree %lu, hiwat %u, nidle %lu\n",
	    pp->pr_npagealloc, pp->pr_npagefree, pp->pr_hiwat, pp->pr_nidle);

	if (print_pagelist == 0)
		return;

	if ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL)
		(*pr)("\n\tempty page list:\n");
	pool_print_pagelist(&pp->pr_emptypages, pr);
	if ((ph = LIST_FIRST(&pp->pr_fullpages)) != NULL)
		(*pr)("\n\tfull page list:\n");
	pool_print_pagelist(&pp->pr_fullpages, pr);
	if ((ph = LIST_FIRST(&pp->pr_partpages)) != NULL)
		(*pr)("\n\tpartial-page list:\n");
	pool_print_pagelist(&pp->pr_partpages, pr);

	if (pp->pr_curpage == NULL)
		(*pr)("\tno current page\n");
	else
		(*pr)("\tcurpage %p\n", pp->pr_curpage->ph_page);
}

void
db_show_all_pools(db_expr_t expr, int haddr, db_expr_t count, char *modif)
{
	struct pool *pp;
	char maxp[16];
	int ovflw;
	char mode;

	mode = modif[0];
	if (mode != '\0' && mode != 'a') {
		db_printf("usage: show all pools [/a]\n");
		return;
	}

	if (mode == '\0')
		db_printf("%-10s%4s%9s%5s%9s%6s%6s%6s%6s%6s%6s%5s\n",
		    "Name",
		    "Size",
		    "Requests",
		    "Fail",
		    "Releases",
		    "Pgreq",
		    "Pgrel",
		    "Npage",
		    "Hiwat",
		    "Minpg",
		    "Maxpg",
		    "Idle");
	else
		db_printf("%-10s %18s %18s\n",
		    "Name", "Address", "Allocator");

	TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
		if (mode == 'a') {
			db_printf("%-10s %18p %18p\n", pp->pr_wchan, pp,
			    pp->pr_alloc);
			continue;
		}

		if (!pp->pr_nget)
			continue;

		if (pp->pr_maxpages == UINT_MAX)
			snprintf(maxp, sizeof maxp, "inf");
		else
			snprintf(maxp, sizeof maxp, "%u", pp->pr_maxpages);

#define PRWORD(ovflw, fmt, width, fixed, val) do {	\
	(ovflw) += db_printf((fmt),			\
	    (width) - (fixed) - (ovflw) > 0 ?		\
	    (width) - (fixed) - (ovflw) : 0,		\
	    (val)) - (width);				\
	if ((ovflw) < 0)				\
		(ovflw) = 0;				\
} while (/* CONSTCOND */0)

		ovflw = 0;
		PRWORD(ovflw, "%-*s", 10, 0, pp->pr_wchan);
		PRWORD(ovflw, " %*u", 4, 1, pp->pr_size);
		PRWORD(ovflw, " %*lu", 9, 1, pp->pr_nget);
		PRWORD(ovflw, " %*lu", 5, 1, pp->pr_nfail);
		PRWORD(ovflw, " %*lu", 9, 1, pp->pr_nput);
		PRWORD(ovflw, " %*lu", 6, 1, pp->pr_npagealloc);
		PRWORD(ovflw, " %*lu", 6, 1, pp->pr_npagefree);
		PRWORD(ovflw, " %*d", 6, 1, pp->pr_npages);
		PRWORD(ovflw, " %*d", 6, 1, pp->pr_hiwat);
		PRWORD(ovflw, " %*d", 6, 1, pp->pr_minpages);
		PRWORD(ovflw, " %*s", 6, 1, maxp);
		PRWORD(ovflw, " %*lu\n", 5, 1, pp->pr_nidle);
	}
}

int
pool_chk_page(struct pool *pp, const char *label, struct pool_item_header *ph)
{
	struct pool_item *pi;
	caddr_t page;
	int n;

	page = (caddr_t)((u_long)ph & pp->pr_alloc->pa_pagemask);
	if (page != ph->ph_page &&
	    (pp->pr_roflags & PR_PHINPAGE) != 0) {
		if (label != NULL)
			printf("%s: ", label);
		printf("pool(%p:%s): page inconsistency: page %p;"
		    " at page head addr %p (p %p)\n", pp,
		    pp->pr_wchan, ph->ph_page,
		    ph, page);
		return 1;
	}

	for (pi = TAILQ_FIRST(&ph->ph_itemlist), n = 0;
	     pi != NULL;
	     pi = TAILQ_NEXT(pi, pi_list), n++) {

#ifdef DIAGNOSTIC
		if (pi->pi_magic != PI_MAGIC) {
			if (label != NULL)
				printf("%s: ", label);
			printf("pool(%s): free list modified: magic=%x;"
			    " page %p; item ordinal %d;"
			    " addr %p (p %p)\n",
			    pp->pr_wchan, pi->pi_magic, ph->ph_page,
			    n, pi, page);
			panic("pool");
		}
#endif
		page =
		    (caddr_t)((u_long)pi & pp->pr_alloc->pa_pagemask);
		if (page == ph->ph_page)
			continue;

		if (label != NULL)
			printf("%s: ", label);
		printf("pool(%p:%s): page inconsistency: page %p;"
		    " item ordinal %d; addr %p (p %p)\n", pp,
		    pp->pr_wchan, ph->ph_page,
		    n, pi, page);
		return 1;
	}
	return 0;
}

int
pool_chk(struct pool *pp, const char *label)
{
	struct pool_item_header *ph;
	int r = 0;

	LIST_FOREACH(ph, &pp->pr_emptypages, ph_pagelist) {
		r = pool_chk_page(pp, label, ph);
		if (r) {
			goto out;
		}
	}
	LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) {
		r = pool_chk_page(pp, label, ph);
		if (r) {
			goto out;
		}
	}
	LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist) {
		r = pool_chk_page(pp, label, ph);
		if (r) {
			goto out;
		}
	}

out:
	return (r);
}
#endif

/*
 * We have three different sysctls.
 * kern.pool.npools - the number of pools.
 * kern.pool.pool.<pool#> - the pool struct for the pool#.
 * kern.pool.name.<pool#> - the name for pool#.
 */
int
sysctl_dopool(int *name, u_int namelen, char *where, size_t *sizep)
{
	struct pool *pp, *foundpool = NULL;
	size_t buflen = where != NULL ? *sizep : 0;
	int npools = 0, s;
	unsigned int lookfor;
	size_t len;

	switch (*name) {
	case KERN_POOL_NPOOLS:
		if (namelen != 1 || buflen != sizeof(int))
			return (EINVAL);
		lookfor = 0;
		break;
	case KERN_POOL_NAME:
		if (namelen != 2 || buflen < 1)
			return (EINVAL);
		lookfor = name[1];
		break;
	case KERN_POOL_POOL:
		if (namelen != 2 || buflen != sizeof(struct pool))
			return (EINVAL);
		lookfor = name[1];
		break;
	default:
		return (EINVAL);
	}

	s = splvm();

	TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
		npools++;
		if (lookfor == pp->pr_serial) {
			foundpool = pp;
			break;
		}
	}

	splx(s);

	if (*name != KERN_POOL_NPOOLS && foundpool == NULL)
		return (ENOENT);

	switch (*name) {
	case KERN_POOL_NPOOLS:
		return copyout(&npools, where, buflen);
	case KERN_POOL_NAME:
		len = strlen(foundpool->pr_wchan) + 1;
		if (*sizep < len)
			return (ENOMEM);
		*sizep = len;
		return copyout(foundpool->pr_wchan, where, len);
	case KERN_POOL_POOL:
		return copyout(foundpool, where, buflen);
	}
	/* NOTREACHED */
	return (0); /* XXX - Stupid gcc */
}

/*
 * Pool backend allocators.
 *
 * Each pool has a backend allocator that handles the allocation and
 * deallocation of whole pages for that pool.
 */
void	*pool_page_alloc_oldnointr(struct pool *, int);
void	 pool_page_free_oldnointr(struct pool *, void *);
void	*pool_page_alloc(struct pool *, int);
void	 pool_page_free(struct pool *, void *);

/*
 * This is the default allocator. It is safe for use from interrupts;
 * the name is preserved for compatibility.
 */
struct pool_allocator pool_allocator_nointr = {
	pool_page_alloc, pool_page_free, 0,
};

/*
 * XXX - we have at least three different resources for the same allocation
 * and each resource can be depleted. First we have the ready elements in
 * the pool. Then we have the resource (typically a vm_map) for this
 * allocator, then we have physical memory. Waiting for any of these can
 * be unnecessary when any other is freed, but the kernel doesn't support
 * sleeping on multiple addresses, so we have to fake it. The caller sleeps
 * on the pool (so that we can be awakened when an item is returned to the
 * pool), but we set PA_WANT on the allocator. When a page is returned to
 * the allocator and PA_WANT is set, pool_allocator_free will wake up all
 * sleeping pools belonging to this allocator. (XXX - thundering herd).
 * We also wake up the allocator in case someone without a pool (malloc)
 * is sleeping waiting for this allocator.
 */

void *
pool_allocator_alloc(struct pool *pp, int flags)
{
	boolean_t waitok = (flags & PR_WAITOK) ? TRUE : FALSE;
	void *v;

	if (waitok)
		mtx_leave(&pp->pr_mtx);
	v = pp->pr_alloc->pa_alloc(pp, flags);
	if (waitok)
		mtx_enter(&pp->pr_mtx);

	return (v);
}

void
pool_allocator_free(struct pool *pp, void *v)
{
	struct pool_allocator *pa = pp->pr_alloc;

	(*pa->pa_free)(pp, v);
}

void *
pool_page_alloc(struct pool *pp, int flags)
{
	boolean_t waitok = (flags & PR_WAITOK) ? TRUE : FALSE;

	return (uvm_km_getpage(waitok));
}

void
pool_page_free(struct pool *pp, void *v)
{

	uvm_km_putpage(v);
}
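
/*
 * Sketch of how a subsystem could plug in its own backend allocator
 * (the foo_* names are hypothetical, shown only for illustration):
 *
 *	void	*foo_page_alloc(struct pool *, int);
 *	void	 foo_page_free(struct pool *, void *);
 *
 *	struct pool_allocator foo_allocator = {
 *		foo_page_alloc, foo_page_free, 0,
 *	};
 *
 *	pool_init(&foo_pool, sizeof(struct foo), 0, 0, 0, "foopl",
 *	    &foo_allocator);
 *
 * Leaving pa_pagesz as 0 lets pool_init() fill in PAGE_SIZE and the
 * derived page mask and shift, as done above.
 */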