/*	$OpenBSD: subr_pool.c,v 1.58 2007/12/11 15:04:58 tedu Exp $	*/
/*	$NetBSD: subr_pool.c,v 1.61 2001/09/26 07:14:56 chs Exp $	*/

/*-
 * Copyright (c) 1997, 1999, 2000 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Paul Kranenburg; by Jason R. Thorpe of the Numerical Aerospace
 * Simulation Facility, NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>

#include <uvm/uvm.h>

/*
 * Pool resource management utility.
 *
 * Memory is allocated in pages which are split into pieces according to
 * the pool item size. Each page is kept on one of three lists in the
 * pool structure: `pr_emptypages', `pr_fullpages' and `pr_partpages',
 * for empty, full and partially-full pages respectively. The individual
 * pool items are on a linked list headed by `ph_itemlist' in each page
 * header. The memory for building the page list is either taken from
 * the allocated pages themselves (for small pool items) or taken from
 * an internal pool of page headers (`phpool').
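 *
 * Typical usage, as a rough sketch (`struct foo', `foo_pool' and the
 * "foopl" wait channel are illustrative names, not defined in this file):
 *
 *	struct pool foo_pool;
 *
 *	pool_init(&foo_pool, sizeof(struct foo), 0, 0, 0, "foopl", NULL);
 *	...
 *	f = pool_get(&foo_pool, PR_WAITOK);
 *	...
 *	pool_put(&foo_pool, f);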
 */

/* List of all pools */
TAILQ_HEAD(,pool) pool_head = TAILQ_HEAD_INITIALIZER(pool_head);

/* Private pool for page header structures */
struct pool phpool;

struct pool_item_header {
	/* Page headers */
	LIST_ENTRY(pool_item_header)
				ph_pagelist;	/* pool page list */
	TAILQ_HEAD(,pool_item)	ph_itemlist;	/* chunk list for this page */
	SPLAY_ENTRY(pool_item_header)
				ph_node;	/* Off-page page headers */
	int			ph_nmissing;	/* # of chunks in use */
	caddr_t			ph_page;	/* this page's address */
};

struct pool_item {
#ifdef DIAGNOSTIC
	int pi_magic;
#endif
#ifdef DEADBEEF1
#define	PI_MAGIC DEADBEEF1
#else
#define	PI_MAGIC 0xdeafbeef
#endif
	/* Other entries use only this list entry */
	TAILQ_ENTRY(pool_item)	pi_list;
};

#define	POOL_NEEDS_CATCHUP(pp)						\
	((pp)->pr_nitems < (pp)->pr_minitems)

/*
 * Every pool gets a unique serial number assigned to it. If this counter
 * wraps, we're screwed, but we shouldn't create so many pools anyway.
 */
unsigned int pool_serial;

int	 pool_catchup(struct pool *);
void	 pool_prime_page(struct pool *, caddr_t, struct pool_item_header *);
void	 pool_update_curpage(struct pool *);
void	*pool_do_get(struct pool *, int);
void	 pool_do_put(struct pool *, void *);
void	 pr_rmpage(struct pool *, struct pool_item_header *,
	    struct pool_pagelist *);
int	 pool_chk_page(struct pool *, const char *, struct pool_item_header *);
struct pool_item_header *pool_alloc_item_header(struct pool *, caddr_t, int);

void	*pool_allocator_alloc(struct pool *, int);
void	 pool_allocator_free(struct pool *, void *);

#ifdef DDB
void	 pool_print_pagelist(struct pool_pagelist *,
	    int (*)(const char *, ...));
void	 pool_print1(struct pool *, const char *, int (*)(const char *, ...));
#endif

#define	pool_sleep(pl)	msleep(pl, &pl->pr_mtx, PSWP, pl->pr_wchan, 0)

static __inline int
phtree_compare(struct pool_item_header *a, struct pool_item_header *b)
{
	if (a->ph_page < b->ph_page)
		return (-1);
	else if (a->ph_page > b->ph_page)
		return (1);
	else
		return (0);
}

SPLAY_PROTOTYPE(phtree, pool_item_header, ph_node, phtree_compare);
SPLAY_GENERATE(phtree, pool_item_header, ph_node, phtree_compare);

/*
 * Return the pool page header based on page address.
 */
static __inline struct pool_item_header *
pr_find_pagehead(struct pool *pp, caddr_t page)
{
	struct pool_item_header *ph, tmp;

	if ((pp->pr_roflags & PR_PHINPAGE) != 0)
		return ((struct pool_item_header *)(page + pp->pr_phoffset));

	tmp.ph_page = page;
	ph = SPLAY_FIND(phtree, &pp->pr_phtree, &tmp);
	return (ph);
}

/*
 * Remove a page from the pool.
 */
void
pr_rmpage(struct pool *pp, struct pool_item_header *ph,
    struct pool_pagelist *pq)
{
	/*
	 * If the page was idle, decrement the idle page count.
	 */
	if (ph->ph_nmissing == 0) {
#ifdef DIAGNOSTIC
		if (pp->pr_nidle == 0)
			panic("pr_rmpage: nidle inconsistent");
		if (pp->pr_nitems < pp->pr_itemsperpage)
			panic("pr_rmpage: nitems inconsistent");
#endif
		pp->pr_nidle--;
	}

	pp->pr_nitems -= pp->pr_itemsperpage;

	/*
	 * Unlink a page from the pool and release it (or queue it for
	 * release).
	 */
	LIST_REMOVE(ph, ph_pagelist);
	if ((pp->pr_roflags & PR_PHINPAGE) == 0)
		SPLAY_REMOVE(phtree, &pp->pr_phtree, ph);
	if (pq) {
		LIST_INSERT_HEAD(pq, ph, ph_pagelist);
	} else {
		pool_allocator_free(pp, ph->ph_page);
		if ((pp->pr_roflags & PR_PHINPAGE) == 0)
			pool_put(&phpool, ph);
	}
	pp->pr_npages--;
	pp->pr_npagefree++;

	pool_update_curpage(pp);
}

/*
 * Initialize the given pool resource structure.
 *
 * We export this routine to allow other kernel parts to declare
 * static pools that must be initialized before malloc() is available.
 */
void
pool_init(struct pool *pp, size_t size, u_int align, u_int ioff, int flags,
    const char *wchan, struct pool_allocator *palloc)
{
	int off, slack;

#ifdef MALLOC_DEBUG
	if ((flags & PR_DEBUG) && (ioff != 0 || align != 0))
		flags &= ~PR_DEBUG;
#endif
	/*
	 * Check arguments and construct default values.
	 */
	if (palloc == NULL)
		palloc = &pool_allocator_nointr;
	if (palloc->pa_pagesz == 0) {
		palloc->pa_pagesz = PAGE_SIZE;
		palloc->pa_pagemask = ~(palloc->pa_pagesz - 1);
		palloc->pa_pageshift = ffs(palloc->pa_pagesz) - 1;
	}

	if (align == 0)
		align = ALIGN(1);

	if (size < sizeof(struct pool_item))
		size = sizeof(struct pool_item);

	size = roundup(size, align);
#ifdef DIAGNOSTIC
	if (size > palloc->pa_pagesz)
		panic("pool_init: pool item size (%lu) too large",
		    (u_long)size);
#endif

	/*
	 * Initialize the pool structure.
	 */
	LIST_INIT(&pp->pr_emptypages);
	LIST_INIT(&pp->pr_fullpages);
	LIST_INIT(&pp->pr_partpages);
	pp->pr_curpage = NULL;
	pp->pr_npages = 0;
	pp->pr_minitems = 0;
	pp->pr_minpages = 0;
	pp->pr_maxpages = 8;
	pp->pr_roflags = flags;
	pp->pr_flags = 0;
	pp->pr_size = size;
	pp->pr_align = align;
	pp->pr_wchan = wchan;
	pp->pr_alloc = palloc;
	pp->pr_nitems = 0;
	pp->pr_nout = 0;
	pp->pr_hardlimit = UINT_MAX;
	pp->pr_hardlimit_warning = NULL;
	pp->pr_hardlimit_ratecap.tv_sec = 0;
	pp->pr_hardlimit_ratecap.tv_usec = 0;
	pp->pr_hardlimit_warning_last.tv_sec = 0;
	pp->pr_hardlimit_warning_last.tv_usec = 0;
	pp->pr_serial = ++pool_serial;
	if (pool_serial == 0)
		panic("pool_init: too much uptime");

	/*
	 * Decide whether to put the page header off-page to avoid wasting
	 * too large a part of the page. Off-page page headers go into a
	 * splay tree keyed by page address, so we can match a returned
	 * item with its header. We use 1/16 of the page size as the
	 * threshold (XXX: tune).
	 */
	if (pp->pr_size < palloc->pa_pagesz/16) {
		/* Use the end of the page for the page header */
		pp->pr_roflags |= PR_PHINPAGE;
		pp->pr_phoffset = off = palloc->pa_pagesz -
		    ALIGN(sizeof(struct pool_item_header));
	} else {
		/* The page header will be taken from our page header pool */
		pp->pr_phoffset = 0;
		off = palloc->pa_pagesz;
		SPLAY_INIT(&pp->pr_phtree);
	}

	/*
	 * Alignment is to take place at `ioff' within the item. This means
	 * we must reserve up to `align - 1' bytes on the page to allow
	 * appropriate positioning of each item.
	 *
	 * Silently enforce `0 <= ioff < align'.
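	 *
	 * (In the common case ioff == 0, the pr_itemsperpage computation
	 * just below reduces to off / pr_size, i.e. the number of whole
	 * items that fit in the first `off' bytes of the page.)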
	 */
	pp->pr_itemoffset = ioff = ioff % align;
	pp->pr_itemsperpage = (off - ((align - ioff) % align)) / pp->pr_size;
	KASSERT(pp->pr_itemsperpage != 0);

	/*
	 * Use the slack between the chunks and the page header
	 * for "cache coloring".
	 */
	slack = off - pp->pr_itemsperpage * pp->pr_size;
	pp->pr_maxcolor = (slack / align) * align;
	pp->pr_curcolor = 0;

	pp->pr_nget = 0;
	pp->pr_nfail = 0;
	pp->pr_nput = 0;
	pp->pr_npagealloc = 0;
	pp->pr_npagefree = 0;
	pp->pr_hiwat = 0;
	pp->pr_nidle = 0;

	pp->pr_ipl = -1;
	mtx_init(&pp->pr_mtx, IPL_NONE);

	if (phpool.pr_size == 0) {
		pool_init(&phpool, sizeof(struct pool_item_header), 0, 0,
		    0, "phpool", NULL);
		pool_setipl(&phpool, IPL_HIGH);
	}

	/* Insert this into the list of all pools. */
	TAILQ_INSERT_TAIL(&pool_head, pp, pr_poollist);
}

void
pool_setipl(struct pool *pp, int ipl)
{
	pp->pr_ipl = ipl;
	mtx_init(&pp->pr_mtx, ipl);
}

/*
 * Decommission a pool resource.
 */
void
pool_destroy(struct pool *pp)
{
	struct pool_item_header *ph;

#ifdef DIAGNOSTIC
	if (pp->pr_nout != 0)
		panic("pool_destroy: pool busy: still out: %u", pp->pr_nout);
#endif

	/* Remove all pages */
	while ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL)
		pr_rmpage(pp, ph, NULL);
	KASSERT(LIST_EMPTY(&pp->pr_fullpages));
	KASSERT(LIST_EMPTY(&pp->pr_partpages));

	/* Remove from global pool list */
	TAILQ_REMOVE(&pool_head, pp, pr_poollist);
}

struct pool_item_header *
pool_alloc_item_header(struct pool *pp, caddr_t storage, int flags)
{
	struct pool_item_header *ph;

	if ((pp->pr_roflags & PR_PHINPAGE) != 0)
		ph = (struct pool_item_header *)(storage + pp->pr_phoffset);
	else {
		ph = pool_get(&phpool, flags);
	}

	return (ph);
}

/*
 * Grab an item from the pool; must be called at appropriate spl level
 */
void *
pool_get(struct pool *pp, int flags)
{
	void *v;

	mtx_enter(&pp->pr_mtx);
	v = pool_do_get(pp, flags);
	mtx_leave(&pp->pr_mtx);
	if (v && pp->pr_ctor && pp->pr_ctor(pp->pr_arg, v, flags)) {
		mtx_enter(&pp->pr_mtx);
		pool_do_put(pp, v);
		mtx_leave(&pp->pr_mtx);
		v = NULL;
	}
	if (v)
		pp->pr_nget++;
	return (v);
}

void *
pool_do_get(struct pool *pp, int flags)
{
	struct pool_item *pi;
	struct pool_item_header *ph;
	void *v;

#ifdef DIAGNOSTIC
	if ((flags & PR_WAITOK) != 0)
		splassert(IPL_NONE);
	if (pp->pr_ipl != -1)
		splassert(pp->pr_ipl);
#endif /* DIAGNOSTIC */

#ifdef MALLOC_DEBUG
	if (pp->pr_roflags & PR_DEBUG) {
		void *addr;

		addr = NULL;
		debug_malloc(pp->pr_size, M_DEBUG,
		    (flags & PR_WAITOK) ? M_WAITOK : M_NOWAIT, &addr);
		return (addr);
	}
#endif

startover:
	/*
	 * Check to see if we've reached the hard limit. If we have,
	 * and we can wait, then wait until an item has been returned to
	 * the pool.
	 */
#ifdef DIAGNOSTIC
	if (__predict_false(pp->pr_nout > pp->pr_hardlimit))
		panic("pool_do_get: %s: crossed hard limit", pp->pr_wchan);
#endif
	if (__predict_false(pp->pr_nout == pp->pr_hardlimit)) {
		if ((flags & PR_WAITOK) && !(flags & PR_LIMITFAIL)) {
			/*
			 * XXX: A warning isn't logged in this case. Should
			 * it be?
			 */
			pp->pr_flags |= PR_WANTED;
			pool_sleep(pp);
			goto startover;
		}

		/*
		 * Log a message that the hard limit has been hit.
		 */
		if (pp->pr_hardlimit_warning != NULL &&
		    ratecheck(&pp->pr_hardlimit_warning_last,
		    &pp->pr_hardlimit_ratecap))
			log(LOG_ERR, "%s\n", pp->pr_hardlimit_warning);

		pp->pr_nfail++;
		return (NULL);
	}

	/*
	 * The convention we use is that if `curpage' is not NULL, then
	 * it points at a non-empty bucket. In particular, `curpage'
	 * never points at a page header which has PR_PHINPAGE set and
	 * has no items in its bucket.
	 */
	if ((ph = pp->pr_curpage) == NULL) {
#ifdef DIAGNOSTIC
		if (pp->pr_nitems != 0) {
			printf("pool_do_get: %s: curpage NULL, nitems %u\n",
			    pp->pr_wchan, pp->pr_nitems);
			panic("pool_do_get: nitems inconsistent");
		}
#endif

		/*
		 * Call the back-end page allocator for more memory.
		 */
		v = pool_allocator_alloc(pp, flags);
		if (__predict_true(v != NULL))
			ph = pool_alloc_item_header(pp, v, flags);

		if (__predict_false(v == NULL || ph == NULL)) {
			if (v != NULL)
				pool_allocator_free(pp, v);

			if ((flags & PR_WAITOK) == 0) {
				pp->pr_nfail++;
				return (NULL);
			}

			/*
			 * Wait for items to be returned to this pool.
			 *
			 * XXX: maybe we should wake up once a second and
			 * try again?
			 */
			pp->pr_flags |= PR_WANTED;
			pool_sleep(pp);
			goto startover;
		}

		/* We have more memory; add it to the pool */
		pool_prime_page(pp, v, ph);
		pp->pr_npagealloc++;

		/* Start the allocation process over. */
		goto startover;
	}
	if (__predict_false((v = pi = TAILQ_FIRST(&ph->ph_itemlist)) == NULL)) {
		panic("pool_do_get: %s: page empty", pp->pr_wchan);
	}
#ifdef DIAGNOSTIC
	if (__predict_false(pp->pr_nitems == 0)) {
		printf("pool_do_get: %s: items on itemlist, nitems %u\n",
		    pp->pr_wchan, pp->pr_nitems);
		panic("pool_do_get: nitems inconsistent");
	}
#endif

#ifdef DIAGNOSTIC
	if (__predict_false(pi->pi_magic != PI_MAGIC)) {
		panic("pool_do_get(%s): free list modified: magic=%x; page %p;"
		    " item addr %p",
		    pp->pr_wchan, pi->pi_magic, ph->ph_page, pi);
	}
#endif

	/*
	 * Remove from item list.
	 */
	TAILQ_REMOVE(&ph->ph_itemlist, pi, pi_list);
	pp->pr_nitems--;
	pp->pr_nout++;
	if (ph->ph_nmissing == 0) {
#ifdef DIAGNOSTIC
		if (__predict_false(pp->pr_nidle == 0))
			panic("pool_do_get: nidle inconsistent");
#endif
		pp->pr_nidle--;

		/*
		 * This page was previously empty. Move it to the list of
		 * partially-full pages. This page is already curpage.
		 */
		LIST_REMOVE(ph, ph_pagelist);
		LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist);
	}
	ph->ph_nmissing++;
	if (TAILQ_EMPTY(&ph->ph_itemlist)) {
#ifdef DIAGNOSTIC
		if (__predict_false(ph->ph_nmissing != pp->pr_itemsperpage)) {
			panic("pool_do_get: %s: nmissing inconsistent",
			    pp->pr_wchan);
		}
#endif
		/*
		 * This page is now full. Move it to the full list
		 * and select a new current page.
		 */
		LIST_REMOVE(ph, ph_pagelist);
		LIST_INSERT_HEAD(&pp->pr_fullpages, ph, ph_pagelist);
		pool_update_curpage(pp);
	}

	/*
	 * If we have a low water mark and we are now below that low
	 * water mark, add more items to the pool.
	 */
	if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
		/*
		 * XXX: Should we log a warning?
		 * Should we set up a timeout to try again in a second or
		 * so?  The latter could break a caller's assumptions about
		 * interrupt protection, etc.
		 */
	}
	return (v);
}

/*
 * Return resource to the pool; must be called at appropriate spl level
 */
void
pool_put(struct pool *pp, void *v)
{
	if (pp->pr_dtor)
		pp->pr_dtor(pp->pr_arg, v);
	mtx_enter(&pp->pr_mtx);
	pool_do_put(pp, v);
	mtx_leave(&pp->pr_mtx);
	pp->pr_nput++;
}

/*
 * Internal version of pool_put().
 */
void
pool_do_put(struct pool *pp, void *v)
{
	struct pool_item *pi = v;
	struct pool_item_header *ph;
	caddr_t page;

#ifdef MALLOC_DEBUG
	if (pp->pr_roflags & PR_DEBUG) {
		debug_free(v, M_DEBUG);
		return;
	}
#endif

	page = (caddr_t)((vaddr_t)v & pp->pr_alloc->pa_pagemask);

#ifdef DIAGNOSTIC
	if (pp->pr_ipl != -1)
		splassert(pp->pr_ipl);

	if (__predict_false(pp->pr_nout == 0)) {
		printf("pool %s: putting with none out\n",
		    pp->pr_wchan);
		panic("pool_do_put");
	}
#endif

	if (__predict_false((ph = pr_find_pagehead(pp, page)) == NULL)) {
		panic("pool_do_put: %s: page header missing", pp->pr_wchan);
	}

	/*
	 * Return to item list.
	 */
#ifdef DIAGNOSTIC
	pi->pi_magic = PI_MAGIC;
#endif
#ifdef DEBUG
	{
		int i, *ip = v;

		for (i = 0; i < pp->pr_size / sizeof(int); i++) {
			*ip++ = PI_MAGIC;
		}
	}
#endif

	TAILQ_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list);
	ph->ph_nmissing--;
	pp->pr_nitems++;
	pp->pr_nout--;

	/* Cancel "pool empty" condition if it exists */
	if (pp->pr_curpage == NULL)
		pp->pr_curpage = ph;

	if (pp->pr_flags & PR_WANTED) {
		pp->pr_flags &= ~PR_WANTED;
		if (ph->ph_nmissing == 0)
			pp->pr_nidle++;
		wakeup(pp);
		return;
	}

	/*
	 * If this page is now empty, do one of two things:
	 *
	 * (1) If we have more pages than the page high water mark,
	 *     free the page back to the system.
	 *
	 * (2) Otherwise, move the page to the empty page list.
	 *
	 * Either way, select a new current page (so we use a partially-full
	 * page if one is available).
	 */
	if (ph->ph_nmissing == 0) {
		pp->pr_nidle++;
		if (pp->pr_nidle > pp->pr_maxpages) {
			pr_rmpage(pp, ph, NULL);
		} else {
			LIST_REMOVE(ph, ph_pagelist);
			LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist);
		}
		pool_update_curpage(pp);
	}

	/*
	 * If the page was previously completely full, move it to the
	 * partially-full list and make it the current page. The next
	 * allocation will get the item from this page, instead of
	 * further fragmenting the pool.
	 */
	else if (ph->ph_nmissing == (pp->pr_itemsperpage - 1)) {
		LIST_REMOVE(ph, ph_pagelist);
		LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist);
		pp->pr_curpage = ph;
	}
}

/*
 * Add N items to the pool.
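 *
 * An illustrative call (a sketch only; `somepool' stands for a hypothetical,
 * already pool_init()'d pool and is not defined in this file):
 *
 *	(void)pool_prime(&somepool, 32);
 *
 * When the backing allocator can supply the pages, this pre-allocates
 * enough of them to hold at least 32 items, so that subsequent
 * pool_get(..., PR_NOWAIT) calls are less likely to fail.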
 */
int
pool_prime(struct pool *pp, int n)
{
	struct pool_item_header *ph;
	caddr_t cp;
	int newpages;

	mtx_enter(&pp->pr_mtx);
	newpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;

	while (newpages-- > 0) {
		cp = pool_allocator_alloc(pp, PR_NOWAIT);
		if (__predict_true(cp != NULL))
			ph = pool_alloc_item_header(pp, cp, PR_NOWAIT);
		if (__predict_false(cp == NULL || ph == NULL)) {
			if (cp != NULL)
				pool_allocator_free(pp, cp);
			break;
		}

		pool_prime_page(pp, cp, ph);
		pp->pr_npagealloc++;
		pp->pr_minpages++;
	}

	if (pp->pr_minpages >= pp->pr_maxpages)
		pp->pr_maxpages = pp->pr_minpages + 1;	/* XXX */

	mtx_leave(&pp->pr_mtx);
	return (0);
}

/*
 * Add a page worth of items to the pool.
 *
 * Note, we must be called with the pool descriptor LOCKED.
 */
void
pool_prime_page(struct pool *pp, caddr_t storage, struct pool_item_header *ph)
{
	struct pool_item *pi;
	caddr_t cp = storage;
	unsigned int align = pp->pr_align;
	unsigned int ioff = pp->pr_itemoffset;
	int n;

#ifdef DIAGNOSTIC
	if (((u_long)cp & (pp->pr_alloc->pa_pagesz - 1)) != 0)
		panic("pool_prime_page: %s: unaligned page", pp->pr_wchan);
#endif

	/*
	 * Insert page header.
	 */
	LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist);
	TAILQ_INIT(&ph->ph_itemlist);
	ph->ph_page = storage;
	ph->ph_nmissing = 0;
	if ((pp->pr_roflags & PR_PHINPAGE) == 0)
		SPLAY_INSERT(phtree, &pp->pr_phtree, ph);

	pp->pr_nidle++;

	/*
	 * Color this page.
	 */
	cp = (caddr_t)(cp + pp->pr_curcolor);
	if ((pp->pr_curcolor += align) > pp->pr_maxcolor)
		pp->pr_curcolor = 0;

	/*
	 * Adjust storage to apply alignment to `pr_itemoffset' in each item.
	 */
	if (ioff != 0)
		cp = (caddr_t)(cp + (align - ioff));

	/*
	 * Insert remaining chunks on the bucket list.
	 */
	n = pp->pr_itemsperpage;
	pp->pr_nitems += n;

	while (n--) {
		pi = (struct pool_item *)cp;

		KASSERT(((((vaddr_t)pi) + ioff) & (align - 1)) == 0);

		/* Insert on page list */
		TAILQ_INSERT_TAIL(&ph->ph_itemlist, pi, pi_list);
#ifdef DIAGNOSTIC
		pi->pi_magic = PI_MAGIC;
#endif
		cp = (caddr_t)(cp + pp->pr_size);
	}

	/*
	 * If the pool was depleted, point at the new page.
	 */
	if (pp->pr_curpage == NULL)
		pp->pr_curpage = ph;

	if (++pp->pr_npages > pp->pr_hiwat)
		pp->pr_hiwat = pp->pr_npages;
}

/*
 * Used by pool_get() when nitems drops below the low water mark. This
 * is used to catch up pr_nitems with the low water mark.
 *
 * Note we never wait for memory here, we let the caller decide what to do.
 */
int
pool_catchup(struct pool *pp)
{
	struct pool_item_header *ph;
	caddr_t cp;
	int error = 0;

	while (POOL_NEEDS_CATCHUP(pp)) {
		/*
		 * Call the page back-end allocator for more memory.
		 */
		cp = pool_allocator_alloc(pp, PR_NOWAIT);
		if (__predict_true(cp != NULL))
			ph = pool_alloc_item_header(pp, cp, PR_NOWAIT);
		if (__predict_false(cp == NULL || ph == NULL)) {
			if (cp != NULL)
				pool_allocator_free(pp, cp);
			error = ENOMEM;
			break;
		}
		pool_prime_page(pp, cp, ph);
		pp->pr_npagealloc++;
	}

	return (error);
}

void
pool_update_curpage(struct pool *pp)
{
	pp->pr_curpage = LIST_FIRST(&pp->pr_partpages);
	if (pp->pr_curpage == NULL) {
		pp->pr_curpage = LIST_FIRST(&pp->pr_emptypages);
	}
}

void
pool_setlowat(struct pool *pp, int n)
{
	pp->pr_minitems = n;
	pp->pr_minpages = (n == 0)
	    ? 0
	    : roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;

	mtx_enter(&pp->pr_mtx);
	/* Make sure we're caught up with the newly-set low water mark. */
	if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
		/*
		 * XXX: Should we log a warning? Should we set up a timeout
		 * to try again in a second or so? The latter could break
		 * a caller's assumptions about interrupt protection, etc.
		 */
	}
	mtx_leave(&pp->pr_mtx);
}

void
pool_sethiwat(struct pool *pp, int n)
{
	pp->pr_maxpages = (n == 0)
	    ? 0
	    : roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
}

int
pool_sethardlimit(struct pool *pp, unsigned n, const char *warnmess,
    int ratecap)
{
	int error = 0;

	if (n < pp->pr_nout) {
		error = EINVAL;
		goto done;
	}

	pp->pr_hardlimit = n;
	pp->pr_hardlimit_warning = warnmess;
	pp->pr_hardlimit_ratecap.tv_sec = ratecap;
	pp->pr_hardlimit_warning_last.tv_sec = 0;
	pp->pr_hardlimit_warning_last.tv_usec = 0;

	/*
	 * In-line version of pool_sethiwat().
	 */
	pp->pr_maxpages = (n == 0 || n == UINT_MAX)
	    ? n
	    : roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;

done:
	return (error);
}

void
pool_set_ctordtor(struct pool *pp, int (*ctor)(void *, void *, int),
    void (*dtor)(void *, void *), void *arg)
{
	pp->pr_ctor = ctor;
	pp->pr_dtor = dtor;
	pp->pr_arg = arg;
}

/*
 * Release all complete pages that have not been used recently.
 *
 * Returns non-zero if any pages have been reclaimed.
 */
int
pool_reclaim(struct pool *pp)
{
	struct pool_item_header *ph, *phnext;
	struct pool_pagelist pq;

	LIST_INIT(&pq);

	mtx_enter(&pp->pr_mtx);
	for (ph = LIST_FIRST(&pp->pr_emptypages); ph != NULL; ph = phnext) {
		phnext = LIST_NEXT(ph, ph_pagelist);

		/* Check our minimum page claim */
		if (pp->pr_npages <= pp->pr_minpages)
			break;

		KASSERT(ph->ph_nmissing == 0);

		/*
		 * If freeing this page would put us below
		 * the low water mark, stop now.
		 */
		if ((pp->pr_nitems - pp->pr_itemsperpage) <
		    pp->pr_minitems)
			break;

		pr_rmpage(pp, ph, &pq);
	}
	mtx_leave(&pp->pr_mtx);

	if (LIST_EMPTY(&pq))
		return (0);
	while ((ph = LIST_FIRST(&pq)) != NULL) {
		LIST_REMOVE(ph, ph_pagelist);
		pool_allocator_free(pp, ph->ph_page);
		if (pp->pr_roflags & PR_PHINPAGE)
			continue;
		pool_put(&phpool, ph);
	}

	return (1);
}

#ifdef DDB
#include <machine/db_machdep.h>
#include <ddb/db_interface.h>
#include <ddb/db_output.h>

/*
 * Diagnostic helpers.
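 *
 * These are only compiled under DDB: db_show_all_pools() backs the
 * `show all pools' command (the /a modifier prints pool addresses and
 * allocators instead of the usage counters), and pool_printit() dumps
 * a single pool, including its page lists when the `p' modifier is given.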
 */
void
pool_printit(struct pool *pp, const char *modif, int (*pr)(const char *, ...))
{
	pool_print1(pp, modif, pr);
}

void
pool_print_pagelist(struct pool_pagelist *pl, int (*pr)(const char *, ...))
{
	struct pool_item_header *ph;
#ifdef DIAGNOSTIC
	struct pool_item *pi;
#endif

	LIST_FOREACH(ph, pl, ph_pagelist) {
		(*pr)("\t\tpage %p, nmissing %d\n",
		    ph->ph_page, ph->ph_nmissing);
#ifdef DIAGNOSTIC
		TAILQ_FOREACH(pi, &ph->ph_itemlist, pi_list) {
			if (pi->pi_magic != PI_MAGIC) {
				(*pr)("\t\t\titem %p, magic 0x%x\n",
				    pi, pi->pi_magic);
			}
		}
#endif
	}
}

void
pool_print1(struct pool *pp, const char *modif, int (*pr)(const char *, ...))
{
	struct pool_item_header *ph;
	int print_pagelist = 0;
	char c;

	while ((c = *modif++) != '\0') {
		if (c == 'p')
			print_pagelist = 1;
		modif++;
	}

	(*pr)("POOL %s: size %u, align %u, ioff %u, roflags 0x%08x\n",
	    pp->pr_wchan, pp->pr_size, pp->pr_align, pp->pr_itemoffset,
	    pp->pr_roflags);
	(*pr)("\talloc %p\n", pp->pr_alloc);
	(*pr)("\tminitems %u, minpages %u, maxpages %u, npages %u\n",
	    pp->pr_minitems, pp->pr_minpages, pp->pr_maxpages, pp->pr_npages);
	(*pr)("\titemsperpage %u, nitems %u, nout %u, hardlimit %u\n",
	    pp->pr_itemsperpage, pp->pr_nitems, pp->pr_nout, pp->pr_hardlimit);

	(*pr)("\n\tnget %lu, nfail %lu, nput %lu\n",
	    pp->pr_nget, pp->pr_nfail, pp->pr_nput);
	(*pr)("\tnpagealloc %lu, npagefree %lu, hiwat %u, nidle %lu\n",
	    pp->pr_npagealloc, pp->pr_npagefree, pp->pr_hiwat, pp->pr_nidle);

	if (print_pagelist == 0)
		return;

	if ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL)
		(*pr)("\n\tempty page list:\n");
	pool_print_pagelist(&pp->pr_emptypages, pr);
	if ((ph = LIST_FIRST(&pp->pr_fullpages)) != NULL)
		(*pr)("\n\tfull page list:\n");
	pool_print_pagelist(&pp->pr_fullpages, pr);
	if ((ph = LIST_FIRST(&pp->pr_partpages)) != NULL)
		(*pr)("\n\tpartial-page list:\n");
	pool_print_pagelist(&pp->pr_partpages, pr);

	if (pp->pr_curpage == NULL)
		(*pr)("\tno current page\n");
	else
		(*pr)("\tcurpage %p\n", pp->pr_curpage->ph_page);
}

void
db_show_all_pools(db_expr_t expr, int haddr, db_expr_t count, char *modif)
{
	struct pool *pp;
	char maxp[16];
	int ovflw;
	char mode;

	mode = modif[0];
	if (mode != '\0' && mode != 'a') {
		db_printf("usage: show all pools [/a]\n");
		return;
	}

	if (mode == '\0')
		db_printf("%-10s%4s%9s%5s%9s%6s%6s%6s%6s%6s%6s%5s\n",
		    "Name",
		    "Size",
		    "Requests",
		    "Fail",
		    "Releases",
		    "Pgreq",
		    "Pgrel",
		    "Npage",
		    "Hiwat",
		    "Minpg",
		    "Maxpg",
		    "Idle");
	else
		db_printf("%-10s %18s %18s\n",
		    "Name", "Address", "Allocator");

	TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
		if (mode == 'a') {
			db_printf("%-10s %18p %18p\n", pp->pr_wchan, pp,
			    pp->pr_alloc);
			continue;
		}

		if (!pp->pr_nget)
			continue;

		if (pp->pr_maxpages == UINT_MAX)
			snprintf(maxp, sizeof maxp, "inf");
		else
			snprintf(maxp, sizeof maxp, "%u", pp->pr_maxpages);

#define PRWORD(ovflw, fmt, width, fixed, val) do {		\
	(ovflw) += db_printf((fmt),				\
	    (width) - (fixed) - (ovflw) > 0 ?			\
	    (width) - (fixed) - (ovflw) : 0,			\
	    (val)) - (width);					\
	if ((ovflw) < 0)					\
		(ovflw) = 0;					\
} while (/* CONSTCOND */0)

		ovflw = 0;
		PRWORD(ovflw, "%-*s", 10, 0, pp->pr_wchan);
		PRWORD(ovflw, " %*u", 4, 1, pp->pr_size);
		PRWORD(ovflw, " %*lu", 9, 1, pp->pr_nget);
		PRWORD(ovflw, " %*lu", 5, 1, pp->pr_nfail);
		PRWORD(ovflw, " %*lu", 9, 1, pp->pr_nput);
		PRWORD(ovflw, " %*lu", 6, 1, pp->pr_npagealloc);
		PRWORD(ovflw, " %*lu", 6, 1, pp->pr_npagefree);
		PRWORD(ovflw, " %*d", 6, 1, pp->pr_npages);
		PRWORD(ovflw, " %*d", 6, 1, pp->pr_hiwat);
		PRWORD(ovflw, " %*d", 6, 1, pp->pr_minpages);
		PRWORD(ovflw, " %*s", 6, 1, maxp);
		PRWORD(ovflw, " %*lu\n", 5, 1, pp->pr_nidle);
	}
}

int
pool_chk_page(struct pool *pp, const char *label, struct pool_item_header *ph)
{
	struct pool_item *pi;
	caddr_t page;
	int n;

	page = (caddr_t)((u_long)ph & pp->pr_alloc->pa_pagemask);
	if (page != ph->ph_page &&
	    (pp->pr_roflags & PR_PHINPAGE) != 0) {
		if (label != NULL)
			printf("%s: ", label);
		printf("pool(%p:%s): page inconsistency: page %p;"
		    " at page head addr %p (p %p)\n", pp,
		    pp->pr_wchan, ph->ph_page,
		    ph, page);
		return 1;
	}

	for (pi = TAILQ_FIRST(&ph->ph_itemlist), n = 0;
	     pi != NULL;
	     pi = TAILQ_NEXT(pi, pi_list), n++) {

#ifdef DIAGNOSTIC
		if (pi->pi_magic != PI_MAGIC) {
			if (label != NULL)
				printf("%s: ", label);
			printf("pool(%s): free list modified: magic=%x;"
			    " page %p; item ordinal %d;"
			    " addr %p (p %p)\n",
			    pp->pr_wchan, pi->pi_magic, ph->ph_page,
			    n, pi, page);
			panic("pool");
		}
#endif
		page =
		    (caddr_t)((u_long)pi & pp->pr_alloc->pa_pagemask);
		if (page == ph->ph_page)
			continue;

		if (label != NULL)
			printf("%s: ", label);
		printf("pool(%p:%s): page inconsistency: page %p;"
		    " item ordinal %d; addr %p (p %p)\n", pp,
		    pp->pr_wchan, ph->ph_page,
		    n, pi, page);
		return 1;
	}
	return 0;
}

int
pool_chk(struct pool *pp, const char *label)
{
	struct pool_item_header *ph;
	int r = 0;

	LIST_FOREACH(ph, &pp->pr_emptypages, ph_pagelist) {
		r = pool_chk_page(pp, label, ph);
		if (r) {
			goto out;
		}
	}
	LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) {
		r = pool_chk_page(pp, label, ph);
		if (r) {
			goto out;
		}
	}
	LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist) {
		r = pool_chk_page(pp, label, ph);
		if (r) {
			goto out;
		}
	}

out:
	return (r);
}
#endif

/*
 * We have three different sysctls:
 * kern.pool.npools - the number of pools.
 * kern.pool.pool.<pool#> - the pool struct for the pool#.
 * kern.pool.name.<pool#> - the name for pool#.
 */
int
sysctl_dopool(int *name, u_int namelen, char *where, size_t *sizep)
{
	struct pool *pp, *foundpool = NULL;
	size_t buflen = where != NULL ? *sizep : 0;
	int npools = 0, s;
	unsigned int lookfor;
	size_t len;

	switch (*name) {
	case KERN_POOL_NPOOLS:
		if (namelen != 1 || buflen != sizeof(int))
			return (EINVAL);
		lookfor = 0;
		break;
	case KERN_POOL_NAME:
		if (namelen != 2 || buflen < 1)
			return (EINVAL);
		lookfor = name[1];
		break;
	case KERN_POOL_POOL:
		if (namelen != 2 || buflen != sizeof(struct pool))
			return (EINVAL);
		lookfor = name[1];
		break;
	default:
		return (EINVAL);
	}

	s = splvm();

	TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
		npools++;
		if (lookfor == pp->pr_serial) {
			foundpool = pp;
			break;
		}
	}

	splx(s);

	if (*name != KERN_POOL_NPOOLS && foundpool == NULL)
		return (ENOENT);

	switch (*name) {
	case KERN_POOL_NPOOLS:
		return copyout(&npools, where, buflen);
	case KERN_POOL_NAME:
		len = strlen(foundpool->pr_wchan) + 1;
		if (*sizep < len)
			return (ENOMEM);
		*sizep = len;
		return copyout(foundpool->pr_wchan, where, len);
	case KERN_POOL_POOL:
		return copyout(foundpool, where, buflen);
	}
	/* NOTREACHED */
	return (0); /* XXX - Stupid gcc */
}

/*
 * Pool backend allocators.
 *
 * Each pool has a backend allocator that handles allocation and
 * deallocation of pages for the pool.
 */
void	*pool_page_alloc_oldnointr(struct pool *, int);
void	 pool_page_free_oldnointr(struct pool *, void *);
void	*pool_page_alloc(struct pool *, int);
void	 pool_page_free(struct pool *, void *);

/* Previous nointr allocator; handles large allocations safely. */
struct pool_allocator pool_allocator_oldnointr = {
	pool_page_alloc_oldnointr, pool_page_free_oldnointr, 0,
};

/*
 * Safe for interrupts; the name is preserved for compatibility.
 * This is the default allocator.
 */
struct pool_allocator pool_allocator_nointr = {
	pool_page_alloc, pool_page_free, 0,
};

/*
 * XXX - we have at least three different resources for the same allocation
 * and each resource can be depleted. First we have the ready elements in
 * the pool. Then we have the resource (typically a vm_map) for this
 * allocator, then we have physical memory. Waiting for any of these can
 * be unnecessary when any other is freed, but the kernel doesn't support
 * sleeping on multiple addresses, so we have to fake. The caller sleeps on
 * the pool (so that we can be awakened when an item is returned to the pool),
 * but we set PA_WANT on the allocator. When a page is returned to
 * the allocator and PA_WANT is set, pool_allocator_free will wake up all
 * sleeping pools belonging to this allocator (XXX - thundering herd).
 * We also wake up the allocator in case someone without a pool (malloc)
 * is sleeping waiting for this allocator.
 */

void *
pool_allocator_alloc(struct pool *pp, int flags)
{
	boolean_t waitok = (flags & PR_WAITOK) ? TRUE : FALSE;
	void *v;

	if (waitok)
		mtx_leave(&pp->pr_mtx);
	v = pp->pr_alloc->pa_alloc(pp, flags);
	if (waitok)
		mtx_enter(&pp->pr_mtx);

	return (v);
}

void
pool_allocator_free(struct pool *pp, void *v)
{
	struct pool_allocator *pa = pp->pr_alloc;

	(*pa->pa_free)(pp, v);
}

void *
pool_page_alloc(struct pool *pp, int flags)
{
	boolean_t waitok = (flags & PR_WAITOK) ? TRUE : FALSE;

	return (uvm_km_getpage(waitok));
}

void
pool_page_free(struct pool *pp, void *v)
{
	uvm_km_putpage(v);
}

void *
pool_page_alloc_oldnointr(struct pool *pp, int flags)
{
	boolean_t waitok = (flags & PR_WAITOK) ? TRUE : FALSE;

	splassert(IPL_NONE);

	return ((void *)uvm_km_alloc_poolpage1(kernel_map, uvm.kernel_object,
	    waitok));
}

void
pool_page_free_oldnointr(struct pool *pp, void *v)
{
	splassert(IPL_NONE);

	uvm_km_free_poolpage1(kernel_map, (vaddr_t)v);
}