1 /*	$OpenBSD: subr_pool.c,v 1.121 2013/05/31 20:44:10 tedu Exp $	*/
2 /*	$NetBSD: subr_pool.c,v 1.61 2001/09/26 07:14:56 chs Exp $	*/
3 
4 /*-
5  * Copyright (c) 1997, 1999, 2000 The NetBSD Foundation, Inc.
6  * All rights reserved.
7  *
8  * This code is derived from software contributed to The NetBSD Foundation
9  * by Paul Kranenburg; by Jason R. Thorpe of the Numerical Aerospace
10  * Simulation Facility, NASA Ames Research Center.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
25  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/proc.h>
37 #include <sys/errno.h>
38 #include <sys/kernel.h>
39 #include <sys/malloc.h>
40 #include <sys/pool.h>
41 #include <sys/syslog.h>
42 #include <sys/sysctl.h>
43 
44 #include <uvm/uvm.h>
45 #include <dev/rndvar.h>
46 
47 /*
48  * Pool resource management utility.
49  *
50  * Memory is allocated in pages which are split into pieces according to
51  * the pool item size. Each page is kept on one of three lists in the
52  * pool structure: `pr_emptypages', `pr_fullpages' and `pr_partpages',
53  * for empty, full and partially-full pages respectively. The individual
54  * pool items are on a linked list headed by `ph_itemlist' in each page
55  * header. The memory for building the page list is either taken from
56  * the allocated pages themselves (for small pool items) or taken from
57  * an internal pool of page headers (`phpool').
58  */
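
/*
 * A minimal usage sketch (illustrative only; struct foo, foo_pool and
 * "foopl" are hypothetical names, not part of this file):
 *
 *	struct pool foo_pool;
 *	struct foo *f;
 *
 *	pool_init(&foo_pool, sizeof(struct foo), 0, 0, 0, "foopl", NULL);
 *	pool_setipl(&foo_pool, IPL_NET);
 *
 *	f = pool_get(&foo_pool, PR_WAITOK | PR_ZERO);
 *	...
 *	pool_put(&foo_pool, f);
 */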
59 
60 /* List of all pools */
61 SIMPLEQ_HEAD(,pool) pool_head = SIMPLEQ_HEAD_INITIALIZER(pool_head);
62 
63 /* Private pool for page header structures */
64 struct pool phpool;
65 
66 struct pool_item_header {
67 	/* Page headers */
68 	LIST_ENTRY(pool_item_header)
69 				ph_pagelist;	/* pool page list */
70 	XSIMPLEQ_HEAD(,pool_item) ph_itemlist;	/* chunk list for this page */
71 	RB_ENTRY(pool_item_header)
72 				ph_node;	/* Off-page page headers */
73 	int			ph_nmissing;	/* # of chunks in use */
74 	caddr_t			ph_page;	/* this page's address */
75 	caddr_t			ph_colored;	/* page's colored address */
76 	int			ph_pagesize;
77 	int			ph_magic;
78 };
79 
80 struct pool_item {
81 	u_int32_t pi_magic;
82 	/* Other entries use only this list entry */
83 	XSIMPLEQ_ENTRY(pool_item)	pi_list;
84 };
85 
86 #ifdef POOL_DEBUG
87 int	pool_debug = 1;
88 #else
89 int	pool_debug = 0;
90 #endif
91 
92 #define	POOL_NEEDS_CATCHUP(pp)						\
93 	((pp)->pr_nitems < (pp)->pr_minitems)
94 
95 /*
96  * Every pool gets a unique serial number assigned to it. If this counter
97  * wraps, we're screwed, but we shouldn't create so many pools anyway.
98  */
99 unsigned int pool_serial;
100 
101 int	 pool_catchup(struct pool *);
102 void	 pool_prime_page(struct pool *, caddr_t, struct pool_item_header *);
103 void	 pool_update_curpage(struct pool *);
104 void	*pool_do_get(struct pool *, int);
105 void	 pool_do_put(struct pool *, void *);
106 void	 pr_rmpage(struct pool *, struct pool_item_header *,
107 	    struct pool_pagelist *);
108 int	 pool_chk_page(struct pool *, struct pool_item_header *, int);
109 int	 pool_chk(struct pool *);
110 struct pool_item_header *pool_alloc_item_header(struct pool *, caddr_t , int);
111 
112 void	*pool_allocator_alloc(struct pool *, int, int *);
113 void	 pool_allocator_free(struct pool *, void *);
114 
115 /*
116  * XXX - quick hack. For pools with large items we want to use a special
117  *       allocator. Having the allocator derive the allocation size from
118  *       the pool is trivial (round_page(pr_itemsperpage * pr_size)), but
119  *       doing so would require changes everywhere, so for now we simply
120  *       create one allocator per size.
121  *       We limit those to 128 pages.
122  */
123 #define POOL_LARGE_MAXPAGES 128
124 struct pool_allocator pool_allocator_large[POOL_LARGE_MAXPAGES];
125 struct pool_allocator pool_allocator_large_ni[POOL_LARGE_MAXPAGES];
126 void	*pool_large_alloc(struct pool *, int, int *);
127 void	pool_large_free(struct pool *, void *);
128 void	*pool_large_alloc_ni(struct pool *, int, int *);
129 void	pool_large_free_ni(struct pool *, void *);
130 
131 
132 #ifdef DDB
133 void	 pool_print_pagelist(struct pool_pagelist *, int (*)(const char *, ...)
134 	    /* __attribute__((__format__(__kprintf__,1,2))) */);
135 void	 pool_print1(struct pool *, const char *, int (*)(const char *, ...)
136 	    /* __attribute__((__format__(__kprintf__,1,2))) */);
137 #endif
138 
139 #define pool_sleep(pl) msleep(pl, &pl->pr_mtx, PSWP, pl->pr_wchan, 0)
140 
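/*
 * Two page headers compare as equal when their address ranges overlap.
 * pr_find_pagehead() below searches with a zero-sized key, so a lookup
 * address matches the header whose page contains it.
 */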
141 static __inline int
142 phtree_compare(struct pool_item_header *a, struct pool_item_header *b)
143 {
144 	long diff = (vaddr_t)a->ph_page - (vaddr_t)b->ph_page;
145 	if (diff < 0)
146 		return -(-diff >= a->ph_pagesize);
147 	else if (diff > 0)
148 		return (diff >= b->ph_pagesize);
149 	else
150 		return (0);
151 }
152 
153 RB_PROTOTYPE(phtree, pool_item_header, ph_node, phtree_compare);
154 RB_GENERATE(phtree, pool_item_header, ph_node, phtree_compare);
155 
156 /*
157  * Return the pool page header based on page address.
158  */
159 static __inline struct pool_item_header *
160 pr_find_pagehead(struct pool *pp, void *v)
161 {
162 	struct pool_item_header *ph, tmp;
163 
164 	if ((pp->pr_roflags & PR_PHINPAGE) != 0) {
165 		caddr_t page;
166 
167 		page = (caddr_t)((vaddr_t)v & pp->pr_alloc->pa_pagemask);
168 
169 		return ((struct pool_item_header *)(page + pp->pr_phoffset));
170 	}
171 
172 	/*
173 	 * The trick we're using in the tree compare function is to consider
174 	 * two elements equal when they overlap. We want to return the
175 	 * page header that belongs to the element just before this address.
176 	 * We don't want this element to compare equal to the next element,
177 	 * so the compare function takes the pagesize from the lower element.
178 	 * If this header is the lower, its pagesize is zero, so it can't
179 	 * overlap with the next header. But if the header we're looking for
180 	 * is lower, we'll use its pagesize and it will overlap and return
181 	 * equal.
182 	 */
183 	tmp.ph_page = v;
184 	tmp.ph_pagesize = 0;
185 	ph = RB_FIND(phtree, &pp->pr_phtree, &tmp);
186 
187 	if (ph) {
188 		KASSERT(ph->ph_page <= (caddr_t)v);
189 		KASSERT(ph->ph_page + ph->ph_pagesize > (caddr_t)v);
190 	}
191 	return ph;
192 }
193 
194 /*
195  * Remove a page from the pool.
196  */
197 void
198 pr_rmpage(struct pool *pp, struct pool_item_header *ph,
199     struct pool_pagelist *pq)
200 {
201 
202 	/*
203 	 * If the page was idle, decrement the idle page count.
204 	 */
205 	if (ph->ph_nmissing == 0) {
206 #ifdef DIAGNOSTIC
207 		if (pp->pr_nidle == 0)
208 			panic("pr_rmpage: nidle inconsistent");
209 		if (pp->pr_nitems < pp->pr_itemsperpage)
210 			panic("pr_rmpage: nitems inconsistent");
211 #endif
212 		pp->pr_nidle--;
213 	}
214 
215 	pp->pr_nitems -= pp->pr_itemsperpage;
216 
217 	/*
218 	 * Unlink a page from the pool and release it (or queue it for release).
219 	 */
220 	LIST_REMOVE(ph, ph_pagelist);
221 	if ((pp->pr_roflags & PR_PHINPAGE) == 0)
222 		RB_REMOVE(phtree, &pp->pr_phtree, ph);
223 	pp->pr_npages--;
224 	pp->pr_npagefree++;
225 	pool_update_curpage(pp);
226 
227 	if (pq) {
228 		LIST_INSERT_HEAD(pq, ph, ph_pagelist);
229 	} else {
230 		pool_allocator_free(pp, ph->ph_page);
231 		if ((pp->pr_roflags & PR_PHINPAGE) == 0)
232 			pool_put(&phpool, ph);
233 	}
234 }
235 
236 /*
237  * Initialize the given pool resource structure.
238  *
239  * We export this routine to allow other kernel parts to declare
240  * static pools that must be initialized before malloc() is available.
241  */
242 void
243 pool_init(struct pool *pp, size_t size, u_int align, u_int ioff, int flags,
244     const char *wchan, struct pool_allocator *palloc)
245 {
246 	int off, slack;
247 #ifdef DIAGNOSTIC
248 	struct pool *iter;
249 
250 	SIMPLEQ_FOREACH(iter, &pool_head, pr_poollist) {
251 		if (iter == pp)
252 			panic("init pool already on list");
253 	}
254 #endif
255 
256 #ifdef MALLOC_DEBUG
257 	if ((flags & PR_DEBUG) && (ioff != 0 || align != 0))
258 		flags &= ~PR_DEBUG;
259 #endif
260 	/*
261 	 * Check arguments and construct default values.
262 	 */
263 	if (palloc == NULL) {
264 		if (size > PAGE_SIZE) {
265 			int psize;
266 
267 			/*
268 			 * XXX - should take align into account as well.
269 			 */
270 			if (size == round_page(size))
271 				psize = size / PAGE_SIZE;
272 			else
273 				psize = PAGE_SIZE / roundup(size % PAGE_SIZE,
274 				    1024);
275 			if (psize > POOL_LARGE_MAXPAGES)
276 				psize = POOL_LARGE_MAXPAGES;
277 			if (flags & PR_WAITOK)
278 				palloc = &pool_allocator_large_ni[psize-1];
279 			else
280 				palloc = &pool_allocator_large[psize-1];
281 			if (palloc->pa_pagesz == 0) {
282 				palloc->pa_pagesz = psize * PAGE_SIZE;
283 				if (flags & PR_WAITOK) {
284 					palloc->pa_alloc = pool_large_alloc_ni;
285 					palloc->pa_free = pool_large_free_ni;
286 				} else {
287 					palloc->pa_alloc = pool_large_alloc;
288 					palloc->pa_free = pool_large_free;
289 				}
290 			}
291 		} else {
292 			palloc = &pool_allocator_nointr;
293 		}
294 	}
295 	if (palloc->pa_pagesz == 0) {
296 		palloc->pa_pagesz = PAGE_SIZE;
297 	}
298 	if (palloc->pa_pagemask == 0) {
299 		palloc->pa_pagemask = ~(palloc->pa_pagesz - 1);
300 		palloc->pa_pageshift = ffs(palloc->pa_pagesz) - 1;
301 	}
302 
303 	if (align == 0)
304 		align = ALIGN(1);
305 
306 	if (size < sizeof(struct pool_item))
307 		size = sizeof(struct pool_item);
308 
309 	size = roundup(size, align);
310 #ifdef DIAGNOSTIC
311 	if (size > palloc->pa_pagesz)
312 		panic("pool_init: pool item size (%lu) too large",
313 		    (u_long)size);
314 #endif
315 
316 	/*
317 	 * Initialize the pool structure.
318 	 */
319 	LIST_INIT(&pp->pr_emptypages);
320 	LIST_INIT(&pp->pr_fullpages);
321 	LIST_INIT(&pp->pr_partpages);
322 	pp->pr_curpage = NULL;
323 	pp->pr_npages = 0;
324 	pp->pr_minitems = 0;
325 	pp->pr_minpages = 0;
326 	pp->pr_maxpages = 8;
327 	pp->pr_roflags = flags;
328 	pp->pr_flags = 0;
329 	pp->pr_size = size;
330 	pp->pr_align = align;
331 	pp->pr_wchan = wchan;
332 	pp->pr_alloc = palloc;
333 	pp->pr_nitems = 0;
334 	pp->pr_nout = 0;
335 	pp->pr_hardlimit = UINT_MAX;
336 	pp->pr_hardlimit_warning = NULL;
337 	pp->pr_hardlimit_ratecap.tv_sec = 0;
338 	pp->pr_hardlimit_ratecap.tv_usec = 0;
339 	pp->pr_hardlimit_warning_last.tv_sec = 0;
340 	pp->pr_hardlimit_warning_last.tv_usec = 0;
341 	pp->pr_serial = ++pool_serial;
342 	if (pool_serial == 0)
343 		panic("pool_init: too much uptime");
344 
345 	/* constructor, destructor, and arg */
346 	pp->pr_ctor = NULL;
347 	pp->pr_dtor = NULL;
348 	pp->pr_arg = NULL;
349 
350 	/*
351 	 * Decide whether to put the page header off page to avoid
352 	 * wasting too large a part of the page. Off-page page headers
353 	 * go into an RB tree, so we can match a returned item with
354 	 * its header based on the page address.
355 	 * We use 1/16 of the page size as the threshold (XXX: tune)
356 	 */
357 	if (pp->pr_size < palloc->pa_pagesz/16 && pp->pr_size < PAGE_SIZE) {
358 		/* Use the end of the page for the page header */
359 		pp->pr_roflags |= PR_PHINPAGE;
360 		pp->pr_phoffset = off = palloc->pa_pagesz -
361 		    ALIGN(sizeof(struct pool_item_header));
362 	} else {
363 		/* The page header will be taken from our page header pool */
364 		pp->pr_phoffset = 0;
365 		off = palloc->pa_pagesz;
366 		RB_INIT(&pp->pr_phtree);
367 	}
368 
369 	/*
370 	 * Alignment is to take place at `ioff' within the item. This means
371 	 * we must reserve up to `align - 1' bytes on the page to allow
372 	 * appropriate positioning of each item.
373 	 *
374 	 * Silently enforce `0 <= ioff < align'.
375 	 */
376 	pp->pr_itemoffset = ioff = ioff % align;
377 	pp->pr_itemsperpage = (off - ((align - ioff) % align)) / pp->pr_size;
378 	KASSERT(pp->pr_itemsperpage != 0);
379 
380 	/*
381 	 * Use the slack between the chunks and the page header
382 	 * for "cache coloring".
383 	 */
384 	slack = off - pp->pr_itemsperpage * pp->pr_size;
385 	pp->pr_maxcolor = (slack / align) * align;
386 	pp->pr_curcolor = 0;
387 
388 	pp->pr_nget = 0;
389 	pp->pr_nfail = 0;
390 	pp->pr_nput = 0;
391 	pp->pr_npagealloc = 0;
392 	pp->pr_npagefree = 0;
393 	pp->pr_hiwat = 0;
394 	pp->pr_nidle = 0;
395 
396 	pp->pr_ipl = -1;
397 	mtx_init(&pp->pr_mtx, IPL_NONE);
398 
399 	if (phpool.pr_size == 0) {
400 		pool_init(&phpool, sizeof(struct pool_item_header), 0, 0,
401 		    0, "phpool", NULL);
402 		pool_setipl(&phpool, IPL_HIGH);
403 	}
404 
405 	/* pglistalloc/constraint parameters */
406 	pp->pr_crange = &kp_dirty;
407 
408 	/* Insert this into the list of all pools. */
409 	SIMPLEQ_INSERT_HEAD(&pool_head, pp, pr_poollist);
410 }
411 
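/*
 * Set the interrupt protection level the pool will be used at and
 * reinitialize the pool mutex accordingly.
 */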
412 void
413 pool_setipl(struct pool *pp, int ipl)
414 {
415 	pp->pr_ipl = ipl;
416 	mtx_init(&pp->pr_mtx, ipl);
417 }
418 
419 /*
420  * Decommission a pool resource.
421  */
422 void
423 pool_destroy(struct pool *pp)
424 {
425 	struct pool_item_header *ph;
426 	struct pool *prev, *iter;
427 
428 	/* Remove from global pool list */
429 	if (pp == SIMPLEQ_FIRST(&pool_head))
430 		SIMPLEQ_REMOVE_HEAD(&pool_head, pr_poollist);
431 	else {
432 		prev = SIMPLEQ_FIRST(&pool_head);
433 		SIMPLEQ_FOREACH(iter, &pool_head, pr_poollist) {
434 			if (iter == pp) {
435 				SIMPLEQ_REMOVE_AFTER(&pool_head, prev,
436 				    pr_poollist);
437 				goto removed;
438 			}
439 			prev = iter;
440 		}
441 #ifdef DIAGNOSTIC
442 		panic("destroyed pool not on list");
443 #endif
444 	}
445 removed:
446 #ifdef DIAGNOSTIC
447 	if (pp->pr_nout != 0)
448 		panic("pool_destroy: pool busy: still out: %u", pp->pr_nout);
449 #endif
450 
451 	/* Remove all pages */
452 	while ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL)
453 		pr_rmpage(pp, ph, NULL);
454 	KASSERT(LIST_EMPTY(&pp->pr_fullpages));
455 	KASSERT(LIST_EMPTY(&pp->pr_partpages));
456 
457 }
458 
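/*
 * Allocate a page header: taken from the page itself when PR_PHINPAGE
 * is set, otherwise from the private phpool.
 */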
459 struct pool_item_header *
460 pool_alloc_item_header(struct pool *pp, caddr_t storage, int flags)
461 {
462 	struct pool_item_header *ph;
463 
464 	if ((pp->pr_roflags & PR_PHINPAGE) != 0)
465 		ph = (struct pool_item_header *)(storage + pp->pr_phoffset);
466 	else
467 		ph = pool_get(&phpool, (flags & ~(PR_WAITOK | PR_ZERO)) |
468 		    PR_NOWAIT);
469 #ifdef DIAGNOSTIC
470 	if (pool_debug && ph != NULL)
471 		ph->ph_magic = poison_value(ph);
472 #endif
473 	return (ph);
474 }
475 
476 /*
477  * Grab an item from the pool; must be called at appropriate spl level
478  */
479 void *
480 pool_get(struct pool *pp, int flags)
481 {
482 	void *v;
483 
484 	KASSERT(flags & (PR_WAITOK | PR_NOWAIT));
485 
486 #ifdef DIAGNOSTIC
487 	if ((flags & PR_WAITOK) != 0) {
488 		assertwaitok();
489 		if (pool_debug == 2)
490 			yield();
491 	}
492 #endif /* DIAGNOSTIC */
493 
494 	mtx_enter(&pp->pr_mtx);
495 #ifdef POOL_DEBUG
496 	if (pp->pr_roflags & PR_DEBUGCHK) {
497 		if (pool_chk(pp))
498 			panic("before pool_get");
499 	}
500 #endif
501 	v = pool_do_get(pp, flags);
502 #ifdef POOL_DEBUG
503 	if (pp->pr_roflags & PR_DEBUGCHK) {
504 		if (pool_chk(pp))
505 			panic("after pool_get");
506 	}
507 #endif
508 	if (v != NULL)
509 		pp->pr_nget++;
510 	mtx_leave(&pp->pr_mtx);
511 	if (v == NULL)
512 		return (v);
513 
514 	if (pp->pr_ctor) {
515 		if (flags & PR_ZERO)
516 			panic("pool_get: PR_ZERO when ctor set");
517 		if (pp->pr_ctor(pp->pr_arg, v, flags)) {
518 			mtx_enter(&pp->pr_mtx);
519 			pp->pr_nget--;
520 			pool_do_put(pp, v);
521 			mtx_leave(&pp->pr_mtx);
522 			v = NULL;
523 		}
524 	} else {
525 		if (flags & PR_ZERO)
526 			memset(v, 0, pp->pr_size);
527 	}
528 	return (v);
529 }
530 
531 void *
532 pool_do_get(struct pool *pp, int flags)
533 {
534 	struct pool_item *pi;
535 	struct pool_item_header *ph;
536 	void *v;
537 	int slowdown = 0;
538 
539 #ifdef MALLOC_DEBUG
540 	if (pp->pr_roflags & PR_DEBUG) {
541 		void *addr;
542 
543 		addr = NULL;
544 		debug_malloc(pp->pr_size, M_DEBUG,
545 		    (flags & PR_WAITOK) ? M_WAITOK : M_NOWAIT, &addr);
546 		return (addr);
547 	}
548 #endif
549 
550 startover:
551 	/*
552 	 * Check to see if we've reached the hard limit.  If we have,
553 	 * and we can wait, then wait until an item has been returned to
554 	 * the pool.
555 	 */
556 #ifdef DIAGNOSTIC
557 	if (pp->pr_nout > pp->pr_hardlimit)
558 		panic("pool_do_get: %s: crossed hard limit", pp->pr_wchan);
559 #endif
560 	if (pp->pr_nout == pp->pr_hardlimit) {
561 		if ((flags & PR_WAITOK) && !(flags & PR_LIMITFAIL)) {
562 			/*
563 			 * XXX: A warning isn't logged in this case.  Should
564 			 * it be?
565 			 */
566 			pp->pr_flags |= PR_WANTED;
567 			pool_sleep(pp);
568 			goto startover;
569 		}
570 
571 		/*
572 		 * Log a message that the hard limit has been hit.
573 		 */
574 		if (pp->pr_hardlimit_warning != NULL &&
575 		    ratecheck(&pp->pr_hardlimit_warning_last,
576 		    &pp->pr_hardlimit_ratecap))
577 			log(LOG_ERR, "%s\n", pp->pr_hardlimit_warning);
578 
579 		pp->pr_nfail++;
580 		return (NULL);
581 	}
582 
583 	/*
584 	 * The convention we use is that if `curpage' is not NULL, then
585 	 * it points at a non-empty bucket. In particular, `curpage'
586 	 * never points at a page header which has PR_PHINPAGE set and
587 	 * has no items in its bucket.
588 	 */
589 	if ((ph = pp->pr_curpage) == NULL) {
590 #ifdef DIAGNOSTIC
591 		if (pp->pr_nitems != 0) {
592 			printf("pool_do_get: %s: curpage NULL, nitems %u\n",
593 			    pp->pr_wchan, pp->pr_nitems);
594 			panic("pool_do_get: nitems inconsistent");
595 		}
596 #endif
597 
598 		/*
599 		 * Call the back-end page allocator for more memory.
600 		 */
601 		v = pool_allocator_alloc(pp, flags, &slowdown);
602 		if (v != NULL)
603 			ph = pool_alloc_item_header(pp, v, flags);
604 
605 		if (v == NULL || ph == NULL) {
606 			if (v != NULL)
607 				pool_allocator_free(pp, v);
608 
609 			if ((flags & PR_WAITOK) == 0) {
610 				pp->pr_nfail++;
611 				return (NULL);
612 			}
613 
614 			/*
615 			 * Wait for items to be returned to this pool.
616 			 *
617 			 * XXX: maybe we should wake up once a second and
618 			 * try again?
619 			 */
620 			pp->pr_flags |= PR_WANTED;
621 			pool_sleep(pp);
622 			goto startover;
623 		}
624 
625 		/* We have more memory; add it to the pool */
626 		pool_prime_page(pp, v, ph);
627 		pp->pr_npagealloc++;
628 
629 		if (slowdown && (flags & PR_WAITOK)) {
630 			mtx_leave(&pp->pr_mtx);
631 			yield();
632 			mtx_enter(&pp->pr_mtx);
633 		}
634 
635 		/* Start the allocation process over. */
636 		goto startover;
637 	}
638 	if ((v = pi = XSIMPLEQ_FIRST(&ph->ph_itemlist)) == NULL) {
639 		panic("pool_do_get: %s: page empty", pp->pr_wchan);
640 	}
641 #ifdef DIAGNOSTIC
642 	if (pp->pr_nitems == 0) {
643 		printf("pool_do_get: %s: items on itemlist, nitems %u\n",
644 		    pp->pr_wchan, pp->pr_nitems);
645 		panic("pool_do_get: nitems inconsistent");
646 	}
647 #endif
648 
649 #ifdef DIAGNOSTIC
650 	if (pi->pi_magic != poison_value(pi))
651 		panic("pool_do_get(%s): free list modified: "
652 		    "page %p; item addr %p; offset 0x%x=0x%x",
653 		    pp->pr_wchan, ph->ph_page, pi, 0, pi->pi_magic);
654 	if (pool_debug && ph->ph_magic) {
655 		size_t pidx;
656 		int pval;
657 		if (poison_check(pi + 1, pp->pr_size - sizeof(*pi),
658 		    &pidx, &pval)) {
659 			int *ip = (int *)(pi + 1);
660 			panic("pool_do_get(%s): free list modified: "
661 			    "page %p; item addr %p; offset 0x%zx=0x%x",
662 			    pp->pr_wchan, ph->ph_page, pi,
663 			    pidx * sizeof(int), ip[pidx]);
664 		}
665 	}
666 #endif /* DIAGNOSTIC */
667 
668 	/*
669 	 * Remove from item list.
670 	 */
671 	XSIMPLEQ_REMOVE_HEAD(&ph->ph_itemlist, pi_list);
672 	pp->pr_nitems--;
673 	pp->pr_nout++;
674 	if (ph->ph_nmissing == 0) {
675 #ifdef DIAGNOSTIC
676 		if (pp->pr_nidle == 0)
677 			panic("pool_do_get: nidle inconsistent");
678 #endif
679 		pp->pr_nidle--;
680 
681 		/*
682 		 * This page was previously empty.  Move it to the list of
683 		 * partially-full pages.  This page is already curpage.
684 		 */
685 		LIST_REMOVE(ph, ph_pagelist);
686 		LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist);
687 	}
688 	ph->ph_nmissing++;
689 	if (XSIMPLEQ_EMPTY(&ph->ph_itemlist)) {
690 #ifdef DIAGNOSTIC
691 		if (ph->ph_nmissing != pp->pr_itemsperpage) {
692 			panic("pool_do_get: %s: nmissing inconsistent",
693 			    pp->pr_wchan);
694 		}
695 #endif
696 		/*
697 		 * This page is now full.  Move it to the full list
698 		 * and select a new current page.
699 		 */
700 		LIST_REMOVE(ph, ph_pagelist);
701 		LIST_INSERT_HEAD(&pp->pr_fullpages, ph, ph_pagelist);
702 		pool_update_curpage(pp);
703 	}
704 
705 	/*
706 	 * If we have a low water mark and we are now below that low
707 	 * water mark, add more items to the pool.
708 	 */
709 	if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
710 		/*
711 		 * XXX: Should we log a warning?  Should we set up a timeout
712 		 * to try again in a second or so?  The latter could break
713 		 * a caller's assumptions about interrupt protection, etc.
714 		 */
715 	}
716 	return (v);
717 }
718 
719 /*
720  * Return resource to the pool; must be called at appropriate spl level
721  */
722 void
723 pool_put(struct pool *pp, void *v)
724 {
725 	if (pp->pr_dtor)
726 		pp->pr_dtor(pp->pr_arg, v);
727 	mtx_enter(&pp->pr_mtx);
728 #ifdef POOL_DEBUG
729 	if (pp->pr_roflags & PR_DEBUGCHK) {
730 		if (pool_chk(pp))
731 			panic("before pool_put");
732 	}
733 #endif
734 	pool_do_put(pp, v);
735 #ifdef POOL_DEBUG
736 	if (pp->pr_roflags & PR_DEBUGCHK) {
737 		if (pool_chk(pp))
738 			panic("after pool_put");
739 	}
740 #endif
741 	pp->pr_nput++;
742 	mtx_leave(&pp->pr_mtx);
743 }
744 
745 /*
746  * Internal version of pool_put().
747  */
748 void
749 pool_do_put(struct pool *pp, void *v)
750 {
751 	struct pool_item *pi = v;
752 	struct pool_item_header *ph;
753 
754 	if (v == NULL)
755 		panic("pool_put of NULL");
756 
757 #ifdef MALLOC_DEBUG
758 	if (pp->pr_roflags & PR_DEBUG) {
759 		debug_free(v, M_DEBUG);
760 		return;
761 	}
762 #endif
763 
764 #ifdef DIAGNOSTIC
765 	if (pp->pr_ipl != -1)
766 		splassert(pp->pr_ipl);
767 
768 	if (pp->pr_nout == 0) {
769 		printf("pool %s: putting with none out\n",
770 		    pp->pr_wchan);
771 		panic("pool_do_put");
772 	}
773 #endif
774 
775 	if ((ph = pr_find_pagehead(pp, v)) == NULL) {
776 		panic("pool_do_put: %s: page header missing", pp->pr_wchan);
777 	}
778 
779 	/*
780 	 * Return to item list.
781 	 */
782 #ifdef DIAGNOSTIC
783 	pi->pi_magic = poison_value(pi);
784 	if (ph->ph_magic) {
785 		poison_mem(pi + 1, pp->pr_size - sizeof(*pi));
786 	}
787 #endif /* DIAGNOSTIC */
788 
789 	XSIMPLEQ_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list);
790 	ph->ph_nmissing--;
791 	pp->pr_nitems++;
792 	pp->pr_nout--;
793 
794 	/* Cancel "pool empty" condition if it exists */
795 	if (pp->pr_curpage == NULL)
796 		pp->pr_curpage = ph;
797 
798 	if (pp->pr_flags & PR_WANTED) {
799 		pp->pr_flags &= ~PR_WANTED;
800 		wakeup(pp);
801 	}
802 
803 	/*
804 	 * If this page is now empty, do one of two things:
805 	 *
806 	 *	(1) If we have more pages than the page high water mark,
807 	 *	    free the page back to the system.
808 	 *
809 	 *	(2) Otherwise, move the page to the empty page list.
810 	 *
811 	 * Either way, select a new current page (so we use a partially-full
812 	 * page if one is available).
813 	 */
814 	if (ph->ph_nmissing == 0) {
815 		pp->pr_nidle++;
816 		if (pp->pr_nidle > pp->pr_maxpages) {
817 			pr_rmpage(pp, ph, NULL);
818 		} else {
819 			LIST_REMOVE(ph, ph_pagelist);
820 			LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist);
821 			pool_update_curpage(pp);
822 		}
823 	}
824 
825 	/*
826 	 * If the page was previously completely full, move it to the
827 	 * partially-full list and make it the current page.  The next
828 	 * allocation will get the item from this page, instead of
829 	 * further fragmenting the pool.
830 	 */
831 	else if (ph->ph_nmissing == (pp->pr_itemsperpage - 1)) {
832 		LIST_REMOVE(ph, ph_pagelist);
833 		LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist);
834 		pp->pr_curpage = ph;
835 	}
836 }
837 
838 /*
839  * Add N items to the pool.
840  */
841 int
842 pool_prime(struct pool *pp, int n)
843 {
844 	struct pool_item_header *ph;
845 	caddr_t cp;
846 	int newpages;
847 	int slowdown;
848 
849 	mtx_enter(&pp->pr_mtx);
850 	newpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
851 
852 	while (newpages-- > 0) {
853 		cp = pool_allocator_alloc(pp, PR_NOWAIT, &slowdown);
854 		if (cp != NULL)
855 			ph = pool_alloc_item_header(pp, cp, PR_NOWAIT);
856 		if (cp == NULL || ph == NULL) {
857 			if (cp != NULL)
858 				pool_allocator_free(pp, cp);
859 			break;
860 		}
861 
862 		pool_prime_page(pp, cp, ph);
863 		pp->pr_npagealloc++;
864 		pp->pr_minpages++;
865 	}
866 
867 	if (pp->pr_minpages >= pp->pr_maxpages)
868 		pp->pr_maxpages = pp->pr_minpages + 1;	/* XXX */
869 
870 	mtx_leave(&pp->pr_mtx);
871 	return (0);
872 }
873 
874 /*
875  * Add a page worth of items to the pool.
876  *
877  * Note, we must be called with the pool descriptor LOCKED.
878  */
879 void
880 pool_prime_page(struct pool *pp, caddr_t storage, struct pool_item_header *ph)
881 {
882 	struct pool_item *pi;
883 	caddr_t cp = storage;
884 	unsigned int align = pp->pr_align;
885 	unsigned int ioff = pp->pr_itemoffset;
886 	int n;
887 
888 	/*
889 	 * Insert page header.
890 	 */
891 	LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist);
892 	XSIMPLEQ_INIT(&ph->ph_itemlist);
893 	ph->ph_page = storage;
894 	ph->ph_pagesize = pp->pr_alloc->pa_pagesz;
895 	ph->ph_nmissing = 0;
896 	if ((pp->pr_roflags & PR_PHINPAGE) == 0)
897 		RB_INSERT(phtree, &pp->pr_phtree, ph);
898 
899 	pp->pr_nidle++;
900 
901 	/*
902 	 * Color this page.
903 	 */
904 	cp = (caddr_t)(cp + pp->pr_curcolor);
905 	if ((pp->pr_curcolor += align) > pp->pr_maxcolor)
906 		pp->pr_curcolor = 0;
907 
908 	/*
909 	 * Adjust storage to apply alignment to `pr_itemoffset' in each item.
910 	 */
911 	if (ioff != 0)
912 		cp = (caddr_t)(cp + (align - ioff));
913 	ph->ph_colored = cp;
914 
915 	/*
916 	 * Insert remaining chunks on the bucket list.
917 	 */
918 	n = pp->pr_itemsperpage;
919 	pp->pr_nitems += n;
920 
921 	while (n--) {
922 		pi = (struct pool_item *)cp;
923 
924 		KASSERT(((((vaddr_t)pi) + ioff) & (align - 1)) == 0);
925 
926 		/* Insert on page list */
927 		XSIMPLEQ_INSERT_TAIL(&ph->ph_itemlist, pi, pi_list);
928 
929 #ifdef DIAGNOSTIC
930 		pi->pi_magic = poison_value(pi);
931 		if (ph->ph_magic) {
932 			poison_mem(pi + 1, pp->pr_size - sizeof(*pi));
933 		}
934 #endif /* DIAGNOSTIC */
935 		cp = (caddr_t)(cp + pp->pr_size);
936 	}
937 
938 	/*
939 	 * If the pool was depleted, point at the new page.
940 	 */
941 	if (pp->pr_curpage == NULL)
942 		pp->pr_curpage = ph;
943 
944 	if (++pp->pr_npages > pp->pr_hiwat)
945 		pp->pr_hiwat = pp->pr_npages;
946 }
947 
948 /*
949  * Used by pool_get() when nitems drops below the low water mark, to
950  * bring pr_nitems back up to it.
951  *
952  * Note we never wait for memory here; we let the caller decide what to do.
953  */
954 int
955 pool_catchup(struct pool *pp)
956 {
957 	struct pool_item_header *ph;
958 	caddr_t cp;
959 	int error = 0;
960 	int slowdown;
961 
962 	while (POOL_NEEDS_CATCHUP(pp)) {
963 		/*
964 		 * Call the page back-end allocator for more memory.
965 		 */
966 		cp = pool_allocator_alloc(pp, PR_NOWAIT, &slowdown);
967 		if (cp != NULL)
968 			ph = pool_alloc_item_header(pp, cp, PR_NOWAIT);
969 		if (cp == NULL || ph == NULL) {
970 			if (cp != NULL)
971 				pool_allocator_free(pp, cp);
972 			error = ENOMEM;
973 			break;
974 		}
975 		pool_prime_page(pp, cp, ph);
976 		pp->pr_npagealloc++;
977 	}
978 
979 	return (error);
980 }
981 
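/*
 * Pick a new current page: prefer a partially-full page and fall back
 * to an empty one, leaving curpage NULL when neither is available.
 */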
982 void
983 pool_update_curpage(struct pool *pp)
984 {
985 
986 	pp->pr_curpage = LIST_FIRST(&pp->pr_partpages);
987 	if (pp->pr_curpage == NULL) {
988 		pp->pr_curpage = LIST_FIRST(&pp->pr_emptypages);
989 	}
990 }
991 
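/*
 * Set the low water mark, in items, and immediately try to prime the
 * pool up to it without sleeping for memory.
 */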
992 void
993 pool_setlowat(struct pool *pp, int n)
994 {
995 
996 	pp->pr_minitems = n;
997 	pp->pr_minpages = (n == 0)
998 		? 0
999 		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
1000 
1001 	mtx_enter(&pp->pr_mtx);
1002 	/* Make sure we're caught up with the newly-set low water mark. */
1003 	if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
1004 		/*
1005 		 * XXX: Should we log a warning?  Should we set up a timeout
1006 		 * to try again in a second or so?  The latter could break
1007 		 * a caller's assumptions about interrupt protection, etc.
1008 		 */
1009 	}
1010 	mtx_leave(&pp->pr_mtx);
1011 }
1012 
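/*
 * Set the high water mark: the number of idle pages (derived from n
 * items) that pool_do_put() will keep before freeing pages back to
 * the allocator.
 */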
1013 void
1014 pool_sethiwat(struct pool *pp, int n)
1015 {
1016 
1017 	pp->pr_maxpages = (n == 0)
1018 		? 0
1019 		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
1020 }
1021 
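/*
 * Set a hard limit on the number of items that may be outstanding at
 * once, with an optional rate-limited warning message.  Fails with
 * EINVAL if more items than that are already out.
 */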
1022 int
1023 pool_sethardlimit(struct pool *pp, u_int n, const char *warnmsg, int ratecap)
1024 {
1025 	int error = 0;
1026 
1027 	if (n < pp->pr_nout) {
1028 		error = EINVAL;
1029 		goto done;
1030 	}
1031 
1032 	pp->pr_hardlimit = n;
1033 	pp->pr_hardlimit_warning = warnmsg;
1034 	pp->pr_hardlimit_ratecap.tv_sec = ratecap;
1035 	pp->pr_hardlimit_warning_last.tv_sec = 0;
1036 	pp->pr_hardlimit_warning_last.tv_usec = 0;
1037 
1038 done:
1039 	return (error);
1040 }
1041 
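/*
 * Constrain which physical memory the backing pages may be allocated from.
 */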
1042 void
1043 pool_set_constraints(struct pool *pp, const struct kmem_pa_mode *mode)
1044 {
1045 	pp->pr_crange = mode;
1046 }
1047 
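/*
 * Register per-item constructor and destructor hooks and the argument
 * passed to them.
 */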
1048 void
1049 pool_set_ctordtor(struct pool *pp, int (*ctor)(void *, void *, int),
1050     void (*dtor)(void *, void *), void *arg)
1051 {
1052 	pp->pr_ctor = ctor;
1053 	pp->pr_dtor = dtor;
1054 	pp->pr_arg = arg;
1055 }

1056 /*
1057  * Release all complete pages that have not been used recently.
1058  *
1059  * Returns non-zero if any pages have been reclaimed.
1060  */
1061 int
1062 pool_reclaim(struct pool *pp)
1063 {
1064 	struct pool_item_header *ph, *phnext;
1065 	struct pool_pagelist pq;
1066 
1067 	LIST_INIT(&pq);
1068 
1069 	mtx_enter(&pp->pr_mtx);
1070 	for (ph = LIST_FIRST(&pp->pr_emptypages); ph != NULL; ph = phnext) {
1071 		phnext = LIST_NEXT(ph, ph_pagelist);
1072 
1073 		/* Check our minimum page claim */
1074 		if (pp->pr_npages <= pp->pr_minpages)
1075 			break;
1076 
1077 		KASSERT(ph->ph_nmissing == 0);
1078 
1079 		/*
1080 		 * If freeing this page would put us below
1081 		 * the low water mark, stop now.
1082 		 */
1083 		if ((pp->pr_nitems - pp->pr_itemsperpage) <
1084 		    pp->pr_minitems)
1085 			break;
1086 
1087 		pr_rmpage(pp, ph, &pq);
1088 	}
1089 	mtx_leave(&pp->pr_mtx);
1090 
1091 	if (LIST_EMPTY(&pq))
1092 		return (0);
1093 	while ((ph = LIST_FIRST(&pq)) != NULL) {
1094 		LIST_REMOVE(ph, ph_pagelist);
1095 		pool_allocator_free(pp, ph->ph_page);
1096 		if (pp->pr_roflags & PR_PHINPAGE)
1097 			continue;
1098 		pool_put(&phpool, ph);
1099 	}
1100 
1101 	return (1);
1102 }
1103 
1104 /*
1105  * Release all complete pages that have not been used recently
1106  * from all pools.
1107  */
1108 void
1109 pool_reclaim_all(void)
1110 {
1111 	struct pool	*pp;
1112 	int		s;
1113 
1114 	s = splhigh();
1115 	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist)
1116 		pool_reclaim(pp);
1117 	splx(s);
1118 }
1119 
1120 #ifdef DDB
1121 #include <machine/db_machdep.h>
1122 #include <ddb/db_interface.h>
1123 #include <ddb/db_output.h>
1124 
1125 /*
1126  * Diagnostic helpers.
1127  */
1128 void
1129 pool_printit(struct pool *pp, const char *modif,
1130     int (*pr)(const char *, ...) /* __attribute__((__format__(__kprintf__,1,2))) */)
1131 {
1132 	pool_print1(pp, modif, pr);
1133 }
1134 
1135 void
1136 pool_print_pagelist(struct pool_pagelist *pl,
1137     int (*pr)(const char *, ...) /* __attribute__((__format__(__kprintf__,1,2))) */)
1138 {
1139 	struct pool_item_header *ph;
1140 #ifdef DIAGNOSTIC
1141 	struct pool_item *pi;
1142 #endif
1143 
1144 	LIST_FOREACH(ph, pl, ph_pagelist) {
1145 		(*pr)("\t\tpage %p, nmissing %d\n",
1146 		    ph->ph_page, ph->ph_nmissing);
1147 #ifdef DIAGNOSTIC
1148 		XSIMPLEQ_FOREACH(pi, &ph->ph_itemlist, pi_list) {
1149 			if (pi->pi_magic != poison_value(pi)) {
1150 				(*pr)("\t\t\titem %p, magic 0x%x\n",
1151 				    pi, pi->pi_magic);
1152 			}
1153 		}
1154 #endif
1155 	}
1156 }
1157 
1158 void
1159 pool_print1(struct pool *pp, const char *modif,
1160     int (*pr)(const char *, ...) /* __attribute__((__format__(__kprintf__,1,2))) */)
1161 {
1162 	struct pool_item_header *ph;
1163 	int print_pagelist = 0;
1164 	char c;
1165 
1166 	while ((c = *modif++) != '\0') {
1167 		if (c == 'p')
1168 			print_pagelist = 1;
1170 	}
1171 
1172 	(*pr)("POOL %s: size %u, align %u, ioff %u, roflags 0x%08x\n",
1173 	    pp->pr_wchan, pp->pr_size, pp->pr_align, pp->pr_itemoffset,
1174 	    pp->pr_roflags);
1175 	(*pr)("\talloc %p\n", pp->pr_alloc);
1176 	(*pr)("\tminitems %u, minpages %u, maxpages %u, npages %u\n",
1177 	    pp->pr_minitems, pp->pr_minpages, pp->pr_maxpages, pp->pr_npages);
1178 	(*pr)("\titemsperpage %u, nitems %u, nout %u, hardlimit %u\n",
1179 	    pp->pr_itemsperpage, pp->pr_nitems, pp->pr_nout, pp->pr_hardlimit);
1180 
1181 	(*pr)("\n\tnget %lu, nfail %lu, nput %lu\n",
1182 	    pp->pr_nget, pp->pr_nfail, pp->pr_nput);
1183 	(*pr)("\tnpagealloc %lu, npagefree %lu, hiwat %u, nidle %lu\n",
1184 	    pp->pr_npagealloc, pp->pr_npagefree, pp->pr_hiwat, pp->pr_nidle);
1185 
1186 	if (print_pagelist == 0)
1187 		return;
1188 
1189 	if ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL)
1190 		(*pr)("\n\tempty page list:\n");
1191 	pool_print_pagelist(&pp->pr_emptypages, pr);
1192 	if ((ph = LIST_FIRST(&pp->pr_fullpages)) != NULL)
1193 		(*pr)("\n\tfull page list:\n");
1194 	pool_print_pagelist(&pp->pr_fullpages, pr);
1195 	if ((ph = LIST_FIRST(&pp->pr_partpages)) != NULL)
1196 		(*pr)("\n\tpartial-page list:\n");
1197 	pool_print_pagelist(&pp->pr_partpages, pr);
1198 
1199 	if (pp->pr_curpage == NULL)
1200 		(*pr)("\tno current page\n");
1201 	else
1202 		(*pr)("\tcurpage %p\n", pp->pr_curpage->ph_page);
1203 }
1204 
1205 void
1206 db_show_all_pools(db_expr_t expr, int haddr, db_expr_t count, char *modif)
1207 {
1208 	struct pool *pp;
1209 	char maxp[16];
1210 	int ovflw;
1211 	char mode;
1212 
1213 	mode = modif[0];
1214 	if (mode != '\0' && mode != 'a') {
1215 		db_printf("usage: show all pools [/a]\n");
1216 		return;
1217 	}
1218 
1219 	if (mode == '\0')
1220 		db_printf("%-10s%4s%9s%5s%9s%6s%6s%6s%6s%6s%6s%5s\n",
1221 		    "Name",
1222 		    "Size",
1223 		    "Requests",
1224 		    "Fail",
1225 		    "Releases",
1226 		    "Pgreq",
1227 		    "Pgrel",
1228 		    "Npage",
1229 		    "Hiwat",
1230 		    "Minpg",
1231 		    "Maxpg",
1232 		    "Idle");
1233 	else
1234 		db_printf("%-12s %18s %18s\n",
1235 		    "Name", "Address", "Allocator");
1236 
1237 	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) {
1238 		if (mode == 'a') {
1239 			db_printf("%-12s %18p %18p\n", pp->pr_wchan, pp,
1240 			    pp->pr_alloc);
1241 			continue;
1242 		}
1243 
1244 		if (!pp->pr_nget)
1245 			continue;
1246 
1247 		if (pp->pr_maxpages == UINT_MAX)
1248 			snprintf(maxp, sizeof maxp, "inf");
1249 		else
1250 			snprintf(maxp, sizeof maxp, "%u", pp->pr_maxpages);
1251 
1252 #define PRWORD(ovflw, fmt, width, fixed, val) do {	\
1253 	(ovflw) += db_printf((fmt),			\
1254 	    (width) - (fixed) - (ovflw) > 0 ?		\
1255 	    (width) - (fixed) - (ovflw) : 0,		\
1256 	    (val)) - (width);				\
1257 	if ((ovflw) < 0)				\
1258 		(ovflw) = 0;				\
1259 } while (/* CONSTCOND */0)
1260 
1261 		ovflw = 0;
1262 		PRWORD(ovflw, "%-*s", 10, 0, pp->pr_wchan);
1263 		PRWORD(ovflw, " %*u", 4, 1, pp->pr_size);
1264 		PRWORD(ovflw, " %*lu", 9, 1, pp->pr_nget);
1265 		PRWORD(ovflw, " %*lu", 5, 1, pp->pr_nfail);
1266 		PRWORD(ovflw, " %*lu", 9, 1, pp->pr_nput);
1267 		PRWORD(ovflw, " %*lu", 6, 1, pp->pr_npagealloc);
1268 		PRWORD(ovflw, " %*lu", 6, 1, pp->pr_npagefree);
1269 		PRWORD(ovflw, " %*d", 6, 1, pp->pr_npages);
1270 		PRWORD(ovflw, " %*d", 6, 1, pp->pr_hiwat);
1271 		PRWORD(ovflw, " %*d", 6, 1, pp->pr_minpages);
1272 		PRWORD(ovflw, " %*s", 6, 1, maxp);
1273 		PRWORD(ovflw, " %*lu\n", 5, 1, pp->pr_nidle);
1274 
1275 		pool_chk(pp);
1276 	}
1277 }
1278 #endif /* DDB */
1279 
1280 #if defined(POOL_DEBUG) || defined(DDB)
1281 int
1282 pool_chk_page(struct pool *pp, struct pool_item_header *ph, int expected)
1283 {
1284 	struct pool_item *pi;
1285 	caddr_t page;
1286 	int n;
1287 	const char *label = pp->pr_wchan;
1288 
1289 	page = (caddr_t)((u_long)ph & pp->pr_alloc->pa_pagemask);
1290 	if (page != ph->ph_page &&
1291 	    (pp->pr_roflags & PR_PHINPAGE) != 0) {
1292 		printf("%s: ", label);
1293 		printf("pool(%p:%s): page inconsistency: page %p; "
1294 		    "at page head addr %p (p %p)\n",
1295 		    pp, pp->pr_wchan, ph->ph_page, ph, page);
1296 		return 1;
1297 	}
1298 
1299 	for (pi = XSIMPLEQ_FIRST(&ph->ph_itemlist), n = 0;
1300 	     pi != NULL;
1301 	     pi = XSIMPLEQ_NEXT(&ph->ph_itemlist, pi, pi_list), n++) {
1302 
1303 #ifdef DIAGNOSTIC
1304 		if (pi->pi_magic != poison_value(pi)) {
1305 			printf("%s: ", label);
1306 			printf("pool(%s): free list modified: "
1307 			    "page %p; item ordinal %d; addr %p "
1308 			    "(p %p); offset 0x%x=0x%x\n",
1309 			    pp->pr_wchan, ph->ph_page, n, pi, page,
1310 			    0, pi->pi_magic);
1311 		}
1312 		if (pool_debug && ph->ph_magic) {
1313 			size_t pidx;
1314 			int pval;
1315 			if (poison_check(pi + 1, pp->pr_size - sizeof(*pi),
1316 			    &pidx, &pval)) {
1317 				int *ip = (int *)(pi + 1);
1318 				printf("pool(%s): free list modified: "
1319 				    "page %p; item ordinal %d; addr %p "
1320 				    "(p %p); offset 0x%zx=0x%x\n",
1321 				    pp->pr_wchan, ph->ph_page, n, pi,
1322 				    page, pidx * sizeof(int), ip[pidx]);
1323 			}
1324 		}
1325 #endif /* DIAGNOSTIC */
1326 		page =
1327 		    (caddr_t)((u_long)pi & pp->pr_alloc->pa_pagemask);
1328 		if (page == ph->ph_page)
1329 			continue;
1330 
1331 		printf("%s: ", label);
1332 		printf("pool(%p:%s): page inconsistency: page %p;"
1333 		    " item ordinal %d; addr %p (p %p)\n", pp,
1334 		    pp->pr_wchan, ph->ph_page, n, pi, page);
1335 		return 1;
1336 	}
1337 	if (n + ph->ph_nmissing != pp->pr_itemsperpage) {
1338 		printf("pool(%p:%s): page inconsistency: page %p;"
1339 		    " %d on list, %d missing, %d items per page\n", pp,
1340 		    pp->pr_wchan, ph->ph_page, n, ph->ph_nmissing,
1341 		    pp->pr_itemsperpage);
1342 		return 1;
1343 	}
1344 	if (expected >= 0 && n != expected) {
1345 		printf("pool(%p:%s): page inconsistency: page %p;"
1346 		    " %d on list, %d missing, %d expected\n", pp,
1347 		    pp->pr_wchan, ph->ph_page, n, ph->ph_nmissing,
1348 		    expected);
1349 		return 1;
1350 	}
1351 	return 0;
1352 }
1353 
1354 int
1355 pool_chk(struct pool *pp)
1356 {
1357 	struct pool_item_header *ph;
1358 	int r = 0;
1359 
1360 	LIST_FOREACH(ph, &pp->pr_emptypages, ph_pagelist)
1361 		r += pool_chk_page(pp, ph, pp->pr_itemsperpage);
1362 	LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist)
1363 		r += pool_chk_page(pp, ph, 0);
1364 	LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist)
1365 		r += pool_chk_page(pp, ph, -1);
1366 
1367 	return (r);
1368 }
1369 #endif /* defined(POOL_DEBUG) || defined(DDB) */
1370 
1371 #ifdef DDB
1372 void
1373 pool_walk(struct pool *pp, int full,
1374     int (*pr)(const char *, ...) /* __attribute__((__format__(__kprintf__,1,2))) */,
1375     void (*func)(void *, int, int (*)(const char *, ...) /* __attribute__((__format__(__kprintf__,1,2))) */))
1376 {
1377 	struct pool_item_header *ph;
1378 	struct pool_item *pi;
1379 	caddr_t cp;
1380 	int n;
1381 
1382 	LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) {
1383 		cp = ph->ph_colored;
1384 		n = ph->ph_nmissing;
1385 
1386 		while (n--) {
1387 			func(cp, full, pr);
1388 			cp += pp->pr_size;
1389 		}
1390 	}
1391 
1392 	LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist) {
1393 		cp = ph->ph_colored;
1394 		n = ph->ph_nmissing;
1395 
1396 		do {
1397 			XSIMPLEQ_FOREACH(pi, &ph->ph_itemlist, pi_list) {
1398 				if (cp == (caddr_t)pi)
1399 					break;
1400 			}
1401 			if (cp != (caddr_t)pi) {
1402 				func(cp, full, pr);
1403 				n--;
1404 			}
1405 
1406 			cp += pp->pr_size;
1407 		} while (n > 0);
1408 	}
1409 }
1410 #endif
1411 
1412 /*
1413  * We have three different sysctls.
1414  * kern.pool.npools - the number of pools.
1415  * kern.pool.pool.<pool#> - the pool struct for the pool#.
1416  * kern.pool.name.<pool#> - the name for pool#.
1417  */
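
/*
 * Userland sketch (illustrative; assumes the usual CTL_KERN / KERN_POOL
 * top-level names from <sys/sysctl.h>) for fetching the number of pools:
 *
 *	int mib[] = { CTL_KERN, KERN_POOL, KERN_POOL_NPOOLS };
 *	int npools;
 *	size_t len = sizeof(npools);
 *
 *	sysctl(mib, 3, &npools, &len, NULL, 0);
 */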
1418 int
1419 sysctl_dopool(int *name, u_int namelen, char *where, size_t *sizep)
1420 {
1421 	struct pool *pp, *foundpool = NULL;
1422 	size_t buflen = where != NULL ? *sizep : 0;
1423 	int npools = 0, s;
1424 	unsigned int lookfor;
1425 	size_t len;
1426 
1427 	switch (*name) {
1428 	case KERN_POOL_NPOOLS:
1429 		if (namelen != 1 || buflen != sizeof(int))
1430 			return (EINVAL);
1431 		lookfor = 0;
1432 		break;
1433 	case KERN_POOL_NAME:
1434 		if (namelen != 2 || buflen < 1)
1435 			return (EINVAL);
1436 		lookfor = name[1];
1437 		break;
1438 	case KERN_POOL_POOL:
1439 		if (namelen != 2 || buflen != sizeof(struct pool))
1440 			return (EINVAL);
1441 		lookfor = name[1];
1442 		break;
1443 	default:
1444 		return (EINVAL);
1445 	}
1446 
1447 	s = splvm();
1448 
1449 	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) {
1450 		npools++;
1451 		if (lookfor == pp->pr_serial) {
1452 			foundpool = pp;
1453 			break;
1454 		}
1455 	}
1456 
1457 	splx(s);
1458 
1459 	if (*name != KERN_POOL_NPOOLS && foundpool == NULL)
1460 		return (ENOENT);
1461 
1462 	switch (*name) {
1463 	case KERN_POOL_NPOOLS:
1464 		return copyout(&npools, where, buflen);
1465 	case KERN_POOL_NAME:
1466 		len = strlen(foundpool->pr_wchan) + 1;
1467 		if (*sizep < len)
1468 			return (ENOMEM);
1469 		*sizep = len;
1470 		return copyout(foundpool->pr_wchan, where, len);
1471 	case KERN_POOL_POOL:
1472 		return copyout(foundpool, where, buflen);
1473 	}
1474 	/* NOTREACHED */
1475 	return (0); /* XXX - Stupid gcc */
1476 }
1477 
1478 /*
1479  * Pool backend allocators.
1480  *
1481  * Each pool has a backend allocator that allocates and frees its pages.
1482  */
1483 void	*pool_page_alloc(struct pool *, int, int *);
1484 void	pool_page_free(struct pool *, void *);
1485 
1486 /*
1487  * Safe for interrupts; the "nointr" name is preserved for compatibility.
1488  * This is the default allocator.
1489  */
1490 struct pool_allocator pool_allocator_nointr = {
1491 	pool_page_alloc, pool_page_free, 0,
1492 };
1493 
1494 /*
1495  * XXX - we have at least three different resources for the same allocation
1496  *  and each resource can be depleted. First we have the ready elements in
1497  *  the pool. Then we have the resource (typically a vm_map) for this
1498  *  allocator, then we have physical memory. Waiting for any of these can
1499  *  be unnecessary when any other is freed, but the kernel doesn't support
1500  *  sleeping on multiple addresses, so we have to fake. The caller sleeps on
1501  *  the pool (so that we can be awakened when an item is returned to the pool),
1502  *  but we set PA_WANT on the allocator. When a page is returned to
1503  *  the allocator and PA_WANT is set pool_allocator_free will wakeup all
1504  *  sleeping pools belonging to this allocator. (XXX - thundering herd).
1505  *  We also wake up the allocator in case someone without a pool (malloc)
1506  *  is sleeping waiting for this allocator.
1507  */
1508 
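/*
 * Call the backend allocator for a page.  The pool mutex is dropped
 * around the call when the caller is allowed to sleep (PR_WAITOK).
 */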
1509 void *
1510 pool_allocator_alloc(struct pool *pp, int flags, int *slowdown)
1511 {
1512 	boolean_t waitok = (flags & PR_WAITOK) ? TRUE : FALSE;
1513 	void *v;
1514 
1515 	if (waitok)
1516 		mtx_leave(&pp->pr_mtx);
1517 	v = pp->pr_alloc->pa_alloc(pp, flags, slowdown);
1518 	if (waitok)
1519 		mtx_enter(&pp->pr_mtx);
1520 
1521 	return (v);
1522 }
1523 
1524 void
1525 pool_allocator_free(struct pool *pp, void *v)
1526 {
1527 	struct pool_allocator *pa = pp->pr_alloc;
1528 
1529 	(*pa->pa_free)(pp, v);
1530 }
1531 
1532 void *
1533 pool_page_alloc(struct pool *pp, int flags, int *slowdown)
1534 {
1535 	struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER;
1536 
1537 	kd.kd_waitok = (flags & PR_WAITOK);
1538 	kd.kd_slowdown = slowdown;
1539 
1540 	return (km_alloc(PAGE_SIZE, &kv_page, pp->pr_crange, &kd));
1541 }
1542 
1543 void
1544 pool_page_free(struct pool *pp, void *v)
1545 {
1546 	km_free(v, PAGE_SIZE, &kv_page, pp->pr_crange);
1547 }
1548 
1549 void *
1550 pool_large_alloc(struct pool *pp, int flags, int *slowdown)
1551 {
1552 	struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER;
1553 	void *v;
1554 	int s;
1555 
1556 	kd.kd_waitok = (flags & PR_WAITOK);
1557 	kd.kd_slowdown = slowdown;
1558 
1559 	s = splvm();
1560 	v = km_alloc(pp->pr_alloc->pa_pagesz, &kv_intrsafe, pp->pr_crange,
1561 	    &kd);
1562 	splx(s);
1563 
1564 	return (v);
1565 }
1566 
1567 void
1568 pool_large_free(struct pool *pp, void *v)
1569 {
1570 	int s;
1571 
1572 	s = splvm();
1573 	km_free(v, pp->pr_alloc->pa_pagesz, &kv_intrsafe, pp->pr_crange);
1574 	splx(s);
1575 }
1576 
1577 void *
1578 pool_large_alloc_ni(struct pool *pp, int flags, int *slowdown)
1579 {
1580 	struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER;
1581 
1582 	kd.kd_waitok = (flags & PR_WAITOK);
1583 	kd.kd_slowdown = slowdown;
1584 
1585 	return (km_alloc(pp->pr_alloc->pa_pagesz, &kv_any, pp->pr_crange, &kd));
1586 }
1587 
1588 void
1589 pool_large_free_ni(struct pool *pp, void *v)
1590 {
1591 	km_free(v, pp->pr_alloc->pa_pagesz, &kv_any, pp->pr_crange);
1592 }
1593