1 /*	$OpenBSD: subr_pool.c,v 1.198 2016/09/15 02:00:16 dlg Exp $	*/
2 /*	$NetBSD: subr_pool.c,v 1.61 2001/09/26 07:14:56 chs Exp $	*/
3 
4 /*-
5  * Copyright (c) 1997, 1999, 2000 The NetBSD Foundation, Inc.
6  * All rights reserved.
7  *
8  * This code is derived from software contributed to The NetBSD Foundation
9  * by Paul Kranenburg; by Jason R. Thorpe of the Numerical Aerospace
10  * Simulation Facility, NASA Ames Research Center.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
25  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/errno.h>
37 #include <sys/kernel.h>
38 #include <sys/malloc.h>
39 #include <sys/pool.h>
40 #include <sys/syslog.h>
41 #include <sys/rwlock.h>
42 #include <sys/sysctl.h>
43 #include <sys/task.h>
44 #include <sys/timeout.h>
45 
46 #include <uvm/uvm_extern.h>
47 
48 /*
49  * Pool resource management utility.
50  *
51  * Memory is allocated in pages which are split into pieces according to
52  * the pool item size. Each page is kept on one of three lists in the
53  * pool structure: `pr_emptypages', `pr_fullpages' and `pr_partpages',
54  * for empty, full and partially-full pages respectively. The individual
55  * pool items are on a linked list headed by `ph_itemlist' in each page
56  * header. The memory for building the page list is either taken from
57  * the allocated pages themselves (for small pool items) or taken from
58  * an internal pool of page headers (`phpool').
59  */
60 
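/*
 * Illustrative sketch (not part of this file): a typical consumer
 * declares a pool for a fixed-size structure and allocates items from
 * it.  The struct and function names below are hypothetical.
 *
 *	struct foo { int f_val; };
 *	struct pool foo_pool;
 *
 *	void
 *	foo_startup(void)
 *	{
 *		pool_init(&foo_pool, sizeof(struct foo), 0, IPL_NONE,
 *		    PR_WAITOK, "foopl", NULL);
 *	}
 *
 *	struct foo *
 *	foo_alloc(void)
 *	{
 *		return (pool_get(&foo_pool, PR_WAITOK | PR_ZERO));
 *	}
 *
 *	void
 *	foo_free(struct foo *f)
 *	{
 *		pool_put(&foo_pool, f);
 *	}
 */
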
61 /* List of all pools */
62 SIMPLEQ_HEAD(,pool) pool_head = SIMPLEQ_HEAD_INITIALIZER(pool_head);
63 
64 /*
65  * Every pool gets a unique serial number assigned to it. If this counter
66  * wraps, we're screwed, but we shouldn't create so many pools anyway.
67  */
68 unsigned int pool_serial;
69 unsigned int pool_count;
70 
71 /* Lock protecting the global pool state variables above */
72 struct rwlock pool_lock = RWLOCK_INITIALIZER("pools");
73 
74 /* Private pool for page header structures */
75 struct pool phpool;
76 
77 struct pool_item_header {
78 	/* Page headers */
79 	TAILQ_ENTRY(pool_item_header)
80 				ph_pagelist;	/* pool page list */
81 	XSIMPLEQ_HEAD(,pool_item) ph_itemlist;	/* chunk list for this page */
82 	RBT_ENTRY(pool_item_header)
83 				ph_node;	/* Off-page page headers */
84 	int			ph_nmissing;	/* # of chunks in use */
85 	caddr_t			ph_page;	/* this page's address */
86 	caddr_t			ph_colored;	/* page's colored address */
87 	u_long			ph_magic;
88 	int			ph_tick;
89 };
90 #define POOL_MAGICBIT (1 << 3) /* keep away from perturbed low bits */
91 #define POOL_PHPOISON(ph) ISSET((ph)->ph_magic, POOL_MAGICBIT)
92 
93 struct pool_item {
94 	u_long				pi_magic;
95 	XSIMPLEQ_ENTRY(pool_item)	pi_list;
96 };
97 #define POOL_IMAGIC(ph, pi) ((u_long)(pi) ^ (ph)->ph_magic)
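
/*
 * Illustrative note: ph_magic is a random per-page value (set in
 * pool_p_alloc()), so a free item is expected to carry
 * pi->pi_magic == POOL_IMAGIC(ph, pi), i.e. its own address XORed with
 * the page magic.  Corruption of a free item's first word is caught by
 * checks of the form
 *
 *	if (pi->pi_magic != POOL_IMAGIC(ph, pi))
 *		panic(...);
 *
 * while POOL_MAGICBIT records whether the rest of the item was poisoned
 * so it can be verified again at the next allocation.
 */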
98 
99 #ifdef POOL_DEBUG
100 int	pool_debug = 1;
101 #else
102 int	pool_debug = 0;
103 #endif
104 
105 #define POOL_INPGHDR(pp) ((pp)->pr_phoffset != 0)
106 
107 struct pool_item_header *
108 	 pool_p_alloc(struct pool *, int, int *);
109 void	 pool_p_insert(struct pool *, struct pool_item_header *);
110 void	 pool_p_remove(struct pool *, struct pool_item_header *);
111 void	 pool_p_free(struct pool *, struct pool_item_header *);
112 
113 void	 pool_update_curpage(struct pool *);
114 void	*pool_do_get(struct pool *, int, int *);
115 int	 pool_chk_page(struct pool *, struct pool_item_header *, int);
116 int	 pool_chk(struct pool *);
117 void	 pool_get_done(void *, void *);
118 void	 pool_runqueue(struct pool *, int);
119 
120 void	*pool_allocator_alloc(struct pool *, int, int *);
121 void	 pool_allocator_free(struct pool *, void *);
122 
123 /*
124  * The default pool allocator.
125  */
126 void	*pool_page_alloc(struct pool *, int, int *);
127 void	pool_page_free(struct pool *, void *);
128 
129 /*
130  * safe for interrupts; this is the default allocator
131  */
132 struct pool_allocator pool_allocator_single = {
133 	pool_page_alloc,
134 	pool_page_free
135 };
136 
137 void	*pool_multi_alloc(struct pool *, int, int *);
138 void	pool_multi_free(struct pool *, void *);
139 
140 struct pool_allocator pool_allocator_multi = {
141 	pool_multi_alloc,
142 	pool_multi_free
143 };
144 
145 void	*pool_multi_alloc_ni(struct pool *, int, int *);
146 void	pool_multi_free_ni(struct pool *, void *);
147 
148 struct pool_allocator pool_allocator_multi_ni = {
149 	pool_multi_alloc_ni,
150 	pool_multi_free_ni
151 };
152 
153 #ifdef DDB
154 void	 pool_print_pagelist(struct pool_pagelist *, int (*)(const char *, ...)
155 	     __attribute__((__format__(__kprintf__,1,2))));
156 void	 pool_print1(struct pool *, const char *, int (*)(const char *, ...)
157 	     __attribute__((__format__(__kprintf__,1,2))));
158 #endif
159 
160 /* stale page garbage collectors */
161 void	pool_gc_sched(void *);
162 struct timeout pool_gc_tick = TIMEOUT_INITIALIZER(pool_gc_sched, NULL);
163 void	pool_gc_pages(void *);
164 struct task pool_gc_task = TASK_INITIALIZER(pool_gc_pages, NULL);
165 int pool_wait_free = 1;
166 int pool_wait_gc = 8;
167 
168 RBT_PROTOTYPE(phtree, pool_item_header, ph_node, phtree_compare);
169 
170 static inline int
171 phtree_compare(const struct pool_item_header *a,
172     const struct pool_item_header *b)
173 {
174 	vaddr_t va = (vaddr_t)a->ph_page;
175 	vaddr_t vb = (vaddr_t)b->ph_page;
176 
177 	/* the comparisons must be done in this order for RBT_NFIND to work */
178 	if (vb < va)
179 		return (-1);
180 	if (vb > va)
181 		return (1);
182 
183 	return (0);
184 }
185 
186 RBT_GENERATE(phtree, pool_item_header, ph_node, phtree_compare);
187 
188 /*
189  * Return the pool page header based on page address.
190  */
191 static inline struct pool_item_header *
192 pr_find_pagehead(struct pool *pp, void *v)
193 {
194 	struct pool_item_header *ph, key;
195 
196 	if (POOL_INPGHDR(pp)) {
197 		caddr_t page;
198 
199 		page = (caddr_t)((vaddr_t)v & pp->pr_pgmask);
200 
201 		return ((struct pool_item_header *)(page + pp->pr_phoffset));
202 	}
203 
204 	key.ph_page = v;
205 	ph = RBT_NFIND(phtree, &pp->pr_phtree, &key);
206 	if (ph == NULL)
207 		panic("%s: %s: page header missing", __func__, pp->pr_wchan);
208 
209 	KASSERT(ph->ph_page <= (caddr_t)v);
210 	if (ph->ph_page + pp->pr_pgsize <= (caddr_t)v)
211 		panic("%s: %s: incorrect page", __func__, pp->pr_wchan);
212 
213 	return (ph);
214 }
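
/*
 * Worked example (assuming a 4096-byte pool page): pr_pgmask is
 * ~(4096 - 1), so masking an item address clears its low 12 bits and
 * yields the page base, where the in-page header sits pr_phoffset bytes
 * in.  Pools with off-page headers use the RB tree lookup above instead.
 */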
215 
216 /*
217  * Initialize the given pool resource structure.
218  *
219  * We export this routine to allow other kernel parts to declare
220  * static pools that must be initialized before malloc() is available.
221  */
222 void
223 pool_init(struct pool *pp, size_t size, u_int align, int ipl, int flags,
224     const char *wchan, struct pool_allocator *palloc)
225 {
226 	int off = 0, space;
227 	unsigned int pgsize = PAGE_SIZE, items;
228 #ifdef DIAGNOSTIC
229 	struct pool *iter;
230 #endif
231 
232 	if (align == 0)
233 		align = ALIGN(1);
234 
235 	if (size < sizeof(struct pool_item))
236 		size = sizeof(struct pool_item);
237 
238 	size = roundup(size, align);
239 
240 	if (palloc == NULL) {
241 		while (size * 8 > pgsize)
242 			pgsize <<= 1;
243 
244 		if (pgsize > PAGE_SIZE) {
245 			palloc = ISSET(flags, PR_WAITOK) ?
246 			    &pool_allocator_multi_ni : &pool_allocator_multi;
247 		} else
248 			palloc = &pool_allocator_single;
249 	} else
250 		pgsize = palloc->pa_pagesz ? palloc->pa_pagesz : PAGE_SIZE;
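
	/*
	 * Example (assuming PAGE_SIZE is 4096): a 600-byte item gives
	 * 600 * 8 = 4800 > 4096, so pgsize is doubled to 8192 and one of
	 * the multi-page allocators is chosen; a 128-byte item keeps
	 * pgsize at 4096 and uses pool_allocator_single.
	 */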
251 
252 	items = pgsize / size;
253 
254 	/*
255 	 * Decide whether to put the page header off page to avoid
256 	 * wasting too large a part of the page. Off-page page headers
257 	 * go into an RB tree, so we can match a returned item with
258 	 * its header based on the page address.
259 	 */
260 	if (pgsize - (size * items) > sizeof(struct pool_item_header)) {
261 		off = pgsize - sizeof(struct pool_item_header);
262 	} else if (sizeof(struct pool_item_header) * 2 >= size) {
263 		off = pgsize - sizeof(struct pool_item_header);
264 		items = off / size;
265 	}
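
	/*
	 * Example (hypothetical sizes, assuming a 64-byte header on a
	 * 4096-byte page): 1000-byte items leave 96 bytes of slack, so the
	 * header fits in-page at the end of the page; 2048-byte items leave
	 * no slack, and sacrificing part of an item for the header would
	 * waste too much, so the header is taken from phpool instead and
	 * pr_phoffset stays 0.
	 */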
266 
267 	KASSERT(items > 0);
268 
269 	/*
270 	 * Initialize the pool structure.
271 	 */
272 	memset(pp, 0, sizeof(*pp));
273 	TAILQ_INIT(&pp->pr_emptypages);
274 	TAILQ_INIT(&pp->pr_fullpages);
275 	TAILQ_INIT(&pp->pr_partpages);
276 	pp->pr_curpage = NULL;
277 	pp->pr_npages = 0;
278 	pp->pr_minitems = 0;
279 	pp->pr_minpages = 0;
280 	pp->pr_maxpages = 8;
281 	pp->pr_size = size;
282 	pp->pr_pgsize = pgsize;
283 	pp->pr_pgmask = ~0UL ^ (pgsize - 1);
284 	pp->pr_phoffset = off;
285 	pp->pr_itemsperpage = items;
286 	pp->pr_wchan = wchan;
287 	pp->pr_alloc = palloc;
288 	pp->pr_nitems = 0;
289 	pp->pr_nout = 0;
290 	pp->pr_hardlimit = UINT_MAX;
291 	pp->pr_hardlimit_warning = NULL;
292 	pp->pr_hardlimit_ratecap.tv_sec = 0;
293 	pp->pr_hardlimit_ratecap.tv_usec = 0;
294 	pp->pr_hardlimit_warning_last.tv_sec = 0;
295 	pp->pr_hardlimit_warning_last.tv_usec = 0;
296 	RBT_INIT(phtree, &pp->pr_phtree);
297 
298 	/*
299 	 * Use the space between the chunks and the page header
300 	 * for cache coloring.
301 	 */
302 	space = POOL_INPGHDR(pp) ? pp->pr_phoffset : pp->pr_pgsize;
303 	space -= pp->pr_itemsperpage * pp->pr_size;
304 	pp->pr_align = align;
305 	pp->pr_maxcolors = (space / align) + 1;
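
	/*
	 * Example (hypothetical): 24 bytes of slack with 8-byte alignment
	 * gives 4 colors, so successive pages start their items at offsets
	 * 0, 8, 16 and 24 from ph_page (see pool_p_alloc()), spreading
	 * items across cache lines.
	 */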
306 
307 	pp->pr_nget = 0;
308 	pp->pr_nfail = 0;
309 	pp->pr_nput = 0;
310 	pp->pr_npagealloc = 0;
311 	pp->pr_npagefree = 0;
312 	pp->pr_hiwat = 0;
313 	pp->pr_nidle = 0;
314 
315 	pp->pr_ipl = ipl;
316 	mtx_init(&pp->pr_mtx, pp->pr_ipl);
317 	mtx_init(&pp->pr_requests_mtx, pp->pr_ipl);
318 	TAILQ_INIT(&pp->pr_requests);
319 
320 	if (phpool.pr_size == 0) {
321 		pool_init(&phpool, sizeof(struct pool_item_header), 0,
322 		    IPL_HIGH, 0, "phpool", NULL);
323 
324 		/* make sure phpool won't "recurse" */
325 		KASSERT(POOL_INPGHDR(&phpool));
326 	}
327 
328 	/* pglistalloc/constraint parameters */
329 	pp->pr_crange = &kp_dirty;
330 
331 	/* Insert this into the list of all pools. */
332 	rw_enter_write(&pool_lock);
333 #ifdef DIAGNOSTIC
334 	SIMPLEQ_FOREACH(iter, &pool_head, pr_poollist) {
335 		if (iter == pp)
336 			panic("%s: pool %s already on list", __func__, wchan);
337 	}
338 #endif
339 
340 	pp->pr_serial = ++pool_serial;
341 	if (pool_serial == 0)
342 		panic("%s: too much uptime", __func__);
343 
344 	SIMPLEQ_INSERT_HEAD(&pool_head, pp, pr_poollist);
345 	pool_count++;
346 	rw_exit_write(&pool_lock);
347 }
348 
349 /*
350  * Decommission a pool resource.
351  */
352 void
353 pool_destroy(struct pool *pp)
354 {
355 	struct pool_item_header *ph;
356 	struct pool *prev, *iter;
357 
358 #ifdef DIAGNOSTIC
359 	if (pp->pr_nout != 0)
360 		panic("%s: pool busy: still out: %u", __func__, pp->pr_nout);
361 #endif
362 
363 	/* Remove from global pool list */
364 	rw_enter_write(&pool_lock);
365 	pool_count--;
366 	if (pp == SIMPLEQ_FIRST(&pool_head))
367 		SIMPLEQ_REMOVE_HEAD(&pool_head, pr_poollist);
368 	else {
369 		prev = SIMPLEQ_FIRST(&pool_head);
370 		SIMPLEQ_FOREACH(iter, &pool_head, pr_poollist) {
371 			if (iter == pp) {
372 				SIMPLEQ_REMOVE_AFTER(&pool_head, prev,
373 				    pr_poollist);
374 				break;
375 			}
376 			prev = iter;
377 		}
378 	}
379 	rw_exit_write(&pool_lock);
380 
381 	/* Remove all pages */
382 	while ((ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL) {
383 		mtx_enter(&pp->pr_mtx);
384 		pool_p_remove(pp, ph);
385 		mtx_leave(&pp->pr_mtx);
386 		pool_p_free(pp, ph);
387 	}
388 	KASSERT(TAILQ_EMPTY(&pp->pr_fullpages));
389 	KASSERT(TAILQ_EMPTY(&pp->pr_partpages));
390 }
391 
392 void
393 pool_request_init(struct pool_request *pr,
394     void (*handler)(void *, void *), void *cookie)
395 {
396 	pr->pr_handler = handler;
397 	pr->pr_cookie = cookie;
398 	pr->pr_item = NULL;
399 }
400 
401 void
402 pool_request(struct pool *pp, struct pool_request *pr)
403 {
404 	mtx_enter(&pp->pr_requests_mtx);
405 	TAILQ_INSERT_TAIL(&pp->pr_requests, pr, pr_entry);
406 	pool_runqueue(pp, PR_NOWAIT);
407 	mtx_leave(&pp->pr_requests_mtx);
408 }
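
/*
 * Illustrative sketch (hypothetical caller): pool_request() queues an
 * asynchronous allocation that is completed through the handler once an
 * item becomes available; pool_get() itself uses this mechanism to sleep
 * for memory (see pool_get_done() below).
 *
 *	void
 *	foo_handler(void *cookie, void *item)
 *	{
 *		struct foo_softc *sc = cookie;
 *
 *		sc->sc_buf = item;
 *		wakeup(sc);
 *	}
 *
 *	pool_request_init(&sc->sc_request, foo_handler, sc);
 *	pool_request(&foo_pool, &sc->sc_request);
 */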
409 
410 struct pool_get_memory {
411 	struct mutex mtx;
412 	void * volatile v;
413 };
414 
415 /*
416  * Grab an item from the pool.
417  */
418 void *
419 pool_get(struct pool *pp, int flags)
420 {
421 	void *v = NULL;
422 	int slowdown = 0;
423 
424 	KASSERT(flags & (PR_WAITOK | PR_NOWAIT));
425 
426 	mtx_enter(&pp->pr_mtx);
427 	if (pp->pr_nout >= pp->pr_hardlimit) {
428 		if (ISSET(flags, PR_NOWAIT|PR_LIMITFAIL))
429 			goto fail;
430 	} else if ((v = pool_do_get(pp, flags, &slowdown)) == NULL) {
431 		if (ISSET(flags, PR_NOWAIT))
432 			goto fail;
433 	}
434 	mtx_leave(&pp->pr_mtx);
435 
436 	if (slowdown && ISSET(flags, PR_WAITOK))
437 		yield();
438 
439 	if (v == NULL) {
440 		struct pool_get_memory mem = {
441 		    MUTEX_INITIALIZER(pp->pr_ipl),
442 		    NULL };
443 		struct pool_request pr;
444 
445 		pool_request_init(&pr, pool_get_done, &mem);
446 		pool_request(pp, &pr);
447 
448 		mtx_enter(&mem.mtx);
449 		while (mem.v == NULL)
450 			msleep(&mem, &mem.mtx, PSWP, pp->pr_wchan, 0);
451 		mtx_leave(&mem.mtx);
452 
453 		v = mem.v;
454 	}
455 
456 	if (ISSET(flags, PR_ZERO))
457 		memset(v, 0, pp->pr_size);
458 
459 	return (v);
460 
461 fail:
462 	pp->pr_nfail++;
463 	mtx_leave(&pp->pr_mtx);
464 	return (NULL);
465 }
466 
467 void
468 pool_get_done(void *xmem, void *v)
469 {
470 	struct pool_get_memory *mem = xmem;
471 
472 	mtx_enter(&mem->mtx);
473 	mem->v = v;
474 	mtx_leave(&mem->mtx);
475 
476 	wakeup_one(mem);
477 }
478 
479 void
480 pool_runqueue(struct pool *pp, int flags)
481 {
482 	struct pool_requests prl = TAILQ_HEAD_INITIALIZER(prl);
483 	struct pool_request *pr;
484 
485 	MUTEX_ASSERT_UNLOCKED(&pp->pr_mtx);
486 	MUTEX_ASSERT_LOCKED(&pp->pr_requests_mtx);
487 
488 	if (pp->pr_requesting++)
489 		return;
490 
491 	do {
492 		pp->pr_requesting = 1;
493 
494 		/* no TAILQ_JOIN? :( */
495 		while ((pr = TAILQ_FIRST(&pp->pr_requests)) != NULL) {
496 			TAILQ_REMOVE(&pp->pr_requests, pr, pr_entry);
497 			TAILQ_INSERT_TAIL(&prl, pr, pr_entry);
498 		}
499 		if (TAILQ_EMPTY(&prl))
500 			continue;
501 
502 		mtx_leave(&pp->pr_requests_mtx);
503 
504 		mtx_enter(&pp->pr_mtx);
505 		pr = TAILQ_FIRST(&prl);
506 		while (pr != NULL) {
507 			int slowdown = 0;
508 
509 			if (pp->pr_nout >= pp->pr_hardlimit)
510 				break;
511 
512 			pr->pr_item = pool_do_get(pp, flags, &slowdown);
513 			if (pr->pr_item == NULL) /* || slowdown ? */
514 				break;
515 
516 			pr = TAILQ_NEXT(pr, pr_entry);
517 		}
518 		mtx_leave(&pp->pr_mtx);
519 
520 		while ((pr = TAILQ_FIRST(&prl)) != NULL &&
521 		    pr->pr_item != NULL) {
522 			TAILQ_REMOVE(&prl, pr, pr_entry);
523 			(*pr->pr_handler)(pr->pr_cookie, pr->pr_item);
524 		}
525 
526 		mtx_enter(&pp->pr_requests_mtx);
527 	} while (--pp->pr_requesting);
528 
529 	/* no TAILQ_JOIN :( */
530 	while ((pr = TAILQ_FIRST(&prl)) != NULL) {
531 		TAILQ_REMOVE(&prl, pr, pr_entry);
532 		TAILQ_INSERT_TAIL(&pp->pr_requests, pr, pr_entry);
533 	}
534 }
535 
536 void *
537 pool_do_get(struct pool *pp, int flags, int *slowdown)
538 {
539 	struct pool_item *pi;
540 	struct pool_item_header *ph;
541 
542 	MUTEX_ASSERT_LOCKED(&pp->pr_mtx);
543 
544 	splassert(pp->pr_ipl);
545 
546 	/*
547 	 * Account for this item now to avoid races if we need to give up
548 	 * pr_mtx to allocate a page.
549 	 */
550 	pp->pr_nout++;
551 
552 	if (pp->pr_curpage == NULL) {
553 		mtx_leave(&pp->pr_mtx);
554 		ph = pool_p_alloc(pp, flags, slowdown);
555 		mtx_enter(&pp->pr_mtx);
556 
557 		if (ph == NULL) {
558 			pp->pr_nout--;
559 			return (NULL);
560 		}
561 
562 		pool_p_insert(pp, ph);
563 	}
564 
565 	ph = pp->pr_curpage;
566 	pi = XSIMPLEQ_FIRST(&ph->ph_itemlist);
567 	if (__predict_false(pi == NULL))
568 		panic("%s: %s: page empty", __func__, pp->pr_wchan);
569 
570 	if (__predict_false(pi->pi_magic != POOL_IMAGIC(ph, pi))) {
571 		panic("%s: %s free list modified: "
572 		    "page %p; item addr %p; offset 0x%x=0x%lx != 0x%lx",
573 		    __func__, pp->pr_wchan, ph->ph_page, pi,
574 		    0, pi->pi_magic, POOL_IMAGIC(ph, pi));
575 	}
576 
577 	XSIMPLEQ_REMOVE_HEAD(&ph->ph_itemlist, pi_list);
578 
579 #ifdef DIAGNOSTIC
580 	if (pool_debug && POOL_PHPOISON(ph)) {
581 		size_t pidx;
582 		uint32_t pval;
583 		if (poison_check(pi + 1, pp->pr_size - sizeof(*pi),
584 		    &pidx, &pval)) {
585 			int *ip = (int *)(pi + 1);
586 			panic("%s: %s free list modified: "
587 			    "page %p; item addr %p; offset 0x%zx=0x%x",
588 			    __func__, pp->pr_wchan, ph->ph_page, pi,
589 			    pidx * sizeof(int), ip[pidx]);
590 		}
591 	}
592 #endif /* DIAGNOSTIC */
593 
594 	if (ph->ph_nmissing++ == 0) {
595 		/*
596 		 * This page was previously empty.  Move it to the list of
597 		 * partially-full pages.  This page is already curpage.
598 		 */
599 		TAILQ_REMOVE(&pp->pr_emptypages, ph, ph_pagelist);
600 		TAILQ_INSERT_TAIL(&pp->pr_partpages, ph, ph_pagelist);
601 
602 		pp->pr_nidle--;
603 	}
604 
605 	if (ph->ph_nmissing == pp->pr_itemsperpage) {
606 		/*
607 		 * This page is now full.  Move it to the full list
608 		 * and select a new current page.
609 		 */
610 		TAILQ_REMOVE(&pp->pr_partpages, ph, ph_pagelist);
611 		TAILQ_INSERT_TAIL(&pp->pr_fullpages, ph, ph_pagelist);
612 		pool_update_curpage(pp);
613 	}
614 
615 	pp->pr_nget++;
616 
617 	return (pi);
618 }
619 
620 /*
621  * Return resource to the pool.
622  */
623 void
624 pool_put(struct pool *pp, void *v)
625 {
626 	struct pool_item *pi = v;
627 	struct pool_item_header *ph, *freeph = NULL;
628 
629 #ifdef DIAGNOSTIC
630 	if (v == NULL)
631 		panic("%s: NULL item", __func__);
632 #endif
633 
634 	mtx_enter(&pp->pr_mtx);
635 
636 	splassert(pp->pr_ipl);
637 
638 	ph = pr_find_pagehead(pp, v);
639 
640 #ifdef DIAGNOSTIC
641 	if (pool_debug) {
642 		struct pool_item *qi;
643 		XSIMPLEQ_FOREACH(qi, &ph->ph_itemlist, pi_list) {
644 			if (pi == qi) {
645 				panic("%s: %s: double pool_put: %p", __func__,
646 				    pp->pr_wchan, pi);
647 			}
648 		}
649 	}
650 #endif /* DIAGNOSTIC */
651 
652 	pi->pi_magic = POOL_IMAGIC(ph, pi);
653 	XSIMPLEQ_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list);
654 #ifdef DIAGNOSTIC
655 	if (POOL_PHPOISON(ph))
656 		poison_mem(pi + 1, pp->pr_size - sizeof(*pi));
657 #endif /* DIAGNOSTIC */
658 
659 	if (ph->ph_nmissing-- == pp->pr_itemsperpage) {
660 		/*
661 		 * The page was previously completely full, move it to the
662 		 * partially-full list.
663 		 */
664 		TAILQ_REMOVE(&pp->pr_fullpages, ph, ph_pagelist);
665 		TAILQ_INSERT_TAIL(&pp->pr_partpages, ph, ph_pagelist);
666 	}
667 
668 	if (ph->ph_nmissing == 0) {
669 		/*
670 		 * The page is now empty, so move it to the empty page list.
671 		 */
672 		pp->pr_nidle++;
673 
674 		ph->ph_tick = ticks;
675 		TAILQ_REMOVE(&pp->pr_partpages, ph, ph_pagelist);
676 		TAILQ_INSERT_TAIL(&pp->pr_emptypages, ph, ph_pagelist);
677 		pool_update_curpage(pp);
678 	}
679 
680 	pp->pr_nout--;
681 	pp->pr_nput++;
682 
683 	/* is it time to free a page? */
684 	if (pp->pr_nidle > pp->pr_maxpages &&
685 	    (ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL &&
686 	    (ticks - ph->ph_tick) > (hz * pool_wait_free)) {
687 		freeph = ph;
688 		pool_p_remove(pp, freeph);
689 	}
690 	mtx_leave(&pp->pr_mtx);
691 
692 	if (freeph != NULL)
693 		pool_p_free(pp, freeph);
694 
695 	if (!TAILQ_EMPTY(&pp->pr_requests)) {
696 		mtx_enter(&pp->pr_requests_mtx);
697 		pool_runqueue(pp, PR_NOWAIT);
698 		mtx_leave(&pp->pr_requests_mtx);
699 	}
700 }
701 
702 /*
703  * Add N items to the pool.
704  */
705 int
706 pool_prime(struct pool *pp, int n)
707 {
708 	struct pool_pagelist pl = TAILQ_HEAD_INITIALIZER(pl);
709 	struct pool_item_header *ph;
710 	int newpages;
711 
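	/*
	 * e.g. priming 250 items into a pool with 101 items per page
	 * rounds up to 303 items and allocates 3 pages.
	 */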
712 	newpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
713 
714 	while (newpages-- > 0) {
715 		int slowdown = 0;
716 
717 		ph = pool_p_alloc(pp, PR_NOWAIT, &slowdown);
718 		if (ph == NULL) /* or slowdown? */
719 			break;
720 
721 		TAILQ_INSERT_TAIL(&pl, ph, ph_pagelist);
722 	}
723 
724 	mtx_enter(&pp->pr_mtx);
725 	while ((ph = TAILQ_FIRST(&pl)) != NULL) {
726 		TAILQ_REMOVE(&pl, ph, ph_pagelist);
727 		pool_p_insert(pp, ph);
728 	}
729 	mtx_leave(&pp->pr_mtx);
730 
731 	return (0);
732 }
733 
734 struct pool_item_header *
735 pool_p_alloc(struct pool *pp, int flags, int *slowdown)
736 {
737 	struct pool_item_header *ph;
738 	struct pool_item *pi;
739 	caddr_t addr;
740 	int n;
741 
742 	MUTEX_ASSERT_UNLOCKED(&pp->pr_mtx);
743 	KASSERT(pp->pr_size >= sizeof(*pi));
744 
745 	addr = pool_allocator_alloc(pp, flags, slowdown);
746 	if (addr == NULL)
747 		return (NULL);
748 
749 	if (POOL_INPGHDR(pp))
750 		ph = (struct pool_item_header *)(addr + pp->pr_phoffset);
751 	else {
752 		ph = pool_get(&phpool, flags);
753 		if (ph == NULL) {
754 			pool_allocator_free(pp, addr);
755 			return (NULL);
756 		}
757 	}
758 
759 	XSIMPLEQ_INIT(&ph->ph_itemlist);
760 	ph->ph_page = addr;
761 	addr += pp->pr_align * (pp->pr_npagealloc % pp->pr_maxcolors);
762 	ph->ph_colored = addr;
763 	ph->ph_nmissing = 0;
764 	arc4random_buf(&ph->ph_magic, sizeof(ph->ph_magic));
765 #ifdef DIAGNOSTIC
766 	/* use a bit in ph_magic to record if we poison page items */
767 	if (pool_debug)
768 		SET(ph->ph_magic, POOL_MAGICBIT);
769 	else
770 		CLR(ph->ph_magic, POOL_MAGICBIT);
771 #endif /* DIAGNOSTIC */
772 
773 	n = pp->pr_itemsperpage;
774 	while (n--) {
775 		pi = (struct pool_item *)addr;
776 		pi->pi_magic = POOL_IMAGIC(ph, pi);
777 		XSIMPLEQ_INSERT_TAIL(&ph->ph_itemlist, pi, pi_list);
778 
779 #ifdef DIAGNOSTIC
780 		if (POOL_PHPOISON(ph))
781 			poison_mem(pi + 1, pp->pr_size - sizeof(*pi));
782 #endif /* DIAGNOSTIC */
783 
784 		addr += pp->pr_size;
785 	}
786 
787 	return (ph);
788 }
789 
790 void
791 pool_p_free(struct pool *pp, struct pool_item_header *ph)
792 {
793 	struct pool_item *pi;
794 
795 	MUTEX_ASSERT_UNLOCKED(&pp->pr_mtx);
796 	KASSERT(ph->ph_nmissing == 0);
797 
798 	XSIMPLEQ_FOREACH(pi, &ph->ph_itemlist, pi_list) {
799 		if (__predict_false(pi->pi_magic != POOL_IMAGIC(ph, pi))) {
800 			panic("%s: %s free list modified: "
801 			    "page %p; item addr %p; offset 0x%x=0x%lx",
802 			    __func__, pp->pr_wchan, ph->ph_page, pi,
803 			    0, pi->pi_magic);
804 		}
805 
806 #ifdef DIAGNOSTIC
807 		if (POOL_PHPOISON(ph)) {
808 			size_t pidx;
809 			uint32_t pval;
810 			if (poison_check(pi + 1, pp->pr_size - sizeof(*pi),
811 			    &pidx, &pval)) {
812 				int *ip = (int *)(pi + 1);
813 				panic("%s: %s free list modified: "
814 				    "page %p; item addr %p; offset 0x%zx=0x%x",
815 				    __func__, pp->pr_wchan, ph->ph_page, pi,
816 				    pidx * sizeof(int), ip[pidx]);
817 			}
818 		}
819 #endif
820 	}
821 
822 	pool_allocator_free(pp, ph->ph_page);
823 
824 	if (!POOL_INPGHDR(pp))
825 		pool_put(&phpool, ph);
826 }
827 
828 void
829 pool_p_insert(struct pool *pp, struct pool_item_header *ph)
830 {
831 	MUTEX_ASSERT_LOCKED(&pp->pr_mtx);
832 
833 	/* If the pool was depleted, point at the new page */
834 	if (pp->pr_curpage == NULL)
835 		pp->pr_curpage = ph;
836 
837 	TAILQ_INSERT_TAIL(&pp->pr_emptypages, ph, ph_pagelist);
838 	if (!POOL_INPGHDR(pp))
839 		RBT_INSERT(phtree, &pp->pr_phtree, ph);
840 
841 	pp->pr_nitems += pp->pr_itemsperpage;
842 	pp->pr_nidle++;
843 
844 	pp->pr_npagealloc++;
845 	if (++pp->pr_npages > pp->pr_hiwat)
846 		pp->pr_hiwat = pp->pr_npages;
847 }
848 
849 void
850 pool_p_remove(struct pool *pp, struct pool_item_header *ph)
851 {
852 	MUTEX_ASSERT_LOCKED(&pp->pr_mtx);
853 
854 	pp->pr_npagefree++;
855 	pp->pr_npages--;
856 	pp->pr_nidle--;
857 	pp->pr_nitems -= pp->pr_itemsperpage;
858 
859 	if (!POOL_INPGHDR(pp))
860 		RBT_REMOVE(phtree, &pp->pr_phtree, ph);
861 	TAILQ_REMOVE(&pp->pr_emptypages, ph, ph_pagelist);
862 
863 	pool_update_curpage(pp);
864 }
865 
866 void
867 pool_update_curpage(struct pool *pp)
868 {
869 	pp->pr_curpage = TAILQ_LAST(&pp->pr_partpages, pool_pagelist);
870 	if (pp->pr_curpage == NULL) {
871 		pp->pr_curpage = TAILQ_LAST(&pp->pr_emptypages, pool_pagelist);
872 	}
873 }
874 
875 void
876 pool_setlowat(struct pool *pp, int n)
877 {
878 	int prime = 0;
879 
880 	mtx_enter(&pp->pr_mtx);
881 	pp->pr_minitems = n;
882 	pp->pr_minpages = (n == 0)
883 		? 0
884 		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
885 
886 	if (pp->pr_nitems < n)
887 		prime = n - pp->pr_nitems;
888 	mtx_leave(&pp->pr_mtx);
889 
890 	if (prime > 0)
891 		pool_prime(pp, prime);
892 }
893 
894 void
895 pool_sethiwat(struct pool *pp, int n)
896 {
897 	pp->pr_maxpages = (n == 0)
898 		? 0
899 		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
900 }
901 
902 int
903 pool_sethardlimit(struct pool *pp, u_int n, const char *warnmsg, int ratecap)
904 {
905 	int error = 0;
906 
907 	if (n < pp->pr_nout) {
908 		error = EINVAL;
909 		goto done;
910 	}
911 
912 	pp->pr_hardlimit = n;
913 	pp->pr_hardlimit_warning = warnmsg;
914 	pp->pr_hardlimit_ratecap.tv_sec = ratecap;
915 	pp->pr_hardlimit_warning_last.tv_sec = 0;
916 	pp->pr_hardlimit_warning_last.tv_usec = 0;
917 
918 done:
919 	return (error);
920 }
921 
922 void
923 pool_set_constraints(struct pool *pp, const struct kmem_pa_mode *mode)
924 {
925 	pp->pr_crange = mode;
926 }
927 
928 /*
929  * Release all complete pages that have not been used recently.
930  *
931  * Returns non-zero if any pages have been reclaimed.
932  */
933 int
934 pool_reclaim(struct pool *pp)
935 {
936 	struct pool_item_header *ph, *phnext;
937 	struct pool_pagelist pl = TAILQ_HEAD_INITIALIZER(pl);
938 
939 	mtx_enter(&pp->pr_mtx);
940 	for (ph = TAILQ_FIRST(&pp->pr_emptypages); ph != NULL; ph = phnext) {
941 		phnext = TAILQ_NEXT(ph, ph_pagelist);
942 
943 		/* Check our minimum page claim */
944 		if (pp->pr_npages <= pp->pr_minpages)
945 			break;
946 
947 		/*
948 		 * If freeing this page would put us below
949 		 * the low water mark, stop now.
950 		 */
951 		if ((pp->pr_nitems - pp->pr_itemsperpage) <
952 		    pp->pr_minitems)
953 			break;
954 
955 		pool_p_remove(pp, ph);
956 		TAILQ_INSERT_TAIL(&pl, ph, ph_pagelist);
957 	}
958 	mtx_leave(&pp->pr_mtx);
959 
960 	if (TAILQ_EMPTY(&pl))
961 		return (0);
962 
963 	while ((ph = TAILQ_FIRST(&pl)) != NULL) {
964 		TAILQ_REMOVE(&pl, ph, ph_pagelist);
965 		pool_p_free(pp, ph);
966 	}
967 
968 	return (1);
969 }
970 
971 /*
972  * Release all complete pages that have not been used recently
973  * from all pools.
974  */
975 void
976 pool_reclaim_all(void)
977 {
978 	struct pool	*pp;
979 
980 	rw_enter_read(&pool_lock);
981 	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist)
982 		pool_reclaim(pp);
983 	rw_exit_read(&pool_lock);
984 }
985 
986 #ifdef DDB
987 #include <machine/db_machdep.h>
988 #include <ddb/db_output.h>
989 
990 /*
991  * Diagnostic helpers.
992  */
993 void
994 pool_printit(struct pool *pp, const char *modif,
995     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
996 {
997 	pool_print1(pp, modif, pr);
998 }
999 
1000 void
1001 pool_print_pagelist(struct pool_pagelist *pl,
1002     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
1003 {
1004 	struct pool_item_header *ph;
1005 	struct pool_item *pi;
1006 
1007 	TAILQ_FOREACH(ph, pl, ph_pagelist) {
1008 		(*pr)("\t\tpage %p, color %p, nmissing %d\n",
1009 		    ph->ph_page, ph->ph_colored, ph->ph_nmissing);
1010 		XSIMPLEQ_FOREACH(pi, &ph->ph_itemlist, pi_list) {
1011 			if (pi->pi_magic != POOL_IMAGIC(ph, pi)) {
1012 				(*pr)("\t\t\titem %p, magic 0x%lx\n",
1013 				    pi, pi->pi_magic);
1014 			}
1015 		}
1016 	}
1017 }
1018 
1019 void
1020 pool_print1(struct pool *pp, const char *modif,
1021     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
1022 {
1023 	struct pool_item_header *ph;
1024 	int print_pagelist = 0;
1025 	char c;
1026 
1027 	while ((c = *modif++) != '\0') {
1028 		if (c == 'p')
1029 			print_pagelist = 1;
1030 		modif++;
1031 	}
1032 
1033 	(*pr)("POOL %s: size %u maxcolors %u\n", pp->pr_wchan, pp->pr_size,
1034 	    pp->pr_maxcolors);
1035 	(*pr)("\talloc %p\n", pp->pr_alloc);
1036 	(*pr)("\tminitems %u, minpages %u, maxpages %u, npages %u\n",
1037 	    pp->pr_minitems, pp->pr_minpages, pp->pr_maxpages, pp->pr_npages);
1038 	(*pr)("\titemsperpage %u, nitems %u, nout %u, hardlimit %u\n",
1039 	    pp->pr_itemsperpage, pp->pr_nitems, pp->pr_nout, pp->pr_hardlimit);
1040 
1041 	(*pr)("\n\tnget %lu, nfail %lu, nput %lu\n",
1042 	    pp->pr_nget, pp->pr_nfail, pp->pr_nput);
1043 	(*pr)("\tnpagealloc %lu, npagefree %lu, hiwat %u, nidle %lu\n",
1044 	    pp->pr_npagealloc, pp->pr_npagefree, pp->pr_hiwat, pp->pr_nidle);
1045 
1046 	if (print_pagelist == 0)
1047 		return;
1048 
1049 	if ((ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL)
1050 		(*pr)("\n\tempty page list:\n");
1051 	pool_print_pagelist(&pp->pr_emptypages, pr);
1052 	if ((ph = TAILQ_FIRST(&pp->pr_fullpages)) != NULL)
1053 		(*pr)("\n\tfull page list:\n");
1054 	pool_print_pagelist(&pp->pr_fullpages, pr);
1055 	if ((ph = TAILQ_FIRST(&pp->pr_partpages)) != NULL)
1056 		(*pr)("\n\tpartial-page list:\n");
1057 	pool_print_pagelist(&pp->pr_partpages, pr);
1058 
1059 	if (pp->pr_curpage == NULL)
1060 		(*pr)("\tno current page\n");
1061 	else
1062 		(*pr)("\tcurpage %p\n", pp->pr_curpage->ph_page);
1063 }
1064 
1065 void
1066 db_show_all_pools(db_expr_t expr, int haddr, db_expr_t count, char *modif)
1067 {
1068 	struct pool *pp;
1069 	char maxp[16];
1070 	int ovflw;
1071 	char mode;
1072 
1073 	mode = modif[0];
1074 	if (mode != '\0' && mode != 'a') {
1075 		db_printf("usage: show all pools [/a]\n");
1076 		return;
1077 	}
1078 
1079 	if (mode == '\0')
1080 		db_printf("%-10s%4s%9s%5s%9s%6s%6s%6s%6s%6s%6s%5s\n",
1081 		    "Name",
1082 		    "Size",
1083 		    "Requests",
1084 		    "Fail",
1085 		    "Releases",
1086 		    "Pgreq",
1087 		    "Pgrel",
1088 		    "Npage",
1089 		    "Hiwat",
1090 		    "Minpg",
1091 		    "Maxpg",
1092 		    "Idle");
1093 	else
1094 		db_printf("%-12s %18s %18s\n",
1095 		    "Name", "Address", "Allocator");
1096 
1097 	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) {
1098 		if (mode == 'a') {
1099 			db_printf("%-12s %18p %18p\n", pp->pr_wchan, pp,
1100 			    pp->pr_alloc);
1101 			continue;
1102 		}
1103 
1104 		if (!pp->pr_nget)
1105 			continue;
1106 
1107 		if (pp->pr_maxpages == UINT_MAX)
1108 			snprintf(maxp, sizeof maxp, "inf");
1109 		else
1110 			snprintf(maxp, sizeof maxp, "%u", pp->pr_maxpages);
1111 
1112 #define PRWORD(ovflw, fmt, width, fixed, val) do {	\
1113 	(ovflw) += db_printf((fmt),			\
1114 	    (width) - (fixed) - (ovflw) > 0 ?		\
1115 	    (width) - (fixed) - (ovflw) : 0,		\
1116 	    (val)) - (width);				\
1117 	if ((ovflw) < 0)				\
1118 		(ovflw) = 0;				\
1119 } while (/* CONSTCOND */0)
1120 
1121 		ovflw = 0;
1122 		PRWORD(ovflw, "%-*s", 10, 0, pp->pr_wchan);
1123 		PRWORD(ovflw, " %*u", 4, 1, pp->pr_size);
1124 		PRWORD(ovflw, " %*lu", 9, 1, pp->pr_nget);
1125 		PRWORD(ovflw, " %*lu", 5, 1, pp->pr_nfail);
1126 		PRWORD(ovflw, " %*lu", 9, 1, pp->pr_nput);
1127 		PRWORD(ovflw, " %*lu", 6, 1, pp->pr_npagealloc);
1128 		PRWORD(ovflw, " %*lu", 6, 1, pp->pr_npagefree);
1129 		PRWORD(ovflw, " %*d", 6, 1, pp->pr_npages);
1130 		PRWORD(ovflw, " %*d", 6, 1, pp->pr_hiwat);
1131 		PRWORD(ovflw, " %*d", 6, 1, pp->pr_minpages);
1132 		PRWORD(ovflw, " %*s", 6, 1, maxp);
1133 		PRWORD(ovflw, " %*lu\n", 5, 1, pp->pr_nidle);
1134 
1135 		pool_chk(pp);
1136 	}
1137 }
1138 #endif /* DDB */
1139 
1140 #if defined(POOL_DEBUG) || defined(DDB)
1141 int
1142 pool_chk_page(struct pool *pp, struct pool_item_header *ph, int expected)
1143 {
1144 	struct pool_item *pi;
1145 	caddr_t page;
1146 	int n;
1147 	const char *label = pp->pr_wchan;
1148 
1149 	page = (caddr_t)((u_long)ph & pp->pr_pgmask);
1150 	if (page != ph->ph_page && POOL_INPGHDR(pp)) {
1151 		printf("%s: ", label);
1152 		printf("pool(%p:%s): page inconsistency: page %p; "
1153 		    "at page head addr %p (p %p)\n",
1154 		    pp, pp->pr_wchan, ph->ph_page, ph, page);
1155 		return 1;
1156 	}
1157 
1158 	for (pi = XSIMPLEQ_FIRST(&ph->ph_itemlist), n = 0;
1159 	     pi != NULL;
1160 	     pi = XSIMPLEQ_NEXT(&ph->ph_itemlist, pi, pi_list), n++) {
1161 		if ((caddr_t)pi < ph->ph_page ||
1162 		    (caddr_t)pi >= ph->ph_page + pp->pr_pgsize) {
1163 			printf("%s: ", label);
1164 			printf("pool(%p:%s): page inconsistency: page %p;"
1165 			    " item ordinal %d; addr %p\n", pp,
1166 			    pp->pr_wchan, ph->ph_page, n, pi);
1167 			return (1);
1168 		}
1169 
1170 		if (pi->pi_magic != POOL_IMAGIC(ph, pi)) {
1171 			printf("%s: ", label);
1172 			printf("pool(%p:%s): free list modified: "
1173 			    "page %p; item ordinal %d; addr %p "
1174 			    "(p %p); offset 0x%x=0x%lx\n",
1175 			    pp, pp->pr_wchan, ph->ph_page, n, pi, page,
1176 			    0, pi->pi_magic);
1177 		}
1178 
1179 #ifdef DIAGNOSTIC
1180 		if (POOL_PHPOISON(ph)) {
1181 			size_t pidx;
1182 			uint32_t pval;
1183 			if (poison_check(pi + 1, pp->pr_size - sizeof(*pi),
1184 			    &pidx, &pval)) {
1185 				int *ip = (int *)(pi + 1);
1186 				printf("pool(%s): free list modified: "
1187 				    "page %p; item ordinal %d; addr %p "
1188 				    "(p %p); offset 0x%zx=0x%x\n",
1189 				    pp->pr_wchan, ph->ph_page, n, pi,
1190 				    page, pidx * sizeof(int), ip[pidx]);
1191 			}
1192 		}
1193 #endif /* DIAGNOSTIC */
1194 	}
1195 	if (n + ph->ph_nmissing != pp->pr_itemsperpage) {
1196 		printf("pool(%p:%s): page inconsistency: page %p;"
1197 		    " %d on list, %d missing, %d items per page\n", pp,
1198 		    pp->pr_wchan, ph->ph_page, n, ph->ph_nmissing,
1199 		    pp->pr_itemsperpage);
1200 		return 1;
1201 	}
1202 	if (expected >= 0 && n != expected) {
1203 		printf("pool(%p:%s): page inconsistency: page %p;"
1204 		    " %d on list, %d missing, %d expected\n", pp,
1205 		    pp->pr_wchan, ph->ph_page, n, ph->ph_nmissing,
1206 		    expected);
1207 		return 1;
1208 	}
1209 	return 0;
1210 }
1211 
1212 int
1213 pool_chk(struct pool *pp)
1214 {
1215 	struct pool_item_header *ph;
1216 	int r = 0;
1217 
1218 	TAILQ_FOREACH(ph, &pp->pr_emptypages, ph_pagelist)
1219 		r += pool_chk_page(pp, ph, pp->pr_itemsperpage);
1220 	TAILQ_FOREACH(ph, &pp->pr_fullpages, ph_pagelist)
1221 		r += pool_chk_page(pp, ph, 0);
1222 	TAILQ_FOREACH(ph, &pp->pr_partpages, ph_pagelist)
1223 		r += pool_chk_page(pp, ph, -1);
1224 
1225 	return (r);
1226 }
1227 #endif /* defined(POOL_DEBUG) || defined(DDB) */
1228 
1229 #ifdef DDB
1230 void
1231 pool_walk(struct pool *pp, int full,
1232     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))),
1233     void (*func)(void *, int, int (*)(const char *, ...)
1234 	    __attribute__((__format__(__kprintf__,1,2)))))
1235 {
1236 	struct pool_item_header *ph;
1237 	struct pool_item *pi;
1238 	caddr_t cp;
1239 	int n;
1240 
1241 	TAILQ_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) {
1242 		cp = ph->ph_colored;
1243 		n = ph->ph_nmissing;
1244 
1245 		while (n--) {
1246 			func(cp, full, pr);
1247 			cp += pp->pr_size;
1248 		}
1249 	}
1250 
1251 	TAILQ_FOREACH(ph, &pp->pr_partpages, ph_pagelist) {
1252 		cp = ph->ph_colored;
1253 		n = ph->ph_nmissing;
1254 
1255 		do {
1256 			XSIMPLEQ_FOREACH(pi, &ph->ph_itemlist, pi_list) {
1257 				if (cp == (caddr_t)pi)
1258 					break;
1259 			}
1260 			if (cp != (caddr_t)pi) {
1261 				func(cp, full, pr);
1262 				n--;
1263 			}
1264 
1265 			cp += pp->pr_size;
1266 		} while (n > 0);
1267 	}
1268 }
1269 #endif
1270 
1271 /*
1272  * We have three different sysctls.
1273  * kern.pool.npools - the number of pools.
1274  * kern.pool.pool.<pool#> - the pool struct for the pool#.
1275  * kern.pool.name.<pool#> - the name for pool#.
1276  */
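
/*
 * Illustrative userland sketch (not part of this file): reading one
 * pool's statistics by its serial number via sysctl(2).  Error handling
 * is minimal and the serial number 1 is only an example.
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <sys/pool.h>
 *	#include <stdio.h>
 *
 *	int mib[4] = { CTL_KERN, KERN_POOL, KERN_POOL_POOL, 1 };
 *	struct kinfo_pool pi;
 *	size_t len = sizeof(pi);
 *
 *	if (sysctl(mib, 4, &pi, &len, NULL, 0) == 0)
 *		printf("%u pages, %u items out\n", pi.pr_npages, pi.pr_nout);
 */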
1277 int
1278 sysctl_dopool(int *name, u_int namelen, char *oldp, size_t *oldlenp)
1279 {
1280 	struct kinfo_pool pi;
1281 	struct pool *pp;
1282 	int rv = ENOENT;
1283 
1284 	switch (name[0]) {
1285 	case KERN_POOL_NPOOLS:
1286 		if (namelen != 1)
1287 			return (ENOTDIR);
1288 		return (sysctl_rdint(oldp, oldlenp, NULL, pool_count));
1289 
1290 	case KERN_POOL_NAME:
1291 	case KERN_POOL_POOL:
1292 		break;
1293 	default:
1294 		return (EOPNOTSUPP);
1295 	}
1296 
1297 	if (namelen != 2)
1298 		return (ENOTDIR);
1299 
1300 	rw_enter_read(&pool_lock);
1301 
1302 	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) {
1303 		if (name[1] == pp->pr_serial)
1304 			break;
1305 	}
1306 
1307 	if (pp == NULL)
1308 		goto done;
1309 
1310 	switch (name[0]) {
1311 	case KERN_POOL_NAME:
1312 		rv = sysctl_rdstring(oldp, oldlenp, NULL, pp->pr_wchan);
1313 		break;
1314 	case KERN_POOL_POOL:
1315 		memset(&pi, 0, sizeof(pi));
1316 
1317 		mtx_enter(&pp->pr_mtx);
1318 		pi.pr_size = pp->pr_size;
1319 		pi.pr_pgsize = pp->pr_pgsize;
1320 		pi.pr_itemsperpage = pp->pr_itemsperpage;
1321 		pi.pr_npages = pp->pr_npages;
1322 		pi.pr_minpages = pp->pr_minpages;
1323 		pi.pr_maxpages = pp->pr_maxpages;
1324 		pi.pr_hardlimit = pp->pr_hardlimit;
1325 		pi.pr_nout = pp->pr_nout;
1326 		pi.pr_nitems = pp->pr_nitems;
1327 		pi.pr_nget = pp->pr_nget;
1328 		pi.pr_nput = pp->pr_nput;
1329 		pi.pr_nfail = pp->pr_nfail;
1330 		pi.pr_npagealloc = pp->pr_npagealloc;
1331 		pi.pr_npagefree = pp->pr_npagefree;
1332 		pi.pr_hiwat = pp->pr_hiwat;
1333 		pi.pr_nidle = pp->pr_nidle;
1334 		mtx_leave(&pp->pr_mtx);
1335 
1336 		rv = sysctl_rdstruct(oldp, oldlenp, NULL, &pi, sizeof(pi));
1337 		break;
1338 	}
1339 
1340 done:
1341 	rw_exit_read(&pool_lock);
1342 
1343 	return (rv);
1344 }
1345 
1346 void
1347 pool_gc_sched(void *null)
1348 {
1349 	task_add(systqmp, &pool_gc_task);
1350 }
1351 
1352 void
1353 pool_gc_pages(void *null)
1354 {
1355 	struct pool *pp;
1356 	struct pool_item_header *ph, *freeph;
1357 	int s;
1358 
1359 	rw_enter_read(&pool_lock);
1360 	s = splvm(); /* XXX go to splvm until all pools _setipl properly */
1361 	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) {
1362 		if (pp->pr_nidle <= pp->pr_minpages || /* guess */
1363 		    !mtx_enter_try(&pp->pr_mtx)) /* try */
1364 			continue;
1365 
1366 		/* is it time to free a page? */
1367 		if (pp->pr_nidle > pp->pr_minpages &&
1368 		    (ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL &&
1369 		    (ticks - ph->ph_tick) > (hz * pool_wait_gc)) {
1370 			freeph = ph;
1371 			pool_p_remove(pp, freeph);
1372 		} else
1373 			freeph = NULL;
1374 
1375 		mtx_leave(&pp->pr_mtx);
1376 
1377 		if (freeph != NULL)
1378 			pool_p_free(pp, freeph);
1379 	}
1380 	splx(s);
1381 	rw_exit_read(&pool_lock);
1382 
1383 	timeout_add_sec(&pool_gc_tick, 1);
1384 }
1385 
1386 /*
1387  * Pool backend allocators.
1388  */
1389 
1390 void *
1391 pool_allocator_alloc(struct pool *pp, int flags, int *slowdown)
1392 {
1393 	void *v;
1394 
1395 	v = (*pp->pr_alloc->pa_alloc)(pp, flags, slowdown);
1396 
1397 #ifdef DIAGNOSTIC
1398 	if (v != NULL && POOL_INPGHDR(pp)) {
1399 		vaddr_t addr = (vaddr_t)v;
1400 		if ((addr & pp->pr_pgmask) != addr) {
1401 			panic("%s: %s page address %p isn't aligned to %u",
1402 			    __func__, pp->pr_wchan, v, pp->pr_pgsize);
1403 		}
1404 	}
1405 #endif
1406 
1407 	return (v);
1408 }
1409 
1410 void
1411 pool_allocator_free(struct pool *pp, void *v)
1412 {
1413 	struct pool_allocator *pa = pp->pr_alloc;
1414 
1415 	(*pa->pa_free)(pp, v);
1416 }
1417 
1418 void *
1419 pool_page_alloc(struct pool *pp, int flags, int *slowdown)
1420 {
1421 	struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER;
1422 
1423 	kd.kd_waitok = ISSET(flags, PR_WAITOK);
1424 	kd.kd_slowdown = slowdown;
1425 
1426 	return (km_alloc(pp->pr_pgsize, &kv_page, pp->pr_crange, &kd));
1427 }
1428 
1429 void
1430 pool_page_free(struct pool *pp, void *v)
1431 {
1432 	km_free(v, pp->pr_pgsize, &kv_page, pp->pr_crange);
1433 }
1434 
1435 void *
1436 pool_multi_alloc(struct pool *pp, int flags, int *slowdown)
1437 {
1438 	struct kmem_va_mode kv = kv_intrsafe;
1439 	struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER;
1440 	void *v;
1441 	int s;
1442 
1443 	if (POOL_INPGHDR(pp))
1444 		kv.kv_align = pp->pr_pgsize;
1445 
1446 	kd.kd_waitok = ISSET(flags, PR_WAITOK);
1447 	kd.kd_slowdown = slowdown;
1448 
1449 	s = splvm();
1450 	v = km_alloc(pp->pr_pgsize, &kv, pp->pr_crange, &kd);
1451 	splx(s);
1452 
1453 	return (v);
1454 }
1455 
1456 void
1457 pool_multi_free(struct pool *pp, void *v)
1458 {
1459 	struct kmem_va_mode kv = kv_intrsafe;
1460 	int s;
1461 
1462 	if (POOL_INPGHDR(pp))
1463 		kv.kv_align = pp->pr_pgsize;
1464 
1465 	s = splvm();
1466 	km_free(v, pp->pr_pgsize, &kv, pp->pr_crange);
1467 	splx(s);
1468 }
1469 
1470 void *
1471 pool_multi_alloc_ni(struct pool *pp, int flags, int *slowdown)
1472 {
1473 	struct kmem_va_mode kv = kv_any;
1474 	struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER;
1475 	void *v;
1476 
1477 	if (POOL_INPGHDR(pp))
1478 		kv.kv_align = pp->pr_pgsize;
1479 
1480 	kd.kd_waitok = ISSET(flags, PR_WAITOK);
1481 	kd.kd_slowdown = slowdown;
1482 
1483 	KERNEL_LOCK();
1484 	v = km_alloc(pp->pr_pgsize, &kv, pp->pr_crange, &kd);
1485 	KERNEL_UNLOCK();
1486 
1487 	return (v);
1488 }
1489 
1490 void
1491 pool_multi_free_ni(struct pool *pp, void *v)
1492 {
1493 	struct kmem_va_mode kv = kv_any;
1494 
1495 	if (POOL_INPGHDR(pp))
1496 		kv.kv_align = pp->pr_pgsize;
1497 
1498 	KERNEL_LOCK();
1499 	km_free(v, pp->pr_pgsize, &kv, pp->pr_crange);
1500 	KERNEL_UNLOCK();
1501 }
1502