xref: /openbsd-src/sys/kern/subr_pool.c (revision 13499c96e2694e274321ebf82bf157afe0e4d8db)
1 /*	$OpenBSD: subr_pool.c,v 1.191 2015/09/08 13:37:21 kettenis Exp $	*/
2 /*	$NetBSD: subr_pool.c,v 1.61 2001/09/26 07:14:56 chs Exp $	*/
3 
4 /*-
5  * Copyright (c) 1997, 1999, 2000 The NetBSD Foundation, Inc.
6  * All rights reserved.
7  *
8  * This code is derived from software contributed to The NetBSD Foundation
9  * by Paul Kranenburg; by Jason R. Thorpe of the Numerical Aerospace
10  * Simulation Facility, NASA Ames Research Center.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
25  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/errno.h>
37 #include <sys/kernel.h>
38 #include <sys/malloc.h>
39 #include <sys/pool.h>
40 #include <sys/syslog.h>
41 #include <sys/rwlock.h>
42 #include <sys/sysctl.h>
43 #include <sys/task.h>
44 #include <sys/timeout.h>
45 
46 #include <uvm/uvm_extern.h>
47 
48 /*
49  * Pool resource management utility.
50  *
51  * Memory is allocated in pages which are split into pieces according to
52  * the pool item size. Each page is kept on one of three lists in the
53  * pool structure: `pr_emptypages', `pr_fullpages' and `pr_partpages',
54  * for empty, full and partially-full pages respectively. The individual
55  * pool items are on a linked list headed by `ph_itemlist' in each page
56  * header. The memory for building the page list is either taken from
57  * the allocated pages themselves (for small pool items) or taken from
58  * an internal pool of page headers (`phpool').
59  */
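
/*
 * Illustrative usage sketch (not part of this file): a consumer
 * declares a pool for its fixed-size objects, initializes it once,
 * and then gets and puts items instead of calling malloc()/free().
 * The "examplepl" pool and "struct example_softc" below are
 * hypothetical names.
 *
 *	struct pool examplepl;
 *
 *	pool_init(&examplepl, sizeof(struct example_softc), 0, 0, 0,
 *	    "examplepl", NULL);
 *	pool_setipl(&examplepl, IPL_NET);
 *
 *	sc = pool_get(&examplepl, PR_WAITOK | PR_ZERO);
 *	...
 *	pool_put(&examplepl, sc);
 */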
60 
61 /* List of all pools */
62 SIMPLEQ_HEAD(,pool) pool_head = SIMPLEQ_HEAD_INITIALIZER(pool_head);
63 
64 /*
65  * Every pool gets a unique serial number assigned to it. If this counter
66  * wraps, we're screwed, but we shouldn't create so many pools anyway.
67  */
68 unsigned int pool_serial;
69 unsigned int pool_count;
70 
71 /* Lock protecting the above variables that make up the global pool state */
72 struct rwlock pool_lock = RWLOCK_INITIALIZER("pools");
73 
74 /* Private pool for page header structures */
75 struct pool phpool;
76 
77 struct pool_item_header {
78 	/* Page headers */
79 	TAILQ_ENTRY(pool_item_header)
80 				ph_pagelist;	/* pool page list */
81 	XSIMPLEQ_HEAD(,pool_item) ph_itemlist;	/* chunk list for this page */
82 	RB_ENTRY(pool_item_header)
83 				ph_node;	/* Off-page page headers */
84 	int			ph_nmissing;	/* # of chunks in use */
85 	caddr_t			ph_page;	/* this page's address */
86 	caddr_t			ph_colored;	/* page's colored address */
87 	u_long			ph_magic;
88 	int			ph_tick;
89 };
90 #define POOL_MAGICBIT (1 << 3) /* keep away from perturbed low bits */
91 #define POOL_PHPOISON(ph) ISSET((ph)->ph_magic, POOL_MAGICBIT)
92 
93 struct pool_item {
94 	u_long				pi_magic;
95 	XSIMPLEQ_ENTRY(pool_item)	pi_list;
96 };
97 #define POOL_IMAGIC(ph, pi) ((u_long)(pi) ^ (ph)->ph_magic)
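
/*
 * Each free item carries pi_magic: its own address XORed with the
 * per-page random ph_magic, so a corrupted free list entry is caught
 * when the item is handed out, returned, or checked.  Under
 * DIAGNOSTIC, pages allocated while pool_debug is set also have the
 * rest of each free item filled with a poison pattern; POOL_MAGICBIT
 * in ph_magic records whether that was done for the page.
 */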
98 
99 #ifdef POOL_DEBUG
100 int	pool_debug = 1;
101 #else
102 int	pool_debug = 0;
103 #endif
104 
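/*
 * The page header is stored inside the pool page when pr_phoffset
 * is non-zero.
 */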
105 #define POOL_INPGHDR(pp) ((pp)->pr_phoffset != 0)
106 
107 struct pool_item_header *
108 	 pool_p_alloc(struct pool *, int, int *);
109 void	 pool_p_insert(struct pool *, struct pool_item_header *);
110 void	 pool_p_remove(struct pool *, struct pool_item_header *);
111 void	 pool_p_free(struct pool *, struct pool_item_header *);
112 
113 void	 pool_update_curpage(struct pool *);
114 void	*pool_do_get(struct pool *, int, int *);
115 int	 pool_chk_page(struct pool *, struct pool_item_header *, int);
116 int	 pool_chk(struct pool *);
117 void	 pool_get_done(void *, void *);
118 void	 pool_runqueue(struct pool *, int);
119 
120 void	*pool_allocator_alloc(struct pool *, int, int *);
121 void	 pool_allocator_free(struct pool *, void *);
122 
123 /*
124  * The default pool allocator.
125  */
126 void	*pool_page_alloc(struct pool *, int, int *);
127 void	pool_page_free(struct pool *, void *);
128 
129 /*
130  * Safe for interrupts; name preserved for compatibility. This is the
131  * default allocator.
132  */
133 struct pool_allocator pool_allocator_nointr = {
134 	pool_page_alloc,
135 	pool_page_free
136 };
137 
138 void	*pool_large_alloc(struct pool *, int, int *);
139 void	pool_large_free(struct pool *, void *);
140 
141 struct pool_allocator pool_allocator_large = {
142 	pool_large_alloc,
143 	pool_large_free
144 };
145 
146 void	*pool_large_alloc_ni(struct pool *, int, int *);
147 void	pool_large_free_ni(struct pool *, void *);
148 
149 struct pool_allocator pool_allocator_large_ni = {
150 	pool_large_alloc_ni,
151 	pool_large_free_ni
152 };
153 
154 #ifdef DDB
155 void	 pool_print_pagelist(struct pool_pagelist *, int (*)(const char *, ...)
156 	     __attribute__((__format__(__kprintf__,1,2))));
157 void	 pool_print1(struct pool *, const char *, int (*)(const char *, ...)
158 	     __attribute__((__format__(__kprintf__,1,2))));
159 #endif
160 
161 /* stale page garbage collectors */
162 void	pool_gc_sched(void *);
163 struct timeout pool_gc_tick = TIMEOUT_INITIALIZER(pool_gc_sched, NULL);
164 void	pool_gc_pages(void *);
165 struct task pool_gc_task = TASK_INITIALIZER(pool_gc_pages, NULL);
166 int pool_wait_free = 1;
167 int pool_wait_gc = 8;
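
/*
 * The garbage collector runs off a one second timeout that queues a
 * task on systqmp; the task walks every pool and frees at most one
 * completely idle page per pool that has more than pr_minpages idle
 * pages, provided that page has sat unused for more than pool_wait_gc
 * seconds.  pool_put() itself frees an idle page once more than
 * pr_maxpages pages are idle and the page has been idle for more than
 * pool_wait_free seconds.
 */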
168 
169 static inline int
170 phtree_compare(struct pool_item_header *a, struct pool_item_header *b)
171 {
172 	vaddr_t va = (vaddr_t)a->ph_page;
173 	vaddr_t vb = (vaddr_t)b->ph_page;
174 
175 	/* the comparisons must be in this order for RB_NFIND to work */
176 	if (vb < va)
177 		return (-1);
178 	if (vb > va)
179 		return (1);
180 
181 	return (0);
182 }
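
/*
 * Note the reversed comparison: the tree is ordered by descending
 * page address, so the RB_NFIND() in pr_find_pagehead() below finds
 * the header with the highest ph_page that is not greater than the
 * item being looked up, i.e. the header of the page containing it.
 */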
183 
184 RB_PROTOTYPE(phtree, pool_item_header, ph_node, phtree_compare);
185 RB_GENERATE(phtree, pool_item_header, ph_node, phtree_compare);
186 
187 /*
188  * Return the pool page header based on page address.
189  */
190 static inline struct pool_item_header *
191 pr_find_pagehead(struct pool *pp, void *v)
192 {
193 	struct pool_item_header *ph, key;
194 
195 	if (POOL_INPGHDR(pp)) {
196 		caddr_t page;
197 
198 		page = (caddr_t)((vaddr_t)v & pp->pr_pgmask);
199 
200 		return ((struct pool_item_header *)(page + pp->pr_phoffset));
201 	}
202 
203 	key.ph_page = v;
204 	ph = RB_NFIND(phtree, &pp->pr_phtree, &key);
205 	if (ph == NULL)
206 		panic("%s: %s: page header missing", __func__, pp->pr_wchan);
207 
208 	KASSERT(ph->ph_page <= (caddr_t)v);
209 	if (ph->ph_page + pp->pr_pgsize <= (caddr_t)v)
210 		panic("%s: %s: incorrect page", __func__, pp->pr_wchan);
211 
212 	return (ph);
213 }
214 
215 /*
216  * Initialize the given pool resource structure.
217  *
218  * We export this routine to allow other kernel parts to declare
219  * static pools that must be initialized before malloc() is available.
220  */
221 void
222 pool_init(struct pool *pp, size_t size, u_int align, u_int ioff, int flags,
223     const char *wchan, struct pool_allocator *palloc)
224 {
225 	int off = 0, space;
226 	unsigned int pgsize = PAGE_SIZE, items;
227 #ifdef DIAGNOSTIC
228 	struct pool *iter;
229 	KASSERT(ioff == 0);
230 #endif
231 
232 	if (align == 0)
233 		align = ALIGN(1);
234 
235 	if (size < sizeof(struct pool_item))
236 		size = sizeof(struct pool_item);
237 
238 	size = roundup(size, align);
239 
240 	if (palloc == NULL) {
241 		while (size * 8 > pgsize)
242 			pgsize <<= 1;
243 
244 		if (pgsize > PAGE_SIZE) {
245 			palloc = ISSET(flags, PR_WAITOK) ?
246 			    &pool_allocator_large_ni : &pool_allocator_large;
247 		} else
248 			palloc = &pool_allocator_nointr;
249 	} else
250 		pgsize = palloc->pa_pagesz ? palloc->pa_pagesz : PAGE_SIZE;
251 
252 	items = pgsize / size;
253 
254 	/*
255 	 * Decide whether to put the page header off page to avoid
256 	 * wasting too large a part of the page. Off-page page headers
257 	 * go into an RB tree, so we can match a returned item with
258 	 * its header based on the page address.
259 	 */
260 	if (pgsize - (size * items) > sizeof(struct pool_item_header)) {
261 		off = pgsize - sizeof(struct pool_item_header);
262 	} else if (sizeof(struct pool_item_header) * 2 >= size) {
263 		off = pgsize - sizeof(struct pool_item_header);
264 		items = off / size;
265 	}
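
	/*
	 * For example, on a machine with 4096 byte pages, 1024 byte
	 * items fill a page exactly; the header is small compared to
	 * an item, so it is kept off-page in phpool and all four
	 * items survive.  Items no bigger than roughly two headers
	 * instead keep the header at the end of the page and give up
	 * the items it displaces.
	 */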
266 
267 	KASSERT(items > 0);
268 
269 	/*
270 	 * Initialize the pool structure.
271 	 */
272 	memset(pp, 0, sizeof(*pp));
273 	TAILQ_INIT(&pp->pr_emptypages);
274 	TAILQ_INIT(&pp->pr_fullpages);
275 	TAILQ_INIT(&pp->pr_partpages);
276 	pp->pr_curpage = NULL;
277 	pp->pr_npages = 0;
278 	pp->pr_minitems = 0;
279 	pp->pr_minpages = 0;
280 	pp->pr_maxpages = 8;
281 	pp->pr_size = size;
282 	pp->pr_pgsize = pgsize;
283 	pp->pr_pgmask = ~0UL ^ (pgsize - 1);
284 	pp->pr_phoffset = off;
285 	pp->pr_itemsperpage = items;
286 	pp->pr_wchan = wchan;
287 	pp->pr_alloc = palloc;
288 	pp->pr_nitems = 0;
289 	pp->pr_nout = 0;
290 	pp->pr_hardlimit = UINT_MAX;
291 	pp->pr_hardlimit_warning = NULL;
292 	pp->pr_hardlimit_ratecap.tv_sec = 0;
293 	pp->pr_hardlimit_ratecap.tv_usec = 0;
294 	pp->pr_hardlimit_warning_last.tv_sec = 0;
295 	pp->pr_hardlimit_warning_last.tv_usec = 0;
296 	RB_INIT(&pp->pr_phtree);
297 
298 	/*
299 	 * Use the space between the chunks and the page header
300 	 * for cache coloring.
301 	 */
302 	space = POOL_INPGHDR(pp) ? pp->pr_phoffset : pp->pr_pgsize;
303 	space -= pp->pr_itemsperpage * pp->pr_size;
304 	pp->pr_align = align;
305 	pp->pr_maxcolors = (space / align) + 1;
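
	/*
	 * pool_p_alloc() rotates each new page's first item through
	 * these color offsets (pr_align bytes apart), so items on
	 * different pages do not all start at the same offset within
	 * their page.
	 */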
306 
307 	pp->pr_nget = 0;
308 	pp->pr_nfail = 0;
309 	pp->pr_nput = 0;
310 	pp->pr_npagealloc = 0;
311 	pp->pr_npagefree = 0;
312 	pp->pr_hiwat = 0;
313 	pp->pr_nidle = 0;
314 
315 	pp->pr_ipl = -1;
316 	mtx_init(&pp->pr_mtx, IPL_NONE);
317 	mtx_init(&pp->pr_requests_mtx, IPL_NONE);
318 	TAILQ_INIT(&pp->pr_requests);
319 
320 	if (phpool.pr_size == 0) {
321 		pool_init(&phpool, sizeof(struct pool_item_header), 0, 0,
322 		    0, "phpool", NULL);
323 		pool_setipl(&phpool, IPL_HIGH);
324 
325 		/* make sure phpool won't "recurse" */
326 		KASSERT(POOL_INPGHDR(&phpool));
327 	}
328 
329 	/* pglistalloc/constraint parameters */
330 	pp->pr_crange = &kp_dirty;
331 
332 	/* Insert this into the list of all pools. */
333 	rw_enter_write(&pool_lock);
334 #ifdef DIAGNOSTIC
335 	SIMPLEQ_FOREACH(iter, &pool_head, pr_poollist) {
336 		if (iter == pp)
337 			panic("%s: pool %s already on list", __func__, wchan);
338 	}
339 #endif
340 
341 	pp->pr_serial = ++pool_serial;
342 	if (pool_serial == 0)
343 		panic("%s: too much uptime", __func__);
344 
345 	SIMPLEQ_INSERT_HEAD(&pool_head, pp, pr_poollist);
346 	pool_count++;
347 	rw_exit_write(&pool_lock);
348 }
349 
350 void
351 pool_setipl(struct pool *pp, int ipl)
352 {
353 	pp->pr_ipl = ipl;
354 	mtx_init(&pp->pr_mtx, ipl);
355 	mtx_init(&pp->pr_requests_mtx, ipl);
356 }
357 
358 /*
359  * Decommission a pool resource.
360  */
361 void
362 pool_destroy(struct pool *pp)
363 {
364 	struct pool_item_header *ph;
365 	struct pool *prev, *iter;
366 
367 #ifdef DIAGNOSTIC
368 	if (pp->pr_nout != 0)
369 		panic("%s: pool busy: still out: %u", __func__, pp->pr_nout);
370 #endif
371 
372 	/* Remove from global pool list */
373 	rw_enter_write(&pool_lock);
374 	pool_count--;
375 	if (pp == SIMPLEQ_FIRST(&pool_head))
376 		SIMPLEQ_REMOVE_HEAD(&pool_head, pr_poollist);
377 	else {
378 		prev = SIMPLEQ_FIRST(&pool_head);
379 		SIMPLEQ_FOREACH(iter, &pool_head, pr_poollist) {
380 			if (iter == pp) {
381 				SIMPLEQ_REMOVE_AFTER(&pool_head, prev,
382 				    pr_poollist);
383 				break;
384 			}
385 			prev = iter;
386 		}
387 	}
388 	rw_exit_write(&pool_lock);
389 
390 	/* Remove all pages */
391 	while ((ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL) {
392 		mtx_enter(&pp->pr_mtx);
393 		pool_p_remove(pp, ph);
394 		mtx_leave(&pp->pr_mtx);
395 		pool_p_free(pp, ph);
396 	}
397 	KASSERT(TAILQ_EMPTY(&pp->pr_fullpages));
398 	KASSERT(TAILQ_EMPTY(&pp->pr_partpages));
399 }
400 
401 void
402 pool_request_init(struct pool_request *pr,
403     void (*handler)(void *, void *), void *cookie)
404 {
405 	pr->pr_handler = handler;
406 	pr->pr_cookie = cookie;
407 	pr->pr_item = NULL;
408 }
409 
410 void
411 pool_request(struct pool *pp, struct pool_request *pr)
412 {
413 	mtx_enter(&pp->pr_requests_mtx);
414 	TAILQ_INSERT_TAIL(&pp->pr_requests, pr, pr_entry);
415 	pool_runqueue(pp, PR_NOWAIT);
416 	mtx_leave(&pp->pr_requests_mtx);
417 }
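
/*
 * Illustrative asynchronous use (hypothetical caller): instead of
 * sleeping in pool_get(), a subsystem can queue a pool_request and
 * have its handler called with the item once one can be allocated,
 * typically from pool_runqueue() run by a later pool_put().  The
 * "example_handler", "sc_pr" and "examplepl" names are hypothetical.
 *
 *	void
 *	example_handler(void *cookie, void *item)
 *	{
 *		struct example_softc *sc = cookie;
 *		... take ownership of item ...
 *	}
 *
 *	pool_request_init(&sc->sc_pr, example_handler, sc);
 *	pool_request(&examplepl, &sc->sc_pr);
 */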
418 
419 struct pool_get_memory {
420 	struct mutex mtx;
421 	void * volatile v;
422 };
423 
424 /*
425  * Grab an item from the pool.
426  */
427 void *
428 pool_get(struct pool *pp, int flags)
429 {
430 	void *v = NULL;
431 	int slowdown = 0;
432 
433 	KASSERT(flags & (PR_WAITOK | PR_NOWAIT));
434 
435 
436 	mtx_enter(&pp->pr_mtx);
437 	if (pp->pr_nout >= pp->pr_hardlimit) {
438 		if (ISSET(flags, PR_NOWAIT|PR_LIMITFAIL))
439 			goto fail;
440 	} else if ((v = pool_do_get(pp, flags, &slowdown)) == NULL) {
441 		if (ISSET(flags, PR_NOWAIT))
442 			goto fail;
443 	}
444 	mtx_leave(&pp->pr_mtx);
445 
446 	if (slowdown && ISSET(flags, PR_WAITOK))
447 		yield();
448 
449 	if (v == NULL) {
450 		struct pool_get_memory mem = {
451 		    MUTEX_INITIALIZER((pp->pr_ipl == -1) ?
452 		    IPL_NONE : pp->pr_ipl), NULL };
453 		struct pool_request pr;
454 
455 		pool_request_init(&pr, pool_get_done, &mem);
456 		pool_request(pp, &pr);
457 
458 		mtx_enter(&mem.mtx);
459 		while (mem.v == NULL)
460 			msleep(&mem, &mem.mtx, PSWP, pp->pr_wchan, 0);
461 		mtx_leave(&mem.mtx);
462 
463 		v = mem.v;
464 	}
465 
466 	if (ISSET(flags, PR_ZERO))
467 		memset(v, 0, pp->pr_size);
468 
469 	return (v);
470 
471 fail:
472 	pp->pr_nfail++;
473 	mtx_leave(&pp->pr_mtx);
474 	return (NULL);
475 }
476 
477 void
478 pool_get_done(void *xmem, void *v)
479 {
480 	struct pool_get_memory *mem = xmem;
481 
482 	mtx_enter(&mem->mtx);
483 	mem->v = v;
484 	mtx_leave(&mem->mtx);
485 
486 	wakeup_one(mem);
487 }
488 
489 void
490 pool_runqueue(struct pool *pp, int flags)
491 {
492 	struct pool_requests prl = TAILQ_HEAD_INITIALIZER(prl);
493 	struct pool_request *pr;
494 
495 	MUTEX_ASSERT_UNLOCKED(&pp->pr_mtx);
496 	MUTEX_ASSERT_LOCKED(&pp->pr_requests_mtx);
497 
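	/*
	 * Only one thread drains the request queue at a time.
	 * pr_requesting counts callers that arrived while the drainer
	 * was busy; the loop below runs until that count drops back
	 * to zero, so requests queued concurrently are not left
	 * behind.
	 */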
498 	if (pp->pr_requesting++)
499 		return;
500 
501 	do {
502 		pp->pr_requesting = 1;
503 
504 		/* no TAILQ_JOIN? :( */
505 		while ((pr = TAILQ_FIRST(&pp->pr_requests)) != NULL) {
506 			TAILQ_REMOVE(&pp->pr_requests, pr, pr_entry);
507 			TAILQ_INSERT_TAIL(&prl, pr, pr_entry);
508 		}
509 		if (TAILQ_EMPTY(&prl))
510 			continue;
511 
512 		mtx_leave(&pp->pr_requests_mtx);
513 
514 		mtx_enter(&pp->pr_mtx);
515 		pr = TAILQ_FIRST(&prl);
516 		while (pr != NULL) {
517 			int slowdown = 0;
518 
519 			if (pp->pr_nout >= pp->pr_hardlimit)
520 				break;
521 
522 			pr->pr_item = pool_do_get(pp, flags, &slowdown);
523 			if (pr->pr_item == NULL) /* || slowdown ? */
524 				break;
525 
526 			pr = TAILQ_NEXT(pr, pr_entry);
527 		}
528 		mtx_leave(&pp->pr_mtx);
529 
530 		while ((pr = TAILQ_FIRST(&prl)) != NULL &&
531 		    pr->pr_item != NULL) {
532 			TAILQ_REMOVE(&prl, pr, pr_entry);
533 			(*pr->pr_handler)(pr->pr_cookie, pr->pr_item);
534 		}
535 
536 		mtx_enter(&pp->pr_requests_mtx);
537 	} while (--pp->pr_requesting);
538 
539 	/* no TAILQ_JOIN :( */
540 	while ((pr = TAILQ_FIRST(&prl)) != NULL) {
541 		TAILQ_REMOVE(&prl, pr, pr_entry);
542 		TAILQ_INSERT_TAIL(&pp->pr_requests, pr, pr_entry);
543 	}
544 }
545 
546 void *
547 pool_do_get(struct pool *pp, int flags, int *slowdown)
548 {
549 	struct pool_item *pi;
550 	struct pool_item_header *ph;
551 
552 	MUTEX_ASSERT_LOCKED(&pp->pr_mtx);
553 
554 	if (pp->pr_ipl != -1)
555 		splassert(pp->pr_ipl);
556 
557 	/*
558 	 * Account for this item now to avoid races if we need to give up
559 	 * pr_mtx to allocate a page.
560 	 */
561 	pp->pr_nout++;
562 
563 	if (pp->pr_curpage == NULL) {
564 		mtx_leave(&pp->pr_mtx);
565 		ph = pool_p_alloc(pp, flags, slowdown);
566 		mtx_enter(&pp->pr_mtx);
567 
568 		if (ph == NULL) {
569 			pp->pr_nout--;
570 			return (NULL);
571 		}
572 
573 		pool_p_insert(pp, ph);
574 	}
575 
576 	ph = pp->pr_curpage;
577 	pi = XSIMPLEQ_FIRST(&ph->ph_itemlist);
578 	if (__predict_false(pi == NULL))
579 		panic("%s: %s: page empty", __func__, pp->pr_wchan);
580 
581 	if (__predict_false(pi->pi_magic != POOL_IMAGIC(ph, pi))) {
582 		panic("%s: %s free list modified: "
583 		    "page %p; item addr %p; offset 0x%x=0x%lx != 0x%lx",
584 		    __func__, pp->pr_wchan, ph->ph_page, pi,
585 		    0, pi->pi_magic, POOL_IMAGIC(ph, pi));
586 	}
587 
588 	XSIMPLEQ_REMOVE_HEAD(&ph->ph_itemlist, pi_list);
589 
590 #ifdef DIAGNOSTIC
591 	if (pool_debug && POOL_PHPOISON(ph)) {
592 		size_t pidx;
593 		uint32_t pval;
594 		if (poison_check(pi + 1, pp->pr_size - sizeof(*pi),
595 		    &pidx, &pval)) {
596 			int *ip = (int *)(pi + 1);
597 			panic("%s: %s free list modified: "
598 			    "page %p; item addr %p; offset 0x%zx=0x%x",
599 			    __func__, pp->pr_wchan, ph->ph_page, pi,
600 			    pidx * sizeof(int), ip[pidx]);
601 		}
602 	}
603 #endif /* DIAGNOSTIC */
604 
605 	if (ph->ph_nmissing++ == 0) {
606 		/*
607 		 * This page was previously empty.  Move it to the list of
608 		 * partially-full pages.  This page is already curpage.
609 		 */
610 		TAILQ_REMOVE(&pp->pr_emptypages, ph, ph_pagelist);
611 		TAILQ_INSERT_TAIL(&pp->pr_partpages, ph, ph_pagelist);
612 
613 		pp->pr_nidle--;
614 	}
615 
616 	if (ph->ph_nmissing == pp->pr_itemsperpage) {
617 		/*
618 		 * This page is now full.  Move it to the full list
619 		 * and select a new current page.
620 		 */
621 		TAILQ_REMOVE(&pp->pr_partpages, ph, ph_pagelist);
622 		TAILQ_INSERT_TAIL(&pp->pr_fullpages, ph, ph_pagelist);
623 		pool_update_curpage(pp);
624 	}
625 
626 	pp->pr_nget++;
627 
628 	return (pi);
629 }
630 
631 /*
632  * Return resource to the pool.
633  */
634 void
635 pool_put(struct pool *pp, void *v)
636 {
637 	struct pool_item *pi = v;
638 	struct pool_item_header *ph, *freeph = NULL;
639 
640 #ifdef DIAGNOSTIC
641 	if (v == NULL)
642 		panic("%s: NULL item", __func__);
643 #endif
644 
645 	mtx_enter(&pp->pr_mtx);
646 
647 	if (pp->pr_ipl != -1)
648 		splassert(pp->pr_ipl);
649 
650 	ph = pr_find_pagehead(pp, v);
651 
652 #ifdef DIAGNOSTIC
653 	if (pool_debug) {
654 		struct pool_item *qi;
655 		XSIMPLEQ_FOREACH(qi, &ph->ph_itemlist, pi_list) {
656 			if (pi == qi) {
657 				panic("%s: %s: double pool_put: %p", __func__,
658 				    pp->pr_wchan, pi);
659 			}
660 		}
661 	}
662 #endif /* DIAGNOSTIC */
663 
664 	pi->pi_magic = POOL_IMAGIC(ph, pi);
665 	XSIMPLEQ_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list);
666 #ifdef DIAGNOSTIC
667 	if (POOL_PHPOISON(ph))
668 		poison_mem(pi + 1, pp->pr_size - sizeof(*pi));
669 #endif /* DIAGNOSTIC */
670 
671 	if (ph->ph_nmissing-- == pp->pr_itemsperpage) {
672 		/*
673 		 * The page was previously completely full, move it to the
674 		 * partially-full list.
675 		 */
676 		TAILQ_REMOVE(&pp->pr_fullpages, ph, ph_pagelist);
677 		TAILQ_INSERT_TAIL(&pp->pr_partpages, ph, ph_pagelist);
678 	}
679 
680 	if (ph->ph_nmissing == 0) {
681 		/*
682 		 * The page is now empty, so move it to the empty page list.
683 		 */
684 		pp->pr_nidle++;
685 
686 		ph->ph_tick = ticks;
687 		TAILQ_REMOVE(&pp->pr_partpages, ph, ph_pagelist);
688 		TAILQ_INSERT_TAIL(&pp->pr_emptypages, ph, ph_pagelist);
689 		pool_update_curpage(pp);
690 	}
691 
692 	pp->pr_nout--;
693 	pp->pr_nput++;
694 
695 	/* is it time to free a page? */
696 	if (pp->pr_nidle > pp->pr_maxpages &&
697 	    (ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL &&
698 	    (ticks - ph->ph_tick) > (hz * pool_wait_free)) {
699 		freeph = ph;
700 		pool_p_remove(pp, freeph);
701 	}
702 	mtx_leave(&pp->pr_mtx);
703 
704 	if (freeph != NULL)
705 		pool_p_free(pp, freeph);
706 
707 	mtx_enter(&pp->pr_requests_mtx);
708 	pool_runqueue(pp, PR_NOWAIT);
709 	mtx_leave(&pp->pr_requests_mtx);
710 }
711 
712 /*
713  * Add N items to the pool.
714  */
715 int
716 pool_prime(struct pool *pp, int n)
717 {
718 	struct pool_pagelist pl = TAILQ_HEAD_INITIALIZER(pl);
719 	struct pool_item_header *ph;
720 	int newpages;
721 
722 	newpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
723 
724 	while (newpages-- > 0) {
725 		int slowdown = 0;
726 
727 		ph = pool_p_alloc(pp, PR_NOWAIT, &slowdown);
728 		if (ph == NULL) /* or slowdown? */
729 			break;
730 
731 		TAILQ_INSERT_TAIL(&pl, ph, ph_pagelist);
732 	}
733 
734 	mtx_enter(&pp->pr_mtx);
735 	while ((ph = TAILQ_FIRST(&pl)) != NULL) {
736 		TAILQ_REMOVE(&pl, ph, ph_pagelist);
737 		pool_p_insert(pp, ph);
738 	}
739 	mtx_leave(&pp->pr_mtx);
740 
741 	return (0);
742 }
743 
744 struct pool_item_header *
745 pool_p_alloc(struct pool *pp, int flags, int *slowdown)
746 {
747 	struct pool_item_header *ph;
748 	struct pool_item *pi;
749 	caddr_t addr;
750 	int n;
751 
752 	MUTEX_ASSERT_UNLOCKED(&pp->pr_mtx);
753 	KASSERT(pp->pr_size >= sizeof(*pi));
754 
755 	addr = pool_allocator_alloc(pp, flags, slowdown);
756 	if (addr == NULL)
757 		return (NULL);
758 
759 	if (POOL_INPGHDR(pp))
760 		ph = (struct pool_item_header *)(addr + pp->pr_phoffset);
761 	else {
762 		ph = pool_get(&phpool, flags);
763 		if (ph == NULL) {
764 			pool_allocator_free(pp, addr);
765 			return (NULL);
766 		}
767 	}
768 
769 	XSIMPLEQ_INIT(&ph->ph_itemlist);
770 	ph->ph_page = addr;
771 	addr += pp->pr_align * (pp->pr_npagealloc % pp->pr_maxcolors);
772 	ph->ph_colored = addr;
773 	ph->ph_nmissing = 0;
774 	arc4random_buf(&ph->ph_magic, sizeof(ph->ph_magic));
775 #ifdef DIAGNOSTIC
776 	/* use a bit in ph_magic to record if we poison page items */
777 	if (pool_debug)
778 		SET(ph->ph_magic, POOL_MAGICBIT);
779 	else
780 		CLR(ph->ph_magic, POOL_MAGICBIT);
781 #endif /* DIAGNOSTIC */
782 
783 	n = pp->pr_itemsperpage;
784 	while (n--) {
785 		pi = (struct pool_item *)addr;
786 		pi->pi_magic = POOL_IMAGIC(ph, pi);
787 		XSIMPLEQ_INSERT_TAIL(&ph->ph_itemlist, pi, pi_list);
788 
789 #ifdef DIAGNOSTIC
790 		if (POOL_PHPOISON(ph))
791 			poison_mem(pi + 1, pp->pr_size - sizeof(*pi));
792 #endif /* DIAGNOSTIC */
793 
794 		addr += pp->pr_size;
795 	}
796 
797 	return (ph);
798 }
799 
800 void
801 pool_p_free(struct pool *pp, struct pool_item_header *ph)
802 {
803 	struct pool_item *pi;
804 
805 	MUTEX_ASSERT_UNLOCKED(&pp->pr_mtx);
806 	KASSERT(ph->ph_nmissing == 0);
807 
808 	XSIMPLEQ_FOREACH(pi, &ph->ph_itemlist, pi_list) {
809 		if (__predict_false(pi->pi_magic != POOL_IMAGIC(ph, pi))) {
810 			panic("%s: %s free list modified: "
811 			    "page %p; item addr %p; offset 0x%x=0x%lx",
812 			    __func__, pp->pr_wchan, ph->ph_page, pi,
813 			    0, pi->pi_magic);
814 		}
815 
816 #ifdef DIAGNOSTIC
817 		if (POOL_PHPOISON(ph)) {
818 			size_t pidx;
819 			uint32_t pval;
820 			if (poison_check(pi + 1, pp->pr_size - sizeof(*pi),
821 			    &pidx, &pval)) {
822 				int *ip = (int *)(pi + 1);
823 				panic("%s: %s free list modified: "
824 				    "page %p; item addr %p; offset 0x%zx=0x%x",
825 				    __func__, pp->pr_wchan, ph->ph_page, pi,
826 				    pidx * sizeof(int), ip[pidx]);
827 			}
828 		}
829 #endif
830 	}
831 
832 	pool_allocator_free(pp, ph->ph_page);
833 
834 	if (!POOL_INPGHDR(pp))
835 		pool_put(&phpool, ph);
836 }
837 
838 void
839 pool_p_insert(struct pool *pp, struct pool_item_header *ph)
840 {
841 	MUTEX_ASSERT_LOCKED(&pp->pr_mtx);
842 
843 	/* If the pool was depleted, point at the new page */
844 	if (pp->pr_curpage == NULL)
845 		pp->pr_curpage = ph;
846 
847 	TAILQ_INSERT_TAIL(&pp->pr_emptypages, ph, ph_pagelist);
848 	if (!POOL_INPGHDR(pp))
849 		RB_INSERT(phtree, &pp->pr_phtree, ph);
850 
851 	pp->pr_nitems += pp->pr_itemsperpage;
852 	pp->pr_nidle++;
853 
854 	pp->pr_npagealloc++;
855 	if (++pp->pr_npages > pp->pr_hiwat)
856 		pp->pr_hiwat = pp->pr_npages;
857 }
858 
859 void
860 pool_p_remove(struct pool *pp, struct pool_item_header *ph)
861 {
862 	MUTEX_ASSERT_LOCKED(&pp->pr_mtx);
863 
864 	pp->pr_npagefree++;
865 	pp->pr_npages--;
866 	pp->pr_nidle--;
867 	pp->pr_nitems -= pp->pr_itemsperpage;
868 
869 	if (!POOL_INPGHDR(pp))
870 		RB_REMOVE(phtree, &pp->pr_phtree, ph);
871 	TAILQ_REMOVE(&pp->pr_emptypages, ph, ph_pagelist);
872 
873 	pool_update_curpage(pp);
874 }
875 
876 void
877 pool_update_curpage(struct pool *pp)
878 {
879 	pp->pr_curpage = TAILQ_LAST(&pp->pr_partpages, pool_pagelist);
880 	if (pp->pr_curpage == NULL) {
881 		pp->pr_curpage = TAILQ_LAST(&pp->pr_emptypages, pool_pagelist);
882 	}
883 }
884 
885 void
886 pool_setlowat(struct pool *pp, int n)
887 {
888 	int prime = 0;
889 
890 	mtx_enter(&pp->pr_mtx);
891 	pp->pr_minitems = n;
892 	pp->pr_minpages = (n == 0)
893 		? 0
894 		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
895 
896 	if (pp->pr_nitems < n)
897 		prime = n - pp->pr_nitems;
898 	mtx_leave(&pp->pr_mtx);
899 
900 	if (prime > 0)
901 		pool_prime(pp, prime);
902 }
903 
904 void
905 pool_sethiwat(struct pool *pp, int n)
906 {
907 	pp->pr_maxpages = (n == 0)
908 		? 0
909 		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
910 }
911 
912 int
913 pool_sethardlimit(struct pool *pp, u_int n, const char *warnmsg, int ratecap)
914 {
915 	int error = 0;
916 
917 	if (n < pp->pr_nout) {
918 		error = EINVAL;
919 		goto done;
920 	}
921 
922 	pp->pr_hardlimit = n;
923 	pp->pr_hardlimit_warning = warnmsg;
924 	pp->pr_hardlimit_ratecap.tv_sec = ratecap;
925 	pp->pr_hardlimit_warning_last.tv_sec = 0;
926 	pp->pr_hardlimit_warning_last.tv_usec = 0;
927 
928 done:
929 	return (error);
930 }
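
/*
 * Illustrative tuning sketch (hypothetical values): keep at least 32
 * items primed, let idle pages accumulate up to roughly 64 items
 * worth before pool_put() starts freeing them, and cap outstanding
 * items at 1024, recording a warning message and a 60 second rate
 * cap for the limit.
 *
 *	pool_setlowat(&examplepl, 32);
 *	pool_sethiwat(&examplepl, 64);
 *	pool_sethardlimit(&examplepl, 1024, "examplepl limit reached", 60);
 */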
931 
932 void
933 pool_set_constraints(struct pool *pp, const struct kmem_pa_mode *mode)
934 {
935 	pp->pr_crange = mode;
936 }
937 
938 /*
939  * Release all complete pages that have not been used recently.
940  *
941  * Returns non-zero if any pages have been reclaimed.
942  */
943 int
944 pool_reclaim(struct pool *pp)
945 {
946 	struct pool_item_header *ph, *phnext;
947 	struct pool_pagelist pl = TAILQ_HEAD_INITIALIZER(pl);
948 
949 	mtx_enter(&pp->pr_mtx);
950 	for (ph = TAILQ_FIRST(&pp->pr_emptypages); ph != NULL; ph = phnext) {
951 		phnext = TAILQ_NEXT(ph, ph_pagelist);
952 
953 		/* Check our minimum page claim */
954 		if (pp->pr_npages <= pp->pr_minpages)
955 			break;
956 
957 		/*
958 		 * If freeing this page would put us below
959 		 * the low water mark, stop now.
960 		 */
961 		if ((pp->pr_nitems - pp->pr_itemsperpage) <
962 		    pp->pr_minitems)
963 			break;
964 
965 		pool_p_remove(pp, ph);
966 		TAILQ_INSERT_TAIL(&pl, ph, ph_pagelist);
967 	}
968 	mtx_leave(&pp->pr_mtx);
969 
970 	if (TAILQ_EMPTY(&pl))
971 		return (0);
972 
973 	while ((ph = TAILQ_FIRST(&pl)) != NULL) {
974 		TAILQ_REMOVE(&pl, ph, ph_pagelist);
975 		pool_p_free(pp, ph);
976 	}
977 
978 	return (1);
979 }
980 
981 /*
982  * Release all complete pages that have not been used recently
983  * from all pools.
984  */
985 void
986 pool_reclaim_all(void)
987 {
988 	struct pool	*pp;
989 
990 	rw_enter_read(&pool_lock);
991 	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist)
992 		pool_reclaim(pp);
993 	rw_exit_read(&pool_lock);
994 }
995 
996 #ifdef DDB
997 #include <machine/db_machdep.h>
998 #include <ddb/db_output.h>
999 
1000 /*
1001  * Diagnostic helpers.
1002  */
1003 void
1004 pool_printit(struct pool *pp, const char *modif,
1005     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
1006 {
1007 	pool_print1(pp, modif, pr);
1008 }
1009 
1010 void
1011 pool_print_pagelist(struct pool_pagelist *pl,
1012     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
1013 {
1014 	struct pool_item_header *ph;
1015 	struct pool_item *pi;
1016 
1017 	TAILQ_FOREACH(ph, pl, ph_pagelist) {
1018 		(*pr)("\t\tpage %p, color %p, nmissing %d\n",
1019 		    ph->ph_page, ph->ph_colored, ph->ph_nmissing);
1020 		XSIMPLEQ_FOREACH(pi, &ph->ph_itemlist, pi_list) {
1021 			if (pi->pi_magic != POOL_IMAGIC(ph, pi)) {
1022 				(*pr)("\t\t\titem %p, magic 0x%lx\n",
1023 				    pi, pi->pi_magic);
1024 			}
1025 		}
1026 	}
1027 }
1028 
1029 void
1030 pool_print1(struct pool *pp, const char *modif,
1031     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
1032 {
1033 	struct pool_item_header *ph;
1034 	int print_pagelist = 0;
1035 	char c;
1036 
1037 	while ((c = *modif++) != '\0') {
1038 		if (c == 'p')
1039 			print_pagelist = 1;
1040 		modif++;
1041 	}
1042 
1043 	(*pr)("POOL %s: size %u maxcolors %u\n", pp->pr_wchan, pp->pr_size,
1044 	    pp->pr_maxcolors);
1045 	(*pr)("\talloc %p\n", pp->pr_alloc);
1046 	(*pr)("\tminitems %u, minpages %u, maxpages %u, npages %u\n",
1047 	    pp->pr_minitems, pp->pr_minpages, pp->pr_maxpages, pp->pr_npages);
1048 	(*pr)("\titemsperpage %u, nitems %u, nout %u, hardlimit %u\n",
1049 	    pp->pr_itemsperpage, pp->pr_nitems, pp->pr_nout, pp->pr_hardlimit);
1050 
1051 	(*pr)("\n\tnget %lu, nfail %lu, nput %lu\n",
1052 	    pp->pr_nget, pp->pr_nfail, pp->pr_nput);
1053 	(*pr)("\tnpagealloc %lu, npagefree %lu, hiwat %u, nidle %lu\n",
1054 	    pp->pr_npagealloc, pp->pr_npagefree, pp->pr_hiwat, pp->pr_nidle);
1055 
1056 	if (print_pagelist == 0)
1057 		return;
1058 
1059 	if ((ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL)
1060 		(*pr)("\n\tempty page list:\n");
1061 	pool_print_pagelist(&pp->pr_emptypages, pr);
1062 	if ((ph = TAILQ_FIRST(&pp->pr_fullpages)) != NULL)
1063 		(*pr)("\n\tfull page list:\n");
1064 	pool_print_pagelist(&pp->pr_fullpages, pr);
1065 	if ((ph = TAILQ_FIRST(&pp->pr_partpages)) != NULL)
1066 		(*pr)("\n\tpartial-page list:\n");
1067 	pool_print_pagelist(&pp->pr_partpages, pr);
1068 
1069 	if (pp->pr_curpage == NULL)
1070 		(*pr)("\tno current page\n");
1071 	else
1072 		(*pr)("\tcurpage %p\n", pp->pr_curpage->ph_page);
1073 }
1074 
1075 void
1076 db_show_all_pools(db_expr_t expr, int haddr, db_expr_t count, char *modif)
1077 {
1078 	struct pool *pp;
1079 	char maxp[16];
1080 	int ovflw;
1081 	char mode;
1082 
1083 	mode = modif[0];
1084 	if (mode != '\0' && mode != 'a') {
1085 		db_printf("usage: show all pools [/a]\n");
1086 		return;
1087 	}
1088 
1089 	if (mode == '\0')
1090 		db_printf("%-10s%4s%9s%5s%9s%6s%6s%6s%6s%6s%6s%5s\n",
1091 		    "Name",
1092 		    "Size",
1093 		    "Requests",
1094 		    "Fail",
1095 		    "Releases",
1096 		    "Pgreq",
1097 		    "Pgrel",
1098 		    "Npage",
1099 		    "Hiwat",
1100 		    "Minpg",
1101 		    "Maxpg",
1102 		    "Idle");
1103 	else
1104 		db_printf("%-12s %18s %18s\n",
1105 		    "Name", "Address", "Allocator");
1106 
1107 	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) {
1108 		if (mode == 'a') {
1109 			db_printf("%-12s %18p %18p\n", pp->pr_wchan, pp,
1110 			    pp->pr_alloc);
1111 			continue;
1112 		}
1113 
1114 		if (!pp->pr_nget)
1115 			continue;
1116 
1117 		if (pp->pr_maxpages == UINT_MAX)
1118 			snprintf(maxp, sizeof maxp, "inf");
1119 		else
1120 			snprintf(maxp, sizeof maxp, "%u", pp->pr_maxpages);
1121 
1122 #define PRWORD(ovflw, fmt, width, fixed, val) do {	\
1123 	(ovflw) += db_printf((fmt),			\
1124 	    (width) - (fixed) - (ovflw) > 0 ?		\
1125 	    (width) - (fixed) - (ovflw) : 0,		\
1126 	    (val)) - (width);				\
1127 	if ((ovflw) < 0)				\
1128 		(ovflw) = 0;				\
1129 } while (/* CONSTCOND */0)
1130 
1131 		ovflw = 0;
1132 		PRWORD(ovflw, "%-*s", 10, 0, pp->pr_wchan);
1133 		PRWORD(ovflw, " %*u", 4, 1, pp->pr_size);
1134 		PRWORD(ovflw, " %*lu", 9, 1, pp->pr_nget);
1135 		PRWORD(ovflw, " %*lu", 5, 1, pp->pr_nfail);
1136 		PRWORD(ovflw, " %*lu", 9, 1, pp->pr_nput);
1137 		PRWORD(ovflw, " %*lu", 6, 1, pp->pr_npagealloc);
1138 		PRWORD(ovflw, " %*lu", 6, 1, pp->pr_npagefree);
1139 		PRWORD(ovflw, " %*d", 6, 1, pp->pr_npages);
1140 		PRWORD(ovflw, " %*d", 6, 1, pp->pr_hiwat);
1141 		PRWORD(ovflw, " %*d", 6, 1, pp->pr_minpages);
1142 		PRWORD(ovflw, " %*s", 6, 1, maxp);
1143 		PRWORD(ovflw, " %*lu\n", 5, 1, pp->pr_nidle);
1144 
1145 		pool_chk(pp);
1146 	}
1147 }
1148 #endif /* DDB */
1149 
1150 #if defined(POOL_DEBUG) || defined(DDB)
1151 int
1152 pool_chk_page(struct pool *pp, struct pool_item_header *ph, int expected)
1153 {
1154 	struct pool_item *pi;
1155 	caddr_t page;
1156 	int n;
1157 	const char *label = pp->pr_wchan;
1158 
1159 	page = (caddr_t)((u_long)ph & pp->pr_pgmask);
1160 	if (page != ph->ph_page && POOL_INPGHDR(pp)) {
1161 		printf("%s: ", label);
1162 		printf("pool(%p:%s): page inconsistency: page %p; "
1163 		    "at page head addr %p (p %p)\n",
1164 		    pp, pp->pr_wchan, ph->ph_page, ph, page);
1165 		return 1;
1166 	}
1167 
1168 	for (pi = XSIMPLEQ_FIRST(&ph->ph_itemlist), n = 0;
1169 	     pi != NULL;
1170 	     pi = XSIMPLEQ_NEXT(&ph->ph_itemlist, pi, pi_list), n++) {
1171 		if ((caddr_t)pi < ph->ph_page ||
1172 		    (caddr_t)pi >= ph->ph_page + pp->pr_pgsize) {
1173 			printf("%s: ", label);
1174 			printf("pool(%p:%s): page inconsistency: page %p;"
1175 			    " item ordinal %d; addr %p\n", pp,
1176 			    pp->pr_wchan, ph->ph_page, n, pi);
1177 			return (1);
1178 		}
1179 
1180 		if (pi->pi_magic != POOL_IMAGIC(ph, pi)) {
1181 			printf("%s: ", label);
1182 			printf("pool(%p:%s): free list modified: "
1183 			    "page %p; item ordinal %d; addr %p "
1184 			    "(p %p); offset 0x%x=0x%lx\n",
1185 			    pp, pp->pr_wchan, ph->ph_page, n, pi, page,
1186 			    0, pi->pi_magic);
1187 		}
1188 
1189 #ifdef DIAGNOSTIC
1190 		if (POOL_PHPOISON(ph)) {
1191 			size_t pidx;
1192 			uint32_t pval;
1193 			if (poison_check(pi + 1, pp->pr_size - sizeof(*pi),
1194 			    &pidx, &pval)) {
1195 				int *ip = (int *)(pi + 1);
1196 				printf("pool(%s): free list modified: "
1197 				    "page %p; item ordinal %d; addr %p "
1198 				    "(p %p); offset 0x%zx=0x%x\n",
1199 				    pp->pr_wchan, ph->ph_page, n, pi,
1200 				    page, pidx * sizeof(int), ip[pidx]);
1201 			}
1202 		}
1203 #endif /* DIAGNOSTIC */
1204 	}
1205 	if (n + ph->ph_nmissing != pp->pr_itemsperpage) {
1206 		printf("pool(%p:%s): page inconsistency: page %p;"
1207 		    " %d on list, %d missing, %d items per page\n", pp,
1208 		    pp->pr_wchan, ph->ph_page, n, ph->ph_nmissing,
1209 		    pp->pr_itemsperpage);
1210 		return 1;
1211 	}
1212 	if (expected >= 0 && n != expected) {
1213 		printf("pool(%p:%s): page inconsistency: page %p;"
1214 		    " %d on list, %d missing, %d expected\n", pp,
1215 		    pp->pr_wchan, ph->ph_page, n, ph->ph_nmissing,
1216 		    expected);
1217 		return 1;
1218 	}
1219 	return 0;
1220 }
1221 
1222 int
1223 pool_chk(struct pool *pp)
1224 {
1225 	struct pool_item_header *ph;
1226 	int r = 0;
1227 
1228 	TAILQ_FOREACH(ph, &pp->pr_emptypages, ph_pagelist)
1229 		r += pool_chk_page(pp, ph, pp->pr_itemsperpage);
1230 	TAILQ_FOREACH(ph, &pp->pr_fullpages, ph_pagelist)
1231 		r += pool_chk_page(pp, ph, 0);
1232 	TAILQ_FOREACH(ph, &pp->pr_partpages, ph_pagelist)
1233 		r += pool_chk_page(pp, ph, -1);
1234 
1235 	return (r);
1236 }
1237 #endif /* defined(POOL_DEBUG) || defined(DDB) */
1238 
1239 #ifdef DDB
1240 void
1241 pool_walk(struct pool *pp, int full,
1242     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))),
1243     void (*func)(void *, int, int (*)(const char *, ...)
1244 	    __attribute__((__format__(__kprintf__,1,2)))))
1245 {
1246 	struct pool_item_header *ph;
1247 	struct pool_item *pi;
1248 	caddr_t cp;
1249 	int n;
1250 
1251 	TAILQ_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) {
1252 		cp = ph->ph_colored;
1253 		n = ph->ph_nmissing;
1254 
1255 		while (n--) {
1256 			func(cp, full, pr);
1257 			cp += pp->pr_size;
1258 		}
1259 	}
1260 
1261 	TAILQ_FOREACH(ph, &pp->pr_partpages, ph_pagelist) {
1262 		cp = ph->ph_colored;
1263 		n = ph->ph_nmissing;
1264 
1265 		do {
1266 			XSIMPLEQ_FOREACH(pi, &ph->ph_itemlist, pi_list) {
1267 				if (cp == (caddr_t)pi)
1268 					break;
1269 			}
1270 			if (cp != (caddr_t)pi) {
1271 				func(cp, full, pr);
1272 				n--;
1273 			}
1274 
1275 			cp += pp->pr_size;
1276 		} while (n > 0);
1277 	}
1278 }
1279 #endif
1280 
1281 /*
1282  * We have three different sysctls.
1283  * kern.pool.npools - the number of pools.
1284  * kern.pool.pool.<pool#> - the pool struct for the pool#.
1285  * kern.pool.name.<pool#> - the name for pool#.
1286  */
1287 int
1288 sysctl_dopool(int *name, u_int namelen, char *oldp, size_t *oldlenp)
1289 {
1290 	struct kinfo_pool pi;
1291 	struct pool *pp;
1292 	int rv = ENOENT;
1293 
1294 	switch (name[0]) {
1295 	case KERN_POOL_NPOOLS:
1296 		if (namelen != 1)
1297 			return (ENOTDIR);
1298 		return (sysctl_rdint(oldp, oldlenp, NULL, pool_count));
1299 
1300 	case KERN_POOL_NAME:
1301 	case KERN_POOL_POOL:
1302 		break;
1303 	default:
1304 		return (EOPNOTSUPP);
1305 	}
1306 
1307 	if (namelen != 2)
1308 		return (ENOTDIR);
1309 
1310 	rw_enter_read(&pool_lock);
1311 
1312 	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) {
1313 		if (name[1] == pp->pr_serial)
1314 			break;
1315 	}
1316 
1317 	if (pp == NULL)
1318 		goto done;
1319 
1320 	switch (name[0]) {
1321 	case KERN_POOL_NAME:
1322 		rv = sysctl_rdstring(oldp, oldlenp, NULL, pp->pr_wchan);
1323 		break;
1324 	case KERN_POOL_POOL:
1325 		memset(&pi, 0, sizeof(pi));
1326 
1327 		if (pp->pr_ipl != -1)
1328 			mtx_enter(&pp->pr_mtx);
1329 		pi.pr_size = pp->pr_size;
1330 		pi.pr_pgsize = pp->pr_pgsize;
1331 		pi.pr_itemsperpage = pp->pr_itemsperpage;
1332 		pi.pr_npages = pp->pr_npages;
1333 		pi.pr_minpages = pp->pr_minpages;
1334 		pi.pr_maxpages = pp->pr_maxpages;
1335 		pi.pr_hardlimit = pp->pr_hardlimit;
1336 		pi.pr_nout = pp->pr_nout;
1337 		pi.pr_nitems = pp->pr_nitems;
1338 		pi.pr_nget = pp->pr_nget;
1339 		pi.pr_nput = pp->pr_nput;
1340 		pi.pr_nfail = pp->pr_nfail;
1341 		pi.pr_npagealloc = pp->pr_npagealloc;
1342 		pi.pr_npagefree = pp->pr_npagefree;
1343 		pi.pr_hiwat = pp->pr_hiwat;
1344 		pi.pr_nidle = pp->pr_nidle;
1345 		if (pp->pr_ipl != -1)
1346 			mtx_leave(&pp->pr_mtx);
1347 
1348 		rv = sysctl_rdstruct(oldp, oldlenp, NULL, &pi, sizeof(pi));
1349 		break;
1350 	}
1351 
1352 done:
1353 	rw_exit_read(&pool_lock);
1354 
1355 	return (rv);
1356 }
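
/*
 * Illustrative userland query (a sketch; assumes the usual
 * CTL_KERN/KERN_POOL mib names used by vmstat(8) and systat(1)):
 *
 *	int mib[4] = { CTL_KERN, KERN_POOL, KERN_POOL_POOL, serial };
 *	struct kinfo_pool pi;
 *	size_t len = sizeof(pi);
 *
 *	if (sysctl(mib, 4, &pi, &len, NULL, 0) == -1)
 *		err(1, "sysctl");
 */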
1357 
1358 void
1359 pool_gc_sched(void *null)
1360 {
1361 	task_add(systqmp, &pool_gc_task);
1362 }
1363 
1364 void
1365 pool_gc_pages(void *null)
1366 {
1367 	struct pool *pp;
1368 	struct pool_item_header *ph, *freeph;
1369 	int s;
1370 
1371 	rw_enter_read(&pool_lock);
1372 	s = splvm(); /* XXX go to splvm until all pools _setipl properly */
1373 	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) {
1374 		if (pp->pr_nidle <= pp->pr_minpages || /* guess */
1375 		    !mtx_enter_try(&pp->pr_mtx)) /* try */
1376 			continue;
1377 
1378 		/* is it time to free a page? */
1379 		if (pp->pr_nidle > pp->pr_minpages &&
1380 		    (ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL &&
1381 		    (ticks - ph->ph_tick) > (hz * pool_wait_gc)) {
1382 			freeph = ph;
1383 			pool_p_remove(pp, freeph);
1384 		} else
1385 			freeph = NULL;
1386 
1387 		mtx_leave(&pp->pr_mtx);
1388 
1389 		if (freeph != NULL)
1390 			pool_p_free(pp, freeph);
1391 	}
1392 	splx(s);
1393 	rw_exit_read(&pool_lock);
1394 
1395 	timeout_add_sec(&pool_gc_tick, 1);
1396 }
1397 
1398 /*
1399  * Pool backend allocators.
1400  */
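
/*
 * pool_init() picks one of these backends when the caller passes no
 * allocator: the default single-page allocator for items that fit at
 * least eight to a PAGE_SIZE page, and the "large" allocators, one
 * that may sleep (PR_WAITOK pools) and one that is interrupt safe,
 * for pools whose page size had to grow beyond PAGE_SIZE.
 */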
1401 
1402 void *
1403 pool_allocator_alloc(struct pool *pp, int flags, int *slowdown)
1404 {
1405 	void *v;
1406 
1407 	v = (*pp->pr_alloc->pa_alloc)(pp, flags, slowdown);
1408 
1409 #ifdef DIAGNOSTIC
1410 	if (v != NULL && POOL_INPGHDR(pp)) {
1411 		vaddr_t addr = (vaddr_t)v;
1412 		if ((addr & pp->pr_pgmask) != addr) {
1413 			panic("%s: %s page address %p isn't aligned to %u",
1414 			    __func__, pp->pr_wchan, v, pp->pr_pgsize);
1415 		}
1416 	}
1417 #endif
1418 
1419 	return (v);
1420 }
1421 
1422 void
1423 pool_allocator_free(struct pool *pp, void *v)
1424 {
1425 	struct pool_allocator *pa = pp->pr_alloc;
1426 
1427 	(*pa->pa_free)(pp, v);
1428 }
1429 
1430 void *
1431 pool_page_alloc(struct pool *pp, int flags, int *slowdown)
1432 {
1433 	struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER;
1434 
1435 	kd.kd_waitok = ISSET(flags, PR_WAITOK);
1436 	kd.kd_slowdown = slowdown;
1437 
1438 	return (km_alloc(pp->pr_pgsize, &kv_page, pp->pr_crange, &kd));
1439 }
1440 
1441 void
1442 pool_page_free(struct pool *pp, void *v)
1443 {
1444 	km_free(v, pp->pr_pgsize, &kv_page, pp->pr_crange);
1445 }
1446 
1447 void *
1448 pool_large_alloc(struct pool *pp, int flags, int *slowdown)
1449 {
1450 	struct kmem_va_mode kv = kv_intrsafe;
1451 	struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER;
1452 	void *v;
1453 	int s;
1454 
1455 	if (POOL_INPGHDR(pp))
1456 		kv.kv_align = pp->pr_pgsize;
1457 
1458 	kd.kd_waitok = ISSET(flags, PR_WAITOK);
1459 	kd.kd_slowdown = slowdown;
1460 
1461 	s = splvm();
1462 	KERNEL_LOCK();
1463 	v = km_alloc(pp->pr_pgsize, &kv, pp->pr_crange, &kd);
1464 	KERNEL_UNLOCK();
1465 	splx(s);
1466 
1467 	return (v);
1468 }
1469 
1470 void
1471 pool_large_free(struct pool *pp, void *v)
1472 {
1473 	struct kmem_va_mode kv = kv_intrsafe;
1474 	int s;
1475 
1476 	if (POOL_INPGHDR(pp))
1477 		kv.kv_align = pp->pr_pgsize;
1478 
1479 	s = splvm();
1480 	KERNEL_LOCK();
1481 	km_free(v, pp->pr_pgsize, &kv, pp->pr_crange);
1482 	KERNEL_UNLOCK();
1483 	splx(s);
1484 }
1485 
1486 void *
1487 pool_large_alloc_ni(struct pool *pp, int flags, int *slowdown)
1488 {
1489 	struct kmem_va_mode kv = kv_any;
1490 	struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER;
1491 	void *v;
1492 
1493 	if (POOL_INPGHDR(pp))
1494 		kv.kv_align = pp->pr_pgsize;
1495 
1496 	kd.kd_waitok = ISSET(flags, PR_WAITOK);
1497 	kd.kd_slowdown = slowdown;
1498 
1499 	KERNEL_LOCK();
1500 	v = km_alloc(pp->pr_pgsize, &kv, pp->pr_crange, &kd);
1501 	KERNEL_UNLOCK();
1502 
1503 	return (v);
1504 }
1505 
1506 void
1507 pool_large_free_ni(struct pool *pp, void *v)
1508 {
1509 	struct kmem_va_mode kv = kv_any;
1510 
1511 	if (POOL_INPGHDR(pp))
1512 		kv.kv_align = pp->pr_pgsize;
1513 
1514 	KERNEL_LOCK();
1515 	km_free(v, pp->pr_pgsize, &kv, pp->pr_crange);
1516 	KERNEL_UNLOCK();
1517 }
1518