1 /*	$OpenBSD: subr_pool.c,v 1.201 2016/11/02 03:29:48 dlg Exp $	*/
2 /*	$NetBSD: subr_pool.c,v 1.61 2001/09/26 07:14:56 chs Exp $	*/
3 
4 /*-
5  * Copyright (c) 1997, 1999, 2000 The NetBSD Foundation, Inc.
6  * All rights reserved.
7  *
8  * This code is derived from software contributed to The NetBSD Foundation
9  * by Paul Kranenburg; by Jason R. Thorpe of the Numerical Aerospace
10  * Simulation Facility, NASA Ames Research Center.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
25  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/errno.h>
37 #include <sys/kernel.h>
38 #include <sys/malloc.h>
39 #include <sys/pool.h>
40 #include <sys/syslog.h>
41 #include <sys/rwlock.h>
42 #include <sys/sysctl.h>
43 #include <sys/task.h>
44 #include <sys/timeout.h>
45 #include <sys/percpu.h>
46 
47 #include <uvm/uvm_extern.h>
48 
49 /*
50  * Pool resource management utility.
51  *
52  * Memory is allocated in pages which are split into pieces according to
53  * the pool item size. Each page is kept on one of three lists in the
54  * pool structure: `pr_emptypages', `pr_fullpages' and `pr_partpages',
55  * for empty, full and partially-full pages respectively. The individual
56  * pool items are on a linked list headed by `ph_itemlist' in each page
57  * header. The memory for the page header itself is either taken from
58  * the allocated page (for small pool items) or from an internal pool
59  * of page headers (`phpool').
60  */
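
/*
 * Minimal usage sketch (illustration only; "struct foo", "foopl" and the
 * foo_*() functions are hypothetical names, not part of this file): a
 * subsystem declares a pool, initializes it once, then gets and puts
 * fixed-size items from it.
 *
 *	struct pool foopl;
 *
 *	void
 *	foo_attach(void)
 *	{
 *		pool_init(&foopl, sizeof(struct foo), 0, IPL_NONE, PR_WAITOK,
 *		    "foopl", NULL);
 *	}
 *
 *	struct foo *
 *	foo_alloc(void)
 *	{
 *		return (pool_get(&foopl, PR_WAITOK | PR_ZERO));
 *	}
 *
 *	void
 *	foo_free(struct foo *f)
 *	{
 *		pool_put(&foopl, f);
 *	}
 */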
61 
62 /* List of all pools */
63 SIMPLEQ_HEAD(,pool) pool_head = SIMPLEQ_HEAD_INITIALIZER(pool_head);
64 
65 /*
66  * Every pool gets a unique serial number assigned to it. If this counter
67  * wraps, we're screwed, but we shouldn't create so many pools anyway.
68  */
69 unsigned int pool_serial;
70 unsigned int pool_count;
71 
72 /* Lock protecting the previous variables, which make up the global pool state */
73 struct rwlock pool_lock = RWLOCK_INITIALIZER("pools");
74 
75 /* Private pool for page header structures */
76 struct pool phpool;
77 
78 struct pool_item_header {
79 	/* Page headers */
80 	TAILQ_ENTRY(pool_item_header)
81 				ph_pagelist;	/* pool page list */
82 	XSIMPLEQ_HEAD(,pool_item) ph_itemlist;	/* chunk list for this page */
83 	RBT_ENTRY(pool_item_header)
84 				ph_node;	/* Off-page page headers */
85 	int			ph_nmissing;	/* # of chunks in use */
86 	caddr_t			ph_page;	/* this page's address */
87 	caddr_t			ph_colored;	/* page's colored address */
88 	u_long			ph_magic;
89 	int			ph_tick;
90 };
91 #define POOL_MAGICBIT (1 << 3) /* keep away from perturbed low bits */
92 #define POOL_PHPOISON(ph) ISSET((ph)->ph_magic, POOL_MAGICBIT)
93 
94 struct pool_item {
95 	u_long				pi_magic;
96 	XSIMPLEQ_ENTRY(pool_item)	pi_list;
97 };
98 #define POOL_IMAGIC(ph, pi) ((u_long)(pi) ^ (ph)->ph_magic)
99 
100 #ifdef MULTIPROCESSOR
101 struct pool_list {
102 	struct pool_list	*pl_next;	/* next in list */
103 	unsigned long		 pl_nitems;	/* items in list */
104 	TAILQ_ENTRY(pool_list)	 pl_nextl;	/* list of lists */
105 };
106 
107 #define POOL_LIST_NITEMS_MASK		0x7ffffffUL
108 #define POOL_LIST_NITEMS_POISON		0x8000000UL
109 
110 #define POOL_LIST_POISONED(_pl)						\
111     ISSET((_pl)->pl_nitems, POOL_LIST_NITEMS_POISON)
112 
113 #define POOL_LIST_NITEMS(_pl)						\
114     ((_pl)->pl_nitems & POOL_LIST_NITEMS_MASK)
115 
116 struct pool_cache {
117 	struct pool_list	*pc_actv;
118 	unsigned long		 pc_nactv;	/* cache pc_actv nitems */
119 	struct pool_list	*pc_prev;
120 
121 	uint64_t		 pc_gen;	/* generation number */
122 	uint64_t		 pc_gets;
123 	uint64_t		 pc_puts;
124 	uint64_t		 pc_fails;
125 
126 	int			 pc_nout;
127 };
128 
129 void	*pool_cache_get(struct pool *);
130 void	 pool_cache_put(struct pool *, void *);
131 void	 pool_cache_destroy(struct pool *);
132 #endif
133 void	 pool_cache_info(struct pool *, struct kinfo_pool *);
134 
135 #ifdef POOL_DEBUG
136 int	pool_debug = 1;
137 #else
138 int	pool_debug = 0;
139 #endif
140 
141 #define POOL_INPGHDR(pp) ((pp)->pr_phoffset != 0)
142 
143 struct pool_item_header *
144 	 pool_p_alloc(struct pool *, int, int *);
145 void	 pool_p_insert(struct pool *, struct pool_item_header *);
146 void	 pool_p_remove(struct pool *, struct pool_item_header *);
147 void	 pool_p_free(struct pool *, struct pool_item_header *);
148 
149 void	 pool_update_curpage(struct pool *);
150 void	*pool_do_get(struct pool *, int, int *);
151 int	 pool_chk_page(struct pool *, struct pool_item_header *, int);
152 int	 pool_chk(struct pool *);
153 void	 pool_get_done(void *, void *);
154 void	 pool_runqueue(struct pool *, int);
155 
156 void	*pool_allocator_alloc(struct pool *, int, int *);
157 void	 pool_allocator_free(struct pool *, void *);
158 
159 /*
160  * The default pool allocator.
161  */
162 void	*pool_page_alloc(struct pool *, int, int *);
163 void	pool_page_free(struct pool *, void *);
164 
165 /*
166  * safe for interrupts; this is the default allocator
167  */
168 struct pool_allocator pool_allocator_single = {
169 	pool_page_alloc,
170 	pool_page_free
171 };
172 
173 void	*pool_multi_alloc(struct pool *, int, int *);
174 void	pool_multi_free(struct pool *, void *);
175 
176 struct pool_allocator pool_allocator_multi = {
177 	pool_multi_alloc,
178 	pool_multi_free
179 };
180 
181 void	*pool_multi_alloc_ni(struct pool *, int, int *);
182 void	pool_multi_free_ni(struct pool *, void *);
183 
184 struct pool_allocator pool_allocator_multi_ni = {
185 	pool_multi_alloc_ni,
186 	pool_multi_free_ni
187 };
188 
189 #ifdef DDB
190 void	 pool_print_pagelist(struct pool_pagelist *, int (*)(const char *, ...)
191 	     __attribute__((__format__(__kprintf__,1,2))));
192 void	 pool_print1(struct pool *, const char *, int (*)(const char *, ...)
193 	     __attribute__((__format__(__kprintf__,1,2))));
194 #endif
195 
196 /* stale page garbage collectors */
197 void	pool_gc_sched(void *);
198 struct timeout pool_gc_tick = TIMEOUT_INITIALIZER(pool_gc_sched, NULL);
199 void	pool_gc_pages(void *);
200 struct task pool_gc_task = TASK_INITIALIZER(pool_gc_pages, NULL);
201 int pool_wait_free = 1;
202 int pool_wait_gc = 8;
203 
204 RBT_PROTOTYPE(phtree, pool_item_header, ph_node, phtree_compare);
205 
206 static inline int
207 phtree_compare(const struct pool_item_header *a,
208     const struct pool_item_header *b)
209 {
210 	vaddr_t va = (vaddr_t)a->ph_page;
211 	vaddr_t vb = (vaddr_t)b->ph_page;
212 
213 	/*
214 	 * Comparing in this order makes RBT_NFIND in pr_find_pagehead()
215 	 * return the header with the greatest page address <= the key.
216 	 */
214 	if (vb < va)
215 		return (-1);
216 	if (vb > va)
217 		return (1);
218 
219 	return (0);
220 }
221 
222 RBT_GENERATE(phtree, pool_item_header, ph_node, phtree_compare);
223 
224 /*
225  * Return the pool page header based on page address.
226  */
227 static inline struct pool_item_header *
228 pr_find_pagehead(struct pool *pp, void *v)
229 {
230 	struct pool_item_header *ph, key;
231 
232 	if (POOL_INPGHDR(pp)) {
233 		caddr_t page;
234 
235 		page = (caddr_t)((vaddr_t)v & pp->pr_pgmask);
236 
237 		return ((struct pool_item_header *)(page + pp->pr_phoffset));
238 	}
239 
240 	key.ph_page = v;
241 	ph = RBT_NFIND(phtree, &pp->pr_phtree, &key);
242 	if (ph == NULL)
243 		panic("%s: %s: page header missing", __func__, pp->pr_wchan);
244 
245 	KASSERT(ph->ph_page <= (caddr_t)v);
246 	if (ph->ph_page + pp->pr_pgsize <= (caddr_t)v)
247 		panic("%s: %s: incorrect page", __func__, pp->pr_wchan);
248 
249 	return (ph);
250 }
251 
252 /*
253  * Initialize the given pool resource structure.
254  *
255  * We export this routine to allow other kernel parts to declare
256  * static pools that must be initialized before malloc() is available.
257  */
258 void
259 pool_init(struct pool *pp, size_t size, u_int align, int ipl, int flags,
260     const char *wchan, struct pool_allocator *palloc)
261 {
262 	int off = 0, space;
263 	unsigned int pgsize = PAGE_SIZE, items;
264 #ifdef DIAGNOSTIC
265 	struct pool *iter;
266 #endif
267 
268 	if (align == 0)
269 		align = ALIGN(1);
270 
271 	if (size < sizeof(struct pool_item))
272 		size = sizeof(struct pool_item);
273 
274 	size = roundup(size, align);
275 
276 	if (palloc == NULL) {
277 		while (size * 8 > pgsize)
278 			pgsize <<= 1;
279 
280 		if (pgsize > PAGE_SIZE) {
281 			palloc = ISSET(flags, PR_WAITOK) ?
282 			    &pool_allocator_multi_ni : &pool_allocator_multi;
283 		} else
284 			palloc = &pool_allocator_single;
285 	} else
286 		pgsize = palloc->pa_pagesz ? palloc->pa_pagesz : PAGE_SIZE;
287 
288 	items = pgsize / size;
289 
290 	/*
291 	 * Decide whether to put the page header off page to avoid
292 	 * wasting too large a part of the page. Off-page page headers
293 	 * go into an RB tree, so we can match a returned item with
294 	 * its header based on the page address.
295 	 */
296 	if (pgsize - (size * items) > sizeof(struct pool_item_header)) {
297 		off = pgsize - sizeof(struct pool_item_header);
298 	} else if (sizeof(struct pool_item_header) * 2 >= size) {
299 		off = pgsize - sizeof(struct pool_item_header);
300 		items = off / size;
301 	}
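
	/*
	 * Worked example (assumed numbers; the real header size is
	 * platform dependent but on the order of a hundred bytes): with
	 * pgsize 4096 and an item size of 768, five items use 3840 bytes
	 * and the 256 byte slack is large enough for the header, so it is
	 * kept in the page at off = pgsize - sizeof(header).  With an
	 * item size of 512 the page is filled exactly and an item is
	 * larger than two headers, so off stays 0 and the header is
	 * allocated off page from phpool and found again via the RB tree.
	 */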
302 
303 	KASSERT(items > 0);
304 
305 	/*
306 	 * Initialize the pool structure.
307 	 */
308 	memset(pp, 0, sizeof(*pp));
309 	TAILQ_INIT(&pp->pr_emptypages);
310 	TAILQ_INIT(&pp->pr_fullpages);
311 	TAILQ_INIT(&pp->pr_partpages);
312 	pp->pr_curpage = NULL;
313 	pp->pr_npages = 0;
314 	pp->pr_minitems = 0;
315 	pp->pr_minpages = 0;
316 	pp->pr_maxpages = 8;
317 	pp->pr_size = size;
318 	pp->pr_pgsize = pgsize;
319 	pp->pr_pgmask = ~0UL ^ (pgsize - 1);
320 	pp->pr_phoffset = off;
321 	pp->pr_itemsperpage = items;
322 	pp->pr_wchan = wchan;
323 	pp->pr_alloc = palloc;
324 	pp->pr_nitems = 0;
325 	pp->pr_nout = 0;
326 	pp->pr_hardlimit = UINT_MAX;
327 	pp->pr_hardlimit_warning = NULL;
328 	pp->pr_hardlimit_ratecap.tv_sec = 0;
329 	pp->pr_hardlimit_ratecap.tv_usec = 0;
330 	pp->pr_hardlimit_warning_last.tv_sec = 0;
331 	pp->pr_hardlimit_warning_last.tv_usec = 0;
332 	RBT_INIT(phtree, &pp->pr_phtree);
333 
334 	/*
335 	 * Use the space between the chunks and the page header
336 	 * for cache coloring.
337 	 */
338 	space = POOL_INPGHDR(pp) ? pp->pr_phoffset : pp->pr_pgsize;
339 	space -= pp->pr_itemsperpage * pp->pr_size;
340 	pp->pr_align = align;
341 	pp->pr_maxcolors = (space / align) + 1;
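
	/*
	 * Illustrative example (assumed numbers): with 40 bytes of slack
	 * and an alignment of 8, pr_maxcolors is 6, so successive pages
	 * offset their first item by 0, 8, 16, 24, 32 or 40 bytes from
	 * ph_page, spreading otherwise identical pages across cache sets.
	 */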
342 
343 	pp->pr_nget = 0;
344 	pp->pr_nfail = 0;
345 	pp->pr_nput = 0;
346 	pp->pr_npagealloc = 0;
347 	pp->pr_npagefree = 0;
348 	pp->pr_hiwat = 0;
349 	pp->pr_nidle = 0;
350 
351 	pp->pr_ipl = ipl;
352 	mtx_init(&pp->pr_mtx, pp->pr_ipl);
353 	mtx_init(&pp->pr_requests_mtx, pp->pr_ipl);
354 	TAILQ_INIT(&pp->pr_requests);
355 
356 	if (phpool.pr_size == 0) {
357 		pool_init(&phpool, sizeof(struct pool_item_header), 0,
358 		    IPL_HIGH, 0, "phpool", NULL);
359 
360 		/* make sure phpool won't "recurse" */
361 		KASSERT(POOL_INPGHDR(&phpool));
362 	}
363 
364 	/* pglistalloc/constraint parameters */
365 	pp->pr_crange = &kp_dirty;
366 
367 	/* Insert this into the list of all pools. */
368 	rw_enter_write(&pool_lock);
369 #ifdef DIAGNOSTIC
370 	SIMPLEQ_FOREACH(iter, &pool_head, pr_poollist) {
371 		if (iter == pp)
372 			panic("%s: pool %s already on list", __func__, wchan);
373 	}
374 #endif
375 
376 	pp->pr_serial = ++pool_serial;
377 	if (pool_serial == 0)
378 		panic("%s: too much uptime", __func__);
379 
380 	SIMPLEQ_INSERT_HEAD(&pool_head, pp, pr_poollist);
381 	pool_count++;
382 	rw_exit_write(&pool_lock);
383 }
384 
385 /*
386  * Decommission a pool resource.
387  */
388 void
389 pool_destroy(struct pool *pp)
390 {
391 	struct pool_item_header *ph;
392 	struct pool *prev, *iter;
393 
394 #ifdef MULTIPROCESSOR
395 	if (pp->pr_cache != NULL)
396 		pool_cache_destroy(pp);
397 #endif
398 
399 #ifdef DIAGNOSTIC
400 	if (pp->pr_nout != 0)
401 		panic("%s: pool busy: still out: %u", __func__, pp->pr_nout);
402 #endif
403 
404 	/* Remove from global pool list */
405 	rw_enter_write(&pool_lock);
406 	pool_count--;
407 	if (pp == SIMPLEQ_FIRST(&pool_head))
408 		SIMPLEQ_REMOVE_HEAD(&pool_head, pr_poollist);
409 	else {
410 		prev = SIMPLEQ_FIRST(&pool_head);
411 		SIMPLEQ_FOREACH(iter, &pool_head, pr_poollist) {
412 			if (iter == pp) {
413 				SIMPLEQ_REMOVE_AFTER(&pool_head, prev,
414 				    pr_poollist);
415 				break;
416 			}
417 			prev = iter;
418 		}
419 	}
420 	rw_exit_write(&pool_lock);
421 
422 	/* Remove all pages */
423 	while ((ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL) {
424 		mtx_enter(&pp->pr_mtx);
425 		pool_p_remove(pp, ph);
426 		mtx_leave(&pp->pr_mtx);
427 		pool_p_free(pp, ph);
428 	}
429 	KASSERT(TAILQ_EMPTY(&pp->pr_fullpages));
430 	KASSERT(TAILQ_EMPTY(&pp->pr_partpages));
431 }
432 
433 void
434 pool_request_init(struct pool_request *pr,
435     void (*handler)(void *, void *), void *cookie)
436 {
437 	pr->pr_handler = handler;
438 	pr->pr_cookie = cookie;
439 	pr->pr_item = NULL;
440 }
441 
442 void
443 pool_request(struct pool *pp, struct pool_request *pr)
444 {
445 	mtx_enter(&pp->pr_requests_mtx);
446 	TAILQ_INSERT_TAIL(&pp->pr_requests, pr, pr_entry);
447 	pool_runqueue(pp, PR_NOWAIT);
448 	mtx_leave(&pp->pr_requests_mtx);
449 }
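
/*
 * Illustrative sketch of the asynchronous interface ("foo_softc",
 * "foo_ready" and "foopl" are hypothetical names): rather than sleeping
 * in pool_get(), a caller can queue a request and is called back with
 * the item once one can be allocated.
 *
 *	void
 *	foo_ready(void *cookie, void *item)
 *	{
 *		struct foo_softc *sc = cookie;
 *
 *		sc->sc_buf = item;
 *	}
 *
 *	pool_request_init(&sc->sc_preq, foo_ready, sc);
 *	pool_request(&foopl, &sc->sc_preq);
 */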
450 
451 struct pool_get_memory {
452 	struct mutex mtx;
453 	void * volatile v;
454 };
455 
456 /*
457  * Grab an item from the pool.
458  */
459 void *
460 pool_get(struct pool *pp, int flags)
461 {
462 	void *v = NULL;
463 	int slowdown = 0;
464 
465 #ifdef MULTIPROCESSOR
466 	if (pp->pr_cache != NULL) {
467 		v = pool_cache_get(pp);
468 		if (v != NULL)
469 			goto good;
470 	}
471 #endif
472 
473 	KASSERT(flags & (PR_WAITOK | PR_NOWAIT));
474 
475 	mtx_enter(&pp->pr_mtx);
476 	if (pp->pr_nout >= pp->pr_hardlimit) {
477 		if (ISSET(flags, PR_NOWAIT|PR_LIMITFAIL))
478 			goto fail;
479 	} else if ((v = pool_do_get(pp, flags, &slowdown)) == NULL) {
480 		if (ISSET(flags, PR_NOWAIT))
481 			goto fail;
482 	}
483 	mtx_leave(&pp->pr_mtx);
484 
485 	if (slowdown && ISSET(flags, PR_WAITOK))
486 		yield();
487 
488 	if (v == NULL) {
489 		struct pool_get_memory mem = {
490 		    MUTEX_INITIALIZER(pp->pr_ipl),
491 		    NULL };
492 		struct pool_request pr;
493 
494 		pool_request_init(&pr, pool_get_done, &mem);
495 		pool_request(pp, &pr);
496 
497 		mtx_enter(&mem.mtx);
498 		while (mem.v == NULL)
499 			msleep(&mem, &mem.mtx, PSWP, pp->pr_wchan, 0);
500 		mtx_leave(&mem.mtx);
501 
502 		v = mem.v;
503 	}
504 
505 #ifdef MULTIPROCESSOR
506 good:
507 #endif
508 	if (ISSET(flags, PR_ZERO))
509 		memset(v, 0, pp->pr_size);
510 
511 	return (v);
512 
513 fail:
514 	pp->pr_nfail++;
515 	mtx_leave(&pp->pr_mtx);
516 	return (NULL);
517 }
518 
519 void
520 pool_get_done(void *xmem, void *v)
521 {
522 	struct pool_get_memory *mem = xmem;
523 
524 	mtx_enter(&mem->mtx);
525 	mem->v = v;
526 	mtx_leave(&mem->mtx);
527 
528 	wakeup_one(mem);
529 }
530 
531 void
532 pool_runqueue(struct pool *pp, int flags)
533 {
534 	struct pool_requests prl = TAILQ_HEAD_INITIALIZER(prl);
535 	struct pool_request *pr;
536 
537 	MUTEX_ASSERT_UNLOCKED(&pp->pr_mtx);
538 	MUTEX_ASSERT_LOCKED(&pp->pr_requests_mtx);
539 
540 	if (pp->pr_requesting++)
541 		return;
542 
543 	do {
544 		pp->pr_requesting = 1;
545 
546 		/* no TAILQ_JOIN? :( */
547 		while ((pr = TAILQ_FIRST(&pp->pr_requests)) != NULL) {
548 			TAILQ_REMOVE(&pp->pr_requests, pr, pr_entry);
549 			TAILQ_INSERT_TAIL(&prl, pr, pr_entry);
550 		}
551 		if (TAILQ_EMPTY(&prl))
552 			continue;
553 
554 		mtx_leave(&pp->pr_requests_mtx);
555 
556 		mtx_enter(&pp->pr_mtx);
557 		pr = TAILQ_FIRST(&prl);
558 		while (pr != NULL) {
559 			int slowdown = 0;
560 
561 			if (pp->pr_nout >= pp->pr_hardlimit)
562 				break;
563 
564 			pr->pr_item = pool_do_get(pp, flags, &slowdown);
565 			if (pr->pr_item == NULL) /* || slowdown ? */
566 				break;
567 
568 			pr = TAILQ_NEXT(pr, pr_entry);
569 		}
570 		mtx_leave(&pp->pr_mtx);
571 
572 		while ((pr = TAILQ_FIRST(&prl)) != NULL &&
573 		    pr->pr_item != NULL) {
574 			TAILQ_REMOVE(&prl, pr, pr_entry);
575 			(*pr->pr_handler)(pr->pr_cookie, pr->pr_item);
576 		}
577 
578 		mtx_enter(&pp->pr_requests_mtx);
579 	} while (--pp->pr_requesting);
580 
581 	/* no TAILQ_JOIN :( */
582 	while ((pr = TAILQ_FIRST(&prl)) != NULL) {
583 		TAILQ_REMOVE(&prl, pr, pr_entry);
584 		TAILQ_INSERT_TAIL(&pp->pr_requests, pr, pr_entry);
585 	}
586 }
587 
588 void *
589 pool_do_get(struct pool *pp, int flags, int *slowdown)
590 {
591 	struct pool_item *pi;
592 	struct pool_item_header *ph;
593 
594 	MUTEX_ASSERT_LOCKED(&pp->pr_mtx);
595 
596 	splassert(pp->pr_ipl);
597 
598 	/*
599 	 * Account for this item now to avoid races if we need to give up
600 	 * pr_mtx to allocate a page.
601 	 */
602 	pp->pr_nout++;
603 
604 	if (pp->pr_curpage == NULL) {
605 		mtx_leave(&pp->pr_mtx);
606 		ph = pool_p_alloc(pp, flags, slowdown);
607 		mtx_enter(&pp->pr_mtx);
608 
609 		if (ph == NULL) {
610 			pp->pr_nout--;
611 			return (NULL);
612 		}
613 
614 		pool_p_insert(pp, ph);
615 	}
616 
617 	ph = pp->pr_curpage;
618 	pi = XSIMPLEQ_FIRST(&ph->ph_itemlist);
619 	if (__predict_false(pi == NULL))
620 		panic("%s: %s: page empty", __func__, pp->pr_wchan);
621 
622 	if (__predict_false(pi->pi_magic != POOL_IMAGIC(ph, pi))) {
623 		panic("%s: %s free list modified: "
624 		    "page %p; item addr %p; offset 0x%x=0x%lx != 0x%lx",
625 		    __func__, pp->pr_wchan, ph->ph_page, pi,
626 		    0, pi->pi_magic, POOL_IMAGIC(ph, pi));
627 	}
628 
629 	XSIMPLEQ_REMOVE_HEAD(&ph->ph_itemlist, pi_list);
630 
631 #ifdef DIAGNOSTIC
632 	if (pool_debug && POOL_PHPOISON(ph)) {
633 		size_t pidx;
634 		uint32_t pval;
635 		if (poison_check(pi + 1, pp->pr_size - sizeof(*pi),
636 		    &pidx, &pval)) {
637 			int *ip = (int *)(pi + 1);
638 			panic("%s: %s free list modified: "
639 			    "page %p; item addr %p; offset 0x%zx=0x%x",
640 			    __func__, pp->pr_wchan, ph->ph_page, pi,
641 			    pidx * sizeof(int), ip[pidx]);
642 		}
643 	}
644 #endif /* DIAGNOSTIC */
645 
646 	if (ph->ph_nmissing++ == 0) {
647 		/*
648 		 * This page was previously empty.  Move it to the list of
649 		 * partially-full pages.  This page is already curpage.
650 		 */
651 		TAILQ_REMOVE(&pp->pr_emptypages, ph, ph_pagelist);
652 		TAILQ_INSERT_TAIL(&pp->pr_partpages, ph, ph_pagelist);
653 
654 		pp->pr_nidle--;
655 	}
656 
657 	if (ph->ph_nmissing == pp->pr_itemsperpage) {
658 		/*
659 		 * This page is now full.  Move it to the full list
660 		 * and select a new current page.
661 		 */
662 		TAILQ_REMOVE(&pp->pr_partpages, ph, ph_pagelist);
663 		TAILQ_INSERT_TAIL(&pp->pr_fullpages, ph, ph_pagelist);
664 		pool_update_curpage(pp);
665 	}
666 
667 	pp->pr_nget++;
668 
669 	return (pi);
670 }
671 
672 /*
673  * Return resource to the pool.
674  */
675 void
676 pool_put(struct pool *pp, void *v)
677 {
678 	struct pool_item *pi = v;
679 	struct pool_item_header *ph, *freeph = NULL;
680 
681 #ifdef DIAGNOSTIC
682 	if (v == NULL)
683 		panic("%s: NULL item", __func__);
684 #endif
685 
686 #ifdef MULTIPROCESSOR
687 	if (pp->pr_cache != NULL && TAILQ_EMPTY(&pp->pr_requests)) {
688 		pool_cache_put(pp, v);
689 		return;
690 	}
691 #endif
692 
693 	mtx_enter(&pp->pr_mtx);
694 
695 	splassert(pp->pr_ipl);
696 
697 	ph = pr_find_pagehead(pp, v);
698 
699 #ifdef DIAGNOSTIC
700 	if (pool_debug) {
701 		struct pool_item *qi;
702 		XSIMPLEQ_FOREACH(qi, &ph->ph_itemlist, pi_list) {
703 			if (pi == qi) {
704 				panic("%s: %s: double pool_put: %p", __func__,
705 				    pp->pr_wchan, pi);
706 			}
707 		}
708 	}
709 #endif /* DIAGNOSTIC */
710 
711 	pi->pi_magic = POOL_IMAGIC(ph, pi);
712 	XSIMPLEQ_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list);
713 #ifdef DIAGNOSTIC
714 	if (POOL_PHPOISON(ph))
715 		poison_mem(pi + 1, pp->pr_size - sizeof(*pi));
716 #endif /* DIAGNOSTIC */
717 
718 	if (ph->ph_nmissing-- == pp->pr_itemsperpage) {
719 		/*
720 		 * The page was previously completely full, move it to the
721 		 * partially-full list.
722 		 */
723 		TAILQ_REMOVE(&pp->pr_fullpages, ph, ph_pagelist);
724 		TAILQ_INSERT_TAIL(&pp->pr_partpages, ph, ph_pagelist);
725 	}
726 
727 	if (ph->ph_nmissing == 0) {
728 		/*
729 		 * The page is now empty, so move it to the empty page list.
730 	 	 */
731 		pp->pr_nidle++;
732 
733 		ph->ph_tick = ticks;
734 		TAILQ_REMOVE(&pp->pr_partpages, ph, ph_pagelist);
735 		TAILQ_INSERT_TAIL(&pp->pr_emptypages, ph, ph_pagelist);
736 		pool_update_curpage(pp);
737 	}
738 
739 	pp->pr_nout--;
740 	pp->pr_nput++;
741 
742 	/* is it time to free a page? */
743 	if (pp->pr_nidle > pp->pr_maxpages &&
744 	    (ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL &&
745 	    (ticks - ph->ph_tick) > (hz * pool_wait_free)) {
746 		freeph = ph;
747 		pool_p_remove(pp, freeph);
748 	}
749 	mtx_leave(&pp->pr_mtx);
750 
751 	if (freeph != NULL)
752 		pool_p_free(pp, freeph);
753 
754 	if (!TAILQ_EMPTY(&pp->pr_requests)) {
755 		mtx_enter(&pp->pr_requests_mtx);
756 		pool_runqueue(pp, PR_NOWAIT);
757 		mtx_leave(&pp->pr_requests_mtx);
758 	}
759 }
760 
761 /*
762  * Add N items to the pool.
763  */
764 int
765 pool_prime(struct pool *pp, int n)
766 {
767 	struct pool_pagelist pl = TAILQ_HEAD_INITIALIZER(pl);
768 	struct pool_item_header *ph;
769 	int newpages;
770 
771 	newpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
772 
773 	while (newpages-- > 0) {
774 		int slowdown = 0;
775 
776 		ph = pool_p_alloc(pp, PR_NOWAIT, &slowdown);
777 		if (ph == NULL) /* or slowdown? */
778 			break;
779 
780 		TAILQ_INSERT_TAIL(&pl, ph, ph_pagelist);
781 	}
782 
783 	mtx_enter(&pp->pr_mtx);
784 	while ((ph = TAILQ_FIRST(&pl)) != NULL) {
785 		TAILQ_REMOVE(&pl, ph, ph_pagelist);
786 		pool_p_insert(pp, ph);
787 	}
788 	mtx_leave(&pp->pr_mtx);
789 
790 	return (0);
791 }
792 
793 struct pool_item_header *
794 pool_p_alloc(struct pool *pp, int flags, int *slowdown)
795 {
796 	struct pool_item_header *ph;
797 	struct pool_item *pi;
798 	caddr_t addr;
799 	int n;
800 
801 	MUTEX_ASSERT_UNLOCKED(&pp->pr_mtx);
802 	KASSERT(pp->pr_size >= sizeof(*pi));
803 
804 	addr = pool_allocator_alloc(pp, flags, slowdown);
805 	if (addr == NULL)
806 		return (NULL);
807 
808 	if (POOL_INPGHDR(pp))
809 		ph = (struct pool_item_header *)(addr + pp->pr_phoffset);
810 	else {
811 		ph = pool_get(&phpool, flags);
812 		if (ph == NULL) {
813 			pool_allocator_free(pp, addr);
814 			return (NULL);
815 		}
816 	}
817 
818 	XSIMPLEQ_INIT(&ph->ph_itemlist);
819 	ph->ph_page = addr;
820 	addr += pp->pr_align * (pp->pr_npagealloc % pp->pr_maxcolors);
821 	ph->ph_colored = addr;
822 	ph->ph_nmissing = 0;
823 	arc4random_buf(&ph->ph_magic, sizeof(ph->ph_magic));
824 #ifdef DIAGNOSTIC
825 	/* use a bit in ph_magic to record if we poison page items */
826 	if (pool_debug)
827 		SET(ph->ph_magic, POOL_MAGICBIT);
828 	else
829 		CLR(ph->ph_magic, POOL_MAGICBIT);
830 #endif /* DIAGNOSTIC */
831 
832 	n = pp->pr_itemsperpage;
833 	while (n--) {
834 		pi = (struct pool_item *)addr;
835 		pi->pi_magic = POOL_IMAGIC(ph, pi);
836 		XSIMPLEQ_INSERT_TAIL(&ph->ph_itemlist, pi, pi_list);
837 
838 #ifdef DIAGNOSTIC
839 		if (POOL_PHPOISON(ph))
840 			poison_mem(pi + 1, pp->pr_size - sizeof(*pi));
841 #endif /* DIAGNOSTIC */
842 
843 		addr += pp->pr_size;
844 	}
845 
846 	return (ph);
847 }
848 
849 void
850 pool_p_free(struct pool *pp, struct pool_item_header *ph)
851 {
852 	struct pool_item *pi;
853 
854 	MUTEX_ASSERT_UNLOCKED(&pp->pr_mtx);
855 	KASSERT(ph->ph_nmissing == 0);
856 
857 	XSIMPLEQ_FOREACH(pi, &ph->ph_itemlist, pi_list) {
858 		if (__predict_false(pi->pi_magic != POOL_IMAGIC(ph, pi))) {
859 			panic("%s: %s free list modified: "
860 			    "page %p; item addr %p; offset 0x%x=0x%lx",
861 			    __func__, pp->pr_wchan, ph->ph_page, pi,
862 			    0, pi->pi_magic);
863 		}
864 
865 #ifdef DIAGNOSTIC
866 		if (POOL_PHPOISON(ph)) {
867 			size_t pidx;
868 			uint32_t pval;
869 			if (poison_check(pi + 1, pp->pr_size - sizeof(*pi),
870 			    &pidx, &pval)) {
871 				int *ip = (int *)(pi + 1);
872 				panic("%s: %s free list modified: "
873 				    "page %p; item addr %p; offset 0x%zx=0x%x",
874 				    __func__, pp->pr_wchan, ph->ph_page, pi,
875 				    pidx * sizeof(int), ip[pidx]);
876 			}
877 		}
878 #endif
879 	}
880 
881 	pool_allocator_free(pp, ph->ph_page);
882 
883 	if (!POOL_INPGHDR(pp))
884 		pool_put(&phpool, ph);
885 }
886 
887 void
888 pool_p_insert(struct pool *pp, struct pool_item_header *ph)
889 {
890 	MUTEX_ASSERT_LOCKED(&pp->pr_mtx);
891 
892 	/* If the pool was depleted, point at the new page */
893 	if (pp->pr_curpage == NULL)
894 		pp->pr_curpage = ph;
895 
896 	TAILQ_INSERT_TAIL(&pp->pr_emptypages, ph, ph_pagelist);
897 	if (!POOL_INPGHDR(pp))
898 		RBT_INSERT(phtree, &pp->pr_phtree, ph);
899 
900 	pp->pr_nitems += pp->pr_itemsperpage;
901 	pp->pr_nidle++;
902 
903 	pp->pr_npagealloc++;
904 	if (++pp->pr_npages > pp->pr_hiwat)
905 		pp->pr_hiwat = pp->pr_npages;
906 }
907 
908 void
909 pool_p_remove(struct pool *pp, struct pool_item_header *ph)
910 {
911 	MUTEX_ASSERT_LOCKED(&pp->pr_mtx);
912 
913 	pp->pr_npagefree++;
914 	pp->pr_npages--;
915 	pp->pr_nidle--;
916 	pp->pr_nitems -= pp->pr_itemsperpage;
917 
918 	if (!POOL_INPGHDR(pp))
919 		RBT_REMOVE(phtree, &pp->pr_phtree, ph);
920 	TAILQ_REMOVE(&pp->pr_emptypages, ph, ph_pagelist);
921 
922 	pool_update_curpage(pp);
923 }
924 
925 void
926 pool_update_curpage(struct pool *pp)
927 {
928 	pp->pr_curpage = TAILQ_LAST(&pp->pr_partpages, pool_pagelist);
929 	if (pp->pr_curpage == NULL) {
930 		pp->pr_curpage = TAILQ_LAST(&pp->pr_emptypages, pool_pagelist);
931 	}
932 }
933 
934 void
935 pool_setlowat(struct pool *pp, int n)
936 {
937 	int prime = 0;
938 
939 	mtx_enter(&pp->pr_mtx);
940 	pp->pr_minitems = n;
941 	pp->pr_minpages = (n == 0)
942 		? 0
943 		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
944 
945 	if (pp->pr_nitems < n)
946 		prime = n - pp->pr_nitems;
947 	mtx_leave(&pp->pr_mtx);
948 
949 	if (prime > 0)
950 		pool_prime(pp, prime);
951 }
952 
953 void
954 pool_sethiwat(struct pool *pp, int n)
955 {
956 	pp->pr_maxpages = (n == 0)
957 		? 0
958 		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
959 }
960 
961 int
962 pool_sethardlimit(struct pool *pp, u_int n, const char *warnmsg, int ratecap)
963 {
964 	int error = 0;
965 
966 	if (n < pp->pr_nout) {
967 		error = EINVAL;
968 		goto done;
969 	}
970 
971 	pp->pr_hardlimit = n;
972 	pp->pr_hardlimit_warning = warnmsg;
973 	pp->pr_hardlimit_ratecap.tv_sec = ratecap;
974 	pp->pr_hardlimit_warning_last.tv_sec = 0;
975 	pp->pr_hardlimit_warning_last.tv_usec = 0;
976 
977 done:
978 	return (error);
979 }
980 
981 void
982 pool_set_constraints(struct pool *pp, const struct kmem_pa_mode *mode)
983 {
984 	pp->pr_crange = mode;
985 }
986 
987 /*
988  * Release all complete pages that have not been used recently.
989  *
990  * Returns non-zero if any pages have been reclaimed.
991  */
992 int
993 pool_reclaim(struct pool *pp)
994 {
995 	struct pool_item_header *ph, *phnext;
996 	struct pool_pagelist pl = TAILQ_HEAD_INITIALIZER(pl);
997 
998 	mtx_enter(&pp->pr_mtx);
999 	for (ph = TAILQ_FIRST(&pp->pr_emptypages); ph != NULL; ph = phnext) {
1000 		phnext = TAILQ_NEXT(ph, ph_pagelist);
1001 
1002 		/* Check our minimum page claim */
1003 		if (pp->pr_npages <= pp->pr_minpages)
1004 			break;
1005 
1006 		/*
1007 		 * If freeing this page would put us below
1008 		 * the low water mark, stop now.
1009 		 */
1010 		if ((pp->pr_nitems - pp->pr_itemsperpage) <
1011 		    pp->pr_minitems)
1012 			break;
1013 
1014 		pool_p_remove(pp, ph);
1015 		TAILQ_INSERT_TAIL(&pl, ph, ph_pagelist);
1016 	}
1017 	mtx_leave(&pp->pr_mtx);
1018 
1019 	if (TAILQ_EMPTY(&pl))
1020 		return (0);
1021 
1022 	while ((ph = TAILQ_FIRST(&pl)) != NULL) {
1023 		TAILQ_REMOVE(&pl, ph, ph_pagelist);
1024 		pool_p_free(pp, ph);
1025 	}
1026 
1027 	return (1);
1028 }
1029 
1030 /*
1031  * Release all complete pages that have not been used recently
1032  * from all pools.
1033  */
1034 void
1035 pool_reclaim_all(void)
1036 {
1037 	struct pool	*pp;
1038 
1039 	rw_enter_read(&pool_lock);
1040 	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist)
1041 		pool_reclaim(pp);
1042 	rw_exit_read(&pool_lock);
1043 }
1044 
1045 #ifdef DDB
1046 #include <machine/db_machdep.h>
1047 #include <ddb/db_output.h>
1048 
1049 /*
1050  * Diagnostic helpers.
1051  */
1052 void
1053 pool_printit(struct pool *pp, const char *modif,
1054     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
1055 {
1056 	pool_print1(pp, modif, pr);
1057 }
1058 
1059 void
1060 pool_print_pagelist(struct pool_pagelist *pl,
1061     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
1062 {
1063 	struct pool_item_header *ph;
1064 	struct pool_item *pi;
1065 
1066 	TAILQ_FOREACH(ph, pl, ph_pagelist) {
1067 		(*pr)("\t\tpage %p, color %p, nmissing %d\n",
1068 		    ph->ph_page, ph->ph_colored, ph->ph_nmissing);
1069 		XSIMPLEQ_FOREACH(pi, &ph->ph_itemlist, pi_list) {
1070 			if (pi->pi_magic != POOL_IMAGIC(ph, pi)) {
1071 				(*pr)("\t\t\titem %p, magic 0x%lx\n",
1072 				    pi, pi->pi_magic);
1073 			}
1074 		}
1075 	}
1076 }
1077 
1078 void
1079 pool_print1(struct pool *pp, const char *modif,
1080     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
1081 {
1082 	struct pool_item_header *ph;
1083 	int print_pagelist = 0;
1084 	char c;
1085 
1086 	while ((c = *modif++) != '\0') {
1087 		if (c == 'p')
1088 			print_pagelist = 1;
1089 	}
1091 
1092 	(*pr)("POOL %s: size %u maxcolors %u\n", pp->pr_wchan, pp->pr_size,
1093 	    pp->pr_maxcolors);
1094 	(*pr)("\talloc %p\n", pp->pr_alloc);
1095 	(*pr)("\tminitems %u, minpages %u, maxpages %u, npages %u\n",
1096 	    pp->pr_minitems, pp->pr_minpages, pp->pr_maxpages, pp->pr_npages);
1097 	(*pr)("\titemsperpage %u, nitems %u, nout %u, hardlimit %u\n",
1098 	    pp->pr_itemsperpage, pp->pr_nitems, pp->pr_nout, pp->pr_hardlimit);
1099 
1100 	(*pr)("\n\tnget %lu, nfail %lu, nput %lu\n",
1101 	    pp->pr_nget, pp->pr_nfail, pp->pr_nput);
1102 	(*pr)("\tnpagealloc %lu, npagefree %lu, hiwat %u, nidle %lu\n",
1103 	    pp->pr_npagealloc, pp->pr_npagefree, pp->pr_hiwat, pp->pr_nidle);
1104 
1105 	if (print_pagelist == 0)
1106 		return;
1107 
1108 	if ((ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL)
1109 		(*pr)("\n\tempty page list:\n");
1110 	pool_print_pagelist(&pp->pr_emptypages, pr);
1111 	if ((ph = TAILQ_FIRST(&pp->pr_fullpages)) != NULL)
1112 		(*pr)("\n\tfull page list:\n");
1113 	pool_print_pagelist(&pp->pr_fullpages, pr);
1114 	if ((ph = TAILQ_FIRST(&pp->pr_partpages)) != NULL)
1115 		(*pr)("\n\tpartial-page list:\n");
1116 	pool_print_pagelist(&pp->pr_partpages, pr);
1117 
1118 	if (pp->pr_curpage == NULL)
1119 		(*pr)("\tno current page\n");
1120 	else
1121 		(*pr)("\tcurpage %p\n", pp->pr_curpage->ph_page);
1122 }
1123 
1124 void
1125 db_show_all_pools(db_expr_t expr, int haddr, db_expr_t count, char *modif)
1126 {
1127 	struct pool *pp;
1128 	char maxp[16];
1129 	int ovflw;
1130 	char mode;
1131 
1132 	mode = modif[0];
1133 	if (mode != '\0' && mode != 'a') {
1134 		db_printf("usage: show all pools [/a]\n");
1135 		return;
1136 	}
1137 
1138 	if (mode == '\0')
1139 		db_printf("%-10s%4s%9s%5s%9s%6s%6s%6s%6s%6s%6s%5s\n",
1140 		    "Name",
1141 		    "Size",
1142 		    "Requests",
1143 		    "Fail",
1144 		    "Releases",
1145 		    "Pgreq",
1146 		    "Pgrel",
1147 		    "Npage",
1148 		    "Hiwat",
1149 		    "Minpg",
1150 		    "Maxpg",
1151 		    "Idle");
1152 	else
1153 		db_printf("%-12s %18s %18s\n",
1154 		    "Name", "Address", "Allocator");
1155 
1156 	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) {
1157 		if (mode == 'a') {
1158 			db_printf("%-12s %18p %18p\n", pp->pr_wchan, pp,
1159 			    pp->pr_alloc);
1160 			continue;
1161 		}
1162 
1163 		if (!pp->pr_nget)
1164 			continue;
1165 
1166 		if (pp->pr_maxpages == UINT_MAX)
1167 			snprintf(maxp, sizeof maxp, "inf");
1168 		else
1169 			snprintf(maxp, sizeof maxp, "%u", pp->pr_maxpages);
1170 
1171 #define PRWORD(ovflw, fmt, width, fixed, val) do {	\
1172 	(ovflw) += db_printf((fmt),			\
1173 	    (width) - (fixed) - (ovflw) > 0 ?		\
1174 	    (width) - (fixed) - (ovflw) : 0,		\
1175 	    (val)) - (width);				\
1176 	if ((ovflw) < 0)				\
1177 		(ovflw) = 0;				\
1178 } while (/* CONSTCOND */0)
1179 
1180 		ovflw = 0;
1181 		PRWORD(ovflw, "%-*s", 10, 0, pp->pr_wchan);
1182 		PRWORD(ovflw, " %*u", 4, 1, pp->pr_size);
1183 		PRWORD(ovflw, " %*lu", 9, 1, pp->pr_nget);
1184 		PRWORD(ovflw, " %*lu", 5, 1, pp->pr_nfail);
1185 		PRWORD(ovflw, " %*lu", 9, 1, pp->pr_nput);
1186 		PRWORD(ovflw, " %*lu", 6, 1, pp->pr_npagealloc);
1187 		PRWORD(ovflw, " %*lu", 6, 1, pp->pr_npagefree);
1188 		PRWORD(ovflw, " %*d", 6, 1, pp->pr_npages);
1189 		PRWORD(ovflw, " %*d", 6, 1, pp->pr_hiwat);
1190 		PRWORD(ovflw, " %*d", 6, 1, pp->pr_minpages);
1191 		PRWORD(ovflw, " %*s", 6, 1, maxp);
1192 		PRWORD(ovflw, " %*lu\n", 5, 1, pp->pr_nidle);
1193 
1194 		pool_chk(pp);
1195 	}
1196 }
1197 #endif /* DDB */
1198 
1199 #if defined(POOL_DEBUG) || defined(DDB)
1200 int
1201 pool_chk_page(struct pool *pp, struct pool_item_header *ph, int expected)
1202 {
1203 	struct pool_item *pi;
1204 	caddr_t page;
1205 	int n;
1206 	const char *label = pp->pr_wchan;
1207 
1208 	page = (caddr_t)((u_long)ph & pp->pr_pgmask);
1209 	if (page != ph->ph_page && POOL_INPGHDR(pp)) {
1210 		printf("%s: ", label);
1211 		printf("pool(%p:%s): page inconsistency: page %p; "
1212 		    "at page head addr %p (p %p)\n",
1213 		    pp, pp->pr_wchan, ph->ph_page, ph, page);
1214 		return 1;
1215 	}
1216 
1217 	for (pi = XSIMPLEQ_FIRST(&ph->ph_itemlist), n = 0;
1218 	     pi != NULL;
1219 	     pi = XSIMPLEQ_NEXT(&ph->ph_itemlist, pi, pi_list), n++) {
1220 		if ((caddr_t)pi < ph->ph_page ||
1221 		    (caddr_t)pi >= ph->ph_page + pp->pr_pgsize) {
1222 			printf("%s: ", label);
1223 			printf("pool(%p:%s): page inconsistency: page %p;"
1224 			    " item ordinal %d; addr %p\n", pp,
1225 			    pp->pr_wchan, ph->ph_page, n, pi);
1226 			return (1);
1227 		}
1228 
1229 		if (pi->pi_magic != POOL_IMAGIC(ph, pi)) {
1230 			printf("%s: ", label);
1231 			printf("pool(%p:%s): free list modified: "
1232 			    "page %p; item ordinal %d; addr %p "
1233 			    "(p %p); offset 0x%x=0x%lx\n",
1234 			    pp, pp->pr_wchan, ph->ph_page, n, pi, page,
1235 			    0, pi->pi_magic);
1236 		}
1237 
1238 #ifdef DIAGNOSTIC
1239 		if (POOL_PHPOISON(ph)) {
1240 			size_t pidx;
1241 			uint32_t pval;
1242 			if (poison_check(pi + 1, pp->pr_size - sizeof(*pi),
1243 			    &pidx, &pval)) {
1244 				int *ip = (int *)(pi + 1);
1245 				printf("pool(%s): free list modified: "
1246 				    "page %p; item ordinal %d; addr %p "
1247 				    "(p %p); offset 0x%zx=0x%x\n",
1248 				    pp->pr_wchan, ph->ph_page, n, pi,
1249 				    page, pidx * sizeof(int), ip[pidx]);
1250 			}
1251 		}
1252 #endif /* DIAGNOSTIC */
1253 	}
1254 	if (n + ph->ph_nmissing != pp->pr_itemsperpage) {
1255 		printf("pool(%p:%s): page inconsistency: page %p;"
1256 		    " %d on list, %d missing, %d items per page\n", pp,
1257 		    pp->pr_wchan, ph->ph_page, n, ph->ph_nmissing,
1258 		    pp->pr_itemsperpage);
1259 		return 1;
1260 	}
1261 	if (expected >= 0 && n != expected) {
1262 		printf("pool(%p:%s): page inconsistency: page %p;"
1263 		    " %d on list, %d missing, %d expected\n", pp,
1264 		    pp->pr_wchan, ph->ph_page, n, ph->ph_nmissing,
1265 		    expected);
1266 		return 1;
1267 	}
1268 	return 0;
1269 }
1270 
1271 int
1272 pool_chk(struct pool *pp)
1273 {
1274 	struct pool_item_header *ph;
1275 	int r = 0;
1276 
1277 	TAILQ_FOREACH(ph, &pp->pr_emptypages, ph_pagelist)
1278 		r += pool_chk_page(pp, ph, pp->pr_itemsperpage);
1279 	TAILQ_FOREACH(ph, &pp->pr_fullpages, ph_pagelist)
1280 		r += pool_chk_page(pp, ph, 0);
1281 	TAILQ_FOREACH(ph, &pp->pr_partpages, ph_pagelist)
1282 		r += pool_chk_page(pp, ph, -1);
1283 
1284 	return (r);
1285 }
1286 #endif /* defined(POOL_DEBUG) || defined(DDB) */
1287 
1288 #ifdef DDB
1289 void
1290 pool_walk(struct pool *pp, int full,
1291     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))),
1292     void (*func)(void *, int, int (*)(const char *, ...)
1293 	    __attribute__((__format__(__kprintf__,1,2)))))
1294 {
1295 	struct pool_item_header *ph;
1296 	struct pool_item *pi;
1297 	caddr_t cp;
1298 	int n;
1299 
1300 	TAILQ_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) {
1301 		cp = ph->ph_colored;
1302 		n = ph->ph_nmissing;
1303 
1304 		while (n--) {
1305 			func(cp, full, pr);
1306 			cp += pp->pr_size;
1307 		}
1308 	}
1309 
1310 	TAILQ_FOREACH(ph, &pp->pr_partpages, ph_pagelist) {
1311 		cp = ph->ph_colored;
1312 		n = ph->ph_nmissing;
1313 
1314 		do {
1315 			XSIMPLEQ_FOREACH(pi, &ph->ph_itemlist, pi_list) {
1316 				if (cp == (caddr_t)pi)
1317 					break;
1318 			}
1319 			if (cp != (caddr_t)pi) {
1320 				func(cp, full, pr);
1321 				n--;
1322 			}
1323 
1324 			cp += pp->pr_size;
1325 		} while (n > 0);
1326 	}
1327 }
1328 #endif
1329 
1330 /*
1331  * We have three different sysctls.
1332  * kern.pool.npools - the number of pools.
1333  * kern.pool.pool.<pool#> - the pool struct for the pool#.
1334  * kern.pool.name.<pool#> - the name for pool#.
1335  */
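/*
 * Illustrative userland sketch (assumes the CTL_KERN/KERN_POOL MIB names
 * from <sys/sysctl.h> and <sys/pool.h>): read the pool count, then the
 * kinfo_pool of the pool whose serial number is "serial".
 *
 *	int mib[4] = { CTL_KERN, KERN_POOL, KERN_POOL_NPOOLS };
 *	int npools;
 *	size_t len = sizeof(npools);
 *
 *	sysctl(mib, 3, &npools, &len, NULL, 0);
 *
 *	struct kinfo_pool pi;
 *	size_t pilen = sizeof(pi);
 *
 *	mib[2] = KERN_POOL_POOL;
 *	mib[3] = serial;
 *	sysctl(mib, 4, &pi, &pilen, NULL, 0);
 */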
1336 int
1337 sysctl_dopool(int *name, u_int namelen, char *oldp, size_t *oldlenp)
1338 {
1339 	struct kinfo_pool pi;
1340 	struct pool *pp;
1341 	int rv = ENOENT;
1342 
1343 	switch (name[0]) {
1344 	case KERN_POOL_NPOOLS:
1345 		if (namelen != 1)
1346 			return (ENOTDIR);
1347 		return (sysctl_rdint(oldp, oldlenp, NULL, pool_count));
1348 
1349 	case KERN_POOL_NAME:
1350 	case KERN_POOL_POOL:
1351 		break;
1352 	default:
1353 		return (EOPNOTSUPP);
1354 	}
1355 
1356 	if (namelen != 2)
1357 		return (ENOTDIR);
1358 
1359 	rw_enter_read(&pool_lock);
1360 
1361 	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) {
1362 		if (name[1] == pp->pr_serial)
1363 			break;
1364 	}
1365 
1366 	if (pp == NULL)
1367 		goto done;
1368 
1369 	switch (name[0]) {
1370 	case KERN_POOL_NAME:
1371 		rv = sysctl_rdstring(oldp, oldlenp, NULL, pp->pr_wchan);
1372 		break;
1373 	case KERN_POOL_POOL:
1374 		memset(&pi, 0, sizeof(pi));
1375 
1376 		mtx_enter(&pp->pr_mtx);
1377 		pi.pr_size = pp->pr_size;
1378 		pi.pr_pgsize = pp->pr_pgsize;
1379 		pi.pr_itemsperpage = pp->pr_itemsperpage;
1380 		pi.pr_npages = pp->pr_npages;
1381 		pi.pr_minpages = pp->pr_minpages;
1382 		pi.pr_maxpages = pp->pr_maxpages;
1383 		pi.pr_hardlimit = pp->pr_hardlimit;
1384 		pi.pr_nout = pp->pr_nout;
1385 		pi.pr_nitems = pp->pr_nitems;
1386 		pi.pr_nget = pp->pr_nget;
1387 		pi.pr_nput = pp->pr_nput;
1388 		pi.pr_nfail = pp->pr_nfail;
1389 		pi.pr_npagealloc = pp->pr_npagealloc;
1390 		pi.pr_npagefree = pp->pr_npagefree;
1391 		pi.pr_hiwat = pp->pr_hiwat;
1392 		pi.pr_nidle = pp->pr_nidle;
1393 		mtx_leave(&pp->pr_mtx);
1394 
1395 		pool_cache_info(pp, &pi);
1396 
1397 		rv = sysctl_rdstruct(oldp, oldlenp, NULL, &pi, sizeof(pi));
1398 		break;
1399 	}
1400 
1401 done:
1402 	rw_exit_read(&pool_lock);
1403 
1404 	return (rv);
1405 }
1406 
1407 void
1408 pool_gc_sched(void *null)
1409 {
1410 	task_add(systqmp, &pool_gc_task);
1411 }
1412 
1413 void
1414 pool_gc_pages(void *null)
1415 {
1416 	struct pool *pp;
1417 	struct pool_item_header *ph, *freeph;
1418 	int s;
1419 
1420 	rw_enter_read(&pool_lock);
1421 	s = splvm(); /* XXX go to splvm until all pools _setipl properly */
1422 	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) {
1423 		if (pp->pr_nidle <= pp->pr_minpages || /* guess */
1424 		    !mtx_enter_try(&pp->pr_mtx)) /* try */
1425 			continue;
1426 
1427 		/* is it time to free a page? */
1428 		if (pp->pr_nidle > pp->pr_minpages &&
1429 		    (ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL &&
1430 		    (ticks - ph->ph_tick) > (hz * pool_wait_gc)) {
1431 			freeph = ph;
1432 			pool_p_remove(pp, freeph);
1433 		} else
1434 			freeph = NULL;
1435 
1436 		mtx_leave(&pp->pr_mtx);
1437 
1438 		if (freeph != NULL)
1439 			pool_p_free(pp, freeph);
1440 	}
1441 	splx(s);
1442 	rw_exit_read(&pool_lock);
1443 
1444 	timeout_add_sec(&pool_gc_tick, 1);
1445 }
1446 
1447 /*
1448  * Pool backend allocators.
1449  */
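/*
 * Illustrative sketch of a custom backend allocator ("foo_page_alloc",
 * "foo_page_free" and "foo_allocator" are hypothetical names): pa_alloc
 * and pa_free supply and release whole pages for the pool, and a
 * non-zero pa_pagesz overrides the page size pool_init() would pick.
 *
 *	void	*foo_page_alloc(struct pool *, int, int *);
 *	void	 foo_page_free(struct pool *, void *);
 *
 *	struct pool_allocator foo_allocator = {
 *		foo_page_alloc,
 *		foo_page_free,
 *		16 * 1024
 *	};
 *
 *	pool_init(&foopl, sizeof(struct foo), 0, IPL_BIO, 0, "foopl",
 *	    &foo_allocator);
 */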
1450 
1451 void *
1452 pool_allocator_alloc(struct pool *pp, int flags, int *slowdown)
1453 {
1454 	void *v;
1455 
1456 	v = (*pp->pr_alloc->pa_alloc)(pp, flags, slowdown);
1457 
1458 #ifdef DIAGNOSTIC
1459 	if (v != NULL && POOL_INPGHDR(pp)) {
1460 		vaddr_t addr = (vaddr_t)v;
1461 		if ((addr & pp->pr_pgmask) != addr) {
1462 			panic("%s: %s page address %p isn't aligned to %u",
1463 			    __func__, pp->pr_wchan, v, pp->pr_pgsize);
1464 		}
1465 	}
1466 #endif
1467 
1468 	return (v);
1469 }
1470 
1471 void
1472 pool_allocator_free(struct pool *pp, void *v)
1473 {
1474 	struct pool_allocator *pa = pp->pr_alloc;
1475 
1476 	(*pa->pa_free)(pp, v);
1477 }
1478 
1479 void *
1480 pool_page_alloc(struct pool *pp, int flags, int *slowdown)
1481 {
1482 	struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER;
1483 
1484 	kd.kd_waitok = ISSET(flags, PR_WAITOK);
1485 	kd.kd_slowdown = slowdown;
1486 
1487 	return (km_alloc(pp->pr_pgsize, &kv_page, pp->pr_crange, &kd));
1488 }
1489 
1490 void
1491 pool_page_free(struct pool *pp, void *v)
1492 {
1493 	km_free(v, pp->pr_pgsize, &kv_page, pp->pr_crange);
1494 }
1495 
1496 void *
1497 pool_multi_alloc(struct pool *pp, int flags, int *slowdown)
1498 {
1499 	struct kmem_va_mode kv = kv_intrsafe;
1500 	struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER;
1501 	void *v;
1502 	int s;
1503 
1504 	if (POOL_INPGHDR(pp))
1505 		kv.kv_align = pp->pr_pgsize;
1506 
1507 	kd.kd_waitok = ISSET(flags, PR_WAITOK);
1508 	kd.kd_slowdown = slowdown;
1509 
1510 	s = splvm();
1511 	v = km_alloc(pp->pr_pgsize, &kv, pp->pr_crange, &kd);
1512 	splx(s);
1513 
1514 	return (v);
1515 }
1516 
1517 void
1518 pool_multi_free(struct pool *pp, void *v)
1519 {
1520 	struct kmem_va_mode kv = kv_intrsafe;
1521 	int s;
1522 
1523 	if (POOL_INPGHDR(pp))
1524 		kv.kv_align = pp->pr_pgsize;
1525 
1526 	s = splvm();
1527 	km_free(v, pp->pr_pgsize, &kv, pp->pr_crange);
1528 	splx(s);
1529 }
1530 
1531 void *
1532 pool_multi_alloc_ni(struct pool *pp, int flags, int *slowdown)
1533 {
1534 	struct kmem_va_mode kv = kv_any;
1535 	struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER;
1536 	void *v;
1537 
1538 	if (POOL_INPGHDR(pp))
1539 		kv.kv_align = pp->pr_pgsize;
1540 
1541 	kd.kd_waitok = ISSET(flags, PR_WAITOK);
1542 	kd.kd_slowdown = slowdown;
1543 
1544 	KERNEL_LOCK();
1545 	v = km_alloc(pp->pr_pgsize, &kv, pp->pr_crange, &kd);
1546 	KERNEL_UNLOCK();
1547 
1548 	return (v);
1549 }
1550 
1551 void
1552 pool_multi_free_ni(struct pool *pp, void *v)
1553 {
1554 	struct kmem_va_mode kv = kv_any;
1555 
1556 	if (POOL_INPGHDR(pp))
1557 		kv.kv_align = pp->pr_pgsize;
1558 
1559 	KERNEL_LOCK();
1560 	km_free(v, pp->pr_pgsize, &kv, pp->pr_crange);
1561 	KERNEL_UNLOCK();
1562 }
1563 
1564 #ifdef MULTIPROCESSOR
1565 
1566 struct pool pool_caches; /* per cpu cache entries */
1567 
1568 void
1569 pool_cache_init(struct pool *pp)
1570 {
1571 	struct cpumem *cm;
1572 	struct pool_cache *pc;
1573 	struct cpumem_iter i;
1574 
1575 	if (pool_caches.pr_size == 0) {
1576 		pool_init(&pool_caches, sizeof(struct pool_cache), 64,
1577 		    IPL_NONE, PR_WAITOK, "plcache", NULL);
1578 	}
1579 
1580 	KASSERT(pp->pr_size >= sizeof(*pc));
1581 
1582 	cm = cpumem_get(&pool_caches);
1583 
1584 	mtx_init(&pp->pr_cache_mtx, pp->pr_ipl);
1585 	TAILQ_INIT(&pp->pr_cache_lists);
1586 	pp->pr_cache_nlist = 0;
1587 	pp->pr_cache_items = 8;
1588 	pp->pr_cache_contention = 0;
1589 
1590 	CPUMEM_FOREACH(pc, &i, cm) {
1591 		pc->pc_actv = NULL;
1592 		pc->pc_nactv = 0;
1593 		pc->pc_prev = NULL;
1594 
1595 		pc->pc_gets = 0;
1596 		pc->pc_puts = 0;
1597 		pc->pc_fails = 0;
1598 		pc->pc_nout = 0;
1599 	}
1600 
1601 	pp->pr_cache = cm;
1602 }
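
/*
 * Illustrative use ("foopl" and "struct foo" are hypothetical names): the
 * per-CPU cache is an opt-in made after pool_init() for pools with very
 * frequent get/put traffic; note the KASSERT above requires the item
 * size to be at least sizeof(struct pool_cache).
 *
 *	pool_init(&foopl, sizeof(struct foo), 0, IPL_NET, 0, "foopl", NULL);
 *	pool_cache_init(&foopl);
 */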
1603 
1604 static inline void
1605 pool_list_enter(struct pool *pp)
1606 {
1607 	if (mtx_enter_try(&pp->pr_cache_mtx) == 0) {
1608 		mtx_enter(&pp->pr_cache_mtx);
1609 		pp->pr_cache_contention++;
1610 	}
1611 }
1612 
1613 static inline void
1614 pool_list_leave(struct pool *pp)
1615 {
1616 	mtx_leave(&pp->pr_cache_mtx);
1617 }
1618 
1619 static inline struct pool_list *
1620 pool_list_alloc(struct pool *pp, struct pool_cache *pc)
1621 {
1622 	struct pool_list *pl;
1623 
1624 	pool_list_enter(pp);
1625 	pl = TAILQ_FIRST(&pp->pr_cache_lists);
1626 	if (pl != NULL) {
1627 		TAILQ_REMOVE(&pp->pr_cache_lists, pl, pl_nextl);
1628 		pp->pr_cache_nlist--;
1629 	}
1630 
1631 	pp->pr_cache_nout += pc->pc_nout;
1632 	pc->pc_nout = 0;
1633 	pool_list_leave(pp);
1634 
1635 	return (pl);
1636 }
1637 
1638 static inline void
1639 pool_list_free(struct pool *pp, struct pool_cache *pc, struct pool_list *pl)
1640 {
1641 	pool_list_enter(pp);
1642 	TAILQ_INSERT_TAIL(&pp->pr_cache_lists, pl, pl_nextl);
1643 	pp->pr_cache_nlist++;
1644 
1645 	pp->pr_cache_nout += pc->pc_nout;
1646 	pc->pc_nout = 0;
1647 	pool_list_leave(pp);
1648 }
1649 
1650 static inline struct pool_cache *
1651 pool_cache_enter(struct pool *pp, int *s)
1652 {
1653 	struct pool_cache *pc;
1654 
1655 	pc = cpumem_enter(pp->pr_cache);
1656 	*s = splraise(pp->pr_ipl);
1657 	pc->pc_gen++;
1658 
1659 	return (pc);
1660 }
1661 
1662 static inline void
1663 pool_cache_leave(struct pool *pp, struct pool_cache *pc, int s)
1664 {
1665 	pc->pc_gen++;
1666 	splx(s);
1667 	cpumem_leave(pp->pr_cache, pc);
1668 }
1669 
1670 void *
1671 pool_cache_get(struct pool *pp)
1672 {
1673 	struct pool_cache *pc;
1674 	struct pool_list *pl;
1675 	int s;
1676 
1677 	pc = pool_cache_enter(pp, &s);
1678 
1679 	if (pc->pc_actv != NULL) {
1680 		pl = pc->pc_actv;
1681 	} else if (pc->pc_prev != NULL) {
1682 		pl = pc->pc_prev;
1683 		pc->pc_prev = NULL;
1684 	} else if ((pl = pool_list_alloc(pp, pc)) == NULL) {
1685 		pc->pc_fails++;
1686 		goto done;
1687 	}
1688 
1689 	pc->pc_actv = pl->pl_next;
1690 	pc->pc_nactv = POOL_LIST_NITEMS(pl) - 1;
1691 	pc->pc_gets++;
1692 	pc->pc_nout++;
1693 
1694 done:
1695 	pool_cache_leave(pp, pc, s);
1696 
1697 #ifdef DIAGNOSTIC
1698 	if (pool_debug && pl != NULL && POOL_LIST_POISONED(pl)) {
1699 		size_t pidx;
1700 		uint32_t pval;
1701 
1702 		if (poison_check(pl + 1, pp->pr_size - sizeof(*pl),
1703 		    &pidx, &pval)) {
1704 			int *ip = (int *)(pl + 1);
1705 			panic("%s: %s cpu free list modified: "
1706 			    "item addr %p; offset 0x%zx=0x%x",
1707 			    __func__, pp->pr_wchan, pl,
1708 			    pidx * sizeof(int) + sizeof(*pl), ip[pidx]);
1709 		}
1710 	}
1711 #endif
1712 
1713 	return (pl);
1714 }
1715 
1716 void
1717 pool_cache_put(struct pool *pp, void *v)
1718 {
1719 	struct pool_cache *pc;
1720 	struct pool_list *pl = v;
1721 	unsigned long nitems;
1722 	int s;
1723 #ifdef DIAGNOSTIC
1724 	int poison = pool_debug && pp->pr_size > sizeof(*pl);
1725 
1726 	if (poison)
1727 		poison_mem(pl + 1, pp->pr_size - sizeof(*pl));
1728 #endif
1729 
1730 	pc = pool_cache_enter(pp, &s);
1731 
1732 	nitems = pc->pc_nactv;
1733 	if (nitems >= pp->pr_cache_items) {
1734 		if (pc->pc_prev != NULL)
1735 			pool_list_free(pp, pc, pc->pc_prev);
1736 
1737 		pc->pc_prev = pc->pc_actv;
1738 
1739 		pc->pc_actv = NULL;
1740 		pc->pc_nactv = 0;
1741 		nitems = 0;
1742 	}
1743 
1744 	pl->pl_next = pc->pc_actv;
1745 	pl->pl_nitems = ++nitems;
1746 #ifdef DIAGNOSTIC
1747 	pl->pl_nitems |= poison ? POOL_LIST_NITEMS_POISON : 0;
1748 #endif
1749 
1750 	pc->pc_actv = pl;
1751 	pc->pc_nactv = nitems;
1752 
1753 	pc->pc_puts++;
1754 	pc->pc_nout--;
1755 
1756 	pool_cache_leave(pp, pc, s);
1757 }
1758 
1759 struct pool_list *
1760 pool_list_put(struct pool *pp, struct pool_list *pl)
1761 {
1762 	struct pool_list *rpl, *npl;
1763 
1764 	if (pl == NULL)
1765 		return (NULL);
1766 
1767 	rpl = TAILQ_NEXT(pl, pl_nextl);
1768 
1769 	do {
1770 		npl = pl->pl_next;
1771 		pool_put(pp, pl);
1772 		pl = npl;
1773 	} while (pl != NULL);
1774 
1775 	return (rpl);
1776 }
1777 
1778 void
1779 pool_cache_destroy(struct pool *pp)
1780 {
1781 	struct pool_cache *pc;
1782 	struct pool_list *pl;
1783 	struct cpumem_iter i;
1784 	struct cpumem *cm;
1785 
1786 	cm = pp->pr_cache;
1787 	pp->pr_cache = NULL; /* make pool_put avoid the cache */
1788 
1789 	CPUMEM_FOREACH(pc, &i, cm) {
1790 		pool_list_put(pp, pc->pc_actv);
1791 		pool_list_put(pp, pc->pc_prev);
1792 	}
1793 
1794 	cpumem_put(&pool_caches, cm);
1795 
1796 	pl = TAILQ_FIRST(&pp->pr_cache_lists);
1797 	while (pl != NULL)
1798 		pl = pool_list_put(pp, pl);
1799 }
1800 
1801 void
1802 pool_cache_info(struct pool *pp, struct kinfo_pool *pi)
1803 {
1804 	struct pool_cache *pc;
1805 	struct cpumem_iter i;
1806 
1807 	if (pp->pr_cache == NULL)
1808 		return;
1809 
1810 	/* loop through the caches twice to collect stats */
1811 
1812 	/* once without the mtx so we can yield while reading nget/nput */
1813 	CPUMEM_FOREACH(pc, &i, pp->pr_cache) {
1814 		uint64_t gen, nget, nput;
1815 
1816 		do {
1817 			while ((gen = pc->pc_gen) & 1)
1818 				yield();
1819 
1820 			nget = pc->pc_gets;
1821 			nput = pc->pc_puts;
1822 		} while (gen != pc->pc_gen);
1823 
1824 		pi->pr_nget += nget;
1825 		pi->pr_nput += nput;
1826 	}
1827 
1828 	/* and once with the mtx so we can get consistent nout values */
1829 	mtx_enter(&pp->pr_cache_mtx);
1830 	CPUMEM_FOREACH(pc, &i, pp->pr_cache)
1831 		pi->pr_nout += pc->pc_nout;
1832 
1833 	pi->pr_nout += pp->pr_cache_nout;
1834 	mtx_leave(&pp->pr_cache_mtx);
1835 }
1836 #else /* MULTIPROCESSOR */
1837 void
1838 pool_cache_init(struct pool *pp)
1839 {
1840 	/* nop */
1841 }
1842 
1843 void
1844 pool_cache_info(struct pool *pp, struct kinfo_pool *pi)
1845 {
1846 	/* nop */
1847 }
1848 #endif /* MULTIPROCESSOR */
1849