1 /*	$OpenBSD: subr_pool.c,v 1.199 2016/11/02 01:20:50 dlg Exp $	*/
2 /*	$NetBSD: subr_pool.c,v 1.61 2001/09/26 07:14:56 chs Exp $	*/
3 
4 /*-
5  * Copyright (c) 1997, 1999, 2000 The NetBSD Foundation, Inc.
6  * All rights reserved.
7  *
8  * This code is derived from software contributed to The NetBSD Foundation
9  * by Paul Kranenburg; by Jason R. Thorpe of the Numerical Aerospace
10  * Simulation Facility, NASA Ames Research Center.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
25  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/errno.h>
37 #include <sys/kernel.h>
38 #include <sys/malloc.h>
39 #include <sys/pool.h>
40 #include <sys/syslog.h>
41 #include <sys/rwlock.h>
42 #include <sys/sysctl.h>
43 #include <sys/task.h>
44 #include <sys/timeout.h>
45 #include <sys/percpu.h>
46 
47 #include <uvm/uvm_extern.h>
48 
49 /*
50  * Pool resource management utility.
51  *
52  * Memory is allocated in pages which are split into pieces according to
53  * the pool item size. Each page is kept on one of three lists in the
54  * pool structure: `pr_emptypages', `pr_fullpages' and `pr_partpages',
55  * for empty, full and partially-full pages respectively. The individual
56  * pool items are on a linked list headed by `ph_itemlist' in each page
57  * header. The memory for building the page list is either taken from
58  * the allocated pages themselves (for small pool items) or taken from
59  * an internal pool of page headers (`phpool').
60  */
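/*
 * Illustrative sketch (not part of this file; the pool name and item
 * type are invented): how a subsystem typically consumes this API,
 * using pool_init(), pool_get() and pool_put() as implemented below.
 *
 *	struct pool frob_pool;
 *
 *	void
 *	frob_init(void)
 *	{
 *		pool_init(&frob_pool, sizeof(struct frob), 0, IPL_NONE,
 *		    PR_WAITOK, "frobpl", NULL);
 *	}
 *
 *	struct frob *
 *	frob_alloc(void)
 *	{
 *		return (pool_get(&frob_pool, PR_WAITOK | PR_ZERO));
 *	}
 *
 *	void
 *	frob_free(struct frob *f)
 *	{
 *		pool_put(&frob_pool, f);
 *	}
 */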
61 
62 /* List of all pools */
63 SIMPLEQ_HEAD(,pool) pool_head = SIMPLEQ_HEAD_INITIALIZER(pool_head);
64 
65 /*
66  * Every pool gets a unique serial number assigned to it. If this counter
67  * wraps, we're screwed, but we shouldn't create so many pools anyway.
68  */
69 unsigned int pool_serial;
70 unsigned int pool_count;
71 
72 /* Lock protecting the variables above that make up the global pool state */
73 struct rwlock pool_lock = RWLOCK_INITIALIZER("pools");
74 
75 /* Private pool for page header structures */
76 struct pool phpool;
77 
78 struct pool_item_header {
79 	/* Page headers */
80 	TAILQ_ENTRY(pool_item_header)
81 				ph_pagelist;	/* pool page list */
82 	XSIMPLEQ_HEAD(,pool_item) ph_itemlist;	/* chunk list for this page */
83 	RBT_ENTRY(pool_item_header)
84 				ph_node;	/* Off-page page headers */
85 	int			ph_nmissing;	/* # of chunks in use */
86 	caddr_t			ph_page;	/* this page's address */
87 	caddr_t			ph_colored;	/* page's colored address */
88 	u_long			ph_magic;
89 	int			ph_tick;
90 };
91 #define POOL_MAGICBIT (1 << 3) /* keep away from perturbed low bits */
92 #define POOL_PHPOISON(ph) ISSET((ph)->ph_magic, POOL_MAGICBIT)
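/*
 * ph_magic holds a per-page random value: it is xored with an item's
 * address to form the pi_magic cookie kept in free items (POOL_IMAGIC
 * below), and one bit of it, POOL_MAGICBIT, records whether the free
 * items on that page are poisoned.
 */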
93 
94 struct pool_item {
95 	u_long				pi_magic;
96 	XSIMPLEQ_ENTRY(pool_item)	pi_list;
97 };
98 #define POOL_IMAGIC(ph, pi) ((u_long)(pi) ^ (ph)->ph_magic)
99 
100 #ifdef MULTIPROCESSOR
101 struct pool_list {
102 	struct pool_list	*pl_next;	/* next in list */
103 	unsigned long		 pl_cookie;
104 	struct pool_list	*pl_nextl;	/* next list */
105 	unsigned long		 pl_nitems;	/* items in list */
106 };
107 
108 struct pool_cache {
109 	struct pool_list	*pc_actv;
110 	unsigned long		 pc_nactv;	/* cache pc_actv nitems */
111 	struct pool_list	*pc_prev;
112 
113 	uint64_t		 pc_gen;	/* generation number */
114 	uint64_t		 pc_gets;
115 	uint64_t		 pc_puts;
116 	uint64_t		 pc_fails;
117 
118 	int			 pc_nout;
119 };
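/*
 * The per-CPU caches link free items through the item memory itself:
 * a struct pool_list lives in the first words of every cached item.
 * Each CPU keeps an active list (pc_actv) and a previous list
 * (pc_prev); lists that grow to pr_cache_items entries are rotated out
 * and eventually pushed onto the pool-wide pr_cache_list under
 * pr_cache_mtx.
 */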
120 
121 void	*pool_cache_get(struct pool *);
122 void	 pool_cache_put(struct pool *, void *);
123 void	 pool_cache_destroy(struct pool *);
124 #endif
125 void	 pool_cache_info(struct pool *, struct kinfo_pool *);
126 
127 #ifdef POOL_DEBUG
128 int	pool_debug = 1;
129 #else
130 int	pool_debug = 0;
131 #endif
132 
133 #define POOL_INPGHDR(pp) ((pp)->pr_phoffset != 0)
134 
135 struct pool_item_header *
136 	 pool_p_alloc(struct pool *, int, int *);
137 void	 pool_p_insert(struct pool *, struct pool_item_header *);
138 void	 pool_p_remove(struct pool *, struct pool_item_header *);
139 void	 pool_p_free(struct pool *, struct pool_item_header *);
140 
141 void	 pool_update_curpage(struct pool *);
142 void	*pool_do_get(struct pool *, int, int *);
143 int	 pool_chk_page(struct pool *, struct pool_item_header *, int);
144 int	 pool_chk(struct pool *);
145 void	 pool_get_done(void *, void *);
146 void	 pool_runqueue(struct pool *, int);
147 
148 void	*pool_allocator_alloc(struct pool *, int, int *);
149 void	 pool_allocator_free(struct pool *, void *);
150 
151 /*
152  * The default pool allocator.
153  */
154 void	*pool_page_alloc(struct pool *, int, int *);
155 void	pool_page_free(struct pool *, void *);
156 
157 /*
158  * safe for interrupts; this is the default allocator
159  */
160 struct pool_allocator pool_allocator_single = {
161 	pool_page_alloc,
162 	pool_page_free
163 };
164 
165 void	*pool_multi_alloc(struct pool *, int, int *);
166 void	pool_multi_free(struct pool *, void *);
167 
168 struct pool_allocator pool_allocator_multi = {
169 	pool_multi_alloc,
170 	pool_multi_free
171 };
172 
173 void	*pool_multi_alloc_ni(struct pool *, int, int *);
174 void	pool_multi_free_ni(struct pool *, void *);
175 
176 struct pool_allocator pool_allocator_multi_ni = {
177 	pool_multi_alloc_ni,
178 	pool_multi_free_ni
179 };
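/*
 * pool_init() picks between these when no allocator is given:
 * pool_allocator_single for pools whose pages fit in PAGE_SIZE,
 * pool_allocator_multi for larger, interrupt-safe allocations, and
 * pool_allocator_multi_ni (non-interrupt, may sleep) when the pool is
 * created with PR_WAITOK.
 */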
180 
181 #ifdef DDB
182 void	 pool_print_pagelist(struct pool_pagelist *, int (*)(const char *, ...)
183 	     __attribute__((__format__(__kprintf__,1,2))));
184 void	 pool_print1(struct pool *, const char *, int (*)(const char *, ...)
185 	     __attribute__((__format__(__kprintf__,1,2))));
186 #endif
187 
188 /* stale page garbage collectors */
189 void	pool_gc_sched(void *);
190 struct timeout pool_gc_tick = TIMEOUT_INITIALIZER(pool_gc_sched, NULL);
191 void	pool_gc_pages(void *);
192 struct task pool_gc_task = TASK_INITIALIZER(pool_gc_pages, NULL);
193 int pool_wait_free = 1;
194 int pool_wait_gc = 8;
195 
196 RBT_PROTOTYPE(phtree, pool_item_header, ph_node, phtree_compare);
197 
198 static inline int
199 phtree_compare(const struct pool_item_header *a,
200     const struct pool_item_header *b)
201 {
202 	vaddr_t va = (vaddr_t)a->ph_page;
203 	vaddr_t vb = (vaddr_t)b->ph_page;
204 
205 	/* the compares in this order are important for the NFIND to work */
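	/*
	 * Returning -1 when b's page is below a's sorts the tree in
	 * descending page-address order, so RBT_NFIND on an item address
	 * yields the header with the highest page address that is still
	 * <= the item, i.e. the page that should contain it;
	 * pr_find_pagehead() then range-checks the result.
	 */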
206 	if (vb < va)
207 		return (-1);
208 	if (vb > va)
209 		return (1);
210 
211 	return (0);
212 }
213 
214 RBT_GENERATE(phtree, pool_item_header, ph_node, phtree_compare);
215 
216 /*
217  * Return the pool page header based on page address.
218  */
219 static inline struct pool_item_header *
220 pr_find_pagehead(struct pool *pp, void *v)
221 {
222 	struct pool_item_header *ph, key;
223 
224 	if (POOL_INPGHDR(pp)) {
225 		caddr_t page;
226 
227 		page = (caddr_t)((vaddr_t)v & pp->pr_pgmask);
228 
229 		return ((struct pool_item_header *)(page + pp->pr_phoffset));
230 	}
231 
232 	key.ph_page = v;
233 	ph = RBT_NFIND(phtree, &pp->pr_phtree, &key);
234 	if (ph == NULL)
235 		panic("%s: %s: page header missing", __func__, pp->pr_wchan);
236 
237 	KASSERT(ph->ph_page <= (caddr_t)v);
238 	if (ph->ph_page + pp->pr_pgsize <= (caddr_t)v)
239 		panic("%s: %s: incorrect page", __func__, pp->pr_wchan);
240 
241 	return (ph);
242 }
243 
244 /*
245  * Initialize the given pool resource structure.
246  *
247  * We export this routine to allow other kernel parts to declare
248  * static pools that must be initialized before malloc() is available.
249  */
250 void
251 pool_init(struct pool *pp, size_t size, u_int align, int ipl, int flags,
252     const char *wchan, struct pool_allocator *palloc)
253 {
254 	int off = 0, space;
255 	unsigned int pgsize = PAGE_SIZE, items;
256 #ifdef DIAGNOSTIC
257 	struct pool *iter;
258 #endif
259 
260 	if (align == 0)
261 		align = ALIGN(1);
262 
263 	if (size < sizeof(struct pool_item))
264 		size = sizeof(struct pool_item);
265 
266 	size = roundup(size, align);
267 
268 	if (palloc == NULL) {
269 		while (size * 8 > pgsize)
270 			pgsize <<= 1;
271 
272 		if (pgsize > PAGE_SIZE) {
273 			palloc = ISSET(flags, PR_WAITOK) ?
274 			    &pool_allocator_multi_ni : &pool_allocator_multi;
275 		} else
276 			palloc = &pool_allocator_single;
277 	} else
278 		pgsize = palloc->pa_pagesz ? palloc->pa_pagesz : PAGE_SIZE;
279 
280 	items = pgsize / size;
281 
282 	/*
283 	 * Decide whether to put the page header off page to avoid
284 	 * wasting too large a part of the page. Off-page page headers
285 	 * go into an RB tree, so we can match a returned item with
286 	 * its header based on the page address.
287 	 */
288 	if (pgsize - (size * items) > sizeof(struct pool_item_header)) {
289 		off = pgsize - sizeof(struct pool_item_header);
290 	} else if (sizeof(struct pool_item_header) * 2 >= size) {
291 		off = pgsize - sizeof(struct pool_item_header);
292 		items = off / size;
293 	}
294 
295 	KASSERT(items > 0);
296 
297 	/*
298 	 * Initialize the pool structure.
299 	 */
300 	memset(pp, 0, sizeof(*pp));
301 	TAILQ_INIT(&pp->pr_emptypages);
302 	TAILQ_INIT(&pp->pr_fullpages);
303 	TAILQ_INIT(&pp->pr_partpages);
304 	pp->pr_curpage = NULL;
305 	pp->pr_npages = 0;
306 	pp->pr_minitems = 0;
307 	pp->pr_minpages = 0;
308 	pp->pr_maxpages = 8;
309 	pp->pr_size = size;
310 	pp->pr_pgsize = pgsize;
311 	pp->pr_pgmask = ~0UL ^ (pgsize - 1);
312 	pp->pr_phoffset = off;
313 	pp->pr_itemsperpage = items;
314 	pp->pr_wchan = wchan;
315 	pp->pr_alloc = palloc;
316 	pp->pr_nitems = 0;
317 	pp->pr_nout = 0;
318 	pp->pr_hardlimit = UINT_MAX;
319 	pp->pr_hardlimit_warning = NULL;
320 	pp->pr_hardlimit_ratecap.tv_sec = 0;
321 	pp->pr_hardlimit_ratecap.tv_usec = 0;
322 	pp->pr_hardlimit_warning_last.tv_sec = 0;
323 	pp->pr_hardlimit_warning_last.tv_usec = 0;
324 	RBT_INIT(phtree, &pp->pr_phtree);
325 
326 	/*
327 	 * Use the space between the chunks and the page header
328 	 * for cache coloring.
329 	 */
330 	space = POOL_INPGHDR(pp) ? pp->pr_phoffset : pp->pr_pgsize;
331 	space -= pp->pr_itemsperpage * pp->pr_size;
332 	pp->pr_align = align;
333 	pp->pr_maxcolors = (space / align) + 1;
334 
335 	pp->pr_nget = 0;
336 	pp->pr_nfail = 0;
337 	pp->pr_nput = 0;
338 	pp->pr_npagealloc = 0;
339 	pp->pr_npagefree = 0;
340 	pp->pr_hiwat = 0;
341 	pp->pr_nidle = 0;
342 
343 	pp->pr_ipl = ipl;
344 	mtx_init(&pp->pr_mtx, pp->pr_ipl);
345 	mtx_init(&pp->pr_requests_mtx, pp->pr_ipl);
346 	TAILQ_INIT(&pp->pr_requests);
347 
348 	if (phpool.pr_size == 0) {
349 		pool_init(&phpool, sizeof(struct pool_item_header), 0,
350 		    IPL_HIGH, 0, "phpool", NULL);
351 
352 		/* make sure phpool won't "recurse" */
353 		KASSERT(POOL_INPGHDR(&phpool));
354 	}
355 
356 	/* pglistalloc/constraint parameters */
357 	pp->pr_crange = &kp_dirty;
358 
359 	/* Insert this into the list of all pools. */
360 	rw_enter_write(&pool_lock);
361 #ifdef DIAGNOSTIC
362 	SIMPLEQ_FOREACH(iter, &pool_head, pr_poollist) {
363 		if (iter == pp)
364 			panic("%s: pool %s already on list", __func__, wchan);
365 	}
366 #endif
367 
368 	pp->pr_serial = ++pool_serial;
369 	if (pool_serial == 0)
370 		panic("%s: too much uptime", __func__);
371 
372 	SIMPLEQ_INSERT_HEAD(&pool_head, pp, pr_poollist);
373 	pool_count++;
374 	rw_exit_write(&pool_lock);
375 }
376 
377 /*
378  * Decommission a pool resource.
379  */
380 void
381 pool_destroy(struct pool *pp)
382 {
383 	struct pool_item_header *ph;
384 	struct pool *prev, *iter;
385 
386 #ifdef MULTIPROCESSOR
387 	if (pp->pr_cache != NULL)
388 		pool_cache_destroy(pp);
389 #endif
390 
391 #ifdef DIAGNOSTIC
392 	if (pp->pr_nout != 0)
393 		panic("%s: pool busy: still out: %u", __func__, pp->pr_nout);
394 #endif
395 
396 	/* Remove from global pool list */
397 	rw_enter_write(&pool_lock);
398 	pool_count--;
399 	if (pp == SIMPLEQ_FIRST(&pool_head))
400 		SIMPLEQ_REMOVE_HEAD(&pool_head, pr_poollist);
401 	else {
402 		prev = SIMPLEQ_FIRST(&pool_head);
403 		SIMPLEQ_FOREACH(iter, &pool_head, pr_poollist) {
404 			if (iter == pp) {
405 				SIMPLEQ_REMOVE_AFTER(&pool_head, prev,
406 				    pr_poollist);
407 				break;
408 			}
409 			prev = iter;
410 		}
411 	}
412 	rw_exit_write(&pool_lock);
413 
414 	/* Remove all pages */
415 	while ((ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL) {
416 		mtx_enter(&pp->pr_mtx);
417 		pool_p_remove(pp, ph);
418 		mtx_leave(&pp->pr_mtx);
419 		pool_p_free(pp, ph);
420 	}
421 	KASSERT(TAILQ_EMPTY(&pp->pr_fullpages));
422 	KASSERT(TAILQ_EMPTY(&pp->pr_partpages));
423 }
424 
425 void
426 pool_request_init(struct pool_request *pr,
427     void (*handler)(void *, void *), void *cookie)
428 {
429 	pr->pr_handler = handler;
430 	pr->pr_cookie = cookie;
431 	pr->pr_item = NULL;
432 }
433 
434 void
435 pool_request(struct pool *pp, struct pool_request *pr)
436 {
437 	mtx_enter(&pp->pr_requests_mtx);
438 	TAILQ_INSERT_TAIL(&pp->pr_requests, pr, pr_entry);
439 	pool_runqueue(pp, PR_NOWAIT);
440 	mtx_leave(&pp->pr_requests_mtx);
441 }
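/*
 * Illustrative sketch (names invented): asynchronous allocation via
 * pool_request().  The handler is called with the cookie and the newly
 * allocated item once pool_runqueue() can satisfy the request.
 *
 *	void
 *	frob_fill(void *cookie, void *item)
 *	{
 *		struct frob_softc *sc = cookie;
 *
 *		sc->sc_buf = item;
 *	}
 *
 *	pool_request_init(&sc->sc_request, frob_fill, sc);
 *	pool_request(&frob_pool, &sc->sc_request);
 */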
442 
443 struct pool_get_memory {
444 	struct mutex mtx;
445 	void * volatile v;
446 };
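/*
 * When a PR_WAITOK pool_get() cannot be satisfied immediately, it
 * queues a pool_request on itself and sleeps on one of these until
 * pool_get_done() stores the item in `v' and wakes it up.
 */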
447 
448 /*
449  * Grab an item from the pool.
450  */
451 void *
452 pool_get(struct pool *pp, int flags)
453 {
454 	void *v = NULL;
455 	int slowdown = 0;
456 
457 #ifdef MULTIPROCESSOR
458 	if (pp->pr_cache != NULL) {
459 		v = pool_cache_get(pp);
460 		if (v != NULL)
461 			goto good;
462 	}
463 #endif
464 
465 	KASSERT(flags & (PR_WAITOK | PR_NOWAIT));
466 
467 	mtx_enter(&pp->pr_mtx);
468 	if (pp->pr_nout >= pp->pr_hardlimit) {
469 		if (ISSET(flags, PR_NOWAIT|PR_LIMITFAIL))
470 			goto fail;
471 	} else if ((v = pool_do_get(pp, flags, &slowdown)) == NULL) {
472 		if (ISSET(flags, PR_NOWAIT))
473 			goto fail;
474 	}
475 	mtx_leave(&pp->pr_mtx);
476 
477 	if (slowdown && ISSET(flags, PR_WAITOK))
478 		yield();
479 
480 	if (v == NULL) {
481 		struct pool_get_memory mem = {
482 		    MUTEX_INITIALIZER(pp->pr_ipl),
483 		    NULL };
484 		struct pool_request pr;
485 
486 		pool_request_init(&pr, pool_get_done, &mem);
487 		pool_request(pp, &pr);
488 
489 		mtx_enter(&mem.mtx);
490 		while (mem.v == NULL)
491 			msleep(&mem, &mem.mtx, PSWP, pp->pr_wchan, 0);
492 		mtx_leave(&mem.mtx);
493 
494 		v = mem.v;
495 	}
496 
497 #ifdef MULTIPROCESSOR
498 good:
499 #endif
500 	if (ISSET(flags, PR_ZERO))
501 		memset(v, 0, pp->pr_size);
502 
503 	return (v);
504 
505 fail:
506 	pp->pr_nfail++;
507 	mtx_leave(&pp->pr_mtx);
508 	return (NULL);
509 }
510 
511 void
512 pool_get_done(void *xmem, void *v)
513 {
514 	struct pool_get_memory *mem = xmem;
515 
516 	mtx_enter(&mem->mtx);
517 	mem->v = v;
518 	mtx_leave(&mem->mtx);
519 
520 	wakeup_one(mem);
521 }
522 
523 void
524 pool_runqueue(struct pool *pp, int flags)
525 {
526 	struct pool_requests prl = TAILQ_HEAD_INITIALIZER(prl);
527 	struct pool_request *pr;
528 
529 	MUTEX_ASSERT_UNLOCKED(&pp->pr_mtx);
530 	MUTEX_ASSERT_LOCKED(&pp->pr_requests_mtx);
531 
532 	if (pp->pr_requesting++)
533 		return;
534 
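	/*
	 * Only the caller that saw pr_requesting at zero services the
	 * queue.  Concurrent callers merely bumped pr_requesting above
	 * and returned; the --pr_requesting test at the bottom of this
	 * loop detects them and triggers another pass.
	 */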
535 	do {
536 		pp->pr_requesting = 1;
537 
538 		/* no TAILQ_JOIN? :( */
539 		while ((pr = TAILQ_FIRST(&pp->pr_requests)) != NULL) {
540 			TAILQ_REMOVE(&pp->pr_requests, pr, pr_entry);
541 			TAILQ_INSERT_TAIL(&prl, pr, pr_entry);
542 		}
543 		if (TAILQ_EMPTY(&prl))
544 			continue;
545 
546 		mtx_leave(&pp->pr_requests_mtx);
547 
548 		mtx_enter(&pp->pr_mtx);
549 		pr = TAILQ_FIRST(&prl);
550 		while (pr != NULL) {
551 			int slowdown = 0;
552 
553 			if (pp->pr_nout >= pp->pr_hardlimit)
554 				break;
555 
556 			pr->pr_item = pool_do_get(pp, flags, &slowdown);
557 			if (pr->pr_item == NULL) /* || slowdown ? */
558 				break;
559 
560 			pr = TAILQ_NEXT(pr, pr_entry);
561 		}
562 		mtx_leave(&pp->pr_mtx);
563 
564 		while ((pr = TAILQ_FIRST(&prl)) != NULL &&
565 		    pr->pr_item != NULL) {
566 			TAILQ_REMOVE(&prl, pr, pr_entry);
567 			(*pr->pr_handler)(pr->pr_cookie, pr->pr_item);
568 		}
569 
570 		mtx_enter(&pp->pr_requests_mtx);
571 	} while (--pp->pr_requesting);
572 
573 	/* no TAILQ_JOIN :( */
574 	while ((pr = TAILQ_FIRST(&prl)) != NULL) {
575 		TAILQ_REMOVE(&prl, pr, pr_entry);
576 		TAILQ_INSERT_TAIL(&pp->pr_requests, pr, pr_entry);
577 	}
578 }
579 
580 void *
581 pool_do_get(struct pool *pp, int flags, int *slowdown)
582 {
583 	struct pool_item *pi;
584 	struct pool_item_header *ph;
585 
586 	MUTEX_ASSERT_LOCKED(&pp->pr_mtx);
587 
588 	splassert(pp->pr_ipl);
589 
590 	/*
591 	 * Account for this item now to avoid races if we need to give up
592 	 * pr_mtx to allocate a page.
593 	 */
594 	pp->pr_nout++;
595 
596 	if (pp->pr_curpage == NULL) {
597 		mtx_leave(&pp->pr_mtx);
598 		ph = pool_p_alloc(pp, flags, slowdown);
599 		mtx_enter(&pp->pr_mtx);
600 
601 		if (ph == NULL) {
602 			pp->pr_nout--;
603 			return (NULL);
604 		}
605 
606 		pool_p_insert(pp, ph);
607 	}
608 
609 	ph = pp->pr_curpage;
610 	pi = XSIMPLEQ_FIRST(&ph->ph_itemlist);
611 	if (__predict_false(pi == NULL))
612 		panic("%s: %s: page empty", __func__, pp->pr_wchan);
613 
614 	if (__predict_false(pi->pi_magic != POOL_IMAGIC(ph, pi))) {
615 		panic("%s: %s free list modified: "
616 		    "page %p; item addr %p; offset 0x%x=0x%lx != 0x%lx",
617 		    __func__, pp->pr_wchan, ph->ph_page, pi,
618 		    0, pi->pi_magic, POOL_IMAGIC(ph, pi));
619 	}
620 
621 	XSIMPLEQ_REMOVE_HEAD(&ph->ph_itemlist, pi_list);
622 
623 #ifdef DIAGNOSTIC
624 	if (pool_debug && POOL_PHPOISON(ph)) {
625 		size_t pidx;
626 		uint32_t pval;
627 		if (poison_check(pi + 1, pp->pr_size - sizeof(*pi),
628 		    &pidx, &pval)) {
629 			int *ip = (int *)(pi + 1);
630 			panic("%s: %s free list modified: "
631 			    "page %p; item addr %p; offset 0x%zx=0x%x",
632 			    __func__, pp->pr_wchan, ph->ph_page, pi,
633 			    pidx * sizeof(int), ip[pidx]);
634 		}
635 	}
636 #endif /* DIAGNOSTIC */
637 
638 	if (ph->ph_nmissing++ == 0) {
639 		/*
640 		 * This page was previously empty.  Move it to the list of
641 		 * partially-full pages.  This page is already curpage.
642 		 */
643 		TAILQ_REMOVE(&pp->pr_emptypages, ph, ph_pagelist);
644 		TAILQ_INSERT_TAIL(&pp->pr_partpages, ph, ph_pagelist);
645 
646 		pp->pr_nidle--;
647 	}
648 
649 	if (ph->ph_nmissing == pp->pr_itemsperpage) {
650 		/*
651 		 * This page is now full.  Move it to the full list
652 		 * and select a new current page.
653 		 */
654 		TAILQ_REMOVE(&pp->pr_partpages, ph, ph_pagelist);
655 		TAILQ_INSERT_TAIL(&pp->pr_fullpages, ph, ph_pagelist);
656 		pool_update_curpage(pp);
657 	}
658 
659 	pp->pr_nget++;
660 
661 	return (pi);
662 }
663 
664 /*
665  * Return resource to the pool.
666  */
667 void
668 pool_put(struct pool *pp, void *v)
669 {
670 	struct pool_item *pi = v;
671 	struct pool_item_header *ph, *freeph = NULL;
672 
673 #ifdef DIAGNOSTIC
674 	if (v == NULL)
675 		panic("%s: NULL item", __func__);
676 #endif
677 
678 #ifdef MULTIPROCESSOR
679 	if (pp->pr_cache != NULL && TAILQ_EMPTY(&pp->pr_requests)) {
680 		pool_cache_put(pp, v);
681 		return;
682 	}
683 #endif
684 
685 	mtx_enter(&pp->pr_mtx);
686 
687 	splassert(pp->pr_ipl);
688 
689 	ph = pr_find_pagehead(pp, v);
690 
691 #ifdef DIAGNOSTIC
692 	if (pool_debug) {
693 		struct pool_item *qi;
694 		XSIMPLEQ_FOREACH(qi, &ph->ph_itemlist, pi_list) {
695 			if (pi == qi) {
696 				panic("%s: %s: double pool_put: %p", __func__,
697 				    pp->pr_wchan, pi);
698 			}
699 		}
700 	}
701 #endif /* DIAGNOSTIC */
702 
703 	pi->pi_magic = POOL_IMAGIC(ph, pi);
704 	XSIMPLEQ_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list);
705 #ifdef DIAGNOSTIC
706 	if (POOL_PHPOISON(ph))
707 		poison_mem(pi + 1, pp->pr_size - sizeof(*pi));
708 #endif /* DIAGNOSTIC */
709 
710 	if (ph->ph_nmissing-- == pp->pr_itemsperpage) {
711 		/*
712 		 * The page was previously completely full, move it to the
713 		 * partially-full list.
714 		 */
715 		TAILQ_REMOVE(&pp->pr_fullpages, ph, ph_pagelist);
716 		TAILQ_INSERT_TAIL(&pp->pr_partpages, ph, ph_pagelist);
717 	}
718 
719 	if (ph->ph_nmissing == 0) {
720 		/*
721 		 * The page is now empty, so move it to the empty page list.
722 	 	 */
723 		pp->pr_nidle++;
724 
725 		ph->ph_tick = ticks;
726 		TAILQ_REMOVE(&pp->pr_partpages, ph, ph_pagelist);
727 		TAILQ_INSERT_TAIL(&pp->pr_emptypages, ph, ph_pagelist);
728 		pool_update_curpage(pp);
729 	}
730 
731 	pp->pr_nout--;
732 	pp->pr_nput++;
733 
734 	/* is it time to free a page? */
735 	if (pp->pr_nidle > pp->pr_maxpages &&
736 	    (ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL &&
737 	    (ticks - ph->ph_tick) > (hz * pool_wait_free)) {
738 		freeph = ph;
739 		pool_p_remove(pp, freeph);
740 	}
741 	mtx_leave(&pp->pr_mtx);
742 
743 	if (freeph != NULL)
744 		pool_p_free(pp, freeph);
745 
746 	if (!TAILQ_EMPTY(&pp->pr_requests)) {
747 		mtx_enter(&pp->pr_requests_mtx);
748 		pool_runqueue(pp, PR_NOWAIT);
749 		mtx_leave(&pp->pr_requests_mtx);
750 	}
751 }
752 
753 /*
754  * Add N items to the pool.
755  */
756 int
757 pool_prime(struct pool *pp, int n)
758 {
759 	struct pool_pagelist pl = TAILQ_HEAD_INITIALIZER(pl);
760 	struct pool_item_header *ph;
761 	int newpages;
762 
763 	newpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
764 
765 	while (newpages-- > 0) {
766 		int slowdown = 0;
767 
768 		ph = pool_p_alloc(pp, PR_NOWAIT, &slowdown);
769 		if (ph == NULL) /* or slowdown? */
770 			break;
771 
772 		TAILQ_INSERT_TAIL(&pl, ph, ph_pagelist);
773 	}
774 
775 	mtx_enter(&pp->pr_mtx);
776 	while ((ph = TAILQ_FIRST(&pl)) != NULL) {
777 		TAILQ_REMOVE(&pl, ph, ph_pagelist);
778 		pool_p_insert(pp, ph);
779 	}
780 	mtx_leave(&pp->pr_mtx);
781 
782 	return (0);
783 }
784 
785 struct pool_item_header *
786 pool_p_alloc(struct pool *pp, int flags, int *slowdown)
787 {
788 	struct pool_item_header *ph;
789 	struct pool_item *pi;
790 	caddr_t addr;
791 	int n;
792 
793 	MUTEX_ASSERT_UNLOCKED(&pp->pr_mtx);
794 	KASSERT(pp->pr_size >= sizeof(*pi));
795 
796 	addr = pool_allocator_alloc(pp, flags, slowdown);
797 	if (addr == NULL)
798 		return (NULL);
799 
800 	if (POOL_INPGHDR(pp))
801 		ph = (struct pool_item_header *)(addr + pp->pr_phoffset);
802 	else {
803 		ph = pool_get(&phpool, flags);
804 		if (ph == NULL) {
805 			pool_allocator_free(pp, addr);
806 			return (NULL);
807 		}
808 	}
809 
810 	XSIMPLEQ_INIT(&ph->ph_itemlist);
811 	ph->ph_page = addr;
812 	addr += pp->pr_align * (pp->pr_npagealloc % pp->pr_maxcolors);
813 	ph->ph_colored = addr;
814 	ph->ph_nmissing = 0;
815 	arc4random_buf(&ph->ph_magic, sizeof(ph->ph_magic));
816 #ifdef DIAGNOSTIC
817 	/* use a bit in ph_magic to record if we poison page items */
818 	if (pool_debug)
819 		SET(ph->ph_magic, POOL_MAGICBIT);
820 	else
821 		CLR(ph->ph_magic, POOL_MAGICBIT);
822 #endif /* DIAGNOSTIC */
823 
824 	n = pp->pr_itemsperpage;
825 	while (n--) {
826 		pi = (struct pool_item *)addr;
827 		pi->pi_magic = POOL_IMAGIC(ph, pi);
828 		XSIMPLEQ_INSERT_TAIL(&ph->ph_itemlist, pi, pi_list);
829 
830 #ifdef DIAGNOSTIC
831 		if (POOL_PHPOISON(ph))
832 			poison_mem(pi + 1, pp->pr_size - sizeof(*pi));
833 #endif /* DIAGNOSTIC */
834 
835 		addr += pp->pr_size;
836 	}
837 
838 	return (ph);
839 }
840 
841 void
842 pool_p_free(struct pool *pp, struct pool_item_header *ph)
843 {
844 	struct pool_item *pi;
845 
846 	MUTEX_ASSERT_UNLOCKED(&pp->pr_mtx);
847 	KASSERT(ph->ph_nmissing == 0);
848 
849 	XSIMPLEQ_FOREACH(pi, &ph->ph_itemlist, pi_list) {
850 		if (__predict_false(pi->pi_magic != POOL_IMAGIC(ph, pi))) {
851 			panic("%s: %s free list modified: "
852 			    "page %p; item addr %p; offset 0x%x=0x%lx",
853 			    __func__, pp->pr_wchan, ph->ph_page, pi,
854 			    0, pi->pi_magic);
855 		}
856 
857 #ifdef DIAGNOSTIC
858 		if (POOL_PHPOISON(ph)) {
859 			size_t pidx;
860 			uint32_t pval;
861 			if (poison_check(pi + 1, pp->pr_size - sizeof(*pi),
862 			    &pidx, &pval)) {
863 				int *ip = (int *)(pi + 1);
864 				panic("%s: %s free list modified: "
865 				    "page %p; item addr %p; offset 0x%zx=0x%x",
866 				    __func__, pp->pr_wchan, ph->ph_page, pi,
867 				    pidx * sizeof(int), ip[pidx]);
868 			}
869 		}
870 #endif
871 	}
872 
873 	pool_allocator_free(pp, ph->ph_page);
874 
875 	if (!POOL_INPGHDR(pp))
876 		pool_put(&phpool, ph);
877 }
878 
879 void
880 pool_p_insert(struct pool *pp, struct pool_item_header *ph)
881 {
882 	MUTEX_ASSERT_LOCKED(&pp->pr_mtx);
883 
884 	/* If the pool was depleted, point at the new page */
885 	if (pp->pr_curpage == NULL)
886 		pp->pr_curpage = ph;
887 
888 	TAILQ_INSERT_TAIL(&pp->pr_emptypages, ph, ph_pagelist);
889 	if (!POOL_INPGHDR(pp))
890 		RBT_INSERT(phtree, &pp->pr_phtree, ph);
891 
892 	pp->pr_nitems += pp->pr_itemsperpage;
893 	pp->pr_nidle++;
894 
895 	pp->pr_npagealloc++;
896 	if (++pp->pr_npages > pp->pr_hiwat)
897 		pp->pr_hiwat = pp->pr_npages;
898 }
899 
900 void
901 pool_p_remove(struct pool *pp, struct pool_item_header *ph)
902 {
903 	MUTEX_ASSERT_LOCKED(&pp->pr_mtx);
904 
905 	pp->pr_npagefree++;
906 	pp->pr_npages--;
907 	pp->pr_nidle--;
908 	pp->pr_nitems -= pp->pr_itemsperpage;
909 
910 	if (!POOL_INPGHDR(pp))
911 		RBT_REMOVE(phtree, &pp->pr_phtree, ph);
912 	TAILQ_REMOVE(&pp->pr_emptypages, ph, ph_pagelist);
913 
914 	pool_update_curpage(pp);
915 }
916 
917 void
918 pool_update_curpage(struct pool *pp)
919 {
920 	pp->pr_curpage = TAILQ_LAST(&pp->pr_partpages, pool_pagelist);
921 	if (pp->pr_curpage == NULL) {
922 		pp->pr_curpage = TAILQ_LAST(&pp->pr_emptypages, pool_pagelist);
923 	}
924 }
925 
926 void
927 pool_setlowat(struct pool *pp, int n)
928 {
929 	int prime = 0;
930 
931 	mtx_enter(&pp->pr_mtx);
932 	pp->pr_minitems = n;
933 	pp->pr_minpages = (n == 0)
934 		? 0
935 		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
936 
937 	if (pp->pr_nitems < n)
938 		prime = n - pp->pr_nitems;
939 	mtx_leave(&pp->pr_mtx);
940 
941 	if (prime > 0)
942 		pool_prime(pp, prime);
943 }
944 
945 void
946 pool_sethiwat(struct pool *pp, int n)
947 {
948 	pp->pr_maxpages = (n == 0)
949 		? 0
950 		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
951 }
952 
953 int
954 pool_sethardlimit(struct pool *pp, u_int n, const char *warnmsg, int ratecap)
955 {
956 	int error = 0;
957 
958 	if (n < pp->pr_nout) {
959 		error = EINVAL;
960 		goto done;
961 	}
962 
963 	pp->pr_hardlimit = n;
964 	pp->pr_hardlimit_warning = warnmsg;
965 	pp->pr_hardlimit_ratecap.tv_sec = ratecap;
966 	pp->pr_hardlimit_warning_last.tv_sec = 0;
967 	pp->pr_hardlimit_warning_last.tv_usec = 0;
968 
969 done:
970 	return (error);
971 }
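/*
 * Illustrative sketch (pool name and numbers invented): typical use of
 * the tuning knobs above after pool_init().
 *
 *	pool_setlowat(&frob_pool, 16);		keep at least 16 items primed
 *	pool_sethiwat(&frob_pool, 1024);	cap the idle pages kept around
 *	pool_sethardlimit(&frob_pool, 4096, "out of frobs", 60);
 */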
972 
973 void
974 pool_set_constraints(struct pool *pp, const struct kmem_pa_mode *mode)
975 {
976 	pp->pr_crange = mode;
977 }
978 
979 /*
980  * Release all complete pages that have not been used recently.
981  *
982  * Returns non-zero if any pages have been reclaimed.
983  */
984 int
985 pool_reclaim(struct pool *pp)
986 {
987 	struct pool_item_header *ph, *phnext;
988 	struct pool_pagelist pl = TAILQ_HEAD_INITIALIZER(pl);
989 
990 	mtx_enter(&pp->pr_mtx);
991 	for (ph = TAILQ_FIRST(&pp->pr_emptypages); ph != NULL; ph = phnext) {
992 		phnext = TAILQ_NEXT(ph, ph_pagelist);
993 
994 		/* Check our minimum page claim */
995 		if (pp->pr_npages <= pp->pr_minpages)
996 			break;
997 
998 		/*
999 		 * If freeing this page would put us below
1000 		 * the low water mark, stop now.
1001 		 */
1002 		if ((pp->pr_nitems - pp->pr_itemsperpage) <
1003 		    pp->pr_minitems)
1004 			break;
1005 
1006 		pool_p_remove(pp, ph);
1007 		TAILQ_INSERT_TAIL(&pl, ph, ph_pagelist);
1008 	}
1009 	mtx_leave(&pp->pr_mtx);
1010 
1011 	if (TAILQ_EMPTY(&pl))
1012 		return (0);
1013 
1014 	while ((ph = TAILQ_FIRST(&pl)) != NULL) {
1015 		TAILQ_REMOVE(&pl, ph, ph_pagelist);
1016 		pool_p_free(pp, ph);
1017 	}
1018 
1019 	return (1);
1020 }
1021 
1022 /*
1023  * Release all complete pages that have not been used recently
1024  * from all pools.
1025  */
1026 void
1027 pool_reclaim_all(void)
1028 {
1029 	struct pool	*pp;
1030 
1031 	rw_enter_read(&pool_lock);
1032 	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist)
1033 		pool_reclaim(pp);
1034 	rw_exit_read(&pool_lock);
1035 }
1036 
1037 #ifdef DDB
1038 #include <machine/db_machdep.h>
1039 #include <ddb/db_output.h>
1040 
1041 /*
1042  * Diagnostic helpers.
1043  */
1044 void
1045 pool_printit(struct pool *pp, const char *modif,
1046     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
1047 {
1048 	pool_print1(pp, modif, pr);
1049 }
1050 
1051 void
1052 pool_print_pagelist(struct pool_pagelist *pl,
1053     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
1054 {
1055 	struct pool_item_header *ph;
1056 	struct pool_item *pi;
1057 
1058 	TAILQ_FOREACH(ph, pl, ph_pagelist) {
1059 		(*pr)("\t\tpage %p, color %p, nmissing %d\n",
1060 		    ph->ph_page, ph->ph_colored, ph->ph_nmissing);
1061 		XSIMPLEQ_FOREACH(pi, &ph->ph_itemlist, pi_list) {
1062 			if (pi->pi_magic != POOL_IMAGIC(ph, pi)) {
1063 				(*pr)("\t\t\titem %p, magic 0x%lx\n",
1064 				    pi, pi->pi_magic);
1065 			}
1066 		}
1067 	}
1068 }
1069 
1070 void
1071 pool_print1(struct pool *pp, const char *modif,
1072     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
1073 {
1074 	struct pool_item_header *ph;
1075 	int print_pagelist = 0;
1076 	char c;
1077 
1078 	while ((c = *modif++) != '\0') {
1079 		if (c == 'p')
1080 			print_pagelist = 1;
1082 	}
1083 
1084 	(*pr)("POOL %s: size %u maxcolors %u\n", pp->pr_wchan, pp->pr_size,
1085 	    pp->pr_maxcolors);
1086 	(*pr)("\talloc %p\n", pp->pr_alloc);
1087 	(*pr)("\tminitems %u, minpages %u, maxpages %u, npages %u\n",
1088 	    pp->pr_minitems, pp->pr_minpages, pp->pr_maxpages, pp->pr_npages);
1089 	(*pr)("\titemsperpage %u, nitems %u, nout %u, hardlimit %u\n",
1090 	    pp->pr_itemsperpage, pp->pr_nitems, pp->pr_nout, pp->pr_hardlimit);
1091 
1092 	(*pr)("\n\tnget %lu, nfail %lu, nput %lu\n",
1093 	    pp->pr_nget, pp->pr_nfail, pp->pr_nput);
1094 	(*pr)("\tnpagealloc %lu, npagefree %lu, hiwat %u, nidle %lu\n",
1095 	    pp->pr_npagealloc, pp->pr_npagefree, pp->pr_hiwat, pp->pr_nidle);
1096 
1097 	if (print_pagelist == 0)
1098 		return;
1099 
1100 	if ((ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL)
1101 		(*pr)("\n\tempty page list:\n");
1102 	pool_print_pagelist(&pp->pr_emptypages, pr);
1103 	if ((ph = TAILQ_FIRST(&pp->pr_fullpages)) != NULL)
1104 		(*pr)("\n\tfull page list:\n");
1105 	pool_print_pagelist(&pp->pr_fullpages, pr);
1106 	if ((ph = TAILQ_FIRST(&pp->pr_partpages)) != NULL)
1107 		(*pr)("\n\tpartial-page list:\n");
1108 	pool_print_pagelist(&pp->pr_partpages, pr);
1109 
1110 	if (pp->pr_curpage == NULL)
1111 		(*pr)("\tno current page\n");
1112 	else
1113 		(*pr)("\tcurpage %p\n", pp->pr_curpage->ph_page);
1114 }
1115 
1116 void
1117 db_show_all_pools(db_expr_t expr, int haddr, db_expr_t count, char *modif)
1118 {
1119 	struct pool *pp;
1120 	char maxp[16];
1121 	int ovflw;
1122 	char mode;
1123 
1124 	mode = modif[0];
1125 	if (mode != '\0' && mode != 'a') {
1126 		db_printf("usage: show all pools [/a]\n");
1127 		return;
1128 	}
1129 
1130 	if (mode == '\0')
1131 		db_printf("%-10s%4s%9s%5s%9s%6s%6s%6s%6s%6s%6s%5s\n",
1132 		    "Name",
1133 		    "Size",
1134 		    "Requests",
1135 		    "Fail",
1136 		    "Releases",
1137 		    "Pgreq",
1138 		    "Pgrel",
1139 		    "Npage",
1140 		    "Hiwat",
1141 		    "Minpg",
1142 		    "Maxpg",
1143 		    "Idle");
1144 	else
1145 		db_printf("%-12s %18s %18s\n",
1146 		    "Name", "Address", "Allocator");
1147 
1148 	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) {
1149 		if (mode == 'a') {
1150 			db_printf("%-12s %18p %18p\n", pp->pr_wchan, pp,
1151 			    pp->pr_alloc);
1152 			continue;
1153 		}
1154 
1155 		if (!pp->pr_nget)
1156 			continue;
1157 
1158 		if (pp->pr_maxpages == UINT_MAX)
1159 			snprintf(maxp, sizeof maxp, "inf");
1160 		else
1161 			snprintf(maxp, sizeof maxp, "%u", pp->pr_maxpages);
1162 
1163 #define PRWORD(ovflw, fmt, width, fixed, val) do {	\
1164 	(ovflw) += db_printf((fmt),			\
1165 	    (width) - (fixed) - (ovflw) > 0 ?		\
1166 	    (width) - (fixed) - (ovflw) : 0,		\
1167 	    (val)) - (width);				\
1168 	if ((ovflw) < 0)				\
1169 		(ovflw) = 0;				\
1170 } while (/* CONSTCOND */0)
1171 
1172 		ovflw = 0;
1173 		PRWORD(ovflw, "%-*s", 10, 0, pp->pr_wchan);
1174 		PRWORD(ovflw, " %*u", 4, 1, pp->pr_size);
1175 		PRWORD(ovflw, " %*lu", 9, 1, pp->pr_nget);
1176 		PRWORD(ovflw, " %*lu", 5, 1, pp->pr_nfail);
1177 		PRWORD(ovflw, " %*lu", 9, 1, pp->pr_nput);
1178 		PRWORD(ovflw, " %*lu", 6, 1, pp->pr_npagealloc);
1179 		PRWORD(ovflw, " %*lu", 6, 1, pp->pr_npagefree);
1180 		PRWORD(ovflw, " %*d", 6, 1, pp->pr_npages);
1181 		PRWORD(ovflw, " %*d", 6, 1, pp->pr_hiwat);
1182 		PRWORD(ovflw, " %*d", 6, 1, pp->pr_minpages);
1183 		PRWORD(ovflw, " %*s", 6, 1, maxp);
1184 		PRWORD(ovflw, " %*lu\n", 5, 1, pp->pr_nidle);
1185 
1186 		pool_chk(pp);
1187 	}
1188 }
1189 #endif /* DDB */
1190 
1191 #if defined(POOL_DEBUG) || defined(DDB)
1192 int
1193 pool_chk_page(struct pool *pp, struct pool_item_header *ph, int expected)
1194 {
1195 	struct pool_item *pi;
1196 	caddr_t page;
1197 	int n;
1198 	const char *label = pp->pr_wchan;
1199 
1200 	page = (caddr_t)((u_long)ph & pp->pr_pgmask);
1201 	if (page != ph->ph_page && POOL_INPGHDR(pp)) {
1202 		printf("%s: ", label);
1203 		printf("pool(%p:%s): page inconsistency: page %p; "
1204 		    "at page head addr %p (p %p)\n",
1205 		    pp, pp->pr_wchan, ph->ph_page, ph, page);
1206 		return 1;
1207 	}
1208 
1209 	for (pi = XSIMPLEQ_FIRST(&ph->ph_itemlist), n = 0;
1210 	     pi != NULL;
1211 	     pi = XSIMPLEQ_NEXT(&ph->ph_itemlist, pi, pi_list), n++) {
1212 		if ((caddr_t)pi < ph->ph_page ||
1213 		    (caddr_t)pi >= ph->ph_page + pp->pr_pgsize) {
1214 			printf("%s: ", label);
1215 			printf("pool(%p:%s): page inconsistency: page %p;"
1216 			    " item ordinal %d; addr %p\n", pp,
1217 			    pp->pr_wchan, ph->ph_page, n, pi);
1218 			return (1);
1219 		}
1220 
1221 		if (pi->pi_magic != POOL_IMAGIC(ph, pi)) {
1222 			printf("%s: ", label);
1223 			printf("pool(%p:%s): free list modified: "
1224 			    "page %p; item ordinal %d; addr %p "
1225 			    "(p %p); offset 0x%x=0x%lx\n",
1226 			    pp, pp->pr_wchan, ph->ph_page, n, pi, page,
1227 			    0, pi->pi_magic);
1228 		}
1229 
1230 #ifdef DIAGNOSTIC
1231 		if (POOL_PHPOISON(ph)) {
1232 			size_t pidx;
1233 			uint32_t pval;
1234 			if (poison_check(pi + 1, pp->pr_size - sizeof(*pi),
1235 			    &pidx, &pval)) {
1236 				int *ip = (int *)(pi + 1);
1237 				printf("pool(%s): free list modified: "
1238 				    "page %p; item ordinal %d; addr %p "
1239 				    "(p %p); offset 0x%zx=0x%x\n",
1240 				    pp->pr_wchan, ph->ph_page, n, pi,
1241 				    page, pidx * sizeof(int), ip[pidx]);
1242 			}
1243 		}
1244 #endif /* DIAGNOSTIC */
1245 	}
1246 	if (n + ph->ph_nmissing != pp->pr_itemsperpage) {
1247 		printf("pool(%p:%s): page inconsistency: page %p;"
1248 		    " %d on list, %d missing, %d items per page\n", pp,
1249 		    pp->pr_wchan, ph->ph_page, n, ph->ph_nmissing,
1250 		    pp->pr_itemsperpage);
1251 		return 1;
1252 	}
1253 	if (expected >= 0 && n != expected) {
1254 		printf("pool(%p:%s): page inconsistency: page %p;"
1255 		    " %d on list, %d missing, %d expected\n", pp,
1256 		    pp->pr_wchan, ph->ph_page, n, ph->ph_nmissing,
1257 		    expected);
1258 		return 1;
1259 	}
1260 	return 0;
1261 }
1262 
1263 int
1264 pool_chk(struct pool *pp)
1265 {
1266 	struct pool_item_header *ph;
1267 	int r = 0;
1268 
1269 	TAILQ_FOREACH(ph, &pp->pr_emptypages, ph_pagelist)
1270 		r += pool_chk_page(pp, ph, pp->pr_itemsperpage);
1271 	TAILQ_FOREACH(ph, &pp->pr_fullpages, ph_pagelist)
1272 		r += pool_chk_page(pp, ph, 0);
1273 	TAILQ_FOREACH(ph, &pp->pr_partpages, ph_pagelist)
1274 		r += pool_chk_page(pp, ph, -1);
1275 
1276 	return (r);
1277 }
1278 #endif /* defined(POOL_DEBUG) || defined(DDB) */
1279 
1280 #ifdef DDB
1281 void
1282 pool_walk(struct pool *pp, int full,
1283     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))),
1284     void (*func)(void *, int, int (*)(const char *, ...)
1285 	    __attribute__((__format__(__kprintf__,1,2)))))
1286 {
1287 	struct pool_item_header *ph;
1288 	struct pool_item *pi;
1289 	caddr_t cp;
1290 	int n;
1291 
1292 	TAILQ_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) {
1293 		cp = ph->ph_colored;
1294 		n = ph->ph_nmissing;
1295 
1296 		while (n--) {
1297 			func(cp, full, pr);
1298 			cp += pp->pr_size;
1299 		}
1300 	}
1301 
1302 	TAILQ_FOREACH(ph, &pp->pr_partpages, ph_pagelist) {
1303 		cp = ph->ph_colored;
1304 		n = ph->ph_nmissing;
1305 
1306 		do {
1307 			XSIMPLEQ_FOREACH(pi, &ph->ph_itemlist, pi_list) {
1308 				if (cp == (caddr_t)pi)
1309 					break;
1310 			}
1311 			if (cp != (caddr_t)pi) {
1312 				func(cp, full, pr);
1313 				n--;
1314 			}
1315 
1316 			cp += pp->pr_size;
1317 		} while (n > 0);
1318 	}
1319 }
1320 #endif
1321 
1322 /*
1323  * We have three different sysctls.
1324  * kern.pool.npools - the number of pools.
1325  * kern.pool.pool.<pool#> - the kinfo_pool struct for pool <pool#>.
1326  * kern.pool.name.<pool#> - the name of pool <pool#>.
1327  */
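/*
 * Userland consumers of these include, e.g., vmstat(8) -m and the pool
 * view of systat(1).
 */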
1328 int
1329 sysctl_dopool(int *name, u_int namelen, char *oldp, size_t *oldlenp)
1330 {
1331 	struct kinfo_pool pi;
1332 	struct pool *pp;
1333 	int rv = ENOENT;
1334 
1335 	switch (name[0]) {
1336 	case KERN_POOL_NPOOLS:
1337 		if (namelen != 1)
1338 			return (ENOTDIR);
1339 		return (sysctl_rdint(oldp, oldlenp, NULL, pool_count));
1340 
1341 	case KERN_POOL_NAME:
1342 	case KERN_POOL_POOL:
1343 		break;
1344 	default:
1345 		return (EOPNOTSUPP);
1346 	}
1347 
1348 	if (namelen != 2)
1349 		return (ENOTDIR);
1350 
1351 	rw_enter_read(&pool_lock);
1352 
1353 	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) {
1354 		if (name[1] == pp->pr_serial)
1355 			break;
1356 	}
1357 
1358 	if (pp == NULL)
1359 		goto done;
1360 
1361 	switch (name[0]) {
1362 	case KERN_POOL_NAME:
1363 		rv = sysctl_rdstring(oldp, oldlenp, NULL, pp->pr_wchan);
1364 		break;
1365 	case KERN_POOL_POOL:
1366 		memset(&pi, 0, sizeof(pi));
1367 
1368 		mtx_enter(&pp->pr_mtx);
1369 		pi.pr_size = pp->pr_size;
1370 		pi.pr_pgsize = pp->pr_pgsize;
1371 		pi.pr_itemsperpage = pp->pr_itemsperpage;
1372 		pi.pr_npages = pp->pr_npages;
1373 		pi.pr_minpages = pp->pr_minpages;
1374 		pi.pr_maxpages = pp->pr_maxpages;
1375 		pi.pr_hardlimit = pp->pr_hardlimit;
1376 		pi.pr_nout = pp->pr_nout;
1377 		pi.pr_nitems = pp->pr_nitems;
1378 		pi.pr_nget = pp->pr_nget;
1379 		pi.pr_nput = pp->pr_nput;
1380 		pi.pr_nfail = pp->pr_nfail;
1381 		pi.pr_npagealloc = pp->pr_npagealloc;
1382 		pi.pr_npagefree = pp->pr_npagefree;
1383 		pi.pr_hiwat = pp->pr_hiwat;
1384 		pi.pr_nidle = pp->pr_nidle;
1385 		mtx_leave(&pp->pr_mtx);
1386 
1387 		pool_cache_info(pp, &pi);
1388 
1389 		rv = sysctl_rdstruct(oldp, oldlenp, NULL, &pi, sizeof(pi));
1390 		break;
1391 	}
1392 
1393 done:
1394 	rw_exit_read(&pool_lock);
1395 
1396 	return (rv);
1397 }
1398 
1399 void
1400 pool_gc_sched(void *null)
1401 {
1402 	task_add(systqmp, &pool_gc_task);
1403 }
1404 
1405 void
1406 pool_gc_pages(void *null)
1407 {
1408 	struct pool *pp;
1409 	struct pool_item_header *ph, *freeph;
1410 	int s;
1411 
1412 	rw_enter_read(&pool_lock);
1413 	s = splvm(); /* XXX go to splvm until all pools _setipl properly */
1414 	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) {
1415 		if (pp->pr_nidle <= pp->pr_minpages || /* guess */
1416 		    !mtx_enter_try(&pp->pr_mtx)) /* try */
1417 			continue;
1418 
1419 		/* is it time to free a page? */
1420 		if (pp->pr_nidle > pp->pr_minpages &&
1421 		    (ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL &&
1422 		    (ticks - ph->ph_tick) > (hz * pool_wait_gc)) {
1423 			freeph = ph;
1424 			pool_p_remove(pp, freeph);
1425 		} else
1426 			freeph = NULL;
1427 
1428 		mtx_leave(&pp->pr_mtx);
1429 
1430 		if (freeph != NULL)
1431 			pool_p_free(pp, freeph);
1432 	}
1433 	splx(s);
1434 	rw_exit_read(&pool_lock);
1435 
1436 	timeout_add_sec(&pool_gc_tick, 1);
1437 }
1438 
1439 /*
1440  * Pool backend allocators.
1441  */
1442 
1443 void *
1444 pool_allocator_alloc(struct pool *pp, int flags, int *slowdown)
1445 {
1446 	void *v;
1447 
1448 	v = (*pp->pr_alloc->pa_alloc)(pp, flags, slowdown);
1449 
1450 #ifdef DIAGNOSTIC
1451 	if (v != NULL && POOL_INPGHDR(pp)) {
1452 		vaddr_t addr = (vaddr_t)v;
1453 		if ((addr & pp->pr_pgmask) != addr) {
1454 			panic("%s: %s page address %p isn't aligned to %u",
1455 			    __func__, pp->pr_wchan, v, pp->pr_pgsize);
1456 		}
1457 	}
1458 #endif
1459 
1460 	return (v);
1461 }
1462 
1463 void
1464 pool_allocator_free(struct pool *pp, void *v)
1465 {
1466 	struct pool_allocator *pa = pp->pr_alloc;
1467 
1468 	(*pa->pa_free)(pp, v);
1469 }
1470 
1471 void *
1472 pool_page_alloc(struct pool *pp, int flags, int *slowdown)
1473 {
1474 	struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER;
1475 
1476 	kd.kd_waitok = ISSET(flags, PR_WAITOK);
1477 	kd.kd_slowdown = slowdown;
1478 
1479 	return (km_alloc(pp->pr_pgsize, &kv_page, pp->pr_crange, &kd));
1480 }
1481 
1482 void
1483 pool_page_free(struct pool *pp, void *v)
1484 {
1485 	km_free(v, pp->pr_pgsize, &kv_page, pp->pr_crange);
1486 }
1487 
1488 void *
1489 pool_multi_alloc(struct pool *pp, int flags, int *slowdown)
1490 {
1491 	struct kmem_va_mode kv = kv_intrsafe;
1492 	struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER;
1493 	void *v;
1494 	int s;
1495 
1496 	if (POOL_INPGHDR(pp))
1497 		kv.kv_align = pp->pr_pgsize;
1498 
1499 	kd.kd_waitok = ISSET(flags, PR_WAITOK);
1500 	kd.kd_slowdown = slowdown;
1501 
1502 	s = splvm();
1503 	v = km_alloc(pp->pr_pgsize, &kv, pp->pr_crange, &kd);
1504 	splx(s);
1505 
1506 	return (v);
1507 }
1508 
1509 void
1510 pool_multi_free(struct pool *pp, void *v)
1511 {
1512 	struct kmem_va_mode kv = kv_intrsafe;
1513 	int s;
1514 
1515 	if (POOL_INPGHDR(pp))
1516 		kv.kv_align = pp->pr_pgsize;
1517 
1518 	s = splvm();
1519 	km_free(v, pp->pr_pgsize, &kv, pp->pr_crange);
1520 	splx(s);
1521 }
1522 
1523 void *
1524 pool_multi_alloc_ni(struct pool *pp, int flags, int *slowdown)
1525 {
1526 	struct kmem_va_mode kv = kv_any;
1527 	struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER;
1528 	void *v;
1529 
1530 	if (POOL_INPGHDR(pp))
1531 		kv.kv_align = pp->pr_pgsize;
1532 
1533 	kd.kd_waitok = ISSET(flags, PR_WAITOK);
1534 	kd.kd_slowdown = slowdown;
1535 
1536 	KERNEL_LOCK();
1537 	v = km_alloc(pp->pr_pgsize, &kv, pp->pr_crange, &kd);
1538 	KERNEL_UNLOCK();
1539 
1540 	return (v);
1541 }
1542 
1543 void
1544 pool_multi_free_ni(struct pool *pp, void *v)
1545 {
1546 	struct kmem_va_mode kv = kv_any;
1547 
1548 	if (POOL_INPGHDR(pp))
1549 		kv.kv_align = pp->pr_pgsize;
1550 
1551 	KERNEL_LOCK();
1552 	km_free(v, pp->pr_pgsize, &kv, pp->pr_crange);
1553 	KERNEL_UNLOCK();
1554 }
1555 
1556 #ifdef MULTIPROCESSOR
1557 
1558 struct pool pool_caches; /* per cpu cache entries */
1559 
1560 void
1561 pool_cache_init(struct pool *pp)
1562 {
1563 	struct cpumem *cm;
1564 	struct pool_cache *pc;
1565 	struct cpumem_iter i;
1566 
1567 	if (pool_caches.pr_size == 0) {
1568 		pool_init(&pool_caches, sizeof(struct pool_cache), 64,
1569 		    IPL_NONE, PR_WAITOK, "plcache", NULL);
1570 	}
1571 
1572 	KASSERT(pp->pr_size >= sizeof(*pc));
1573 
1574 	cm = cpumem_get(&pool_caches);
1575 
1576 	mtx_init(&pp->pr_cache_mtx, pp->pr_ipl);
1577 	pp->pr_cache_list = NULL;
1578 	pp->pr_cache_nlist = 0;
1579 	pp->pr_cache_items = 8;
1580 	pp->pr_cache_contention = 0;
1581 
1582 	CPUMEM_FOREACH(pc, &i, cm) {
1583 		pc->pc_actv = NULL;
1584 		pc->pc_nactv = 0;
1585 		pc->pc_prev = NULL;
1586 
1587 		pc->pc_gets = 0;
1588 		pc->pc_puts = 0;
1589 		pc->pc_fails = 0;
1590 		pc->pc_nout = 0;
1591 	}
1592 
1593 	pp->pr_cache = cm;
1594 }
1595 
1596 static inline void
1597 pool_list_enter(struct pool *pp)
1598 {
1599 	if (mtx_enter_try(&pp->pr_cache_mtx) == 0) {
1600 		mtx_enter(&pp->pr_cache_mtx);
1601 		pp->pr_cache_contention++;
1602 	}
1603 }
1604 
1605 static inline void
1606 pool_list_leave(struct pool *pp)
1607 {
1608 	mtx_leave(&pp->pr_cache_mtx);
1609 }
1610 
1611 static inline struct pool_list *
1612 pool_list_alloc(struct pool *pp, struct pool_cache *pc)
1613 {
1614 	struct pool_list *pl;
1615 
1616 	pool_list_enter(pp);
1617 	pl = pp->pr_cache_list;
1618 	if (pl != NULL) {
1619 		pp->pr_cache_list = pl->pl_nextl;
1620 		pp->pr_cache_nlist--;
1621 	}
1622 
1623 	pp->pr_cache_nout += pc->pc_nout;
1624 	pc->pc_nout = 0;
1625 	pool_list_leave(pp);
1626 
1627 	return (pl);
1628 }
1629 
1630 static inline void
1631 pool_list_free(struct pool *pp, struct pool_cache *pc, struct pool_list *pl)
1632 {
1633 	pool_list_enter(pp);
1634 	pl->pl_nextl = pp->pr_cache_list;
1635 	pp->pr_cache_list = pl;
1636 	pp->pr_cache_nlist++;
1637 
1638 	pp->pr_cache_nout += pc->pc_nout;
1639 	pc->pc_nout = 0;
1640 	pool_list_leave(pp);
1641 }
1642 
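/*
 * pc_gen is incremented when a CPU enters and again when it leaves its
 * cache, so the low bit is set while an operation is in progress.
 * pool_cache_info() relies on this to read pc_gets/pc_puts without a
 * lock: it yields while the counter is odd and retries if the value
 * changed across its reads.
 */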
1643 static inline struct pool_cache *
1644 pool_cache_enter(struct pool *pp, int *s)
1645 {
1646 	struct pool_cache *pc;
1647 
1648 	pc = cpumem_enter(pp->pr_cache);
1649 	*s = splraise(pp->pr_ipl);
1650 	pc->pc_gen++;
1651 
1652 	return (pc);
1653 }
1654 
1655 static inline void
1656 pool_cache_leave(struct pool *pp, struct pool_cache *pc, int s)
1657 {
1658 	pc->pc_gen++;
1659 	splx(s);
1660 	cpumem_leave(pp->pr_cache, pc);
1661 }
1662 
1663 void *
1664 pool_cache_get(struct pool *pp)
1665 {
1666 	struct pool_cache *pc;
1667 	struct pool_list *pl;
1668 	int s;
1669 
1670 	pc = pool_cache_enter(pp, &s);
1671 
1672 	if (pc->pc_actv != NULL) {
1673 		pl = pc->pc_actv;
1674 	} else if (pc->pc_prev != NULL) {
1675 		pl = pc->pc_prev;
1676 		pc->pc_prev = NULL;
1677 	} else if ((pl = pool_list_alloc(pp, pc)) == NULL) {
1678 		pc->pc_fails++;
1679 		goto done;
1680 	}
1681 
1682 	pc->pc_actv = pl->pl_next;
1683 	pc->pc_nactv = pl->pl_nitems - 1;
1684 	pc->pc_gets++;
1685 	pc->pc_nout++;
1686 done:
1687 	pool_cache_leave(pp, pc, s);
1688 
1689 	return (pl);
1690 }
1691 
1692 void
1693 pool_cache_put(struct pool *pp, void *v)
1694 {
1695 	struct pool_cache *pc;
1696 	struct pool_list *pl = v;
1697 	unsigned long cache_items = pp->pr_cache_items;
1698 	unsigned long nitems;
1699 	int s;
1700 
1701 	pc = pool_cache_enter(pp, &s);
1702 
1703 	nitems = pc->pc_nactv;
1704 	if (nitems >= cache_items) {
1705 		if (pc->pc_prev != NULL)
1706 			pool_list_free(pp, pc, pc->pc_prev);
1707 
1708 		pc->pc_prev = pc->pc_actv;
1709 
1710 		pc->pc_actv = NULL;
1711 		pc->pc_nactv = 0;
1712 		nitems = 0;
1713 	}
1714 
1715 	pl->pl_next = pc->pc_actv;
1716 	pl->pl_nitems = ++nitems;
1717 
1718 	pc->pc_actv = pl;
1719 	pc->pc_nactv = nitems;
1720 
1721 	pc->pc_puts++;
1722 	pc->pc_nout--;
1723 
1724 	pool_cache_leave(pp, pc, s);
1725 }
1726 
1727 struct pool_list *
1728 pool_list_put(struct pool *pp, struct pool_list *pl)
1729 {
1730 	struct pool_list *rpl, *npl;
1731 
1732 	if (pl == NULL)
1733 		return (NULL);
1734 
1735 	rpl = (struct pool_list *)pl->pl_next;
1736 
1737 	do {
1738 		npl = pl->pl_next;
1739 		pool_put(pp, pl);
1740 		pl = npl;
1741 	} while (pl != NULL);
1742 
1743 	return (rpl);
1744 }
1745 
1746 void
1747 pool_cache_destroy(struct pool *pp)
1748 {
1749 	struct pool_cache *pc;
1750 	struct pool_list *pl;
1751 	struct cpumem_iter i;
1752 	struct cpumem *cm;
1753 
1754 	cm = pp->pr_cache;
1755 	pp->pr_cache = NULL; /* make pool_put avoid the cache */
1756 
1757 	CPUMEM_FOREACH(pc, &i, cm) {
1758 		pool_list_put(pp, pc->pc_actv);
1759 		pool_list_put(pp, pc->pc_prev);
1760 	}
1761 
1762 	cpumem_put(&pool_caches, cm);
1763 
1764 	pl = pp->pr_cache_list;
1765 	while (pl != NULL)
1766 		pl = pool_list_put(pp, pl);
1767 }
1768 
1769 void
1770 pool_cache_info(struct pool *pp, struct kinfo_pool *pi)
1771 {
1772 	struct pool_cache *pc;
1773 	struct cpumem_iter i;
1774 
1775 	if (pp->pr_cache == NULL)
1776 		return;
1777 
1778 	/* loop through the caches twice to collect stats */
1779 
1780 	/* once without the mtx so we can yield while reading nget/nput */
1781 	CPUMEM_FOREACH(pc, &i, pp->pr_cache) {
1782 		uint64_t gen, nget, nput;
1783 
1784 		do {
1785 			while ((gen = pc->pc_gen) & 1)
1786 				yield();
1787 
1788 			nget = pc->pc_gets;
1789 			nput = pc->pc_puts;
1790 		} while (gen != pc->pc_gen);
1791 
1792 		pi->pr_nget += nget;
1793 		pi->pr_nput += nput;
1794 	}
1795 
1796 	/* and once with the mtx so we can get consistent nout values */
1797 	mtx_enter(&pp->pr_cache_mtx);
1798 	CPUMEM_FOREACH(pc, &i, pp->pr_cache)
1799 		pi->pr_nout += pc->pc_nout;
1800 
1801 	pi->pr_nout += pp->pr_cache_nout;
1802 	mtx_leave(&pp->pr_cache_mtx);
1803 }
1804 #else /* MULTIPROCESSOR */
1805 void
1806 pool_cache_init(struct pool *pp)
1807 {
1808 	/* nop */
1809 }
1810 
1811 void
1812 pool_cache_info(struct pool *pp, struct kinfo_pool *pi)
1813 {
1814 	/* nop */
1815 }
1816 #endif /* MULTIPROCESSOR */
1817