/*	$OpenBSD: subr_pool.c,v 1.231 2021/01/02 03:23:59 cheloha Exp $	*/
/*	$NetBSD: subr_pool.c,v 1.61 2001/09/26 07:14:56 chs Exp $	*/

/*-
 * Copyright (c) 1997, 1999, 2000 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Paul Kranenburg; by Jason R. Thorpe of the Numerical Aerospace
 * Simulation Facility, NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <sys/task.h>
#include <sys/time.h>
#include <sys/timeout.h>
#include <sys/percpu.h>

#include <uvm/uvm_extern.h>

/*
 * Pool resource management utility.
 *
 * Memory is allocated in pages which are split into pieces according to
 * the pool item size. Each page is kept on one of three lists in the
 * pool structure: `pr_emptypages', `pr_fullpages' and `pr_partpages',
 * for empty, full and partially-full pages respectively. The individual
 * pool items are on a linked list headed by `ph_items' in each page
 * header. The memory for building the page list is either taken from
 * the allocated pages themselves (for small pool items) or taken from
 * an internal pool of page headers (`phpool').
 */
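
/*
 * A minimal usage sketch (not taken from this file; "struct foo",
 * "foopool" and the flag choices are hypothetical):
 *
 *	struct pool foopool;
 *	struct foo *f;
 *
 *	pool_init(&foopool, sizeof(struct foo), 0, IPL_NONE,
 *	    PR_WAITOK, "foopl", NULL);
 *	f = pool_get(&foopool, PR_WAITOK | PR_ZERO);
 *	...
 *	pool_put(&foopool, f);
 */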

/* List of all pools */
SIMPLEQ_HEAD(,pool) pool_head = SIMPLEQ_HEAD_INITIALIZER(pool_head);

/*
 * Every pool gets a unique serial number assigned to it. If this counter
 * wraps, we're screwed, but we shouldn't create so many pools anyway.
 */
unsigned int pool_serial;
unsigned int pool_count;

/* Lock protecting the above variables that make up the global pool state */
struct rwlock pool_lock = RWLOCK_INITIALIZER("pools");

/* Private pool for page header structures */
struct pool phpool;

struct pool_lock_ops {
	void	(*pl_init)(struct pool *, union pool_lock *,
		    const struct lock_type *);
	void	(*pl_enter)(union pool_lock *);
	int	(*pl_enter_try)(union pool_lock *);
	void	(*pl_leave)(union pool_lock *);
	void	(*pl_assert_locked)(union pool_lock *);
	void	(*pl_assert_unlocked)(union pool_lock *);
	int	(*pl_sleep)(void *, union pool_lock *, int, const char *);
};

static const struct pool_lock_ops pool_lock_ops_mtx;
static const struct pool_lock_ops pool_lock_ops_rw;

#ifdef WITNESS
#define pl_init(pp, pl) do {						\
	static const struct lock_type __lock_type = { .lt_name = #pl };	\
	(pp)->pr_lock_ops->pl_init(pp, pl, &__lock_type);		\
} while (0)
#else /* WITNESS */
#define pl_init(pp, pl)		(pp)->pr_lock_ops->pl_init(pp, pl, NULL)
#endif /* WITNESS */

static inline void
pl_enter(struct pool *pp, union pool_lock *pl)
{
	pp->pr_lock_ops->pl_enter(pl);
}
static inline int
pl_enter_try(struct pool *pp, union pool_lock *pl)
{
	return pp->pr_lock_ops->pl_enter_try(pl);
}
static inline void
pl_leave(struct pool *pp, union pool_lock *pl)
{
	pp->pr_lock_ops->pl_leave(pl);
}
static inline void
pl_assert_locked(struct pool *pp, union pool_lock *pl)
{
	pp->pr_lock_ops->pl_assert_locked(pl);
}
static inline void
pl_assert_unlocked(struct pool *pp, union pool_lock *pl)
{
	pp->pr_lock_ops->pl_assert_unlocked(pl);
}
static inline int
pl_sleep(struct pool *pp, void *ident, union pool_lock *lock, int priority,
    const char *wmesg)
{
	return pp->pr_lock_ops->pl_sleep(ident, lock, priority, wmesg);
}

struct pool_item {
	u_long				pi_magic;
	XSIMPLEQ_ENTRY(pool_item)	pi_list;
};
#define POOL_IMAGIC(ph, pi) ((u_long)(pi) ^ (ph)->ph_magic)
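
/*
 * POOL_IMAGIC() ties an item's magic value to both the item address
 * and a per-page random secret (ph_magic).  A sketch with assumed
 * values: ph_magic 0x5a5a and item address 0x1000 store pi_magic as
 * 0x1000 ^ 0x5a5a = 0x4a5a, so an overwrite of pi_magic no longer
 * matches POOL_IMAGIC(ph, pi) and is caught on the next check.
 */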

struct pool_page_header {
	/* Page headers */
	TAILQ_ENTRY(pool_page_header)
				ph_entry;	/* pool page list */
	XSIMPLEQ_HEAD(, pool_item)
				ph_items;	/* free items on the page */
	RBT_ENTRY(pool_page_header)
				ph_node;	/* off-page page headers */
	unsigned int		ph_nmissing;	/* # of chunks in use */
	caddr_t			ph_page;	/* this page's address */
	caddr_t			ph_colored;	/* page's colored address */
	unsigned long		ph_magic;
	uint64_t		ph_timestamp;
};
#define POOL_MAGICBIT (1 << 3) /* keep away from perturbed low bits */
#define POOL_PHPOISON(ph) ISSET((ph)->ph_magic, POOL_MAGICBIT)

#ifdef MULTIPROCESSOR
struct pool_cache_item {
	struct pool_cache_item	*ci_next;	/* next item in list */
	unsigned long		 ci_nitems;	/* number of items in list */
	TAILQ_ENTRY(pool_cache_item)
				 ci_nextl;	/* entry in list of lists */
};

/* whether the cached item is poisoned is stored in the bit above the count */
#define POOL_CACHE_ITEM_NITEMS_MASK	0x7ffffffUL
#define POOL_CACHE_ITEM_NITEMS_POISON	0x8000000UL

#define POOL_CACHE_ITEM_NITEMS(_ci)					\
    ((_ci)->ci_nitems & POOL_CACHE_ITEM_NITEMS_MASK)

#define POOL_CACHE_ITEM_POISONED(_ci)					\
    ISSET((_ci)->ci_nitems, POOL_CACHE_ITEM_NITEMS_POISON)
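
/*
 * A sketch with assumed values: a cached list of three poisoned items
 * stores ci_nitems = 3 | 0x8000000, so POOL_CACHE_ITEM_NITEMS() still
 * yields 3 while POOL_CACHE_ITEM_POISONED() reports true.
 */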

struct pool_cache {
	struct pool_cache_item	*pc_actv;	/* active list of items */
	unsigned long		 pc_nactv;	/* actv head nitems cache */
	struct pool_cache_item	*pc_prev;	/* previous list of items */

	uint64_t		 pc_gen;	/* generation number */
	uint64_t		 pc_nget;	/* # of successful requests */
	uint64_t		 pc_nfail;	/* # of unsuccessful reqs */
	uint64_t		 pc_nput;	/* # of releases */
	uint64_t		 pc_nlget;	/* # of list requests */
	uint64_t		 pc_nlfail;	/* # of fails getting a list */
	uint64_t		 pc_nlput;	/* # of list releases */

	int			 pc_nout;
};

void	*pool_cache_get(struct pool *);
void	 pool_cache_put(struct pool *, void *);
void	 pool_cache_destroy(struct pool *);
void	 pool_cache_gc(struct pool *);
#endif
void	 pool_cache_pool_info(struct pool *, struct kinfo_pool *);
int	 pool_cache_info(struct pool *, void *, size_t *);
int	 pool_cache_cpus_info(struct pool *, void *, size_t *);

#ifdef POOL_DEBUG
int	pool_debug = 1;
#else
int	pool_debug = 0;
#endif

#define POOL_INPGHDR(pp) ((pp)->pr_phoffset != 0)

struct pool_page_header *
	 pool_p_alloc(struct pool *, int, int *);
void	 pool_p_insert(struct pool *, struct pool_page_header *);
void	 pool_p_remove(struct pool *, struct pool_page_header *);
void	 pool_p_free(struct pool *, struct pool_page_header *);

void	 pool_update_curpage(struct pool *);
void	*pool_do_get(struct pool *, int, int *);
void	 pool_do_put(struct pool *, void *);
int	 pool_chk_page(struct pool *, struct pool_page_header *, int);
int	 pool_chk(struct pool *);
void	 pool_get_done(struct pool *, void *, void *);
void	 pool_runqueue(struct pool *, int);

void	*pool_allocator_alloc(struct pool *, int, int *);
void	 pool_allocator_free(struct pool *, void *);

/*
 * The default pool allocator.
 */
void	*pool_page_alloc(struct pool *, int, int *);
void	pool_page_free(struct pool *, void *);

/*
 * safe for interrupts; this is the default allocator
 */
struct pool_allocator pool_allocator_single = {
	pool_page_alloc,
	pool_page_free,
	POOL_ALLOC_SIZE(PAGE_SIZE, POOL_ALLOC_ALIGNED)
};

void	*pool_multi_alloc(struct pool *, int, int *);
void	pool_multi_free(struct pool *, void *);

struct pool_allocator pool_allocator_multi = {
	pool_multi_alloc,
	pool_multi_free,
	POOL_ALLOC_SIZES(PAGE_SIZE, (1UL << 31), POOL_ALLOC_ALIGNED)
};

void	*pool_multi_alloc_ni(struct pool *, int, int *);
void	pool_multi_free_ni(struct pool *, void *);

struct pool_allocator pool_allocator_multi_ni = {
	pool_multi_alloc_ni,
	pool_multi_free_ni,
	POOL_ALLOC_SIZES(PAGE_SIZE, (1UL << 31), POOL_ALLOC_ALIGNED)
};

#ifdef DDB
void	 pool_print_pagelist(struct pool_pagelist *, int (*)(const char *, ...)
	     __attribute__((__format__(__kprintf__,1,2))));
void	 pool_print1(struct pool *, const char *, int (*)(const char *, ...)
	     __attribute__((__format__(__kprintf__,1,2))));
#endif

/* stale page garbage collectors */
void	pool_gc_sched(void *);
struct timeout pool_gc_tick = TIMEOUT_INITIALIZER(pool_gc_sched, NULL);
void	pool_gc_pages(void *);
struct task pool_gc_task = TASK_INITIALIZER(pool_gc_pages, NULL);

#define POOL_WAIT_FREE	SEC_TO_NSEC(1)
#define POOL_WAIT_GC	SEC_TO_NSEC(8)

/*
 * TODO Move getnsecuptime() to kern_tc.c and document it when we
 * have callers in other modules.
 */
static uint64_t
getnsecuptime(void)
{
	struct timespec now;

	getnanouptime(&now);
	return TIMESPEC_TO_NSEC(&now);
}

RBT_PROTOTYPE(phtree, pool_page_header, ph_node, phtree_compare);

static inline int
phtree_compare(const struct pool_page_header *a,
    const struct pool_page_header *b)
{
	vaddr_t va = (vaddr_t)a->ph_page;
	vaddr_t vb = (vaddr_t)b->ph_page;

	/* the compares in this order are important for the NFIND to work */
	if (vb < va)
		return (-1);
	if (vb > va)
		return (1);

	return (0);
}

RBT_GENERATE(phtree, pool_page_header, ph_node, phtree_compare);
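
/*
 * Because the comparison above is reversed (higher addresses sort
 * first), RBT_NFIND() with an item address as the key lands on the
 * header whose ph_page is the highest address at or below the item,
 * i.e. the page containing it.  A sketch with assumed addresses:
 * pages at 0x1000 and 0x2000 and an item at 0x2008 make NFIND return
 * the 0x2000 header, which pr_find_pagehead() then range-checks.
 */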

/*
 * Return the pool page header based on page address.
 */
static inline struct pool_page_header *
pr_find_pagehead(struct pool *pp, void *v)
{
	struct pool_page_header *ph, key;

	if (POOL_INPGHDR(pp)) {
		caddr_t page;

		page = (caddr_t)((vaddr_t)v & pp->pr_pgmask);

		return ((struct pool_page_header *)(page + pp->pr_phoffset));
	}

	key.ph_page = v;
	ph = RBT_NFIND(phtree, &pp->pr_phtree, &key);
	if (ph == NULL)
		panic("%s: %s: page header missing", __func__, pp->pr_wchan);

	KASSERT(ph->ph_page <= (caddr_t)v);
	if (ph->ph_page + pp->pr_pgsize <= (caddr_t)v)
		panic("%s: %s: incorrect page", __func__, pp->pr_wchan);

	return (ph);
}

/*
 * Initialize the given pool resource structure.
 *
 * We export this routine to allow other kernel parts to declare
 * static pools that must be initialized before malloc() is available.
 */
void
pool_init(struct pool *pp, size_t size, u_int align, int ipl, int flags,
    const char *wchan, struct pool_allocator *palloc)
{
	int off = 0, space;
	unsigned int pgsize = PAGE_SIZE, items;
	size_t pa_pagesz;
#ifdef DIAGNOSTIC
	struct pool *iter;
#endif

	if (align == 0)
		align = ALIGN(1);

	if (size < sizeof(struct pool_item))
		size = sizeof(struct pool_item);

	size = roundup(size, align);

	while (size * 8 > pgsize)
		pgsize <<= 1;

	if (palloc == NULL) {
		if (pgsize > PAGE_SIZE) {
			palloc = ISSET(flags, PR_WAITOK) ?
			    &pool_allocator_multi_ni : &pool_allocator_multi;
		} else
			palloc = &pool_allocator_single;

		pa_pagesz = palloc->pa_pagesz;
	} else {
		size_t pgsizes;

		pa_pagesz = palloc->pa_pagesz;
		if (pa_pagesz == 0)
			pa_pagesz = POOL_ALLOC_DEFAULT;

		pgsizes = pa_pagesz & ~POOL_ALLOC_ALIGNED;

		/* make sure the allocator can fit at least one item */
		if (size > pgsizes) {
			panic("%s: pool %s item size 0x%zx > "
			    "allocator %p sizes 0x%zx", __func__, wchan,
			    size, palloc, pgsizes);
		}

		/* shrink pgsize until it fits into the range */
		while (!ISSET(pgsizes, pgsize))
			pgsize >>= 1;
	}
	KASSERT(ISSET(pa_pagesz, pgsize));

	items = pgsize / size;

	/*
	 * Decide whether to put the page header off page to avoid
	 * wasting too large a part of the page. Off-page page headers
	 * go into an RB tree, so we can match a returned item with
	 * its header based on the page address.
	 */
	if (ISSET(pa_pagesz, POOL_ALLOC_ALIGNED)) {
		if (pgsize - (size * items) >
		    sizeof(struct pool_page_header)) {
			off = pgsize - sizeof(struct pool_page_header);
		} else if (sizeof(struct pool_page_header) * 2 >= size) {
			off = pgsize - sizeof(struct pool_page_header);
			items = off / size;
		}
	}
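
	/*
	 * Worked example (header size assumed to be 80 bytes): with a
	 * 4096 byte page, 256 byte items leave 4096 - 16 * 256 = 0
	 * bytes, failing the first test; 2 * 80 >= 256 fails too, so
	 * the header goes off-page.  With 64 byte items the second
	 * test holds, off becomes 4096 - 80 = 4016 and items drops
	 * from 64 to 62.
	 */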

	KASSERT(items > 0);

	/*
	 * Initialize the pool structure.
	 */
	memset(pp, 0, sizeof(*pp));
	if (ISSET(flags, PR_RWLOCK)) {
		KASSERT(flags & PR_WAITOK);
		pp->pr_lock_ops = &pool_lock_ops_rw;
	} else
		pp->pr_lock_ops = &pool_lock_ops_mtx;
	TAILQ_INIT(&pp->pr_emptypages);
	TAILQ_INIT(&pp->pr_fullpages);
	TAILQ_INIT(&pp->pr_partpages);
	pp->pr_curpage = NULL;
	pp->pr_npages = 0;
	pp->pr_minitems = 0;
	pp->pr_minpages = 0;
	pp->pr_maxpages = 8;
	pp->pr_size = size;
	pp->pr_pgsize = pgsize;
	pp->pr_pgmask = ~0UL ^ (pgsize - 1);
	pp->pr_phoffset = off;
	pp->pr_itemsperpage = items;
	pp->pr_wchan = wchan;
	pp->pr_alloc = palloc;
	pp->pr_nitems = 0;
	pp->pr_nout = 0;
	pp->pr_hardlimit = UINT_MAX;
	pp->pr_hardlimit_warning = NULL;
	pp->pr_hardlimit_ratecap.tv_sec = 0;
	pp->pr_hardlimit_ratecap.tv_usec = 0;
	pp->pr_hardlimit_warning_last.tv_sec = 0;
	pp->pr_hardlimit_warning_last.tv_usec = 0;
	RBT_INIT(phtree, &pp->pr_phtree);

	/*
	 * Use the space between the chunks and the page header
	 * for cache coloring.
	 */
	space = POOL_INPGHDR(pp) ? pp->pr_phoffset : pp->pr_pgsize;
	space -= pp->pr_itemsperpage * pp->pr_size;
	pp->pr_align = align;
	pp->pr_maxcolors = (space / align) + 1;
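
	/*
	 * Continuing the example above (sizes assumed): off 4016 and
	 * 62 items of 64 bytes leave space = 4016 - 3968 = 48; with
	 * the default align of 8 that gives 48 / 8 + 1 = 7 colors, so
	 * successive pages start their items at offsets 0, 8, ..., 48.
	 */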

	pp->pr_nget = 0;
	pp->pr_nfail = 0;
	pp->pr_nput = 0;
	pp->pr_npagealloc = 0;
	pp->pr_npagefree = 0;
	pp->pr_hiwat = 0;
	pp->pr_nidle = 0;

	pp->pr_ipl = ipl;
	pp->pr_flags = flags;

	pl_init(pp, &pp->pr_lock);
	pl_init(pp, &pp->pr_requests_lock);
	TAILQ_INIT(&pp->pr_requests);

	if (phpool.pr_size == 0) {
		pool_init(&phpool, sizeof(struct pool_page_header), 0,
		    IPL_HIGH, 0, "phpool", NULL);

		/* make sure phpool won't "recurse" */
		KASSERT(POOL_INPGHDR(&phpool));
	}

	/* pglistalloc/constraint parameters */
	pp->pr_crange = &kp_dirty;

	/* Insert this into the list of all pools. */
	rw_enter_write(&pool_lock);
#ifdef DIAGNOSTIC
	SIMPLEQ_FOREACH(iter, &pool_head, pr_poollist) {
		if (iter == pp)
			panic("%s: pool %s already on list", __func__, wchan);
	}
#endif

	pp->pr_serial = ++pool_serial;
	if (pool_serial == 0)
		panic("%s: too much uptime", __func__);

	SIMPLEQ_INSERT_HEAD(&pool_head, pp, pr_poollist);
	pool_count++;
	rw_exit_write(&pool_lock);
}

/*
 * Decommission a pool resource.
 */
void
pool_destroy(struct pool *pp)
{
	struct pool_page_header *ph;
	struct pool *prev, *iter;

#ifdef MULTIPROCESSOR
	if (pp->pr_cache != NULL)
		pool_cache_destroy(pp);
#endif

#ifdef DIAGNOSTIC
	if (pp->pr_nout != 0)
		panic("%s: pool busy: still out: %u", __func__, pp->pr_nout);
#endif

	/* Remove from global pool list */
	rw_enter_write(&pool_lock);
	pool_count--;
	if (pp == SIMPLEQ_FIRST(&pool_head))
		SIMPLEQ_REMOVE_HEAD(&pool_head, pr_poollist);
	else {
		prev = SIMPLEQ_FIRST(&pool_head);
		SIMPLEQ_FOREACH(iter, &pool_head, pr_poollist) {
			if (iter == pp) {
				SIMPLEQ_REMOVE_AFTER(&pool_head, prev,
				    pr_poollist);
				break;
			}
			prev = iter;
		}
	}
	rw_exit_write(&pool_lock);

	/* Remove all pages */
	while ((ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL) {
		pl_enter(pp, &pp->pr_lock);
		pool_p_remove(pp, ph);
		pl_leave(pp, &pp->pr_lock);
		pool_p_free(pp, ph);
	}
	KASSERT(TAILQ_EMPTY(&pp->pr_fullpages));
	KASSERT(TAILQ_EMPTY(&pp->pr_partpages));
}

void
pool_request_init(struct pool_request *pr,
    void (*handler)(struct pool *, void *, void *), void *cookie)
{
	pr->pr_handler = handler;
	pr->pr_cookie = cookie;
	pr->pr_item = NULL;
}

void
pool_request(struct pool *pp, struct pool_request *pr)
{
	pl_enter(pp, &pp->pr_requests_lock);
	TAILQ_INSERT_TAIL(&pp->pr_requests, pr, pr_entry);
	pool_runqueue(pp, PR_NOWAIT);
	pl_leave(pp, &pp->pr_requests_lock);
}
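
/*
 * A sketch of asynchronous use, e.g. from a context that must not
 * sleep; "struct softc", its sc_pr member and mycb() are hypothetical:
 *
 *	void
 *	mycb(struct pool *pp, void *cookie, void *item)
 *	{
 *		struct softc *sc = cookie;
 *		... take ownership of item ...
 *	}
 *
 *	pool_request_init(&sc->sc_pr, mycb, sc);
 *	pool_request(pp, &sc->sc_pr);
 */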

struct pool_get_memory {
	union pool_lock lock;
	void * volatile v;
};

/*
 * Grab an item from the pool.
 */
void *
pool_get(struct pool *pp, int flags)
{
	void *v = NULL;
	int slowdown = 0;

	KASSERT(flags & (PR_WAITOK | PR_NOWAIT));
	if (pp->pr_flags & PR_RWLOCK)
		KASSERT(flags & PR_WAITOK);

#ifdef MULTIPROCESSOR
	if (pp->pr_cache != NULL) {
		v = pool_cache_get(pp);
		if (v != NULL)
			goto good;
	}
#endif

	pl_enter(pp, &pp->pr_lock);
	if (pp->pr_nout >= pp->pr_hardlimit) {
		if (ISSET(flags, PR_NOWAIT|PR_LIMITFAIL))
			goto fail;
	} else if ((v = pool_do_get(pp, flags, &slowdown)) == NULL) {
		if (ISSET(flags, PR_NOWAIT))
			goto fail;
	}
	pl_leave(pp, &pp->pr_lock);

	if ((slowdown || pool_debug == 2) && ISSET(flags, PR_WAITOK))
		yield();

	if (v == NULL) {
		struct pool_get_memory mem = { .v = NULL };
		struct pool_request pr;

#ifdef DIAGNOSTIC
		if (ISSET(flags, PR_WAITOK) && curproc == &proc0)
			panic("%s: cannot sleep for memory during boot",
			    __func__);
#endif
		pl_init(pp, &mem.lock);
		pool_request_init(&pr, pool_get_done, &mem);
		pool_request(pp, &pr);

		pl_enter(pp, &mem.lock);
		while (mem.v == NULL)
			pl_sleep(pp, &mem, &mem.lock, PSWP, pp->pr_wchan);
		pl_leave(pp, &mem.lock);

		v = mem.v;
	}

#ifdef MULTIPROCESSOR
good:
#endif
	if (ISSET(flags, PR_ZERO))
		memset(v, 0, pp->pr_size);

	return (v);

fail:
	pp->pr_nfail++;
	pl_leave(pp, &pp->pr_lock);
	return (NULL);
}

void
pool_get_done(struct pool *pp, void *xmem, void *v)
{
	struct pool_get_memory *mem = xmem;

	pl_enter(pp, &mem->lock);
	mem->v = v;
	pl_leave(pp, &mem->lock);

	wakeup_one(mem);
}

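/*
 * pool_runqueue() serialises request servicing without holding both
 * locks at once: pr_requesting acts as a ticket.  The first CPU to
 * raise it from zero becomes the worker and loops below; later CPUs
 * just bump the count and return, and the worker re-checks the queue
 * once per bump before draining the count back to zero.
 */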
void
pool_runqueue(struct pool *pp, int flags)
{
	struct pool_requests prl = TAILQ_HEAD_INITIALIZER(prl);
	struct pool_request *pr;

	pl_assert_unlocked(pp, &pp->pr_lock);
	pl_assert_locked(pp, &pp->pr_requests_lock);

	if (pp->pr_requesting++)
		return;

	do {
		pp->pr_requesting = 1;

		TAILQ_CONCAT(&prl, &pp->pr_requests, pr_entry);
		if (TAILQ_EMPTY(&prl))
			continue;

		pl_leave(pp, &pp->pr_requests_lock);

		pl_enter(pp, &pp->pr_lock);
		pr = TAILQ_FIRST(&prl);
		while (pr != NULL) {
			int slowdown = 0;

			if (pp->pr_nout >= pp->pr_hardlimit)
				break;

			pr->pr_item = pool_do_get(pp, flags, &slowdown);
			if (pr->pr_item == NULL) /* || slowdown ? */
				break;

			pr = TAILQ_NEXT(pr, pr_entry);
		}
		pl_leave(pp, &pp->pr_lock);

		while ((pr = TAILQ_FIRST(&prl)) != NULL &&
		    pr->pr_item != NULL) {
			TAILQ_REMOVE(&prl, pr, pr_entry);
			(*pr->pr_handler)(pp, pr->pr_cookie, pr->pr_item);
		}

		pl_enter(pp, &pp->pr_requests_lock);
	} while (--pp->pr_requesting);

	TAILQ_CONCAT(&pp->pr_requests, &prl, pr_entry);
}

void *
pool_do_get(struct pool *pp, int flags, int *slowdown)
{
	struct pool_item *pi;
	struct pool_page_header *ph;

	pl_assert_locked(pp, &pp->pr_lock);

	splassert(pp->pr_ipl);

	/*
	 * Account for this item now to avoid races if we need to give up
	 * pr_lock to allocate a page.
	 */
	pp->pr_nout++;

	if (pp->pr_curpage == NULL) {
		pl_leave(pp, &pp->pr_lock);
		ph = pool_p_alloc(pp, flags, slowdown);
		pl_enter(pp, &pp->pr_lock);

		if (ph == NULL) {
			pp->pr_nout--;
			return (NULL);
		}

		pool_p_insert(pp, ph);
	}

	ph = pp->pr_curpage;
	pi = XSIMPLEQ_FIRST(&ph->ph_items);
	if (__predict_false(pi == NULL))
		panic("%s: %s: page empty", __func__, pp->pr_wchan);

	if (__predict_false(pi->pi_magic != POOL_IMAGIC(ph, pi))) {
		panic("%s: %s free list modified: "
		    "page %p; item addr %p; offset 0x%x=0x%lx != 0x%lx",
		    __func__, pp->pr_wchan, ph->ph_page, pi,
		    0, pi->pi_magic, POOL_IMAGIC(ph, pi));
	}

	XSIMPLEQ_REMOVE_HEAD(&ph->ph_items, pi_list);

#ifdef DIAGNOSTIC
	if (pool_debug && POOL_PHPOISON(ph)) {
		size_t pidx;
		uint32_t pval;
		if (poison_check(pi + 1, pp->pr_size - sizeof(*pi),
		    &pidx, &pval)) {
			int *ip = (int *)(pi + 1);
			panic("%s: %s free list modified: "
			    "page %p; item addr %p; offset 0x%zx=0x%x",
			    __func__, pp->pr_wchan, ph->ph_page, pi,
			    (pidx * sizeof(int)) + sizeof(*pi), ip[pidx]);
		}
	}
#endif /* DIAGNOSTIC */

	if (ph->ph_nmissing++ == 0) {
		/*
		 * This page was previously empty.  Move it to the list of
		 * partially-full pages.  This page is already curpage.
		 */
		TAILQ_REMOVE(&pp->pr_emptypages, ph, ph_entry);
		TAILQ_INSERT_TAIL(&pp->pr_partpages, ph, ph_entry);

		pp->pr_nidle--;
	}

	if (ph->ph_nmissing == pp->pr_itemsperpage) {
		/*
		 * This page is now full.  Move it to the full list
		 * and select a new current page.
		 */
		TAILQ_REMOVE(&pp->pr_partpages, ph, ph_entry);
		TAILQ_INSERT_TAIL(&pp->pr_fullpages, ph, ph_entry);
		pool_update_curpage(pp);
	}

	pp->pr_nget++;

	return (pi);
}

/*
 * Return resource to the pool.
 */
void
pool_put(struct pool *pp, void *v)
{
	struct pool_page_header *ph, *freeph = NULL;

#ifdef DIAGNOSTIC
	if (v == NULL)
		panic("%s: NULL item", __func__);
#endif

#ifdef MULTIPROCESSOR
	if (pp->pr_cache != NULL && TAILQ_EMPTY(&pp->pr_requests)) {
		pool_cache_put(pp, v);
		return;
	}
#endif

	pl_enter(pp, &pp->pr_lock);

	pool_do_put(pp, v);

	pp->pr_nout--;
	pp->pr_nput++;

	/* is it time to free a page? */
	if (pp->pr_nidle > pp->pr_maxpages &&
	    (ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL &&
	    getnsecuptime() - ph->ph_timestamp > POOL_WAIT_FREE) {
		freeph = ph;
		pool_p_remove(pp, freeph);
	}

	pl_leave(pp, &pp->pr_lock);

	if (freeph != NULL)
		pool_p_free(pp, freeph);

	pool_wakeup(pp);
}

void
pool_wakeup(struct pool *pp)
{
	if (!TAILQ_EMPTY(&pp->pr_requests)) {
		pl_enter(pp, &pp->pr_requests_lock);
		pool_runqueue(pp, PR_NOWAIT);
		pl_leave(pp, &pp->pr_requests_lock);
	}
}

void
pool_do_put(struct pool *pp, void *v)
{
	struct pool_item *pi = v;
	struct pool_page_header *ph;

	splassert(pp->pr_ipl);

	ph = pr_find_pagehead(pp, v);

#ifdef DIAGNOSTIC
	if (pool_debug) {
		struct pool_item *qi;
		XSIMPLEQ_FOREACH(qi, &ph->ph_items, pi_list) {
			if (pi == qi) {
				panic("%s: %s: double pool_put: %p", __func__,
				    pp->pr_wchan, pi);
			}
		}
	}
#endif /* DIAGNOSTIC */

	pi->pi_magic = POOL_IMAGIC(ph, pi);
	XSIMPLEQ_INSERT_HEAD(&ph->ph_items, pi, pi_list);
#ifdef DIAGNOSTIC
	if (POOL_PHPOISON(ph))
		poison_mem(pi + 1, pp->pr_size - sizeof(*pi));
#endif /* DIAGNOSTIC */

	if (ph->ph_nmissing-- == pp->pr_itemsperpage) {
		/*
		 * The page was previously completely full, move it to the
		 * partially-full list.
		 */
		TAILQ_REMOVE(&pp->pr_fullpages, ph, ph_entry);
		TAILQ_INSERT_TAIL(&pp->pr_partpages, ph, ph_entry);
	}

	if (ph->ph_nmissing == 0) {
		/*
		 * The page is now empty, so move it to the empty page list.
		 */
		pp->pr_nidle++;

		ph->ph_timestamp = getnsecuptime();
		TAILQ_REMOVE(&pp->pr_partpages, ph, ph_entry);
		TAILQ_INSERT_TAIL(&pp->pr_emptypages, ph, ph_entry);
		pool_update_curpage(pp);
	}
}

/*
 * Add N items to the pool.
 */
int
pool_prime(struct pool *pp, int n)
{
	struct pool_pagelist pl = TAILQ_HEAD_INITIALIZER(pl);
	struct pool_page_header *ph;
	int newpages;

	newpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;

	while (newpages-- > 0) {
		int slowdown = 0;

		ph = pool_p_alloc(pp, PR_NOWAIT, &slowdown);
		if (ph == NULL) /* or slowdown? */
			break;

		TAILQ_INSERT_TAIL(&pl, ph, ph_entry);
	}

	pl_enter(pp, &pp->pr_lock);
	while ((ph = TAILQ_FIRST(&pl)) != NULL) {
		TAILQ_REMOVE(&pl, ph, ph_entry);
		pool_p_insert(pp, ph);
	}
	pl_leave(pp, &pp->pr_lock);

	return (0);
}

struct pool_page_header *
pool_p_alloc(struct pool *pp, int flags, int *slowdown)
{
	struct pool_page_header *ph;
	struct pool_item *pi;
	caddr_t addr;
	unsigned int order;
	int o;
	int n;

	pl_assert_unlocked(pp, &pp->pr_lock);
	KASSERT(pp->pr_size >= sizeof(*pi));

	addr = pool_allocator_alloc(pp, flags, slowdown);
	if (addr == NULL)
		return (NULL);

	if (POOL_INPGHDR(pp))
		ph = (struct pool_page_header *)(addr + pp->pr_phoffset);
	else {
		ph = pool_get(&phpool, flags);
		if (ph == NULL) {
			pool_allocator_free(pp, addr);
			return (NULL);
		}
	}

	XSIMPLEQ_INIT(&ph->ph_items);
	ph->ph_page = addr;
	addr += pp->pr_align * (pp->pr_npagealloc % pp->pr_maxcolors);
	ph->ph_colored = addr;
	ph->ph_nmissing = 0;
	arc4random_buf(&ph->ph_magic, sizeof(ph->ph_magic));
#ifdef DIAGNOSTIC
	/* use a bit in ph_magic to record if we poison page items */
	if (pool_debug)
		SET(ph->ph_magic, POOL_MAGICBIT);
	else
		CLR(ph->ph_magic, POOL_MAGICBIT);
#endif /* DIAGNOSTIC */

	n = pp->pr_itemsperpage;
	o = 32;
	while (n--) {
		pi = (struct pool_item *)addr;
		pi->pi_magic = POOL_IMAGIC(ph, pi);

		if (o == 32) {
			order = arc4random();
			o = 0;
		}
		if (ISSET(order, 1 << o++))
			XSIMPLEQ_INSERT_TAIL(&ph->ph_items, pi, pi_list);
		else
			XSIMPLEQ_INSERT_HEAD(&ph->ph_items, pi, pi_list);

#ifdef DIAGNOSTIC
		if (POOL_PHPOISON(ph))
			poison_mem(pi + 1, pp->pr_size - sizeof(*pi));
#endif /* DIAGNOSTIC */

		addr += pp->pr_size;
	}

	return (ph);
}
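
/*
 * The insertion loop above randomises the initial free list cheaply:
 * a single arc4random() call provides 32 coin flips, and each item is
 * pushed at the head or the tail of ph_items depending on the next
 * bit.  This yields one of many possible item orders rather than a
 * uniform shuffle, which is enough to make allocation addresses on a
 * fresh page hard to predict.
 */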

void
pool_p_free(struct pool *pp, struct pool_page_header *ph)
{
	struct pool_item *pi;

	pl_assert_unlocked(pp, &pp->pr_lock);
	KASSERT(ph->ph_nmissing == 0);

	XSIMPLEQ_FOREACH(pi, &ph->ph_items, pi_list) {
		if (__predict_false(pi->pi_magic != POOL_IMAGIC(ph, pi))) {
			panic("%s: %s free list modified: "
			    "page %p; item addr %p; offset 0x%x=0x%lx",
			    __func__, pp->pr_wchan, ph->ph_page, pi,
			    0, pi->pi_magic);
		}

#ifdef DIAGNOSTIC
		if (POOL_PHPOISON(ph)) {
			size_t pidx;
			uint32_t pval;
			if (poison_check(pi + 1, pp->pr_size - sizeof(*pi),
			    &pidx, &pval)) {
				int *ip = (int *)(pi + 1);
				panic("%s: %s free list modified: "
				    "page %p; item addr %p; offset 0x%zx=0x%x",
				    __func__, pp->pr_wchan, ph->ph_page, pi,
				    pidx * sizeof(int), ip[pidx]);
			}
		}
#endif
	}

	pool_allocator_free(pp, ph->ph_page);

	if (!POOL_INPGHDR(pp))
		pool_put(&phpool, ph);
}

void
pool_p_insert(struct pool *pp, struct pool_page_header *ph)
{
	pl_assert_locked(pp, &pp->pr_lock);

	/* If the pool was depleted, point at the new page */
	if (pp->pr_curpage == NULL)
		pp->pr_curpage = ph;

	TAILQ_INSERT_TAIL(&pp->pr_emptypages, ph, ph_entry);
	if (!POOL_INPGHDR(pp))
		RBT_INSERT(phtree, &pp->pr_phtree, ph);

	pp->pr_nitems += pp->pr_itemsperpage;
	pp->pr_nidle++;

	pp->pr_npagealloc++;
	if (++pp->pr_npages > pp->pr_hiwat)
		pp->pr_hiwat = pp->pr_npages;
}

void
pool_p_remove(struct pool *pp, struct pool_page_header *ph)
{
	pl_assert_locked(pp, &pp->pr_lock);

	pp->pr_npagefree++;
	pp->pr_npages--;
	pp->pr_nidle--;
	pp->pr_nitems -= pp->pr_itemsperpage;

	if (!POOL_INPGHDR(pp))
		RBT_REMOVE(phtree, &pp->pr_phtree, ph);
	TAILQ_REMOVE(&pp->pr_emptypages, ph, ph_entry);

	pool_update_curpage(pp);
}

void
pool_update_curpage(struct pool *pp)
{
	pp->pr_curpage = TAILQ_LAST(&pp->pr_partpages, pool_pagelist);
	if (pp->pr_curpage == NULL) {
		pp->pr_curpage = TAILQ_LAST(&pp->pr_emptypages, pool_pagelist);
	}
}

void
pool_setlowat(struct pool *pp, int n)
{
	int prime = 0;

	pl_enter(pp, &pp->pr_lock);
	pp->pr_minitems = n;
	pp->pr_minpages = (n == 0)
		? 0
		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;

	if (pp->pr_nitems < n)
		prime = n - pp->pr_nitems;
	pl_leave(pp, &pp->pr_lock);

	if (prime > 0)
		pool_prime(pp, prime);
}

void
pool_sethiwat(struct pool *pp, int n)
{
	pp->pr_maxpages = (n == 0)
		? 0
		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
}

int
pool_sethardlimit(struct pool *pp, u_int n, const char *warnmsg, int ratecap)
{
	int error = 0;

	if (n < pp->pr_nout) {
		error = EINVAL;
		goto done;
	}

	pp->pr_hardlimit = n;
	pp->pr_hardlimit_warning = warnmsg;
	pp->pr_hardlimit_ratecap.tv_sec = ratecap;
	pp->pr_hardlimit_warning_last.tv_sec = 0;
	pp->pr_hardlimit_warning_last.tv_usec = 0;

done:
	return (error);
}

void
pool_set_constraints(struct pool *pp, const struct kmem_pa_mode *mode)
{
	pp->pr_crange = mode;
}

/*
 * Release all complete pages that have not been used recently.
 *
 * Returns non-zero if any pages have been reclaimed.
 */
int
pool_reclaim(struct pool *pp)
{
	struct pool_page_header *ph, *phnext;
	struct pool_pagelist pl = TAILQ_HEAD_INITIALIZER(pl);

	pl_enter(pp, &pp->pr_lock);
	for (ph = TAILQ_FIRST(&pp->pr_emptypages); ph != NULL; ph = phnext) {
		phnext = TAILQ_NEXT(ph, ph_entry);

		/* Check our minimum page claim */
		if (pp->pr_npages <= pp->pr_minpages)
			break;

		/*
		 * If freeing this page would put us below
		 * the low water mark, stop now.
		 */
		if ((pp->pr_nitems - pp->pr_itemsperpage) <
		    pp->pr_minitems)
			break;

		pool_p_remove(pp, ph);
		TAILQ_INSERT_TAIL(&pl, ph, ph_entry);
	}
	pl_leave(pp, &pp->pr_lock);

	if (TAILQ_EMPTY(&pl))
		return (0);

	while ((ph = TAILQ_FIRST(&pl)) != NULL) {
		TAILQ_REMOVE(&pl, ph, ph_entry);
		pool_p_free(pp, ph);
	}

	return (1);
}

/*
 * Release all complete pages that have not been used recently
 * from all pools.
 */
void
pool_reclaim_all(void)
{
	struct pool	*pp;

	rw_enter_read(&pool_lock);
	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist)
		pool_reclaim(pp);
	rw_exit_read(&pool_lock);
}

#ifdef DDB
#include <machine/db_machdep.h>
#include <ddb/db_output.h>

/*
 * Diagnostic helpers.
 */
void
pool_printit(struct pool *pp, const char *modif,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	pool_print1(pp, modif, pr);
}

void
pool_print_pagelist(struct pool_pagelist *pl,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	struct pool_page_header *ph;
	struct pool_item *pi;

	TAILQ_FOREACH(ph, pl, ph_entry) {
		(*pr)("\t\tpage %p, color %p, nmissing %d\n",
		    ph->ph_page, ph->ph_colored, ph->ph_nmissing);
		XSIMPLEQ_FOREACH(pi, &ph->ph_items, pi_list) {
			if (pi->pi_magic != POOL_IMAGIC(ph, pi)) {
				(*pr)("\t\t\titem %p, magic 0x%lx\n",
				    pi, pi->pi_magic);
			}
		}
	}
}

void
pool_print1(struct pool *pp, const char *modif,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	struct pool_page_header *ph;
	int print_pagelist = 0;
	char c;

	while ((c = *modif++) != '\0') {
		if (c == 'p')
			print_pagelist = 1;
		modif++;
	}

	(*pr)("POOL %s: size %u maxcolors %u\n", pp->pr_wchan, pp->pr_size,
	    pp->pr_maxcolors);
	(*pr)("\talloc %p\n", pp->pr_alloc);
	(*pr)("\tminitems %u, minpages %u, maxpages %u, npages %u\n",
	    pp->pr_minitems, pp->pr_minpages, pp->pr_maxpages, pp->pr_npages);
	(*pr)("\titemsperpage %u, nitems %u, nout %u, hardlimit %u\n",
	    pp->pr_itemsperpage, pp->pr_nitems, pp->pr_nout, pp->pr_hardlimit);

	(*pr)("\n\tnget %lu, nfail %lu, nput %lu\n",
	    pp->pr_nget, pp->pr_nfail, pp->pr_nput);
	(*pr)("\tnpagealloc %lu, npagefree %lu, hiwat %u, nidle %lu\n",
	    pp->pr_npagealloc, pp->pr_npagefree, pp->pr_hiwat, pp->pr_nidle);

	if (print_pagelist == 0)
		return;

	if ((ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL)
		(*pr)("\n\tempty page list:\n");
	pool_print_pagelist(&pp->pr_emptypages, pr);
	if ((ph = TAILQ_FIRST(&pp->pr_fullpages)) != NULL)
		(*pr)("\n\tfull page list:\n");
	pool_print_pagelist(&pp->pr_fullpages, pr);
	if ((ph = TAILQ_FIRST(&pp->pr_partpages)) != NULL)
		(*pr)("\n\tpartial-page list:\n");
	pool_print_pagelist(&pp->pr_partpages, pr);

	if (pp->pr_curpage == NULL)
		(*pr)("\tno current page\n");
	else
		(*pr)("\tcurpage %p\n", pp->pr_curpage->ph_page);
}

void
db_show_all_pools(db_expr_t expr, int haddr, db_expr_t count, char *modif)
{
	struct pool *pp;
	char maxp[16];
	int ovflw;
	char mode;

	mode = modif[0];
	if (mode != '\0' && mode != 'a') {
		db_printf("usage: show all pools [/a]\n");
		return;
	}

	if (mode == '\0')
		db_printf("%-10s%4s%9s%5s%9s%6s%6s%6s%6s%6s%6s%5s\n",
		    "Name",
		    "Size",
		    "Requests",
		    "Fail",
		    "Releases",
		    "Pgreq",
		    "Pgrel",
		    "Npage",
		    "Hiwat",
		    "Minpg",
		    "Maxpg",
		    "Idle");
	else
		db_printf("%-12s %18s %18s\n",
		    "Name", "Address", "Allocator");

	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) {
		if (mode == 'a') {
			db_printf("%-12s %18p %18p\n", pp->pr_wchan, pp,
			    pp->pr_alloc);
			continue;
		}

		if (!pp->pr_nget)
			continue;

		if (pp->pr_maxpages == UINT_MAX)
			snprintf(maxp, sizeof maxp, "inf");
		else
			snprintf(maxp, sizeof maxp, "%u", pp->pr_maxpages);

#define PRWORD(ovflw, fmt, width, fixed, val) do {	\
	(ovflw) += db_printf((fmt),			\
	    (width) - (fixed) - (ovflw) > 0 ?		\
	    (width) - (fixed) - (ovflw) : 0,		\
	    (val)) - (width);				\
	if ((ovflw) < 0)				\
		(ovflw) = 0;				\
} while (/* CONSTCOND */0)

		ovflw = 0;
		PRWORD(ovflw, "%-*s", 10, 0, pp->pr_wchan);
		PRWORD(ovflw, " %*u", 4, 1, pp->pr_size);
		PRWORD(ovflw, " %*lu", 9, 1, pp->pr_nget);
		PRWORD(ovflw, " %*lu", 5, 1, pp->pr_nfail);
		PRWORD(ovflw, " %*lu", 9, 1, pp->pr_nput);
		PRWORD(ovflw, " %*lu", 6, 1, pp->pr_npagealloc);
		PRWORD(ovflw, " %*lu", 6, 1, pp->pr_npagefree);
		PRWORD(ovflw, " %*d", 6, 1, pp->pr_npages);
		PRWORD(ovflw, " %*d", 6, 1, pp->pr_hiwat);
		PRWORD(ovflw, " %*d", 6, 1, pp->pr_minpages);
		PRWORD(ovflw, " %*s", 6, 1, maxp);
		PRWORD(ovflw, " %*lu\n", 5, 1, pp->pr_nidle);

		pool_chk(pp);
	}
}
#endif /* DDB */

#if defined(POOL_DEBUG) || defined(DDB)
int
pool_chk_page(struct pool *pp, struct pool_page_header *ph, int expected)
{
	struct pool_item *pi;
	caddr_t page;
	int n;
	const char *label = pp->pr_wchan;

	page = (caddr_t)((u_long)ph & pp->pr_pgmask);
	if (page != ph->ph_page && POOL_INPGHDR(pp)) {
		printf("%s: ", label);
		printf("pool(%p:%s): page inconsistency: page %p; "
		    "at page head addr %p (p %p)\n",
		    pp, pp->pr_wchan, ph->ph_page, ph, page);
		return 1;
	}

	for (pi = XSIMPLEQ_FIRST(&ph->ph_items), n = 0;
	     pi != NULL;
	     pi = XSIMPLEQ_NEXT(&ph->ph_items, pi, pi_list), n++) {
		if ((caddr_t)pi < ph->ph_page ||
		    (caddr_t)pi >= ph->ph_page + pp->pr_pgsize) {
			printf("%s: ", label);
			printf("pool(%p:%s): page inconsistency: page %p;"
			    " item ordinal %d; addr %p\n", pp,
			    pp->pr_wchan, ph->ph_page, n, pi);
			return (1);
		}

		if (pi->pi_magic != POOL_IMAGIC(ph, pi)) {
			printf("%s: ", label);
			printf("pool(%p:%s): free list modified: "
			    "page %p; item ordinal %d; addr %p "
			    "(p %p); offset 0x%x=0x%lx\n",
			    pp, pp->pr_wchan, ph->ph_page, n, pi, page,
			    0, pi->pi_magic);
		}

#ifdef DIAGNOSTIC
		if (POOL_PHPOISON(ph)) {
			size_t pidx;
			uint32_t pval;
			if (poison_check(pi + 1, pp->pr_size - sizeof(*pi),
			    &pidx, &pval)) {
				int *ip = (int *)(pi + 1);
				printf("pool(%s): free list modified: "
				    "page %p; item ordinal %d; addr %p "
				    "(p %p); offset 0x%zx=0x%x\n",
				    pp->pr_wchan, ph->ph_page, n, pi,
				    page, pidx * sizeof(int), ip[pidx]);
			}
		}
#endif /* DIAGNOSTIC */
	}
	if (n + ph->ph_nmissing != pp->pr_itemsperpage) {
		printf("pool(%p:%s): page inconsistency: page %p;"
		    " %d on list, %d missing, %d items per page\n", pp,
		    pp->pr_wchan, ph->ph_page, n, ph->ph_nmissing,
		    pp->pr_itemsperpage);
		return 1;
	}
	if (expected >= 0 && n != expected) {
		printf("pool(%p:%s): page inconsistency: page %p;"
		    " %d on list, %d missing, %d expected\n", pp,
		    pp->pr_wchan, ph->ph_page, n, ph->ph_nmissing,
		    expected);
		return 1;
	}
	return 0;
}

int
pool_chk(struct pool *pp)
{
	struct pool_page_header *ph;
	int r = 0;

	TAILQ_FOREACH(ph, &pp->pr_emptypages, ph_entry)
		r += pool_chk_page(pp, ph, pp->pr_itemsperpage);
	TAILQ_FOREACH(ph, &pp->pr_fullpages, ph_entry)
		r += pool_chk_page(pp, ph, 0);
	TAILQ_FOREACH(ph, &pp->pr_partpages, ph_entry)
		r += pool_chk_page(pp, ph, -1);

	return (r);
}
#endif /* defined(POOL_DEBUG) || defined(DDB) */

#ifdef DDB
void
pool_walk(struct pool *pp, int full,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))),
    void (*func)(void *, int, int (*)(const char *, ...)
	    __attribute__((__format__(__kprintf__,1,2)))))
{
	struct pool_page_header *ph;
	struct pool_item *pi;
	caddr_t cp;
	int n;

	TAILQ_FOREACH(ph, &pp->pr_fullpages, ph_entry) {
		cp = ph->ph_colored;
		n = ph->ph_nmissing;

		while (n--) {
			func(cp, full, pr);
			cp += pp->pr_size;
		}
	}

	TAILQ_FOREACH(ph, &pp->pr_partpages, ph_entry) {
		cp = ph->ph_colored;
		n = ph->ph_nmissing;

		do {
			XSIMPLEQ_FOREACH(pi, &ph->ph_items, pi_list) {
				if (cp == (caddr_t)pi)
					break;
			}
			if (cp != (caddr_t)pi) {
				func(cp, full, pr);
				n--;
			}

			cp += pp->pr_size;
		} while (n > 0);
	}
}
#endif

/*
 * We have several pool sysctls.
 * kern.pool.npools - the number of pools.
 * kern.pool.pool.<pool#> - the pool struct for the pool#.
 * kern.pool.name.<pool#> - the name for pool#.
 * KERN_POOL_CACHE and KERN_POOL_CACHE_CPUS additionally expose the
 * per-pool and per-CPU cache statistics for pool#.
 */
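
/*
 * A userland sketch of reading one pool, error handling omitted;
 * "serial" is an assumed, already-known pool serial number:
 *
 *	int mib[4] = { CTL_KERN, KERN_POOL, KERN_POOL_POOL, serial };
 *	struct kinfo_pool pi;
 *	size_t len = sizeof(pi);
 *
 *	sysctl(mib, 4, &pi, &len, NULL, 0);
 */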
int
sysctl_dopool(int *name, u_int namelen, char *oldp, size_t *oldlenp)
{
	struct kinfo_pool pi;
	struct pool *pp;
	int rv = ENOENT;

	switch (name[0]) {
	case KERN_POOL_NPOOLS:
		if (namelen != 1)
			return (ENOTDIR);
		return (sysctl_rdint(oldp, oldlenp, NULL, pool_count));

	case KERN_POOL_NAME:
	case KERN_POOL_POOL:
	case KERN_POOL_CACHE:
	case KERN_POOL_CACHE_CPUS:
		break;
	default:
		return (EOPNOTSUPP);
	}

	if (namelen != 2)
		return (ENOTDIR);

	rw_enter_read(&pool_lock);

	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) {
		if (name[1] == pp->pr_serial)
			break;
	}

	if (pp == NULL)
		goto done;

	switch (name[0]) {
	case KERN_POOL_NAME:
		rv = sysctl_rdstring(oldp, oldlenp, NULL, pp->pr_wchan);
		break;
	case KERN_POOL_POOL:
		memset(&pi, 0, sizeof(pi));

		pl_enter(pp, &pp->pr_lock);
		pi.pr_size = pp->pr_size;
		pi.pr_pgsize = pp->pr_pgsize;
		pi.pr_itemsperpage = pp->pr_itemsperpage;
		pi.pr_npages = pp->pr_npages;
		pi.pr_minpages = pp->pr_minpages;
		pi.pr_maxpages = pp->pr_maxpages;
		pi.pr_hardlimit = pp->pr_hardlimit;
		pi.pr_nout = pp->pr_nout;
		pi.pr_nitems = pp->pr_nitems;
		pi.pr_nget = pp->pr_nget;
		pi.pr_nput = pp->pr_nput;
		pi.pr_nfail = pp->pr_nfail;
		pi.pr_npagealloc = pp->pr_npagealloc;
		pi.pr_npagefree = pp->pr_npagefree;
		pi.pr_hiwat = pp->pr_hiwat;
		pi.pr_nidle = pp->pr_nidle;
		pl_leave(pp, &pp->pr_lock);

		pool_cache_pool_info(pp, &pi);

		rv = sysctl_rdstruct(oldp, oldlenp, NULL, &pi, sizeof(pi));
		break;

	case KERN_POOL_CACHE:
		rv = pool_cache_info(pp, oldp, oldlenp);
		break;

	case KERN_POOL_CACHE_CPUS:
		rv = pool_cache_cpus_info(pp, oldp, oldlenp);
		break;
	}

done:
	rw_exit_read(&pool_lock);

	return (rv);
}

void
pool_gc_sched(void *null)
{
	task_add(systqmp, &pool_gc_task);
}

void
pool_gc_pages(void *null)
{
	struct pool *pp;
	struct pool_page_header *ph, *freeph;
	int s;

	rw_enter_read(&pool_lock);
	s = splvm(); /* XXX go to splvm until all pools _setipl properly */
	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) {
#ifdef MULTIPROCESSOR
		if (pp->pr_cache != NULL)
			pool_cache_gc(pp);
#endif

		if (pp->pr_nidle <= pp->pr_minpages || /* guess */
		    !pl_enter_try(pp, &pp->pr_lock)) /* try */
			continue;

		/* is it time to free a page? */
		if (pp->pr_nidle > pp->pr_minpages &&
		    (ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL &&
		    getnsecuptime() - ph->ph_timestamp > POOL_WAIT_GC) {
			freeph = ph;
			pool_p_remove(pp, freeph);
		} else
			freeph = NULL;

		pl_leave(pp, &pp->pr_lock);

		if (freeph != NULL)
			pool_p_free(pp, freeph);
	}
	splx(s);
	rw_exit_read(&pool_lock);

	timeout_add_sec(&pool_gc_tick, 1);
}

/*
 * Pool backend allocators.
 */

void *
pool_allocator_alloc(struct pool *pp, int flags, int *slowdown)
{
	void *v;

	v = (*pp->pr_alloc->pa_alloc)(pp, flags, slowdown);

#ifdef DIAGNOSTIC
	if (v != NULL && POOL_INPGHDR(pp)) {
		vaddr_t addr = (vaddr_t)v;
		if ((addr & pp->pr_pgmask) != addr) {
			panic("%s: %s page address %p isn't aligned to %u",
			    __func__, pp->pr_wchan, v, pp->pr_pgsize);
		}
	}
#endif

	return (v);
}

void
pool_allocator_free(struct pool *pp, void *v)
{
	struct pool_allocator *pa = pp->pr_alloc;

	(*pa->pa_free)(pp, v);
}

void *
pool_page_alloc(struct pool *pp, int flags, int *slowdown)
{
	struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER;

	kd.kd_waitok = ISSET(flags, PR_WAITOK);
	kd.kd_slowdown = slowdown;

	return (km_alloc(pp->pr_pgsize, &kv_page, pp->pr_crange, &kd));
}

void
pool_page_free(struct pool *pp, void *v)
{
	km_free(v, pp->pr_pgsize, &kv_page, pp->pr_crange);
}

void *
pool_multi_alloc(struct pool *pp, int flags, int *slowdown)
{
	struct kmem_va_mode kv = kv_intrsafe;
	struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER;
	void *v;
	int s;

	if (POOL_INPGHDR(pp))
		kv.kv_align = pp->pr_pgsize;

	kd.kd_waitok = ISSET(flags, PR_WAITOK);
	kd.kd_slowdown = slowdown;

	s = splvm();
	v = km_alloc(pp->pr_pgsize, &kv, pp->pr_crange, &kd);
	splx(s);

	return (v);
}

void
pool_multi_free(struct pool *pp, void *v)
{
	struct kmem_va_mode kv = kv_intrsafe;
	int s;

	if (POOL_INPGHDR(pp))
		kv.kv_align = pp->pr_pgsize;

	s = splvm();
	km_free(v, pp->pr_pgsize, &kv, pp->pr_crange);
	splx(s);
}

void *
pool_multi_alloc_ni(struct pool *pp, int flags, int *slowdown)
{
	struct kmem_va_mode kv = kv_any;
	struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER;
	void *v;

	if (POOL_INPGHDR(pp))
		kv.kv_align = pp->pr_pgsize;

	kd.kd_waitok = ISSET(flags, PR_WAITOK);
	kd.kd_slowdown = slowdown;

	KERNEL_LOCK();
	v = km_alloc(pp->pr_pgsize, &kv, pp->pr_crange, &kd);
	KERNEL_UNLOCK();

	return (v);
}

void
pool_multi_free_ni(struct pool *pp, void *v)
{
	struct kmem_va_mode kv = kv_any;

	if (POOL_INPGHDR(pp))
		kv.kv_align = pp->pr_pgsize;

	KERNEL_LOCK();
	km_free(v, pp->pr_pgsize, &kv, pp->pr_crange);
	KERNEL_UNLOCK();
}

#ifdef MULTIPROCESSOR

struct pool pool_caches; /* per cpu cache entries */

void
pool_cache_init(struct pool *pp)
{
	struct cpumem *cm;
	struct pool_cache *pc;
	struct cpumem_iter i;

	if (pool_caches.pr_size == 0) {
		pool_init(&pool_caches, sizeof(struct pool_cache),
		    CACHELINESIZE, IPL_NONE, PR_WAITOK | PR_RWLOCK,
		    "plcache", NULL);
	}

	/* must be able to use the pool items as cache list items */
	KASSERT(pp->pr_size >= sizeof(struct pool_cache_item));

	cm = cpumem_get(&pool_caches);

	pl_init(pp, &pp->pr_cache_lock);
	arc4random_buf(pp->pr_cache_magic, sizeof(pp->pr_cache_magic));
	TAILQ_INIT(&pp->pr_cache_lists);
	pp->pr_cache_nitems = 0;
	pp->pr_cache_timestamp = getnsecuptime();
	pp->pr_cache_items = 8;
	pp->pr_cache_contention = 0;
	pp->pr_cache_ngc = 0;

	CPUMEM_FOREACH(pc, &i, cm) {
		pc->pc_actv = NULL;
		pc->pc_nactv = 0;
		pc->pc_prev = NULL;

		pc->pc_nget = 0;
		pc->pc_nfail = 0;
		pc->pc_nput = 0;
		pc->pc_nlget = 0;
		pc->pc_nlfail = 0;
		pc->pc_nlput = 0;
		pc->pc_nout = 0;
	}

	membar_producer();

	pp->pr_cache = cm;
}
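
/*
 * A sketch of how a subsystem opts in to per-CPU caching; "foopool"
 * is hypothetical and must already have been initialised:
 *
 *	pool_init(&foopool, sizeof(struct foo), 0, IPL_NONE,
 *	    PR_WAITOK, "foopl", NULL);
 *	pool_cache_init(&foopool);
 */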

static inline void
pool_cache_item_magic(struct pool *pp, struct pool_cache_item *ci)
{
	unsigned long *entry = (unsigned long *)&ci->ci_nextl;

	entry[0] = pp->pr_cache_magic[0] ^ (u_long)ci;
	entry[1] = pp->pr_cache_magic[1] ^ (u_long)ci->ci_next;
}

static inline void
pool_cache_item_magic_check(struct pool *pp, struct pool_cache_item *ci)
{
	unsigned long *entry;
	unsigned long val;

	entry = (unsigned long *)&ci->ci_nextl;
	val = pp->pr_cache_magic[0] ^ (u_long)ci;
	if (*entry != val)
		goto fail;

	entry++;
	val = pp->pr_cache_magic[1] ^ (u_long)ci->ci_next;
	if (*entry != val)
		goto fail;

	return;

fail:
	panic("%s: %s cpu free list modified: item addr %p+%zu 0x%lx!=0x%lx",
	    __func__, pp->pr_wchan, ci, (caddr_t)entry - (caddr_t)ci,
	    *entry, val);
}

static inline void
pool_list_enter(struct pool *pp)
{
	if (pl_enter_try(pp, &pp->pr_cache_lock) == 0) {
		pl_enter(pp, &pp->pr_cache_lock);
		pp->pr_cache_contention++;
	}
}

static inline void
pool_list_leave(struct pool *pp)
{
	pl_leave(pp, &pp->pr_cache_lock);
}

static inline struct pool_cache_item *
pool_cache_list_alloc(struct pool *pp, struct pool_cache *pc)
{
	struct pool_cache_item *pl;

	pool_list_enter(pp);
	pl = TAILQ_FIRST(&pp->pr_cache_lists);
	if (pl != NULL) {
		TAILQ_REMOVE(&pp->pr_cache_lists, pl, ci_nextl);
		pp->pr_cache_nitems -= POOL_CACHE_ITEM_NITEMS(pl);

		pool_cache_item_magic(pp, pl);

		pc->pc_nlget++;
	} else
		pc->pc_nlfail++;

	/* fold this CPU's nout into the global count while we have the lock */
	pp->pr_cache_nout += pc->pc_nout;
	pc->pc_nout = 0;
	pool_list_leave(pp);

	return (pl);
}

static inline void
pool_cache_list_free(struct pool *pp, struct pool_cache *pc,
    struct pool_cache_item *ci)
{
	pool_list_enter(pp);
	if (TAILQ_EMPTY(&pp->pr_cache_lists))
		pp->pr_cache_timestamp = getnsecuptime();

	pp->pr_cache_nitems += POOL_CACHE_ITEM_NITEMS(ci);
	TAILQ_INSERT_TAIL(&pp->pr_cache_lists, ci, ci_nextl);

	pc->pc_nlput++;

	/* fold this CPU's nout into the global count while we have the lock */
	pp->pr_cache_nout += pc->pc_nout;
	pc->pc_nout = 0;
	pool_list_leave(pp);
}

static inline struct pool_cache *
pool_cache_enter(struct pool *pp, int *s)
{
	struct pool_cache *pc;

	pc = cpumem_enter(pp->pr_cache);
	*s = splraise(pp->pr_ipl);
	pc->pc_gen++;

	return (pc);
}

static inline void
pool_cache_leave(struct pool *pp, struct pool_cache *pc, int s)
{
	pc->pc_gen++;
	splx(s);
	cpumem_leave(pp->pr_cache, pc);
}

void *
pool_cache_get(struct pool *pp)
{
	struct pool_cache *pc;
	struct pool_cache_item *ci;
	int s;

	pc = pool_cache_enter(pp, &s);

	if (pc->pc_actv != NULL) {
		ci = pc->pc_actv;
	} else if (pc->pc_prev != NULL) {
		ci = pc->pc_prev;
		pc->pc_prev = NULL;
	} else if ((ci = pool_cache_list_alloc(pp, pc)) == NULL) {
		pc->pc_nfail++;
		goto done;
	}

	pool_cache_item_magic_check(pp, ci);
#ifdef DIAGNOSTIC
	if (pool_debug && POOL_CACHE_ITEM_POISONED(ci)) {
		size_t pidx;
		uint32_t pval;

		if (poison_check(ci + 1, pp->pr_size - sizeof(*ci),
		    &pidx, &pval)) {
			int *ip = (int *)(ci + 1);
			ip += pidx;

			panic("%s: %s cpu free list modified: "
			    "item addr %p+%zu 0x%x!=0x%x",
			    __func__, pp->pr_wchan, ci,
			    (caddr_t)ip - (caddr_t)ci, *ip, pval);
		}
	}
#endif

	pc->pc_actv = ci->ci_next;
	pc->pc_nactv = POOL_CACHE_ITEM_NITEMS(ci) - 1;
	pc->pc_nget++;
	pc->pc_nout++;

done:
	pool_cache_leave(pp, pc, s);

	return (ci);
}

void
pool_cache_put(struct pool *pp, void *v)
{
	struct pool_cache *pc;
	struct pool_cache_item *ci = v;
	unsigned long nitems;
	int s;
#ifdef DIAGNOSTIC
	int poison = pool_debug && pp->pr_size > sizeof(*ci);

	if (poison)
		poison_mem(ci + 1, pp->pr_size - sizeof(*ci));
#endif

	pc = pool_cache_enter(pp, &s);

	nitems = pc->pc_nactv;
	if (nitems >= pp->pr_cache_items) {
		if (pc->pc_prev != NULL)
			pool_cache_list_free(pp, pc, pc->pc_prev);

		pc->pc_prev = pc->pc_actv;

		pc->pc_actv = NULL;
		pc->pc_nactv = 0;
		nitems = 0;
	}

	ci->ci_next = pc->pc_actv;
	ci->ci_nitems = ++nitems;
#ifdef DIAGNOSTIC
	ci->ci_nitems |= poison ? POOL_CACHE_ITEM_NITEMS_POISON : 0;
#endif
	pool_cache_item_magic(pp, ci);

	pc->pc_actv = ci;
	pc->pc_nactv = nitems;

	pc->pc_nput++;
	pc->pc_nout--;

	pool_cache_leave(pp, pc, s);
}

struct pool_cache_item *
pool_cache_list_put(struct pool *pp, struct pool_cache_item *pl)
{
	struct pool_cache_item *rpl, *next;

	if (pl == NULL)
		return (NULL);

	rpl = TAILQ_NEXT(pl, ci_nextl);

	pl_enter(pp, &pp->pr_lock);
	do {
		next = pl->ci_next;
		pool_do_put(pp, pl);
		pl = next;
	} while (pl != NULL);
	pl_leave(pp, &pp->pr_lock);

	return (rpl);
}

void
pool_cache_destroy(struct pool *pp)
{
	struct pool_cache *pc;
	struct pool_cache_item *pl;
	struct cpumem_iter i;
	struct cpumem *cm;

	rw_enter_write(&pool_lock); /* serialise with the gc */
	cm = pp->pr_cache;
	pp->pr_cache = NULL; /* make pool_put avoid the cache */
	rw_exit_write(&pool_lock);

	CPUMEM_FOREACH(pc, &i, cm) {
		pool_cache_list_put(pp, pc->pc_actv);
		pool_cache_list_put(pp, pc->pc_prev);
	}

	cpumem_put(&pool_caches, cm);

	pl = TAILQ_FIRST(&pp->pr_cache_lists);
	while (pl != NULL)
		pl = pool_cache_list_put(pp, pl);
}

void
pool_cache_gc(struct pool *pp)
{
	unsigned int contention, delta;

	if (getnsecuptime() - pp->pr_cache_timestamp > POOL_WAIT_GC &&
	    !TAILQ_EMPTY(&pp->pr_cache_lists) &&
	    pl_enter_try(pp, &pp->pr_cache_lock)) {
		struct pool_cache_item *pl = NULL;

		pl = TAILQ_FIRST(&pp->pr_cache_lists);
		if (pl != NULL) {
			TAILQ_REMOVE(&pp->pr_cache_lists, pl, ci_nextl);
			pp->pr_cache_nitems -= POOL_CACHE_ITEM_NITEMS(pl);
			pp->pr_cache_timestamp = getnsecuptime();

			pp->pr_cache_ngc++;
		}

		pl_leave(pp, &pp->pr_cache_lock);

		pool_cache_list_put(pp, pl);
	}

	/*
	 * If there's a lot of contention on pr_cache_lock then consider
	 * growing the length of the per-CPU lists to reduce the need to
	 * access the global pool.
	 */

	contention = pp->pr_cache_contention;
	delta = contention - pp->pr_cache_contention_prev;
	if (delta > 8 /* magic */) {
		if ((ncpusfound * 8 * 2) <= pp->pr_cache_nitems)
			pp->pr_cache_items += 8;
	} else if (delta == 0) {
		if (pp->pr_cache_items > 8)
			pp->pr_cache_items--;
	}
	pp->pr_cache_contention_prev = contention;
}
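
/*
 * Worked example of the heuristic above (numbers assumed): a GC pass
 * that saw contention grow by more than 8, with at least
 * ncpusfound * 8 * 2 items already idle on lists, grows each per-CPU
 * list target by 8 items; a pass with no new contention shrinks the
 * target by one, but never below the initial length of 8.
 */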

void
pool_cache_pool_info(struct pool *pp, struct kinfo_pool *pi)
{
	struct pool_cache *pc;
	struct cpumem_iter i;

	if (pp->pr_cache == NULL)
		return;

	/* loop through the caches twice to collect stats */

	/* once without the lock so we can yield while reading nget/nput */
	CPUMEM_FOREACH(pc, &i, pp->pr_cache) {
		uint64_t gen, nget, nput;

		do {
			while ((gen = pc->pc_gen) & 1)
				yield();

			nget = pc->pc_nget;
			nput = pc->pc_nput;
		} while (gen != pc->pc_gen);

		pi->pr_nget += nget;
		pi->pr_nput += nput;
	}

	/* and once with the lock held so we can get consistent nout values */
	pl_enter(pp, &pp->pr_cache_lock);
	CPUMEM_FOREACH(pc, &i, pp->pr_cache)
		pi->pr_nout += pc->pc_nout;

	pi->pr_nout += pp->pr_cache_nout;
	pl_leave(pp, &pp->pr_cache_lock);
}

int
pool_cache_info(struct pool *pp, void *oldp, size_t *oldlenp)
{
	struct kinfo_pool_cache kpc;

	if (pp->pr_cache == NULL)
		return (EOPNOTSUPP);

	memset(&kpc, 0, sizeof(kpc)); /* don't leak padding */

	pl_enter(pp, &pp->pr_cache_lock);
	kpc.pr_ngc = pp->pr_cache_ngc;
	kpc.pr_len = pp->pr_cache_items;
	kpc.pr_nitems = pp->pr_cache_nitems;
	kpc.pr_contention = pp->pr_cache_contention;
	pl_leave(pp, &pp->pr_cache_lock);

	return (sysctl_rdstruct(oldp, oldlenp, NULL, &kpc, sizeof(kpc)));
}

int
pool_cache_cpus_info(struct pool *pp, void *oldp, size_t *oldlenp)
{
	struct pool_cache *pc;
	struct kinfo_pool_cache_cpu *kpcc, *info;
	unsigned int cpu = 0;
	struct cpumem_iter i;
	int error = 0;
	size_t len;

	if (pp->pr_cache == NULL)
		return (EOPNOTSUPP);
	if (*oldlenp % sizeof(*kpcc))
		return (EINVAL);

	kpcc = mallocarray(ncpusfound, sizeof(*kpcc), M_TEMP,
	    M_WAITOK|M_CANFAIL|M_ZERO);
	if (kpcc == NULL)
		return (EIO);

	len = ncpusfound * sizeof(*kpcc);

	CPUMEM_FOREACH(pc, &i, pp->pr_cache) {
		uint64_t gen;

		if (cpu >= ncpusfound) {
			error = EIO;
			goto err;
		}

		info = &kpcc[cpu];
		info->pr_cpu = cpu;

		do {
			while ((gen = pc->pc_gen) & 1)
				yield();

			info->pr_nget = pc->pc_nget;
			info->pr_nfail = pc->pc_nfail;
			info->pr_nput = pc->pc_nput;
			info->pr_nlget = pc->pc_nlget;
			info->pr_nlfail = pc->pc_nlfail;
			info->pr_nlput = pc->pc_nlput;
		} while (gen != pc->pc_gen);

		cpu++;
	}

	error = sysctl_rdstruct(oldp, oldlenp, NULL, kpcc, len);
err:
	free(kpcc, M_TEMP, len);

	return (error);
}
#else /* MULTIPROCESSOR */
void
pool_cache_init(struct pool *pp)
{
	/* nop */
}

void
pool_cache_pool_info(struct pool *pp, struct kinfo_pool *pi)
{
	/* nop */
}

int
pool_cache_info(struct pool *pp, void *oldp, size_t *oldlenp)
{
	return (EOPNOTSUPP);
}

int
pool_cache_cpus_info(struct pool *pp, void *oldp, size_t *oldlenp)
{
	return (EOPNOTSUPP);
}
#endif /* MULTIPROCESSOR */


void
pool_lock_mtx_init(struct pool *pp, union pool_lock *lock,
    const struct lock_type *type)
{
	_mtx_init_flags(&lock->prl_mtx, pp->pr_ipl, pp->pr_wchan, 0, type);
}

void
pool_lock_mtx_enter(union pool_lock *lock)
{
	mtx_enter(&lock->prl_mtx);
}

int
pool_lock_mtx_enter_try(union pool_lock *lock)
{
	return (mtx_enter_try(&lock->prl_mtx));
}

void
pool_lock_mtx_leave(union pool_lock *lock)
{
	mtx_leave(&lock->prl_mtx);
}

void
pool_lock_mtx_assert_locked(union pool_lock *lock)
{
	MUTEX_ASSERT_LOCKED(&lock->prl_mtx);
}

void
pool_lock_mtx_assert_unlocked(union pool_lock *lock)
{
	MUTEX_ASSERT_UNLOCKED(&lock->prl_mtx);
}

int
pool_lock_mtx_sleep(void *ident, union pool_lock *lock, int priority,
    const char *wmesg)
{
	return msleep_nsec(ident, &lock->prl_mtx, priority, wmesg, INFSLP);
}

static const struct pool_lock_ops pool_lock_ops_mtx = {
	pool_lock_mtx_init,
	pool_lock_mtx_enter,
	pool_lock_mtx_enter_try,
	pool_lock_mtx_leave,
	pool_lock_mtx_assert_locked,
	pool_lock_mtx_assert_unlocked,
	pool_lock_mtx_sleep,
};

void
pool_lock_rw_init(struct pool *pp, union pool_lock *lock,
    const struct lock_type *type)
{
	_rw_init_flags(&lock->prl_rwlock, pp->pr_wchan, 0, type);
}

void
pool_lock_rw_enter(union pool_lock *lock)
{
	rw_enter_write(&lock->prl_rwlock);
}

int
pool_lock_rw_enter_try(union pool_lock *lock)
{
	return (rw_enter(&lock->prl_rwlock, RW_WRITE | RW_NOSLEEP) == 0);
}

void
pool_lock_rw_leave(union pool_lock *lock)
{
	rw_exit_write(&lock->prl_rwlock);
}

void
pool_lock_rw_assert_locked(union pool_lock *lock)
{
	rw_assert_wrlock(&lock->prl_rwlock);
}

void
pool_lock_rw_assert_unlocked(union pool_lock *lock)
{
	KASSERT(rw_status(&lock->prl_rwlock) != RW_WRITE);
}

int
pool_lock_rw_sleep(void *ident, union pool_lock *lock, int priority,
    const char *wmesg)
{
	return rwsleep_nsec(ident, &lock->prl_rwlock, priority, wmesg, INFSLP);
}

static const struct pool_lock_ops pool_lock_ops_rw = {
	pool_lock_rw_init,
	pool_lock_rw_enter,
	pool_lock_rw_enter_try,
	pool_lock_rw_leave,
	pool_lock_rw_assert_locked,
	pool_lock_rw_assert_unlocked,
	pool_lock_rw_sleep,
};
2305