xref: /openbsd-src/sys/kern/subr_pool.c (revision 50b7afb2c2c0993b0894d4e34bf857cb13ed9c80)
1 /*	$OpenBSD: subr_pool.c,v 1.138 2014/07/10 13:34:39 tedu Exp $	*/
2 /*	$NetBSD: subr_pool.c,v 1.61 2001/09/26 07:14:56 chs Exp $	*/
3 
4 /*-
5  * Copyright (c) 1997, 1999, 2000 The NetBSD Foundation, Inc.
6  * All rights reserved.
7  *
8  * This code is derived from software contributed to The NetBSD Foundation
9  * by Paul Kranenburg; by Jason R. Thorpe of the Numerical Aerospace
10  * Simulation Facility, NASA Ames Research Center.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
25  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/proc.h>
37 #include <sys/errno.h>
38 #include <sys/kernel.h>
39 #include <sys/malloc.h>
40 #include <sys/pool.h>
41 #include <sys/syslog.h>
42 #include <sys/sysctl.h>
43 
44 #include <uvm/uvm_extern.h>
45 #include <dev/rndvar.h>
46 
47 /*
48  * Pool resource management utility.
49  *
50  * Memory is allocated in pages which are split into pieces according to
51  * the pool item size. Each page is kept on one of three lists in the
52  * pool structure: `pr_emptypages', `pr_fullpages' and `pr_partpages',
53  * for empty, full and partially-full pages respectively. The individual
54  * pool items are on a linked list headed by `ph_itemlist' in each page
55  * header. The memory for each page header is either taken from the
56  * allocated page itself (for small pool items) or from an internal
57  * pool of page headers (`phpool').
58  */
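
/*
 * Typical usage (an illustrative sketch, not code from this file; the
 * pool name "examplepl" and the item structure are made up):
 *
 *	struct example { int e_val; };
 *	struct pool examplepl;
 *
 *	pool_init(&examplepl, sizeof(struct example), 0, 0, 0,
 *	    "examplepl", NULL);
 *	pool_setipl(&examplepl, IPL_NET);
 *
 *	struct example *ex = pool_get(&examplepl, PR_WAITOK | PR_ZERO);
 *	...
 *	pool_put(&examplepl, ex);
 *
 *	pool_destroy(&examplepl);
 *
 * PR_WAITOK sleeps until memory is available, PR_NOWAIT fails with NULL
 * instead, and PR_ZERO zeroes the returned item (see pool_get() below).
 */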
59 
60 /* List of all pools */
61 SIMPLEQ_HEAD(,pool) pool_head = SIMPLEQ_HEAD_INITIALIZER(pool_head);
62 
63 /* Private pool for page header structures */
64 struct pool phpool;
65 
66 struct pool_item_header {
67 	/* Page headers */
68 	LIST_ENTRY(pool_item_header)
69 				ph_pagelist;	/* pool page list */
70 	XSIMPLEQ_HEAD(,pool_item) ph_itemlist;	/* chunk list for this page */
71 	RB_ENTRY(pool_item_header)
72 				ph_node;	/* Off-page page headers */
73 	int			ph_nmissing;	/* # of chunks in use */
74 	caddr_t			ph_page;	/* this page's address */
75 	caddr_t			ph_colored;	/* page's colored address */
76 	int			ph_pagesize;
77 	int			ph_magic;
78 };
79 
80 struct pool_item {
81 	u_int32_t pi_magic;
82 	/* Other entries use only this list entry */
83 	XSIMPLEQ_ENTRY(pool_item)	pi_list;
84 };
85 
86 #ifdef POOL_DEBUG
87 int	pool_debug = 1;
88 #else
89 int	pool_debug = 0;
90 #endif
91 
92 #define	POOL_NEEDS_CATCHUP(pp)						\
93 	((pp)->pr_nitems < (pp)->pr_minitems)
94 
95 /*
96  * Every pool gets a unique serial number assigned to it. If this counter
97  * wraps, we're screwed, but we shouldn't create so many pools anyway.
98  */
99 unsigned int pool_serial;
100 
101 int	 pool_catchup(struct pool *);
102 void	 pool_prime_page(struct pool *, caddr_t, struct pool_item_header *);
103 void	 pool_update_curpage(struct pool *);
104 void	 pool_swizzle_curpage(struct pool *);
105 void	*pool_do_get(struct pool *, int);
106 void	 pool_do_put(struct pool *, void *);
107 void	 pr_rmpage(struct pool *, struct pool_item_header *,
108 	    struct pool_pagelist *);
109 int	 pool_chk_page(struct pool *, struct pool_item_header *, int);
110 int	 pool_chk(struct pool *);
111 struct pool_item_header *pool_alloc_item_header(struct pool *, caddr_t , int);
112 
113 void	*pool_allocator_alloc(struct pool *, int, int *);
114 void	 pool_allocator_free(struct pool *, void *);
115 
116 /*
117  * XXX - quick hack. For pools with large items we want to use a special
118  *       allocator. For now, instead of having the allocator figure out the
119  *       allocation size from the pool (which can be done trivially with
120  *       round_page(pr_itemsperpage * pr_size), but would require changes
121  *       everywhere), we just create an allocator for each size. We limit
122  *       those to 128 pages.
123  */
124 #define POOL_LARGE_MAXPAGES 128
125 struct pool_allocator pool_allocator_large[POOL_LARGE_MAXPAGES];
126 struct pool_allocator pool_allocator_large_ni[POOL_LARGE_MAXPAGES];
127 void	*pool_large_alloc(struct pool *, int, int *);
128 void	pool_large_free(struct pool *, void *);
129 void	*pool_large_alloc_ni(struct pool *, int, int *);
130 void	pool_large_free_ni(struct pool *, void *);
131 
132 
133 #ifdef DDB
134 void	 pool_print_pagelist(struct pool_pagelist *, int (*)(const char *, ...)
135 	     __attribute__((__format__(__kprintf__,1,2))));
136 void	 pool_print1(struct pool *, const char *, int (*)(const char *, ...)
137 	     __attribute__((__format__(__kprintf__,1,2))));
138 #endif
139 
140 #define pool_sleep(pl) msleep(pl, &pl->pr_mtx, PSWP, pl->pr_wchan, 0)
141 
142 static __inline int
143 phtree_compare(struct pool_item_header *a, struct pool_item_header *b)
144 {
145 	long diff = (vaddr_t)a->ph_page - (vaddr_t)b->ph_page;
146 	if (diff < 0)
147 		return -(-diff >= a->ph_pagesize);
148 	else if (diff > 0)
149 		return (diff >= b->ph_pagesize);
150 	else
151 		return (0);
152 }
153 
154 RB_PROTOTYPE(phtree, pool_item_header, ph_node, phtree_compare);
155 RB_GENERATE(phtree, pool_item_header, ph_node, phtree_compare);
156 
157 /*
158  * Return the pool page header based on page address.
159  */
160 static __inline struct pool_item_header *
161 pr_find_pagehead(struct pool *pp, void *v)
162 {
163 	struct pool_item_header *ph, tmp;
164 
165 	if ((pp->pr_roflags & PR_PHINPAGE) != 0) {
166 		caddr_t page;
167 
168 		page = (caddr_t)((vaddr_t)v & pp->pr_alloc->pa_pagemask);
169 
170 		return ((struct pool_item_header *)(page + pp->pr_phoffset));
171 	}
172 
173 	/*
174 	 * The trick we're using in the tree compare function is to consider
175 	 * two elements equal when they overlap. We want to return the header
176 	 * of the page that contains this address. We don't want the search
177 	 * key to compare equal to the next page's header, so the compare
178 	 * function takes the pagesize from the lower element. If the search
179 	 * key is the lower element, its pagesize is zero, so it can't overlap
180 	 * with the next header. But if the header we're looking for is the
181 	 * lower element, its real pagesize makes the two overlap and compare
182 	 * equal. (A worked example follows this function.)
183 	 */
184 	tmp.ph_page = v;
185 	tmp.ph_pagesize = 0;
186 	ph = RB_FIND(phtree, &pp->pr_phtree, &tmp);
187 
188 	if (ph) {
189 		KASSERT(ph->ph_page <= (caddr_t)v);
190 		KASSERT(ph->ph_page + ph->ph_pagesize > (caddr_t)v);
191 	}
192 	return ph;
193 }
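
/*
 * Worked example of the overlap trick above (illustrative addresses,
 * assuming a page at 0x1000 with ph_pagesize 0x1000): RB_FIND compares
 * the lookup key tmp (ph_pagesize 0) against each tree node.  Looking
 * up the item at 0x1234 gives diff = 0x234 > 0, and 0x234 >= 0x1000 is
 * false, so key and header compare equal and that header is returned.
 * Looking up 0x2100 gives diff = 0x1100 >= 0x1000, so the result is 1
 * and the search moves on to higher addresses.  When the key is the
 * lower element, its pagesize of zero guarantees it never overlaps the
 * next header.
 */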
194 
195 /*
196  * Remove a page from the pool.
197  */
198 void
199 pr_rmpage(struct pool *pp, struct pool_item_header *ph,
200     struct pool_pagelist *pq)
201 {
202 
203 	/*
204 	 * If the page was idle, decrement the idle page count.
205 	 */
206 	if (ph->ph_nmissing == 0) {
207 #ifdef DIAGNOSTIC
208 		if (pp->pr_nidle == 0)
209 			panic("pr_rmpage: nidle inconsistent");
210 		if (pp->pr_nitems < pp->pr_itemsperpage)
211 			panic("pr_rmpage: nitems inconsistent");
212 #endif
213 		pp->pr_nidle--;
214 	}
215 
216 	pp->pr_nitems -= pp->pr_itemsperpage;
217 
218 	/*
219 	 * Unlink a page from the pool and release it (or queue it for release).
220 	 */
221 	LIST_REMOVE(ph, ph_pagelist);
222 	if ((pp->pr_roflags & PR_PHINPAGE) == 0)
223 		RB_REMOVE(phtree, &pp->pr_phtree, ph);
224 	pp->pr_npages--;
225 	pp->pr_npagefree++;
226 	pool_update_curpage(pp);
227 
228 	if (pq) {
229 		LIST_INSERT_HEAD(pq, ph, ph_pagelist);
230 	} else {
231 		pool_allocator_free(pp, ph->ph_page);
232 		if ((pp->pr_roflags & PR_PHINPAGE) == 0)
233 			pool_put(&phpool, ph);
234 	}
235 }
236 
237 /*
238  * Initialize the given pool resource structure.
239  *
240  * We export this routine to allow other kernel parts to declare
241  * static pools that must be initialized before malloc() is available.
242  */
243 void
244 pool_init(struct pool *pp, size_t size, u_int align, u_int ioff, int flags,
245     const char *wchan, struct pool_allocator *palloc)
246 {
247 	int off, slack;
248 #ifdef DIAGNOSTIC
249 	struct pool *iter;
250 
251 	SIMPLEQ_FOREACH(iter, &pool_head, pr_poollist) {
252 		if (iter == pp)
253 			panic("init pool already on list");
254 	}
255 #endif
256 
257 #ifdef MALLOC_DEBUG
258 	if ((flags & PR_DEBUG) && (ioff != 0 || align != 0))
259 		flags &= ~PR_DEBUG;
260 #endif
261 	/*
262 	 * Check arguments and construct default values.
263 	 */
264 	if (palloc == NULL) {
265 		if (size > PAGE_SIZE) {
266 			int psize;
267 
268 			/*
269 			 * XXX - should take align into account as well.
270 			 */
271 			if (size == round_page(size))
272 				psize = size / PAGE_SIZE;
273 			else
274 				psize = PAGE_SIZE / roundup(size % PAGE_SIZE,
275 				    1024);
276 			if (psize > POOL_LARGE_MAXPAGES)
277 				psize = POOL_LARGE_MAXPAGES;
278 			if (flags & PR_WAITOK)
279 				palloc = &pool_allocator_large_ni[psize-1];
280 			else
281 				palloc = &pool_allocator_large[psize-1];
282 			if (palloc->pa_pagesz == 0) {
283 				palloc->pa_pagesz = psize * PAGE_SIZE;
284 				if (flags & PR_WAITOK) {
285 					palloc->pa_alloc = pool_large_alloc_ni;
286 					palloc->pa_free = pool_large_free_ni;
287 				} else {
288 					palloc->pa_alloc = pool_large_alloc;
289 					palloc->pa_free = pool_large_free;
290 				}
291 			}
292 		} else {
293 			palloc = &pool_allocator_nointr;
294 		}
295 	}
296 	if (palloc->pa_pagesz == 0) {
297 		palloc->pa_pagesz = PAGE_SIZE;
298 	}
299 	if (palloc->pa_pagemask == 0) {
300 		palloc->pa_pagemask = ~(palloc->pa_pagesz - 1);
301 		palloc->pa_pageshift = ffs(palloc->pa_pagesz) - 1;
302 	}
303 
304 	if (align == 0)
305 		align = ALIGN(1);
306 
307 	if (size < sizeof(struct pool_item))
308 		size = sizeof(struct pool_item);
309 
310 	size = roundup(size, align);
311 #ifdef DIAGNOSTIC
312 	if (size > palloc->pa_pagesz)
313 		panic("pool_init: pool item size (%lu) too large",
314 		    (u_long)size);
315 #endif
316 
317 	/*
318 	 * Initialize the pool structure.
319 	 */
320 	LIST_INIT(&pp->pr_emptypages);
321 	LIST_INIT(&pp->pr_fullpages);
322 	LIST_INIT(&pp->pr_partpages);
323 	pp->pr_curpage = NULL;
324 	pp->pr_npages = 0;
325 	pp->pr_minitems = 0;
326 	pp->pr_minpages = 0;
327 	pp->pr_maxpages = 8;
328 	pp->pr_roflags = flags;
329 	pp->pr_flags = 0;
330 	pp->pr_size = size;
331 	pp->pr_align = align;
332 	pp->pr_wchan = wchan;
333 	pp->pr_alloc = palloc;
334 	pp->pr_nitems = 0;
335 	pp->pr_nout = 0;
336 	pp->pr_hardlimit = UINT_MAX;
337 	pp->pr_hardlimit_warning = NULL;
338 	pp->pr_hardlimit_ratecap.tv_sec = 0;
339 	pp->pr_hardlimit_ratecap.tv_usec = 0;
340 	pp->pr_hardlimit_warning_last.tv_sec = 0;
341 	pp->pr_hardlimit_warning_last.tv_usec = 0;
342 	pp->pr_serial = ++pool_serial;
343 	if (pool_serial == 0)
344 		panic("pool_init: too much uptime");
345 
346 	/*
347 	 * Decide whether to put the page header off page to avoid
348 	 * wasting too large a part of the page. Off-page page headers
349 	 * go into an RB tree, so we can match a returned item with
350 	 * its header based on the page address.
351 	 * We use 1/16 of the page size as the threshold (XXX: tune)
352 	 */
353 	if (pp->pr_size < palloc->pa_pagesz/16 && pp->pr_size < PAGE_SIZE) {
354 		/* Use the end of the page for the page header */
355 		pp->pr_roflags |= PR_PHINPAGE;
356 		pp->pr_phoffset = off = palloc->pa_pagesz -
357 		    ALIGN(sizeof(struct pool_item_header));
358 	} else {
359 		/* The page header will be taken from our page header pool */
360 		pp->pr_phoffset = 0;
361 		off = palloc->pa_pagesz;
362 		RB_INIT(&pp->pr_phtree);
363 	}
364 
365 	/*
366 	 * Alignment is to take place at `ioff' within the item. This means
367 	 * we must reserve up to `align - 1' bytes on the page to allow
368 	 * appropriate positioning of each item.
369 	 *
370 	 * Silently enforce `0 <= ioff < align'.
371 	 */
372 	pp->pr_itemoffset = ioff = ioff % align;
373 	pp->pr_itemsperpage = (off - ((align - ioff) % align)) / pp->pr_size;
374 	KASSERT(pp->pr_itemsperpage != 0);
375 
376 	/*
377 	 * Use the slack between the chunks and the page header
378 	 * for "cache coloring".
379 	 */
380 	slack = off - pp->pr_itemsperpage * pp->pr_size;
381 	pp->pr_maxcolor = (slack / align) * align;
382 	pp->pr_curcolor = 0;
383 
384 	pp->pr_nget = 0;
385 	pp->pr_nfail = 0;
386 	pp->pr_nput = 0;
387 	pp->pr_npagealloc = 0;
388 	pp->pr_npagefree = 0;
389 	pp->pr_hiwat = 0;
390 	pp->pr_nidle = 0;
391 
392 	pp->pr_ipl = -1;
393 	mtx_init(&pp->pr_mtx, IPL_NONE);
394 
395 	if (phpool.pr_size == 0) {
396 		pool_init(&phpool, sizeof(struct pool_item_header), 0, 0,
397 		    0, "phpool", NULL);
398 		pool_setipl(&phpool, IPL_HIGH);
399 	}
400 
401 	/* pglistalloc/constraint parameters */
402 	pp->pr_crange = &kp_dirty;
403 
404 	/* Insert this into the list of all pools. */
405 	SIMPLEQ_INSERT_HEAD(&pool_head, pp, pr_poollist);
406 }
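
/*
 * Worked example of the sizing above (assumed numbers, for illustration
 * only): with a 4096-byte page, an in-page header of 48 bytes, a 96-byte
 * item, align = 8 and ioff = 0, pool_init() computes off = 4096 - 48 =
 * 4048, pr_itemsperpage = 4048 / 96 = 42, slack = 4048 - 42 * 96 = 16
 * and pr_maxcolor = 16.  pool_prime_page() then starts the items of
 * successive pages at color offsets 0, 8 and 16 before the color wraps
 * back to 0.
 */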
407 
408 void
409 pool_setipl(struct pool *pp, int ipl)
410 {
411 	pp->pr_ipl = ipl;
412 	mtx_init(&pp->pr_mtx, ipl);
413 }
414 
415 /*
416  * Decommission a pool resource.
417  */
418 void
419 pool_destroy(struct pool *pp)
420 {
421 	struct pool_item_header *ph;
422 	struct pool *prev, *iter;
423 
424 	/* Remove from global pool list */
425 	if (pp == SIMPLEQ_FIRST(&pool_head))
426 		SIMPLEQ_REMOVE_HEAD(&pool_head, pr_poollist);
427 	else {
428 		prev = SIMPLEQ_FIRST(&pool_head);
429 		SIMPLEQ_FOREACH(iter, &pool_head, pr_poollist) {
430 			if (iter == pp) {
431 				SIMPLEQ_REMOVE_AFTER(&pool_head, prev,
432 				    pr_poollist);
433 				goto removed;
434 			}
435 			prev = iter;
436 		}
437 #ifdef DIAGNOSTIC
438 		panic("destroyed pool not on list");
439 #endif
440 	}
441 removed:
442 #ifdef DIAGNOSTIC
443 	if (pp->pr_nout != 0)
444 		panic("pool_destroy: pool busy: still out: %u", pp->pr_nout);
445 #endif
446 
447 	/* Remove all pages */
448 	while ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL)
449 		pr_rmpage(pp, ph, NULL);
450 	KASSERT(LIST_EMPTY(&pp->pr_fullpages));
451 	KASSERT(LIST_EMPTY(&pp->pr_partpages));
452 
453 }
454 
455 struct pool_item_header *
456 pool_alloc_item_header(struct pool *pp, caddr_t storage, int flags)
457 {
458 	struct pool_item_header *ph;
459 
460 	if ((pp->pr_roflags & PR_PHINPAGE) != 0)
461 		ph = (struct pool_item_header *)(storage + pp->pr_phoffset);
462 	else
463 		ph = pool_get(&phpool, (flags & ~(PR_WAITOK | PR_ZERO)) |
464 		    PR_NOWAIT);
465 #ifdef DIAGNOSTIC
466 	if (pool_debug && ph != NULL)
467 		ph->ph_magic = poison_value(ph);
468 #endif
469 	return (ph);
470 }
471 
472 /*
473  * Grab an item from the pool; must be called at appropriate spl level
474  */
475 void *
476 pool_get(struct pool *pp, int flags)
477 {
478 	void *v;
479 
480 	KASSERT(flags & (PR_WAITOK | PR_NOWAIT));
481 
482 	if ((flags & PR_WAITOK) != 0) {
483 #ifdef DIAGNOSTIC
484 		assertwaitok();
485 		if (pool_debug == 2)
486 			yield();
487 #endif
488 		if (!cold && pool_debug) {
489 			KERNEL_UNLOCK();
490 			KERNEL_LOCK();
491 		}
492 	}
493 
494 	mtx_enter(&pp->pr_mtx);
495 #ifdef POOL_DEBUG
496 	if (pp->pr_roflags & PR_DEBUGCHK) {
497 		if (pool_chk(pp))
498 			panic("before pool_get");
499 	}
500 #endif
501 	v = pool_do_get(pp, flags);
502 #ifdef POOL_DEBUG
503 	if (pp->pr_roflags & PR_DEBUGCHK) {
504 		if (pool_chk(pp))
505 			panic("after pool_get");
506 	}
507 #endif
508 	if (v != NULL)
509 		pp->pr_nget++;
510 	mtx_leave(&pp->pr_mtx);
511 	if (v == NULL)
512 		return (v);
513 
514 	if (flags & PR_ZERO)
515 		memset(v, 0, pp->pr_size);
516 
517 	return (v);
518 }
519 
520 void *
521 pool_do_get(struct pool *pp, int flags)
522 {
523 	struct pool_item *pi;
524 	struct pool_item_header *ph;
525 	void *v;
526 	int slowdown = 0;
527 
528 #ifdef MALLOC_DEBUG
529 	if (pp->pr_roflags & PR_DEBUG) {
530 		void *addr;
531 
532 		addr = NULL;
533 		debug_malloc(pp->pr_size, M_DEBUG,
534 		    (flags & PR_WAITOK) ? M_WAITOK : M_NOWAIT, &addr);
535 		return (addr);
536 	}
537 #endif
538 
539 startover:
540 	/*
541 	 * Check to see if we've reached the hard limit.  If we have,
542 	 * and we can wait, then wait until an item has been returned to
543 	 * the pool.
544 	 */
545 #ifdef DIAGNOSTIC
546 	if (pp->pr_nout > pp->pr_hardlimit)
547 		panic("pool_do_get: %s: crossed hard limit", pp->pr_wchan);
548 #endif
549 	if (pp->pr_nout == pp->pr_hardlimit) {
550 		if ((flags & PR_WAITOK) && !(flags & PR_LIMITFAIL)) {
551 			/*
552 			 * XXX: A warning isn't logged in this case.  Should
553 			 * it be?
554 			 */
555 			pp->pr_flags |= PR_WANTED;
556 			pool_sleep(pp);
557 			goto startover;
558 		}
559 
560 		/*
561 		 * Log a message that the hard limit has been hit.
562 		 */
563 		if (pp->pr_hardlimit_warning != NULL &&
564 		    ratecheck(&pp->pr_hardlimit_warning_last,
565 		    &pp->pr_hardlimit_ratecap))
566 			log(LOG_ERR, "%s\n", pp->pr_hardlimit_warning);
567 
568 		pp->pr_nfail++;
569 		return (NULL);
570 	}
571 
572 	pool_swizzle_curpage(pp);
573 	/*
574 	 * The convention we use is that if `curpage' is not NULL, then
575 	 * it points at a non-empty bucket. In particular, `curpage'
576 	 * never points at a page header which has PR_PHINPAGE set and
577 	 * has no items in its bucket.
578 	 */
579 	if ((ph = pp->pr_curpage) == NULL) {
580 #ifdef DIAGNOSTIC
581 		if (pp->pr_nitems != 0) {
582 			printf("pool_do_get: %s: curpage NULL, nitems %u\n",
583 			    pp->pr_wchan, pp->pr_nitems);
584 			panic("pool_do_get: nitems inconsistent");
585 		}
586 #endif
587 
588 		/*
589 		 * Call the back-end page allocator for more memory.
590 		 */
591 		v = pool_allocator_alloc(pp, flags, &slowdown);
592 		if (v != NULL)
593 			ph = pool_alloc_item_header(pp, v, flags);
594 
595 		if (v == NULL || ph == NULL) {
596 			if (v != NULL)
597 				pool_allocator_free(pp, v);
598 
599 			if ((flags & PR_WAITOK) == 0) {
600 				pp->pr_nfail++;
601 				return (NULL);
602 			}
603 
604 			/*
605 			 * Wait for items to be returned to this pool.
606 			 *
607 			 * XXX: maybe we should wake up once a second and
608 			 * try again?
609 			 */
610 			pp->pr_flags |= PR_WANTED;
611 			pool_sleep(pp);
612 			goto startover;
613 		}
614 
615 		/* We have more memory; add it to the pool */
616 		pool_prime_page(pp, v, ph);
617 		pp->pr_npagealloc++;
618 
619 		if (slowdown && (flags & PR_WAITOK)) {
620 			mtx_leave(&pp->pr_mtx);
621 			yield();
622 			mtx_enter(&pp->pr_mtx);
623 		}
624 
625 		/* Start the allocation process over. */
626 		goto startover;
627 	}
628 	if ((v = pi = XSIMPLEQ_FIRST(&ph->ph_itemlist)) == NULL) {
629 		panic("pool_do_get: %s: page empty", pp->pr_wchan);
630 	}
631 #ifdef DIAGNOSTIC
632 	if (pp->pr_nitems == 0) {
633 		printf("pool_do_get: %s: items on itemlist, nitems %u\n",
634 		    pp->pr_wchan, pp->pr_nitems);
635 		panic("pool_do_get: nitems inconsistent");
636 	}
637 #endif
638 
639 #ifdef DIAGNOSTIC
640 	if (pi->pi_magic != poison_value(pi))
641 		panic("pool_do_get(%s): free list modified: "
642 		    "page %p; item addr %p; offset 0x%x=0x%x",
643 		    pp->pr_wchan, ph->ph_page, pi, 0, pi->pi_magic);
644 	if (pool_debug && ph->ph_magic) {
645 		size_t pidx;
646 		uint32_t pval;
647 		if (poison_check(pi + 1, pp->pr_size - sizeof(*pi),
648 		    &pidx, &pval)) {
649 			int *ip = (int *)(pi + 1);
650 			panic("pool_do_get(%s): free list modified: "
651 			    "page %p; item addr %p; offset 0x%zx=0x%x",
652 			    pp->pr_wchan, ph->ph_page, pi,
653 			    pidx * sizeof(int), ip[pidx]);
654 		}
655 	}
656 #endif /* DIAGNOSTIC */
657 
658 	/*
659 	 * Remove from item list.
660 	 */
661 	XSIMPLEQ_REMOVE_HEAD(&ph->ph_itemlist, pi_list);
662 	pp->pr_nitems--;
663 	pp->pr_nout++;
664 	if (ph->ph_nmissing == 0) {
665 #ifdef DIAGNOSTIC
666 		if (pp->pr_nidle == 0)
667 			panic("pool_do_get: nidle inconsistent");
668 #endif
669 		pp->pr_nidle--;
670 
671 		/*
672 		 * This page was previously empty.  Move it to the list of
673 		 * partially-full pages.  This page is already curpage.
674 		 */
675 		LIST_REMOVE(ph, ph_pagelist);
676 		LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist);
677 	}
678 	ph->ph_nmissing++;
679 	if (XSIMPLEQ_EMPTY(&ph->ph_itemlist)) {
680 #ifdef DIAGNOSTIC
681 		if (ph->ph_nmissing != pp->pr_itemsperpage) {
682 			panic("pool_do_get: %s: nmissing inconsistent",
683 			    pp->pr_wchan);
684 		}
685 #endif
686 		/*
687 		 * This page is now full.  Move it to the full list
688 		 * and select a new current page.
689 		 */
690 		LIST_REMOVE(ph, ph_pagelist);
691 		LIST_INSERT_HEAD(&pp->pr_fullpages, ph, ph_pagelist);
692 		pool_update_curpage(pp);
693 	}
694 
695 	/*
696 	 * If we have a low water mark and we are now below that low
697 	 * water mark, add more items to the pool.
698 	 */
699 	if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
700 		/*
701 		 * XXX: Should we log a warning?  Should we set up a timeout
702 		 * to try again in a second or so?  The latter could break
703 		 * a caller's assumptions about interrupt protection, etc.
704 		 */
705 	}
706 	return (v);
707 }
708 
709 /*
710  * Return resource to the pool; must be called at appropriate spl level
711  */
712 void
713 pool_put(struct pool *pp, void *v)
714 {
715 	mtx_enter(&pp->pr_mtx);
716 #ifdef POOL_DEBUG
717 	if (pp->pr_roflags & PR_DEBUGCHK) {
718 		if (pool_chk(pp))
719 			panic("before pool_put");
720 	}
721 #endif
722 	pool_do_put(pp, v);
723 #ifdef POOL_DEBUG
724 	if (pp->pr_roflags & PR_DEBUGCHK) {
725 		if (pool_chk(pp))
726 			panic("after pool_put");
727 	}
728 #endif
729 	pp->pr_nput++;
730 	mtx_leave(&pp->pr_mtx);
731 }
732 
733 /*
734  * Internal version of pool_put().
735  */
736 void
737 pool_do_put(struct pool *pp, void *v)
738 {
739 	struct pool_item *pi = v;
740 	struct pool_item_header *ph;
741 
742 	if (v == NULL)
743 		panic("pool_put of NULL");
744 
745 #ifdef MALLOC_DEBUG
746 	if (pp->pr_roflags & PR_DEBUG) {
747 		debug_free(v, M_DEBUG);
748 		return;
749 	}
750 #endif
751 
752 #ifdef DIAGNOSTIC
753 	if (pp->pr_ipl != -1)
754 		splassert(pp->pr_ipl);
755 
756 	if (pp->pr_nout == 0) {
757 		printf("pool %s: putting with none out\n",
758 		    pp->pr_wchan);
759 		panic("pool_do_put");
760 	}
761 #endif
762 
763 	if ((ph = pr_find_pagehead(pp, v)) == NULL) {
764 		panic("pool_do_put: %s: page header missing", pp->pr_wchan);
765 	}
766 
767 	/*
768 	 * Return to item list.
769 	 */
770 #ifdef DIAGNOSTIC
771 	if (pool_debug) {
772 		struct pool_item *qi;
773 		XSIMPLEQ_FOREACH(qi, &ph->ph_itemlist, pi_list)
774 			if (pi == qi)
775 				panic("double pool_put: %p", pi);
776 	}
777 	pi->pi_magic = poison_value(pi);
778 	if (ph->ph_magic) {
779 		poison_mem(pi + 1, pp->pr_size - sizeof(*pi));
780 	}
781 #endif /* DIAGNOSTIC */
782 
783 	XSIMPLEQ_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list);
784 	ph->ph_nmissing--;
785 	pp->pr_nitems++;
786 	pp->pr_nout--;
787 
788 	/* Cancel "pool empty" condition if it exists */
789 	if (pp->pr_curpage == NULL)
790 		pp->pr_curpage = ph;
791 
792 	if (pp->pr_flags & PR_WANTED) {
793 		pp->pr_flags &= ~PR_WANTED;
794 		wakeup(pp);
795 	}
796 
797 	/*
798 	 * If this page is now empty, do one of two things:
799 	 *
800 	 *	(1) If we have more pages than the page high water mark,
801 	 *	    free the page back to the system.
802 	 *
803 	 *	(2) Otherwise, move the page to the empty page list.
804 	 *
805 	 * Either way, select a new current page (so we use a partially-full
806 	 * page if one is available).
807 	 */
808 	if (ph->ph_nmissing == 0) {
809 		pp->pr_nidle++;
810 		if (pp->pr_npages > pp->pr_maxpages) {
811 			pr_rmpage(pp, ph, NULL);
812 		} else {
813 			LIST_REMOVE(ph, ph_pagelist);
814 			LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist);
815 			pool_update_curpage(pp);
816 		}
817 	}
818 	/*
819 	 * If the page was previously completely full, move it to the
820 	 * partially-full list.
821 	 */
822 	else if (ph->ph_nmissing == (pp->pr_itemsperpage - 1)) {
823 		LIST_REMOVE(ph, ph_pagelist);
824 		LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist);
825 	}
826 }
827 
828 /*
829  * Add N items to the pool.
830  */
831 int
832 pool_prime(struct pool *pp, int n)
833 {
834 	struct pool_item_header *ph;
835 	caddr_t cp;
836 	int newpages;
837 	int slowdown;
838 
839 	mtx_enter(&pp->pr_mtx);
840 	newpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
841 
842 	while (newpages-- > 0) {
843 		cp = pool_allocator_alloc(pp, PR_NOWAIT, &slowdown);
844 		if (cp != NULL)
845 			ph = pool_alloc_item_header(pp, cp, PR_NOWAIT);
846 		if (cp == NULL || ph == NULL) {
847 			if (cp != NULL)
848 				pool_allocator_free(pp, cp);
849 			break;
850 		}
851 
852 		pool_prime_page(pp, cp, ph);
853 		pp->pr_npagealloc++;
854 		pp->pr_minpages++;
855 	}
856 
857 	if (pp->pr_minpages >= pp->pr_maxpages)
858 		pp->pr_maxpages = pp->pr_minpages + 1;	/* XXX */
859 
860 	mtx_leave(&pp->pr_mtx);
861 	return (0);
862 }
863 
864 /*
865  * Add a page worth of items to the pool.
866  *
867  * Note, we must be called with the pool descriptor LOCKED.
868  */
869 void
870 pool_prime_page(struct pool *pp, caddr_t storage, struct pool_item_header *ph)
871 {
872 	struct pool_item *pi;
873 	caddr_t cp = storage;
874 	unsigned int align = pp->pr_align;
875 	unsigned int ioff = pp->pr_itemoffset;
876 	int n;
877 
878 	/*
879 	 * Insert page header.
880 	 */
881 	LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist);
882 	XSIMPLEQ_INIT(&ph->ph_itemlist);
883 	ph->ph_page = storage;
884 	ph->ph_pagesize = pp->pr_alloc->pa_pagesz;
885 	ph->ph_nmissing = 0;
886 	if ((pp->pr_roflags & PR_PHINPAGE) == 0)
887 		RB_INSERT(phtree, &pp->pr_phtree, ph);
888 
889 	pp->pr_nidle++;
890 
891 	/*
892 	 * Color this page.
893 	 */
894 	cp = (caddr_t)(cp + pp->pr_curcolor);
895 	if ((pp->pr_curcolor += align) > pp->pr_maxcolor)
896 		pp->pr_curcolor = 0;
897 
898 	/*
899 	 * Adjust storage to apply alignment to `pr_itemoffset' in each item.
900 	 */
901 	if (ioff != 0)
902 		cp = (caddr_t)(cp + (align - ioff));
903 	ph->ph_colored = cp;
904 
905 	/*
906 	 * Insert remaining chunks on the bucket list.
907 	 */
908 	n = pp->pr_itemsperpage;
909 	pp->pr_nitems += n;
910 
911 	while (n--) {
912 		pi = (struct pool_item *)cp;
913 
914 		KASSERT(((((vaddr_t)pi) + ioff) & (align - 1)) == 0);
915 
916 		/* Insert on page list */
917 		XSIMPLEQ_INSERT_TAIL(&ph->ph_itemlist, pi, pi_list);
918 
919 #ifdef DIAGNOSTIC
920 		pi->pi_magic = poison_value(pi);
921 		if (ph->ph_magic) {
922 			poison_mem(pi + 1, pp->pr_size - sizeof(*pi));
923 		}
924 #endif /* DIAGNOSTIC */
925 		cp = (caddr_t)(cp + pp->pr_size);
926 	}
927 
928 	/*
929 	 * If the pool was depleted, point at the new page.
930 	 */
931 	if (pp->pr_curpage == NULL)
932 		pp->pr_curpage = ph;
933 
934 	if (++pp->pr_npages > pp->pr_hiwat)
935 		pp->pr_hiwat = pp->pr_npages;
936 }
937 
938 /*
939  * Used by pool_get() when nitems drops below the low water mark, to bring
940  * pr_nitems back up to the low water mark.
941  *
942  * Note we never wait for memory here; we let the caller decide what to do.
943  */
944 int
945 pool_catchup(struct pool *pp)
946 {
947 	struct pool_item_header *ph;
948 	caddr_t cp;
949 	int error = 0;
950 	int slowdown;
951 
952 	while (POOL_NEEDS_CATCHUP(pp)) {
953 		/*
954 		 * Call the page back-end allocator for more memory.
955 		 */
956 		cp = pool_allocator_alloc(pp, PR_NOWAIT, &slowdown);
957 		if (cp != NULL)
958 			ph = pool_alloc_item_header(pp, cp, PR_NOWAIT);
959 		if (cp == NULL || ph == NULL) {
960 			if (cp != NULL)
961 				pool_allocator_free(pp, cp);
962 			error = ENOMEM;
963 			break;
964 		}
965 		pool_prime_page(pp, cp, ph);
966 		pp->pr_npagealloc++;
967 	}
968 
969 	return (error);
970 }
971 
972 void
973 pool_update_curpage(struct pool *pp)
974 {
975 
976 	pp->pr_curpage = LIST_FIRST(&pp->pr_partpages);
977 	if (pp->pr_curpage == NULL) {
978 		pp->pr_curpage = LIST_FIRST(&pp->pr_emptypages);
979 	}
980 }
981 
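/*
 * About one time in 16, move pr_curpage to a different partially-full
 * (or empty) page, so that allocations are spread over several pages
 * and the address pattern of consecutive pool_get()s is harder to
 * predict.
 */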
982 void
983 pool_swizzle_curpage(struct pool *pp)
984 {
985 	struct pool_item_header *ph, *next;
986 
987 	if ((ph = pp->pr_curpage) == NULL)
988 		return;
989 	if (arc4random_uniform(16) != 0)
990 		return;
991 	next = LIST_FIRST(&pp->pr_partpages);
992 	if (next == ph)
993 		next = LIST_NEXT(next, ph_pagelist);
994 	if (next == NULL) {
995 		next = LIST_FIRST(&pp->pr_emptypages);
996 		if (next == ph)
997 			next = LIST_NEXT(next, ph_pagelist);
998 	}
999 	if (next != NULL)
1000 		pp->pr_curpage = next;
1001 }
1002 
1003 void
1004 pool_setlowat(struct pool *pp, int n)
1005 {
1006 
1007 	pp->pr_minitems = n;
1008 	pp->pr_minpages = (n == 0)
1009 		? 0
1010 		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
1011 
1012 	mtx_enter(&pp->pr_mtx);
1013 	/* Make sure we're caught up with the newly-set low water mark. */
1014 	if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
1015 		/*
1016 		 * XXX: Should we log a warning?  Should we set up a timeout
1017 		 * to try again in a second or so?  The latter could break
1018 		 * a caller's assumptions about interrupt protection, etc.
1019 		 */
1020 	}
1021 	mtx_leave(&pp->pr_mtx);
1022 }
1023 
1024 void
1025 pool_sethiwat(struct pool *pp, int n)
1026 {
1027 
1028 	pp->pr_maxpages = (n == 0)
1029 		? 0
1030 		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
1031 }
1032 
1033 int
1034 pool_sethardlimit(struct pool *pp, u_int n, const char *warnmsg, int ratecap)
1035 {
1036 	int error = 0;
1037 
1038 	if (n < pp->pr_nout) {
1039 		error = EINVAL;
1040 		goto done;
1041 	}
1042 
1043 	pp->pr_hardlimit = n;
1044 	pp->pr_hardlimit_warning = warnmsg;
1045 	pp->pr_hardlimit_ratecap.tv_sec = ratecap;
1046 	pp->pr_hardlimit_warning_last.tv_sec = 0;
1047 	pp->pr_hardlimit_warning_last.tv_usec = 0;
1048 
1049 done:
1050 	return (error);
1051 }
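
/*
 * Example of limiting a pool (an illustrative sketch; the pool and the
 * numbers are made up): a subsystem that must never have more than 1024
 * items outstanding can combine the water marks with a hard limit and a
 * rate-limited warning, e.g.
 *
 *	pool_setlowat(&examplepl, 32);
 *	pool_sethiwat(&examplepl, 256);
 *	pool_sethardlimit(&examplepl, 1024,
 *	    "WARNING: examplepl limit reached", 60);
 *
 * Once pr_nout reaches the hard limit, pool_get() with PR_WAITOK sleeps
 * until an item is returned, while PR_NOWAIT or PR_LIMITFAIL callers
 * get NULL back and the warning is logged at most once per 60 seconds.
 */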
1052 
1053 void
1054 pool_set_constraints(struct pool *pp, const struct kmem_pa_mode *mode)
1055 {
1056 	pp->pr_crange = mode;
1057 }
1058 
1059 /*
1060  * Release all complete pages that have not been used recently.
1061  *
1062  * Returns non-zero if any pages have been reclaimed.
1063  */
1064 int
1065 pool_reclaim(struct pool *pp)
1066 {
1067 	struct pool_item_header *ph, *phnext;
1068 	struct pool_pagelist pq;
1069 
1070 	LIST_INIT(&pq);
1071 
1072 	mtx_enter(&pp->pr_mtx);
1073 	for (ph = LIST_FIRST(&pp->pr_emptypages); ph != NULL; ph = phnext) {
1074 		phnext = LIST_NEXT(ph, ph_pagelist);
1075 
1076 		/* Check our minimum page claim */
1077 		if (pp->pr_npages <= pp->pr_minpages)
1078 			break;
1079 
1080 		KASSERT(ph->ph_nmissing == 0);
1081 
1082 		/*
1083 		 * If freeing this page would put us below
1084 		 * the low water mark, stop now.
1085 		 */
1086 		if ((pp->pr_nitems - pp->pr_itemsperpage) <
1087 		    pp->pr_minitems)
1088 			break;
1089 
1090 		pr_rmpage(pp, ph, &pq);
1091 	}
1092 	mtx_leave(&pp->pr_mtx);
1093 
1094 	if (LIST_EMPTY(&pq))
1095 		return (0);
1096 	while ((ph = LIST_FIRST(&pq)) != NULL) {
1097 		LIST_REMOVE(ph, ph_pagelist);
1098 		pool_allocator_free(pp, ph->ph_page);
1099 		if (pp->pr_roflags & PR_PHINPAGE)
1100 			continue;
1101 		pool_put(&phpool, ph);
1102 	}
1103 
1104 	return (1);
1105 }
1106 
1107 /*
1108  * Release all complete pages that have not been used recently
1109  * from all pools.
1110  */
1111 void
1112 pool_reclaim_all(void)
1113 {
1114 	struct pool	*pp;
1115 	int		s;
1116 
1117 	s = splhigh();
1118 	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist)
1119 		pool_reclaim(pp);
1120 	splx(s);
1121 }
1122 
1123 #ifdef DDB
1124 #include <machine/db_machdep.h>
1125 #include <ddb/db_interface.h>
1126 #include <ddb/db_output.h>
1127 
1128 /*
1129  * Diagnostic helpers.
1130  */
1131 void
1132 pool_printit(struct pool *pp, const char *modif,
1133     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
1134 {
1135 	pool_print1(pp, modif, pr);
1136 }
1137 
1138 void
1139 pool_print_pagelist(struct pool_pagelist *pl,
1140     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
1141 {
1142 	struct pool_item_header *ph;
1143 #ifdef DIAGNOSTIC
1144 	struct pool_item *pi;
1145 #endif
1146 
1147 	LIST_FOREACH(ph, pl, ph_pagelist) {
1148 		(*pr)("\t\tpage %p, nmissing %d\n",
1149 		    ph->ph_page, ph->ph_nmissing);
1150 #ifdef DIAGNOSTIC
1151 		XSIMPLEQ_FOREACH(pi, &ph->ph_itemlist, pi_list) {
1152 			if (pi->pi_magic != poison_value(pi)) {
1153 				(*pr)("\t\t\titem %p, magic 0x%x\n",
1154 				    pi, pi->pi_magic);
1155 			}
1156 		}
1157 #endif
1158 	}
1159 }
1160 
1161 void
1162 pool_print1(struct pool *pp, const char *modif,
1163     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
1164 {
1165 	struct pool_item_header *ph;
1166 	int print_pagelist = 0;
1167 	char c;
1168 
1169 	while ((c = *modif++) != '\0') {
1170 		if (c == 'p')
1171 			print_pagelist = 1;
1173 	}
1174 
1175 	(*pr)("POOL %s: size %u, align %u, ioff %u, roflags 0x%08x\n",
1176 	    pp->pr_wchan, pp->pr_size, pp->pr_align, pp->pr_itemoffset,
1177 	    pp->pr_roflags);
1178 	(*pr)("\talloc %p\n", pp->pr_alloc);
1179 	(*pr)("\tminitems %u, minpages %u, maxpages %u, npages %u\n",
1180 	    pp->pr_minitems, pp->pr_minpages, pp->pr_maxpages, pp->pr_npages);
1181 	(*pr)("\titemsperpage %u, nitems %u, nout %u, hardlimit %u\n",
1182 	    pp->pr_itemsperpage, pp->pr_nitems, pp->pr_nout, pp->pr_hardlimit);
1183 
1184 	(*pr)("\n\tnget %lu, nfail %lu, nput %lu\n",
1185 	    pp->pr_nget, pp->pr_nfail, pp->pr_nput);
1186 	(*pr)("\tnpagealloc %lu, npagefree %lu, hiwat %u, nidle %lu\n",
1187 	    pp->pr_npagealloc, pp->pr_npagefree, pp->pr_hiwat, pp->pr_nidle);
1188 
1189 	if (print_pagelist == 0)
1190 		return;
1191 
1192 	if ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL)
1193 		(*pr)("\n\tempty page list:\n");
1194 	pool_print_pagelist(&pp->pr_emptypages, pr);
1195 	if ((ph = LIST_FIRST(&pp->pr_fullpages)) != NULL)
1196 		(*pr)("\n\tfull page list:\n");
1197 	pool_print_pagelist(&pp->pr_fullpages, pr);
1198 	if ((ph = LIST_FIRST(&pp->pr_partpages)) != NULL)
1199 		(*pr)("\n\tpartial-page list:\n");
1200 	pool_print_pagelist(&pp->pr_partpages, pr);
1201 
1202 	if (pp->pr_curpage == NULL)
1203 		(*pr)("\tno current page\n");
1204 	else
1205 		(*pr)("\tcurpage %p\n", pp->pr_curpage->ph_page);
1206 }
1207 
1208 void
1209 db_show_all_pools(db_expr_t expr, int haddr, db_expr_t count, char *modif)
1210 {
1211 	struct pool *pp;
1212 	char maxp[16];
1213 	int ovflw;
1214 	char mode;
1215 
1216 	mode = modif[0];
1217 	if (mode != '\0' && mode != 'a') {
1218 		db_printf("usage: show all pools [/a]\n");
1219 		return;
1220 	}
1221 
1222 	if (mode == '\0')
1223 		db_printf("%-10s%4s%9s%5s%9s%6s%6s%6s%6s%6s%6s%5s\n",
1224 		    "Name",
1225 		    "Size",
1226 		    "Requests",
1227 		    "Fail",
1228 		    "Releases",
1229 		    "Pgreq",
1230 		    "Pgrel",
1231 		    "Npage",
1232 		    "Hiwat",
1233 		    "Minpg",
1234 		    "Maxpg",
1235 		    "Idle");
1236 	else
1237 		db_printf("%-12s %18s %18s\n",
1238 		    "Name", "Address", "Allocator");
1239 
1240 	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) {
1241 		if (mode == 'a') {
1242 			db_printf("%-12s %18p %18p\n", pp->pr_wchan, pp,
1243 			    pp->pr_alloc);
1244 			continue;
1245 		}
1246 
1247 		if (!pp->pr_nget)
1248 			continue;
1249 
1250 		if (pp->pr_maxpages == UINT_MAX)
1251 			snprintf(maxp, sizeof maxp, "inf");
1252 		else
1253 			snprintf(maxp, sizeof maxp, "%u", pp->pr_maxpages);
1254 
1255 #define PRWORD(ovflw, fmt, width, fixed, val) do {	\
1256 	(ovflw) += db_printf((fmt),			\
1257 	    (width) - (fixed) - (ovflw) > 0 ?		\
1258 	    (width) - (fixed) - (ovflw) : 0,		\
1259 	    (val)) - (width);				\
1260 	if ((ovflw) < 0)				\
1261 		(ovflw) = 0;				\
1262 } while (/* CONSTCOND */0)
1263 
1264 		ovflw = 0;
1265 		PRWORD(ovflw, "%-*s", 10, 0, pp->pr_wchan);
1266 		PRWORD(ovflw, " %*u", 4, 1, pp->pr_size);
1267 		PRWORD(ovflw, " %*lu", 9, 1, pp->pr_nget);
1268 		PRWORD(ovflw, " %*lu", 5, 1, pp->pr_nfail);
1269 		PRWORD(ovflw, " %*lu", 9, 1, pp->pr_nput);
1270 		PRWORD(ovflw, " %*lu", 6, 1, pp->pr_npagealloc);
1271 		PRWORD(ovflw, " %*lu", 6, 1, pp->pr_npagefree);
1272 		PRWORD(ovflw, " %*d", 6, 1, pp->pr_npages);
1273 		PRWORD(ovflw, " %*d", 6, 1, pp->pr_hiwat);
1274 		PRWORD(ovflw, " %*d", 6, 1, pp->pr_minpages);
1275 		PRWORD(ovflw, " %*s", 6, 1, maxp);
1276 		PRWORD(ovflw, " %*lu\n", 5, 1, pp->pr_nidle);
1277 
1278 		pool_chk(pp);
1279 	}
1280 }
1281 #endif /* DDB */
1282 
1283 #if defined(POOL_DEBUG) || defined(DDB)
1284 int
1285 pool_chk_page(struct pool *pp, struct pool_item_header *ph, int expected)
1286 {
1287 	struct pool_item *pi;
1288 	caddr_t page;
1289 	int n;
1290 	const char *label = pp->pr_wchan;
1291 
1292 	page = (caddr_t)((u_long)ph & pp->pr_alloc->pa_pagemask);
1293 	if (page != ph->ph_page &&
1294 	    (pp->pr_roflags & PR_PHINPAGE) != 0) {
1295 		printf("%s: ", label);
1296 		printf("pool(%p:%s): page inconsistency: page %p; "
1297 		    "at page head addr %p (p %p)\n",
1298 		    pp, pp->pr_wchan, ph->ph_page, ph, page);
1299 		return 1;
1300 	}
1301 
1302 	for (pi = XSIMPLEQ_FIRST(&ph->ph_itemlist), n = 0;
1303 	     pi != NULL;
1304 	     pi = XSIMPLEQ_NEXT(&ph->ph_itemlist, pi, pi_list), n++) {
1305 
1306 #ifdef DIAGNOSTIC
1307 		if (pi->pi_magic != poison_value(pi)) {
1308 			printf("%s: ", label);
1309 			printf("pool(%s): free list modified: "
1310 			    "page %p; item ordinal %d; addr %p "
1311 			    "(p %p); offset 0x%x=0x%x\n",
1312 			    pp->pr_wchan, ph->ph_page, n, pi, page,
1313 			    0, pi->pi_magic);
1314 		}
1315 		if (pool_debug && ph->ph_magic) {
1316 			size_t pidx;
1317 			uint32_t pval;
1318 			if (poison_check(pi + 1, pp->pr_size - sizeof(*pi),
1319 			    &pidx, &pval)) {
1320 				int *ip = (int *)(pi + 1);
1321 				printf("pool(%s): free list modified: "
1322 				    "page %p; item ordinal %d; addr %p "
1323 				    "(p %p); offset 0x%zx=0x%x\n",
1324 				    pp->pr_wchan, ph->ph_page, n, pi,
1325 				    page, pidx * sizeof(int), ip[pidx]);
1326 			}
1327 		}
1328 #endif /* DIAGNOSTIC */
1329 		page =
1330 		    (caddr_t)((u_long)pi & pp->pr_alloc->pa_pagemask);
1331 		if (page == ph->ph_page)
1332 			continue;
1333 
1334 		printf("%s: ", label);
1335 		printf("pool(%p:%s): page inconsistency: page %p;"
1336 		    " item ordinal %d; addr %p (p %p)\n", pp,
1337 		    pp->pr_wchan, ph->ph_page, n, pi, page);
1338 		return 1;
1339 	}
1340 	if (n + ph->ph_nmissing != pp->pr_itemsperpage) {
1341 		printf("pool(%p:%s): page inconsistency: page %p;"
1342 		    " %d on list, %d missing, %d items per page\n", pp,
1343 		    pp->pr_wchan, ph->ph_page, n, ph->ph_nmissing,
1344 		    pp->pr_itemsperpage);
1345 		return 1;
1346 	}
1347 	if (expected >= 0 && n != expected) {
1348 		printf("pool(%p:%s): page inconsistency: page %p;"
1349 		    " %d on list, %d missing, %d expected\n", pp,
1350 		    pp->pr_wchan, ph->ph_page, n, ph->ph_nmissing,
1351 		    expected);
1352 		return 1;
1353 	}
1354 	return 0;
1355 }
1356 
1357 int
1358 pool_chk(struct pool *pp)
1359 {
1360 	struct pool_item_header *ph;
1361 	int r = 0;
1362 
1363 	LIST_FOREACH(ph, &pp->pr_emptypages, ph_pagelist)
1364 		r += pool_chk_page(pp, ph, pp->pr_itemsperpage);
1365 	LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist)
1366 		r += pool_chk_page(pp, ph, 0);
1367 	LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist)
1368 		r += pool_chk_page(pp, ph, -1);
1369 
1370 	return (r);
1371 }
1372 #endif /* defined(POOL_DEBUG) || defined(DDB) */
1373 
1374 #ifdef DDB
1375 void
1376 pool_walk(struct pool *pp, int full,
1377     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))),
1378     void (*func)(void *, int, int (*)(const char *, ...)
1379 	    __attribute__((__format__(__kprintf__,1,2)))))
1380 {
1381 	struct pool_item_header *ph;
1382 	struct pool_item *pi;
1383 	caddr_t cp;
1384 	int n;
1385 
1386 	LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) {
1387 		cp = ph->ph_colored;
1388 		n = ph->ph_nmissing;
1389 
1390 		while (n--) {
1391 			func(cp, full, pr);
1392 			cp += pp->pr_size;
1393 		}
1394 	}
1395 
1396 	LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist) {
1397 		cp = ph->ph_colored;
1398 		n = ph->ph_nmissing;
1399 
1400 		do {
1401 			XSIMPLEQ_FOREACH(pi, &ph->ph_itemlist, pi_list) {
1402 				if (cp == (caddr_t)pi)
1403 					break;
1404 			}
1405 			if (cp != (caddr_t)pi) {
1406 				func(cp, full, pr);
1407 				n--;
1408 			}
1409 
1410 			cp += pp->pr_size;
1411 		} while (n > 0);
1412 	}
1413 }
1414 #endif
1415 
1416 /*
1417  * We have three different sysctls:
1418  * kern.pool.npools - the number of pools.
1419  * kern.pool.pool.<pool#> - the kinfo_pool struct for pool <pool#>.
1420  * kern.pool.name.<pool#> - the name of pool <pool#>.
1421  */
1422 int
1423 sysctl_dopool(int *name, u_int namelen, char *where, size_t *sizep)
1424 {
1425 	struct kinfo_pool pi;
1426 	struct pool *pp;
1427 	size_t buflen = where != NULL ? *sizep : 0;
1428 	int npools = 0, s;
1429 	unsigned int lookfor;
1430 	size_t len;
1431 
1432 	switch (*name) {
1433 	case KERN_POOL_NPOOLS:
1434 		if (namelen != 1 || buflen != sizeof(int))
1435 			return (EINVAL);
1436 		lookfor = 0;
1437 		break;
1438 	case KERN_POOL_NAME:
1439 		if (namelen != 2 || buflen < 1)
1440 			return (EINVAL);
1441 		lookfor = name[1];
1442 		break;
1443 	case KERN_POOL_POOL:
1444 		if (namelen != 2 || buflen != sizeof(pi))
1445 			return (EINVAL);
1446 		lookfor = name[1];
1447 		break;
1448 	default:
1449 		return (EINVAL);
1450 	}
1451 
1452 	s = splvm();
1453 
1454 	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) {
1455 		npools++;
1456 		if (lookfor == pp->pr_serial)
1457 			break;
1458 	}
1459 
1460 	splx(s);
1461 
1462 	if (*name != KERN_POOL_NPOOLS && pp == NULL)
1463 		return (ENOENT);
1464 
1465 	switch (*name) {
1466 	case KERN_POOL_NPOOLS:
1467 		return copyout(&npools, where, buflen);
1468 	case KERN_POOL_NAME:
1469 		len = strlen(pp->pr_wchan) + 1;
1470 		if (*sizep < len)
1471 			return (ENOMEM);
1472 		*sizep = len;
1473 		return copyout(pp->pr_wchan, where, len);
1474 	case KERN_POOL_POOL:
1475 		memset(&pi, 0, sizeof(pi));
1476 		pi.pr_size = pp->pr_size;
1477 		pi.pr_pgsize = pp->pr_alloc->pa_pagesz;
1478 		pi.pr_itemsperpage = pp->pr_itemsperpage;
1479 		pi.pr_minpages = pp->pr_minpages;
1480 		pi.pr_maxpages = pp->pr_maxpages;
1481 		pi.pr_hardlimit = pp->pr_hardlimit;
1482 		pi.pr_nout = pp->pr_nout;
1483 		pi.pr_nitems = pp->pr_nitems;
1484 		pi.pr_nget = pp->pr_nget;
1485 		pi.pr_nput = pp->pr_nput;
1486 		pi.pr_nfail = pp->pr_nfail;
1487 		pi.pr_npagealloc = pp->pr_npagealloc;
1488 		pi.pr_npagefree = pp->pr_npagefree;
1489 		pi.pr_hiwat = pp->pr_hiwat;
1490 		pi.pr_nidle = pp->pr_nidle;
1491 		return copyout(&pi, where, buflen);
1492 	}
1493 	/* NOTREACHED */
1494 	return (0); /* XXX - Stupid gcc */
1495 }
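
/*
 * Reading these sysctls from userland, as systat(1) and vmstat(8) do (a
 * sketch, assuming the usual CTL_KERN/KERN_POOL mib prefix routes here
 * and that <sys/pool.h> provides struct kinfo_pool; error handling
 * omitted):
 *
 *	#include <sys/param.h>
 *	#include <sys/sysctl.h>
 *	#include <sys/pool.h>
 *
 *	int mib[4] = { CTL_KERN, KERN_POOL, KERN_POOL_NPOOLS, 0 };
 *	int npools;
 *	size_t len = sizeof(npools);
 *	sysctl(mib, 3, &npools, &len, NULL, 0);
 *
 *	struct kinfo_pool pi;
 *	mib[2] = KERN_POOL_POOL;
 *	mib[3] = 1;
 *	len = sizeof(pi);
 *	sysctl(mib, 4, &pi, &len, NULL, 0);
 *
 * Pool serial numbers start at 1 (see pool_init() above) and are not
 * reused.
 */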
1496 
1497 /*
1498  * Pool backend allocators.
1499  *
1500  * Each pool has a backend allocator that handles allocation and deallocation.
1501  */
1502 void	*pool_page_alloc(struct pool *, int, int *);
1503 void	pool_page_free(struct pool *, void *);
1504 
1505 /*
1506  * Safe for interrupts; the name is preserved for compatibility.  This is
1507  * the default allocator.
1508  */
1509 struct pool_allocator pool_allocator_nointr = {
1510 	pool_page_alloc, pool_page_free, 0,
1511 };
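
/*
 * A pool that needs its own backing store can pass a private allocator
 * to pool_init() instead of NULL.  An illustrative sketch (the names are
 * made up; a pa_pagesz of 0 is filled in with PAGE_SIZE by pool_init()):
 *
 *	void *example_page_alloc(struct pool *, int, int *);
 *	void  example_page_free(struct pool *, void *);
 *
 *	struct pool_allocator example_allocator = {
 *		example_page_alloc, example_page_free, 0,
 *	};
 *
 *	pool_init(&examplepl, sizeof(struct example), 0, 0, 0,
 *	    "examplepl", &example_allocator);
 *
 * For PR_WAITOK requests pa_alloc() is called with the pool mutex
 * dropped (see pool_allocator_alloc() below), and it may set *slowdown
 * to ask waiting callers to yield.
 */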
1512 
1513 /*
1514  * XXX - we have at least three different resources for the same allocation
1515  *  and each resource can be depleted. First we have the ready elements in
1516  *  the pool. Then we have the resource (typically a vm_map) for this
1517  *  allocator, then we have physical memory. Waiting for any of these can
1518  *  be unnecessary when any other is freed, but the kernel doesn't support
1519  *  sleeping on multiple addresses, so we have to fake. The caller sleeps on
1520  *  the pool (so that we can be awakened when an item is returned to the pool),
1521  *  but we set PA_WANT on the allocator. When a page is returned to
1522  *  the allocator and PA_WANT is set pool_allocator_free will wakeup all
1523  *  sleeping pools belonging to this allocator. (XXX - thundering herd).
1524  *  We also wake up the allocator in case someone without a pool (malloc)
1525  *  is sleeping waiting for this allocator.
1526  */
1527 
1528 void *
1529 pool_allocator_alloc(struct pool *pp, int flags, int *slowdown)
1530 {
1531 	int waitok = flags & PR_WAITOK;
1532 	void *v;
1533 
1534 	if (waitok)
1535 		mtx_leave(&pp->pr_mtx);
1536 	v = pp->pr_alloc->pa_alloc(pp, flags, slowdown);
1537 	if (waitok)
1538 		mtx_enter(&pp->pr_mtx);
1539 
1540 	return (v);
1541 }
1542 
1543 void
1544 pool_allocator_free(struct pool *pp, void *v)
1545 {
1546 	struct pool_allocator *pa = pp->pr_alloc;
1547 
1548 	(*pa->pa_free)(pp, v);
1549 }
1550 
1551 void *
1552 pool_page_alloc(struct pool *pp, int flags, int *slowdown)
1553 {
1554 	struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER;
1555 
1556 	kd.kd_waitok = (flags & PR_WAITOK);
1557 	kd.kd_slowdown = slowdown;
1558 
1559 	return (km_alloc(PAGE_SIZE, &kv_page, pp->pr_crange, &kd));
1560 }
1561 
1562 void
1563 pool_page_free(struct pool *pp, void *v)
1564 {
1565 	km_free(v, PAGE_SIZE, &kv_page, pp->pr_crange);
1566 }
1567 
1568 void *
1569 pool_large_alloc(struct pool *pp, int flags, int *slowdown)
1570 {
1571 	struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER;
1572 	void *v;
1573 	int s;
1574 
1575 	kd.kd_waitok = (flags & PR_WAITOK);
1576 	kd.kd_slowdown = slowdown;
1577 
1578 	s = splvm();
1579 	v = km_alloc(pp->pr_alloc->pa_pagesz, &kv_intrsafe, pp->pr_crange,
1580 	    &kd);
1581 	splx(s);
1582 
1583 	return (v);
1584 }
1585 
1586 void
1587 pool_large_free(struct pool *pp, void *v)
1588 {
1589 	int s;
1590 
1591 	s = splvm();
1592 	km_free(v, pp->pr_alloc->pa_pagesz, &kv_intrsafe, pp->pr_crange);
1593 	splx(s);
1594 }
1595 
1596 void *
1597 pool_large_alloc_ni(struct pool *pp, int flags, int *slowdown)
1598 {
1599 	struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER;
1600 
1601 	kd.kd_waitok = (flags & PR_WAITOK);
1602 	kd.kd_slowdown = slowdown;
1603 
1604 	return (km_alloc(pp->pr_alloc->pa_pagesz, &kv_any, pp->pr_crange, &kd));
1605 }
1606 
1607 void
1608 pool_large_free_ni(struct pool *pp, void *v)
1609 {
1610 	km_free(v, pp->pr_alloc->pa_pagesz, &kv_any, pp->pr_crange);
1611 }
1612