/*	$OpenBSD: subr_pool.c,v 1.93 2010/06/27 03:03:48 thib Exp $	*/
/*	$NetBSD: subr_pool.c,v 1.61 2001/09/26 07:14:56 chs Exp $	*/

/*-
 * Copyright (c) 1997, 1999, 2000 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Paul Kranenburg; by Jason R. Thorpe of the Numerical Aerospace
 * Simulation Facility, NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>

#include <uvm/uvm.h>


/*
 * Pool resource management utility.
 *
 * Memory is allocated in pages which are split into pieces according to
 * the pool item size. Each page is kept on one of three lists in the
 * pool structure: `pr_emptypages', `pr_fullpages' and `pr_partpages',
 * for empty, full and partially-full pages respectively. The individual
 * pool items are on a linked list headed by `ph_itemlist' in each page
 * header. The memory for building the page list is either taken from
 * the allocated pages themselves (for small pool items) or taken from
 * an internal pool of page headers (`phpool').
 */
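
/*
 * Illustrative example (not from the original code): a typical consumer
 * initializes a pool once and then gets and puts items.  The names
 * "foo" and "foo_pool" are hypothetical.
 *
 *	struct foo { int f_state; };
 *	struct pool foo_pool;
 *
 *	pool_init(&foo_pool, sizeof(struct foo), 0, 0, 0, "foopl", NULL);
 *
 *	struct foo *f = pool_get(&foo_pool, PR_WAITOK);
 *	...
 *	pool_put(&foo_pool, f);
 */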

/* List of all pools */
TAILQ_HEAD(,pool) pool_head = TAILQ_HEAD_INITIALIZER(pool_head);

/* Private pool for page header structures */
struct pool phpool;

struct pool_item_header {
	/* Page headers */
	LIST_ENTRY(pool_item_header)
				ph_pagelist;	/* pool page list */
	TAILQ_HEAD(,pool_item)	ph_itemlist;	/* chunk list for this page */
	RB_ENTRY(pool_item_header)
				ph_node;	/* Off-page page headers */
	int			ph_nmissing;	/* # of chunks in use */
	caddr_t			ph_page;	/* this page's address */
	caddr_t			ph_colored;	/* page's colored address */
	int			ph_pagesize;
};

struct pool_item {
#ifdef DIAGNOSTIC
	u_int32_t pi_magic;
#endif
	/* Other entries use only this list entry */
	TAILQ_ENTRY(pool_item)	pi_list;
};

#ifdef DEADBEEF1
#define	PI_MAGIC DEADBEEF1
#else
#define	PI_MAGIC 0xdeafbeef
#endif

#define	POOL_NEEDS_CATCHUP(pp)						\
	((pp)->pr_nitems < (pp)->pr_minitems)

/*
 * Default constraint range for pools, covering the whole
 * address space.
 */
struct uvm_constraint_range	pool_full_range = { 0x0, (paddr_t)-1 };

/*
 * Every pool gets a unique serial number assigned to it. If this counter
 * wraps, we're screwed, but we shouldn't create so many pools anyway.
 */
unsigned int pool_serial;

int	 pool_catchup(struct pool *);
void	 pool_prime_page(struct pool *, caddr_t, struct pool_item_header *);
void	 pool_update_curpage(struct pool *);
void	*pool_do_get(struct pool *, int);
void	 pool_do_put(struct pool *, void *);
void	 pr_rmpage(struct pool *, struct pool_item_header *,
	    struct pool_pagelist *);
int	pool_chk_page(struct pool *, const char *, struct pool_item_header *);
struct pool_item_header *pool_alloc_item_header(struct pool *, caddr_t, int);

void	*pool_allocator_alloc(struct pool *, int, int *);
void	 pool_allocator_free(struct pool *, void *);

/*
 * XXX - quick hack. For pools with large items we want to use a special
 *       allocator. For now, instead of having the allocator figure out
 *       the allocation size from the pool (trivially computed as
 *       round_page(pr_itemsperpage * pr_size)), which would require
 *       lots of changes everywhere, we just create an allocator for
 *       each size. We limit those to 128 pages.
 */
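
/*
 * Worked example of the page-count heuristic in pool_init() below,
 * assuming PAGE_SIZE is 4096: for an item size of 5000 bytes,
 * size % PAGE_SIZE == 904, which rounds up to 1024, giving
 * psize = 4096 / 1024 = 4 pages.  A 16384-byte allocation then holds
 * three 5000-byte items with 1384 bytes of slack, bounding the waste
 * per allocation.  A page-multiple size such as 8192 simply gets
 * psize = size / PAGE_SIZE = 2.
 */
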
#define POOL_LARGE_MAXPAGES 128
struct pool_allocator pool_allocator_large[POOL_LARGE_MAXPAGES];
struct pool_allocator pool_allocator_large_ni[POOL_LARGE_MAXPAGES];
void	*pool_large_alloc(struct pool *, int, int *);
void	pool_large_free(struct pool *, void *);
void	*pool_large_alloc_ni(struct pool *, int, int *);
void	pool_large_free_ni(struct pool *, void *);


#ifdef DDB
void	 pool_print_pagelist(struct pool_pagelist *,
	    int (*)(const char *, ...));
void	 pool_print1(struct pool *, const char *, int (*)(const char *, ...));
#endif

#define pool_sleep(pl) msleep(pl, &pl->pr_mtx, PSWP, pl->pr_wchan, 0)

static __inline int
phtree_compare(struct pool_item_header *a, struct pool_item_header *b)
{
	long diff = (vaddr_t)a->ph_page - (vaddr_t)b->ph_page;
	if (diff < 0)
		return -(-diff >= a->ph_pagesize);
	else if (diff > 0)
		return (diff >= b->ph_pagesize);
	else
		return (0);
}

RB_PROTOTYPE(phtree, pool_item_header, ph_node, phtree_compare);
RB_GENERATE(phtree, pool_item_header, ph_node, phtree_compare);

/*
 * Return the pool page header based on page address.
 */
static __inline struct pool_item_header *
pr_find_pagehead(struct pool *pp, void *v)
{
	struct pool_item_header *ph, tmp;

	if ((pp->pr_roflags & PR_PHINPAGE) != 0) {
		caddr_t page;

		page = (caddr_t)((vaddr_t)v & pp->pr_alloc->pa_pagemask);

		return ((struct pool_item_header *)(page + pp->pr_phoffset));
	}

	/*
	 * The trick we're using in the tree compare function is to compare
	 * two elements equal when they overlap. We want to return the
	 * page header that belongs to the element just before this address.
	 * We don't want this element to compare equal to the next element,
	 * so the compare function takes the pagesize from the lower element.
	 * If this header is the lower, its pagesize is zero, so it can't
	 * overlap with the next header. But if the header we're looking for
	 * is lower, we'll use its pagesize and it will overlap and return
	 * equal.
	 */
	tmp.ph_page = v;
	tmp.ph_pagesize = 0;
	ph = RB_FIND(phtree, &pp->pr_phtree, &tmp);

	if (ph) {
		KASSERT(ph->ph_page <= (caddr_t)v);
		KASSERT(ph->ph_page + ph->ph_pagesize > (caddr_t)v);
	}
	return ph;
}
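
/*
 * Concrete walk-through of the overlap trick above (addresses made up):
 * suppose a tree header ph has ph_page == 0x1000 and ph_pagesize ==
 * 0x1000, and we look up the item address v == 0x1500.  The search key
 * is { ph_page = 0x1500, ph_pagesize = 0 }.  Comparing the key against
 * ph, diff == 0x1500 - 0x1000 == 0x500 > 0, and since 0x500 is less
 * than ph's pagesize (0x1000) phtree_compare() returns 0: the key
 * overlaps the header's page, so RB_FIND() hands back ph, the header
 * owning v.
 */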

/*
 * Remove a page from the pool.
 */
void
pr_rmpage(struct pool *pp, struct pool_item_header *ph,
    struct pool_pagelist *pq)
{

	/*
	 * If the page was idle, decrement the idle page count.
	 */
	if (ph->ph_nmissing == 0) {
#ifdef DIAGNOSTIC
		if (pp->pr_nidle == 0)
			panic("pr_rmpage: nidle inconsistent");
		if (pp->pr_nitems < pp->pr_itemsperpage)
			panic("pr_rmpage: nitems inconsistent");
#endif
		pp->pr_nidle--;
	}

	pp->pr_nitems -= pp->pr_itemsperpage;

	/*
	 * Unlink a page from the pool and release it (or queue it for release).
	 */
	LIST_REMOVE(ph, ph_pagelist);
	if ((pp->pr_roflags & PR_PHINPAGE) == 0)
		RB_REMOVE(phtree, &pp->pr_phtree, ph);
	if (pq) {
		LIST_INSERT_HEAD(pq, ph, ph_pagelist);
	} else {
		pool_allocator_free(pp, ph->ph_page);
		if ((pp->pr_roflags & PR_PHINPAGE) == 0)
			pool_put(&phpool, ph);
	}
	pp->pr_npages--;
	pp->pr_npagefree++;

	pool_update_curpage(pp);
}

/*
 * Initialize the given pool resource structure.
 *
 * We export this routine to allow other kernel parts to declare
 * static pools that must be initialized before malloc() is available.
 */
void
pool_init(struct pool *pp, size_t size, u_int align, u_int ioff, int flags,
    const char *wchan, struct pool_allocator *palloc)
{
	int off, slack;

#ifdef MALLOC_DEBUG
	if ((flags & PR_DEBUG) && (ioff != 0 || align != 0))
		flags &= ~PR_DEBUG;
#endif
	/*
	 * Check arguments and construct default values.
	 */
	if (palloc == NULL) {
		if (size > PAGE_SIZE) {
			int psize;

			/*
			 * XXX - should take align into account as well.
			 */
			if (size == round_page(size))
				psize = size / PAGE_SIZE;
			else
				psize = PAGE_SIZE / roundup(size % PAGE_SIZE,
				    1024);
			if (psize > POOL_LARGE_MAXPAGES)
				psize = POOL_LARGE_MAXPAGES;
			if (flags & PR_WAITOK)
				palloc = &pool_allocator_large_ni[psize-1];
			else
				palloc = &pool_allocator_large[psize-1];
			if (palloc->pa_pagesz == 0) {
				palloc->pa_pagesz = psize * PAGE_SIZE;
				if (flags & PR_WAITOK) {
					palloc->pa_alloc = pool_large_alloc_ni;
					palloc->pa_free = pool_large_free_ni;
				} else {
					palloc->pa_alloc = pool_large_alloc;
					palloc->pa_free = pool_large_free;
				}
			}
		} else {
			palloc = &pool_allocator_nointr;
		}
	}
	if (palloc->pa_pagesz == 0) {
		palloc->pa_pagesz = PAGE_SIZE;
	}
	if (palloc->pa_pagemask == 0) {
		palloc->pa_pagemask = ~(palloc->pa_pagesz - 1);
		palloc->pa_pageshift = ffs(palloc->pa_pagesz) - 1;
	}

	if (align == 0)
		align = ALIGN(1);

	if (size < sizeof(struct pool_item))
		size = sizeof(struct pool_item);

	size = roundup(size, align);
#ifdef DIAGNOSTIC
	if (size > palloc->pa_pagesz)
		panic("pool_init: pool item size (%lu) too large",
		    (u_long)size);
#endif

	/*
	 * Initialize the pool structure.
	 */
	LIST_INIT(&pp->pr_emptypages);
	LIST_INIT(&pp->pr_fullpages);
	LIST_INIT(&pp->pr_partpages);
	pp->pr_curpage = NULL;
	pp->pr_npages = 0;
	pp->pr_minitems = 0;
	pp->pr_minpages = 0;
	pp->pr_maxpages = 8;
	pp->pr_roflags = flags;
	pp->pr_flags = 0;
	pp->pr_size = size;
	pp->pr_align = align;
	pp->pr_wchan = wchan;
	pp->pr_alloc = palloc;
	pp->pr_nitems = 0;
	pp->pr_nout = 0;
	pp->pr_hardlimit = UINT_MAX;
	pp->pr_hardlimit_warning = NULL;
	pp->pr_hardlimit_ratecap.tv_sec = 0;
	pp->pr_hardlimit_ratecap.tv_usec = 0;
	pp->pr_hardlimit_warning_last.tv_sec = 0;
	pp->pr_hardlimit_warning_last.tv_usec = 0;
	pp->pr_serial = ++pool_serial;
	if (pool_serial == 0)
		panic("pool_init: too much uptime");

	/* constructor, destructor, and arg */
	pp->pr_ctor = NULL;
	pp->pr_dtor = NULL;
	pp->pr_arg = NULL;

	/*
	 * Decide whether to put the page header off page to avoid
	 * wasting too large a part of the page. Off-page page headers
	 * go into an RB tree, so we can match a returned item with
	 * its header based on the page address.
	 * We use 1/16 of the page size as the threshold (XXX: tune)
	 */
	if (pp->pr_size < palloc->pa_pagesz/16 && pp->pr_size < PAGE_SIZE) {
		/* Use the end of the page for the page header */
		pp->pr_roflags |= PR_PHINPAGE;
		pp->pr_phoffset = off = palloc->pa_pagesz -
		    ALIGN(sizeof(struct pool_item_header));
	} else {
		/* The page header will be taken from our page header pool */
		pp->pr_phoffset = 0;
		off = palloc->pa_pagesz;
		RB_INIT(&pp->pr_phtree);
	}

	/*
	 * Alignment is to take place at `ioff' within the item. This means
	 * we must reserve up to `align - 1' bytes on the page to allow
	 * appropriate positioning of each item.
	 *
	 * Silently enforce `0 <= ioff < align'.
	 */
	pp->pr_itemoffset = ioff = ioff % align;
	pp->pr_itemsperpage = (off - ((align - ioff) % align)) / pp->pr_size;
	KASSERT(pp->pr_itemsperpage != 0);

	/*
	 * Use the slack between the chunks and the page header
	 * for "cache coloring".
	 */
	slack = off - pp->pr_itemsperpage * pp->pr_size;
	pp->pr_maxcolor = (slack / align) * align;
	pp->pr_curcolor = 0;

	pp->pr_nget = 0;
	pp->pr_nfail = 0;
	pp->pr_nput = 0;
	pp->pr_npagealloc = 0;
	pp->pr_npagefree = 0;
	pp->pr_hiwat = 0;
	pp->pr_nidle = 0;

	pp->pr_ipl = -1;
	mtx_init(&pp->pr_mtx, IPL_NONE);

	if (phpool.pr_size == 0) {
		pool_init(&phpool, sizeof(struct pool_item_header), 0, 0,
		    0, "phpool", NULL);
		pool_setipl(&phpool, IPL_HIGH);
	}

	/* pglistalloc/constraint parameters */
	pp->pr_crange = &pool_full_range;
	pp->pr_pa_nsegs = 0;

	/* Insert this into the list of all pools. */
	TAILQ_INSERT_HEAD(&pool_head, pp, pr_poollist);
}
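
/*
 * Illustrative example (not from the original code): a subsystem whose
 * pool is used from interrupt context initializes the pool and then
 * sets the IPL, so the pool mutex is re-initialized at the right level.
 * The pool and name below are hypothetical.
 *
 *	pool_init(&rxd_pool, sizeof(struct rxd), 0, 0, 0, "rxdpl", NULL);
 *	pool_setipl(&rxd_pool, IPL_BIO);
 */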

void
pool_setipl(struct pool *pp, int ipl)
{
	pp->pr_ipl = ipl;
	mtx_init(&pp->pr_mtx, ipl);
}

/*
 * Decommission a pool resource.
 */
void
pool_destroy(struct pool *pp)
{
	struct pool_item_header *ph;

#ifdef DIAGNOSTIC
	if (pp->pr_nout != 0)
		panic("pool_destroy: pool busy: still out: %u", pp->pr_nout);
#endif

	/* Remove all pages */
	while ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL)
		pr_rmpage(pp, ph, NULL);
	KASSERT(LIST_EMPTY(&pp->pr_fullpages));
	KASSERT(LIST_EMPTY(&pp->pr_partpages));

	/* Remove from global pool list */
	TAILQ_REMOVE(&pool_head, pp, pr_poollist);
}

struct pool_item_header *
pool_alloc_item_header(struct pool *pp, caddr_t storage, int flags)
{
	struct pool_item_header *ph;

	if ((pp->pr_roflags & PR_PHINPAGE) != 0)
		ph = (struct pool_item_header *)(storage + pp->pr_phoffset);
	else
		ph = pool_get(&phpool, flags & ~(PR_WAITOK | PR_ZERO));

	return (ph);
}

/*
 * Grab an item from the pool; must be called at appropriate spl level
 */
void *
pool_get(struct pool *pp, int flags)
{
	void *v;

#ifdef DIAGNOSTIC
	if ((flags & PR_WAITOK) != 0)
		splassert(IPL_NONE);
#endif /* DIAGNOSTIC */

	mtx_enter(&pp->pr_mtx);
	v = pool_do_get(pp, flags);
	mtx_leave(&pp->pr_mtx);
	if (v == NULL)
		return (v);

	if (pp->pr_ctor) {
		if (flags & PR_ZERO)
			panic("pool_get: PR_ZERO when ctor set");
		if (pp->pr_ctor(pp->pr_arg, v, flags)) {
			mtx_enter(&pp->pr_mtx);
			pool_do_put(pp, v);
			mtx_leave(&pp->pr_mtx);
			v = NULL;
		}
	} else {
		if (flags & PR_ZERO)
			memset(v, 0, pp->pr_size);
	}
	if (v != NULL)
		pp->pr_nget++;
	return (v);
}
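
/*
 * Illustrative example (not from the original code): callers that
 * cannot sleep pass PR_NOWAIT and must handle a NULL return; PR_WAITOK
 * callers may sleep, but can still see NULL on a hard-limit failure
 * with PR_LIMITFAIL or when the constructor fails.  PR_ZERO returns
 * the item zeroed and is only valid when no constructor is set.
 *
 *	struct foo *f;
 *
 *	if ((f = pool_get(&foo_pool, PR_NOWAIT | PR_ZERO)) == NULL)
 *		return (ENOMEM);
 */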

void *
pool_do_get(struct pool *pp, int flags)
{
	struct pool_item *pi;
	struct pool_item_header *ph;
	void *v;
	int slowdown = 0;
#if defined(DIAGNOSTIC) && defined(POOL_DEBUG)
	int i, *ip;
#endif

#ifdef MALLOC_DEBUG
	if (pp->pr_roflags & PR_DEBUG) {
		void *addr;

		addr = NULL;
		debug_malloc(pp->pr_size, M_DEBUG,
		    (flags & PR_WAITOK) ? M_WAITOK : M_NOWAIT, &addr);
		return (addr);
	}
#endif

startover:
	/*
	 * Check to see if we've reached the hard limit.  If we have,
	 * and we can wait, then wait until an item has been returned to
	 * the pool.
	 */
#ifdef DIAGNOSTIC
	if (__predict_false(pp->pr_nout > pp->pr_hardlimit))
		panic("pool_do_get: %s: crossed hard limit", pp->pr_wchan);
#endif
	if (__predict_false(pp->pr_nout == pp->pr_hardlimit)) {
		if ((flags & PR_WAITOK) && !(flags & PR_LIMITFAIL)) {
			/*
			 * XXX: A warning isn't logged in this case.  Should
			 * it be?
			 */
			pp->pr_flags |= PR_WANTED;
			pool_sleep(pp);
			goto startover;
		}

		/*
		 * Log a message that the hard limit has been hit.
		 */
		if (pp->pr_hardlimit_warning != NULL &&
		    ratecheck(&pp->pr_hardlimit_warning_last,
		    &pp->pr_hardlimit_ratecap))
			log(LOG_ERR, "%s\n", pp->pr_hardlimit_warning);

		pp->pr_nfail++;
		return (NULL);
	}

	/*
	 * The convention we use is that if `curpage' is not NULL, then
	 * it points at a non-empty bucket. In particular, `curpage'
	 * never points at a page header which has PR_PHINPAGE set and
	 * has no items in its bucket.
	 */
	if ((ph = pp->pr_curpage) == NULL) {
#ifdef DIAGNOSTIC
		if (pp->pr_nitems != 0) {
			printf("pool_do_get: %s: curpage NULL, nitems %u\n",
			    pp->pr_wchan, pp->pr_nitems);
			panic("pool_do_get: nitems inconsistent");
		}
#endif

		/*
		 * Call the back-end page allocator for more memory.
		 */
		v = pool_allocator_alloc(pp, flags, &slowdown);
		if (__predict_true(v != NULL))
			ph = pool_alloc_item_header(pp, v, flags);

		if (__predict_false(v == NULL || ph == NULL)) {
			if (v != NULL)
				pool_allocator_free(pp, v);

			if ((flags & PR_WAITOK) == 0) {
				pp->pr_nfail++;
				return (NULL);
			}

			/*
			 * Wait for items to be returned to this pool.
			 *
			 * XXX: maybe we should wake up once a second and
			 * try again?
			 */
			pp->pr_flags |= PR_WANTED;
			pool_sleep(pp);
			goto startover;
		}

		/* We have more memory; add it to the pool */
		pool_prime_page(pp, v, ph);
		pp->pr_npagealloc++;

		if (slowdown && (flags & PR_WAITOK)) {
			mtx_leave(&pp->pr_mtx);
			yield();
			mtx_enter(&pp->pr_mtx);
		}

		/* Start the allocation process over. */
		goto startover;
	}
	if (__predict_false((v = pi = TAILQ_FIRST(&ph->ph_itemlist)) == NULL)) {
		panic("pool_do_get: %s: page empty", pp->pr_wchan);
	}
#ifdef DIAGNOSTIC
	if (__predict_false(pp->pr_nitems == 0)) {
		printf("pool_do_get: %s: items on itemlist, nitems %u\n",
		    pp->pr_wchan, pp->pr_nitems);
		panic("pool_do_get: nitems inconsistent");
	}
#endif

#ifdef DIAGNOSTIC
	if (__predict_false(pi->pi_magic != PI_MAGIC))
		panic("pool_do_get(%s): free list modified: "
		    "page %p; item addr %p; offset 0x%x=0x%x",
		    pp->pr_wchan, ph->ph_page, pi, 0, pi->pi_magic);
#ifdef POOL_DEBUG
	for (ip = (int *)pi, i = sizeof(*pi) / sizeof(int);
	    i < pp->pr_size / sizeof(int); i++) {
		if (ip[i] != PI_MAGIC) {
			panic("pool_do_get(%s): free list modified: "
			    "page %p; item addr %p; offset 0x%x=0x%x",
			    pp->pr_wchan, ph->ph_page, pi,
			    i * sizeof(int), ip[i]);
		}
	}
#endif /* POOL_DEBUG */
#endif /* DIAGNOSTIC */

	/*
	 * Remove from item list.
	 */
	TAILQ_REMOVE(&ph->ph_itemlist, pi, pi_list);
	pp->pr_nitems--;
	pp->pr_nout++;
	if (ph->ph_nmissing == 0) {
#ifdef DIAGNOSTIC
		if (__predict_false(pp->pr_nidle == 0))
			panic("pool_do_get: nidle inconsistent");
#endif
		pp->pr_nidle--;

		/*
		 * This page was previously empty.  Move it to the list of
		 * partially-full pages.  This page is already curpage.
		 */
		LIST_REMOVE(ph, ph_pagelist);
		LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist);
	}
	ph->ph_nmissing++;
	if (TAILQ_EMPTY(&ph->ph_itemlist)) {
#ifdef DIAGNOSTIC
		if (__predict_false(ph->ph_nmissing != pp->pr_itemsperpage)) {
			panic("pool_do_get: %s: nmissing inconsistent",
			    pp->pr_wchan);
		}
#endif
		/*
		 * This page is now full.  Move it to the full list
		 * and select a new current page.
		 */
		LIST_REMOVE(ph, ph_pagelist);
		LIST_INSERT_HEAD(&pp->pr_fullpages, ph, ph_pagelist);
		pool_update_curpage(pp);
	}

	/*
	 * If we have a low water mark and we are now below that low
	 * water mark, add more items to the pool.
	 */
	if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
		/*
		 * XXX: Should we log a warning?  Should we set up a timeout
		 * to try again in a second or so?  The latter could break
		 * a caller's assumptions about interrupt protection, etc.
		 */
	}
	return (v);
}

/*
 * Return resource to the pool; must be called at appropriate spl level
 */
void
pool_put(struct pool *pp, void *v)
{
	if (pp->pr_dtor)
		pp->pr_dtor(pp->pr_arg, v);
	mtx_enter(&pp->pr_mtx);
	pool_do_put(pp, v);
	mtx_leave(&pp->pr_mtx);
	pp->pr_nput++;
}

/*
 * Internal version of pool_put().
 */
void
pool_do_put(struct pool *pp, void *v)
{
	struct pool_item *pi = v;
	struct pool_item_header *ph;
#if defined(DIAGNOSTIC) && defined(POOL_DEBUG)
	int i, *ip;
#endif

	if (v == NULL)
		panic("pool_put of NULL");

#ifdef MALLOC_DEBUG
	if (pp->pr_roflags & PR_DEBUG) {
		debug_free(v, M_DEBUG);
		return;
	}
#endif

#ifdef DIAGNOSTIC
	if (pp->pr_ipl != -1)
		splassert(pp->pr_ipl);

	if (__predict_false(pp->pr_nout == 0)) {
		printf("pool %s: putting with none out\n",
		    pp->pr_wchan);
		panic("pool_do_put");
	}
#endif

	if (__predict_false((ph = pr_find_pagehead(pp, v)) == NULL)) {
		panic("pool_do_put: %s: page header missing", pp->pr_wchan);
	}

	/*
	 * Return to item list.
	 */
#ifdef DIAGNOSTIC
	pi->pi_magic = PI_MAGIC;
#ifdef POOL_DEBUG
	for (ip = (int *)pi, i = sizeof(*pi)/sizeof(int);
	    i < pp->pr_size / sizeof(int); i++)
		ip[i] = PI_MAGIC;
#endif /* POOL_DEBUG */
#endif /* DIAGNOSTIC */

	TAILQ_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list);
	ph->ph_nmissing--;
	pp->pr_nitems++;
	pp->pr_nout--;

	/* Cancel "pool empty" condition if it exists */
	if (pp->pr_curpage == NULL)
		pp->pr_curpage = ph;

	if (pp->pr_flags & PR_WANTED) {
		pp->pr_flags &= ~PR_WANTED;
		if (ph->ph_nmissing == 0)
			pp->pr_nidle++;
		wakeup(pp);
		return;
	}

	/*
	 * If this page is now empty, do one of two things:
	 *
	 *	(1) If we have more pages than the page high water mark,
	 *	    free the page back to the system.
	 *
	 *	(2) Otherwise, move the page to the empty page list.
	 *
	 * Either way, select a new current page (so we use a partially-full
	 * page if one is available).
	 */
	if (ph->ph_nmissing == 0) {
		pp->pr_nidle++;
		if (pp->pr_nidle > pp->pr_maxpages) {
			pr_rmpage(pp, ph, NULL);
		} else {
			LIST_REMOVE(ph, ph_pagelist);
			LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist);
		}
		pool_update_curpage(pp);
	}

	/*
	 * If the page was previously completely full, move it to the
	 * partially-full list and make it the current page.  The next
	 * allocation will get the item from this page, instead of
	 * further fragmenting the pool.
	 */
	else if (ph->ph_nmissing == (pp->pr_itemsperpage - 1)) {
		LIST_REMOVE(ph, ph_pagelist);
		LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist);
		pp->pr_curpage = ph;
	}
}

/*
 * Add N items to the pool.
 */
int
pool_prime(struct pool *pp, int n)
{
	struct pool_item_header *ph;
	caddr_t cp;
	int newpages;
	int slowdown;

	mtx_enter(&pp->pr_mtx);
	newpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;

	while (newpages-- > 0) {
		cp = pool_allocator_alloc(pp, PR_NOWAIT, &slowdown);
		if (__predict_true(cp != NULL))
			ph = pool_alloc_item_header(pp, cp, PR_NOWAIT);
		if (__predict_false(cp == NULL || ph == NULL)) {
			if (cp != NULL)
				pool_allocator_free(pp, cp);
			break;
		}

		pool_prime_page(pp, cp, ph);
		pp->pr_npagealloc++;
		pp->pr_minpages++;
	}

	if (pp->pr_minpages >= pp->pr_maxpages)
		pp->pr_maxpages = pp->pr_minpages + 1;	/* XXX */

	mtx_leave(&pp->pr_mtx);
	return (0);
}
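
/*
 * Illustrative example (not from the original code): a driver that
 * expects an early burst of allocations can prime its pool at attach
 * time so the first pool_get() calls need not hit the page allocator.
 * The item count is rounded up to whole pages, and the return value is
 * always 0 in this implementation.  The pool below is hypothetical.
 *
 *	pool_init(&rxd_pool, sizeof(struct rxd), 0, 0, 0, "rxdpl", NULL);
 *	pool_prime(&rxd_pool, 64);	(pre-allocate roughly 64 items)
 */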

/*
 * Add a page worth of items to the pool.
 *
 * Note, we must be called with the pool descriptor LOCKED.
 */
void
pool_prime_page(struct pool *pp, caddr_t storage, struct pool_item_header *ph)
{
	struct pool_item *pi;
	caddr_t cp = storage;
	unsigned int align = pp->pr_align;
	unsigned int ioff = pp->pr_itemoffset;
	int n;
#if defined(DIAGNOSTIC) && defined(POOL_DEBUG)
	int i, *ip;
#endif

	/*
	 * Insert page header.
	 */
	LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist);
	TAILQ_INIT(&ph->ph_itemlist);
	ph->ph_page = storage;
	ph->ph_pagesize = pp->pr_alloc->pa_pagesz;
	ph->ph_nmissing = 0;
	if ((pp->pr_roflags & PR_PHINPAGE) == 0)
		RB_INSERT(phtree, &pp->pr_phtree, ph);

	pp->pr_nidle++;

	/*
	 * Color this page.
	 */
	cp = (caddr_t)(cp + pp->pr_curcolor);
	if ((pp->pr_curcolor += align) > pp->pr_maxcolor)
		pp->pr_curcolor = 0;

	/*
	 * Adjust storage to apply alignment to `pr_itemoffset' in each item.
	 */
	if (ioff != 0)
		cp = (caddr_t)(cp + (align - ioff));
	ph->ph_colored = cp;

	/*
	 * Insert remaining chunks on the bucket list.
	 */
	n = pp->pr_itemsperpage;
	pp->pr_nitems += n;

	while (n--) {
		pi = (struct pool_item *)cp;

		KASSERT(((((vaddr_t)pi) + ioff) & (align - 1)) == 0);

		/* Insert on page list */
		TAILQ_INSERT_TAIL(&ph->ph_itemlist, pi, pi_list);

#ifdef DIAGNOSTIC
		pi->pi_magic = PI_MAGIC;
#ifdef POOL_DEBUG
		for (ip = (int *)pi, i = sizeof(*pi)/sizeof(int);
		    i < pp->pr_size / sizeof(int); i++)
			ip[i] = PI_MAGIC;
#endif /* POOL_DEBUG */
#endif /* DIAGNOSTIC */
		cp = (caddr_t)(cp + pp->pr_size);
	}

	/*
	 * If the pool was depleted, point at the new page.
	 */
	if (pp->pr_curpage == NULL)
		pp->pr_curpage = ph;

	if (++pp->pr_npages > pp->pr_hiwat)
		pp->pr_hiwat = pp->pr_npages;
}

/*
 * Used by pool_get() when nitems drops below the low water mark.  This
 * is used to catch up pr_nitems with the low water mark.
 *
 * Note we never wait for memory here, we let the caller decide what to do.
 */
int
pool_catchup(struct pool *pp)
{
	struct pool_item_header *ph;
	caddr_t cp;
	int error = 0;
	int slowdown;

	while (POOL_NEEDS_CATCHUP(pp)) {
		/*
		 * Call the page back-end allocator for more memory.
		 */
		cp = pool_allocator_alloc(pp, PR_NOWAIT, &slowdown);
		if (__predict_true(cp != NULL))
			ph = pool_alloc_item_header(pp, cp, PR_NOWAIT);
		if (__predict_false(cp == NULL || ph == NULL)) {
			if (cp != NULL)
				pool_allocator_free(pp, cp);
			error = ENOMEM;
			break;
		}
		pool_prime_page(pp, cp, ph);
		pp->pr_npagealloc++;
	}

	return (error);
}

void
pool_update_curpage(struct pool *pp)
{

	pp->pr_curpage = LIST_FIRST(&pp->pr_partpages);
	if (pp->pr_curpage == NULL) {
		pp->pr_curpage = LIST_FIRST(&pp->pr_emptypages);
	}
}

void
pool_setlowat(struct pool *pp, int n)
{

	pp->pr_minitems = n;
	pp->pr_minpages = (n == 0)
		? 0
		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;

	mtx_enter(&pp->pr_mtx);
	/* Make sure we're caught up with the newly-set low water mark. */
	if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
		/*
		 * XXX: Should we log a warning?  Should we set up a timeout
		 * to try again in a second or so?  The latter could break
		 * a caller's assumptions about interrupt protection, etc.
		 */
	}
	mtx_leave(&pp->pr_mtx);
}

void
pool_sethiwat(struct pool *pp, int n)
{

	pp->pr_maxpages = (n == 0)
		? 0
		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
}

int
pool_sethardlimit(struct pool *pp, u_int n, const char *warnmsg, int ratecap)
{
	int error = 0;

	if (n < pp->pr_nout) {
		error = EINVAL;
		goto done;
	}

	pp->pr_hardlimit = n;
	pp->pr_hardlimit_warning = warnmsg;
	pp->pr_hardlimit_ratecap.tv_sec = ratecap;
	pp->pr_hardlimit_warning_last.tv_sec = 0;
	pp->pr_hardlimit_warning_last.tv_usec = 0;

	/*
	 * In-line version of pool_sethiwat().
	 */
	pp->pr_maxpages = (n == 0 || n == UINT_MAX)
		? n
		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;

done:
	return (error);
}
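
/*
 * Illustrative example (not from the original code): capping a pool at
 * 1024 items out and rate-limiting the warning to at most one per
 * minute.  The pool, limit and message are hypothetical.
 *
 *	pool_sethardlimit(&foo_pool, 1024,
 *	    "WARNING: foo pool limit reached", 60);
 */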

void
pool_set_constraints(struct pool *pp, struct uvm_constraint_range *range,
    int nsegs)
{
	/*
	 * Subsequent changes to the constraints are only
	 * allowed to make them _more_ strict.
	 */
	KASSERT(pp->pr_crange->ucr_high >= range->ucr_high &&
	    pp->pr_crange->ucr_low <= range->ucr_low);

	pp->pr_crange = range;
	pp->pr_pa_nsegs = nsegs;
}

void
pool_set_ctordtor(struct pool *pp, int (*ctor)(void *, void *, int),
    void (*dtor)(void *, void *), void *arg)
{
	pp->pr_ctor = ctor;
	pp->pr_dtor = dtor;
	pp->pr_arg = arg;
}
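
/*
 * Illustrative example (not from the original code): a constructor and
 * destructor pair.  The ctor runs when an item leaves the pool and
 * returns non-zero on failure, which makes pool_get() put the item
 * back and return NULL; the dtor runs in pool_put() before the item is
 * returned.  All names are hypothetical.
 *
 *	int
 *	foo_ctor(void *arg, void *v, int flags)
 *	{
 *		struct foo *f = v;
 *
 *		f->f_state = 0;
 *		return (0);
 *	}
 *
 *	void
 *	foo_dtor(void *arg, void *v)
 *	{
 *		(release anything the ctor acquired)
 *	}
 *
 *	pool_set_ctordtor(&foo_pool, foo_ctor, foo_dtor, NULL);
 */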

/*
 * Release all complete pages that have not been used recently.
 *
 * Returns non-zero if any pages have been reclaimed.
 */
int
pool_reclaim(struct pool *pp)
{
	struct pool_item_header *ph, *phnext;
	struct pool_pagelist pq;

	LIST_INIT(&pq);

	mtx_enter(&pp->pr_mtx);
	for (ph = LIST_FIRST(&pp->pr_emptypages); ph != NULL; ph = phnext) {
		phnext = LIST_NEXT(ph, ph_pagelist);

		/* Check our minimum page claim */
		if (pp->pr_npages <= pp->pr_minpages)
			break;

		KASSERT(ph->ph_nmissing == 0);

		/*
		 * If freeing this page would put us below
		 * the low water mark, stop now.
		 */
		if ((pp->pr_nitems - pp->pr_itemsperpage) <
		    pp->pr_minitems)
			break;

		pr_rmpage(pp, ph, &pq);
	}
	mtx_leave(&pp->pr_mtx);

	if (LIST_EMPTY(&pq))
		return (0);
	while ((ph = LIST_FIRST(&pq)) != NULL) {
		LIST_REMOVE(ph, ph_pagelist);
		pool_allocator_free(pp, ph->ph_page);
		if (pp->pr_roflags & PR_PHINPAGE)
			continue;
		pool_put(&phpool, ph);
	}

	return (1);
}
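
/*
 * Illustrative sketch (not from the original code): a memory-pressure
 * pass could walk every registered pool and reclaim its idle pages:
 *
 *	struct pool *pp;
 *
 *	TAILQ_FOREACH(pp, &pool_head, pr_poollist)
 *		pool_reclaim(pp);
 */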

#ifdef DDB
#include <machine/db_machdep.h>
#include <ddb/db_interface.h>
#include <ddb/db_output.h>

/*
 * Diagnostic helpers.
 */
void
pool_printit(struct pool *pp, const char *modif, int (*pr)(const char *, ...))
{
	pool_print1(pp, modif, pr);
}

void
pool_print_pagelist(struct pool_pagelist *pl, int (*pr)(const char *, ...))
{
	struct pool_item_header *ph;
#ifdef DIAGNOSTIC
	struct pool_item *pi;
#endif

	LIST_FOREACH(ph, pl, ph_pagelist) {
		(*pr)("\t\tpage %p, nmissing %d\n",
		    ph->ph_page, ph->ph_nmissing);
#ifdef DIAGNOSTIC
		TAILQ_FOREACH(pi, &ph->ph_itemlist, pi_list) {
			if (pi->pi_magic != PI_MAGIC) {
				(*pr)("\t\t\titem %p, magic 0x%x\n",
				    pi, pi->pi_magic);
			}
		}
#endif
	}
}

void
pool_print1(struct pool *pp, const char *modif, int (*pr)(const char *, ...))
{
	struct pool_item_header *ph;
	int print_pagelist = 0;
	char c;

	while ((c = *modif++) != '\0') {
		if (c == 'p')
			print_pagelist = 1;
	}

	(*pr)("POOL %s: size %u, align %u, ioff %u, roflags 0x%08x\n",
	    pp->pr_wchan, pp->pr_size, pp->pr_align, pp->pr_itemoffset,
	    pp->pr_roflags);
	(*pr)("\talloc %p\n", pp->pr_alloc);
	(*pr)("\tminitems %u, minpages %u, maxpages %u, npages %u\n",
	    pp->pr_minitems, pp->pr_minpages, pp->pr_maxpages, pp->pr_npages);
	(*pr)("\titemsperpage %u, nitems %u, nout %u, hardlimit %u\n",
	    pp->pr_itemsperpage, pp->pr_nitems, pp->pr_nout, pp->pr_hardlimit);

	(*pr)("\n\tnget %lu, nfail %lu, nput %lu\n",
	    pp->pr_nget, pp->pr_nfail, pp->pr_nput);
	(*pr)("\tnpagealloc %lu, npagefree %lu, hiwat %u, nidle %lu\n",
	    pp->pr_npagealloc, pp->pr_npagefree, pp->pr_hiwat, pp->pr_nidle);

	if (print_pagelist == 0)
		return;

	if ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL)
		(*pr)("\n\tempty page list:\n");
	pool_print_pagelist(&pp->pr_emptypages, pr);
	if ((ph = LIST_FIRST(&pp->pr_fullpages)) != NULL)
		(*pr)("\n\tfull page list:\n");
	pool_print_pagelist(&pp->pr_fullpages, pr);
	if ((ph = LIST_FIRST(&pp->pr_partpages)) != NULL)
		(*pr)("\n\tpartial-page list:\n");
	pool_print_pagelist(&pp->pr_partpages, pr);

	if (pp->pr_curpage == NULL)
		(*pr)("\tno current page\n");
	else
		(*pr)("\tcurpage %p\n", pp->pr_curpage->ph_page);
}

void
db_show_all_pools(db_expr_t expr, int haddr, db_expr_t count, char *modif)
{
	struct pool *pp;
	char maxp[16];
	int ovflw;
	char mode;

	mode = modif[0];
	if (mode != '\0' && mode != 'a') {
		db_printf("usage: show all pools [/a]\n");
		return;
	}

	if (mode == '\0')
		db_printf("%-10s%4s%9s%5s%9s%6s%6s%6s%6s%6s%6s%5s\n",
		    "Name",
		    "Size",
		    "Requests",
		    "Fail",
		    "Releases",
		    "Pgreq",
		    "Pgrel",
		    "Npage",
		    "Hiwat",
		    "Minpg",
		    "Maxpg",
		    "Idle");
	else
		db_printf("%-10s %18s %18s\n",
		    "Name", "Address", "Allocator");

	TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
		if (mode == 'a') {
			db_printf("%-10s %18p %18p\n", pp->pr_wchan, pp,
			    pp->pr_alloc);
			continue;
		}

		if (!pp->pr_nget)
			continue;

		if (pp->pr_maxpages == UINT_MAX)
			snprintf(maxp, sizeof maxp, "inf");
		else
			snprintf(maxp, sizeof maxp, "%u", pp->pr_maxpages);

#define PRWORD(ovflw, fmt, width, fixed, val) do {	\
	(ovflw) += db_printf((fmt),			\
	    (width) - (fixed) - (ovflw) > 0 ?		\
	    (width) - (fixed) - (ovflw) : 0,		\
	    (val)) - (width);				\
	if ((ovflw) < 0)				\
		(ovflw) = 0;				\
} while (/* CONSTCOND */0)

		ovflw = 0;
		PRWORD(ovflw, "%-*s", 10, 0, pp->pr_wchan);
		PRWORD(ovflw, " %*u", 4, 1, pp->pr_size);
		PRWORD(ovflw, " %*lu", 9, 1, pp->pr_nget);
		PRWORD(ovflw, " %*lu", 5, 1, pp->pr_nfail);
		PRWORD(ovflw, " %*lu", 9, 1, pp->pr_nput);
		PRWORD(ovflw, " %*lu", 6, 1, pp->pr_npagealloc);
		PRWORD(ovflw, " %*lu", 6, 1, pp->pr_npagefree);
		PRWORD(ovflw, " %*d", 6, 1, pp->pr_npages);
		PRWORD(ovflw, " %*d", 6, 1, pp->pr_hiwat);
		PRWORD(ovflw, " %*d", 6, 1, pp->pr_minpages);
		PRWORD(ovflw, " %*s", 6, 1, maxp);
		PRWORD(ovflw, " %*lu\n", 5, 1, pp->pr_nidle);

		pool_chk(pp, pp->pr_wchan);
	}
}

int
pool_chk_page(struct pool *pp, const char *label, struct pool_item_header *ph)
{
	struct pool_item *pi;
	caddr_t page;
	int n;
#if defined(DIAGNOSTIC) && defined(POOL_DEBUG)
	int i, *ip;
#endif

	page = (caddr_t)((u_long)ph & pp->pr_alloc->pa_pagemask);
	if (page != ph->ph_page &&
	    (pp->pr_roflags & PR_PHINPAGE) != 0) {
		if (label != NULL)
			printf("%s: ", label);
		printf("pool(%p:%s): page inconsistency: page %p; "
		    "at page head addr %p (p %p)\n",
		    pp, pp->pr_wchan, ph->ph_page, ph, page);
		return 1;
	}

	for (pi = TAILQ_FIRST(&ph->ph_itemlist), n = 0;
	     pi != NULL;
	     pi = TAILQ_NEXT(pi, pi_list), n++) {

#ifdef DIAGNOSTIC
		if (pi->pi_magic != PI_MAGIC) {
			if (label != NULL)
				printf("%s: ", label);
			printf("pool(%s): free list modified: "
			    "page %p; item ordinal %d; addr %p "
			    "(p %p); offset 0x%x=0x%x\n",
			    pp->pr_wchan, ph->ph_page, n, pi, page,
			    0, pi->pi_magic);
		}
#ifdef POOL_DEBUG
		for (ip = (int *)pi, i = sizeof(*pi) / sizeof(int);
		    i < pp->pr_size / sizeof(int); i++) {
			if (ip[i] != PI_MAGIC) {
				printf("pool(%s): free list modified: "
				    "page %p; item ordinal %d; addr %p "
				    "(p %p); offset 0x%x=0x%x\n",
				    pp->pr_wchan, ph->ph_page, n, pi,
				    page, i * sizeof(int), ip[i]);
			}
		}
#endif /* POOL_DEBUG */
#endif /* DIAGNOSTIC */
		page = (caddr_t)((u_long)pi & pp->pr_alloc->pa_pagemask);
		if (page == ph->ph_page)
			continue;

		if (label != NULL)
			printf("%s: ", label);
		printf("pool(%p:%s): page inconsistency: page %p;"
		    " item ordinal %d; addr %p (p %p)\n", pp,
		    pp->pr_wchan, ph->ph_page, n, pi, page);
		return 1;
	}
	return 0;
}

int
pool_chk(struct pool *pp, const char *label)
{
	struct pool_item_header *ph;
	int r = 0;

	LIST_FOREACH(ph, &pp->pr_emptypages, ph_pagelist)
		r += pool_chk_page(pp, label, ph);
	LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist)
		r += pool_chk_page(pp, label, ph);
	LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist)
		r += pool_chk_page(pp, label, ph);

	return (r);
}

void
pool_walk(struct pool *pp, int full, int (*pr)(const char *, ...),
    void (*func)(void *, int, int (*)(const char *, ...)))
{
	struct pool_item_header *ph;
	struct pool_item *pi;
	caddr_t cp;
	int n;

	LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) {
		cp = ph->ph_colored;
		n = ph->ph_nmissing;

		while (n--) {
			func(cp, full, pr);
			cp += pp->pr_size;
		}
	}

	LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist) {
		cp = ph->ph_colored;
		n = ph->ph_nmissing;

		do {
			TAILQ_FOREACH(pi, &ph->ph_itemlist, pi_list) {
				if (cp == (caddr_t)pi)
					break;
			}
			if (cp != (caddr_t)pi) {
				func(cp, full, pr);
				n--;
			}

			cp += pp->pr_size;
		} while (n > 0);
	}
}
#endif

/*
 * We have three different sysctls.
 * kern.pool.npools - the number of pools.
 * kern.pool.pool.<pool#> - the pool struct for pool#.
 * kern.pool.name.<pool#> - the name for pool#.
 */
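
/*
 * Illustrative sketch (not from the original code): from userland, the
 * pool count could be fetched through sysctl(3) with the mib
 * { CTL_KERN, KERN_POOL, KERN_POOL_NPOOLS }:
 *
 *	int mib[3] = { CTL_KERN, KERN_POOL, KERN_POOL_NPOOLS };
 *	int npools;
 *	size_t len = sizeof(npools);
 *
 *	if (sysctl(mib, 3, &npools, &len, NULL, 0) == -1)
 *		err(1, "sysctl");
 */
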
int
sysctl_dopool(int *name, u_int namelen, char *where, size_t *sizep)
{
	struct pool *pp, *foundpool = NULL;
	size_t buflen = where != NULL ? *sizep : 0;
	int npools = 0, s;
	unsigned int lookfor;
	size_t len;

	switch (*name) {
	case KERN_POOL_NPOOLS:
		if (namelen != 1 || buflen != sizeof(int))
			return (EINVAL);
		lookfor = 0;
		break;
	case KERN_POOL_NAME:
		if (namelen != 2 || buflen < 1)
			return (EINVAL);
		lookfor = name[1];
		break;
	case KERN_POOL_POOL:
		if (namelen != 2 || buflen != sizeof(struct pool))
			return (EINVAL);
		lookfor = name[1];
		break;
	default:
		return (EINVAL);
	}

	s = splvm();

	TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
		npools++;
		if (lookfor == pp->pr_serial) {
			foundpool = pp;
			break;
		}
	}

	splx(s);

	if (*name != KERN_POOL_NPOOLS && foundpool == NULL)
		return (ENOENT);

	switch (*name) {
	case KERN_POOL_NPOOLS:
		return copyout(&npools, where, buflen);
	case KERN_POOL_NAME:
		len = strlen(foundpool->pr_wchan) + 1;
		if (*sizep < len)
			return (ENOMEM);
		*sizep = len;
		return copyout(foundpool->pr_wchan, where, len);
	case KERN_POOL_POOL:
		return copyout(foundpool, where, buflen);
	}
	/* NOTREACHED */
	return (0); /* XXX - Stupid gcc */
}

/*
 * Pool backend allocators.
 *
 * Each pool has a backend allocator that handles the allocation and
 * deallocation of its backing pages.
 */
void	*pool_page_alloc(struct pool *, int, int *);
void	pool_page_free(struct pool *, void *);

/*
 * Safe for interrupts; name preserved for compatibility.
 * This is the default allocator.
 */
struct pool_allocator pool_allocator_nointr = {
	pool_page_alloc, pool_page_free, 0,
};
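
/*
 * Illustrative example (not from the original code): a caller can
 * supply its own allocator by filling in a struct pool_allocator with
 * alloc/free hooks and an optional page size (0 makes pool_init()
 * default to PAGE_SIZE).  The names below are hypothetical.
 *
 *	void	*foo_page_alloc(struct pool *, int, int *);
 *	void	 foo_page_free(struct pool *, void *);
 *
 *	struct pool_allocator foo_allocator = {
 *		foo_page_alloc, foo_page_free, 0,
 *	};
 *
 *	pool_init(&foo_pool, sizeof(struct foo), 0, 0, 0, "foopl",
 *	    &foo_allocator);
 */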

/*
 * XXX - we have at least three different resources for the same allocation
 *  and each resource can be depleted. First we have the ready elements in
 *  the pool. Then we have the resource (typically a vm_map) for this
 *  allocator, then we have physical memory. Waiting for any of these can
 *  be unnecessary when any other is freed, but the kernel doesn't support
 *  sleeping on multiple addresses, so we have to fake it. The caller sleeps
 *  on the pool (so that we can be awakened when an item is returned to the
 *  pool), but we set PA_WANT on the allocator. When a page is returned to
 *  the allocator and PA_WANT is set, pool_allocator_free will wake up all
 *  sleeping pools belonging to this allocator. (XXX - thundering herd.)
 *  We also wake up the allocator in case someone without a pool (malloc)
 *  is sleeping waiting for this allocator.
 */

void *
pool_allocator_alloc(struct pool *pp, int flags, int *slowdown)
{
	boolean_t waitok = (flags & PR_WAITOK) ? TRUE : FALSE;
	void *v;

	if (waitok)
		mtx_leave(&pp->pr_mtx);
	v = pp->pr_alloc->pa_alloc(pp, flags, slowdown);
	if (waitok)
		mtx_enter(&pp->pr_mtx);

	return (v);
}

void
pool_allocator_free(struct pool *pp, void *v)
{
	struct pool_allocator *pa = pp->pr_alloc;

	(*pa->pa_free)(pp, v);
}

void *
pool_page_alloc(struct pool *pp, int flags, int *slowdown)
{
	int kfl = (flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT;

	return (uvm_km_getpage_pla(kfl, slowdown, pp->pr_crange->ucr_low,
	    pp->pr_crange->ucr_high, 0, 0));
}

void
pool_page_free(struct pool *pp, void *v)
{
	uvm_km_putpage(v);
}

void *
pool_large_alloc(struct pool *pp, int flags, int *slowdown)
{
	int kfl = (flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT;
	vaddr_t va;
	int s;

	s = splvm();
	va = uvm_km_kmemalloc_pla(kmem_map, NULL, pp->pr_alloc->pa_pagesz, kfl,
	    pp->pr_crange->ucr_low, pp->pr_crange->ucr_high,
	    0, 0, pp->pr_pa_nsegs);
	splx(s);

	return ((void *)va);
}

void
pool_large_free(struct pool *pp, void *v)
{
	int s;

	s = splvm();
	uvm_km_free(kmem_map, (vaddr_t)v, pp->pr_alloc->pa_pagesz);
	splx(s);
}

void *
pool_large_alloc_ni(struct pool *pp, int flags, int *slowdown)
{
	int kfl = (flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT;

	return ((void *)uvm_km_kmemalloc_pla(kernel_map, uvm.kernel_object,
	    pp->pr_alloc->pa_pagesz, kfl,
	    pp->pr_crange->ucr_low, pp->pr_crange->ucr_high,
	    0, 0, pp->pr_pa_nsegs));
}

void
pool_large_free_ni(struct pool *pp, void *v)
{
	uvm_km_free(kernel_map, (vaddr_t)v, pp->pr_alloc->pa_pagesz);
}