/*	$OpenBSD: subr_pool.c,v 1.98 2010/09/26 21:03:57 tedu Exp $	*/
/*	$NetBSD: subr_pool.c,v 1.61 2001/09/26 07:14:56 chs Exp $	*/

/*-
 * Copyright (c) 1997, 1999, 2000 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Paul Kranenburg; by Jason R. Thorpe of the Numerical Aerospace
 * Simulation Facility, NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>

#include <uvm/uvm.h>

/*
 * Pool resource management utility.
 *
 * Memory is allocated in pages which are split into pieces according to
 * the pool item size. Each page is kept on one of three lists in the
 * pool structure: `pr_emptypages', `pr_fullpages' and `pr_partpages',
 * for empty, full and partially-full pages respectively. The individual
 * pool items are on a linked list headed by `ph_itemlist' in each page
 * header. The memory for building the page list is either taken from
 * the allocated pages themselves (for small pool items) or taken from
 * an internal pool of page headers (`phpool').
 */
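
/*
 * Typical usage (an illustrative sketch; `foopool' and struct foo are
 * hypothetical names, not defined in this file):
 *
 *	struct pool foopool;
 *
 *	pool_init(&foopool, sizeof(struct foo), 0, 0, 0, "foopool", NULL);
 *
 *	f = pool_get(&foopool, PR_WAITOK);
 *	...
 *	pool_put(&foopool, f);
 */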

/* List of all pools */
TAILQ_HEAD(,pool) pool_head = TAILQ_HEAD_INITIALIZER(pool_head);

/* Private pool for page header structures */
struct pool phpool;

struct pool_item_header {
	/* Page headers */
	LIST_ENTRY(pool_item_header)
				ph_pagelist;	/* pool page list */
	TAILQ_HEAD(,pool_item)	ph_itemlist;	/* chunk list for this page */
	RB_ENTRY(pool_item_header)
				ph_node;	/* Off-page page headers */
	int			ph_nmissing;	/* # of chunks in use */
	caddr_t			ph_page;	/* this page's address */
	caddr_t			ph_colored;	/* page's colored address */
	int			ph_pagesize;
};

struct pool_item {
#ifdef DIAGNOSTIC
	u_int32_t pi_magic;
#endif
	/* Other entries use only this list entry */
	TAILQ_ENTRY(pool_item)	pi_list;
};

#ifdef DEADBEEF1
#define	PI_MAGIC DEADBEEF1
#else
#define	PI_MAGIC 0xdeafbeef
#endif

#define	POOL_NEEDS_CATCHUP(pp)						\
	((pp)->pr_nitems < (pp)->pr_minitems)

/*
 * Every pool gets a unique serial number assigned to it. If this counter
 * wraps, we're screwed, but we shouldn't create so many pools anyway.
 */
unsigned int pool_serial;

int	 pool_catchup(struct pool *);
void	 pool_prime_page(struct pool *, caddr_t, struct pool_item_header *);
void	 pool_update_curpage(struct pool *);
void	*pool_do_get(struct pool *, int);
void	 pool_do_put(struct pool *, void *);
void	 pr_rmpage(struct pool *, struct pool_item_header *,
	    struct pool_pagelist *);
int	 pool_chk_page(struct pool *, const char *, struct pool_item_header *);
struct pool_item_header *pool_alloc_item_header(struct pool *, caddr_t, int);

void	*pool_allocator_alloc(struct pool *, int, int *);
void	 pool_allocator_free(struct pool *, void *);

/*
 * XXX - quick hack. For pools with large items we want to use a special
 *       allocator. For now, instead of having the allocator figure out
 *       the allocation size from the pool (which can be done trivially
 *       with round_page(pr_itemsperpage * pr_size)) which would require
 *       lots of changes everywhere, we just create allocators for each
 *       size. We limit those to 128 pages.
 */
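/*
 * Illustrative arithmetic (assuming PAGE_SIZE == 4096): a pool of
 * 5000-byte items does not round to a whole page, so pool_init() below
 * picks psize = PAGE_SIZE / roundup(5000 % PAGE_SIZE, 1024) = 4096/1024
 * = 4, i.e. the allocator that hands out 4-page (16 KB) chunks, which
 * then hold 3 items each.
 */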
#define POOL_LARGE_MAXPAGES 128
struct pool_allocator pool_allocator_large[POOL_LARGE_MAXPAGES];
struct pool_allocator pool_allocator_large_ni[POOL_LARGE_MAXPAGES];
void	*pool_large_alloc(struct pool *, int, int *);
void	pool_large_free(struct pool *, void *);
void	*pool_large_alloc_ni(struct pool *, int, int *);
void	pool_large_free_ni(struct pool *, void *);

#ifdef DDB
void	 pool_print_pagelist(struct pool_pagelist *,
	    int (*)(const char *, ...));
void	 pool_print1(struct pool *, const char *, int (*)(const char *, ...));
#endif

#define pool_sleep(pl) msleep(pl, &pl->pr_mtx, PSWP, pl->pr_wchan, 0)

static __inline int
phtree_compare(struct pool_item_header *a, struct pool_item_header *b)
{
	long diff = (vaddr_t)a->ph_page - (vaddr_t)b->ph_page;
	if (diff < 0)
		return -(-diff >= a->ph_pagesize);
	else if (diff > 0)
		return (diff >= b->ph_pagesize);
	else
		return (0);
}

RB_PROTOTYPE(phtree, pool_item_header, ph_node, phtree_compare);
RB_GENERATE(phtree, pool_item_header, ph_node, phtree_compare);

/*
 * Return the pool page header based on page address.
 */
static __inline struct pool_item_header *
pr_find_pagehead(struct pool *pp, void *v)
{
	struct pool_item_header *ph, tmp;

	if ((pp->pr_roflags & PR_PHINPAGE) != 0) {
		caddr_t page;

		page = (caddr_t)((vaddr_t)v & pp->pr_alloc->pa_pagemask);

		return ((struct pool_item_header *)(page + pp->pr_phoffset));
	}

	/*
	 * The trick we're using in the tree compare function is to compare
	 * two elements equal when they overlap. We want to return the
	 * page header that belongs to the element just before this address.
	 * We don't want this element to compare equal to the next element,
	 * so the compare function takes the pagesize from the lower element.
	 * If this header is the lower, its pagesize is zero, so it can't
	 * overlap with the next header. But if the header we're looking for
	 * is lower, we'll use its pagesize and it will overlap and return
	 * equal.
	 */
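	/*
	 * Worked example (illustrative): suppose the tree holds a header
	 * with ph_page == 0x1000 and ph_pagesize == 0x1000, and we look
	 * up v == 0x1800.  Comparing the key {0x1800, 0} against that
	 * header gives diff == 0x800 > 0, and (diff >= b->ph_pagesize)
	 * is false, so the two compare equal and RB_FIND returns the
	 * header whose page contains v.
	 */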
	tmp.ph_page = v;
	tmp.ph_pagesize = 0;
	ph = RB_FIND(phtree, &pp->pr_phtree, &tmp);

	if (ph) {
		KASSERT(ph->ph_page <= (caddr_t)v);
		KASSERT(ph->ph_page + ph->ph_pagesize > (caddr_t)v);
	}
	return ph;
}

/*
 * Remove a page from the pool.
 */
void
pr_rmpage(struct pool *pp, struct pool_item_header *ph,
    struct pool_pagelist *pq)
{

	/*
	 * If the page was idle, decrement the idle page count.
	 */
	if (ph->ph_nmissing == 0) {
#ifdef DIAGNOSTIC
		if (pp->pr_nidle == 0)
			panic("pr_rmpage: nidle inconsistent");
		if (pp->pr_nitems < pp->pr_itemsperpage)
			panic("pr_rmpage: nitems inconsistent");
#endif
		pp->pr_nidle--;
	}

	pp->pr_nitems -= pp->pr_itemsperpage;

	/*
	 * Unlink a page from the pool and release it (or queue it for release).
	 */
	LIST_REMOVE(ph, ph_pagelist);
	if ((pp->pr_roflags & PR_PHINPAGE) == 0)
		RB_REMOVE(phtree, &pp->pr_phtree, ph);
	if (pq) {
		LIST_INSERT_HEAD(pq, ph, ph_pagelist);
	} else {
		pool_allocator_free(pp, ph->ph_page);
		if ((pp->pr_roflags & PR_PHINPAGE) == 0)
			pool_put(&phpool, ph);
	}
	pp->pr_npages--;
	pp->pr_npagefree++;

	pool_update_curpage(pp);
}

/*
 * Initialize the given pool resource structure.
 *
 * We export this routine to allow other kernel parts to declare
 * static pools that must be initialized before malloc() is available.
 */
void
pool_init(struct pool *pp, size_t size, u_int align, u_int ioff, int flags,
    const char *wchan, struct pool_allocator *palloc)
{
	int off, slack;

#ifdef MALLOC_DEBUG
	if ((flags & PR_DEBUG) && (ioff != 0 || align != 0))
		flags &= ~PR_DEBUG;
#endif
	/*
	 * Check arguments and construct default values.
	 */
	if (palloc == NULL) {
		if (size > PAGE_SIZE) {
			int psize;

			/*
			 * XXX - should take align into account as well.
			 */
			if (size == round_page(size))
				psize = size / PAGE_SIZE;
			else
				psize = PAGE_SIZE / roundup(size % PAGE_SIZE,
				    1024);
			if (psize > POOL_LARGE_MAXPAGES)
				psize = POOL_LARGE_MAXPAGES;
			if (flags & PR_WAITOK)
				palloc = &pool_allocator_large_ni[psize-1];
			else
				palloc = &pool_allocator_large[psize-1];
			if (palloc->pa_pagesz == 0) {
				palloc->pa_pagesz = psize * PAGE_SIZE;
				if (flags & PR_WAITOK) {
					palloc->pa_alloc = pool_large_alloc_ni;
					palloc->pa_free = pool_large_free_ni;
				} else {
					palloc->pa_alloc = pool_large_alloc;
					palloc->pa_free = pool_large_free;
				}
			}
		} else {
			palloc = &pool_allocator_nointr;
		}
	}
	if (palloc->pa_pagesz == 0) {
		palloc->pa_pagesz = PAGE_SIZE;
	}
	if (palloc->pa_pagemask == 0) {
		palloc->pa_pagemask = ~(palloc->pa_pagesz - 1);
		palloc->pa_pageshift = ffs(palloc->pa_pagesz) - 1;
	}

	if (align == 0)
		align = ALIGN(1);

	if (size < sizeof(struct pool_item))
		size = sizeof(struct pool_item);

	size = roundup(size, align);
#ifdef DIAGNOSTIC
	if (size > palloc->pa_pagesz)
		panic("pool_init: pool item size (%lu) too large",
		    (u_long)size);
#endif

	/*
	 * Initialize the pool structure.
	 */
	LIST_INIT(&pp->pr_emptypages);
	LIST_INIT(&pp->pr_fullpages);
	LIST_INIT(&pp->pr_partpages);
	pp->pr_curpage = NULL;
	pp->pr_npages = 0;
	pp->pr_minitems = 0;
	pp->pr_minpages = 0;
	pp->pr_maxpages = 8;
	pp->pr_roflags = flags;
	pp->pr_flags = 0;
	pp->pr_size = size;
	pp->pr_align = align;
	pp->pr_wchan = wchan;
	pp->pr_alloc = palloc;
	pp->pr_nitems = 0;
	pp->pr_nout = 0;
	pp->pr_hardlimit = UINT_MAX;
	pp->pr_hardlimit_warning = NULL;
	pp->pr_hardlimit_ratecap.tv_sec = 0;
	pp->pr_hardlimit_ratecap.tv_usec = 0;
	pp->pr_hardlimit_warning_last.tv_sec = 0;
	pp->pr_hardlimit_warning_last.tv_usec = 0;
	pp->pr_serial = ++pool_serial;
	if (pool_serial == 0)
		panic("pool_init: too much uptime");

	/* constructor, destructor, and arg */
	pp->pr_ctor = NULL;
	pp->pr_dtor = NULL;
	pp->pr_arg = NULL;

	/*
	 * Decide whether to put the page header off page to avoid
	 * wasting too large a part of the page. Off-page page headers
	 * go into an RB tree, so we can match a returned item with
	 * its header based on the page address.
	 * We use 1/16 of the page size as the threshold (XXX: tune)
	 */
	if (pp->pr_size < palloc->pa_pagesz/16 && pp->pr_size < PAGE_SIZE) {
		/* Use the end of the page for the page header */
		pp->pr_roflags |= PR_PHINPAGE;
		pp->pr_phoffset = off = palloc->pa_pagesz -
		    ALIGN(sizeof(struct pool_item_header));
	} else {
		/* The page header will be taken from our page header pool */
		pp->pr_phoffset = 0;
		off = palloc->pa_pagesz;
		RB_INIT(&pp->pr_phtree);
	}

	/*
	 * Alignment is to take place at `ioff' within the item. This means
	 * we must reserve up to `align - 1' bytes on the page to allow
	 * appropriate positioning of each item.
	 *
	 * Silently enforce `0 <= ioff < align'.
	 */
	pp->pr_itemoffset = ioff = ioff % align;
	pp->pr_itemsperpage = (off - ((align - ioff) % align)) / pp->pr_size;
	KASSERT(pp->pr_itemsperpage != 0);

	/*
	 * Use the slack between the chunks and the page header
	 * for "cache coloring".
	 */
	slack = off - pp->pr_itemsperpage * pp->pr_size;
	pp->pr_maxcolor = (slack / align) * align;
	pp->pr_curcolor = 0;

	pp->pr_nget = 0;
	pp->pr_nfail = 0;
	pp->pr_nput = 0;
	pp->pr_npagealloc = 0;
	pp->pr_npagefree = 0;
	pp->pr_hiwat = 0;
	pp->pr_nidle = 0;

	pp->pr_ipl = -1;
	mtx_init(&pp->pr_mtx, IPL_NONE);

	if (phpool.pr_size == 0) {
		pool_init(&phpool, sizeof(struct pool_item_header), 0, 0,
		    0, "phpool", NULL);
		pool_setipl(&phpool, IPL_HIGH);
	}

	/* pglistalloc/constraint parameters */
	pp->pr_crange = &no_constraint;
	pp->pr_pa_nsegs = 0;

	/* Insert this into the list of all pools. */
	TAILQ_INSERT_HEAD(&pool_head, pp, pr_poollist);
}

void
pool_setipl(struct pool *pp, int ipl)
{
	pp->pr_ipl = ipl;
	mtx_init(&pp->pr_mtx, ipl);
}
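
/*
 * Example (illustrative; `foopool' is hypothetical): a pool used from
 * network interrupt context would be set up as
 *
 *	pool_init(&foopool, sizeof(struct foo), 0, 0, 0, "foopool", NULL);
 *	pool_setipl(&foopool, IPL_NET);
 *
 * so that pr_mtx blocks the relevant interrupts, mirroring what
 * pool_init() does for phpool with IPL_HIGH.
 */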

/*
 * Decommission a pool resource.
 */
void
pool_destroy(struct pool *pp)
{
	struct pool_item_header *ph;

#ifdef DIAGNOSTIC
	if (pp->pr_nout != 0)
		panic("pool_destroy: pool busy: still out: %u", pp->pr_nout);
#endif

	/* Remove all pages */
	while ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL)
		pr_rmpage(pp, ph, NULL);
	KASSERT(LIST_EMPTY(&pp->pr_fullpages));
	KASSERT(LIST_EMPTY(&pp->pr_partpages));

	/* Remove from global pool list */
	TAILQ_REMOVE(&pool_head, pp, pr_poollist);
}

struct pool_item_header *
pool_alloc_item_header(struct pool *pp, caddr_t storage, int flags)
{
	struct pool_item_header *ph;

	if ((pp->pr_roflags & PR_PHINPAGE) != 0)
		ph = (struct pool_item_header *)(storage + pp->pr_phoffset);
	else
		ph = pool_get(&phpool, (flags & ~(PR_WAITOK | PR_ZERO)) |
		    PR_NOWAIT);

	return (ph);
}

/*
 * Grab an item from the pool; must be called at appropriate spl level
 */
void *
pool_get(struct pool *pp, int flags)
{
	void *v;

	KASSERT(flags & (PR_WAITOK | PR_NOWAIT));

#ifdef DIAGNOSTIC
	if ((flags & PR_WAITOK) != 0)
		assertwaitok();
#endif /* DIAGNOSTIC */

	mtx_enter(&pp->pr_mtx);
	v = pool_do_get(pp, flags);
	mtx_leave(&pp->pr_mtx);
	if (v == NULL)
		return (v);

	if (pp->pr_ctor) {
		if (flags & PR_ZERO)
			panic("pool_get: PR_ZERO when ctor set");
		if (pp->pr_ctor(pp->pr_arg, v, flags)) {
			mtx_enter(&pp->pr_mtx);
			pool_do_put(pp, v);
			mtx_leave(&pp->pr_mtx);
			v = NULL;
		}
	} else {
		if (flags & PR_ZERO)
			memset(v, 0, pp->pr_size);
	}
	if (v != NULL)
		pp->pr_nget++;
	return (v);
}

void *
pool_do_get(struct pool *pp, int flags)
{
	struct pool_item *pi;
	struct pool_item_header *ph;
	void *v;
	int slowdown = 0;
#if defined(DIAGNOSTIC) && defined(POOL_DEBUG)
	int i, *ip;
#endif

#ifdef MALLOC_DEBUG
	if (pp->pr_roflags & PR_DEBUG) {
		void *addr;

		addr = NULL;
		debug_malloc(pp->pr_size, M_DEBUG,
		    (flags & PR_WAITOK) ? M_WAITOK : M_NOWAIT, &addr);
		return (addr);
	}
#endif

startover:
	/*
	 * Check to see if we've reached the hard limit.  If we have,
	 * and we can wait, then wait until an item has been returned to
	 * the pool.
	 */
#ifdef DIAGNOSTIC
	if (__predict_false(pp->pr_nout > pp->pr_hardlimit))
		panic("pool_do_get: %s: crossed hard limit", pp->pr_wchan);
#endif
	if (__predict_false(pp->pr_nout == pp->pr_hardlimit)) {
		if ((flags & PR_WAITOK) && !(flags & PR_LIMITFAIL)) {
			/*
			 * XXX: A warning isn't logged in this case.  Should
			 * it be?
			 */
			pp->pr_flags |= PR_WANTED;
			pool_sleep(pp);
			goto startover;
		}

		/*
		 * Log a message that the hard limit has been hit.
		 */
		if (pp->pr_hardlimit_warning != NULL &&
		    ratecheck(&pp->pr_hardlimit_warning_last,
		    &pp->pr_hardlimit_ratecap))
			log(LOG_ERR, "%s\n", pp->pr_hardlimit_warning);

		pp->pr_nfail++;
		return (NULL);
	}

	/*
	 * The convention we use is that if `curpage' is not NULL, then
	 * it points at a non-empty bucket. In particular, `curpage'
	 * never points at a page header which has PR_PHINPAGE set and
	 * has no items in its bucket.
	 */
	if ((ph = pp->pr_curpage) == NULL) {
#ifdef DIAGNOSTIC
		if (pp->pr_nitems != 0) {
			printf("pool_do_get: %s: curpage NULL, nitems %u\n",
			    pp->pr_wchan, pp->pr_nitems);
			panic("pool_do_get: nitems inconsistent");
		}
#endif

		/*
		 * Call the back-end page allocator for more memory.
		 */
		v = pool_allocator_alloc(pp, flags, &slowdown);
		if (__predict_true(v != NULL))
			ph = pool_alloc_item_header(pp, v, flags);

		if (__predict_false(v == NULL || ph == NULL)) {
			if (v != NULL)
				pool_allocator_free(pp, v);

			if ((flags & PR_WAITOK) == 0) {
				pp->pr_nfail++;
				return (NULL);
			}

			/*
			 * Wait for items to be returned to this pool.
			 *
			 * XXX: maybe we should wake up once a second and
			 * try again?
			 */
			pp->pr_flags |= PR_WANTED;
			pool_sleep(pp);
			goto startover;
		}

		/* We have more memory; add it to the pool */
		pool_prime_page(pp, v, ph);
		pp->pr_npagealloc++;

		if (slowdown && (flags & PR_WAITOK)) {
			mtx_leave(&pp->pr_mtx);
			yield();
			mtx_enter(&pp->pr_mtx);
		}

		/* Start the allocation process over. */
		goto startover;
	}
	if (__predict_false((v = pi = TAILQ_FIRST(&ph->ph_itemlist)) == NULL)) {
		panic("pool_do_get: %s: page empty", pp->pr_wchan);
	}
#ifdef DIAGNOSTIC
	if (__predict_false(pp->pr_nitems == 0)) {
		printf("pool_do_get: %s: items on itemlist, nitems %u\n",
		    pp->pr_wchan, pp->pr_nitems);
		panic("pool_do_get: nitems inconsistent");
	}
#endif

#ifdef DIAGNOSTIC
	if (__predict_false(pi->pi_magic != PI_MAGIC))
		panic("pool_do_get(%s): free list modified: "
		    "page %p; item addr %p; offset 0x%x=0x%x",
		    pp->pr_wchan, ph->ph_page, pi, 0, pi->pi_magic);
#ifdef POOL_DEBUG
	for (ip = (int *)pi, i = sizeof(*pi) / sizeof(int);
	    i < pp->pr_size / sizeof(int); i++) {
		if (ip[i] != PI_MAGIC) {
			panic("pool_do_get(%s): free list modified: "
			    "page %p; item addr %p; offset 0x%x=0x%x",
			    pp->pr_wchan, ph->ph_page, pi,
			    i * sizeof(int), ip[i]);
		}
	}
#endif /* POOL_DEBUG */
#endif /* DIAGNOSTIC */

	/*
	 * Remove from item list.
	 */
	TAILQ_REMOVE(&ph->ph_itemlist, pi, pi_list);
	pp->pr_nitems--;
	pp->pr_nout++;
	if (ph->ph_nmissing == 0) {
#ifdef DIAGNOSTIC
		if (__predict_false(pp->pr_nidle == 0))
			panic("pool_do_get: nidle inconsistent");
#endif
		pp->pr_nidle--;

		/*
		 * This page was previously empty.  Move it to the list of
		 * partially-full pages.  This page is already curpage.
		 */
		LIST_REMOVE(ph, ph_pagelist);
		LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist);
	}
	ph->ph_nmissing++;
	if (TAILQ_EMPTY(&ph->ph_itemlist)) {
#ifdef DIAGNOSTIC
		if (__predict_false(ph->ph_nmissing != pp->pr_itemsperpage)) {
			panic("pool_do_get: %s: nmissing inconsistent",
			    pp->pr_wchan);
		}
#endif
		/*
		 * This page is now full.  Move it to the full list
		 * and select a new current page.
		 */
		LIST_REMOVE(ph, ph_pagelist);
		LIST_INSERT_HEAD(&pp->pr_fullpages, ph, ph_pagelist);
		pool_update_curpage(pp);
	}

	/*
	 * If we have a low water mark and we are now below that low
	 * water mark, add more items to the pool.
	 */
	if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
		/*
		 * XXX: Should we log a warning?  Should we set up a timeout
		 * to try again in a second or so?  The latter could break
		 * a caller's assumptions about interrupt protection, etc.
		 */
	}
	return (v);
}

/*
 * Return resource to the pool; must be called at appropriate spl level
 */
void
pool_put(struct pool *pp, void *v)
{
	if (pp->pr_dtor)
		pp->pr_dtor(pp->pr_arg, v);
	mtx_enter(&pp->pr_mtx);
	pool_do_put(pp, v);
	mtx_leave(&pp->pr_mtx);
	pp->pr_nput++;
}

/*
 * Internal version of pool_put().
 */
void
pool_do_put(struct pool *pp, void *v)
{
	struct pool_item *pi = v;
	struct pool_item_header *ph;
#if defined(DIAGNOSTIC) && defined(POOL_DEBUG)
	int i, *ip;
#endif

	if (v == NULL)
		panic("pool_put of NULL");

#ifdef MALLOC_DEBUG
	if (pp->pr_roflags & PR_DEBUG) {
		debug_free(v, M_DEBUG);
		return;
	}
#endif

#ifdef DIAGNOSTIC
	if (pp->pr_ipl != -1)
		splassert(pp->pr_ipl);

	if (__predict_false(pp->pr_nout == 0)) {
		printf("pool %s: putting with none out\n",
		    pp->pr_wchan);
		panic("pool_do_put");
	}
#endif

	if (__predict_false((ph = pr_find_pagehead(pp, v)) == NULL)) {
		panic("pool_do_put: %s: page header missing", pp->pr_wchan);
	}

	/*
	 * Return to item list.
	 */
#ifdef DIAGNOSTIC
	pi->pi_magic = PI_MAGIC;
#ifdef POOL_DEBUG
	for (ip = (int *)pi, i = sizeof(*pi)/sizeof(int);
	    i < pp->pr_size / sizeof(int); i++)
		ip[i] = PI_MAGIC;
#endif /* POOL_DEBUG */
#endif /* DIAGNOSTIC */

	TAILQ_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list);
	ph->ph_nmissing--;
	pp->pr_nitems++;
	pp->pr_nout--;

	/* Cancel "pool empty" condition if it exists */
	if (pp->pr_curpage == NULL)
		pp->pr_curpage = ph;

	if (pp->pr_flags & PR_WANTED) {
		pp->pr_flags &= ~PR_WANTED;
		if (ph->ph_nmissing == 0)
			pp->pr_nidle++;
		wakeup(pp);
		return;
	}

	/*
	 * If this page is now empty, do one of two things:
	 *
	 *	(1) If we have more pages than the page high water mark,
	 *	    free the page back to the system.
	 *
	 *	(2) Otherwise, move the page to the empty page list.
	 *
	 * Either way, select a new current page (so we use a partially-full
	 * page if one is available).
	 */
	if (ph->ph_nmissing == 0) {
		pp->pr_nidle++;
		if (pp->pr_nidle > pp->pr_maxpages) {
			pr_rmpage(pp, ph, NULL);
		} else {
			LIST_REMOVE(ph, ph_pagelist);
			LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist);
		}
		pool_update_curpage(pp);
	}

	/*
	 * If the page was previously completely full, move it to the
	 * partially-full list and make it the current page.  The next
	 * allocation will get the item from this page, instead of
	 * further fragmenting the pool.
	 */
	else if (ph->ph_nmissing == (pp->pr_itemsperpage - 1)) {
		LIST_REMOVE(ph, ph_pagelist);
		LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist);
		pp->pr_curpage = ph;
	}
}

/*
 * Add N items to the pool.
 */
int
pool_prime(struct pool *pp, int n)
{
	struct pool_item_header *ph;
	caddr_t cp;
	int newpages;
	int slowdown;

	mtx_enter(&pp->pr_mtx);
	newpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;

	while (newpages-- > 0) {
		cp = pool_allocator_alloc(pp, PR_NOWAIT, &slowdown);
		if (__predict_true(cp != NULL))
			ph = pool_alloc_item_header(pp, cp, PR_NOWAIT);
		if (__predict_false(cp == NULL || ph == NULL)) {
			if (cp != NULL)
				pool_allocator_free(pp, cp);
			break;
		}

		pool_prime_page(pp, cp, ph);
		pp->pr_npagealloc++;
		pp->pr_minpages++;
	}

	if (pp->pr_minpages >= pp->pr_maxpages)
		pp->pr_maxpages = pp->pr_minpages + 1;	/* XXX */

	mtx_leave(&pp->pr_mtx);
	return (0);
}
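
/*
 * E.g. (illustrative) pool_prime(&foopool, 64) preallocates enough pages
 * to hold 64 items and raises pr_minpages so that pool_reclaim() will
 * not give those pages back.
 */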

/*
 * Add a page worth of items to the pool.
 *
 * Note, we must be called with the pool descriptor LOCKED.
 */
void
pool_prime_page(struct pool *pp, caddr_t storage, struct pool_item_header *ph)
{
	struct pool_item *pi;
	caddr_t cp = storage;
	unsigned int align = pp->pr_align;
	unsigned int ioff = pp->pr_itemoffset;
	int n;
#if defined(DIAGNOSTIC) && defined(POOL_DEBUG)
	int i, *ip;
#endif

	/*
	 * Insert page header.
	 */
	LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist);
	TAILQ_INIT(&ph->ph_itemlist);
	ph->ph_page = storage;
	ph->ph_pagesize = pp->pr_alloc->pa_pagesz;
	ph->ph_nmissing = 0;
	if ((pp->pr_roflags & PR_PHINPAGE) == 0)
		RB_INSERT(phtree, &pp->pr_phtree, ph);

	pp->pr_nidle++;

	/*
	 * Color this page.
	 */
	cp = (caddr_t)(cp + pp->pr_curcolor);
	if ((pp->pr_curcolor += align) > pp->pr_maxcolor)
		pp->pr_curcolor = 0;

	/*
	 * Adjust storage to apply alignment to `pr_itemoffset' in each item.
	 */
	if (ioff != 0)
		cp = (caddr_t)(cp + (align - ioff));
	ph->ph_colored = cp;

	/*
	 * Insert remaining chunks on the bucket list.
	 */
	n = pp->pr_itemsperpage;
	pp->pr_nitems += n;

	while (n--) {
		pi = (struct pool_item *)cp;

		KASSERT(((((vaddr_t)pi) + ioff) & (align - 1)) == 0);

		/* Insert on page list */
		TAILQ_INSERT_TAIL(&ph->ph_itemlist, pi, pi_list);

#ifdef DIAGNOSTIC
		pi->pi_magic = PI_MAGIC;
#ifdef POOL_DEBUG
		for (ip = (int *)pi, i = sizeof(*pi)/sizeof(int);
		    i < pp->pr_size / sizeof(int); i++)
			ip[i] = PI_MAGIC;
#endif /* POOL_DEBUG */
#endif /* DIAGNOSTIC */
		cp = (caddr_t)(cp + pp->pr_size);
	}

	/*
	 * If the pool was depleted, point at the new page.
	 */
	if (pp->pr_curpage == NULL)
		pp->pr_curpage = ph;

	if (++pp->pr_npages > pp->pr_hiwat)
		pp->pr_hiwat = pp->pr_npages;
}

/*
 * Used by pool_get() when nitems drops below the low water mark.  This
 * is used to catch up pr_nitems with the low water mark.
 *
 * Note we never wait for memory here, we let the caller decide what to do.
 */
int
pool_catchup(struct pool *pp)
{
	struct pool_item_header *ph;
	caddr_t cp;
	int error = 0;
	int slowdown;

	while (POOL_NEEDS_CATCHUP(pp)) {
		/*
		 * Call the page back-end allocator for more memory.
		 */
		cp = pool_allocator_alloc(pp, PR_NOWAIT, &slowdown);
		if (__predict_true(cp != NULL))
			ph = pool_alloc_item_header(pp, cp, PR_NOWAIT);
		if (__predict_false(cp == NULL || ph == NULL)) {
			if (cp != NULL)
				pool_allocator_free(pp, cp);
			error = ENOMEM;
			break;
		}
		pool_prime_page(pp, cp, ph);
		pp->pr_npagealloc++;
	}

	return (error);
}

void
pool_update_curpage(struct pool *pp)
{

	pp->pr_curpage = LIST_FIRST(&pp->pr_partpages);
	if (pp->pr_curpage == NULL) {
		pp->pr_curpage = LIST_FIRST(&pp->pr_emptypages);
	}
}

void
pool_setlowat(struct pool *pp, int n)
{

	pp->pr_minitems = n;
	pp->pr_minpages = (n == 0)
		? 0
		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;

	mtx_enter(&pp->pr_mtx);
	/* Make sure we're caught up with the newly-set low water mark. */
	if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
		/*
		 * XXX: Should we log a warning?  Should we set up a timeout
		 * to try again in a second or so?  The latter could break
		 * a caller's assumptions about interrupt protection, etc.
		 */
	}
	mtx_leave(&pp->pr_mtx);
}

void
pool_sethiwat(struct pool *pp, int n)
{

	pp->pr_maxpages = (n == 0)
		? 0
		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
}
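
/*
 * Illustrative use of the water marks (hypothetical pool):
 *
 *	pool_setlowat(&foopool, 32);
 *	pool_sethiwat(&foopool, 256);
 *
 * keeps at least 32 items ready via pool_catchup(), while pool_do_put()
 * starts releasing empty pages back to the allocator once more than
 * roundup(256, pr_itemsperpage) / pr_itemsperpage idle pages accumulate.
 */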

int
pool_sethardlimit(struct pool *pp, u_int n, const char *warnmsg, int ratecap)
{
	int error = 0;

	if (n < pp->pr_nout) {
		error = EINVAL;
		goto done;
	}

	pp->pr_hardlimit = n;
	pp->pr_hardlimit_warning = warnmsg;
	pp->pr_hardlimit_ratecap.tv_sec = ratecap;
	pp->pr_hardlimit_warning_last.tv_sec = 0;
	pp->pr_hardlimit_warning_last.tv_usec = 0;

	/*
	 * In-line version of pool_sethiwat().
	 */
	pp->pr_maxpages = (n == 0 || n == UINT_MAX)
		? n
		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;

done:
	return (error);
}
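
/*
 * Example (illustrative): cap a pool at 1024 outstanding items and log
 * the given message at most once per 60 seconds when the limit is hit:
 *
 *	pool_sethardlimit(&foopool, 1024, "foopool: hard limit reached", 60);
 */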

void
pool_set_constraints(struct pool *pp, struct uvm_constraint_range *range,
    int nsegs)
{
	/*
	 * Subsequent changes to the constraints are only
	 * allowed to make them _more_ strict.
	 */
	KASSERT(pp->pr_crange->ucr_high >= range->ucr_high &&
	    pp->pr_crange->ucr_low <= range->ucr_low);

	pp->pr_crange = range;
	pp->pr_pa_nsegs = nsegs;
}

void
pool_set_ctordtor(struct pool *pp, int (*ctor)(void *, void *, int),
    void (*dtor)(void *, void *), void *arg)
{
	pp->pr_ctor = ctor;
	pp->pr_dtor = dtor;
	pp->pr_arg = arg;
}
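
/*
 * Example ctor/dtor pair (an illustrative sketch; foo_ctor and foo_dtor
 * are hypothetical).  A ctor returning non-zero makes pool_get() undo
 * the allocation and return NULL:
 *
 *	int
 *	foo_ctor(void *arg, void *v, int flags)
 *	{
 *		struct foo *f = v;
 *
 *		f->f_refcnt = 1;
 *		return (0);
 *	}
 *
 *	pool_set_ctordtor(&foopool, foo_ctor, foo_dtor, NULL);
 */
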
/*
 * Release all complete pages that have not been used recently.
 *
 * Returns non-zero if any pages have been reclaimed.
 */
int
pool_reclaim(struct pool *pp)
{
	struct pool_item_header *ph, *phnext;
	struct pool_pagelist pq;

	LIST_INIT(&pq);

	mtx_enter(&pp->pr_mtx);
	for (ph = LIST_FIRST(&pp->pr_emptypages); ph != NULL; ph = phnext) {
		phnext = LIST_NEXT(ph, ph_pagelist);

		/* Check our minimum page claim */
		if (pp->pr_npages <= pp->pr_minpages)
			break;

		KASSERT(ph->ph_nmissing == 0);

		/*
		 * If freeing this page would put us below
		 * the low water mark, stop now.
		 */
		if ((pp->pr_nitems - pp->pr_itemsperpage) <
		    pp->pr_minitems)
			break;

		pr_rmpage(pp, ph, &pq);
	}
	mtx_leave(&pp->pr_mtx);

	if (LIST_EMPTY(&pq))
		return (0);
	while ((ph = LIST_FIRST(&pq)) != NULL) {
		LIST_REMOVE(ph, ph_pagelist);
		pool_allocator_free(pp, ph->ph_page);
		if (pp->pr_roflags & PR_PHINPAGE)
			continue;
		pool_put(&phpool, ph);
	}

	return (1);
}

#ifdef DDB
#include <machine/db_machdep.h>
#include <ddb/db_interface.h>
#include <ddb/db_output.h>

/*
 * Diagnostic helpers.
 */
void
pool_printit(struct pool *pp, const char *modif, int (*pr)(const char *, ...))
{
	pool_print1(pp, modif, pr);
}

void
pool_print_pagelist(struct pool_pagelist *pl, int (*pr)(const char *, ...))
{
	struct pool_item_header *ph;
#ifdef DIAGNOSTIC
	struct pool_item *pi;
#endif

	LIST_FOREACH(ph, pl, ph_pagelist) {
		(*pr)("\t\tpage %p, nmissing %d\n",
		    ph->ph_page, ph->ph_nmissing);
#ifdef DIAGNOSTIC
		TAILQ_FOREACH(pi, &ph->ph_itemlist, pi_list) {
			if (pi->pi_magic != PI_MAGIC) {
				(*pr)("\t\t\titem %p, magic 0x%x\n",
				    pi, pi->pi_magic);
			}
		}
#endif
	}
}

void
pool_print1(struct pool *pp, const char *modif, int (*pr)(const char *, ...))
{
	struct pool_item_header *ph;
	int print_pagelist = 0;
	char c;

	while ((c = *modif++) != '\0') {
		if (c == 'p')
			print_pagelist = 1;
	}

	(*pr)("POOL %s: size %u, align %u, ioff %u, roflags 0x%08x\n",
	    pp->pr_wchan, pp->pr_size, pp->pr_align, pp->pr_itemoffset,
	    pp->pr_roflags);
	(*pr)("\talloc %p\n", pp->pr_alloc);
	(*pr)("\tminitems %u, minpages %u, maxpages %u, npages %u\n",
	    pp->pr_minitems, pp->pr_minpages, pp->pr_maxpages, pp->pr_npages);
	(*pr)("\titemsperpage %u, nitems %u, nout %u, hardlimit %u\n",
	    pp->pr_itemsperpage, pp->pr_nitems, pp->pr_nout, pp->pr_hardlimit);

	(*pr)("\n\tnget %lu, nfail %lu, nput %lu\n",
	    pp->pr_nget, pp->pr_nfail, pp->pr_nput);
	(*pr)("\tnpagealloc %lu, npagefree %lu, hiwat %u, nidle %lu\n",
	    pp->pr_npagealloc, pp->pr_npagefree, pp->pr_hiwat, pp->pr_nidle);

	if (print_pagelist == 0)
		return;

	if ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL)
		(*pr)("\n\tempty page list:\n");
	pool_print_pagelist(&pp->pr_emptypages, pr);
	if ((ph = LIST_FIRST(&pp->pr_fullpages)) != NULL)
		(*pr)("\n\tfull page list:\n");
	pool_print_pagelist(&pp->pr_fullpages, pr);
	if ((ph = LIST_FIRST(&pp->pr_partpages)) != NULL)
		(*pr)("\n\tpartial-page list:\n");
	pool_print_pagelist(&pp->pr_partpages, pr);

	if (pp->pr_curpage == NULL)
		(*pr)("\tno current page\n");
	else
		(*pr)("\tcurpage %p\n", pp->pr_curpage->ph_page);
}

void
db_show_all_pools(db_expr_t expr, int haddr, db_expr_t count, char *modif)
{
	struct pool *pp;
	char maxp[16];
	int ovflw;
	char mode;

	mode = modif[0];
	if (mode != '\0' && mode != 'a') {
		db_printf("usage: show all pools [/a]\n");
		return;
	}

	if (mode == '\0')
		db_printf("%-10s%4s%9s%5s%9s%6s%6s%6s%6s%6s%6s%5s\n",
		    "Name",
		    "Size",
		    "Requests",
		    "Fail",
		    "Releases",
		    "Pgreq",
		    "Pgrel",
		    "Npage",
		    "Hiwat",
		    "Minpg",
		    "Maxpg",
		    "Idle");
	else
		db_printf("%-10s %18s %18s\n",
		    "Name", "Address", "Allocator");

	TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
		if (mode == 'a') {
			db_printf("%-10s %18p %18p\n", pp->pr_wchan, pp,
			    pp->pr_alloc);
			continue;
		}

		if (!pp->pr_nget)
			continue;

		if (pp->pr_maxpages == UINT_MAX)
			snprintf(maxp, sizeof maxp, "inf");
		else
			snprintf(maxp, sizeof maxp, "%u", pp->pr_maxpages);

#define PRWORD(ovflw, fmt, width, fixed, val) do {	\
	(ovflw) += db_printf((fmt),			\
	    (width) - (fixed) - (ovflw) > 0 ?		\
	    (width) - (fixed) - (ovflw) : 0,		\
	    (val)) - (width);				\
	if ((ovflw) < 0)				\
		(ovflw) = 0;				\
} while (/* CONSTCOND */0)

		ovflw = 0;
		PRWORD(ovflw, "%-*s", 10, 0, pp->pr_wchan);
		PRWORD(ovflw, " %*u", 4, 1, pp->pr_size);
		PRWORD(ovflw, " %*lu", 9, 1, pp->pr_nget);
		PRWORD(ovflw, " %*lu", 5, 1, pp->pr_nfail);
		PRWORD(ovflw, " %*lu", 9, 1, pp->pr_nput);
		PRWORD(ovflw, " %*lu", 6, 1, pp->pr_npagealloc);
		PRWORD(ovflw, " %*lu", 6, 1, pp->pr_npagefree);
		PRWORD(ovflw, " %*d", 6, 1, pp->pr_npages);
		PRWORD(ovflw, " %*d", 6, 1, pp->pr_hiwat);
		PRWORD(ovflw, " %*d", 6, 1, pp->pr_minpages);
		PRWORD(ovflw, " %*s", 6, 1, maxp);
		PRWORD(ovflw, " %*lu\n", 5, 1, pp->pr_nidle);

		pool_chk(pp, pp->pr_wchan);
	}
}

int
pool_chk_page(struct pool *pp, const char *label, struct pool_item_header *ph)
{
	struct pool_item *pi;
	caddr_t page;
	int n;
#if defined(DIAGNOSTIC) && defined(POOL_DEBUG)
	int i, *ip;
#endif

	page = (caddr_t)((u_long)ph & pp->pr_alloc->pa_pagemask);
	if (page != ph->ph_page &&
	    (pp->pr_roflags & PR_PHINPAGE) != 0) {
		if (label != NULL)
			printf("%s: ", label);
		printf("pool(%p:%s): page inconsistency: page %p; "
		    "at page head addr %p (p %p)\n",
		    pp, pp->pr_wchan, ph->ph_page, ph, page);
		return 1;
	}

	for (pi = TAILQ_FIRST(&ph->ph_itemlist), n = 0;
	     pi != NULL;
	     pi = TAILQ_NEXT(pi,pi_list), n++) {

#ifdef DIAGNOSTIC
		if (pi->pi_magic != PI_MAGIC) {
			if (label != NULL)
				printf("%s: ", label);
			printf("pool(%s): free list modified: "
			    "page %p; item ordinal %d; addr %p "
			    "(p %p); offset 0x%x=0x%x\n",
			    pp->pr_wchan, ph->ph_page, n, pi, page,
			    0, pi->pi_magic);
		}
#ifdef POOL_DEBUG
		for (ip = (int *)pi, i = sizeof(*pi) / sizeof(int);
		    i < pp->pr_size / sizeof(int); i++) {
			if (ip[i] != PI_MAGIC) {
				printf("pool(%s): free list modified: "
				    "page %p; item ordinal %d; addr %p "
				    "(p %p); offset 0x%x=0x%x\n",
				    pp->pr_wchan, ph->ph_page, n, pi,
				    page, i * sizeof(int), ip[i]);
			}
		}
#endif /* POOL_DEBUG */
#endif /* DIAGNOSTIC */
		page = (caddr_t)((u_long)pi & pp->pr_alloc->pa_pagemask);
		if (page == ph->ph_page)
			continue;

		if (label != NULL)
			printf("%s: ", label);
		printf("pool(%p:%s): page inconsistency: page %p;"
		    " item ordinal %d; addr %p (p %p)\n", pp,
		    pp->pr_wchan, ph->ph_page, n, pi, page);
		return 1;
	}
	return 0;
}

int
pool_chk(struct pool *pp, const char *label)
{
	struct pool_item_header *ph;
	int r = 0;

	LIST_FOREACH(ph, &pp->pr_emptypages, ph_pagelist)
		r += pool_chk_page(pp, label, ph);
	LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist)
		r += pool_chk_page(pp, label, ph);
	LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist)
		r += pool_chk_page(pp, label, ph);

	return (r);
}

void
pool_walk(struct pool *pp, int full, int (*pr)(const char *, ...),
    void (*func)(void *, int, int (*)(const char *, ...)))
{
	struct pool_item_header *ph;
	struct pool_item *pi;
	caddr_t cp;
	int n;

	LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) {
		cp = ph->ph_colored;
		n = ph->ph_nmissing;

		while (n--) {
			func(cp, full, pr);
			cp += pp->pr_size;
		}
	}

	LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist) {
		cp = ph->ph_colored;
		n = ph->ph_nmissing;

		do {
			TAILQ_FOREACH(pi, &ph->ph_itemlist, pi_list) {
				if (cp == (caddr_t)pi)
					break;
			}
			if (cp != (caddr_t)pi) {
				func(cp, full, pr);
				n--;
			}

			cp += pp->pr_size;
		} while (n > 0);
	}
}
#endif

/*
 * We have three different sysctls.
 * kern.pool.npools - the number of pools.
 * kern.pool.pool.<pool#> - the pool struct for the pool#.
 * kern.pool.name.<pool#> - the name for pool#.
 */
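/*
 * Illustrative userland sketch (error handling omitted): the pool count
 * is read with
 *
 *	int mib[] = { CTL_KERN, KERN_POOL, KERN_POOL_NPOOLS };
 *	int npools;
 *	size_t size = sizeof(npools);
 *
 *	sysctl(mib, 3, &npools, &size, NULL, 0);
 *
 * and an individual pool or its name is fetched with KERN_POOL_POOL or
 * KERN_POOL_NAME plus the pool's serial number as a fourth mib element.
 */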
int
sysctl_dopool(int *name, u_int namelen, char *where, size_t *sizep)
{
	struct pool *pp, *foundpool = NULL;
	size_t buflen = where != NULL ? *sizep : 0;
	int npools = 0, s;
	unsigned int lookfor;
	size_t len;

	switch (*name) {
	case KERN_POOL_NPOOLS:
		if (namelen != 1 || buflen != sizeof(int))
			return (EINVAL);
		lookfor = 0;
		break;
	case KERN_POOL_NAME:
		if (namelen != 2 || buflen < 1)
			return (EINVAL);
		lookfor = name[1];
		break;
	case KERN_POOL_POOL:
		if (namelen != 2 || buflen != sizeof(struct pool))
			return (EINVAL);
		lookfor = name[1];
		break;
	default:
		return (EINVAL);
	}

	s = splvm();

	TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
		npools++;
		if (lookfor == pp->pr_serial) {
			foundpool = pp;
			break;
		}
	}

	splx(s);

	if (*name != KERN_POOL_NPOOLS && foundpool == NULL)
		return (ENOENT);

	switch (*name) {
	case KERN_POOL_NPOOLS:
		return copyout(&npools, where, buflen);
	case KERN_POOL_NAME:
		len = strlen(foundpool->pr_wchan) + 1;
		if (*sizep < len)
			return (ENOMEM);
		*sizep = len;
		return copyout(foundpool->pr_wchan, where, len);
	case KERN_POOL_POOL:
		return copyout(foundpool, where, buflen);
	}
	/* NOTREACHED */
	return (0); /* XXX - Stupid gcc */
}

/*
 * Pool backend allocators.
 *
 * Each pool has a backend allocator that handles allocation and
 * deallocation of the backing pages.
 */
void	*pool_page_alloc(struct pool *, int, int *);
void	pool_page_free(struct pool *, void *);

/*
 * Safe for interrupts; the name is preserved only for compatibility.
 * This is the default allocator.
 */
struct pool_allocator pool_allocator_nointr = {
	pool_page_alloc, pool_page_free, 0,
};

/*
 * XXX - we have at least three different resources for the same allocation
 *  and each resource can be depleted. First we have the ready elements in
 *  the pool. Then we have the resource (typically a vm_map) for this
 *  allocator, then we have physical memory. Waiting for any of these can
 *  be unnecessary when any other is freed, but the kernel doesn't support
 *  sleeping on multiple addresses, so we have to fake it. The caller sleeps
 *  on the pool (so that we can be awakened when an item is returned to the
 *  pool), but we set PA_WANT on the allocator. When a page is returned to
 *  the allocator and PA_WANT is set, pool_allocator_free will wake up all
 *  sleeping pools belonging to this allocator. (XXX - thundering herd).
 *  We also wake up the allocator in case someone without a pool (malloc)
 *  is sleeping waiting for this allocator.
 */

void *
pool_allocator_alloc(struct pool *pp, int flags, int *slowdown)
{
	boolean_t waitok = (flags & PR_WAITOK) ? TRUE : FALSE;
	void *v;

	if (waitok)
		mtx_leave(&pp->pr_mtx);
	v = pp->pr_alloc->pa_alloc(pp, flags, slowdown);
	if (waitok)
		mtx_enter(&pp->pr_mtx);

	return (v);
}

void
pool_allocator_free(struct pool *pp, void *v)
{
	struct pool_allocator *pa = pp->pr_alloc;

	(*pa->pa_free)(pp, v);
}

void *
pool_page_alloc(struct pool *pp, int flags, int *slowdown)
{
	int kfl = (flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT;

	return (uvm_km_getpage_pla(kfl, slowdown, pp->pr_crange->ucr_low,
	    pp->pr_crange->ucr_high, 0, 0));
}

void
pool_page_free(struct pool *pp, void *v)
{
	uvm_km_putpage(v);
}

void *
pool_large_alloc(struct pool *pp, int flags, int *slowdown)
{
	int kfl = (flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT;
	vaddr_t va;
	int s;

	s = splvm();
	va = uvm_km_kmemalloc_pla(kmem_map, NULL, pp->pr_alloc->pa_pagesz, 0,
	    kfl, pp->pr_crange->ucr_low, pp->pr_crange->ucr_high,
	    0, 0, pp->pr_pa_nsegs);
	splx(s);

	return ((void *)va);
}

void
pool_large_free(struct pool *pp, void *v)
{
	int s;

	s = splvm();
	uvm_km_free(kmem_map, (vaddr_t)v, pp->pr_alloc->pa_pagesz);
	splx(s);
}

void *
pool_large_alloc_ni(struct pool *pp, int flags, int *slowdown)
{
	int kfl = (flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT;

	return ((void *)uvm_km_kmemalloc_pla(kernel_map, uvm.kernel_object,
	    pp->pr_alloc->pa_pagesz, 0, kfl,
	    pp->pr_crange->ucr_low, pp->pr_crange->ucr_high,
	    0, 0, pp->pr_pa_nsegs));
}

void
pool_large_free_ni(struct pool *pp, void *v)
{
	uvm_km_free(kernel_map, (vaddr_t)v, pp->pr_alloc->pa_pagesz);
}