1 /*	$OpenBSD: subr_pool.c,v 1.38 2002/12/20 07:48:00 art Exp $	*/
2 /*	$NetBSD: subr_pool.c,v 1.61 2001/09/26 07:14:56 chs Exp $	*/
3 
4 /*-
5  * Copyright (c) 1997, 1999, 2000 The NetBSD Foundation, Inc.
6  * All rights reserved.
7  *
8  * This code is derived from software contributed to The NetBSD Foundation
9  * by Paul Kranenburg; by Jason R. Thorpe of the Numerical Aerospace
10  * Simulation Facility, NASA Ames Research Center.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. All advertising materials mentioning features or use of this software
21  *    must display the following acknowledgement:
22  *	This product includes software developed by the NetBSD
23  *	Foundation, Inc. and its contributors.
24  * 4. Neither the name of The NetBSD Foundation nor the names of its
25  *    contributors may be used to endorse or promote products derived
26  *    from this software without specific prior written permission.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
29  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
30  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
31  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
32  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38  * POSSIBILITY OF SUCH DAMAGE.
39  */
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/proc.h>
44 #include <sys/errno.h>
45 #include <sys/kernel.h>
46 #include <sys/malloc.h>
47 #include <sys/lock.h>
48 #include <sys/pool.h>
49 #include <sys/syslog.h>
50 #include <sys/sysctl.h>
51 
52 #include <uvm/uvm.h>
53 
54 /*
55  * XXX - for now.
56  */
57 #define SIMPLELOCK_INITIALIZER { SLOCK_UNLOCKED }
58 #ifdef LOCKDEBUG
59 #define simple_lock_freecheck(a, s) do { /* nothing */ } while (0)
60 #define simple_lock_only_held(lkp, str) do { /* nothing */ } while (0)
61 #endif
62 
63 /*
64  * Pool resource management utility.
65  *
66  * Memory is allocated in pages which are split into pieces according
67  * to the pool item size. Each page is kept on a list headed by `pr_pagelist'
68  * in the pool structure and the individual pool items are on a linked list
69  * headed by `ph_itemlist' in each page header. The memory for building
70  * the page list is either taken from the allocated pages themselves (for
71  * small pool items) or taken from an internal pool of page headers (`phpool').
72  */
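
/*
 * A minimal usage sketch for a hypothetical "foo" consumer (the names
 * foo_pool, struct foo and "foopl" are illustrative only):
 *
 *	struct pool foo_pool;
 *
 *	pool_init(&foo_pool, sizeof(struct foo), 0, 0, 0, "foopl", NULL);
 *
 *	fp = pool_get(&foo_pool, PR_WAITOK);
 *	...
 *	pool_put(&foo_pool, fp);
 */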
73 
74 /* List of all pools */
75 TAILQ_HEAD(,pool) pool_head = TAILQ_HEAD_INITIALIZER(pool_head);
76 
77 /* Private pool for page header structures */
78 static struct pool phpool;
79 
80 /* # of seconds to retain page after last use */
81 int pool_inactive_time = 10;
82 
83 /* Next candidate for drainage (see pool_drain()) */
84 static struct pool	*drainpp;
85 
86 /* This spin lock protects both pool_head and drainpp. */
87 struct simplelock pool_head_slock = SIMPLELOCK_INITIALIZER;
88 
89 struct pool_item_header {
90 	/* Page headers */
91 	TAILQ_ENTRY(pool_item_header)
92 				ph_pagelist;	/* pool page list */
93 	TAILQ_HEAD(,pool_item)	ph_itemlist;	/* chunk list for this page */
94 	LIST_ENTRY(pool_item_header)
95 				ph_hashlist;	/* Off-page page headers */
96 	int			ph_nmissing;	/* # of chunks in use */
97 	caddr_t			ph_page;	/* this page's address */
98 	struct timeval		ph_time;	/* last referenced */
99 };
100 TAILQ_HEAD(pool_pagelist,pool_item_header);
101 
102 struct pool_item {
103 #ifdef DIAGNOSTIC
104 	int pi_magic;
105 #endif
106 #define	PI_MAGIC 0xdeafbeef
107 	/* Other entries use only this list entry */
108 	TAILQ_ENTRY(pool_item)	pi_list;
109 };
110 
111 #define	PR_HASH_INDEX(pp,addr) \
112 	(((u_long)(addr) >> (pp)->pr_alloc->pa_pageshift) & (PR_HASHTABSIZE - 1))
113 
114 #define	POOL_NEEDS_CATCHUP(pp)						\
115 	((pp)->pr_nitems < (pp)->pr_minitems)
116 
117 /*
118  * Every pool gets a unique serial number assigned to it. If this counter
119  * wraps, we're screwed, but we shouldn't create so many pools anyway.
120  */
121 unsigned int pool_serial;
122 
123 /*
124  * Pool cache management.
125  *
126  * Pool caches provide a way for constructed objects to be cached by the
127  * pool subsystem.  This can lead to performance improvements by avoiding
128  * needless object construction/destruction; it is deferred until absolutely
129  * necessary.
130  *
131  * Caches are grouped into cache groups.  Each cache group references
132  * up to 16 constructed objects.  When a cache allocates an object
133  * from the pool, it calls the object's constructor and places it into
134  * a cache group.  When a cache group frees an object back to the pool,
135  * it first calls the object's destructor.  This allows the object to
136  * persist in constructed form while freed to the cache.
137  *
138  * Multiple caches may exist for each pool.  This allows a single
139  * object type to have multiple constructed forms.  The pool references
140  * each cache, so that when a pool is drained by the pagedaemon, it can
141  * drain each individual cache as well.  Each time a cache is drained,
142  * the most idle cache group is freed to the pool in its entirety.
143  *
144  * Pool caches are laid on top of pools.  By layering them, we can avoid
145  * the complexity of cache management for pools which would not benefit
146  * from it.
147  */
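
/*
 * A minimal cache usage sketch; foo_cache, foo_ctor and foo_dtor are
 * hypothetical, only the pool_cache_* calls are defined below:
 *
 *	struct pool_cache foo_cache;
 *
 *	pool_cache_init(&foo_cache, &foo_pool, foo_ctor, foo_dtor, NULL);
 *
 *	fp = pool_cache_get(&foo_cache, PR_WAITOK);
 *	...
 *	pool_cache_put(&foo_cache, fp);
 */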
148 
149 /* The cache group pool. */
150 static struct pool pcgpool;
151 
152 /* The pool cache group. */
153 #define	PCG_NOBJECTS		16
154 struct pool_cache_group {
155 	TAILQ_ENTRY(pool_cache_group)
156 		pcg_list;	/* link in the pool cache's group list */
157 	u_int	pcg_avail;	/* # available objects */
158 				/* pointers to the objects */
159 	void	*pcg_objects[PCG_NOBJECTS];
160 };
161 
162 void	pool_cache_reclaim(struct pool_cache *);
163 void	pool_cache_do_invalidate(struct pool_cache *, int,
164     void (*)(struct pool *, void *));
165 
166 int	pool_catchup(struct pool *);
167 void	pool_prime_page(struct pool *, caddr_t, struct pool_item_header *);
168 void	pool_do_put(struct pool *, void *);
169 void	pr_rmpage(struct pool *, struct pool_item_header *,
170     struct pool_pagelist *);
171 
172 void	*pool_allocator_alloc(struct pool *, int);
173 void	pool_allocator_free(struct pool *, void *);
174 
175 void pool_print1(struct pool *, const char *, int (*)(const char *, ...));
176 
177 /*
178  * Pool log entry. An array of these is allocated in pool_init().
179  */
180 struct pool_log {
181 	const char	*pl_file;
182 	long		pl_line;
183 	int		pl_action;
184 #define	PRLOG_GET	1
185 #define	PRLOG_PUT	2
186 	void		*pl_addr;
187 };
188 
189 /* Number of entries in pool log buffers */
190 #ifndef POOL_LOGSIZE
191 #define	POOL_LOGSIZE	10
192 #endif
193 
194 int pool_logsize = POOL_LOGSIZE;
195 
196 #ifdef POOL_DIAGNOSTIC
197 static __inline void
198 pr_log(struct pool *pp, void *v, int action, const char *file, long line)
199 {
200 	int n = pp->pr_curlogentry;
201 	struct pool_log *pl;
202 
203 	if ((pp->pr_roflags & PR_LOGGING) == 0)
204 		return;
205 
206 	/*
207 	 * Fill in the current entry. Wrap around and overwrite
208 	 * the oldest entry if necessary.
209 	 */
210 	pl = &pp->pr_log[n];
211 	pl->pl_file = file;
212 	pl->pl_line = line;
213 	pl->pl_action = action;
214 	pl->pl_addr = v;
215 	if (++n >= pp->pr_logsize)
216 		n = 0;
217 	pp->pr_curlogentry = n;
218 }
219 
220 static void
221 pr_printlog(struct pool *pp, struct pool_item *pi,
222     int (*pr)(const char *, ...))
223 {
224 	int i = pp->pr_logsize;
225 	int n = pp->pr_curlogentry;
226 
227 	if ((pp->pr_roflags & PR_LOGGING) == 0)
228 		return;
229 
230 	/*
231 	 * Print all entries in this pool's log.
232 	 */
233 	while (i-- > 0) {
234 		struct pool_log *pl = &pp->pr_log[n];
235 		if (pl->pl_action != 0) {
236 			if (pi == NULL || pi == pl->pl_addr) {
237 				(*pr)("\tlog entry %d:\n", i);
238 				(*pr)("\t\taction = %s, addr = %p\n",
239 				    pl->pl_action == PRLOG_GET ? "get" : "put",
240 				    pl->pl_addr);
241 				(*pr)("\t\tfile: %s at line %lu\n",
242 				    pl->pl_file, pl->pl_line);
243 			}
244 		}
245 		if (++n >= pp->pr_logsize)
246 			n = 0;
247 	}
248 }
249 
250 static __inline void
251 pr_enter(struct pool *pp, const char *file, long line)
252 {
253 
254 	if (__predict_false(pp->pr_entered_file != NULL)) {
255 		printf("pool %s: reentrancy at file %s line %ld\n",
256 		    pp->pr_wchan, file, line);
257 		printf("         previous entry at file %s line %ld\n",
258 		    pp->pr_entered_file, pp->pr_entered_line);
259 		panic("pr_enter");
260 	}
261 
262 	pp->pr_entered_file = file;
263 	pp->pr_entered_line = line;
264 }
265 
266 static __inline void
267 pr_leave(struct pool *pp)
268 {
269 
270 	if (__predict_false(pp->pr_entered_file == NULL)) {
271 		printf("pool %s not entered?\n", pp->pr_wchan);
272 		panic("pr_leave");
273 	}
274 
275 	pp->pr_entered_file = NULL;
276 	pp->pr_entered_line = 0;
277 }
278 
279 static __inline void
280 pr_enter_check(struct pool *pp, int (*pr)(const char *, ...))
281 {
282 
283 	if (pp->pr_entered_file != NULL)
284 		(*pr)("\n\tcurrently entered from file %s line %ld\n",
285 		    pp->pr_entered_file, pp->pr_entered_line);
286 }
287 #else
288 #define	pr_log(pp, v, action, file, line)
289 #define	pr_printlog(pp, pi, pr)
290 #define	pr_enter(pp, file, line)
291 #define	pr_leave(pp)
292 #define	pr_enter_check(pp, pr)
293 #endif /* POOL_DIAGNOSTIC */
294 
295 /*
296  * Return the pool page header based on page address.
297  */
298 static __inline struct pool_item_header *
299 pr_find_pagehead(struct pool *pp, caddr_t page)
300 {
301 	struct pool_item_header *ph;
302 
303 	if ((pp->pr_roflags & PR_PHINPAGE) != 0)
304 		return ((struct pool_item_header *)(page + pp->pr_phoffset));
305 
306 	for (ph = LIST_FIRST(&pp->pr_hashtab[PR_HASH_INDEX(pp, page)]);
307 	     ph != NULL;
308 	     ph = LIST_NEXT(ph, ph_hashlist)) {
309 		if (ph->ph_page == page)
310 			return (ph);
311 	}
312 	return (NULL);
313 }
314 
315 /*
316  * Remove a page from the pool.
317  */
318 void
319 pr_rmpage(struct pool *pp, struct pool_item_header *ph,
320      struct pool_pagelist *pq)
321 {
322 	int s;
323 
324 	/*
325 	 * If the page was idle, decrement the idle page count.
326 	 */
327 	if (ph->ph_nmissing == 0) {
328 #ifdef DIAGNOSTIC
329 		if (pp->pr_nidle == 0)
330 			panic("pr_rmpage: nidle inconsistent");
331 		if (pp->pr_nitems < pp->pr_itemsperpage)
332 			panic("pr_rmpage: nitems inconsistent");
333 #endif
334 		pp->pr_nidle--;
335 	}
336 
337 	pp->pr_nitems -= pp->pr_itemsperpage;
338 
339 	/*
340 	 * Unlink a page from the pool and release it (or queue it for release).
341 	 */
342 	TAILQ_REMOVE(&pp->pr_pagelist, ph, ph_pagelist);
343 	if (pq) {
344 		TAILQ_INSERT_HEAD(pq, ph, ph_pagelist);
345 	} else {
346 		pool_allocator_free(pp, ph->ph_page);
347 		if ((pp->pr_roflags & PR_PHINPAGE) == 0) {
348 			LIST_REMOVE(ph, ph_hashlist);
349 			s = splhigh();
350 			pool_put(&phpool, ph);
351 			splx(s);
352 		}
353 	}
354 	pp->pr_npages--;
355 	pp->pr_npagefree++;
356 
357 	if (pp->pr_curpage == ph) {
358 		/*
359 		 * Find a new non-empty page header, if any.
360 		 * Start search from the page head, to increase the
361 		 * chance for "high water" pages to be freed.
362 		 */
363 		TAILQ_FOREACH(ph, &pp->pr_pagelist, ph_pagelist)
364 			if (TAILQ_FIRST(&ph->ph_itemlist) != NULL)
365 				break;
366 
367 		pp->pr_curpage = ph;
368 	}
369 }
370 
371 /*
372  * Initialize the given pool resource structure.
373  *
374  * We export this routine to allow other kernel parts to declare
375  * static pools that must be initialized before malloc() is available.
376  */
377 void
378 pool_init(struct pool *pp, size_t size, u_int align, u_int ioff, int flags,
379     const char *wchan, struct pool_allocator *palloc)
380 {
381 	int off, slack, i;
382 
383 #ifdef POOL_DIAGNOSTIC
384 	/*
385 	 * Always log if POOL_DIAGNOSTIC is defined.
386 	 */
387 	if (pool_logsize != 0)
388 		flags |= PR_LOGGING;
389 #endif
390 
391 #ifdef MALLOC_DEBUG
392 	if ((flags & PR_DEBUG) && (ioff != 0 || align != 0))
393 		flags &= ~PR_DEBUG;
394 #endif
395 	/*
396 	 * Check arguments and construct default values.
397 	 */
398 	if (palloc == NULL)
399 		palloc = &pool_allocator_kmem;
400 	if ((palloc->pa_flags & PA_INITIALIZED) == 0) {
401 		if (palloc->pa_pagesz == 0)
402 			palloc->pa_pagesz = PAGE_SIZE;
403 
404 		TAILQ_INIT(&palloc->pa_list);
405 
406 		simple_lock_init(&palloc->pa_slock);
407 		palloc->pa_pagemask = ~(palloc->pa_pagesz - 1);
408 		palloc->pa_pageshift = ffs(palloc->pa_pagesz) - 1;
409 		palloc->pa_flags |= PA_INITIALIZED;
410 	}
411 
412 	if (align == 0)
413 		align = ALIGN(1);
414 
415 	if (size < sizeof(struct pool_item))
416 		size = sizeof(struct pool_item);
417 
418 	size = roundup(size, align);
419 #ifdef DIAGNOSTIC
420 	if (size > palloc->pa_pagesz)
421 		panic("pool_init: pool item size (%lu) too large",
422 		      (u_long)size);
423 #endif
424 
425 	/*
426 	 * Initialize the pool structure.
427 	 */
428 	TAILQ_INIT(&pp->pr_pagelist);
429 	TAILQ_INIT(&pp->pr_cachelist);
430 	pp->pr_curpage = NULL;
431 	pp->pr_npages = 0;
432 	pp->pr_minitems = 0;
433 	pp->pr_minpages = 0;
434 	pp->pr_maxpages = UINT_MAX;
435 	pp->pr_roflags = flags;
436 	pp->pr_flags = 0;
437 	pp->pr_size = size;
438 	pp->pr_align = align;
439 	pp->pr_wchan = wchan;
440 	pp->pr_alloc = palloc;
441 	pp->pr_nitems = 0;
442 	pp->pr_nout = 0;
443 	pp->pr_hardlimit = UINT_MAX;
444 	pp->pr_hardlimit_warning = NULL;
445 	pp->pr_hardlimit_ratecap.tv_sec = 0;
446 	pp->pr_hardlimit_ratecap.tv_usec = 0;
447 	pp->pr_hardlimit_warning_last.tv_sec = 0;
448 	pp->pr_hardlimit_warning_last.tv_usec = 0;
449 	pp->pr_drain_hook = NULL;
450 	pp->pr_drain_hook_arg = NULL;
451 	pp->pr_serial = ++pool_serial;
452 	if (pool_serial == 0)
453 		panic("pool_init: too much uptime");
454 
455 	/*
456 	 * Decide whether to put the page header off page to avoid
457 	 * wasting too large a part of the page. Off-page page headers
458 	 * go on a hash table, so we can match a returned item
459 	 * with its header based on the page address.
460 	 * We use 1/16 of the page size as the threshold (XXX: tune)
461 	 */
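	/*
	 * For example, with a common 4096-byte page size the header stays
	 * in-page for items smaller than 256 bytes and goes off-page
	 * otherwise.
	 */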
462 	if (pp->pr_size < palloc->pa_pagesz/16) {
463 		/* Use the end of the page for the page header */
464 		pp->pr_roflags |= PR_PHINPAGE;
465 		pp->pr_phoffset = off = palloc->pa_pagesz -
466 		    ALIGN(sizeof(struct pool_item_header));
467 	} else {
468 		/* The page header will be taken from our page header pool */
469 		pp->pr_phoffset = 0;
470 		off = palloc->pa_pagesz;
471 		for (i = 0; i < PR_HASHTABSIZE; i++) {
472 			LIST_INIT(&pp->pr_hashtab[i]);
473 		}
474 	}
475 
476 	/*
477 	 * Alignment is to take place at `ioff' within the item. This means
478 	 * we must reserve up to `align - 1' bytes on the page to allow
479 	 * appropriate positioning of each item.
480 	 *
481 	 * Silently enforce `0 <= ioff < align'.
482 	 */
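	/*
	 * For example, align = 8 and ioff = 4 places each item so that
	 * (item address + 4) is a multiple of 8, reserving
	 * (align - ioff) % align = 4 bytes at the start of each page.
	 */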
483 	pp->pr_itemoffset = ioff = ioff % align;
484 	pp->pr_itemsperpage = (off - ((align - ioff) % align)) / pp->pr_size;
485 	KASSERT(pp->pr_itemsperpage != 0);
486 
487 	/*
488 	 * Use the slack between the chunks and the page header
489 	 * for "cache coloring".
490 	 */
491 	slack = off - pp->pr_itemsperpage * pp->pr_size;
492 	pp->pr_maxcolor = (slack / align) * align;
493 	pp->pr_curcolor = 0;
494 
495 	pp->pr_nget = 0;
496 	pp->pr_nfail = 0;
497 	pp->pr_nput = 0;
498 	pp->pr_npagealloc = 0;
499 	pp->pr_npagefree = 0;
500 	pp->pr_hiwat = 0;
501 	pp->pr_nidle = 0;
502 
503 #ifdef POOL_DIAGNOSTIC
504 	if (flags & PR_LOGGING) {
505 		if (kmem_map == NULL ||
506 		    (pp->pr_log = malloc(pool_logsize * sizeof(struct pool_log),
507 		     M_TEMP, M_NOWAIT)) == NULL)
508 			pp->pr_roflags &= ~PR_LOGGING;
509 		pp->pr_curlogentry = 0;
510 		pp->pr_logsize = pool_logsize;
511 	}
512 #endif
513 
514 	pp->pr_entered_file = NULL;
515 	pp->pr_entered_line = 0;
516 
517 	simple_lock_init(&pp->pr_slock);
518 
519 	/*
520 	 * Initialize private page header pool and cache magazine pool if we
521 	 * haven't done so yet.
522 	 * XXX LOCKING.
523 	 */
524 	if (phpool.pr_size == 0) {
525 		pool_init(&phpool, sizeof(struct pool_item_header), 0, 0,
526 		    0, "phpool", NULL);
527 		pool_init(&pcgpool, sizeof(struct pool_cache_group), 0, 0,
528 		    0, "pcgpool", NULL);
529 	}
530 
531 	/* Insert this into the list of all pools. */
532 	simple_lock(&pool_head_slock);
533 	TAILQ_INSERT_TAIL(&pool_head, pp, pr_poollist);
534 	simple_unlock(&pool_head_slock);
535 
536 	/* Insert into the list of pools using this allocator. */
537 	simple_lock(&palloc->pa_slock);
538 	TAILQ_INSERT_TAIL(&palloc->pa_list, pp, pr_alloc_list);
539 	simple_unlock(&palloc->pa_slock);
540 }
541 
542 /*
543  * De-commission a pool resource.
544  */
545 void
546 pool_destroy(struct pool *pp)
547 {
548 	struct pool_item_header *ph;
549 	struct pool_cache *pc;
550 
551 	/* Locking order: pool_allocator -> pool */
552 	simple_lock(&pp->pr_alloc->pa_slock);
553 	TAILQ_REMOVE(&pp->pr_alloc->pa_list, pp, pr_alloc_list);
554 	simple_unlock(&pp->pr_alloc->pa_slock);
555 
556 	/* Destroy all caches for this pool. */
557 	while ((pc = TAILQ_FIRST(&pp->pr_cachelist)) != NULL)
558 		pool_cache_destroy(pc);
559 
560 #ifdef DIAGNOSTIC
561 	if (pp->pr_nout != 0) {
562 		pr_printlog(pp, NULL, printf);
563 		panic("pool_destroy: pool busy: still out: %u",
564 		    pp->pr_nout);
565 	}
566 #endif
567 
568 	/* Remove all pages */
569 	while ((ph = TAILQ_FIRST(&pp->pr_pagelist)) != NULL)
570 		pr_rmpage(pp, ph, NULL);
571 
572 	/* Remove from global pool list */
573 	simple_lock(&pool_head_slock);
574 	TAILQ_REMOVE(&pool_head, pp, pr_poollist);
575 	if (drainpp == pp) {
576 		drainpp = NULL;
577 	}
578 	simple_unlock(&pool_head_slock);
579 
580 #ifdef POOL_DIAGNOSTIC
581 	if ((pp->pr_roflags & PR_LOGGING) != 0)
582 		free(pp->pr_log, M_TEMP);
583 #endif
584 }
585 
586 void
587 pool_set_drain_hook(struct pool *pp, void (*fn)(void *, int), void *arg)
588 {
589 	/* XXX no locking -- must be used just after pool_init() */
590 #ifdef DIAGNOSTIC
591 	if (pp->pr_drain_hook != NULL)
592 		panic("pool_set_drain_hook(%s): already set", pp->pr_wchan);
593 #endif
594 	pp->pr_drain_hook = fn;
595 	pp->pr_drain_hook_arg = arg;
596 }
597 
598 static __inline struct pool_item_header *
599 pool_alloc_item_header(struct pool *pp, caddr_t storage, int flags)
600 {
601 	struct pool_item_header *ph;
602 	int s;
603 
604 	LOCK_ASSERT(simple_lock_held(&pp->pr_slock) == 0);
605 
606 	if ((pp->pr_roflags & PR_PHINPAGE) != 0)
607 		ph = (struct pool_item_header *) (storage + pp->pr_phoffset);
608 	else {
609 		s = splhigh();
610 		ph = pool_get(&phpool, flags);
611 		splx(s);
612 	}
613 
614 	return (ph);
615 }
616 
617 /*
618  * Grab an item from the pool; must be called at appropriate spl level
619  */
620 void *
621 #ifdef POOL_DIAGNOSTIC
622 _pool_get(struct pool *pp, int flags, const char *file, long line)
623 #else
624 pool_get(struct pool *pp, int flags)
625 #endif
626 {
627 	struct pool_item *pi;
628 	struct pool_item_header *ph;
629 	void *v;
630 
631 #ifdef DIAGNOSTIC
632 	if ((flags & PR_WAITOK) != 0)
633 		splassert(IPL_NONE);
634 	if (__predict_false(curproc == NULL && /* doing_shutdown == 0 && XXX*/
635 			    (flags & PR_WAITOK) != 0))
636 		panic("pool_get: %s: must have NOWAIT", pp->pr_wchan);
637 
638 #ifdef LOCKDEBUG
639 	if (flags & PR_WAITOK)
640 		simple_lock_only_held(NULL, "pool_get(PR_WAITOK)");
641 #endif
642 #endif /* DIAGNOSTIC */
643 
644 #ifdef MALLOC_DEBUG
645 	if (pp->pr_roflags & PR_DEBUG) {
646 		void *addr;
647 
648 		addr = NULL;
649 		debug_malloc(pp->pr_size, M_DEBUG,
650 		    (flags & PR_WAITOK) ? M_WAITOK : M_NOWAIT, &addr);
651 		return (addr);
652 	}
653 #endif
654 
655 	simple_lock(&pp->pr_slock);
656 	pr_enter(pp, file, line);
657 
658  startover:
659 	/*
660 	 * Check to see if we've reached the hard limit.  If we have,
661 	 * and we can wait, then wait until an item has been returned to
662 	 * the pool.
663 	 */
664 #ifdef DIAGNOSTIC
665 	if (__predict_false(pp->pr_nout > pp->pr_hardlimit)) {
666 		pr_leave(pp);
667 		simple_unlock(&pp->pr_slock);
668 		panic("pool_get: %s: crossed hard limit", pp->pr_wchan);
669 	}
670 #endif
671 	if (__predict_false(pp->pr_nout == pp->pr_hardlimit)) {
672 		if (pp->pr_drain_hook != NULL) {
673 			/*
674 			 * Since the drain hook is going to free things
675 			 * back to the pool, unlock, call hook, re-lock
676 			 * and check hardlimit condition again.
677 			 */
678 			pr_leave(pp);
679 			simple_unlock(&pp->pr_slock);
680 			(*pp->pr_drain_hook)(pp->pr_drain_hook_arg, flags);
681 			simple_lock(&pp->pr_slock);
682 			pr_enter(pp, file, line);
683 			if (pp->pr_nout < pp->pr_hardlimit)
684 				goto startover;
685 		}
686 
687 		if ((flags & PR_WAITOK) && !(flags & PR_LIMITFAIL)) {
688 			/*
689 			 * XXX: A warning isn't logged in this case.  Should
690 			 * it be?
691 			 */
692 			pp->pr_flags |= PR_WANTED;
693 			pr_leave(pp);
694 			ltsleep(pp, PSWP, pp->pr_wchan, 0, &pp->pr_slock);
695 			pr_enter(pp, file, line);
696 			goto startover;
697 		}
698 
699 		/*
700 		 * Log a message that the hard limit has been hit.
701 		 */
702 		if (pp->pr_hardlimit_warning != NULL &&
703 		    ratecheck(&pp->pr_hardlimit_warning_last,
704 			      &pp->pr_hardlimit_ratecap))
705 			log(LOG_ERR, "%s\n", pp->pr_hardlimit_warning);
706 
707 		pp->pr_nfail++;
708 
709 		pr_leave(pp);
710 		simple_unlock(&pp->pr_slock);
711 		return (NULL);
712 	}
713 
714 	/*
715 	 * The convention we use is that if `curpage' is not NULL, then
716 	 * it points at a non-empty bucket. In particular, `curpage'
717 	 * never points at a page header which has PR_PHINPAGE set and
718 	 * has no items in its bucket.
719 	 */
720 	if ((ph = pp->pr_curpage) == NULL) {
721 #ifdef DIAGNOSTIC
722 		if (pp->pr_nitems != 0) {
723 			simple_unlock(&pp->pr_slock);
724 			printf("pool_get: %s: curpage NULL, nitems %u\n",
725 			    pp->pr_wchan, pp->pr_nitems);
726 			panic("pool_get: nitems inconsistent");
727 		}
728 #endif
729 
730 		/*
731 		 * Call the back-end page allocator for more memory.
732 		 * Release the pool lock, as the back-end page allocator
733 		 * may block.
734 		 */
735 		pr_leave(pp);
736 		simple_unlock(&pp->pr_slock);
737 		v = pool_allocator_alloc(pp, flags);
738 		if (__predict_true(v != NULL))
739 			ph = pool_alloc_item_header(pp, v, flags);
740 		simple_lock(&pp->pr_slock);
741 		pr_enter(pp, file, line);
742 
743 		if (__predict_false(v == NULL || ph == NULL)) {
744 			if (v != NULL)
745 				pool_allocator_free(pp, v);
746 
747 			/*
748 			 * We were unable to allocate a page or item
749 			 * header, but we released the lock during
750 			 * allocation, so perhaps items were freed
751 			 * back to the pool.  Check for this case.
752 			 */
753 			if (pp->pr_curpage != NULL)
754 				goto startover;
755 
756 			if ((flags & PR_WAITOK) == 0) {
757 				pp->pr_nfail++;
758 				pr_leave(pp);
759 				simple_unlock(&pp->pr_slock);
760 				return (NULL);
761 			}
762 
763 			/*
764 			 * Wait for items to be returned to this pool.
765 			 *
766 			 * XXX: maybe we should wake up once a second and
767 			 * try again?
768 			 */
769 			pp->pr_flags |= PR_WANTED;
770 			/* PA_WANTED is already set on the allocator. */
771 			pr_leave(pp);
772 			ltsleep(pp, PSWP, pp->pr_wchan, 0, &pp->pr_slock);
773 			pr_enter(pp, file, line);
774 			goto startover;
775 		}
776 
777 		/* We have more memory; add it to the pool */
778 		pool_prime_page(pp, v, ph);
779 		pp->pr_npagealloc++;
780 
781 		/* Start the allocation process over. */
782 		goto startover;
783 	}
784 
785 	if (__predict_false((v = pi = TAILQ_FIRST(&ph->ph_itemlist)) == NULL)) {
786 		pr_leave(pp);
787 		simple_unlock(&pp->pr_slock);
788 		panic("pool_get: %s: page empty", pp->pr_wchan);
789 	}
790 #ifdef DIAGNOSTIC
791 	if (__predict_false(pp->pr_nitems == 0)) {
792 		pr_leave(pp);
793 		simple_unlock(&pp->pr_slock);
794 		printf("pool_get: %s: items on itemlist, nitems %u\n",
795 		    pp->pr_wchan, pp->pr_nitems);
796 		panic("pool_get: nitems inconsistent");
797 	}
798 #endif
799 
800 #ifdef POOL_DIAGNOSTIC
801 	pr_log(pp, v, PRLOG_GET, file, line);
802 #endif
803 
804 #ifdef DIAGNOSTIC
805 	if (__predict_false(pi->pi_magic != PI_MAGIC)) {
806 		pr_printlog(pp, pi, printf);
807 		panic("pool_get(%s): free list modified: magic=%x; page %p;"
808 		       " item addr %p",
809 			pp->pr_wchan, pi->pi_magic, ph->ph_page, pi);
810 	}
811 #endif
812 
813 	/*
814 	 * Remove from item list.
815 	 */
816 	TAILQ_REMOVE(&ph->ph_itemlist, pi, pi_list);
817 	pp->pr_nitems--;
818 	pp->pr_nout++;
819 	if (ph->ph_nmissing == 0) {
820 #ifdef DIAGNOSTIC
821 		if (__predict_false(pp->pr_nidle == 0))
822 			panic("pool_get: nidle inconsistent");
823 #endif
824 		pp->pr_nidle--;
825 	}
826 	ph->ph_nmissing++;
827 	if (TAILQ_FIRST(&ph->ph_itemlist) == NULL) {
828 #ifdef DIAGNOSTIC
829 		if (__predict_false(ph->ph_nmissing != pp->pr_itemsperpage)) {
830 			pr_leave(pp);
831 			simple_unlock(&pp->pr_slock);
832 			panic("pool_get: %s: nmissing inconsistent",
833 			    pp->pr_wchan);
834 		}
835 #endif
836 		/*
837 		 * Find a new non-empty page header, if any.
838 		 * Start search from the page head, to increase
839 		 * the chance for "high water" pages to be freed.
840 		 *
841 		 * Migrate empty pages to the end of the list.  This
842 		 * will speed the update of curpage as pages become
843 		 * idle.  Empty pages intermingled with idle pages
844 		 * is no big deal.  As soon as a page becomes un-empty,
845 		 * it will move back to the head of the list.
846 		 */
847 		TAILQ_REMOVE(&pp->pr_pagelist, ph, ph_pagelist);
848 		TAILQ_INSERT_TAIL(&pp->pr_pagelist, ph, ph_pagelist);
849 		TAILQ_FOREACH(ph, &pp->pr_pagelist, ph_pagelist)
850 			if (TAILQ_FIRST(&ph->ph_itemlist) != NULL)
851 				break;
852 
853 		pp->pr_curpage = ph;
854 	}
855 
856 	pp->pr_nget++;
857 
858 	/*
859 	 * If we have a low water mark and we are now below that low
860 	 * water mark, add more items to the pool.
861 	 */
862 	if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
863 		/*
864 		 * XXX: Should we log a warning?  Should we set up a timeout
865 		 * to try again in a second or so?  The latter could break
866 		 * a caller's assumptions about interrupt protection, etc.
867 		 */
868 	}
869 
870 	pr_leave(pp);
871 	simple_unlock(&pp->pr_slock);
872 	return (v);
873 }
874 
875 /*
876  * Internal version of pool_put().  Pool is already locked/entered.
877  */
878 void
879 pool_do_put(struct pool *pp, void *v)
880 {
881 	struct pool_item *pi = v;
882 	struct pool_item_header *ph;
883 	caddr_t page;
884 	int s;
885 
886 #ifdef MALLOC_DEBUG
887 	if (pp->pr_roflags & PR_DEBUG) {
888 		debug_free(v, M_DEBUG);
889 		return;
890 	}
891 #endif
892 
893 	LOCK_ASSERT(simple_lock_held(&pp->pr_slock));
894 
895 	page = (caddr_t)((vaddr_t)v & pp->pr_alloc->pa_pagemask);
896 
897 #ifdef DIAGNOSTIC
898 	if (__predict_false(pp->pr_nout == 0)) {
899 		printf("pool %s: putting with none out\n",
900 		    pp->pr_wchan);
901 		panic("pool_put");
902 	}
903 #endif
904 
905 	if (__predict_false((ph = pr_find_pagehead(pp, page)) == NULL)) {
906 		pr_printlog(pp, NULL, printf);
907 		panic("pool_put: %s: page header missing", pp->pr_wchan);
908 	}
909 
910 #ifdef LOCKDEBUG
911 	/*
912 	 * Check if we're freeing a locked simple lock.
913 	 */
914 	simple_lock_freecheck((caddr_t)pi, ((caddr_t)pi) + pp->pr_size);
915 #endif
916 
917 	/*
918 	 * Return to item list.
919 	 */
920 #ifdef DIAGNOSTIC
921 	pi->pi_magic = PI_MAGIC;
922 #endif
923 #ifdef DEBUG
924 	{
925 		int i, *ip = v;
926 
927 		for (i = 0; i < pp->pr_size / sizeof(int); i++) {
928 			*ip++ = PI_MAGIC;
929 		}
930 	}
931 #endif
932 
933 	TAILQ_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list);
934 	ph->ph_nmissing--;
935 	pp->pr_nput++;
936 	pp->pr_nitems++;
937 	pp->pr_nout--;
938 
939 	/* Cancel "pool empty" condition if it exists */
940 	if (pp->pr_curpage == NULL)
941 		pp->pr_curpage = ph;
942 
943 	if (pp->pr_flags & PR_WANTED) {
944 		pp->pr_flags &= ~PR_WANTED;
945 		if (ph->ph_nmissing == 0)
946 			pp->pr_nidle++;
947 		wakeup((caddr_t)pp);
948 		return;
949 	}
950 
951 	/*
952 	 * If this page is now complete, do one of two things:
953 	 *
954 	 *	(1) If we have more pages than the page high water
955 	 *	    mark, free the page back to the system.
956 	 *
957 	 *	(2) Move it to the end of the page list, so that
958 	 *	    we minimize our chances of fragmenting the
959 	 *	    pool.  Idle pages migrate to the end (along with
960 	 *	    completely empty pages, so that we find un-empty
961 	 *	    pages more quickly when we update curpage) of the
962 	 *	    list so they can be more easily swept up by
963 	 *	    the pagedaemon when pages are scarce.
964 	 */
965 	if (ph->ph_nmissing == 0) {
966 		pp->pr_nidle++;
967 		if (pp->pr_npages > pp->pr_maxpages ||
968 		    (pp->pr_alloc->pa_flags & PA_WANT) != 0) {
969 			pr_rmpage(pp, ph, NULL);
970 		} else {
971 			TAILQ_REMOVE(&pp->pr_pagelist, ph, ph_pagelist);
972 			TAILQ_INSERT_TAIL(&pp->pr_pagelist, ph, ph_pagelist);
973 
974 			/*
975 			 * Update the timestamp on the page.  A page must
976 			 * be idle for some period of time before it can
977 			 * be reclaimed by the pagedaemon.  This minimizes
978 			 * ping-pong'ing for memory.
979 			 */
980 			s = splclock();
981 			ph->ph_time = mono_time;
982 			splx(s);
983 
984 			/*
985 			 * Update the current page pointer.  Just look for
986 			 * the first page with any free items.
987 			 *
988 			 * XXX: Maybe we want an option to look for the
989 			 * page with the fewest available items, to minimize
990 			 * fragmentation?
991 			 */
992 			TAILQ_FOREACH(ph, &pp->pr_pagelist, ph_pagelist)
993 				if (TAILQ_FIRST(&ph->ph_itemlist) != NULL)
994 					break;
995 
996 			pp->pr_curpage = ph;
997 		}
998 	}
999 	/*
1000 	 * If the page has just become un-empty, move it to the head of
1001 	 * the list, and make it the current page.  The next allocation
1002 	 * will get the item from this page, instead of further fragmenting
1003 	 * the pool.
1004 	 */
1005 	else if (ph->ph_nmissing == (pp->pr_itemsperpage - 1)) {
1006 		TAILQ_REMOVE(&pp->pr_pagelist, ph, ph_pagelist);
1007 		TAILQ_INSERT_HEAD(&pp->pr_pagelist, ph, ph_pagelist);
1008 		pp->pr_curpage = ph;
1009 	}
1010 }
1011 
1012 /*
1013  * Return resource to the pool; must be called at appropriate spl level
1014  */
1015 #ifdef POOL_DIAGNOSTIC
1016 void
1017 _pool_put(struct pool *pp, void *v, const char *file, long line)
1018 {
1019 
1020 	simple_lock(&pp->pr_slock);
1021 	pr_enter(pp, file, line);
1022 
1023 	pr_log(pp, v, PRLOG_PUT, file, line);
1024 
1025 	pool_do_put(pp, v);
1026 
1027 	pr_leave(pp);
1028 	simple_unlock(&pp->pr_slock);
1029 }
1030 #undef pool_put
1031 #endif /* POOL_DIAGNOSTIC */
1032 
1033 void
1034 pool_put(struct pool *pp, void *v)
1035 {
1036 
1037 	simple_lock(&pp->pr_slock);
1038 
1039 	pool_do_put(pp, v);
1040 
1041 	simple_unlock(&pp->pr_slock);
1042 }
1043 
1044 #ifdef POOL_DIAGNOSTIC
1045 #define		pool_put(h, v)	_pool_put((h), (v), __FILE__, __LINE__)
1046 #endif
1047 
1048 /*
1049  * Add N items to the pool.
1050  */
1051 int
1052 pool_prime(struct pool *pp, int n)
1053 {
1054 	struct pool_item_header *ph;
1055 	caddr_t cp;
1056 	int newpages;
1057 
1058 	simple_lock(&pp->pr_slock);
1059 
1060 	newpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
1061 
1062 	while (newpages-- > 0) {
1063 		simple_unlock(&pp->pr_slock);
1064 		cp = pool_allocator_alloc(pp, PR_NOWAIT);
1065 		if (__predict_true(cp != NULL))
1066 			ph = pool_alloc_item_header(pp, cp, PR_NOWAIT);
1067 		simple_lock(&pp->pr_slock);
1068 
1069 		if (__predict_false(cp == NULL || ph == NULL)) {
1070 			if (cp != NULL)
1071 				pool_allocator_free(pp, cp);
1072 			break;
1073 		}
1074 
1075 		pool_prime_page(pp, cp, ph);
1076 		pp->pr_npagealloc++;
1077 		pp->pr_minpages++;
1078 	}
1079 
1080 	if (pp->pr_minpages >= pp->pr_maxpages)
1081 		pp->pr_maxpages = pp->pr_minpages + 1;	/* XXX */
1082 
1083 	simple_unlock(&pp->pr_slock);
1084 	return (0);
1085 }
1086 
1087 /*
1088  * Add a page worth of items to the pool.
1089  *
1090  * Note, we must be called with the pool descriptor LOCKED.
1091  */
1092 void
1093 pool_prime_page(struct pool *pp, caddr_t storage, struct pool_item_header *ph)
1094 {
1095 	struct pool_item *pi;
1096 	caddr_t cp = storage;
1097 	unsigned int align = pp->pr_align;
1098 	unsigned int ioff = pp->pr_itemoffset;
1099 	int n;
1100 
1101 #ifdef DIAGNOSTIC
1102 	if (((u_long)cp & (pp->pr_alloc->pa_pagesz - 1)) != 0)
1103 		panic("pool_prime_page: %s: unaligned page", pp->pr_wchan);
1104 #endif
1105 
1106 	if ((pp->pr_roflags & PR_PHINPAGE) == 0)
1107 		LIST_INSERT_HEAD(&pp->pr_hashtab[PR_HASH_INDEX(pp, cp)],
1108 		    ph, ph_hashlist);
1109 
1110 	/*
1111 	 * Insert page header.
1112 	 */
1113 	TAILQ_INSERT_HEAD(&pp->pr_pagelist, ph, ph_pagelist);
1114 	TAILQ_INIT(&ph->ph_itemlist);
1115 	ph->ph_page = storage;
1116 	ph->ph_nmissing = 0;
1117 	memset(&ph->ph_time, 0, sizeof(ph->ph_time));
1118 
1119 	pp->pr_nidle++;
1120 
1121 	/*
1122 	 * Color this page.
1123 	 */
1124 	cp = (caddr_t)(cp + pp->pr_curcolor);
1125 	if ((pp->pr_curcolor += align) > pp->pr_maxcolor)
1126 		pp->pr_curcolor = 0;
1127 
1128 	/*
1129 	 * Adjust storage to apply alignment to `pr_itemoffset' in each item.
1130 	 */
1131 	if (ioff != 0)
1132 		cp = (caddr_t)(cp + (align - ioff));
1133 
1134 	/*
1135 	 * Insert remaining chunks on the bucket list.
1136 	 */
1137 	n = pp->pr_itemsperpage;
1138 	pp->pr_nitems += n;
1139 
1140 	while (n--) {
1141 		pi = (struct pool_item *)cp;
1142 
1143 		KASSERT(((((vaddr_t)pi) + ioff) & (align - 1)) == 0);
1144 
1145 		/* Insert on page list */
1146 		TAILQ_INSERT_TAIL(&ph->ph_itemlist, pi, pi_list);
1147 #ifdef DIAGNOSTIC
1148 		pi->pi_magic = PI_MAGIC;
1149 #endif
1150 		cp = (caddr_t)(cp + pp->pr_size);
1151 	}
1152 
1153 	/*
1154 	 * If the pool was depleted, point at the new page.
1155 	 */
1156 	if (pp->pr_curpage == NULL)
1157 		pp->pr_curpage = ph;
1158 
1159 	if (++pp->pr_npages > pp->pr_hiwat)
1160 		pp->pr_hiwat = pp->pr_npages;
1161 }
1162 
1163 /*
1164  * Used by pool_get() when nitems drops below the low water mark.  This
1165  * is used to catch up nitems with the low water mark.
1166  *
1167  * Note 1, we never wait for memory here, we let the caller decide what to do.
1168  *
1169  * Note 2, we must be called with the pool already locked, and we return
1170  * with it locked.
1171  */
1172 int
1173 pool_catchup(struct pool *pp)
1174 {
1175 	struct pool_item_header *ph;
1176 	caddr_t cp;
1177 	int error = 0;
1178 
1179 	while (POOL_NEEDS_CATCHUP(pp)) {
1180 		/*
1181 		 * Call the page back-end allocator for more memory.
1182 		 *
1183 		 * XXX: We never wait, so should we bother unlocking
1184 		 * the pool descriptor?
1185 		 */
1186 		simple_unlock(&pp->pr_slock);
1187 		cp = pool_allocator_alloc(pp, PR_NOWAIT);
1188 		if (__predict_true(cp != NULL))
1189 			ph = pool_alloc_item_header(pp, cp, PR_NOWAIT);
1190 		simple_lock(&pp->pr_slock);
1191 		if (__predict_false(cp == NULL || ph == NULL)) {
1192 			if (cp != NULL)
1193 				pool_allocator_free(pp, cp);
1194 			error = ENOMEM;
1195 			break;
1196 		}
1197 		pool_prime_page(pp, cp, ph);
1198 		pp->pr_npagealloc++;
1199 	}
1200 
1201 	return (error);
1202 }
1203 
1204 void
1205 pool_setlowat(struct pool *pp, int n)
1206 {
1207 
1208 	simple_lock(&pp->pr_slock);
1209 
1210 	pp->pr_minitems = n;
1211 	pp->pr_minpages = (n == 0)
1212 		? 0
1213 		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
1214 
1215 	/* Make sure we're caught up with the newly-set low water mark. */
1216 	if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
1217 		/*
1218 		 * XXX: Should we log a warning?  Should we set up a timeout
1219 		 * to try again in a second or so?  The latter could break
1220 		 * a caller's assumptions about interrupt protection, etc.
1221 		 */
1222 	}
1223 
1224 	simple_unlock(&pp->pr_slock);
1225 }
1226 
1227 void
1228 pool_sethiwat(struct pool *pp, int n)
1229 {
1230 
1231 	simple_lock(&pp->pr_slock);
1232 
1233 	pp->pr_maxpages = (n == 0)
1234 		? 0
1235 		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
1236 
1237 	simple_unlock(&pp->pr_slock);
1238 }
1239 
1240 int
1241 pool_sethardlimit(struct pool *pp, unsigned n, const char *warnmess, int ratecap)
1242 {
1243 	int error = 0;
1244 
1245 	simple_lock(&pp->pr_slock);
1246 
1247 	if (n < pp->pr_nout) {
1248 		error = EINVAL;
1249 		goto done;
1250 	}
1251 
1252 	pp->pr_hardlimit = n;
1253 	pp->pr_hardlimit_warning = warnmess;
1254 	pp->pr_hardlimit_ratecap.tv_sec = ratecap;
1255 	pp->pr_hardlimit_warning_last.tv_sec = 0;
1256 	pp->pr_hardlimit_warning_last.tv_usec = 0;
1257 
1258 	/*
1259 	 * In-line version of pool_sethiwat(), because we don't want to
1260 	 * release the lock.
1261 	 */
1262 	pp->pr_maxpages = (n == 0 || n == UINT_MAX)
1263 		? n
1264 		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
1265 
1266  done:
1267 	simple_unlock(&pp->pr_slock);
1268 
1269 	return (error);
1270 }
1271 
1272 /*
1273  * Release all complete pages that have not been used recently.
1274  *
1275  * Returns non-zero if any pages have been reclaimed.
1276  */
1277 int
1278 #ifdef POOL_DIAGNOSTIC
1279 _pool_reclaim(struct pool *pp, const char *file, long line)
1280 #else
1281 pool_reclaim(struct pool *pp)
1282 #endif
1283 {
1284 	struct pool_item_header *ph, *phnext;
1285 	struct pool_cache *pc;
1286 	struct timeval curtime;
1287 	struct pool_pagelist pq;
1288 	int s;
1289 
1290 	if (pp->pr_drain_hook != NULL) {
1291 		/*
1292 		 * The drain hook must be called with the pool unlocked.
1293 		 */
1294 		(*pp->pr_drain_hook)(pp->pr_drain_hook_arg, PR_NOWAIT);
1295 	}
1296 
1297 	if (simple_lock_try(&pp->pr_slock) == 0)
1298 		return (0);
1299 	pr_enter(pp, file, line);
1300 
1301 	TAILQ_INIT(&pq);
1302 
1303 	/*
1304 	 * Reclaim items from the pool's caches.
1305 	 */
1306 	TAILQ_FOREACH(pc, &pp->pr_cachelist, pc_poollist)
1307 		pool_cache_reclaim(pc);
1308 
1309 	s = splclock();
1310 	curtime = mono_time;
1311 	splx(s);
1312 
1313 	for (ph = TAILQ_FIRST(&pp->pr_pagelist); ph != NULL; ph = phnext) {
1314 		phnext = TAILQ_NEXT(ph, ph_pagelist);
1315 
1316 		/* Check our minimum page claim */
1317 		if (pp->pr_npages <= pp->pr_minpages)
1318 			break;
1319 
1320 		if (ph->ph_nmissing == 0) {
1321 			struct timeval diff;
1322 			timersub(&curtime, &ph->ph_time, &diff);
1323 			if (diff.tv_sec < pool_inactive_time)
1324 				continue;
1325 
1326 			/*
1327 			 * If freeing this page would put us below
1328 			 * the low water mark, stop now.
1329 			 */
1330 			if ((pp->pr_nitems - pp->pr_itemsperpage) <
1331 			    pp->pr_minitems)
1332 				break;
1333 
1334 			pr_rmpage(pp, ph, &pq);
1335 		}
1336 	}
1337 
1338 	pr_leave(pp);
1339 	simple_unlock(&pp->pr_slock);
1340 	if (TAILQ_EMPTY(&pq))
1341 		return (0);
1342 	while ((ph = TAILQ_FIRST(&pq)) != NULL) {
1343 		TAILQ_REMOVE(&pq, ph, ph_pagelist);
1344 		pool_allocator_free(pp, ph->ph_page);
1345 		if (pp->pr_roflags & PR_PHINPAGE) {
1346 			continue;
1347 		}
1348 		LIST_REMOVE(ph, ph_hashlist);
1349 		s = splhigh();
1350 		pool_put(&phpool, ph);
1351 		splx(s);
1352 	}
1353 
1354 	return (1);
1355 }
1356 
1357 
1358 /*
1359  * Drain pools, one at a time.
1360  *
1361  * Note, we must never be called from an interrupt context.
1362  */
1363 void
1364 pool_drain(void *arg)
1365 {
1366 	struct pool *pp;
1367 	int s;
1368 
1369 	pp = NULL;
1370 	s = splvm();
1371 	simple_lock(&pool_head_slock);
1372 	if (drainpp == NULL) {
1373 		drainpp = TAILQ_FIRST(&pool_head);
1374 	}
1375 	if (drainpp) {
1376 		pp = drainpp;
1377 		drainpp = TAILQ_NEXT(pp, pr_poollist);
1378 	}
1379 	simple_unlock(&pool_head_slock);
1380 	pool_reclaim(pp);
1381 	splx(s);
1382 }
1383 
1384 /*
1385  * Diagnostic helpers.
1386  */
1387 void
1388 pool_printit(struct pool *pp, const char *modif, int (*pr)(const char *, ...))
1389 {
1390 	int s;
1391 
1392 	s = splvm();
1393 	if (simple_lock_try(&pp->pr_slock) == 0) {
1394 		pr("pool %s is locked; try again later\n",
1395 		    pp->pr_wchan);
1396 		splx(s);
1397 		return;
1398 	}
1399 	pool_print1(pp, modif, pr);
1400 	simple_unlock(&pp->pr_slock);
1401 	splx(s);
1402 }
1403 
1404 void
1405 pool_print1(struct pool *pp, const char *modif, int (*pr)(const char *, ...))
1406 {
1407 	struct pool_item_header *ph;
1408 	struct pool_cache *pc;
1409 	struct pool_cache_group *pcg;
1410 #ifdef DIAGNOSTIC
1411 	struct pool_item *pi;
1412 #endif
1413 	int i, print_log = 0, print_pagelist = 0, print_cache = 0;
1414 	char c;
1415 
1416 	while ((c = *modif++) != '\0') {
1417 		if (c == 'l')
1418 			print_log = 1;
1419 		if (c == 'p')
1420 			print_pagelist = 1;
1421 		if (c == 'c')
1422 			print_cache = 1;
1424 	}
1425 
1426 	(*pr)("POOL %s: size %u, align %u, ioff %u, roflags 0x%08x\n",
1427 	    pp->pr_wchan, pp->pr_size, pp->pr_align, pp->pr_itemoffset,
1428 	    pp->pr_roflags);
1429 	(*pr)("\talloc %p\n", pp->pr_alloc);
1430 	(*pr)("\tminitems %u, minpages %u, maxpages %u, npages %u\n",
1431 	    pp->pr_minitems, pp->pr_minpages, pp->pr_maxpages, pp->pr_npages);
1432 	(*pr)("\titemsperpage %u, nitems %u, nout %u, hardlimit %u\n",
1433 	    pp->pr_itemsperpage, pp->pr_nitems, pp->pr_nout, pp->pr_hardlimit);
1434 
1435 	(*pr)("\n\tnget %lu, nfail %lu, nput %lu\n",
1436 	    pp->pr_nget, pp->pr_nfail, pp->pr_nput);
1437 	(*pr)("\tnpagealloc %lu, npagefree %lu, hiwat %u, nidle %lu\n",
1438 	    pp->pr_npagealloc, pp->pr_npagefree, pp->pr_hiwat, pp->pr_nidle);
1439 
1440 	if (print_pagelist == 0)
1441 		goto skip_pagelist;
1442 
1443 	if ((ph = TAILQ_FIRST(&pp->pr_pagelist)) != NULL)
1444 		(*pr)("\n\tpage list:\n");
1445 	for (; ph != NULL; ph = TAILQ_NEXT(ph, ph_pagelist)) {
1446 		(*pr)("\t\tpage %p, nmissing %d, time %lu,%lu\n",
1447 		    ph->ph_page, ph->ph_nmissing,
1448 		    (u_long)ph->ph_time.tv_sec,
1449 		    (u_long)ph->ph_time.tv_usec);
1450 #ifdef DIAGNOSTIC
1451 		TAILQ_FOREACH(pi, &ph->ph_itemlist, pi_list) {
1452 			if (pi->pi_magic != PI_MAGIC) {
1453 				(*pr)("\t\t\titem %p, magic 0x%x\n",
1454 				    pi, pi->pi_magic);
1455 			}
1456 		}
1457 #endif
1458 	}
1459 	if (pp->pr_curpage == NULL)
1460 		(*pr)("\tno current page\n");
1461 	else
1462 		(*pr)("\tcurpage %p\n", pp->pr_curpage->ph_page);
1463 
1464  skip_pagelist:
1465 
1466 	if (print_log == 0)
1467 		goto skip_log;
1468 
1469 	(*pr)("\n");
1470 	if ((pp->pr_roflags & PR_LOGGING) == 0)
1471 		(*pr)("\tno log\n");
1472 	else
1473 		pr_printlog(pp, NULL, pr);
1474 
1475  skip_log:
1476 
1477 	if (print_cache == 0)
1478 		goto skip_cache;
1479 
1480 	TAILQ_FOREACH(pc, &pp->pr_cachelist, pc_poollist) {
1481 		(*pr)("\tcache %p: allocfrom %p freeto %p\n", pc,
1482 		    pc->pc_allocfrom, pc->pc_freeto);
1483 		(*pr)("\t    hits %lu misses %lu ngroups %lu nitems %lu\n",
1484 		    pc->pc_hits, pc->pc_misses, pc->pc_ngroups, pc->pc_nitems);
1485 		TAILQ_FOREACH(pcg, &pc->pc_grouplist, pcg_list) {
1486 			(*pr)("\t\tgroup %p: avail %d\n", pcg, pcg->pcg_avail);
1487 			for (i = 0; i < PCG_NOBJECTS; i++)
1488 				(*pr)("\t\t\t%p\n", pcg->pcg_objects[i]);
1489 		}
1490 	}
1491 
1492  skip_cache:
1493 
1494 	pr_enter_check(pp, pr);
1495 }
1496 
1497 int
1498 pool_chk(struct pool *pp, const char *label)
1499 {
1500 	struct pool_item_header *ph;
1501 	int r = 0;
1502 
1503 	simple_lock(&pp->pr_slock);
1504 
1505 	TAILQ_FOREACH(ph, &pp->pr_pagelist, ph_pagelist) {
1506 		struct pool_item *pi;
1507 		int n;
1508 		caddr_t page;
1509 
1510 		page = (caddr_t)((vaddr_t)ph & pp->pr_alloc->pa_pagemask);
1511 		if (page != ph->ph_page &&
1512 		    (pp->pr_roflags & PR_PHINPAGE) != 0) {
1513 			if (label != NULL)
1514 				printf("%s: ", label);
1515 			printf("pool(%p:%s): page inconsistency: page %p;"
1516 			       " at page head addr %p (p %p)\n", pp,
1517 				pp->pr_wchan, ph->ph_page,
1518 				ph, page);
1519 			r++;
1520 			goto out;
1521 		}
1522 
1523 		for (pi = TAILQ_FIRST(&ph->ph_itemlist), n = 0;
1524 		     pi != NULL;
1525 		     pi = TAILQ_NEXT(pi,pi_list), n++) {
1526 
1527 #ifdef DIAGNOSTIC
1528 			if (pi->pi_magic != PI_MAGIC) {
1529 				if (label != NULL)
1530 					printf("%s: ", label);
1531 				printf("pool(%s): free list modified: magic=%x;"
1532 				       " page %p; item ordinal %d;"
1533 				       " addr %p (p %p)\n",
1534 					pp->pr_wchan, pi->pi_magic, ph->ph_page,
1535 					n, pi, page);
1536 				panic("pool");
1537 			}
1538 #endif
1539 			page = (caddr_t)((vaddr_t)pi & pp->pr_alloc->pa_pagemask);
1540 			if (page == ph->ph_page)
1541 				continue;
1542 
1543 			if (label != NULL)
1544 				printf("%s: ", label);
1545 			printf("pool(%p:%s): page inconsistency: page %p;"
1546 			       " item ordinal %d; addr %p (p %p)\n", pp,
1547 				pp->pr_wchan, ph->ph_page,
1548 				n, pi, page);
1549 			r++;
1550 			goto out;
1551 		}
1552 	}
1553 out:
1554 	simple_unlock(&pp->pr_slock);
1555 	return (r);
1556 }
1557 
1558 /*
1559  * pool_cache_init:
1560  *
1561  *	Initialize a pool cache.
1562  *
1563  *	NOTE: If the pool must be protected from interrupts, we expect
1564  *	to be called at the appropriate interrupt priority level.
1565  */
1566 void
1567 pool_cache_init(struct pool_cache *pc, struct pool *pp,
1568     int (*ctor)(void *, void *, int),
1569     void (*dtor)(void *, void *),
1570     void *arg)
1571 {
1572 
1573 	TAILQ_INIT(&pc->pc_grouplist);
1574 	simple_lock_init(&pc->pc_slock);
1575 
1576 	pc->pc_allocfrom = NULL;
1577 	pc->pc_freeto = NULL;
1578 	pc->pc_pool = pp;
1579 
1580 	pc->pc_ctor = ctor;
1581 	pc->pc_dtor = dtor;
1582 	pc->pc_arg  = arg;
1583 
1584 	pc->pc_hits   = 0;
1585 	pc->pc_misses = 0;
1586 
1587 	pc->pc_ngroups = 0;
1588 
1589 	pc->pc_nitems = 0;
1590 
1591 	simple_lock(&pp->pr_slock);
1592 	TAILQ_INSERT_TAIL(&pp->pr_cachelist, pc, pc_poollist);
1593 	simple_unlock(&pp->pr_slock);
1594 }
1595 
1596 /*
1597  * pool_cache_destroy:
1598  *
1599  *	Destroy a pool cache.
1600  */
1601 void
1602 pool_cache_destroy(struct pool_cache *pc)
1603 {
1604 	struct pool *pp = pc->pc_pool;
1605 
1606 	/* First, invalidate the entire cache. */
1607 	pool_cache_invalidate(pc);
1608 
1609 	/* ...and remove it from the pool's cache list. */
1610 	simple_lock(&pp->pr_slock);
1611 	TAILQ_REMOVE(&pp->pr_cachelist, pc, pc_poollist);
1612 	simple_unlock(&pp->pr_slock);
1613 }
1614 
1615 static __inline void *
1616 pcg_get(struct pool_cache_group *pcg)
1617 {
1618 	void *object;
1619 	u_int idx;
1620 
1621 	KASSERT(pcg->pcg_avail <= PCG_NOBJECTS);
1622 	KASSERT(pcg->pcg_avail != 0);
1623 	idx = --pcg->pcg_avail;
1624 
1625 	KASSERT(pcg->pcg_objects[idx] != NULL);
1626 	object = pcg->pcg_objects[idx];
1627 	pcg->pcg_objects[idx] = NULL;
1628 
1629 	return (object);
1630 }
1631 
1632 static __inline void
1633 pcg_put(struct pool_cache_group *pcg, void *object)
1634 {
1635 	u_int idx;
1636 
1637 	KASSERT(pcg->pcg_avail < PCG_NOBJECTS);
1638 	idx = pcg->pcg_avail++;
1639 
1640 	KASSERT(pcg->pcg_objects[idx] == NULL);
1641 	pcg->pcg_objects[idx] = object;
1642 }
1643 
1644 /*
1645  * pool_cache_get:
1646  *
1647  *	Get an object from a pool cache.
1648  */
1649 void *
1650 pool_cache_get(struct pool_cache *pc, int flags)
1651 {
1652 	struct pool_cache_group *pcg;
1653 	void *object;
1654 
1655 #ifdef LOCKDEBUG
1656 	if (flags & PR_WAITOK)
1657 		simple_lock_only_held(NULL, "pool_cache_get(PR_WAITOK)");
1658 #endif
1659 
1660 	simple_lock(&pc->pc_slock);
1661 
1662 	if ((pcg = pc->pc_allocfrom) == NULL) {
1663 		TAILQ_FOREACH(pcg, &pc->pc_grouplist, pcg_list) {
1664 			if (pcg->pcg_avail != 0) {
1665 				pc->pc_allocfrom = pcg;
1666 				goto have_group;
1667 			}
1668 		}
1669 
1670 		/*
1671 		 * No groups with any available objects.  Allocate
1672 		 * a new object, construct it, and return it to
1673 		 * the caller.  We will allocate a group, if necessary,
1674 		 * when the object is freed back to the cache.
1675 		 */
1676 		pc->pc_misses++;
1677 		simple_unlock(&pc->pc_slock);
1678 		object = pool_get(pc->pc_pool, flags);
1679 		if (object != NULL && pc->pc_ctor != NULL) {
1680 			if ((*pc->pc_ctor)(pc->pc_arg, object, flags) != 0) {
1681 				pool_put(pc->pc_pool, object);
1682 				return (NULL);
1683 			}
1684 		}
1685 		return (object);
1686 	}
1687 
1688  have_group:
1689 	pc->pc_hits++;
1690 	pc->pc_nitems--;
1691 	object = pcg_get(pcg);
1692 
1693 	if (pcg->pcg_avail == 0)
1694 		pc->pc_allocfrom = NULL;
1695 
1696 	simple_unlock(&pc->pc_slock);
1697 
1698 	return (object);
1699 }
1700 
1701 /*
1702  * pool_cache_put:
1703  *
1704  *	Put an object back to the pool cache.
1705  */
1706 void
1707 pool_cache_put(struct pool_cache *pc, void *object)
1708 {
1709 	struct pool_cache_group *pcg;
1710 	int s;
1711 
1712 	simple_lock(&pc->pc_slock);
1713 
1714 	if ((pcg = pc->pc_freeto) == NULL) {
1715 		TAILQ_FOREACH(pcg, &pc->pc_grouplist, pcg_list) {
1716 			if (pcg->pcg_avail != PCG_NOBJECTS) {
1717 				pc->pc_freeto = pcg;
1718 				goto have_group;
1719 			}
1720 		}
1721 
1722 		/*
1723 		 * No empty groups to free the object to.  Attempt to
1724 		 * allocate one.
1725 		 */
1726 		simple_unlock(&pc->pc_slock);
1727 		s = splvm();
1728 		pcg = pool_get(&pcgpool, PR_NOWAIT);
1729 		splx(s);
1730 		if (pcg != NULL) {
1731 			memset(pcg, 0, sizeof(*pcg));
1732 			simple_lock(&pc->pc_slock);
1733 			pc->pc_ngroups++;
1734 			TAILQ_INSERT_TAIL(&pc->pc_grouplist, pcg, pcg_list);
1735 			if (pc->pc_freeto == NULL)
1736 				pc->pc_freeto = pcg;
1737 			goto have_group;
1738 		}
1739 
1740 		/*
1741 		 * Unable to allocate a cache group; destruct the object
1742 		 * and free it back to the pool.
1743 		 */
1744 		pool_cache_destruct_object(pc, object);
1745 		return;
1746 	}
1747 
1748  have_group:
1749 	pc->pc_nitems++;
1750 	pcg_put(pcg, object);
1751 
1752 	if (pcg->pcg_avail == PCG_NOBJECTS)
1753 		pc->pc_freeto = NULL;
1754 
1755 	simple_unlock(&pc->pc_slock);
1756 }
1757 
1758 /*
1759  * pool_cache_destruct_object:
1760  *
1761  *	Force destruction of an object and its release back into
1762  *	the pool.
1763  */
1764 void
1765 pool_cache_destruct_object(struct pool_cache *pc, void *object)
1766 {
1767 
1768 	if (pc->pc_dtor != NULL)
1769 		(*pc->pc_dtor)(pc->pc_arg, object);
1770 	pool_put(pc->pc_pool, object);
1771 }
1772 
1773 /*
1774  * pool_cache_do_invalidate:
1775  *
1776  *	This internal function implements pool_cache_invalidate() and
1777  *	pool_cache_reclaim().
1778  */
1779 void
1780 pool_cache_do_invalidate(struct pool_cache *pc, int free_groups,
1781     void (*putit)(struct pool *, void *))
1782 {
1783 	struct pool_cache_group *pcg, *npcg;
1784 	void *object;
1785 	int s;
1786 
1787 	for (pcg = TAILQ_FIRST(&pc->pc_grouplist); pcg != NULL;
1788 	     pcg = npcg) {
1789 		npcg = TAILQ_NEXT(pcg, pcg_list);
1790 		while (pcg->pcg_avail != 0) {
1791 			pc->pc_nitems--;
1792 			object = pcg_get(pcg);
1793 			if (pcg->pcg_avail == 0 && pc->pc_allocfrom == pcg)
1794 				pc->pc_allocfrom = NULL;
1795 			if (pc->pc_dtor != NULL)
1796 				(*pc->pc_dtor)(pc->pc_arg, object);
1797 			(*putit)(pc->pc_pool, object);
1798 		}
1799 		if (free_groups) {
1800 			pc->pc_ngroups--;
1801 			TAILQ_REMOVE(&pc->pc_grouplist, pcg, pcg_list);
1802 			if (pc->pc_freeto == pcg)
1803 				pc->pc_freeto = NULL;
1804 			s = splvm();
1805 			pool_put(&pcgpool, pcg);
1806 			splx(s);
1807 		}
1808 	}
1809 }
1810 
1811 /*
1812  * pool_cache_invalidate:
1813  *
1814  *	Invalidate a pool cache (destruct and release all of the
1815  *	cached objects).
1816  */
1817 void
1818 pool_cache_invalidate(struct pool_cache *pc)
1819 {
1820 
1821 	simple_lock(&pc->pc_slock);
1822 	pool_cache_do_invalidate(pc, 0, pool_put);
1823 	simple_unlock(&pc->pc_slock);
1824 }
1825 
1826 /*
1827  * pool_cache_reclaim:
1828  *
1829  *	Reclaim a pool cache for pool_reclaim().
1830  */
1831 void
1832 pool_cache_reclaim(struct pool_cache *pc)
1833 {
1834 
1835 	simple_lock(&pc->pc_slock);
1836 	pool_cache_do_invalidate(pc, 1, pool_do_put);
1837 	simple_unlock(&pc->pc_slock);
1838 }
1839 
1840 /*
1841  * We have three different sysctls.
1842  * kern.pool.npools - the number of pools.
1843  * kern.pool.pool.<pool#> - the pool struct for the pool#.
1844  * kern.pool.name.<pool#> - the name for pool#.
1845  */
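/*
 * For example, a userland consumer would typically read the pool count
 * roughly as follows (a sketch; it assumes the usual CTL_KERN/KERN_POOL
 * mib names):
 *
 *	int mib[3] = { CTL_KERN, KERN_POOL, KERN_POOL_NPOOLS };
 *	int npools;
 *	size_t len = sizeof(npools);
 *
 *	if (sysctl(mib, 3, &npools, &len, NULL, 0) == -1)
 *		err(1, "sysctl");
 */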
1846 int
1847 sysctl_dopool(int *name, u_int namelen, char *where, size_t *sizep)
1848 {
1849 	struct pool *pp, *foundpool = NULL;
1850 	size_t buflen = where != NULL ? *sizep : 0;
1851 	int npools = 0, s;
1852 	unsigned int lookfor;
1853 	size_t len;
1854 
1855 	switch (*name) {
1856 	case KERN_POOL_NPOOLS:
1857 		if (namelen != 1 || buflen != sizeof(int))
1858 			return (EINVAL);
1859 		lookfor = 0;
1860 		break;
1861 	case KERN_POOL_NAME:
1862 		if (namelen != 2 || buflen < 1)
1863 			return (EINVAL);
1864 		lookfor = name[1];
1865 		break;
1866 	case KERN_POOL_POOL:
1867 		if (namelen != 2 || buflen != sizeof(struct pool))
1868 			return (EINVAL);
1869 		lookfor = name[1];
1870 		break;
1871 	default:
1872 		return (EINVAL);
1873 	}
1874 
1875 	s = splvm();
1876 	simple_lock(&pool_head_slock);
1877 
1878 	TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
1879 		npools++;
1880 		if (lookfor == pp->pr_serial) {
1881 			foundpool = pp;
1882 			break;
1883 		}
1884 	}
1885 
1886 	simple_unlock(&pool_head_slock);
1887 	splx(s);
1888 
1889 	if (lookfor != 0 && foundpool == NULL)
1890 		return (ENOENT);
1891 
1892 	switch (*name) {
1893 	case KERN_POOL_NPOOLS:
1894 		return copyout(&npools, where, buflen);
1895 	case KERN_POOL_NAME:
1896 		len = strlen(foundpool->pr_wchan) + 1;
1897 		if (*sizep < len)
1898 			return (ENOMEM);
1899 		*sizep = len;
1900 		return copyout(foundpool->pr_wchan, where, len);
1901 	case KERN_POOL_POOL:
1902 		return copyout(foundpool, where, buflen);
1903 	}
1904 	/* NOTREACHED */
1905 	return (0); /* XXX - Stupid gcc */
1906 }
1907 
1908 /*
1909  * Pool backend allocators.
1910  *
1911  * Each pool has a backend allocator that handles allocation, deallocation
1912  * and any additional draining that might be needed.
1913  *
1914  * We provide two standard allocators.
1915  *  pool_alloc_kmem - the default used when no allocator is specified.
1916  *  pool_alloc_nointr - used for pools that will not be accessed in
1917  *   interrupt context.
1918  */
1919 void	*pool_page_alloc(struct pool *, int);
1920 void	pool_page_free(struct pool *, void *);
1921 void	*pool_page_alloc_nointr(struct pool *, int);
1922 void	pool_page_free_nointr(struct pool *, void *);
1923 
1924 struct pool_allocator pool_allocator_kmem = {
1925 	pool_page_alloc, pool_page_free, 0,
1926 };
1927 struct pool_allocator pool_allocator_nointr = {
1928 	pool_page_alloc_nointr, pool_page_free_nointr, 0,
1929 };
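
/*
 * A subsystem may also provide its own backend, e.g. one backed by a
 * private submap (a sketch; mymap_page_alloc/mymap_page_free are
 * hypothetical):
 *
 *	struct pool_allocator mymap_allocator = {
 *		mymap_page_alloc, mymap_page_free, 0,
 *	};
 *
 *	pool_init(&my_pool, sizeof(struct my_item), 0, 0, 0, "mypl",
 *	    &mymap_allocator);
 */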
1930 
1931 /*
1932  * XXX - we have at least three different resources for the same allocation
1933  *  and each resource can be depleted. First we have the ready elements in
1934  *  the pool. Then we have the resource (typically a vm_map) for this
1935  *  allocator, then we have physical memory. Waiting for any of these can
1936  *  be unnecessary when any other is freed, but the kernel doesn't support
1937  *  sleeping on multiple addresses, so we have to fake it. The caller sleeps on
1938  *  the pool (so that we can be awakened when an item is returned to the pool),
1939  *  but we set PA_WANT on the allocator. When a page is returned to
1940  *  the allocator and PA_WANT is set pool_allocator_free will wakeup all
1941  *  sleeping pools belonging to this allocator. (XXX - thundering herd).
1942  *  We also wake up the allocator in case someone without a pool (malloc)
1943  *  is sleeping waiting for this allocator.
1944  */
1945 
1946 void *
1947 pool_allocator_alloc(struct pool *org, int flags)
1948 {
1949 	struct pool_allocator *pa = org->pr_alloc;
1950 	int freed;
1951 	void *res;
1952 	int s;
1953 
1954 	do {
1955 		if ((res = (*pa->pa_alloc)(org, flags)) != NULL)
1956 			return (res);
1957 		if ((flags & PR_WAITOK) == 0) {
1958 			/*
1959 			 * We only run the drain hook here if PR_NOWAIT.
1960 			 * In other cases the hook will be run in
1961 			 * pool_reclaim.
1962 			 */
1963 			if (org->pr_drain_hook != NULL) {
1964 				(*org->pr_drain_hook)(org->pr_drain_hook_arg,
1965 				    flags);
1966 				if ((res = (*pa->pa_alloc)(org, flags)) != NULL)
1967 					return (res);
1968 			}
1969 			break;
1970 		}
1971 		s = splvm();
1972 		simple_lock(&pa->pa_slock);
1973 		freed = pool_allocator_drain(pa, org, 1);
1974 		simple_unlock(&pa->pa_slock);
1975 		splx(s);
1976 	} while (freed);
1977 	return (NULL);
1978 }
1979 
1980 void
1981 pool_allocator_free(struct pool *pp, void *v)
1982 {
1983 	struct pool_allocator *pa = pp->pr_alloc;
1984 	int s;
1985 
1986 	(*pa->pa_free)(pp, v);
1987 
1988 	s = splvm();
1989 	simple_lock(&pa->pa_slock);
1990 	if ((pa->pa_flags & PA_WANT) == 0) {
1991 		simple_unlock(&pa->pa_slock);
1992 		splx(s);
1993 		return;
1994 	}
1995 
1996 	TAILQ_FOREACH(pp, &pa->pa_list, pr_alloc_list) {
1997 		simple_lock(&pp->pr_slock);
1998 		if ((pp->pr_flags & PR_WANTED) != 0) {
1999 			pp->pr_flags &= ~PR_WANTED;
2000 			wakeup(pp);
2001 		}
2002 		simple_unlock(&pp->pr_slock);
2003 	}
2004 	pa->pa_flags &= ~PA_WANT;
2005 	simple_unlock(&pa->pa_slock);
2006 	splx(s);
2007 }
2008 
2009 /*
2010  * Drain all pools, except 'org', that use this allocator.
2011  *
2012  * Must be called at appropriate spl level and with the allocator locked.
2013  *
2014  * We do this to reclaim va space. pa_alloc is responsible
2015  * for waiting for physical memory.
2016  * XXX - we risk looping forever if someone calls
2017  *  pool_destroy on 'start'. But there is no other way to
2018  *  have potentially sleeping pool_reclaim, non-sleeping
2019  *  locks on pool_allocator and some stirring of drained
2020  *  pools in the allocator.
2021  * XXX - maybe we should use pool_head_slock for locking
2022  *  the allocators?
2023  */
2024 int
2025 pool_allocator_drain(struct pool_allocator *pa, struct pool *org, int need)
2026 {
2027 	struct pool *pp, *start;
2028 	int freed;
2029 
2030 	freed = 0;
2031 
2032 	pp = start = TAILQ_FIRST(&pa->pa_list);
2033 	do {
2034 		TAILQ_REMOVE(&pa->pa_list, pp, pr_alloc_list);
2035 		TAILQ_INSERT_TAIL(&pa->pa_list, pp, pr_alloc_list);
2036 		if (pp == org)
2037 			continue;
2038 		simple_unlock(&pa->pa_slock);
2039 		freed = pool_reclaim(pp);
2040 		simple_lock(&pa->pa_slock);
2041 	} while ((pp = TAILQ_FIRST(&pa->pa_list)) != start && (freed < need));
2042 
2043 	if (!freed) {
2044 		/*
2045 		 * We set PA_WANT here, the caller will most likely
2046 		 * sleep waiting for pages (if not, this won't hurt
2047 		 * that much) and there is no way to set this in the
2048 		 * caller without violating locking order.
2049 		 */
2050 		pa->pa_flags |= PA_WANT;
2051 	}
2052 
2053 	return (freed);
2054 }
2055 
2056 void *
2057 pool_page_alloc(struct pool *pp, int flags)
2058 {
2059 	boolean_t waitok = (flags & PR_WAITOK) ? TRUE : FALSE;
2060 
2061 	return ((void *)uvm_km_alloc_poolpage1(kmem_map, uvmexp.kmem_object,
2062 	    waitok));
2063 }
2064 
2065 void
2066 pool_page_free(struct pool *pp, void *v)
2067 {
2068 
2069 	uvm_km_free_poolpage1(kmem_map, (vaddr_t)v);
2070 }
2071 
2072 void *
2073 pool_page_alloc_nointr(struct pool *pp, int flags)
2074 {
2075 	boolean_t waitok = (flags & PR_WAITOK) ? TRUE : FALSE;
2076 
2077 	splassert(IPL_NONE);
2078 
2079 	return ((void *)uvm_km_alloc_poolpage1(kernel_map, uvm.kernel_object,
2080 	    waitok));
2081 }
2082 
2083 void
2084 pool_page_free_nointr(struct pool *pp, void *v)
2085 {
2086 	splassert(IPL_NONE);
2087 
2088 	uvm_km_free_poolpage1(kernel_map, (vaddr_t)v);
2089 }
2090