xref: /openbsd-src/sys/kern/subr_pool.c (revision 3a3fbb3f2e2521ab7c4a56b7ff7462ebd9095ec5)
1 /*	$OpenBSD: subr_pool.c,v 1.14 2001/11/06 19:53:20 miod Exp $	*/
2 /*	$NetBSD: subr_pool.c,v 1.59 2001/06/05 18:51:04 thorpej Exp $	*/
3 
4 /*-
5  * Copyright (c) 1997, 1999, 2000 The NetBSD Foundation, Inc.
6  * All rights reserved.
7  *
8  * This code is derived from software contributed to The NetBSD Foundation
9  * by Paul Kranenburg; by Jason R. Thorpe of the Numerical Aerospace
10  * Simulation Facility, NASA Ames Research Center.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. All advertising materials mentioning features or use of this software
21  *    must display the following acknowledgement:
22  *	This product includes software developed by the NetBSD
23  *	Foundation, Inc. and its contributors.
24  * 4. Neither the name of The NetBSD Foundation nor the names of its
25  *    contributors may be used to endorse or promote products derived
26  *    from this software without specific prior written permission.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
29  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
30  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
31  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
32  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38  * POSSIBILITY OF SUCH DAMAGE.
39  */
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/proc.h>
44 #include <sys/errno.h>
45 #include <sys/kernel.h>
46 #include <sys/malloc.h>
47 #include <sys/lock.h>
48 #include <sys/pool.h>
49 #include <sys/syslog.h>
50 #include <sys/sysctl.h>
51 
52 #include <uvm/uvm.h>
53 
54 /*
55  * XXX - for now.
56  */
57 #define SIMPLELOCK_INITIALIZER { SLOCK_UNLOCKED }
58 #ifdef LOCKDEBUG
59 #define simple_lock_freecheck(a, s) do { /* nothing */ } while (0)
60 #define simple_lock_only_held(lkp, str) do { /* nothing */ } while (0)
61 #endif
62 #define LOCK_ASSERT(x) /* nothing */
63 
64 /*
65  * Pool resource management utility.
66  *
67  * Memory is allocated in pages which are split into pieces according
68  * to the pool item size. Each page is kept on a list headed by `pr_pagelist'
69  * in the pool structure and the individual pool items are on a linked list
70  * headed by `ph_itemlist' in each page header. The memory for building
71  * the page list is either taken from the allocated pages themselves (for
72  * small pool items) or taken from an internal pool of page headers (`phpool').
73  */
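
/*
 * Illustrative sketch, not part of this file: typical use of the pool
 * API for a hypothetical fixed-size object.  The default page allocator
 * is selected by passing NULL for both hooks; the "foo" names and the
 * M_TEMP malloc type are assumptions made for the example only.
 */
#if 0	/* example only */
struct foo {
	int		f_refcnt;
	TAILQ_ENTRY(foo) f_entry;
};

static struct pool foo_pool;

void
foo_init(void)
{
	/* align/ioff/flags 0, default page size, default allocator. */
	pool_init(&foo_pool, sizeof(struct foo), 0, 0, 0, "foopl",
	    0, NULL, NULL, M_TEMP);
}

struct foo *
foo_alloc(int canwait)
{
	/*
	 * PR_WAITOK may sleep; interrupt-protected callers should
	 * instead run at the proper spl and pass PR_NOWAIT.
	 */
	return (pool_get(&foo_pool, canwait ? PR_WAITOK : PR_NOWAIT));
}

void
foo_free(struct foo *f)
{
	pool_put(&foo_pool, f);
}
#endif	/* example only */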
74 
75 /* List of all pools */
76 TAILQ_HEAD(,pool) pool_head = TAILQ_HEAD_INITIALIZER(pool_head);
77 
78 /* Private pool for page header structures */
79 static struct pool phpool;
80 
81 /* # of seconds to retain page after last use */
82 int pool_inactive_time = 10;
83 
84 /* Next candidate for drainage (see pool_drain()) */
85 static struct pool	*drainpp;
86 
87 /* This spin lock protects both pool_head and drainpp. */
88 struct simplelock pool_head_slock = SIMPLELOCK_INITIALIZER;
89 
90 struct pool_item_header {
91 	/* Page headers */
92 	TAILQ_ENTRY(pool_item_header)
93 				ph_pagelist;	/* pool page list */
94 	TAILQ_HEAD(,pool_item)	ph_itemlist;	/* chunk list for this page */
95 	LIST_ENTRY(pool_item_header)
96 				ph_hashlist;	/* Off-page page headers */
97 	int			ph_nmissing;	/* # of chunks in use */
98 	caddr_t			ph_page;	/* this page's address */
99 	struct timeval		ph_time;	/* last referenced */
100 };
101 
102 struct pool_item {
103 #ifdef DIAGNOSTIC
104 	int pi_magic;
105 #endif
106 #define	PI_MAGIC 0xdeadbeef
107 	/* Other entries use only this list entry */
108 	TAILQ_ENTRY(pool_item)	pi_list;
109 };
110 
111 
112 #define	PR_HASH_INDEX(pp,addr) \
113 	(((u_long)(addr) >> (pp)->pr_pageshift) & (PR_HASHTABSIZE - 1))
114 
115 #define	POOL_NEEDS_CATCHUP(pp)						\
116 	((pp)->pr_nitems < (pp)->pr_minitems)
117 
118 /*
119  * Every pool gets a unique serial number assigned to it. If this counter
120  * wraps, we're screwed, but we shouldn't create so many pools anyway.
121  */
122 unsigned int pool_serial;
123 
124 /*
125  * Pool cache management.
126  *
127  * Pool caches provide a way for constructed objects to be cached by the
128  * pool subsystem.  This can lead to performance improvements by avoiding
129  * needless object construction/destruction, which is deferred until
130  * absolutely necessary.
131  *
132  * Caches are grouped into cache groups.  Each cache group references
133  * up to 16 constructed objects.  When a cache allocates an object
134  * from the pool, it calls the object's constructor and places it into
135  * a cache group.  When a cache group frees an object back to the pool,
136  * it first calls the object's destructor.  This allows the object to
137  * persist in constructed form while freed to the cache.
138  *
139  * Multiple caches may exist for each pool.  This allows a single
140  * object type to have multiple constructed forms.  The pool references
141  * each cache, so that when a pool is drained by the pagedaemon, it can
142  * drain each individual cache as well.  Each time a cache is drained,
143  * the most idle cache group is freed to the pool in its entirety.
144  *
145  * Pool caches are laid on top of pools.  By layering them, we can avoid
146  * the complexity of cache management for pools which would not benefit
147  * from it.
148  */
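
/*
 * Illustrative sketch, not part of this file: a pool cache layered on a
 * hypothetical "bar" pool.  The constructor/destructor names and the
 * M_TEMP malloc type are assumptions made for the example only.
 */
#if 0	/* example only */
struct bar {
	struct simplelock b_slock;
	int		b_state;
};

static struct pool bar_pool;
static struct pool_cache bar_cache;

static int
bar_ctor(void *arg, void *object, int flags)
{
	struct bar *b = object;

	/* One-time setup that survives trips through the cache. */
	simple_lock_init(&b->b_slock);
	b->b_state = 0;
	return (0);		/* non-zero would fail pool_cache_get() */
}

static void
bar_dtor(void *arg, void *object)
{
	/* Undo bar_ctor(); nothing to tear down in this example. */
}

void
bar_init(void)
{
	pool_init(&bar_pool, sizeof(struct bar), 0, 0, 0, "barpl",
	    0, NULL, NULL, M_TEMP);
	pool_cache_init(&bar_cache, &bar_pool, bar_ctor, bar_dtor, NULL);
}

struct bar *
bar_get(void)
{
	/* Comes back constructed, from a cache group when one has objects. */
	return (pool_cache_get(&bar_cache, PR_WAITOK));
}

void
bar_put(struct bar *b)
{
	/* Stays constructed while it sits in the cache. */
	pool_cache_put(&bar_cache, b);
}
#endif	/* example only */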
149 
150 /* The cache group pool. */
151 static struct pool pcgpool;
152 
153 /* The pool cache group. */
154 #define	PCG_NOBJECTS		16
155 struct pool_cache_group {
156 	TAILQ_ENTRY(pool_cache_group)
157 		pcg_list;	/* link in the pool cache's group list */
158 	u_int	pcg_avail;	/* # available objects */
159 				/* pointers to the objects */
160 	void	*pcg_objects[PCG_NOBJECTS];
161 };
162 
163 static void	pool_cache_reclaim(struct pool_cache *);
164 
165 static int	pool_catchup(struct pool *);
166 static void	pool_prime_page(struct pool *, caddr_t,
167 		    struct pool_item_header *);
168 static void	*pool_page_alloc(unsigned long, int, int);
169 static void	pool_page_free(void *, unsigned long, int);
170 
171 static void pool_print1(struct pool *, const char *,
172 	int (*)(const char *, ...));
173 
174 /*
175  * Pool log entry. An array of these is allocated in pool_init().
176  */
177 struct pool_log {
178 	const char	*pl_file;
179 	long		pl_line;
180 	int		pl_action;
181 #define	PRLOG_GET	1
182 #define	PRLOG_PUT	2
183 	void		*pl_addr;
184 };
185 
186 /* Number of entries in pool log buffers */
187 #ifndef POOL_LOGSIZE
188 #define	POOL_LOGSIZE	10
189 #endif
190 
191 int pool_logsize = POOL_LOGSIZE;
192 
193 #ifdef POOL_DIAGNOSTIC
194 static __inline void
195 pr_log(struct pool *pp, void *v, int action, const char *file, long line)
196 {
197 	int n = pp->pr_curlogentry;
198 	struct pool_log *pl;
199 
200 	if ((pp->pr_roflags & PR_LOGGING) == 0)
201 		return;
202 
203 	/*
204 	 * Fill in the current entry. Wrap around and overwrite
205 	 * the oldest entry if necessary.
206 	 */
207 	pl = &pp->pr_log[n];
208 	pl->pl_file = file;
209 	pl->pl_line = line;
210 	pl->pl_action = action;
211 	pl->pl_addr = v;
212 	if (++n >= pp->pr_logsize)
213 		n = 0;
214 	pp->pr_curlogentry = n;
215 }
216 
217 static void
218 pr_printlog(struct pool *pp, struct pool_item *pi,
219     int (*pr)(const char *, ...))
220 {
221 	int i = pp->pr_logsize;
222 	int n = pp->pr_curlogentry;
223 
224 	if ((pp->pr_roflags & PR_LOGGING) == 0)
225 		return;
226 
227 	/*
228 	 * Print all entries in this pool's log.
229 	 */
230 	while (i-- > 0) {
231 		struct pool_log *pl = &pp->pr_log[n];
232 		if (pl->pl_action != 0) {
233 			if (pi == NULL || pi == pl->pl_addr) {
234 				(*pr)("\tlog entry %d:\n", i);
235 				(*pr)("\t\taction = %s, addr = %p\n",
236 				    pl->pl_action == PRLOG_GET ? "get" : "put",
237 				    pl->pl_addr);
238 				(*pr)("\t\tfile: %s at line %lu\n",
239 				    pl->pl_file, pl->pl_line);
240 			}
241 		}
242 		if (++n >= pp->pr_logsize)
243 			n = 0;
244 	}
245 }
246 
247 static __inline void
248 pr_enter(struct pool *pp, const char *file, long line)
249 {
250 
251 	if (__predict_false(pp->pr_entered_file != NULL)) {
252 		printf("pool %s: reentrancy at file %s line %ld\n",
253 		    pp->pr_wchan, file, line);
254 		printf("         previous entry at file %s line %ld\n",
255 		    pp->pr_entered_file, pp->pr_entered_line);
256 		panic("pr_enter");
257 	}
258 
259 	pp->pr_entered_file = file;
260 	pp->pr_entered_line = line;
261 }
262 
263 static __inline void
264 pr_leave(struct pool *pp)
265 {
266 
267 	if (__predict_false(pp->pr_entered_file == NULL)) {
268 		printf("pool %s not entered?\n", pp->pr_wchan);
269 		panic("pr_leave");
270 	}
271 
272 	pp->pr_entered_file = NULL;
273 	pp->pr_entered_line = 0;
274 }
275 
276 static __inline void
277 pr_enter_check(struct pool *pp, int (*pr)(const char *, ...))
278 {
279 
280 	if (pp->pr_entered_file != NULL)
281 		(*pr)("\n\tcurrently entered from file %s line %ld\n",
282 		    pp->pr_entered_file, pp->pr_entered_line);
283 }
284 #else
285 #define	pr_log(pp, v, action, file, line)
286 #define	pr_printlog(pp, pi, pr)
287 #define	pr_enter(pp, file, line)
288 #define	pr_leave(pp)
289 #define	pr_enter_check(pp, pr)
290 #endif /* POOL_DIAGNOSTIC */
291 
292 /*
293  * Return the pool page header based on page address.
294  */
295 static __inline struct pool_item_header *
296 pr_find_pagehead(struct pool *pp, caddr_t page)
297 {
298 	struct pool_item_header *ph;
299 
300 	if ((pp->pr_roflags & PR_PHINPAGE) != 0)
301 		return ((struct pool_item_header *)(page + pp->pr_phoffset));
302 
303 	for (ph = LIST_FIRST(&pp->pr_hashtab[PR_HASH_INDEX(pp, page)]);
304 	     ph != NULL;
305 	     ph = LIST_NEXT(ph, ph_hashlist)) {
306 		if (ph->ph_page == page)
307 			return (ph);
308 	}
309 	return (NULL);
310 }
311 
312 /*
313  * Remove a page from the pool.
314  */
315 static __inline void
316 pr_rmpage(struct pool *pp, struct pool_item_header *ph)
317 {
318 
319 	/*
320 	 * If the page was idle, decrement the idle page count.
321 	 */
322 	if (ph->ph_nmissing == 0) {
323 #ifdef DIAGNOSTIC
324 		if (pp->pr_nidle == 0)
325 			panic("pr_rmpage: nidle inconsistent");
326 		if (pp->pr_nitems < pp->pr_itemsperpage)
327 			panic("pr_rmpage: nitems inconsistent");
328 #endif
329 		pp->pr_nidle--;
330 	}
331 
332 	pp->pr_nitems -= pp->pr_itemsperpage;
333 
334 	/*
335 	 * Unlink a page from the pool and release it.
336 	 */
337 	TAILQ_REMOVE(&pp->pr_pagelist, ph, ph_pagelist);
338 	(*pp->pr_free)(ph->ph_page, pp->pr_pagesz, pp->pr_mtype);
339 	pp->pr_npages--;
340 	pp->pr_npagefree++;
341 
342 	if ((pp->pr_roflags & PR_PHINPAGE) == 0) {
343 		int s;
344 		LIST_REMOVE(ph, ph_hashlist);
345 		s = splhigh();
346 		pool_put(&phpool, ph);
347 		splx(s);
348 	}
349 
350 	if (pp->pr_curpage == ph) {
351 		/*
352 		 * Find a new non-empty page header, if any.
353 		 * Start search from the page head, to increase the
354 		 * chance for "high water" pages to be freed.
355 		 */
356 		for (ph = TAILQ_FIRST(&pp->pr_pagelist); ph != NULL;
357 		     ph = TAILQ_NEXT(ph, ph_pagelist))
358 			if (TAILQ_FIRST(&ph->ph_itemlist) != NULL)
359 				break;
360 
361 		pp->pr_curpage = ph;
362 	}
363 }
364 
365 /*
366  * Initialize the given pool resource structure.
367  *
368  * We export this routine to allow other kernel parts to declare
369  * static pools that must be initialized before malloc() is available.
370  */
371 void
372 pool_init(struct pool *pp, size_t size, u_int align, u_int ioff, int flags,
373     const char *wchan, size_t pagesz,
374     void *(*alloc)(unsigned long, int, int),
375     void (*release)(void *, unsigned long, int),
376     int mtype)
377 {
378 	int off, slack, i;
379 
380 #ifdef POOL_DIAGNOSTIC
381 	/*
382 	 * Always log if POOL_DIAGNOSTIC is defined.
383 	 */
384 	if (pool_logsize != 0)
385 		flags |= PR_LOGGING;
386 #endif
387 
388 	/*
389 	 * Check arguments and construct default values.
390 	 */
391 	if (!powerof2(pagesz))
392 		panic("pool_init: page size invalid (%lx)\n", (u_long)pagesz);
393 
394 	if (alloc == NULL && release == NULL) {
395 		alloc = pool_page_alloc;
396 		release = pool_page_free;
397 		pagesz = PAGE_SIZE;	/* Rounds to PAGE_SIZE anyhow. */
398 	} else if ((alloc != NULL && release != NULL) == 0) {
399 		/* If you specify one, you must specify both. */
400 		panic("pool_init: must specify alloc and release together");
401 	}
402 
403 	if (pagesz == 0)
404 		pagesz = PAGE_SIZE;
405 
406 	if (align == 0)
407 		align = ALIGN(1);
408 
409 	if (size < sizeof(struct pool_item))
410 		size = sizeof(struct pool_item);
411 
412 	size = ALIGN(size);
413 	if (size > pagesz)
414 		panic("pool_init: pool item size (%lu) too large",
415 		      (u_long)size);
416 
417 	/*
418 	 * Initialize the pool structure.
419 	 */
420 	TAILQ_INIT(&pp->pr_pagelist);
421 	TAILQ_INIT(&pp->pr_cachelist);
422 	pp->pr_curpage = NULL;
423 	pp->pr_npages = 0;
424 	pp->pr_minitems = 0;
425 	pp->pr_minpages = 0;
426 	pp->pr_maxpages = UINT_MAX;
427 	pp->pr_roflags = flags;
428 	pp->pr_flags = 0;
429 	pp->pr_size = size;
430 	pp->pr_align = align;
431 	pp->pr_wchan = wchan;
432 	pp->pr_mtype = mtype;
433 	pp->pr_alloc = alloc;
434 	pp->pr_free = release;
435 	pp->pr_pagesz = pagesz;
436 	pp->pr_pagemask = ~(pagesz - 1);
437 	pp->pr_pageshift = ffs(pagesz) - 1;
438 	pp->pr_nitems = 0;
439 	pp->pr_nout = 0;
440 	pp->pr_hardlimit = UINT_MAX;
441 	pp->pr_hardlimit_warning = NULL;
442 	pp->pr_hardlimit_ratecap.tv_sec = 0;
443 	pp->pr_hardlimit_ratecap.tv_usec = 0;
444 	pp->pr_hardlimit_warning_last.tv_sec = 0;
445 	pp->pr_hardlimit_warning_last.tv_usec = 0;
446 	pp->pr_serial = ++pool_serial;
447 	if (pool_serial == 0)
448 		panic("pool_init: too much uptime");
449 
450 	/*
451 	 * Decide whether to put the page header off page to avoid
452 	 * wasting too large a part of the page. Off-page page headers
453 	 * go on a hash table, so we can match a returned item
454 	 * with its header based on the page address.
455 	 * We use 1/16 of the page size as the threshold (XXX: tune)
456 	 */
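	/*
	 * For example, with pagesz = 4096 the threshold is 256 bytes: a
	 * 128-byte item keeps its header in-page at offset
	 * 4096 - ALIGN(sizeof(struct pool_item_header)), while a 512-byte
	 * item has its header allocated from phpool instead.
	 */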
457 	if (pp->pr_size < pagesz/16) {
458 		/* Use the end of the page for the page header */
459 		pp->pr_roflags |= PR_PHINPAGE;
460 		pp->pr_phoffset = off =
461 			pagesz - ALIGN(sizeof(struct pool_item_header));
462 	} else {
463 		/* The page header will be taken from our page header pool */
464 		pp->pr_phoffset = 0;
465 		off = pagesz;
466 		for (i = 0; i < PR_HASHTABSIZE; i++) {
467 			LIST_INIT(&pp->pr_hashtab[i]);
468 		}
469 	}
470 
471 	/*
472 	 * Alignment is to take place at `ioff' within the item. This means
473 	 * we must reserve up to `align - 1' bytes on the page to allow
474 	 * appropriate positioning of each item.
475 	 *
476 	 * Silently enforce `0 <= ioff < align'.
477 	 */
478 	pp->pr_itemoffset = ioff = ioff % align;
479 	pp->pr_itemsperpage = (off - ((align - ioff) % align)) / pp->pr_size;
480 	KASSERT(pp->pr_itemsperpage != 0);
481 
482 	/*
483 	 * Use the slack between the chunks and the page header
484 	 * for "cache coloring".
485 	 */
486 	slack = off - pp->pr_itemsperpage * pp->pr_size;
487 	pp->pr_maxcolor = (slack / align) * align;
488 	pp->pr_curcolor = 0;
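	/*
	 * Worked example with assumed numbers: off = 4048 on a 4096-byte
	 * page, align = 8, ioff = 0 and a 64-byte item give
	 * pr_itemsperpage = 4048 / 64 = 63, slack = 4048 - 63 * 64 = 16
	 * and pr_maxcolor = 16, so successive pages place their first
	 * item at color offsets 0, 8, 16, 0, ...
	 */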
489 
490 	pp->pr_nget = 0;
491 	pp->pr_nfail = 0;
492 	pp->pr_nput = 0;
493 	pp->pr_npagealloc = 0;
494 	pp->pr_npagefree = 0;
495 	pp->pr_hiwat = 0;
496 	pp->pr_nidle = 0;
497 
498 #ifdef POOL_DIAGNOSTIC
499 	if (flags & PR_LOGGING) {
500 		if (kmem_map == NULL ||
501 		    (pp->pr_log = malloc(pool_logsize * sizeof(struct pool_log),
502 		     M_TEMP, M_NOWAIT)) == NULL)
503 			pp->pr_roflags &= ~PR_LOGGING;
504 		pp->pr_curlogentry = 0;
505 		pp->pr_logsize = pool_logsize;
506 	}
507 #endif
508 
509 	pp->pr_entered_file = NULL;
510 	pp->pr_entered_line = 0;
511 
512 	simple_lock_init(&pp->pr_slock);
513 
514 	/*
515 	 * Initialize private page header pool and cache magazine pool if we
516 	 * haven't done so yet.
517 	 * XXX LOCKING.
518 	 */
519 	if (phpool.pr_size == 0) {
520 		pool_init(&phpool, sizeof(struct pool_item_header), 0, 0,
521 		    0, "phpool", 0, 0, 0, 0);
522 		pool_init(&pcgpool, sizeof(struct pool_cache_group), 0, 0,
523 		    0, "pcgpool", 0, 0, 0, 0);
524 	}
525 
526 	/* Insert into the list of all pools. */
527 	simple_lock(&pool_head_slock);
528 	TAILQ_INSERT_TAIL(&pool_head, pp, pr_poollist);
529 	simple_unlock(&pool_head_slock);
530 }
531 
532 /*
533  * De-commission a pool resource.
534  */
535 void
536 pool_destroy(struct pool *pp)
537 {
538 	struct pool_item_header *ph;
539 	struct pool_cache *pc;
540 
541 	/* Destroy all caches for this pool. */
542 	while ((pc = TAILQ_FIRST(&pp->pr_cachelist)) != NULL)
543 		pool_cache_destroy(pc);
544 
545 #ifdef DIAGNOSTIC
546 	if (pp->pr_nout != 0) {
547 		pr_printlog(pp, NULL, printf);
548 		panic("pool_destroy: pool busy: still out: %u\n",
549 		    pp->pr_nout);
550 	}
551 #endif
552 
553 	/* Remove all pages */
554 	if ((pp->pr_roflags & PR_STATIC) == 0)
555 		while ((ph = pp->pr_pagelist.tqh_first) != NULL)
556 			pr_rmpage(pp, ph);
557 
558 	/* Remove from global pool list */
559 	simple_lock(&pool_head_slock);
560 	TAILQ_REMOVE(&pool_head, pp, pr_poollist);
561 	/* XXX Only clear this if we were drainpp? */
562 	drainpp = NULL;
563 	simple_unlock(&pool_head_slock);
564 
565 #ifdef POOL_DIAGNOSTIC
566 	if ((pp->pr_roflags & PR_LOGGING) != 0)
567 		free(pp->pr_log, M_TEMP);
568 #endif
569 
570 	if (pp->pr_roflags & PR_FREEHEADER)
571 		free(pp, M_POOL);
572 }
573 
574 static __inline struct pool_item_header *
575 pool_alloc_item_header(struct pool *pp, caddr_t storage, int flags)
576 {
577 	struct pool_item_header *ph;
578 	int s;
579 
580 	LOCK_ASSERT(simple_lock_held(&pp->pr_slock) == 0);
581 
582 	if ((pp->pr_roflags & PR_PHINPAGE) != 0)
583 		ph = (struct pool_item_header *) (storage + pp->pr_phoffset);
584 	else {
585 		s = splhigh();
586 		ph = pool_get(&phpool, flags);
587 		splx(s);
588 	}
589 
590 	return (ph);
591 }
592 
593 /*
594  * Grab an item from the pool; must be called at appropriate spl level
595  */
596 void *
597 #ifdef POOL_DIAGNOSTIC
598 _pool_get(struct pool *pp, int flags, const char *file, long line)
599 #else
600 pool_get(struct pool *pp, int flags)
601 #endif
602 {
603 	struct pool_item *pi;
604 	struct pool_item_header *ph;
605 	void *v;
606 
607 #ifdef DIAGNOSTIC
608 	if (__predict_false((pp->pr_roflags & PR_STATIC) &&
609 			    (flags & PR_MALLOCOK))) {
610 		pr_printlog(pp, NULL, printf);
611 		panic("pool_get: static");
612 	}
613 
614 	if (__predict_false(curproc == NULL && /* doing_shutdown == 0 && XXX*/
615 			    (flags & PR_WAITOK) != 0))
616 		panic("pool_get: must have NOWAIT");
617 
618 #endif
619 	simple_lock(&pp->pr_slock);
620 	pr_enter(pp, file, line);
621 
622  startover:
623 	/*
624 	 * Check to see if we've reached the hard limit.  If we have,
625 	 * and we can wait, then wait until an item has been returned to
626 	 * the pool.
627 	 */
628 #ifdef DIAGNOSTIC
629 	if (__predict_false(pp->pr_nout > pp->pr_hardlimit)) {
630 		pr_leave(pp);
631 		simple_unlock(&pp->pr_slock);
632 		panic("pool_get: %s: crossed hard limit", pp->pr_wchan);
633 	}
634 #endif
635 	if (__predict_false(pp->pr_nout == pp->pr_hardlimit)) {
636 		if ((flags & PR_WAITOK) && !(flags & PR_LIMITFAIL)) {
637 			/*
638 			 * XXX: A warning isn't logged in this case.  Should
639 			 * it be?
640 			 */
641 			pp->pr_flags |= PR_WANTED;
642 			pr_leave(pp);
643 			simple_unlock(&pp->pr_slock);
644 			tsleep((caddr_t)pp, PSWP, (char *)pp->pr_wchan, 0);
645 			simple_lock(&pp->pr_slock);
646 			pr_enter(pp, file, line);
647 			goto startover;
648 		}
649 
650 		/*
651 		 * Log a message that the hard limit has been hit.
652 		 */
653 		if (pp->pr_hardlimit_warning != NULL &&
654 		    ratecheck(&pp->pr_hardlimit_warning_last,
655 			      &pp->pr_hardlimit_ratecap))
656 			log(LOG_ERR, "%s\n", pp->pr_hardlimit_warning);
657 
658 		if (flags & PR_URGENT)
659 			panic("pool_get: urgent");
660 
661 		pp->pr_nfail++;
662 
663 		pr_leave(pp);
664 		simple_unlock(&pp->pr_slock);
665 		return (NULL);
666 	}
667 
668 	/*
669 	 * The convention we use is that if `curpage' is not NULL, then
670 	 * it points at a non-empty bucket. In particular, `curpage'
671 	 * never points at a page header which has PR_PHINPAGE set and
672 	 * has no items in its bucket.
673 	 */
674 	if ((ph = pp->pr_curpage) == NULL) {
675 #ifdef DIAGNOSTIC
676 		if (pp->pr_nitems != 0) {
677 			simple_unlock(&pp->pr_slock);
678 			printf("pool_get: %s: curpage NULL, nitems %u\n",
679 			    pp->pr_wchan, pp->pr_nitems);
680 			panic("pool_get: nitems inconsistent\n");
681 		}
682 #endif
683 
684 		/*
685 		 * Call the back-end page allocator for more memory.
686 		 * Release the pool lock, as the back-end page allocator
687 		 * may block.
688 		 */
689 		pr_leave(pp);
690 		simple_unlock(&pp->pr_slock);
691 		v = (*pp->pr_alloc)(pp->pr_pagesz, flags, pp->pr_mtype);
692 		if (__predict_true(v != NULL))
693 			ph = pool_alloc_item_header(pp, v, flags);
694 		simple_lock(&pp->pr_slock);
695 		pr_enter(pp, file, line);
696 
697 		if (__predict_false(v == NULL || ph == NULL)) {
698 			if (v != NULL)
699 				(*pp->pr_free)(v, pp->pr_pagesz, pp->pr_mtype);
700 
701 			/*
702 			 * We were unable to allocate a page or item
703 			 * header, but we released the lock during
704 			 * allocation, so perhaps items were freed
705 			 * back to the pool.  Check for this case.
706 			 */
707 			if (pp->pr_curpage != NULL)
708 				goto startover;
709 
710 			if (flags & PR_URGENT)
711 				panic("pool_get: urgent");
712 
713 			if ((flags & PR_WAITOK) == 0) {
714 				pp->pr_nfail++;
715 				pr_leave(pp);
716 				simple_unlock(&pp->pr_slock);
717 				return (NULL);
718 			}
719 
720 			/*
721 			 * Wait for items to be returned to this pool.
722 			 *
723 			 * XXX: we actually want to wait just until
724 			 * the page allocator has memory again. Depending
725 			 * on this pool's usage, we might get stuck here
726 			 * for a long time.
727 			 *
728 			 * XXX: maybe we should wake up once a second and
729 			 * try again?
730 			 */
731 			pp->pr_flags |= PR_WANTED;
732 			pr_leave(pp);
733 			simple_unlock(&pp->pr_slock);
734 			tsleep((caddr_t)pp, PSWP, (char *)pp->pr_wchan, 0);
735 			simple_lock(&pp->pr_slock);
736 			pr_enter(pp, file, line);
737 			goto startover;
738 		}
739 
740 		/* We have more memory; add it to the pool */
741 		pp->pr_npagealloc++;
742 		pool_prime_page(pp, v, ph);
743 
744 		/* Start the allocation process over. */
745 		goto startover;
746 	}
747 
748 	if (__predict_false((v = pi = TAILQ_FIRST(&ph->ph_itemlist)) == NULL)) {
749 		pr_leave(pp);
750 		simple_unlock(&pp->pr_slock);
751 		panic("pool_get: %s: page empty", pp->pr_wchan);
752 	}
753 #ifdef DIAGNOSTIC
754 	if (__predict_false(pp->pr_nitems == 0)) {
755 		pr_leave(pp);
756 		simple_unlock(&pp->pr_slock);
757 		printf("pool_get: %s: items on itemlist, nitems %u\n",
758 		    pp->pr_wchan, pp->pr_nitems);
759 		panic("pool_get: nitems inconsistent\n");
760 	}
761 
762 	pr_log(pp, v, PRLOG_GET, file, line);
763 
764 	if (__predict_false(pi->pi_magic != PI_MAGIC)) {
765 		pr_printlog(pp, pi, printf);
766 		panic("pool_get(%s): free list modified: magic=%x; page %p;"
767 		       " item addr %p\n",
768 			pp->pr_wchan, pi->pi_magic, ph->ph_page, pi);
769 	}
770 #endif
771 
772 	/*
773 	 * Remove from item list.
774 	 */
775 	TAILQ_REMOVE(&ph->ph_itemlist, pi, pi_list);
776 	pp->pr_nitems--;
777 	pp->pr_nout++;
778 	if (ph->ph_nmissing == 0) {
779 #ifdef DIAGNOSTIC
780 		if (__predict_false(pp->pr_nidle == 0))
781 			panic("pool_get: nidle inconsistent");
782 #endif
783 		pp->pr_nidle--;
784 	}
785 	ph->ph_nmissing++;
786 	if (TAILQ_FIRST(&ph->ph_itemlist) == NULL) {
787 #ifdef DIAGNOSTIC
788 		if (__predict_false(ph->ph_nmissing != pp->pr_itemsperpage)) {
789 			pr_leave(pp);
790 			simple_unlock(&pp->pr_slock);
791 			panic("pool_get: %s: nmissing inconsistent",
792 			    pp->pr_wchan);
793 		}
794 #endif
795 		/*
796 		 * Find a new non-empty page header, if any.
797 		 * Start search from the page head, to increase
798 		 * the chance for "high water" pages to be freed.
799 		 *
800 		 * Migrate empty pages to the end of the list.  This
801 		 * will speed the update of curpage as pages become
802 		 * idle.  Empty pages intermingled with idle pages
803 		 * is no big deal.  As soon as a page becomes un-empty,
804 		 * it will move back to the head of the list.
805 		 */
806 		TAILQ_REMOVE(&pp->pr_pagelist, ph, ph_pagelist);
807 		TAILQ_INSERT_TAIL(&pp->pr_pagelist, ph, ph_pagelist);
808 		for (ph = TAILQ_FIRST(&pp->pr_pagelist); ph != NULL;
809 		     ph = TAILQ_NEXT(ph, ph_pagelist))
810 			if (TAILQ_FIRST(&ph->ph_itemlist) != NULL)
811 				break;
812 
813 		pp->pr_curpage = ph;
814 	}
815 
816 	pp->pr_nget++;
817 
818 	/*
819 	 * If we have a low water mark and we are now below that low
820 	 * water mark, add more items to the pool.
821 	 */
822 	if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
823 		/*
824 		 * XXX: Should we log a warning?  Should we set up a timeout
825 		 * to try again in a second or so?  The latter could break
826 		 * a caller's assumptions about interrupt protection, etc.
827 		 */
828 	}
829 
830 	pr_leave(pp);
831 	simple_unlock(&pp->pr_slock);
832 	return (v);
833 }
834 
835 /*
836  * Internal version of pool_put().  Pool is already locked/entered.
837  */
838 static void
839 pool_do_put(struct pool *pp, void *v)
840 {
841 	struct pool_item *pi = v;
842 	struct pool_item_header *ph;
843 	caddr_t page;
844 	int s;
845 
846 	page = (caddr_t)((u_long)v & pp->pr_pagemask);
847 
848 #ifdef DIAGNOSTIC
849 	if (__predict_false(pp->pr_nout == 0)) {
850 		printf("pool %s: putting with none out\n",
851 		    pp->pr_wchan);
852 		panic("pool_put");
853 	}
854 #endif
855 
856 	if (__predict_false((ph = pr_find_pagehead(pp, page)) == NULL)) {
857 		pr_printlog(pp, NULL, printf);
858 		panic("pool_put: %s: page header missing", pp->pr_wchan);
859 	}
860 
861 #ifdef LOCKDEBUG
862 	/*
863 	 * Check if we're freeing a locked simple lock.
864 	 */
865 	simple_lock_freecheck((caddr_t)pi, ((caddr_t)pi) + pp->pr_size);
866 #endif
867 
868 	/*
869 	 * Return to item list.
870 	 */
871 #ifdef DIAGNOSTIC
872 	pi->pi_magic = PI_MAGIC;
873 #endif
874 #ifdef DEBUG
875 	{
876 		int i, *ip = v;
877 
878 		for (i = 0; i < pp->pr_size / sizeof(int); i++) {
879 			*ip++ = PI_MAGIC;
880 		}
881 	}
882 #endif
883 
884 	TAILQ_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list);
885 	ph->ph_nmissing--;
886 	pp->pr_nput++;
887 	pp->pr_nitems++;
888 	pp->pr_nout--;
889 
890 	/* Cancel "pool empty" condition if it exists */
891 	if (pp->pr_curpage == NULL)
892 		pp->pr_curpage = ph;
893 
894 	if (pp->pr_flags & PR_WANTED) {
895 		pp->pr_flags &= ~PR_WANTED;
896 		if (ph->ph_nmissing == 0)
897 			pp->pr_nidle++;
898 		wakeup((caddr_t)pp);
899 		return;
900 	}
901 
902 	/*
903 	 * If this page is now complete, do one of two things:
904 	 *
905 	 *	(1) If we have more pages than the page high water
906 	 *	    mark, free the page back to the system.
907 	 *
908 	 *	(2) Move it to the end of the page list, so that
909 	 *	    we minimize our chances of fragmenting the
910 	 *	    pool.  Idle pages migrate to the end of the list
911 	 *	    (along with completely empty pages, so that we find
912 	 *	    un-empty pages more quickly when we update curpage),
913 	 *	    where they can be more easily swept up by the
914 	 *	    pagedaemon when pages are scarce.
915 	 */
916 	if (ph->ph_nmissing == 0) {
917 		pp->pr_nidle++;
918 		if (pp->pr_npages > pp->pr_maxpages) {
919 			pr_rmpage(pp, ph);
920 		} else {
921 			TAILQ_REMOVE(&pp->pr_pagelist, ph, ph_pagelist);
922 			TAILQ_INSERT_TAIL(&pp->pr_pagelist, ph, ph_pagelist);
923 
924 			/*
925 			 * Update the timestamp on the page.  A page must
926 			 * be idle for some period of time before it can
927 			 * be reclaimed by the pagedaemon.  This minimizes
928 			 * ping-pong'ing for memory.
929 			 */
930 			s = splclock();
931 			ph->ph_time = mono_time;
932 			splx(s);
933 
934 			/*
935 			 * Update the current page pointer.  Just look for
936 			 * the first page with any free items.
937 			 *
938 			 * XXX: Maybe we want an option to look for the
939 			 * page with the fewest available items, to minimize
940 			 * fragmentation?
941 			 */
942 			for (ph = TAILQ_FIRST(&pp->pr_pagelist); ph != NULL;
943 			     ph = TAILQ_NEXT(ph, ph_pagelist))
944 				if (TAILQ_FIRST(&ph->ph_itemlist) != NULL)
945 					break;
946 
947 			pp->pr_curpage = ph;
948 		}
949 	}
950 	/*
951 	 * If the page has just become un-empty, move it to the head of
952 	 * the list, and make it the current page.  The next allocation
953 	 * will get the item from this page, instead of further fragmenting
954 	 * the pool.
955 	 */
956 	else if (ph->ph_nmissing == (pp->pr_itemsperpage - 1)) {
957 		TAILQ_REMOVE(&pp->pr_pagelist, ph, ph_pagelist);
958 		TAILQ_INSERT_HEAD(&pp->pr_pagelist, ph, ph_pagelist);
959 		pp->pr_curpage = ph;
960 	}
961 }
962 
963 /*
964  * Return resource to the pool; must be called at appropriate spl level
965  */
966 #ifdef POOL_DIAGNOSTIC
967 void
968 _pool_put(struct pool *pp, void *v, const char *file, long line)
969 {
970 
971 	simple_lock(&pp->pr_slock);
972 	pr_enter(pp, file, line);
973 
974 	pr_log(pp, v, PRLOG_PUT, file, line);
975 
976 	pool_do_put(pp, v);
977 
978 	pr_leave(pp);
979 	simple_unlock(&pp->pr_slock);
980 }
981 #undef pool_put
982 #endif /* POOL_DIAGNOSTIC */
983 
984 void
985 pool_put(struct pool *pp, void *v)
986 {
987 
988 	simple_lock(&pp->pr_slock);
989 
990 	pool_do_put(pp, v);
991 
992 	simple_unlock(&pp->pr_slock);
993 }
994 
995 #ifdef POOL_DIAGNOSTIC
996 #define		pool_put(h, v)	_pool_put((h), (v), __FILE__, __LINE__)
997 #endif
998 
999 /*
1000  * Add N items to the pool.
1001  */
1002 int
1003 pool_prime(struct pool *pp, int n)
1004 {
1005 	struct pool_item_header *ph;
1006 	caddr_t cp;
1007 	int newpages, error = 0;
1008 
1009 	simple_lock(&pp->pr_slock);
1010 
1011 	newpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
1012 
1013 	while (newpages-- > 0) {
1014 		simple_unlock(&pp->pr_slock);
1015 		cp = (*pp->pr_alloc)(pp->pr_pagesz, PR_NOWAIT, pp->pr_mtype);
1016 		if (__predict_true(cp != NULL))
1017 			ph = pool_alloc_item_header(pp, cp, PR_NOWAIT);
1018 		simple_lock(&pp->pr_slock);
1019 
1020 		if (__predict_false(cp == NULL || ph == NULL)) {
1021 			error = ENOMEM;
1022 			if (cp != NULL)
1023 				(*pp->pr_free)(cp, pp->pr_pagesz, pp->pr_mtype);
1024 			break;
1025 		}
1026 
1027 		pool_prime_page(pp, cp, ph);
1028 		pp->pr_npagealloc++;
1029 		pp->pr_minpages++;
1030 	}
1031 
1032 	if (pp->pr_minpages >= pp->pr_maxpages)
1033 		pp->pr_maxpages = pp->pr_minpages + 1;	/* XXX */
1034 
1035 	simple_unlock(&pp->pr_slock);
1036 	return (error);
1037 }
1038 
1039 /*
1040  * Add a page worth of items to the pool.
1041  *
1042  * Note, we must be called with the pool descriptor LOCKED.
1043  */
1044 static void
1045 pool_prime_page(struct pool *pp, caddr_t storage, struct pool_item_header *ph)
1046 {
1047 	struct pool_item *pi;
1048 	caddr_t cp = storage;
1049 	unsigned int align = pp->pr_align;
1050 	unsigned int ioff = pp->pr_itemoffset;
1051 	int n;
1052 
1053 	if (((u_long)cp & (pp->pr_pagesz - 1)) != 0)
1054 		panic("pool_prime_page: %s: unaligned page", pp->pr_wchan);
1055 
1056 	if ((pp->pr_roflags & PR_PHINPAGE) == 0)
1057 		LIST_INSERT_HEAD(&pp->pr_hashtab[PR_HASH_INDEX(pp, cp)],
1058 		    ph, ph_hashlist);
1059 
1060 	/*
1061 	 * Insert page header.
1062 	 */
1063 	TAILQ_INSERT_HEAD(&pp->pr_pagelist, ph, ph_pagelist);
1064 	TAILQ_INIT(&ph->ph_itemlist);
1065 	ph->ph_page = storage;
1066 	ph->ph_nmissing = 0;
1067 	memset(&ph->ph_time, 0, sizeof(ph->ph_time));
1068 
1069 	pp->pr_nidle++;
1070 
1071 	/*
1072 	 * Color this page.
1073 	 */
1074 	cp = (caddr_t)(cp + pp->pr_curcolor);
1075 	if ((pp->pr_curcolor += align) > pp->pr_maxcolor)
1076 		pp->pr_curcolor = 0;
1077 
1078 	/*
1079 	 * Adjust storage to apply alignment to `pr_itemoffset' in each item.
1080 	 */
1081 	if (ioff != 0)
1082 		cp = (caddr_t)(cp + (align - ioff));
1083 
1084 	/*
1085 	 * Insert remaining chunks on the bucket list.
1086 	 */
1087 	n = pp->pr_itemsperpage;
1088 	pp->pr_nitems += n;
1089 
1090 	while (n--) {
1091 		pi = (struct pool_item *)cp;
1092 
1093 		/* Insert on page list */
1094 		TAILQ_INSERT_TAIL(&ph->ph_itemlist, pi, pi_list);
1095 #ifdef DIAGNOSTIC
1096 		pi->pi_magic = PI_MAGIC;
1097 #endif
1098 		cp = (caddr_t)(cp + pp->pr_size);
1099 	}
1100 
1101 	/*
1102 	 * If the pool was depleted, point at the new page.
1103 	 */
1104 	if (pp->pr_curpage == NULL)
1105 		pp->pr_curpage = ph;
1106 
1107 	if (++pp->pr_npages > pp->pr_hiwat)
1108 		pp->pr_hiwat = pp->pr_npages;
1109 }
1110 
1111 /*
1112  * Used by pool_get() when nitems drops below the low water mark.  This
1113  * is used to catch nitems back up to the low water mark.
1114  *
1115  * Note 1, we never wait for memory here, we let the caller decide what to do.
1116  *
1117  * Note 2, this doesn't work with static pools.
1118  *
1119  * Note 3, we must be called with the pool already locked, and we return
1120  * with it locked.
1121  */
1122 static int
1123 pool_catchup(struct pool *pp)
1124 {
1125 	struct pool_item_header *ph;
1126 	caddr_t cp;
1127 	int error = 0;
1128 
1129 	if (pp->pr_roflags & PR_STATIC) {
1130 		/*
1131 		 * We dropped below the low water mark, and this is not a
1132 		 * good thing.  Log a warning.
1133 		 *
1134 		 * XXX: rate-limit this?
1135 		 */
1136 		printf("WARNING: static pool `%s' dropped below low water "
1137 		    "mark\n", pp->pr_wchan);
1138 		return (0);
1139 	}
1140 
1141 	while (POOL_NEEDS_CATCHUP(pp)) {
1142 		/*
1143 		 * Call the page back-end allocator for more memory.
1144 		 *
1145 		 * XXX: We never wait, so should we bother unlocking
1146 		 * the pool descriptor?
1147 		 */
1148 		simple_unlock(&pp->pr_slock);
1149 		cp = (*pp->pr_alloc)(pp->pr_pagesz, PR_NOWAIT, pp->pr_mtype);
1150 		if (__predict_true(cp != NULL))
1151 			ph = pool_alloc_item_header(pp, cp, PR_NOWAIT);
1152 		simple_lock(&pp->pr_slock);
1153 		if (__predict_false(cp == NULL || ph == NULL)) {
1154 			if (cp != NULL)
1155 				(*pp->pr_free)(cp, pp->pr_pagesz, pp->pr_mtype);
1156 			error = ENOMEM;
1157 			break;
1158 		}
1159 		pool_prime_page(pp, cp, ph);
1160 		pp->pr_npagealloc++;
1161 	}
1162 
1163 	return (error);
1164 }
1165 
1166 void
1167 pool_setlowat(struct pool *pp, int n)
1168 {
1169 	int error;
1170 
1171 	simple_lock(&pp->pr_slock);
1172 
1173 	pp->pr_minitems = n;
1174 	pp->pr_minpages = (n == 0)
1175 		? 0
1176 		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
1177 
1178 	/* Make sure we're caught up with the newly-set low water mark. */
1179 	if (POOL_NEEDS_CATCHUP(pp) && (error = pool_catchup(pp)) != 0) {
1180 		/*
1181 		 * XXX: Should we log a warning?  Should we set up a timeout
1182 		 * to try again in a second or so?  The latter could break
1183 		 * a caller's assumptions about interrupt protection, etc.
1184 		 */
1185 	}
1186 
1187 	simple_unlock(&pp->pr_slock);
1188 }
1189 
1190 void
1191 pool_sethiwat(struct pool *pp, int n)
1192 {
1193 
1194 	simple_lock(&pp->pr_slock);
1195 
1196 	pp->pr_maxpages = (n == 0)
1197 		? 0
1198 		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
1199 
1200 	simple_unlock(&pp->pr_slock);
1201 }
1202 
1203 void
1204 pool_sethardlimit(struct pool *pp, int n, const char *warnmess, int ratecap)
1205 {
1206 
1207 	simple_lock(&pp->pr_slock);
1208 
1209 	pp->pr_hardlimit = n;
1210 	pp->pr_hardlimit_warning = warnmess;
1211 	pp->pr_hardlimit_ratecap.tv_sec = ratecap;
1212 	pp->pr_hardlimit_warning_last.tv_sec = 0;
1213 	pp->pr_hardlimit_warning_last.tv_usec = 0;
1214 
1215 	/*
1216 	 * In-line version of pool_sethiwat(), because we don't want to
1217 	 * release the lock.
1218 	 */
1219 	pp->pr_maxpages = (n == 0)
1220 		? 0
1221 		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
1222 
1223 	simple_unlock(&pp->pr_slock);
1224 }
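
/*
 * Illustrative sketch, not part of this file: tuning a pool's water
 * marks and hard limit after pool_init().  The function name, the
 * numbers and the warning text are made up for the example.
 */
#if 0	/* example only */
void
qux_pool_tune(struct pool *pp)
{
	/* Keep at least 16 items primed at all times. */
	pool_setlowat(pp, 16);
	/* Let pool_put() free complete pages beyond ~1024 items' worth. */
	pool_sethiwat(pp, 1024);
	/* Fail allocations past 2048 items, warning at most once a minute. */
	pool_sethardlimit(pp, 2048, "WARNING: qux pool limit reached", 60);
}
#endif	/* example only */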
1225 
1226 /*
1227  * Default page allocator.
1228  */
1229 static void *
1230 pool_page_alloc(unsigned long sz, int flags, int mtype)
1231 {
1232 	boolean_t waitok = (flags & PR_WAITOK) ? TRUE : FALSE;
1233 
1234 	return ((void *)uvm_km_alloc_poolpage(waitok));
1235 }
1236 
1237 static void
1238 pool_page_free(void *v, unsigned long sz, int mtype)
1239 {
1240 	uvm_km_free_poolpage((vaddr_t)v);
1241 }
1242 
1243 /*
1244  * Alternate pool page allocator for pools that know they will
1245  * never be accessed in interrupt context.
1246  */
1247 void *
1248 pool_page_alloc_nointr(unsigned long sz, int flags, int mtype)
1249 {
1250 	boolean_t waitok = (flags & PR_WAITOK) ? TRUE : FALSE;
1251 
1252 	return ((void *)uvm_km_alloc_poolpage1(kernel_map, uvm.kernel_object,
1253 	    waitok));
1254 }
1255 
1256 void
1257 pool_page_free_nointr(void *v, unsigned long sz, int mtype)
1258 {
1259 
1260 	uvm_km_free_poolpage1(kernel_map, (vaddr_t)v);
1261 }
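
/*
 * Illustrative sketch, not part of this file: a pool that is only used
 * from process context can hand the "nointr" hooks to pool_init().
 * Both hooks must be supplied together; the pool name and M_TEMP malloc
 * type are assumptions for the example.
 */
#if 0	/* example only */
static struct pool baz_pool;

void
baz_init(void)
{
	pool_init(&baz_pool, 256, 0, 0, 0, "bazpl",
	    0, pool_page_alloc_nointr, pool_page_free_nointr, M_TEMP);
}
#endif	/* example only */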
1262 
1263 
1264 /*
1265  * Release all complete pages that have not been used recently.
1266  */
1267 void
1268 #ifdef POOL_DIAGNOSTIC
1269 _pool_reclaim(struct pool *pp, const char *file, long line)
1270 #else
1271 pool_reclaim(struct pool *pp)
1272 #endif
1273 {
1274 	struct pool_item_header *ph, *phnext;
1275 	struct pool_cache *pc;
1276 	struct timeval curtime;
1277 	int s;
1278 
1279 	if (pp->pr_roflags & PR_STATIC)
1280 		return;
1281 
1282 	if (simple_lock_try(&pp->pr_slock) == 0)
1283 		return;
1284 	pr_enter(pp, file, line);
1285 
1286 	/*
1287 	 * Reclaim items from the pool's caches.
1288 	 */
1289 	for (pc = TAILQ_FIRST(&pp->pr_cachelist); pc != NULL;
1290 	     pc = TAILQ_NEXT(pc, pc_poollist))
1291 		pool_cache_reclaim(pc);
1292 
1293 	s = splclock();
1294 	curtime = mono_time;
1295 	splx(s);
1296 
1297 	for (ph = TAILQ_FIRST(&pp->pr_pagelist); ph != NULL; ph = phnext) {
1298 		phnext = TAILQ_NEXT(ph, ph_pagelist);
1299 
1300 		/* Check our minimum page claim */
1301 		if (pp->pr_npages <= pp->pr_minpages)
1302 			break;
1303 
1304 		if (ph->ph_nmissing == 0) {
1305 			struct timeval diff;
1306 			timersub(&curtime, &ph->ph_time, &diff);
1307 			if (diff.tv_sec < pool_inactive_time)
1308 				continue;
1309 
1310 			/*
1311 			 * If freeing this page would put us below
1312 			 * the low water mark, stop now.
1313 			 */
1314 			if ((pp->pr_nitems - pp->pr_itemsperpage) <
1315 			    pp->pr_minitems)
1316 				break;
1317 
1318 			pr_rmpage(pp, ph);
1319 		}
1320 	}
1321 
1322 	pr_leave(pp);
1323 	simple_unlock(&pp->pr_slock);
1324 }
1325 
1326 
1327 /*
1328  * Drain pools, one at a time.
1329  *
1330  * Note, we must never be called from an interrupt context.
1331  */
1332 void
1333 pool_drain(void *arg)
1334 {
1335 	struct pool *pp;
1336 	int s;
1337 
1338 	s = splvm();
1339 	simple_lock(&pool_head_slock);
1340 
1341 	if (drainpp == NULL && (drainpp = TAILQ_FIRST(&pool_head)) == NULL)
1342 		goto out;
1343 
1344 	pp = drainpp;
1345 	drainpp = TAILQ_NEXT(pp, pr_poollist);
1346 
1347 	pool_reclaim(pp);
1348 
1349  out:
1350 	simple_unlock(&pool_head_slock);
1351 	splx(s);
1352 }
1353 
1354 
1355 /*
1356  * Diagnostic helpers.
1357  */
1358 void
1359 pool_printit(struct pool *pp, const char *modif, int (*pr)(const char *, ...))
1360 {
1361 	int s;
1362 
1363 	s = splvm();
1364 	if (simple_lock_try(&pp->pr_slock) == 0) {
1365 		printf("pool %s is locked; try again later\n",
1366 		    pp->pr_wchan);
1367 		splx(s);
1368 		return;
1369 	}
1370 	pool_print1(pp, modif, printf);
1371 	simple_unlock(&pp->pr_slock);
1372 	splx(s);
1373 }
1374 
1375 static void
1376 pool_print1(struct pool *pp, const char *modif, int (*pr)(const char *, ...))
1377 {
1378 	struct pool_item_header *ph;
1379 	struct pool_cache *pc;
1380 	struct pool_cache_group *pcg;
1381 #ifdef DIAGNOSTIC
1382 	struct pool_item *pi;
1383 #endif
1384 	int i, print_log = 0, print_pagelist = 0, print_cache = 0;
1385 	char c;
1386 
1387 	while ((c = *modif++) != '\0') {
1388 		if (c == 'l')
1389 			print_log = 1;
1390 		if (c == 'p')
1391 			print_pagelist = 1;
1392 		if (c == 'c')
1393 			print_cache = 1;
1395 	}
1396 
1397 	(*pr)("POOL %s: size %u, align %u, ioff %u, roflags 0x%08x\n",
1398 	    pp->pr_wchan, pp->pr_size, pp->pr_align, pp->pr_itemoffset,
1399 	    pp->pr_roflags);
1400 	(*pr)("\tpagesz %u, mtype %d\n", pp->pr_pagesz, pp->pr_mtype);
1401 	(*pr)("\talloc %p, release %p\n", pp->pr_alloc, pp->pr_free);
1402 	(*pr)("\tminitems %u, minpages %u, maxpages %u, npages %u\n",
1403 	    pp->pr_minitems, pp->pr_minpages, pp->pr_maxpages, pp->pr_npages);
1404 	(*pr)("\titemsperpage %u, nitems %u, nout %u, hardlimit %u\n",
1405 	    pp->pr_itemsperpage, pp->pr_nitems, pp->pr_nout, pp->pr_hardlimit);
1406 
1407 	(*pr)("\n\tnget %lu, nfail %lu, nput %lu\n",
1408 	    pp->pr_nget, pp->pr_nfail, pp->pr_nput);
1409 	(*pr)("\tnpagealloc %lu, npagefree %lu, hiwat %u, nidle %lu\n",
1410 	    pp->pr_npagealloc, pp->pr_npagefree, pp->pr_hiwat, pp->pr_nidle);
1411 
1412 	if (print_pagelist == 0)
1413 		goto skip_pagelist;
1414 
1415 	if ((ph = TAILQ_FIRST(&pp->pr_pagelist)) != NULL)
1416 		(*pr)("\n\tpage list:\n");
1417 	for (; ph != NULL; ph = TAILQ_NEXT(ph, ph_pagelist)) {
1418 		(*pr)("\t\tpage %p, nmissing %d, time %lu,%lu\n",
1419 		    ph->ph_page, ph->ph_nmissing,
1420 		    (u_long)ph->ph_time.tv_sec,
1421 		    (u_long)ph->ph_time.tv_usec);
1422 #ifdef DIAGNOSTIC
1423 		for (pi = TAILQ_FIRST(&ph->ph_itemlist); pi != NULL;
1424 		     pi = TAILQ_NEXT(pi, pi_list)) {
1425 			if (pi->pi_magic != PI_MAGIC) {
1426 				(*pr)("\t\t\titem %p, magic 0x%x\n",
1427 				    pi, pi->pi_magic);
1428 			}
1429 		}
1430 #endif
1431 	}
1432 	if (pp->pr_curpage == NULL)
1433 		(*pr)("\tno current page\n");
1434 	else
1435 		(*pr)("\tcurpage %p\n", pp->pr_curpage->ph_page);
1436 
1437  skip_pagelist:
1438 
1439 	if (print_log == 0)
1440 		goto skip_log;
1441 
1442 	(*pr)("\n");
1443 	if ((pp->pr_roflags & PR_LOGGING) == 0)
1444 		(*pr)("\tno log\n");
1445 	else
1446 		pr_printlog(pp, NULL, pr);
1447 
1448  skip_log:
1449 
1450 	if (print_cache == 0)
1451 		goto skip_cache;
1452 
1453 	for (pc = TAILQ_FIRST(&pp->pr_cachelist); pc != NULL;
1454 	     pc = TAILQ_NEXT(pc, pc_poollist)) {
1455 		(*pr)("\tcache %p: allocfrom %p freeto %p\n", pc,
1456 		    pc->pc_allocfrom, pc->pc_freeto);
1457 		(*pr)("\t    hits %lu misses %lu ngroups %lu nitems %lu\n",
1458 		    pc->pc_hits, pc->pc_misses, pc->pc_ngroups, pc->pc_nitems);
1459 		for (pcg = TAILQ_FIRST(&pc->pc_grouplist); pcg != NULL;
1460 		     pcg = TAILQ_NEXT(pcg, pcg_list)) {
1461 			(*pr)("\t\tgroup %p: avail %d\n", pcg, pcg->pcg_avail);
1462 			for (i = 0; i < PCG_NOBJECTS; i++)
1463 				(*pr)("\t\t\t%p\n", pcg->pcg_objects[i]);
1464 		}
1465 	}
1466 
1467  skip_cache:
1468 
1469 	pr_enter_check(pp, pr);
1470 }
1471 
1472 int
1473 pool_chk(struct pool *pp, const char *label)
1474 {
1475 	struct pool_item_header *ph;
1476 	int r = 0;
1477 
1478 	simple_lock(&pp->pr_slock);
1479 
1480 	for (ph = TAILQ_FIRST(&pp->pr_pagelist); ph != NULL;
1481 	     ph = TAILQ_NEXT(ph, ph_pagelist)) {
1482 
1483 		struct pool_item *pi;
1484 		int n;
1485 		caddr_t page;
1486 
1487 		page = (caddr_t)((u_long)ph & pp->pr_pagemask);
1488 		if (page != ph->ph_page &&
1489 		    (pp->pr_roflags & PR_PHINPAGE) != 0) {
1490 			if (label != NULL)
1491 				printf("%s: ", label);
1492 			printf("pool(%p:%s): page inconsistency: page %p;"
1493 			       " at page head addr %p (p %p)\n", pp,
1494 				pp->pr_wchan, ph->ph_page,
1495 				ph, page);
1496 			r++;
1497 			goto out;
1498 		}
1499 
1500 		for (pi = TAILQ_FIRST(&ph->ph_itemlist), n = 0;
1501 		     pi != NULL;
1502 		     pi = TAILQ_NEXT(pi,pi_list), n++) {
1503 
1504 #ifdef DIAGNOSTIC
1505 			if (pi->pi_magic != PI_MAGIC) {
1506 				if (label != NULL)
1507 					printf("%s: ", label);
1508 				printf("pool(%s): free list modified: magic=%x;"
1509 				       " page %p; item ordinal %d;"
1510 				       " addr %p (p %p)\n",
1511 					pp->pr_wchan, pi->pi_magic, ph->ph_page,
1512 					n, pi, page);
1513 				panic("pool");
1514 			}
1515 #endif
1516 			page = (caddr_t)((u_long)pi & pp->pr_pagemask);
1517 			if (page == ph->ph_page)
1518 				continue;
1519 
1520 			if (label != NULL)
1521 				printf("%s: ", label);
1522 			printf("pool(%p:%s): page inconsistency: page %p;"
1523 			       " item ordinal %d; addr %p (p %p)\n", pp,
1524 				pp->pr_wchan, ph->ph_page,
1525 				n, pi, page);
1526 			r++;
1527 			goto out;
1528 		}
1529 	}
1530 out:
1531 	simple_unlock(&pp->pr_slock);
1532 	return (r);
1533 }
1534 
1535 /*
1536  * pool_cache_init:
1537  *
1538  *	Initialize a pool cache.
1539  *
1540  *	NOTE: If the pool must be protected from interrupts, we expect
1541  *	to be called at the appropriate interrupt priority level.
1542  */
1543 void
1544 pool_cache_init(struct pool_cache *pc, struct pool *pp,
1545     int (*ctor)(void *, void *, int),
1546     void (*dtor)(void *, void *),
1547     void *arg)
1548 {
1549 
1550 	TAILQ_INIT(&pc->pc_grouplist);
1551 	simple_lock_init(&pc->pc_slock);
1552 
1553 	pc->pc_allocfrom = NULL;
1554 	pc->pc_freeto = NULL;
1555 	pc->pc_pool = pp;
1556 
1557 	pc->pc_ctor = ctor;
1558 	pc->pc_dtor = dtor;
1559 	pc->pc_arg  = arg;
1560 
1561 	pc->pc_hits   = 0;
1562 	pc->pc_misses = 0;
1563 
1564 	pc->pc_ngroups = 0;
1565 
1566 	pc->pc_nitems = 0;
1567 
1568 	simple_lock(&pp->pr_slock);
1569 	TAILQ_INSERT_TAIL(&pp->pr_cachelist, pc, pc_poollist);
1570 	simple_unlock(&pp->pr_slock);
1571 }
1572 
1573 /*
1574  * pool_cache_destroy:
1575  *
1576  *	Destroy a pool cache.
1577  */
1578 void
1579 pool_cache_destroy(struct pool_cache *pc)
1580 {
1581 	struct pool *pp = pc->pc_pool;
1582 
1583 	/* First, invalidate the entire cache. */
1584 	pool_cache_invalidate(pc);
1585 
1586 	/* ...and remove it from the pool's cache list. */
1587 	simple_lock(&pp->pr_slock);
1588 	TAILQ_REMOVE(&pp->pr_cachelist, pc, pc_poollist);
1589 	simple_unlock(&pp->pr_slock);
1590 }
1591 
1592 static __inline void *
1593 pcg_get(struct pool_cache_group *pcg)
1594 {
1595 	void *object;
1596 	u_int idx;
1597 
1598 	KASSERT(pcg->pcg_avail <= PCG_NOBJECTS);
1599 	KASSERT(pcg->pcg_avail != 0);
1600 	idx = --pcg->pcg_avail;
1601 
1602 	KASSERT(pcg->pcg_objects[idx] != NULL);
1603 	object = pcg->pcg_objects[idx];
1604 	pcg->pcg_objects[idx] = NULL;
1605 
1606 	return (object);
1607 }
1608 
1609 static __inline void
1610 pcg_put(struct pool_cache_group *pcg, void *object)
1611 {
1612 	u_int idx;
1613 
1614 	KASSERT(pcg->pcg_avail < PCG_NOBJECTS);
1615 	idx = pcg->pcg_avail++;
1616 
1617 	KASSERT(pcg->pcg_objects[idx] == NULL);
1618 	pcg->pcg_objects[idx] = object;
1619 }
1620 
1621 /*
1622  * pool_cache_get:
1623  *
1624  *	Get an object from a pool cache.
1625  */
1626 void *
1627 pool_cache_get(struct pool_cache *pc, int flags)
1628 {
1629 	struct pool_cache_group *pcg;
1630 	void *object;
1631 
1632 #ifdef LOCKDEBUG
1633 	if (flags & PR_WAITOK)
1634 		simple_lock_only_held(NULL, "pool_cache_get(PR_WAITOK)");
1635 #endif
1636 
1637 	simple_lock(&pc->pc_slock);
1638 
1639 	if ((pcg = pc->pc_allocfrom) == NULL) {
1640 		for (pcg = TAILQ_FIRST(&pc->pc_grouplist); pcg != NULL;
1641 		     pcg = TAILQ_NEXT(pcg, pcg_list)) {
1642 			if (pcg->pcg_avail != 0) {
1643 				pc->pc_allocfrom = pcg;
1644 				goto have_group;
1645 			}
1646 		}
1647 
1648 		/*
1649 		 * No groups with any available objects.  Allocate
1650 		 * a new object, construct it, and return it to
1651 		 * the caller.  We will allocate a group, if necessary,
1652 		 * when the object is freed back to the cache.
1653 		 */
1654 		pc->pc_misses++;
1655 		simple_unlock(&pc->pc_slock);
1656 		object = pool_get(pc->pc_pool, flags);
1657 		if (object != NULL && pc->pc_ctor != NULL) {
1658 			if ((*pc->pc_ctor)(pc->pc_arg, object, flags) != 0) {
1659 				pool_put(pc->pc_pool, object);
1660 				return (NULL);
1661 			}
1662 		}
1663 		return (object);
1664 	}
1665 
1666  have_group:
1667 	pc->pc_hits++;
1668 	pc->pc_nitems--;
1669 	object = pcg_get(pcg);
1670 
1671 	if (pcg->pcg_avail == 0)
1672 		pc->pc_allocfrom = NULL;
1673 
1674 	simple_unlock(&pc->pc_slock);
1675 
1676 	return (object);
1677 }
1678 
1679 /*
1680  * pool_cache_put:
1681  *
1682  *	Put an object back to the pool cache.
1683  */
1684 void
1685 pool_cache_put(struct pool_cache *pc, void *object)
1686 {
1687 	struct pool_cache_group *pcg;
1688 
1689 	simple_lock(&pc->pc_slock);
1690 
1691 	if ((pcg = pc->pc_freeto) == NULL) {
1692 		for (pcg = TAILQ_FIRST(&pc->pc_grouplist); pcg != NULL;
1693 		     pcg = TAILQ_NEXT(pcg, pcg_list)) {
1694 			if (pcg->pcg_avail != PCG_NOBJECTS) {
1695 				pc->pc_freeto = pcg;
1696 				goto have_group;
1697 			}
1698 		}
1699 
1700 		/*
1701 		 * No empty groups to free the object to.  Attempt to
1702 		 * allocate one.
1703 		 */
1704 		simple_unlock(&pc->pc_slock);
1705 		pcg = pool_get(&pcgpool, PR_NOWAIT);
1706 		if (pcg != NULL) {
1707 			memset(pcg, 0, sizeof(*pcg));
1708 			simple_lock(&pc->pc_slock);
1709 			pc->pc_ngroups++;
1710 			TAILQ_INSERT_TAIL(&pc->pc_grouplist, pcg, pcg_list);
1711 			if (pc->pc_freeto == NULL)
1712 				pc->pc_freeto = pcg;
1713 			goto have_group;
1714 		}
1715 
1716 		/*
1717 		 * Unable to allocate a cache group; destruct the object
1718 		 * and free it back to the pool.
1719 		 */
1720 		pool_cache_destruct_object(pc, object);
1721 		return;
1722 	}
1723 
1724  have_group:
1725 	pc->pc_nitems++;
1726 	pcg_put(pcg, object);
1727 
1728 	if (pcg->pcg_avail == PCG_NOBJECTS)
1729 		pc->pc_freeto = NULL;
1730 
1731 	simple_unlock(&pc->pc_slock);
1732 }
1733 
1734 /*
1735  * pool_cache_destruct_object:
1736  *
1737  *	Force destruction of an object and its release back into
1738  *	the pool.
1739  */
1740 void
1741 pool_cache_destruct_object(struct pool_cache *pc, void *object)
1742 {
1743 
1744 	if (pc->pc_dtor != NULL)
1745 		(*pc->pc_dtor)(pc->pc_arg, object);
1746 	pool_put(pc->pc_pool, object);
1747 }
1748 
1749 /*
1750  * pool_cache_do_invalidate:
1751  *
1752  *	This internal function implements pool_cache_invalidate() and
1753  *	pool_cache_reclaim().
1754  */
1755 static void
1756 pool_cache_do_invalidate(struct pool_cache *pc, int free_groups,
1757     void (*putit)(struct pool *, void *))
1758 {
1759 	struct pool_cache_group *pcg, *npcg;
1760 	void *object;
1761 
1762 	for (pcg = TAILQ_FIRST(&pc->pc_grouplist); pcg != NULL;
1763 	     pcg = npcg) {
1764 		npcg = TAILQ_NEXT(pcg, pcg_list);
1765 		while (pcg->pcg_avail != 0) {
1766 			pc->pc_nitems--;
1767 			object = pcg_get(pcg);
1768 			if (pcg->pcg_avail == 0 && pc->pc_allocfrom == pcg)
1769 				pc->pc_allocfrom = NULL;
1770 			if (pc->pc_dtor != NULL)
1771 				(*pc->pc_dtor)(pc->pc_arg, object);
1772 			(*putit)(pc->pc_pool, object);
1773 		}
1774 		if (free_groups) {
1775 			pc->pc_ngroups--;
1776 			TAILQ_REMOVE(&pc->pc_grouplist, pcg, pcg_list);
1777 			if (pc->pc_freeto == pcg)
1778 				pc->pc_freeto = NULL;
1779 			pool_put(&pcgpool, pcg);
1780 		}
1781 	}
1782 }
1783 
1784 /*
1785  * pool_cache_invalidate:
1786  *
1787  *	Invalidate a pool cache (destruct and release all of the
1788  *	cached objects).
1789  */
1790 void
1791 pool_cache_invalidate(struct pool_cache *pc)
1792 {
1793 
1794 	simple_lock(&pc->pc_slock);
1795 	pool_cache_do_invalidate(pc, 0, pool_put);
1796 	simple_unlock(&pc->pc_slock);
1797 }
1798 
1799 /*
1800  * pool_cache_reclaim:
1801  *
1802  *	Reclaim a pool cache for pool_reclaim().
1803  */
1804 static void
1805 pool_cache_reclaim(struct pool_cache *pc)
1806 {
1807 
1808 	simple_lock(&pc->pc_slock);
1809 	pool_cache_do_invalidate(pc, 1, pool_do_put);
1810 	simple_unlock(&pc->pc_slock);
1811 }
1812 
1813 /*
1814  * We have three different sysctls.
1815  * kern.pool.npools - the number of pools.
1816  * kern.pool.pool.<pool#> - the pool struct for the pool#.
1817  * kern.pool.name.<pool#> - the name for pool#.
1818  */
1819 int
1820 sysctl_dopool(int *name, u_int namelen, char *where, size_t *sizep)
1821 {
1822 	struct pool *pp, *foundpool = NULL;
1823 	size_t buflen = where != NULL ? *sizep : 0;
1824 	int npools = 0, s;
1825 	unsigned int lookfor;
1826 	size_t len;
1827 
1828 	switch (*name) {
1829 	case KERN_POOL_NPOOLS:
1830 		if (namelen != 1 || buflen != sizeof(int))
1831 			return (EINVAL);
1832 		lookfor = 0;
1833 		break;
1834 	case KERN_POOL_NAME:
1835 		if (namelen != 2 || buflen < 1)
1836 			return (EINVAL);
1837 		lookfor = name[1];
1838 		break;
1839 	case KERN_POOL_POOL:
1840 		if (namelen != 2 || buflen != sizeof(struct pool))
1841 			return (EINVAL);
1842 		lookfor = name[1];
1843 		break;
1844 	default:
1845 		return (EINVAL);
1846 	}
1847 
1848 	s = splvm();
1849 	simple_lock(&pool_head_slock);
1850 
1851 	TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
1852 		npools++;
1853 		if (lookfor == pp->pr_serial) {
1854 			foundpool = pp;
1855 			break;
1856 		}
1857 	}
1858 
1859 	simple_unlock(&pool_head_slock);
1860 	splx(s);
1861 
1862 	if (*name != KERN_POOL_NPOOLS && foundpool == NULL)
1863 		return (ENOENT);
1864 
1865 	switch (*name) {
1866 	case KERN_POOL_NPOOLS:
1867 		return copyout(&npools, where, buflen);
1868 	case KERN_POOL_NAME:
1869 		len = strlen(foundpool->pr_wchan) + 1;
1870 		if (*sizep < len)
1871 			return (ENOMEM);
1872 		*sizep = len;
1873 		return copyout(foundpool->pr_wchan, where, len);
1874 	case KERN_POOL_POOL:
1875 		return copyout(foundpool, where, buflen);
1876 	}
1877 	/* NOTREACHED */
1878 	return (0); /* XXX - Stupid gcc */
1879 }
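
/*
 * Illustrative sketch, not part of this file: the sysctls above queried
 * from userland.  This assumes the kern.pool node is exported as
 * KERN_POOL in <sys/sysctl.h> and dispatched to sysctl_dopool() with
 * the leading "kern.pool" components stripped.
 */
#if 0	/* example only, userland */
#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/pool.h>
#include <stdio.h>

int
print_pool_names(void)
{
	int mib[4], npools, serial;
	char name[40];
	size_t size;

	mib[0] = CTL_KERN;
	mib[1] = KERN_POOL;
	mib[2] = KERN_POOL_NPOOLS;
	size = sizeof(npools);		/* must be exactly sizeof(int) */
	if (sysctl(mib, 3, &npools, &size, NULL, 0) == -1)
		return (-1);

	/*
	 * Serial numbers are handed out from 1 upwards (see pool_init());
	 * destroyed pools leave gaps, so individual lookups may fail.
	 */
	mib[2] = KERN_POOL_NAME;
	for (serial = 1; serial <= npools; serial++) {
		mib[3] = serial;
		size = sizeof(name);
		if (sysctl(mib, 4, name, &size, NULL, 0) == -1)
			continue;
		printf("pool %d: %s\n", serial, name);
	}
	return (0);
}
#endif	/* example only, userland */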
1880