xref: /dflybsd-src/sys/kern/kern_kmalloc.c (revision e9dbfea17a45e13134f19ed8fa22fbe8d11ac99c)
1 /*
2  * KERN_KMALLOC.C	- Kernel memory allocator
3  *
4  * Copyright (c) 2021 The DragonFly Project, All rights reserved.
5  *
6  * This code is derived from software contributed to The DragonFly Project
7  * by Matthew Dillon <dillon@backplane.com>
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in
17  *    the documentation and/or other materials provided with the
18  *    distribution.
19  * 3. Neither the name of The DragonFly Project nor the names of its
20  *    contributors may be used to endorse or promote products derived
21  *    from this software without specific, prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 
37 /*
38  * This module implements the kmalloc_obj allocator.  This is a type-stable
39  * allocator that uses the same base structures (e.g. malloc_type) plus
40  * some extensions to efficiently implement single-type zones.
41  *
42  * All memory management is zone based.  When a zone is destroyed, all of
43  * its memory is returned to the system with no fragmentation.
44  *
45  * A mini-slab allocator hangs directly off the zone structure (malloc_type).
46  * Since the object zones are single-size-only, the slab allocator is very
47  * simple and currently utilizes just two per-zone/per-cpu slabs (active and
48  * alternate) before kicking up to the per-zone cache.  Beyond that we just
49  * have the per-cpu globaldata-based 'free slab' cache to avoid unnecessary
50  * kernel_map mappings and unmappings.
51  *
52  * The advantage of this is that zones don't stomp over each other and cause
53  * excessive fragmentation in the slabs.  For example, when you umount a
54  * large tmpfs filesystem, most of its memory (all of its kmalloc_obj memory)
55  * is returned to the system.
56  */
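
/*
 * Illustrative usage sketch (not compiled): a fixed-size, type-stable zone
 * as seen from a consumer.  'struct foo' and 'M_FOOZONE' are hypothetical
 * names; the zone is assumed to have been declared through the object-zone
 * support in sys/malloc.h so that its ks_objsize equals sizeof(struct foo)
 * and KSF_OBJSIZE is set.  Real callers normally go through the wrapper
 * macros in sys/malloc.h rather than calling the underscore functions in
 * this file directly.
 */
#if 0
struct foo {
	int	refs;
	char	payload[500];
};

MALLOC_DECLARE(M_FOOZONE);

static struct foo *
foo_alloc(void)
{
	/* size must match the zone's fixed object size (ks_objsize) */
	return (_kmalloc_obj(sizeof(struct foo), M_FOOZONE,
			     M_WAITOK | M_ZERO));
}

static void
foo_free(struct foo *fp)
{
	_kfree_obj(fp, M_FOOZONE);
}
#endif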
57 
58 #include "opt_vm.h"
59 
60 #include <sys/param.h>
61 #include <sys/systm.h>
62 #include <sys/kernel.h>
63 #include <sys/slaballoc.h>
64 #include <sys/mbuf.h>
65 #include <sys/vmmeter.h>
66 #include <sys/spinlock.h>
67 #include <sys/lock.h>
68 #include <sys/thread.h>
69 #include <sys/globaldata.h>
70 #include <sys/sysctl.h>
71 #include <sys/ktr.h>
72 #include <sys/malloc.h>
73 
74 #include <vm/vm.h>
75 #include <vm/vm_param.h>
76 #include <vm/vm_kern.h>
77 #include <vm/vm_extern.h>
78 #include <vm/vm_object.h>
79 #include <vm/pmap.h>
80 #include <vm/vm_map.h>
81 #include <vm/vm_page.h>
82 #include <vm/vm_pageout.h>
83 
84 #include <machine/cpu.h>
85 
86 #include <sys/spinlock2.h>
87 #include <sys/thread2.h>
88 #include <vm/vm_page2.h>
89 
90 #define MEMORY_STRING	"ptr=%p type=%p size=%lu flags=%04x"
91 #define MEMORY_ARGS	void *ptr, void *type, unsigned long size, int flags
92 
93 #if !defined(KTR_MEMORY)
94 #define KTR_MEMORY	KTR_ALL
95 #endif
96 KTR_INFO_MASTER(mem_obj);
97 KTR_INFO(KTR_MEMORY, mem_obj, malloc_beg, 0, "kmalloc_obj begin");
98 KTR_INFO(KTR_MEMORY, mem_obj, malloc_end, 1, MEMORY_STRING, MEMORY_ARGS);
99 KTR_INFO(KTR_MEMORY, mem_obj, free_zero, 2, MEMORY_STRING, MEMORY_ARGS);
100 KTR_INFO(KTR_MEMORY, mem_obj, free_ovsz, 3, MEMORY_STRING, MEMORY_ARGS);
101 KTR_INFO(KTR_MEMORY, mem_obj, free_ovsz_delayed, 4, MEMORY_STRING, MEMORY_ARGS);
102 KTR_INFO(KTR_MEMORY, mem_obj, free_chunk, 5, MEMORY_STRING, MEMORY_ARGS);
103 KTR_INFO(KTR_MEMORY, mem_obj, free_request, 6, MEMORY_STRING, MEMORY_ARGS);
104 KTR_INFO(KTR_MEMORY, mem_obj, free_rem_beg, 7, MEMORY_STRING, MEMORY_ARGS);
105 KTR_INFO(KTR_MEMORY, mem_obj, free_rem_end, 8, MEMORY_STRING, MEMORY_ARGS);
106 KTR_INFO(KTR_MEMORY, mem_obj, free_beg, 9, "kfree_obj begin");
107 KTR_INFO(KTR_MEMORY, mem_obj, free_end, 10, "kfree_obj end");
108 
109 #define logmemory(name, ptr, type, size, flags)				\
110 	KTR_LOG(mem_obj ## name, ptr, type, size, flags)
111 #define logmemory_quick(name)						\
112 	KTR_LOG(mem_obj ## name)
113 
114 __read_frequently static int KMGDMaxFreeSlabs = KMGD_MAXFREESLABS;
115 SYSCTL_INT(_kern, OID_AUTO, kzone_cache, CTLFLAG_RW, &KMGDMaxFreeSlabs, 0, "");
116 __read_frequently static int kzone_debug;
117 SYSCTL_INT(_kern, OID_AUTO, kzone_debug, CTLFLAG_RW, &kzone_debug, 0, "");
118 
119 __read_frequently struct kmalloc_slab kslab_dummy;
120 
121 static void malloc_slab_destroy(struct malloc_type *type,
122 			struct kmalloc_slab **slabp);
123 
124 /*
125  * Cache a chain of slabs onto their respective cpu slab caches.  Any slabs
126  * which we cannot cache will be returned.
127  *
128  * free_slabs	     - Current structure may only be accessed by current cpu
129  * remote_free_slabs - Only atomic swap operations are allowed.
130  * free_count	     - Only atomic operations are allowed.
131  *
132  * If the count is sufficient to cache the entire list, NULL is returned.
133  * Otherwise the portion that was not cached is returned.
134  */
135 static __noinline
136 struct kmalloc_slab *
137 gslab_cache(struct kmalloc_slab *slab)
138 {
139 	struct kmalloc_slab *save;
140 	struct kmalloc_slab *next;
141 	struct kmalloc_slab *res;
142 	struct kmalloc_slab **resp;
143 	struct kmalloc_slab **slabp;
144 	globaldata_t rgd;
145 	size_t count;
146 	int cpuid;
147 
148 	res = NULL;
149 	resp = &res;
150 	KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
151 
152 	/*
153 	 * Given the slab list, get the cpuid and clip off as many matching
154 	 * elements as fits in the cache.
155 	 */
156 	while (slab) {
157 		cpuid = slab->orig_cpuid;
158 		rgd = globaldata_find(cpuid);
159 
160 		KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
161 		/*
162 		 * Doesn't fit in cache, put on return list.
163 		 */
164 		if (rgd->gd_kmslab.free_count >= KMGDMaxFreeSlabs) {
165 			*resp = slab;
166 			resp = &slab->next;
167 			slab = slab->next;
168 			continue;
169 		}
170 
171 		/*
172 		 * Collect.  We aren't required to match up the original cpu
173 		 * with the disposal cpu, but it's a good idea to retain
174 		 * memory locality.
175 		 *
176 		 * The slabs we collect are going into the global cache,
177 		 * remove the type association.
178 		 */
179 		KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
180 		slabp = &slab->next;
181 		count = 1;
182 		slab->type = NULL;
183 
184 		while ((next = *slabp) != NULL &&
185 		       next->orig_cpuid == cpuid &&
186 		       rgd->gd_kmslab.free_count + count < KMGDMaxFreeSlabs)
187 	        {
188 			KKASSERT(((uintptr_t)next & KMALLOC_SLAB_MASK) == 0);
189 			next->type = NULL;
190 			++count;
191 			slabp = &next->next;
192 		}
193 
194 		/*
195 		 * Safety: unhook before (next).  (next) is not included in
196 		 * the list starting at (slab) that is being prepended to
197 		 * remote_free_slabs.
198 		 */
199 		*slabp = NULL;
200 
201 		/*
202 		 * Now atomically prepend slab...*slabp to remote_free_slabs.
203 		 * Pump the count first (it's ok if the actual chain length
204 		 * races the count update).
205 		 *
206 		 * NOTE: In the loop, (save) is updated by fcmpset.
207 		 */
208 		atomic_add_long(&rgd->gd_kmslab.free_count, count);
209 		save = rgd->gd_kmslab.remote_free_slabs;
210 		for (;;) {
211 			KKASSERT(((uintptr_t)save & KMALLOC_SLAB_MASK) == 0);
212 			*slabp = save;	/* chain our list's tail to the old head */
213 			cpu_ccfence();
214 			if (atomic_fcmpset_ptr(
215 				&rgd->gd_kmslab.remote_free_slabs,
216 				&save, slab))
217 			{
218 				break;
219 			}
220 		}
221 
222 		/*
223 		 * Setup for next loop
224 		 */
225 		slab = next;
226 	}
227 
228 	/*
229 	 * Terminate the result list and return it
230 	 */
231 	*resp = NULL;
232 
233 	return res;
234 }
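
/*
 * The fcmpset loop above is the producer half of a simple lock-free
 * hand-off: link our chain's tail to the currently published head, then
 * publish our chain's head, retrying if another cpu published first.
 * A minimal stand-alone sketch of the same pattern follows (not compiled;
 * the helper name and argument layout are illustrative only).
 */
#if 0
static void
push_chain(struct kmalloc_slab **headp,		/* published list head */
	   struct kmalloc_slab *first,		/* head of our chain */
	   struct kmalloc_slab **tail_nextp)	/* &(chain tail)->next */
{
	struct kmalloc_slab *save;

	save = *headp;
	for (;;) {
		*tail_nextp = save;	/* chain our tail to the old head */
		cpu_ccfence();
		if (atomic_fcmpset_ptr(headp, &save, first))
			break;		/* published; (save) reloaded on failure */
	}
}
#endif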
235 
236 /*
237  * May only be called on current cpu.  Pull a free slab from the
238  * pcpu cache.  If we run out, move any slabs that have built up
239  * from remote cpus.
240  *
241  * We are only allowed to swap the remote_free_slabs head, we cannot
242  * manipulate any next pointers while structures are sitting on that list.
243  */
244 static __inline
245 struct kmalloc_slab *
246 gslab_alloc(globaldata_t gd)
247 {
248 	struct kmalloc_slab *slab;
249 
250 	slab = gd->gd_kmslab.free_slabs;
251 	if (slab == NULL) {
252 		slab = atomic_swap_ptr(
253 			(volatile void **)&gd->gd_kmslab.remote_free_slabs,
254 			NULL);
255 		KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
256 	}
257 	if (slab) {
258 		gd->gd_kmslab.free_slabs = slab->next;
259 		slab->next = NULL;
260 		atomic_add_long(&gd->gd_kmslab.free_count, -1);
261 		KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
262 	}
263 	return slab;
264 }
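
/*
 * NOTE: The atomic_swap_ptr() above is the consumer half of the hand-off
 *	 performed by gslab_cache().  Because only the owning cpu ever
 *	 consumes its remote_free_slabs list, swapping the head to NULL
 *	 transfers the entire chain to us and it can then be walked without
 *	 locks.  free_count is only advisory and may briefly disagree with
 *	 the actual chain length; the code tolerates that.
 */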
265 
266 void
267 malloc_mgt_init(struct malloc_type *type __unused,
268 		struct kmalloc_mgt *mgt, size_t size)
269 {
270 	size_t offset;
271 	size_t count;
272 
273 	bzero(mgt, sizeof(*mgt));
274 	spin_init(&mgt->spin, "kmmgt");
275 
276 	/*
277 	 * Allows us to avoid a conditional.  The dummy slabs are empty
278 	 * and have no objects.
279 	 */
280 	mgt->active = &kslab_dummy;
281 	mgt->alternate = &kslab_dummy;
282 	mgt->empty_tailp = &mgt->empty;
283 
284 	/*
285 	 * Figure out the count: each object also consumes one fobjs[]
286 	 * pointer, so add sizeof(void *) to the object size.
287 	 */
288 	offset = offsetof(struct kmalloc_slab, fobjs[0]);
289 	offset = __VM_CACHELINE_ALIGN(offset);
290 	count = (KMALLOC_SLAB_SIZE - offset) / (size + sizeof(void *));
291 
292 	/*
293 	 * However, the fobjs[] array itself must be aligned, so we might
294 	 * have to reduce the count by 1.  (We can do this because 'size'
295 	 * is already aligned as well).
296 	 */
297 	offset = offsetof(struct kmalloc_slab, fobjs[count]);
298 	offset = __VM_CACHELINE_ALIGN(offset);
299 
300 	if (offset + size * count > KMALLOC_SLAB_SIZE) {
301 		--count;
302 		offset = offsetof(struct kmalloc_slab, fobjs[count]);
303 		offset = __VM_CACHELINE_ALIGN(offset);
304 		KKASSERT (offset + size * count <= KMALLOC_SLAB_SIZE);
305 	}
306 
307 	mgt->slab_offset = offset;
308 	mgt->slab_count	 = count;
309 }
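
/*
 * Worked example of the sizing above, using made-up numbers (the real
 * KMALLOC_SLAB_SIZE, cache line size, and header layout come from the
 * headers):  assume a 131072-byte slab, 64-byte cache lines, a header
 * whose fobjs[0] lands at offset 192, and a 1024-byte object size:
 *
 *	count  = (131072 - 192) / (1024 + 8)		= 126
 *	offset = ALIGN64(192 + 126 * 8) = ALIGN64(1200)	= 1216
 *	check:   1216 + 126 * 1024 = 130240 <= 131072
 *
 * so count stays at 126.  When the check fails instead, count is reduced
 * by one, offset is recomputed, and the KKASSERT() re-verifies the fit.
 */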
310 
311 void
312 malloc_mgt_relocate(struct kmalloc_mgt *src, struct kmalloc_mgt *dst)
313 {
314 	struct kmalloc_slab **slabp;
315 
316 	spin_init(&dst->spin, "kmmgt");
317 	slabp = &dst->empty;
318 
319 	while (*slabp) {
320 		slabp = &(*slabp)->next;
321 	}
322 	dst->empty_tailp = slabp;
323 }
324 
325 void
326 malloc_mgt_uninit(struct malloc_type *type, struct kmalloc_mgt *mgt)
327 {
328 	if (mgt->active != &kslab_dummy)
329 		malloc_slab_destroy(type, &mgt->active);
330 	mgt->active = NULL;
331 
332 	if (mgt->alternate != &kslab_dummy)
333 		malloc_slab_destroy(type, &mgt->alternate);
334 	mgt->alternate = NULL;
335 
336 	malloc_slab_destroy(type, &mgt->partial);
337 	malloc_slab_destroy(type, &mgt->full);
338 	malloc_slab_destroy(type, &mgt->empty);
339 	mgt->npartial = 0;
340 	mgt->nfull = 0;
341 	mgt->nempty = 0;
342 	mgt->empty_tailp = &mgt->empty;
343 
344 	spin_uninit(&mgt->spin);
345 }
346 
347 /*
348  * Destroy a list of slabs.  Attempt to cache the slabs on the specified
349  * (possibly remote) cpu.  This allows slabs that were operating on a
350  * particular cpu to be disposed of back to that same cpu.
351  */
352 static void
353 malloc_slab_destroy(struct malloc_type *type, struct kmalloc_slab **slabp)
354 {
355 	struct kmalloc_slab *slab;
356 	struct kmalloc_slab *base;
357 	struct kmalloc_slab **basep;
358 	size_t delta;
359 
360 	if (*slabp == NULL)
361 		return;
362 
363 	/*
364 	 * Collect all slabs that can actually be destroyed, complain
365 	 * about the rest.
366 	 */
367 	base = NULL;
368 	basep = &base;
369 	while ((slab = *slabp) != NULL) {
370 		KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
371 
372 		delta = slab->findex - slab->aindex;
373 		if (delta == slab->ncount) {
374 			*slabp = slab->next;	/* unlink */
375 			*basep = slab;		/* link into base list */
376 			basep = &slab->next;
377 		} else {
378 			kprintf("%s: slab %p %zd objects "
379 				"were still allocated\n",
380 				type->ks_shortdesc, slab,
381 				slab->ncount - delta);
382 			/* leave link intact and iterate */
383 			slabp = &slab->next;
384 		}
385 	}
386 
387 	/*
388 	 * Terminate the base list of slabs that can be destroyed,
389 	 * then cache as many of them as possible.
390 	 */
391 	*basep = NULL;
392 	if (base == NULL)
393 		return;
394 	base = gslab_cache(base);
395 
396 	/*
397 	 * Destroy the remainder
398 	 */
399 	while ((slab = base) != NULL) {
400 		base = slab->next;
401 		slab->next = (void *)(uintptr_t)-1;
402 		kmem_slab_free(slab, KMALLOC_SLAB_SIZE);
403 	}
404 }
405 
406 /*
407  * Poll a limited number of slabs on the empty list and move them
408  * to the appropriate full or partial list.  Slabs left on the empty
409  * list are rotated to the tail.
410  *
411  * If gcache is non-zero this function will try to place full slabs into
412  * the globaldata cache, if it isn't already too full.
413  *
414  * The caller must hold the mgt spin-lock.
415  *
416  * Returns non-zero if the ggm updates possibly made slabs available for
417  * allocation.
418  */
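/*
 * Index arithmetic used by the classification below (see also the
 * aindex/findex/xindex handling in _kmalloc_obj() and _kfree_obj()):
 *
 *	findex - aindex == ncount	every object is free  ('full' list)
 *	0 < findex - aindex < ncount	some objects are free ('partial')
 *	findex - aindex == 0		no objects are free   ('empty' list)
 *
 * 'full' and 'empty' thus describe the slab's supply of free objects, not
 * how much of it is allocated.  Because _kfree_obj() bumps findex before
 * storing the freed pointer into fobjs[], a slab is only handed to the
 * globaldata cache once xindex has caught up with findex, which proves
 * that all racing frees have completed their stores.
 */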
419 static int
420 malloc_mgt_poll_empty_locked(struct kmalloc_mgt *ggm, int count, int gcache)
421 {
422 	struct kmalloc_slab *marker;
423 	struct kmalloc_slab *slab;
424 	size_t delta;
425 	int got_something;
426 
427 	if (ggm->empty == NULL)
428 		return 0;
429 
430 	got_something = 0;
431 	marker = ggm->empty;
432 
433 	while (count-- && (slab = ggm->empty) != NULL) {
434 		/*
435 		 * Unlink from empty
436 		 */
437 		ggm->empty = slab->next;
438 		slab->next = NULL;
439 		--ggm->nempty;
440 		if (ggm->empty_tailp == &slab->next)
441 			ggm->empty_tailp = &ggm->empty;
442 
443 		/*
444 		 * Check partial, full, and empty.  We rotate
445 		 * empty entries to the end of the empty list.
446 		 *
447 		 * NOTE: For a fully-freeable slab we also have
448 		 *	 to check xindex.
449 		 */
450 		delta = slab->findex - slab->aindex;
451 		if (delta == slab->ncount) {
452 			/*
453 			 * Full and possibly freeable.
454 			 */
455 			if (gcache &&
456 			    ggm->nfull >= KMALLOC_MAXFREEMAGS &&
457 			    slab->xindex == slab->findex)
458 			{
459 				KKASSERT(slab->next == NULL);
460 				slab = gslab_cache(slab);
461 			}
462 			if (slab) {
463 				KKASSERT(slab->next == NULL);
464 				slab->next = ggm->full;
465 				ggm->full = slab;
466 				got_something = 1;
467 				++ggm->nfull;
468 			}
469 		} else if (delta) {
470 			/*
471 			 * Partially full
472 			 */
473 			KKASSERT(slab->next == NULL);
474 			slab->next = ggm->partial;
475 			ggm->partial = slab;
476 			got_something = 1;
477 			++ggm->npartial;
478 		} else {
479 			/*
480 			 * Empty
481 			 */
482 			KKASSERT(slab->next == NULL);
483 			*ggm->empty_tailp = slab;
484 			ggm->empty_tailp = &slab->next;
485 			++ggm->nempty;
486 			if (ggm->empty == marker)
487 				break;
488 		}
489 	}
490 	return got_something;
491 }
492 
493 /*
494  * Called once a second with the zone interlocked against destruction.
495  *
496  * Returns non-zero to tell the caller to iterate to the next type,
497  * else the caller should stay on the current type.
498  */
499 int
500 malloc_mgt_poll(struct malloc_type *type)
501 {
502 	struct kmalloc_mgt *ggm;
503 	struct kmalloc_slab *slab;
504 	struct kmalloc_slab **slabp;
505 	struct kmalloc_slab *base;
506 	struct kmalloc_slab **basep;
507 	size_t delta;
508 	int donext;
509 	int count;
510 	int retired;
511 
512 	if ((type->ks_flags & KSF_OBJSIZE) == 0)
513 		return 1;
514 
515 	/*
516 	 * Check the partial, full, and empty lists for full freeable slabs
517 	 * in excess of desired caching count.
518 	 */
519 	ggm = &type->ks_mgt;
520 	spin_lock(&ggm->spin);
521 
522 	/*
523 	 * Move empty slabs to partial or full as appropriate.  We
524 	 * don't bother checking partial slabs to see if they are full
525 	 * for now.
526 	 */
527 	malloc_mgt_poll_empty_locked(ggm, 16, 0);
528 
529 	/*
530 	 * Ok, cleanout some of the full mags from the full list
531 	 */
532 	base = NULL;
533 	basep = &base;
534 	count = ggm->nfull;
535 	retired = 0;
536 	cpu_ccfence();
537 
538 	if (count > KMALLOC_MAXFREEMAGS) {
539 		slabp = &ggm->full;
540 		count -= KMALLOC_MAXFREEMAGS;
541 		if (count > 16)
542 			count = 16;
543 
544 		while (count && (slab = *slabp) != NULL) {
545 			delta = slab->findex - slab->aindex;
546 			if (delta == slab->ncount &&
547 			    slab->xindex == slab->findex)
548 			{
549 				*slabp = slab->next;
550 				*basep = slab;
551 				basep = &slab->next;
552 				--ggm->nfull;
553 				if (++retired == 4)
554 					break;
555 			} else {
556 				slabp = &slab->next;
557 			}
558 			--count;
559 		}
560 		*basep = NULL;	/* terminate the retirement list */
561 		donext = (*slabp == NULL);
562 	} else {
563 		donext = 1;
564 	}
565 	spin_unlock(&ggm->spin);
566 
567 	/*
568 	 * Clean out any slabs that we couldn't stow in the globaldata cache.
569 	 */
570 	if (retired) {
571 		if (kzone_debug) {
572 			kprintf("kmalloc_poll: %s retire %d\n",
573 				type->ks_shortdesc, retired);
574 		}
575 		base = gslab_cache(base);
576 		while ((slab = base) != NULL) {
577 			base = base->next;
578 			slab->next = NULL;
579 			kmem_slab_free(slab, KMALLOC_SLAB_SIZE);
580 		}
581 	}
582 
583 	return donext;
584 }
585 
586 /*
587  * Optional bitmap double-free check.  This is typically turned on by
588  * default for safety (sys/_malloc.h)
589  */
590 #ifdef KMALLOC_CHECK_DOUBLE_FREE
591 
592 static __inline void
593 bmap_set(struct kmalloc_slab *slab, void *obj)
594 {
595 	uint64_t *ptr;
596 	uint64_t mask;
597 	size_t i = (((uintptr_t)obj & KMALLOC_SLAB_MASK) - slab->offset) /
598 		   slab->objsize;
599 
600 	ptr = &slab->bmap[i >> 6];
601 	mask = (uint64_t)1U << (i & 63);
602 	KKASSERT(i < slab->ncount && (*ptr & mask) == 0);
603 	atomic_set_64(ptr, mask);
604 }
605 
606 static __inline void
607 bmap_clr(struct kmalloc_slab *slab, void *obj)
608 {
609 	uint64_t *ptr;
610 	uint64_t mask;
611 	size_t i = (((uintptr_t)obj & KMALLOC_SLAB_MASK) - slab->offset) /
612 		   slab->objsize;
613 
614 	ptr = &slab->bmap[i >> 6];
615 	mask = (uint64_t)1U << (i & 63);
616 	KKASSERT(i < slab->ncount && (*ptr & mask) != 0);
617 	atomic_clear_64(ptr, mask);
618 }
619 
620 #endif
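
/*
 * Illustration of the bitmap indexing above, with made-up numbers: for a
 * slab whose objects start at offset 1216 and are 1024 bytes each, an
 * object at slab-relative offset 11456 maps to
 *
 *	i    = (11456 - 1216) / 1024 = 10
 *	word = i >> 6 = 0,  bit = i & 63 = 10
 *
 * so a double free (or a stray pointer) trips the KKASSERT()s on bit 10
 * of bmap[0].
 */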
621 
622 /*
623  * Cleanup a mgt structure.
624  *
625  * Always called from the current cpu, so we can manipulate the various
626  * lists freely.
627  *
628  * WARNING: findex can race, fobjs[n] is updated after findex is incremented,
629  *	    and the 'full' determination can race those stores (see xindex)
630  */
631 #if 0
632 static void
633 mgt_cleanup(struct kmalloc_mgt *mgt)
634 {
635 #if 0
636 	struct kmalloc_slab **slabp;
637 	struct kmalloc_slab *slab;
638 	size_t delta;
639 	size_t total;
640 #endif
641 }
642 #endif
643 
644 #ifdef SLAB_DEBUG
645 void *
646 _kmalloc_obj_debug(unsigned long size, struct malloc_type *type, int flags,
647 	      const char *file, int line)
648 #else
649 void *
650 _kmalloc_obj(unsigned long size, struct malloc_type *type, int flags)
651 #endif
652 {
653 	struct kmalloc_slab *slab;
654 	struct kmalloc_use *use;
655 	struct kmalloc_mgt *mgt;
656 	struct kmalloc_mgt *ggm;
657 	globaldata_t gd;
658 	void *obj;
659 	size_t delta;
660 
661 	/*
662 	 * Check limits
663 	 */
664 	while (__predict_false(type->ks_loosememuse >= type->ks_limit)) {
665 		long ttl;
666 		int n;
667 
668 		for (n = ttl = 0; n < ncpus; ++n)
669 			ttl += type->ks_use[n].memuse;
670 		type->ks_loosememuse = ttl;	/* not MP synchronized */
671 		if ((ssize_t)ttl < 0)		/* deal with occasional race */
672 			ttl = 0;
673 		if (ttl >= type->ks_limit) {
674 			if (flags & M_NULLOK)
675 				return(NULL);
676 			panic("%s: malloc limit exceeded", type->ks_shortdesc);
677 		}
678 	}
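
	/*
	 * NOTE: ks_loosememuse is an unsynchronized, lazily-updated
	 *	 aggregate of the per-cpu memuse counters (see the found:
	 *	 path below); the loop above only re-sums it when it appears
	 *	 to have crossed ks_limit, so brief overshoot of the limit
	 *	 is possible and tolerated.
	 */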
679 
680 	/*
681 	 * Setup
682 	 */
683 	crit_enter();
684 	logmemory_quick(malloc_beg);
685 	KKASSERT(size == type->ks_objsize);
686 	gd = mycpu;
687 	use = &type->ks_use[gd->gd_cpuid];
688 
689 retry:
690 	/*
691 	 * Check active
692 	 *
693 	 * NOTE: obj can be NULL if racing a _kfree_obj().
694 	 */
695 	mgt = &use->mgt;
696 	slab = mgt->active;			/* Might be dummy */
697 	delta = slab->findex - slab->aindex;
698 	if (__predict_true(delta != 0)) {	/* Cannot be dummy */
699 		size_t i;
700 
701 		i = slab->aindex % slab->ncount;
702 		obj = slab->fobjs[i];
703 		if (__predict_true(obj != NULL)) {
704 			slab->fobjs[i] = NULL;
705 			++slab->aindex;
706 #ifdef KMALLOC_CHECK_DOUBLE_FREE
707 			bmap_set(slab, obj);
708 #endif
709 			goto found;
710 		}
711 	}
712 
713 	/*
714 	 * Check alternate.  If we find something, swap it with
715 	 * the active.
716 	 *
717 	 * NOTE: It is possible for exhausted slabs to recover entries
718 	 *	 via _kfree_obj(), so we just keep swapping until both
719 	 *	 are empty.
720 	 *
721 	 * NOTE: obj can be NULL if racing a _kfree_obj().
722 	 */
723 	slab = mgt->alternate;			/* Might be dummy */
724 	delta = slab->findex - slab->aindex;
725 	if (__predict_true(delta != 0)) {	/* Cannot be dummy */
726 		size_t i;
727 
728 		mgt->alternate = mgt->active;
729 		mgt->active = slab;
730 		i = slab->aindex % slab->ncount;
731 		obj = slab->fobjs[i];
732 		if (__predict_true(obj != NULL)) {
733 			slab->fobjs[i] = NULL;
734 			++slab->aindex;
735 #ifdef KMALLOC_CHECK_DOUBLE_FREE
736 			bmap_set(slab, obj);
737 #endif
738 			goto found;
739 		}
740 	}
741 
742 	/*
743 	 * Rotate a slab from the global mgt into the pcpu mgt.
744 	 *
745 	 *	G(partial, full) -> active -> alternate -> G(empty)
746 	 *
747 	 * We try to exhaust partials first to reduce fragmentation, then
748 	 * dig into the fulls.
749 	 */
750 	ggm = &type->ks_mgt;
751 	spin_lock(&ggm->spin);
752 
753 rerotate:
754 	if (ggm->partial) {
755 		slab = mgt->alternate;		/* Might be dummy */
756 		mgt->alternate = mgt->active;	/* Might be dummy */
757 		mgt->active = ggm->partial;
758 		ggm->partial = ggm->partial->next;
759 		mgt->active->next = NULL;
760 		--ggm->npartial;
761 		if (slab != &kslab_dummy) {
762 			KKASSERT(slab->next == NULL);
763 			*ggm->empty_tailp = slab;
764 			ggm->empty_tailp = &slab->next;
765 			++ggm->nempty;
766 		}
767 		spin_unlock(&ggm->spin);
768 		goto retry;
769 	}
770 
771 	if (ggm->full) {
772 		slab = mgt->alternate;		/* Might be dummy */
773 		mgt->alternate = mgt->active;	/* Might be dummy */
774 		mgt->active = ggm->full;
775 		ggm->full = ggm->full->next;
776 		mgt->active->next = NULL;
777 		--ggm->nfull;
778 		if (slab != &kslab_dummy) {
779 			KKASSERT(slab->next == NULL);
780 			*ggm->empty_tailp = slab;
781 			ggm->empty_tailp = &slab->next;
782 			++ggm->nempty;
783 		}
784 		spin_unlock(&ggm->spin);
785 		goto retry;
786 	}
787 
788 	/*
789 	 * We couldn't find anything, scan a limited number of empty entries
790 	 * looking for something with objects.
791 	 */
792 	if (malloc_mgt_poll_empty_locked(ggm, 16, 1))
793 		goto rerotate;
794 
795 	/*
796 	 * Absolutely nothing is available, allocate a new slab and
797 	 * rotate it in.
798 	 *
799 	 * Try to get a slab from the global pcpu slab cache.
800 	 */
801 	spin_unlock(&ggm->spin);
802 
803 	if (gd->gd_kmslab.free_count == 0 || (slab = gslab_alloc(gd)) == NULL) {
804 		slab = kmem_slab_alloc(KMALLOC_SLAB_SIZE, KMALLOC_SLAB_SIZE,
805 				       M_WAITOK);
806 	}
807 
808 	bzero(slab, sizeof(*slab));
809 	KKASSERT(offsetof(struct kmalloc_slab, fobjs[use->mgt.slab_count]) <=
810 		 use->mgt.slab_offset);
811 
812 	obj = (char *)slab + use->mgt.slab_offset;
813 	slab->type = type;
814 	slab->orig_cpuid = gd->gd_cpuid;
815 	slab->ncount = use->mgt.slab_count;
816 	slab->offset = use->mgt.slab_offset;
817 	slab->objsize = type->ks_objsize;
818 	slab->aindex = 0;
819 	slab->findex = slab->ncount;
820 	slab->xindex = slab->ncount;
821 	for (delta = 0; delta < slab->ncount; ++delta) {
822 		slab->fobjs[delta] = obj;
823 		obj = (char *)obj + type->ks_objsize;
824 	}
825 
826 	/*
827 	 * Sanity check: assert that the last byte of the last object is still
828 	 * in the slab.
829 	 */
830 #if 0
831 	KKASSERT(((((uintptr_t)obj - 1) ^ (uintptr_t)slab) &
832 		  ~KMALLOC_SLAB_MASK) == 0);
833 #endif
834 	KASSERT(((((uintptr_t)obj - 1) ^ (uintptr_t)slab) &
835 		  ~KMALLOC_SLAB_MASK) == 0, ("SLAB %p ncount %zd objsize %zd obj=%p\n", slab, slab->ncount, slab->objsize, obj));
836 	slab->magic = KMALLOC_SLAB_MAGIC;
837 	spin_init(&slab->spin, "kmslb");
838 
839 	/*
840 	 * Rotate it in, then retry.
841 	 *
842 	 *	(NEW)slab -> active -> alternate -> G(empty)
843 	 */
844 	spin_lock(&ggm->spin);
845 	if (mgt->alternate != &kslab_dummy) {
846 		struct kmalloc_slab *slab_tmp;
847 
848 		slab_tmp = mgt->alternate;
849 		slab_tmp->next = NULL;
850 		*ggm->empty_tailp = slab_tmp;
851 		ggm->empty_tailp = &slab_tmp->next;
852 		++ggm->nempty;
853 	}
854 	mgt->alternate = mgt->active;		/* Might be dummy */
855 	mgt->active = slab;
856 	spin_unlock(&ggm->spin);
857 
858 	goto retry;
859 
860 	/*
861 	 * Found object, adjust statistics and return
862 	 */
863 found:
864 	++use->inuse;
865 	++use->calls;
866 	use->memuse += size;
867 	use->loosememuse += size;
868 	if (__predict_false(use->loosememuse >= KMALLOC_LOOSE_SIZE)) {
869 	    /* not MP synchronized */
870 	    type->ks_loosememuse += use->loosememuse;
871 	    use->loosememuse = 0;
872 	}
873 
874 	/*
875 	 * Handle remaining flags.  M_ZERO is typically not set because
876 	 * the inline macro deals with zeroing for constant sizes.
877 	 */
878 	if (__predict_false(flags & M_ZERO))
879 	    bzero(obj, size);
880 
881 	crit_exit();
882 	logmemory(malloc_end, NULL, type, size, flags);
883 
884 	return(obj);
885 }
886 
887 /*
888  * Free a type-stable object.  We have the base structure and can
889  * calculate the slab, but from this direction we don't know which
890  * mgt structure or list the slab might be on.
891  */
892 void
893 _kfree_obj(void *obj, struct malloc_type *type)
894 {
895 	struct kmalloc_slab *slab;
896 	struct kmalloc_use *use;
897 	globaldata_t gd;
898 	size_t	delta;
899 	size_t	i;
900 
901 	logmemory_quick(free_beg);
902 	gd = mycpu;
903 
904 	/*
905 	 * Calculate the slab from the pointer
906 	 */
907 	slab = (void *)((uintptr_t)obj & ~KMALLOC_SLAB_MASK);
908 	delta = slab->findex - slab->aindex;
909 	KKASSERT(slab->magic == KMALLOC_SLAB_MAGIC && delta != slab->ncount);
910 
911 	/*
912 	 * We can only safely adjust the statistics for the current cpu.
913 	 * Don't try to track down the original cpu.  The statistics will
914 	 * be collected and fixed up by vmstat -m  (etc).
915 	 */
916 	use = &slab->type->ks_use[gd->gd_cpuid];
917 	--use->inuse;
918 	use->memuse -= slab->objsize;
919 
920 	/*
921 	 * There MUST be free space in the slab since we are returning
922 	 * the obj to the same slab it was allocated from.
923 	 */
924 	i = atomic_fetchadd_long(&slab->findex, 1);
925 	i = i % slab->ncount;
926 	if (slab->fobjs[i] != NULL) {
927 		kprintf("_kfree_obj failure %zd/%zd/%zd\n",
928 			slab->aindex, slab->findex, slab->ncount);
929 	}
930 #ifdef KMALLOC_CHECK_DOUBLE_FREE
931 	bmap_clr(slab, obj);
932 #endif
933 	KKASSERT(slab->fobjs[i] == NULL);
934 	slab->fobjs[i] = obj;
935 	atomic_add_long(&slab->xindex, 1);	/* synchronizer */
936 
937 	logmemory_quick(free_end);
938 }
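
/*
 * Illustration of the header lookup used by _kfree_obj() above (sketch
 * only, not compiled).  Slabs are both KMALLOC_SLAB_SIZE bytes long and
 * KMALLOC_SLAB_SIZE aligned (see the kmem_slab_alloc() call in
 * _kmalloc_obj()), so masking the low bits off any object pointer
 * recovers the owning slab header, with the magic field acting as a
 * sanity check.
 */
#if 0
static struct kmalloc_slab *
obj_to_slab(void *obj)
{
	struct kmalloc_slab *slab;

	slab = (void *)((uintptr_t)obj & ~KMALLOC_SLAB_MASK);
	KKASSERT(slab->magic == KMALLOC_SLAB_MAGIC);

	return slab;
}
#endif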
939