xref: /dflybsd-src/sys/kern/kern_kmalloc.c (revision c8860c9adc386d4c6e42a14b22ea4165d1f37b72)
1  /*
2   * KERN_KMALLOC.C	- Kernel memory allocator
3   *
4   * Copyright (c) 2021 The DragonFly Project, All rights reserved.
5   *
6   * This code is derived from software contributed to The DragonFly Project
7   * by Matthew Dillon <dillon@backplane.com>
8   *
9   * Redistribution and use in source and binary forms, with or without
10   * modification, are permitted provided that the following conditions
11   * are met:
12   *
13   * 1. Redistributions of source code must retain the above copyright
14   *    notice, this list of conditions and the following disclaimer.
15   * 2. Redistributions in binary form must reproduce the above copyright
16   *    notice, this list of conditions and the following disclaimer in
17   *    the documentation and/or other materials provided with the
18   *    distribution.
19   * 3. Neither the name of The DragonFly Project nor the names of its
20   *    contributors may be used to endorse or promote products derived
21   *    from this software without specific, prior written permission.
22   *
23   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24   * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25   * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26   * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
27   * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28   * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
29   * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31   * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
33   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34   * SUCH DAMAGE.
35   */
36  
37  /*
38   * This module implements the kmalloc_obj allocator.  This is a type-stable
39   * allocator that uses the same base structures (e.g. malloc_type) plus
40   * some extensions to efficiently implement single-type zones.
41   *
42   * All memory management is zone based.  When a zone is destroyed, all of
43   * its memory is returned to the system with no fragmentation.
44   *
45   * A mini-slab allocator hangs directly off the zone structure (malloc_type).
46   * Since the object zones are single-size-only, the slab allocator is very
47   * simple and currently utilizes just two per-zone/per-cpu slabs (active and
48   * alternate) before kicking up to the per-zone cache.  Beyond that we just
49   * have the per-cpu globaldata-based 'free slab' cache to avoid unnecessary
50   * kernel_map mappings and unmappings.
51   *
52   * The advantage of this is that zones don't stomp on each other and cause
53   * excessive fragmentation in the slabs.  For example, when you umount a
54   * large tmpfs filesystem, most of its memory (all of its kmalloc_obj memory)
55   * is returned to the system.
56   */
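
/*
 * Illustrative sketch (never compiled): how a subsystem typically consumes
 * a fixed-size object zone through the entry points defined in this file.
 * The zone declaration macros and the kmalloc_obj()/kfree_obj() wrappers
 * live in sys/malloc.h; the type name and helpers below are hypothetical.
 */
#if 0
struct myobj {
	int	flags;
	char	payload[120];
};

/* hypothetical zone, declared elsewhere with objsize = sizeof(struct myobj) */
extern struct malloc_type *M_MYOBJ_ZONE;

static struct myobj *
myobj_alloc(void)
{
	/* object zones are single-size: the size must match ks_objsize */
	return (_kmalloc_obj(sizeof(struct myobj), M_MYOBJ_ZONE,
			     M_WAITOK | M_ZERO));
}

static void
myobj_free(struct myobj *obj)
{
	/* the object goes back to its originating slab; whole slabs are
	 * eventually returned to the system when the zone drains */
	_kfree_obj(obj, M_MYOBJ_ZONE);
}
#endif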
57  
58  #include "opt_vm.h"
59  
60  #include <sys/param.h>
61  #include <sys/systm.h>
62  #include <sys/kernel.h>
63  #include <sys/slaballoc.h>
64  #include <sys/mbuf.h>
65  #include <sys/vmmeter.h>
66  #include <sys/spinlock.h>
67  #include <sys/lock.h>
68  #include <sys/thread.h>
69  #include <sys/globaldata.h>
70  #include <sys/sysctl.h>
71  #include <sys/ktr.h>
72  #include <sys/malloc.h>
73  
74  #include <vm/vm.h>
75  #include <vm/vm_param.h>
76  #include <vm/vm_kern.h>
77  #include <vm/vm_extern.h>
78  #include <vm/vm_object.h>
79  #include <vm/pmap.h>
80  #include <vm/vm_map.h>
81  #include <vm/vm_page.h>
82  #include <vm/vm_pageout.h>
83  
84  #include <machine/cpu.h>
85  
86  #include <sys/spinlock2.h>
87  #include <sys/thread2.h>
88  #include <sys/exislock2.h>
89  #include <vm/vm_page2.h>
90  
91  #define MEMORY_STRING	"ptr=%p type=%p size=%lu flags=%04x"
92  #define MEMORY_ARGS	void *ptr, void *type, unsigned long size, int flags
93  
94  #if !defined(KTR_MEMORY)
95  #define KTR_MEMORY	KTR_ALL
96  #endif
97  KTR_INFO_MASTER(mem_obj);
98  KTR_INFO(KTR_MEMORY, mem_obj, malloc_beg, 0, "kmalloc_obj begin");
99  KTR_INFO(KTR_MEMORY, mem_obj, malloc_end, 1, MEMORY_STRING, MEMORY_ARGS);
100  #if 0
101  KTR_INFO(KTR_MEMORY, mem_obj, free_zero, 2, MEMORY_STRING, MEMORY_ARGS);
102  KTR_INFO(KTR_MEMORY, mem_obj, free_ovsz, 3, MEMORY_STRING, MEMORY_ARGS);
103  KTR_INFO(KTR_MEMORY, mem_obj, free_ovsz_delayed, 4, MEMORY_STRING, MEMORY_ARGS);
104  KTR_INFO(KTR_MEMORY, mem_obj, free_chunk, 5, MEMORY_STRING, MEMORY_ARGS);
105  KTR_INFO(KTR_MEMORY, mem_obj, free_request, 6, MEMORY_STRING, MEMORY_ARGS);
106  KTR_INFO(KTR_MEMORY, mem_obj, free_rem_beg, 7, MEMORY_STRING, MEMORY_ARGS);
107  KTR_INFO(KTR_MEMORY, mem_obj, free_rem_end, 8, MEMORY_STRING, MEMORY_ARGS);
108  #endif
109  KTR_INFO(KTR_MEMORY, mem_obj, free_beg, 9, "kfree_obj begin");
110  KTR_INFO(KTR_MEMORY, mem_obj, free_end, 10, "kfree_obj end");
111  
112  #define logmemory(name, ptr, type, size, flags)				\
113  	KTR_LOG(mem_obj_ ## name, ptr, type, size, flags)
114  #define logmemory_quick(name)						\
115  	KTR_LOG(mem_obj_ ## name)
116  
117  __read_frequently static int KMGDMaxFreeSlabs = KMGD_MAXFREESLABS;
118  SYSCTL_INT(_kern, OID_AUTO, kzone_cache, CTLFLAG_RW, &KMGDMaxFreeSlabs, 0, "");
119  __read_frequently static int kzone_bretire = 4;
120  SYSCTL_INT(_kern, OID_AUTO, kzone_bretire, CTLFLAG_RW, &kzone_bretire, 0, "");
121  __read_frequently static int kzone_debug;
122  SYSCTL_INT(_kern, OID_AUTO, kzone_debug, CTLFLAG_RW, &kzone_debug, 0, "");
123  
124  __read_frequently struct kmalloc_slab kslab_dummy;
125  
126  static void malloc_slab_destroy(struct malloc_type *type,
127  			struct kmalloc_slab **slabp);
128  
129  /*
130   * Cache a chain of slabs onto their respective cpu slab caches.  Any slabs
131   * which we cannot cache will be returned.
132   *
133   * free_slabs	     - Current structure may only be accessed by current cpu
134   * remote_free_slabs - Only atomic swap operations are allowed.
135   * free_count	     - Only atomic operations are allowed.
136   *
137   * If the count is sufficient to cache the entire list, NULL is returned.
138   * Otherwise the portion that was not cached is returned.
139   */
140  static __noinline
141  struct kmalloc_slab *
142  gslab_cache(struct kmalloc_slab *slab)
143  {
144  	struct kmalloc_slab *save;
145  	struct kmalloc_slab *next;
146  	struct kmalloc_slab *res;
147  	struct kmalloc_slab **resp;
148  	struct kmalloc_slab **slabp;
149  	globaldata_t rgd;
150  	size_t count;
151  	int cpuid;
152  
153  	res = NULL;
154  	resp = &res;
155  	KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
156  
157  	/*
158  	 * Given the slab list, get the cpuid and clip off as many matching
159  	 * elements as fits in the cache.
160  	 */
161  	while (slab) {
162  		cpuid = slab->orig_cpuid;
163  		rgd = globaldata_find(cpuid);
164  
165  		KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
166  		/*
167  		 * Doesn't fit in cache, put on return list.
168  		 */
169  		if (rgd->gd_kmslab.free_count >= KMGDMaxFreeSlabs) {
170  			*resp = slab;
171  			resp = &slab->next;
172  			slab = slab->next;
173  			continue;
174  		}
175  
176  		/*
177  		 * Collect.  We aren't required to match up the original cpu
178  		 * with the disposal cpu, but it's a good idea to retain
179  		 * memory locality.
180  		 *
181  		 * The slabs we collect are going into the global cache,
182  		 * remove the type association.
183  		 */
184  		KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
185  		slabp = &slab->next;
186  		count = 1;
187  		slab->type = NULL;
188  
189  		while ((next = *slabp) != NULL &&
190  		       next->orig_cpuid == cpuid &&
191  		       rgd->gd_kmslab.free_count + count < KMGDMaxFreeSlabs)
192  	        {
193  			KKASSERT(((uintptr_t)next & KMALLOC_SLAB_MASK) == 0);
194  			next->type = NULL;
195  			++count;
196  			slabp = &next->next;
197  		}
198  
199  		/*
200  		 * Safety: unhook before (next).  (next) is not included in
201  		 * the list starting at (slab) that is being prepended
202  		 * to remote_free_slabs.
203  		 */
204  		*slabp = NULL;
205  
206  		/*
207  		 * Now atomically prepend slab...*slabp to remote_free_slabs.
208  		 * Pump the count first (it's ok if the actual chain length
209  		 * races the count update).
210  		 *
211  		 * NOTE: In the loop, (save) is updated by fcmpset.
212  		 */
213  		atomic_add_long(&rgd->gd_kmslab.free_count, count);
214  		save = rgd->gd_kmslab.remote_free_slabs;
215  		for (;;) {
216  			KKASSERT(((uintptr_t)save & KMALLOC_SLAB_MASK) == 0);
217  			*slabp = save;	/* end of slab list chain to... */
218  			cpu_ccfence();
219  			if (atomic_fcmpset_ptr(
220  				&rgd->gd_kmslab.remote_free_slabs,
221  				&save, slab))
222  			{
223  				break;
224  			}
225  		}
226  
227  		/*
228  		 * Setup for next loop
229  		 */
230  		slab = next;
231  	}
232  
233  	/*
234  	 * Terminate the result list and return it
235  	 */
236  	*resp = NULL;
237  
238  	return res;
239  }
240  
241  /*
242   * May only be called on current cpu.  Pull a free slab from the
243   * pcpu cache.  If we run out, move any slabs that have built up
244   * from remote cpus.
245   *
246   * We are only allowed to swap the remote_free_slabs head, we cannot
247   * manipulate any next pointers while structures are sitting on that list.
248   */
249  static __inline
250  struct kmalloc_slab *
251  gslab_alloc(globaldata_t gd)
252  {
253  	struct kmalloc_slab *slab;
254  
255  	slab = gd->gd_kmslab.free_slabs;
256  	if (slab == NULL) {
257  		slab = atomic_swap_ptr(
258  			(volatile void **)&gd->gd_kmslab.remote_free_slabs,
259  			NULL);
260  		KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
261  	}
262  	if (slab) {
263  		gd->gd_kmslab.free_slabs = slab->next;
264  		slab->next = NULL;
265  		atomic_add_long(&gd->gd_kmslab.free_count, -1);
266  		KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
267  	}
268  	return slab;
269  }
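
/*
 * Distilled sketch (not kernel code) of the hand-off idiom used by
 * gslab_cache() and gslab_alloc() above: any cpu may prepend a chain to the
 * shared head with a CAS loop, while only the owning cpu drains the list,
 * which it does by atomically swapping the head to NULL.  All names below
 * are illustrative.
 */
#if 0
#include <stdatomic.h>
#include <stddef.h>

struct node { struct node *next; };

static _Atomic(struct node *) remote_head;

/* any cpu: publish the chain first..*lastp in one atomic prepend */
static void
producer_push(struct node *first, struct node **lastp)
{
	struct node *old = atomic_load(&remote_head);

	do {
		*lastp = old;	/* chain our tail to the current head */
	} while (!atomic_compare_exchange_weak(&remote_head, &old, first));
}

/* owning cpu only: take the entire chain; next pointers are never
 * modified while nodes sit on the shared list */
static struct node *
owner_drain(void)
{
	return (atomic_exchange(&remote_head, NULL));
}
#endif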
270  
271  void
272  malloc_mgt_init(struct malloc_type *type __unused,
273  		struct kmalloc_mgt *mgt, size_t size)
274  {
275  	size_t offset;
276  	size_t count;
277  
278  	bzero(mgt, sizeof(*mgt));
279  	spin_init(&mgt->spin, "kmmgt");
280  
281  	/*
282  	 * Allows us to avoid a conditional.  The dummy slabs are empty
283  	 * and have no objects.
284  	 */
285  	mgt->active = &kslab_dummy;
286  	mgt->alternate = &kslab_dummy;
287  	mgt->empty_tailp = &mgt->empty;
288  
289  	/*
290  	 * Figure out the count, accounting for the fobjs[] array by adding
291  	 * one pointer per object to the object size.  This initial calculation
292  	 * ignores alignment edge-cases that might require the count to be
293  	 * reduced.
294  	 */
295  	offset = offsetof(struct kmalloc_slab, fobjs[0]);
296  	count = (KMALLOC_SLAB_SIZE - offset) / (size + sizeof(void *));
297  
298  	/*
299  	 * Recalculate the offset of the first object, this time including
300  	 * the required alignment.  (size) should already be aligned.  This
301  	 * may push the last object beyond the slab so check and loop with
302  	 * a reduced count as necessary.
303  	 *
304  	 * Ok, theoretically the count should not actually change since the
305  	 * division above rounds down (that is, any misalignment is already
306  	 * not included in the count calculation).  But I'm not going to take
307  	 * any chances and check anyway as a safety in case some programmer
308  	 * changes the code above later.  This is not a time-critical code
309  	 * path.
310  	 */
311  	offset = offsetof(struct kmalloc_slab, fobjs[count]);
312  	offset = __VM_CACHELINE_ALIGN(offset);
313  
314  	while (offset + size * count > KMALLOC_SLAB_SIZE) {
315  		--count;
316  		offset = offsetof(struct kmalloc_slab, fobjs[count]);
317  		offset = __VM_CACHELINE_ALIGN(offset);
318  		KKASSERT(offset + size * count <= KMALLOC_SLAB_SIZE);
319  	}
320  
321  	mgt->slab_offset = offset;
322  	mgt->slab_count	 = count;
323  }
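
/*
 * Worked example with hypothetical numbers (for illustration only): given a
 * 128KB slab, a 128-byte header before fobjs[], and 1024-byte objects, the
 * initial estimate is
 *
 *	count  = (131072 - 128) / (1024 + 8) = 126
 *
 * and the recomputed, cache-line-aligned first-object offset is
 *
 *	offset = __VM_CACHELINE_ALIGN(128 + 126 * 8) = 1152
 *
 * (assuming 64-byte cache lines).  Since 1152 + 126 * 1024 = 130176 is
 * <= 131072 the count does not have to be reduced, and the slab holds
 * 126 objects.
 */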
324  
325  void
326  malloc_mgt_relocate(struct kmalloc_mgt *src, struct kmalloc_mgt *dst)
327  {
328  	struct kmalloc_slab **slabp;
329  
330  	spin_init(&dst->spin, "kmmgt");
331  	slabp = &dst->empty;
332  
333  	while (*slabp) {
334  		slabp = &(*slabp)->next;
335  	}
336  	dst->empty_tailp = slabp;
337  }
338  
339  void
340  malloc_mgt_uninit(struct malloc_type *type, struct kmalloc_mgt *mgt)
341  {
342  	if (mgt->active != &kslab_dummy)
343  		malloc_slab_destroy(type, &mgt->active);
344  	mgt->active = NULL;
345  
346  	if (mgt->alternate != &kslab_dummy)
347  		malloc_slab_destroy(type, &mgt->alternate);
348  	mgt->alternate = NULL;
349  
350  	malloc_slab_destroy(type, &mgt->partial);
351  	malloc_slab_destroy(type, &mgt->full);
352  	malloc_slab_destroy(type, &mgt->empty);
353  	mgt->npartial = 0;
354  	mgt->nfull = 0;
355  	mgt->nempty = 0;
356  	mgt->empty_tailp = &mgt->empty;
357  
358  	spin_uninit(&mgt->spin);
359  }
360  
361  /*
362   * Destroy a list of slabs.  Attempt to cache the slabs on their
363   * originating (possibly remote) cpus.  This allows slabs that were
364   * operating on a particular cpu to be disposed of back to that same cpu.
365   */
366  static void
367  malloc_slab_destroy(struct malloc_type *type, struct kmalloc_slab **slabp)
368  {
369  	struct kmalloc_slab *slab;
370  	struct kmalloc_slab *base;
371  	struct kmalloc_slab **basep;
372  	size_t delta;
373  
374  	if (*slabp == NULL)
375  		return;
376  
377  	/*
378  	 * Collect all slabs that can actually be destroyed, complain
379  	 * about the rest.
380  	 */
381  	base = NULL;
382  	basep = &base;
383  	while ((slab = *slabp) != NULL) {
384  		KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
385  
386  		delta = slab->findex - slab->aindex;
387  		if (delta == slab->ncount) {
388  			*slabp = slab->next;	/* unlink */
389  			*basep = slab;		/* link into base list */
390  			basep = &slab->next;
391  		} else {
392  			kprintf("%s: slab %p %zd objects "
393  				"were still allocated\n",
394  				type->ks_shortdesc, slab,
395  				slab->ncount - delta);
396  			/* leave link intact and iterate */
397  			slabp = &slab->next;
398  		}
399  	}
400  
401  	/*
402  	 * Terminate the base list of slabs that can be destroyed,
403  	 * then cache as many of them as possible.
404  	 */
405  	*basep = NULL;
406  	if (base == NULL)
407  		return;
408  	base = gslab_cache(base);
409  
410  	/*
411  	 * Destroy the remainder
412  	 */
413  	while ((slab = base) != NULL) {
414  		base = slab->next;
415  		slab->next = (void *)(uintptr_t)-1;
416  		kmem_slab_free(slab, KMALLOC_SLAB_SIZE);
417  	}
418  }
419  
420  /*
421   * Objects can be freed to an empty slab at any time, causing it to no
422   * longer be empty.  To improve performance, we do not try to pro-actively
423   * move such slabs to the appropriate partial or full list upon kfree_obj().
424   * Instead, a poller comes along and tests the slabs on the empty list
425   * periodically, and moves slabs that are no longer empty to the appropriate
426   * list.
427   *
428   * --
429   *
430   * Poll a limited number of slabs on the empty list and move them
431   * to the appropriate full or partial list.  Slabs left on the empty
432   * list are rotated to the tail.
433   *
434   * If gcache is non-zero this function will try to place full slabs into
435   * the globaldata cache, if it isn't already too full.
436   *
437   * The mgt is spin-locked
438   *
439   * Returns non-zero if the ggm updates possibly made slabs available for
440   * allocation.
441   */
442  static int
443  malloc_mgt_poll_empty_locked(struct kmalloc_mgt *ggm, int count)
444  {
445  	struct kmalloc_slab *marker;
446  	struct kmalloc_slab *slab;
447  	size_t delta;
448  	int got_something;
449  
450  	if (ggm->empty == NULL)
451  		return 0;
452  
453  	got_something = 0;
454  	marker = ggm->empty;
455  
456  	while (count-- && (slab = ggm->empty) != NULL) {
457  		/*
458  		 * Unlink from empty
459  		 */
460  		ggm->empty = slab->next;
461  		slab->next = NULL;
462  		--ggm->nempty;
463  		if (ggm->empty_tailp == &slab->next)
464  			ggm->empty_tailp = &ggm->empty;
465  
466  		/*
467  		 * Check partial, full, and empty.  We rotate
468  		 * empty entries to the end of the empty list.
469  		 *
470  		 * NOTE: For a fully-freeable slab we also have
471  		 *	 to check xindex.
472  		 */
473  		delta = slab->findex - slab->aindex;
474  		if (delta == slab->ncount) {
475  			/*
476  			 * Stuff into the full list.  This requires setting
477  			 * the exis sequence number via exis_terminate().
478  			 */
479  			KKASSERT(slab->next == NULL);
480  			exis_terminate(&slab->exis);
481  			slab->next = ggm->full;
482  			ggm->full = slab;
483  			got_something = 1;
484  			++ggm->nfull;
485  		} else if (delta) {
486  			/*
487  			 * Partially full
488  			 */
489  			KKASSERT(slab->next == NULL);
490  			slab->next = ggm->partial;
491  			ggm->partial = slab;
492  			got_something = 1;
493  			++ggm->npartial;
494  		} else {
495  			/*
496  			 * Empty
497  			 */
498  			KKASSERT(slab->next == NULL);
499  			*ggm->empty_tailp = slab;
500  			ggm->empty_tailp = &slab->next;
501  			++ggm->nempty;
502  			if (ggm->empty == marker)
503  				break;
504  		}
505  	}
506  	return got_something;
507  }
508  
509  /*
510   * Called once a second with the zone interlocked against destruction.
511   *
512   * Returns non-zero to tell the caller to iterate to the next type,
513   * else the caller should stay on the current type.
514   */
515  int
516  malloc_mgt_poll(struct malloc_type *type)
517  {
518  	struct kmalloc_mgt *ggm;
519  	struct kmalloc_slab *slab;
520  	struct kmalloc_slab **slabp;
521  	struct kmalloc_slab *base;
522  	struct kmalloc_slab **basep;
523  	size_t delta;
524  	int donext;
525  	int count;
526  	int retired;
527  
528  	if ((type->ks_flags & KSF_OBJSIZE) == 0)
529  		return 1;
530  
531  	/*
532  	 * Check the partial, full, and empty lists for full freeable slabs
533  	 * in excess of the desired caching count.
534  	 */
535  	ggm = &type->ks_mgt;
536  	spin_lock(&ggm->spin);
537  
538  	/*
539  	 * Move empty slabs to partial or full as appropriate.  We
540  	 * don't bother checking partial slabs to see if they are full
541  	 * for now.
542  	 */
543  	malloc_mgt_poll_empty_locked(ggm, 16);
544  
545  	/*
546  	 * Ok, clean out some of the full slabs from the full list
547  	 */
548  	base = NULL;
549  	basep = &base;
550  	count = ggm->nfull;
551  	retired = 0;
552  	cpu_ccfence();
553  
554  	if (count > KMALLOC_MAXFREEMAGS) {
555  		slabp = &ggm->full;
556  		count -= KMALLOC_MAXFREEMAGS;
557  		if (count > 16)
558  			count = 16;
559  
560  		while (count && (slab = *slabp) != NULL) {
561  			delta = slab->findex - slab->aindex;
562  			if (delta == slab->ncount &&
563  			    slab->xindex == slab->findex &&
564  			    exis_freeable(&slab->exis))
565  			{
566  				/*
567  				 * (1) No allocated entries in the structure,
568  				 *     this should always be the case from the
569  				 *     full list.
570  				 *
571  				 * (2) kfree_obj() has fully completed.  Just
572  				 *     checking findex is not sufficient since
573  				 *     it is incremented to reserve the slot
574  				 *     before the element is loaded into it.
575  				 *
576  				 * (3) The slab has been on the full list for
577  				 *     a sufficient number of EXIS
578  				 *     pseudo_ticks, for type-safety.
579  				 */
580  				*slabp = slab->next;
581  				*basep = slab;
582  				basep = &slab->next;
583  				--ggm->nfull;
584  				++ggm->gcache_count;
585  				if (++retired == kzone_bretire)
586  					break;
587  			} else {
588  				slabp = &slab->next;
589  			}
590  			--count;
591  		}
592  		*basep = NULL;	/* terminate the retirement list */
593  		donext = (*slabp == NULL);
594  	} else {
595  		donext = 1;
596  	}
597  	spin_unlock(&ggm->spin);
598  
599  	/*
600  	 * Clean out any slabs that we couldn't stow in the globaldata cache.
601  	 */
602  	if (retired) {
603  		if (kzone_debug) {
604  			kprintf("kmalloc_poll: %s retire %d\n",
605  				type->ks_shortdesc, retired);
606  		}
607  		base = gslab_cache(base);
608  		while ((slab = base) != NULL) {
609  			base = base->next;
610  			slab->next = NULL;
611  			kmem_slab_free(slab, KMALLOC_SLAB_SIZE);
612  		}
613  	}
614  
615  	return donext;
616  }
617  
618  /*
619   * Optional bitmap double-free check.  This is typically turned on by
620   * default for safety (sys/_malloc.h)
621   */
622  #ifdef KMALLOC_CHECK_DOUBLE_FREE
623  
624  static __inline void
625  bmap_set(struct kmalloc_slab *slab, void *obj)
626  {
627  	uint64_t *ptr;
628  	uint64_t mask;
629  	size_t i = (((uintptr_t)obj & KMALLOC_SLAB_MASK) - slab->offset) /
630  		   slab->objsize;
631  
632  	ptr = &slab->bmap[i >> 6];
633  	mask = (uint64_t)1U << (i & 63);
634  	KKASSERT(i < slab->ncount && (*ptr & mask) == 0);
635  	atomic_set_64(ptr, mask);
636  }
637  
638  static __inline void
639  bmap_clr(struct kmalloc_slab *slab, void *obj)
640  {
641  	uint64_t *ptr;
642  	uint64_t mask;
643  	size_t i = (((uintptr_t)obj & KMALLOC_SLAB_MASK) - slab->offset) /
644  		   slab->objsize;
645  
646  	ptr = &slab->bmap[i >> 6];
647  	mask = (uint64_t)1U << (i & 63);
648  	KKASSERT(i < slab->ncount && (*ptr & mask) != 0);
649  	atomic_clear_64(ptr, mask);
650  }
651  
652  #endif
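
/*
 * Worked example with hypothetical numbers (for illustration only): for a
 * slab with offset 1152 and objsize 1024, an object at slab-relative byte
 * offset 72832 maps to
 *
 *	i    = (72832 - 1152) / 1024 = 70
 *	word = i >> 6 = 1,  bit = i & 63 = 6,  mask = 1 << 6
 *
 * so bmap_set() asserts bit 6 of bmap[1] is clear and sets it, and
 * bmap_clr() asserts it is set and clears it, catching double frees.
 */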
653  
654  /*
655   * Cleanup a mgt structure.
656   *
657   * Always called from the current cpu, so we can manipulate the various
658   * lists freely.
659   *
660   * WARNING: findex can race, fobjs[n] is updated after findex is incremented,
661   *	    and 'full'
662   */
663  #if 0
664  static void
665  mgt_cleanup(struct kmalloc_mgt *mgt)
666  {
667  #if 0
668  	struct kmalloc_slab **slabp;
669  	struct kmalloc_slab *slab;
670  	size_t delta;
671  	size_t total;
672  #endif
673  }
674  #endif
675  
676  #ifdef SLAB_DEBUG
677  void *
678  _kmalloc_obj_debug(unsigned long size, struct malloc_type *type, int flags,
679  	      const char *file, int line)
680  #else
681  void *
682  _kmalloc_obj(unsigned long size, struct malloc_type *type, int flags)
683  #endif
684  {
685  	struct kmalloc_slab *slab;
686  	struct kmalloc_use *use;
687  	struct kmalloc_mgt *mgt;
688  	struct kmalloc_mgt *ggm;
689  	globaldata_t gd;
690  	void *obj;
691  	size_t delta;
692  
693  	/*
694  	 * Check limits
695  	 */
696  	while (__predict_false(type->ks_loosememuse >= type->ks_limit)) {
697  		long ttl;
698  		int n;
699  
700  		for (n = ttl = 0; n < ncpus; ++n)
701  			ttl += type->ks_use[n].memuse;
702  		type->ks_loosememuse = ttl;	/* not MP synchronized */
703  		if ((ssize_t)ttl < 0)		/* deal with occasional race */
704  			ttl = 0;
705  		if (ttl >= type->ks_limit) {
706  			if (flags & M_NULLOK)
707  				return(NULL);
708  			panic("%s: malloc limit exceeded", type->ks_shortdesc);
709  		}
710  	}
711  
712  	/*
713  	 * Setup
714  	 */
715  	crit_enter();
716  	logmemory_quick(malloc_beg);
717  	KKASSERT(size == type->ks_objsize);
718  	gd = mycpu;
719  	use = &type->ks_use[gd->gd_cpuid];
720  
721  retry:
722  	/*
723  	 * Check active
724  	 *
725  	 * NOTE: obj can be NULL if racing a _kfree_obj().
726  	 */
727  	mgt = &use->mgt;
728  	slab = mgt->active;			/* Might be dummy */
729  	delta = slab->findex - slab->aindex;
730  	if (__predict_true(delta != 0)) {	/* Cannot be dummy */
731  		size_t i;
732  
733  		i = slab->aindex % slab->ncount;
734  		obj = slab->fobjs[i];
735  		if (__predict_true(obj != NULL)) {
736  			slab->fobjs[i] = NULL;
737  			++slab->aindex;
738  #ifdef KMALLOC_CHECK_DOUBLE_FREE
739  			bmap_set(slab, obj);
740  #endif
741  			goto found;
742  		}
743  	}
744  
745  	/*
746  	 * Check alternate.  If we find something, swap it with
747  	 * the active.
748  	 *
749  	 * NOTE: It is possible for exhausted slabs to recover entries
750  	 *	 via _kfree_obj(), so we just keep swapping until both
751  	 *	 are empty.
752  	 *
753  	 * NOTE: obj can be NULL if racing a _kfree_obj().
754  	 */
755  	slab = mgt->alternate;			/* Might be dummy */
756  	delta = slab->findex - slab->aindex;
757  	if (__predict_true(delta != 0)) {	/* Cannot be dummy */
758  		size_t i;
759  
760  		mgt->alternate = mgt->active;
761  		mgt->active = slab;
762  		i = slab->aindex % slab->ncount;
763  		obj = slab->fobjs[i];
764  		if (__predict_true(obj != NULL)) {
765  			slab->fobjs[i] = NULL;
766  			++slab->aindex;
767  #ifdef KMALLOC_CHECK_DOUBLE_FREE
768  			bmap_set(slab, obj);
769  #endif
770  			goto found;
771  		}
772  	}
773  
774  	/*
775  	 * Rotate a slab from the global mgt into the pcpu mgt.
776  	 *
777  	 *	G(partial, full) -> active -> alternate -> G(empty)
778  	 *
779  	 * We try to exhaust partials first to reduce fragmentation, then
780  	 * dig into the fulls.
781  	 */
782  	ggm = &type->ks_mgt;
783  	spin_lock(&ggm->spin);
784  
785  rerotate:
786  	if (ggm->partial) {
787  		slab = mgt->alternate;		/* Might be dummy */
788  		mgt->alternate = mgt->active;	/* Might be dummy */
789  		mgt->active = ggm->partial;
790  		ggm->partial = ggm->partial->next;
791  		mgt->active->next = NULL;
792  		--ggm->npartial;
793  		if (slab != &kslab_dummy) {
794  			KKASSERT(slab->next == NULL);
795  			*ggm->empty_tailp = slab;
796  			ggm->empty_tailp = &slab->next;
797  			++ggm->nempty;
798  		}
799  		spin_unlock(&ggm->spin);
800  		goto retry;
801  	}
802  
803  	if (ggm->full) {
804  		slab = mgt->alternate;		/* Might be dummy */
805  		mgt->alternate = mgt->active;	/* Might be dummy */
806  		mgt->active = ggm->full;
807  		ggm->full = ggm->full->next;
808  		mgt->active->next = NULL;
809  		--ggm->nfull;
810  		exis_setlive(&mgt->active->exis);
811  		if (slab != &kslab_dummy) {
812  			KKASSERT(slab->next == NULL);
813  			*ggm->empty_tailp = slab;
814  			ggm->empty_tailp = &slab->next;
815  			++ggm->nempty;
816  		}
817  		spin_unlock(&ggm->spin);
818  		goto retry;
819  	}
820  
821  	/*
822  	 * We couldn't find anything, scan a limited number of empty entries
823  	 * looking for something with objects.  This will also free excess
824  	 * full lists that meet requirements.
825  	 */
826  	if (malloc_mgt_poll_empty_locked(ggm, 16))
827  		goto rerotate;
828  
829  	/*
830  	 * Absolutely nothing is available, allocate a new slab and
831  	 * rotate it in.
832  	 *
833  	 * Try to get a slab from the global pcpu slab cache (very cheap).
834  	 * If that fails, allocate a new slab (very expensive).
835  	 */
836  	spin_unlock(&ggm->spin);
837  
838  	if (gd->gd_kmslab.free_count == 0 || (slab = gslab_alloc(gd)) == NULL) {
839  		slab = kmem_slab_alloc(KMALLOC_SLAB_SIZE, KMALLOC_SLAB_SIZE,
840  				       M_WAITOK);
841  	}
842  
843  	bzero(slab, sizeof(*slab));
844  	KKASSERT(offsetof(struct kmalloc_slab, fobjs[use->mgt.slab_count]) <=
845  		 use->mgt.slab_offset);
846  
847  	obj = (char *)slab + use->mgt.slab_offset;
848  	slab->type = type;
849  	slab->orig_cpuid = gd->gd_cpuid;
850  	slab->ncount = use->mgt.slab_count;
851  	slab->offset = use->mgt.slab_offset;
852  	slab->objsize = type->ks_objsize;
853  	slab->aindex = 0;
854  	slab->findex = slab->ncount;
855  	slab->xindex = slab->ncount;
856  	for (delta = 0; delta < slab->ncount; ++delta) {
857  		slab->fobjs[delta] = obj;
858  		obj = (char *)obj + type->ks_objsize;
859  	}
860  
861  	/*
862  	 * Sanity check, assert that the last byte of last object is still
863  	 * in the slab.
864  	 */
865  #if 0
866  	KKASSERT(((((uintptr_t)obj - 1) ^ (uintptr_t)slab) &
867  		  ~KMALLOC_SLAB_MASK) == 0);
868  #endif
869  	KASSERT(((((uintptr_t)obj - 1) ^ (uintptr_t)slab) &
870  		  ~KMALLOC_SLAB_MASK) == 0, ("SLAB %p ncount %zd objsize %zd obj=%p\n", slab, slab->ncount, slab->objsize, obj));
871  	slab->magic = KMALLOC_SLAB_MAGIC;
872  	spin_init(&slab->spin, "kmslb");
873  
874  	/*
875  	 * Rotate it in, then retry.
876  	 *
877  	 *	(NEW)slab -> active -> alternate -> G(empty)
878  	 */
879  	spin_lock(&ggm->spin);
880  	if (mgt->alternate != &kslab_dummy) {
881  		struct kmalloc_slab *slab_tmp;
882  
883  		slab_tmp = mgt->alternate;
884  		slab_tmp->next = NULL;
885  		*ggm->empty_tailp = slab_tmp;
886  		ggm->empty_tailp = &slab_tmp->next;
887  		++ggm->nempty;
888  	}
889  	mgt->alternate = mgt->active;		/* Might be dummy */
890  	mgt->active = slab;
891  	spin_unlock(&ggm->spin);
892  
893  	goto retry;
894  
895  	/*
896  	 * Found object, adjust statistics and return
897  	 */
898  found:
899  	++use->inuse;
900  	++use->calls;
901  	use->memuse += size;
902  	use->loosememuse += size;
903  	if (__predict_false(use->loosememuse >= KMALLOC_LOOSE_SIZE)) {
904  	    /* not MP synchronized */
905  	    type->ks_loosememuse += use->loosememuse;
906  	    use->loosememuse = 0;
907  	}
908  
909  	/*
910  	 * Handle remaining flags.  M_ZERO is typically not set because
911  	 * the inline macro deals with zeroing for constant sizes.
912  	 */
913  	if (__predict_false(flags & M_ZERO))
914  	    bzero(obj, size);
915  
916  	crit_exit();
917  	logmemory(malloc_end, NULL, type, size, flags);
918  
919  	return(obj);
920  }
921  
922  /*
923   * Free a type-stable object.  We have the base structure and can
924   * calculate the slab, but from this direction we don't know which
925   * mgt structure or list the slab might be on.
926   */
927  void
928  _kfree_obj(void *obj, struct malloc_type *type)
929  {
930  	struct kmalloc_slab *slab;
931  	struct kmalloc_use *use;
932  	globaldata_t gd;
933  	size_t	delta;
934  	size_t	i;
935  
936  	logmemory_quick(free_beg);
937  	gd = mycpu;
938  
939  	/*
940  	 * Calculate the slab from the pointer
941  	 */
942  	slab = (void *)((uintptr_t)obj & ~KMALLOC_SLAB_MASK);
943  	delta = slab->findex - slab->aindex;
944  	KKASSERT(slab->magic == KMALLOC_SLAB_MAGIC && delta != slab->ncount);
945  
946  	/*
947  	 * We can only safely adjust the statistics for the current cpu.
948  	 * Don't try to track down the original cpu.  The statistics will
949  	 * be collected and fixed up by vmstat -m (etc).
950  	 */
951  	use = &slab->type->ks_use[gd->gd_cpuid];
952  	--use->inuse;
953  	use->memuse -= slab->objsize;
954  
955  	/*
956  	 * There MUST be free space in the slab since we are returning
957  	 * the obj to the same slab it was allocated from.
958  	 */
959  	i = atomic_fetchadd_long(&slab->findex, 1);
960  	i = i % slab->ncount;
961  	if (slab->fobjs[i] != NULL) {
962  		kprintf("_kfree_obj failure %zd/%zd/%zd\n",
963  			slab->aindex, slab->findex, slab->ncount);
964  	}
965  #ifdef KMALLOC_CHECK_DOUBLE_FREE
966  	bmap_clr(slab, obj);
967  #endif
968  	KKASSERT(slab->fobjs[i] == NULL);
969  	slab->fobjs[i] = obj;
970  	atomic_add_long(&slab->xindex, 1);	/* synchronizer */
971  
972  	logmemory_quick(free_end);
973  }
974