xref: /dflybsd-src/lib/libc/stdlib/nmalloc.c (revision 2462e051dc96823aff8a3047860722add2807852)
1 /*
2  * NMALLOC.C	- New Malloc (ported from kernel slab allocator)
3  *
4  * Copyright (c) 2003,2004,2009,2010 The DragonFly Project. All rights reserved.
5  *
6  * This code is derived from software contributed to The DragonFly Project
7  * by Matthew Dillon <dillon@backplane.com> and by
8  * Venkatesh Srinivas <me@endeavour.zapto.org>.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  *
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in
18  *    the documentation and/or other materials provided with the
19  *    distribution.
20  * 3. Neither the name of The DragonFly Project nor the names of its
21  *    contributors may be used to endorse or promote products derived
22  *    from this software without specific, prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
26  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
27  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
28  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
29  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
30  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
32  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
33  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
34  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  *
37  * $Id: nmalloc.c,v 1.37 2010/07/23 08:20:35 vsrinivas Exp $
38  */
39 /*
40  * This module implements a slab allocator drop-in replacement for the
41  * libc malloc().
42  *
43  * A slab allocator reserves a ZONE for each chunk size, then lays the
44  * chunks out in an array within the zone.  Allocation and deallocation
45  * is nearly instantaneous, and overhead losses are limited to a fixed
46  * worst-case amount.
47  *
48  * The slab allocator does not have to pre-initialize the list of
49  * free chunks for each zone, and the underlying VM will not be
50  * touched at all beyond the zone header until an actual allocation
51  * needs it.
52  *
53  * Slab management and locking is done on a per-zone basis.
54  *
55  *	Alloc Size	Chunking        Number of zones
56  *	0-127		8		16
57  *	128-255		16		8
58  *	256-511		32		8
59  *	512-1023	64		8
60  *	1024-2047	128		8
61  *	2048-4095	256		8
62  *	4096-8191	512		8
63  *	8192-16383	1024		8
64  *	16384-32767	2048		8
65  *
66  *	Allocations >= ZoneLimit (16K) go directly to mmap and a hash table
 67  *	is used to locate them on free().  One- and two-page allocations use the
68  *	zone mechanic to avoid excessive mmap()/munmap() calls.
69  *
70  *			   API FEATURES AND SIDE EFFECTS
71  *
72  *    + power-of-2 sized allocations up to a page will be power-of-2 aligned.
73  *	Above that power-of-2 sized allocations are page-aligned.  Non
74  *	power-of-2 sized allocations are aligned the same as the chunk
75  *	size for their zone.
76  *    + malloc(0) returns a special non-NULL value
77  *    + ability to allocate arbitrarily large chunks of memory
78  *    + realloc will reuse the passed pointer if possible, within the
79  *	limitations of the zone chunking.
80  *
81  * Multithreaded enhancements for small allocations introduced August 2010.
82  * These are in the spirit of 'libumem'. See:
83  *	Bonwick, J.; Adams, J. (2001). "Magazines and Vmem: Extending the
84  *	slab allocator to many CPUs and arbitrary resources". In Proc. 2001
85  *	USENIX Technical Conference. USENIX Association.
86  *
87  * Oversized allocations employ the BIGCACHE mechanic whereby large
88  * allocations may be handed significantly larger buffers, allowing them
89  * to avoid mmap/munmap operations even through significant realloc()s.
90  * The excess space is only trimmed if too many large allocations have been
91  * given this treatment.
92  *
93  * TUNING
94  *
95  * The value of the environment variable MALLOC_OPTIONS is a character string
96  * containing various flags to tune nmalloc.
97  *
98  * 'U'   / ['u']	Generate / do not generate utrace entries for ktrace(1)
99  *			This will generate utrace events for all malloc,
 100  *			realloc, and free calls.  There are tools which can
 101  *			interpret these logs to replay an allocation pattern
 102  *			(mtrplay) or to graph the heap structure (mtrgraph).
103  * 'Z'   / ['z']	Zero out / do not zero all allocations.
104  *			Each new byte of memory allocated by malloc, realloc, or
105  *			reallocf will be initialized to 0. This is intended for
106  *			debugging and will affect performance negatively.
 107  * 'H'	/  ['h']	Pass / do not pass a hint (madvise) to the kernel about
 108  *			pages that are no longer used by the allocation functions.
109  */
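
    /*
     * Example (illustrative only, not part of this file): a debugging run
     * which zeroes all new allocations and records malloc/realloc/free
     * activity for ktrace(1) could be started with something like:
     *
     *	$ env MALLOC_OPTIONS=ZU ktrace ./a.out; kdump
     */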
110 
111 /* cc -shared -fPIC -g -O -I/usr/src/lib/libc/include -o nmalloc.so nmalloc.c */
112 
113 #include "namespace.h"
114 #include <sys/param.h>
115 #include <sys/types.h>
116 #include <sys/mman.h>
117 #include <sys/queue.h>
118 #include <sys/ktrace.h>
119 #include <stdio.h>
120 #include <stdint.h>
121 #include <stdlib.h>
122 #include <stdarg.h>
123 #include <stddef.h>
124 #include <unistd.h>
125 #include <string.h>
126 #include <fcntl.h>
127 #include <errno.h>
128 #include <pthread.h>
129 #include <machine/atomic.h>
130 #include "un-namespace.h"
131 
132 #include "libc_private.h"
133 #include "spinlock.h"
134 
135 void __free(void *);
136 void *__malloc(size_t);
137 void *__calloc(size_t, size_t);
138 void *__realloc(void *, size_t);
139 void *__aligned_alloc(size_t, size_t);
140 int __posix_memalign(void **, size_t, size_t);
141 
142 /*
143  * Linked list of large allocations
144  */
145 typedef struct bigalloc {
146 	struct bigalloc *next;	/* hash link */
147 	void	*base;		/* base pointer */
148 	u_long	active;		/* bytes active */
149 	u_long	bytes;		/* bytes allocated */
150 } *bigalloc_t;
151 
152 /*
153  * Note that any allocations which are exact multiples of PAGE_SIZE, or
154  * which are >= ZALLOC_ZONE_LIMIT, will fall through to the kmem subsystem.
155  */
156 #define ZALLOC_ZONE_LIMIT	(16 * 1024)	/* max slab-managed alloc */
157 #define ZALLOC_MIN_ZONE_SIZE	(32 * 1024)	/* minimum zone size */
158 #define ZALLOC_MAX_ZONE_SIZE	(128 * 1024)	/* maximum zone size */
159 #define ZALLOC_ZONE_SIZE	(64 * 1024)
160 #define ZALLOC_SLAB_MAGIC	0x736c6162	/* magic sanity */
161 #define ZALLOC_SLAB_SLIDE	20		/* L1-cache skip */
162 
163 #if ZALLOC_ZONE_LIMIT == 16384
164 #define NZONES			72
165 #elif ZALLOC_ZONE_LIMIT == 32768
166 #define NZONES			80
167 #else
168 #error "I couldn't figure out NZONES"
169 #endif
170 
171 /*
172  * Chunk structure for free elements
173  */
174 typedef struct slchunk {
175 	struct slchunk *c_Next;
176 } *slchunk_t;
177 
178 /*
179  * The IN-BAND zone header is placed at the beginning of each zone.
180  */
181 struct slglobaldata;
182 
183 typedef struct slzone {
184 	int32_t		z_Magic;	/* magic number for sanity check */
185 	int		z_NFree;	/* total free chunks / ualloc space */
186 	struct slzone *z_Next;		/* ZoneAry[] link if z_NFree non-zero */
187 	int		z_NMax;		/* maximum free chunks */
188 	char		*z_BasePtr;	/* pointer to start of chunk array */
189 	int		z_UIndex;	/* current initial allocation index */
190 	int		z_UEndIndex;	/* last (first) allocation index */
191 	int		z_ChunkSize;	/* chunk size for validation */
192 	int		z_FirstFreePg;	/* chunk list on a page-by-page basis */
193 	int		z_ZoneIndex;
194 	int		z_Flags;
195 	struct slchunk *z_PageAry[ZALLOC_ZONE_SIZE / PAGE_SIZE];
196 } *slzone_t;
197 
198 typedef struct slglobaldata {
199 	spinlock_t	Spinlock;
200 	slzone_t	ZoneAry[NZONES];/* linked list of zones NFree > 0 */
201 	int		JunkIndex;
202 } *slglobaldata_t;
203 
204 #define SLZF_UNOTZEROD		0x0001
205 
206 #define FASTSLABREALLOC		0x02
207 
208 /*
209  * Misc constants.  Note that allocations that are exact multiples of
210  * PAGE_SIZE, or exceed the zone limit, fall through to the kmem module.
211  * IN_SAME_PAGE_MASK is used to sanity-check the per-page free lists.
212  */
213 #define MIN_CHUNK_SIZE		8		/* in bytes */
214 #define MIN_CHUNK_MASK		(MIN_CHUNK_SIZE - 1)
215 #define IN_SAME_PAGE_MASK	(~(intptr_t)PAGE_MASK | MIN_CHUNK_MASK)
216 
217 /*
218  * WARNING: A limited number of spinlocks are available, BIGXSIZE should
 219  *	    not be larger than 64.
220  */
221 #define BIGHSHIFT	10			/* bigalloc hash table */
222 #define BIGHSIZE	(1 << BIGHSHIFT)
223 #define BIGHMASK	(BIGHSIZE - 1)
224 #define BIGXSIZE	(BIGHSIZE / 16)		/* bigalloc lock table */
225 #define BIGXMASK	(BIGXSIZE - 1)
226 
227 /*
228  * BIGCACHE caches oversized allocations.  Note that a linear search is
229  * performed, so do not make the cache too large.
230  *
231  * BIGCACHE will garbage-collect excess space when the excess exceeds the
232  * specified value.  A relatively large number should be used here because
233  * garbage collection is expensive.
234  */
235 #define BIGCACHE	16
236 #define BIGCACHE_MASK	(BIGCACHE - 1)
237 #define BIGCACHE_LIMIT	(1024 * 1024)		/* size limit */
238 #define BIGCACHE_EXCESS	(16 * 1024 * 1024)	/* garbage collect */
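
    /*
     * With the values above only oversized allocations up to 1MB are ever
     * cached, at most 16 cache slots are scanned per lookup, and
     * handle_excess_big() does not start trimming until the aggregate
     * untrimmed excess exceeds 16MB.
     */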
239 
240 #define SAFLAG_ZERO	0x0001
241 #define SAFLAG_PASSIVE	0x0002
242 
243 /*
244  * Thread control
245  */
246 
247 #define arysize(ary)	(sizeof(ary)/sizeof((ary)[0]))
248 
249 #define MASSERT(exp)	do { if (__predict_false(!(exp)))	\
250 				_mpanic("assertion: %s in %s",	\
251 				#exp, __func__);		\
252 			    } while (0)
253 
254 /*
255  * Magazines
256  */
257 
258 #define M_MAX_ROUNDS	64
259 #define M_ZONE_ROUNDS	64
260 #define M_LOW_ROUNDS	32
261 #define M_INIT_ROUNDS	8
262 #define M_BURST_FACTOR  8
263 #define M_BURST_NSCALE	2
264 
265 #define M_BURST		0x0001
266 #define M_BURST_EARLY	0x0002
267 
268 struct magazine {
269 	SLIST_ENTRY(magazine) nextmagazine;
270 
271 	int		flags;
272 	int		capacity;	/* Max rounds in this magazine */
273 	int		rounds;		/* Current number of free rounds */
274 	int		burst_factor;	/* Number of blocks to prefill with */
275 	int		low_factor;	/* Free till low_factor from full mag */
276 	void		*objects[M_MAX_ROUNDS];
277 };
278 
279 SLIST_HEAD(magazinelist, magazine);
280 
281 static spinlock_t zone_mag_lock;
282 static spinlock_t depot_spinlock;
283 static struct magazine zone_magazine = {
284 	.flags = M_BURST | M_BURST_EARLY,
285 	.capacity = M_ZONE_ROUNDS,
286 	.rounds = 0,
287 	.burst_factor = M_BURST_FACTOR,
288 	.low_factor = M_LOW_ROUNDS
289 };
290 
291 #define MAGAZINE_FULL(mp)	(mp->rounds == mp->capacity)
292 #define MAGAZINE_NOTFULL(mp)	(mp->rounds < mp->capacity)
293 #define MAGAZINE_EMPTY(mp)	(mp->rounds == 0)
294 #define MAGAZINE_NOTEMPTY(mp)	(mp->rounds != 0)
295 
296 /*
 297  * Each thread has a pair of magazines per size-class (NZONES).
 298  * The loaded magazine supports immediate allocations; the previous
 299  * magazine will either be full or empty and can be swapped in as needed.
300  */
301 typedef struct magazine_pair {
302 	struct magazine	*loaded;
303 	struct magazine	*prev;
304 } magazine_pair;
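
    /*
     * Allocation fast path (see mtmagazine_alloc()): pop a round from the
     * loaded magazine; if it is empty and prev is full, swap the two and
     * retry; otherwise push prev onto the depot's empty list and try to
     * pull a full magazine from the depot.  mtmagazine_free() runs the
     * same cycle in the opposite direction.
     */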
305 
306 /* A depot is a collection of magazines for a single zone. */
307 typedef struct magazine_depot {
308 	struct magazinelist full;
309 	struct magazinelist empty;
310 	spinlock_t	lock;
311 } magazine_depot;
312 
313 typedef struct thr_mags {
314 	magazine_pair	mags[NZONES];
315 	struct magazine	*newmag;
316 	int		init;
317 } thr_mags;
318 
319 /*
 320  * With this attribute set, accessing this variable does not require a
 321  * function call even when the code is compiled -fPIC.
322  *
323  * Must be empty for libc_rtld (similar to __thread).
324  */
325 #ifdef __LIBC_RTLD
326 #define TLS_ATTRIBUTE
327 #else
328 #define TLS_ATTRIBUTE __attribute__ ((tls_model ("initial-exec")))
329 #endif
330 
331 static __thread thr_mags thread_mags TLS_ATTRIBUTE;
332 static pthread_key_t thread_mags_key;
333 static pthread_once_t thread_mags_once = PTHREAD_ONCE_INIT;
334 static magazine_depot depots[NZONES];
335 
336 /*
337  * Fixed globals (not per-cpu)
338  */
339 static const int ZoneSize = ZALLOC_ZONE_SIZE;
340 static const int ZoneLimit = ZALLOC_ZONE_LIMIT;
341 static const int ZonePageCount = ZALLOC_ZONE_SIZE / PAGE_SIZE;
342 static const int ZoneMask = ZALLOC_ZONE_SIZE - 1;
343 
344 static int opt_madvise = 0;
345 static int opt_utrace = 0;
346 static int g_malloc_flags = 0;
347 static struct slglobaldata	SLGlobalData;
348 static bigalloc_t bigalloc_array[BIGHSIZE];
349 static spinlock_t bigspin_array[BIGXSIZE];
350 static volatile void *bigcache_array[BIGCACHE];		/* atomic swap */
351 static volatile size_t bigcache_size_array[BIGCACHE];	/* SMP races ok */
352 static volatile int bigcache_index;			/* SMP races ok */
353 static int malloc_panic;
354 static size_t excess_alloc;				/* excess big allocs */
355 
356 static void *_slaballoc(size_t size, int flags);
357 static void *_slabrealloc(void *ptr, size_t size);
358 static void _slabfree(void *ptr, int, bigalloc_t *);
359 static int _slabmemalign(void **memptr, size_t alignment, size_t size);
360 static void *_vmem_alloc(size_t bytes, size_t align, int flags);
361 static void _vmem_free(void *ptr, size_t bytes);
362 static void *magazine_alloc(struct magazine *, int *);
363 static int magazine_free(struct magazine *, void *);
364 static void *mtmagazine_alloc(int zi);
365 static int mtmagazine_free(int zi, void *);
366 static void mtmagazine_init(void);
367 static void mtmagazine_destructor(void *);
368 static slzone_t zone_alloc(int flags);
369 static void zone_free(void *z);
370 static void _mpanic(const char *ctl, ...) __printflike(1, 2);
371 static void malloc_init(void) __constructor(101);
372 
373 struct nmalloc_utrace {
374 	void *p;
375 	size_t s;
376 	void *r;
377 };
378 
379 #define UTRACE(a, b, c)						\
380 	if (opt_utrace) {					\
381 		struct nmalloc_utrace ut = {			\
382 			.p = (a),				\
383 			.s = (b),				\
384 			.r = (c)				\
385 		};						\
386 		utrace(&ut, sizeof(ut));			\
387 	}
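
    /*
     * Records emitted by the callers below:
     *
     *	malloc(size) / calloc()		{ NULL, size, ptr }
     *	realloc(ptr, size)		{ ptr, size, ret }
     *	free(ptr)			{ ptr, 0, NULL }
     *	malloc_init()			{ -1, 0, NULL }
     */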
388 
389 static void
390 malloc_init(void)
391 {
392 	const char *p = NULL;
393 
394 	if (issetugid() == 0)
395 		p = getenv("MALLOC_OPTIONS");
396 
397 	for (; p != NULL && *p != '\0'; p++) {
398 		switch(*p) {
399 		case 'u':	opt_utrace = 0; break;
400 		case 'U':	opt_utrace = 1; break;
401 		case 'h':	opt_madvise = 0; break;
402 		case 'H':	opt_madvise = 1; break;
403 		case 'z':	g_malloc_flags = 0; break;
404 		case 'Z':	g_malloc_flags = SAFLAG_ZERO; break;
405 		default:
406 			break;
407 		}
408 	}
409 
410 	UTRACE((void *) -1, 0, NULL);
411 }
412 
413 /*
414  * We have to install a handler for nmalloc thread teardowns when
415  * the thread is created.  We cannot delay this because destructors in
416  * sophisticated userland programs can call malloc() for the first time
417  * during their thread exit.
418  *
419  * This routine is called directly from pthreads.
420  */
421 void
422 _nmalloc_thr_init(void)
423 {
424 	static int init_once;
425 	thr_mags *tp;
426 
427 	/*
428 	 * Disallow mtmagazine operations until the mtmagazine is
429 	 * initialized.
430 	 */
431 	tp = &thread_mags;
432 	tp->init = -1;
433 
434 	if (init_once == 0) {
435 		init_once = 1;
436 		_pthread_once(&thread_mags_once, mtmagazine_init);
437 	}
438 	_pthread_setspecific(thread_mags_key, tp);
439 	tp->init = 1;
440 }
441 
442 void
443 _nmalloc_thr_prepfork(void)
444 {
445 	if (__isthreaded) {
446 		_SPINLOCK(&zone_mag_lock);
447 		_SPINLOCK(&depot_spinlock);
448 	}
449 }
450 
451 void
452 _nmalloc_thr_parentfork(void)
453 {
454 	if (__isthreaded) {
455 		_SPINUNLOCK(&depot_spinlock);
456 		_SPINUNLOCK(&zone_mag_lock);
457 	}
458 }
459 
460 void
461 _nmalloc_thr_childfork(void)
462 {
463 	if (__isthreaded) {
464 		_SPINUNLOCK(&depot_spinlock);
465 		_SPINUNLOCK(&zone_mag_lock);
466 	}
467 }
468 
469 /*
470  * Thread locks.
471  */
472 static __inline void
473 slgd_lock(slglobaldata_t slgd)
474 {
475 	if (__isthreaded)
476 		_SPINLOCK(&slgd->Spinlock);
477 }
478 
479 static __inline void
480 slgd_unlock(slglobaldata_t slgd)
481 {
482 	if (__isthreaded)
483 		_SPINUNLOCK(&slgd->Spinlock);
484 }
485 
486 static __inline void
487 depot_lock(magazine_depot *dp __unused)
488 {
489 	if (__isthreaded)
490 		_SPINLOCK(&depot_spinlock);
491 #if 0
492 	if (__isthreaded)
493 		_SPINLOCK(&dp->lock);
494 #endif
495 }
496 
497 static __inline void
498 depot_unlock(magazine_depot *dp __unused)
499 {
500 	if (__isthreaded)
501 		_SPINUNLOCK(&depot_spinlock);
502 #if 0
503 	if (__isthreaded)
504 		_SPINUNLOCK(&dp->lock);
505 #endif
506 }
507 
508 static __inline void
509 zone_magazine_lock(void)
510 {
511 	if (__isthreaded)
512 		_SPINLOCK(&zone_mag_lock);
513 }
514 
515 static __inline void
516 zone_magazine_unlock(void)
517 {
518 	if (__isthreaded)
519 		_SPINUNLOCK(&zone_mag_lock);
520 }
521 
522 static __inline void
523 swap_mags(magazine_pair *mp)
524 {
525 	struct magazine *tmp;
526 	tmp = mp->loaded;
527 	mp->loaded = mp->prev;
528 	mp->prev = tmp;
529 }
530 
531 /*
532  * bigalloc hashing and locking support.
533  *
534  * Return an unmasked hash code for the passed pointer.
535  */
536 static __inline int
537 _bigalloc_hash(void *ptr)
538 {
539 	int hv;
540 
541 	hv = ((int)(intptr_t)ptr >> PAGE_SHIFT) ^
542 	      ((int)(intptr_t)ptr >> (PAGE_SHIFT + BIGHSHIFT));
543 
544 	return(hv);
545 }
546 
547 /*
548  * Lock the hash chain and return a pointer to its base for the specified
549  * address.
550  */
551 static __inline bigalloc_t *
552 bigalloc_lock(void *ptr)
553 {
554 	int hv = _bigalloc_hash(ptr);
555 	bigalloc_t *bigp;
556 
557 	bigp = &bigalloc_array[hv & BIGHMASK];
558 	if (__isthreaded)
559 		_SPINLOCK(&bigspin_array[hv & BIGXMASK]);
560 	return(bigp);
561 }
562 
563 /*
564  * Lock the hash chain and return a pointer to its base for the specified
565  * address.
566  *
567  * BUT, if the hash chain is empty, just return NULL and do not bother
568  * to lock anything.
569  */
570 static __inline bigalloc_t *
571 bigalloc_check_and_lock(void *ptr)
572 {
573 	int hv = _bigalloc_hash(ptr);
574 	bigalloc_t *bigp;
575 
576 	bigp = &bigalloc_array[hv & BIGHMASK];
577 	if (*bigp == NULL)
578 		return(NULL);
579 	if (__isthreaded) {
580 		_SPINLOCK(&bigspin_array[hv & BIGXMASK]);
581 	}
582 	return(bigp);
583 }
584 
585 static __inline void
586 bigalloc_unlock(void *ptr)
587 {
588 	int hv;
589 
590 	if (__isthreaded) {
591 		hv = _bigalloc_hash(ptr);
592 		_SPINUNLOCK(&bigspin_array[hv & BIGXMASK]);
593 	}
594 }
595 
596 /*
597  * Find a bigcache entry that might work for the allocation.  SMP races are
598  * ok here except for the swap (that is, it is ok if bigcache_size_array[i]
599  * is wrong or if a NULL or too-small big is returned).
600  *
601  * Generally speaking it is ok to find a large entry even if the bytes
602  * requested are relatively small (but still oversized), because we really
603  * don't know *what* the application is going to do with the buffer.
604  */
605 static __inline
606 bigalloc_t
607 bigcache_find_alloc(size_t bytes)
608 {
609 	bigalloc_t big = NULL;
610 	size_t test;
611 	int i;
612 
613 	for (i = 0; i < BIGCACHE; ++i) {
614 		test = bigcache_size_array[i];
615 		if (bytes <= test) {
616 			bigcache_size_array[i] = 0;
617 			big = atomic_swap_ptr(&bigcache_array[i], NULL);
618 			break;
619 		}
620 	}
621 	return big;
622 }
623 
624 /*
625  * Free a bigcache entry, possibly returning one that the caller really must
626  * free.  This is used to cache recent oversized memory blocks.  Only
627  * big blocks smaller than BIGCACHE_LIMIT will be cached this way, so try
628  * to collect the biggest ones we can that are under the limit.
629  */
630 static __inline
631 bigalloc_t
632 bigcache_find_free(bigalloc_t big)
633 {
634 	int i;
635 	int j;
636 	int b;
637 
638 	b = ++bigcache_index;
639 	for (i = 0; i < BIGCACHE; ++i) {
640 		j = (b + i) & BIGCACHE_MASK;
641 		if (bigcache_size_array[j] < big->bytes) {
642 			bigcache_size_array[j] = big->bytes;
643 			big = atomic_swap_ptr(&bigcache_array[j], big);
644 			break;
645 		}
646 	}
647 	return big;
648 }
649 
650 static __inline
651 void
652 handle_excess_big(void)
653 {
654 	int i;
655 	bigalloc_t big;
656 	bigalloc_t *bigp;
657 
658 	if (excess_alloc <= BIGCACHE_EXCESS)
659 		return;
660 
661 	for (i = 0; i < BIGHSIZE; ++i) {
662 		bigp = &bigalloc_array[i];
663 		if (*bigp == NULL)
664 			continue;
665 		if (__isthreaded)
666 			_SPINLOCK(&bigspin_array[i & BIGXMASK]);
667 		for (big = *bigp; big; big = big->next) {
668 			if (big->active < big->bytes) {
669 				MASSERT((big->active & PAGE_MASK) == 0);
670 				MASSERT((big->bytes & PAGE_MASK) == 0);
671 				munmap((char *)big->base + big->active,
672 				       big->bytes - big->active);
673 				atomic_add_long(&excess_alloc,
674 						big->active - big->bytes);
675 				big->bytes = big->active;
676 			}
677 		}
678 		if (__isthreaded)
679 			_SPINUNLOCK(&bigspin_array[i & BIGXMASK]);
680 	}
681 }
682 
683 /*
684  * Calculate the zone index for the allocation request size and set the
685  * allocation request size to that particular zone's chunk size.
686  */
687 static __inline int
688 zoneindex(size_t *bytes, size_t *chunking)
689 {
690 	size_t n = (unsigned int)*bytes;	/* unsigned for shift opt */
691 
692 	/*
693 	 * This used to be 8-byte chunks and 16 zones for n < 128.
694 	 * However some instructions may require 16-byte alignment
695 	 * (aka SIMD) and programs might not request an aligned size
696 	 * (aka GCC-7), so change this as follows:
697 	 *
698 	 * 0-15 bytes	8-byte alignment in two zones	(0-1)
 699 	 * 16-127 bytes	16-byte alignment in eight zones (3-10)
 700 	 * zone indices 2 and 11-15 are currently unused.
701 	 */
702 	if (n < 16) {
703 		*bytes = n = (n + 7) & ~7;
704 		*chunking = 8;
705 		return(n / 8 - 1);		/* 8 byte chunks, 2 zones */
706 		/* zones 0,1, zone 2 is unused */
707 	}
708 	if (n < 128) {
709 		*bytes = n = (n + 15) & ~15;
710 		*chunking = 16;
711 		return(n / 16 + 2);		/* 16 byte chunks, 8 zones */
712 		/* zones 3-10, zones 11-15 unused */
713 	}
714 	if (n < 256) {
715 		*bytes = n = (n + 15) & ~15;
716 		*chunking = 16;
717 		return(n / 16 + 7);
718 	}
719 	if (n < 8192) {
720 		if (n < 512) {
721 			*bytes = n = (n + 31) & ~31;
722 			*chunking = 32;
723 			return(n / 32 + 15);
724 		}
725 		if (n < 1024) {
726 			*bytes = n = (n + 63) & ~63;
727 			*chunking = 64;
728 			return(n / 64 + 23);
729 		}
730 		if (n < 2048) {
731 			*bytes = n = (n + 127) & ~127;
732 			*chunking = 128;
733 			return(n / 128 + 31);
734 		}
735 		if (n < 4096) {
736 			*bytes = n = (n + 255) & ~255;
737 			*chunking = 256;
738 			return(n / 256 + 39);
739 		}
740 		*bytes = n = (n + 511) & ~511;
741 		*chunking = 512;
742 		return(n / 512 + 47);
743 	}
744 #if ZALLOC_ZONE_LIMIT > 8192
745 	if (n < 16384) {
746 		*bytes = n = (n + 1023) & ~1023;
747 		*chunking = 1024;
748 		return(n / 1024 + 55);
749 	}
750 #endif
751 #if ZALLOC_ZONE_LIMIT > 16384
752 	if (n < 32768) {
753 		*bytes = n = (n + 2047) & ~2047;
754 		*chunking = 2048;
755 		return(n / 2048 + 63);
756 	}
757 #endif
758 	_mpanic("Unexpected byte count %zu", n);
759 	return(0);
760 }
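
    /*
     * Worked examples of the mapping above (illustration only):
     *
     *	malloc(8)   -> rounded to 8,   chunking 8,  zone index 0
     *	malloc(100) -> rounded to 112, chunking 16, zone index 9
     *	malloc(513) -> rounded to 576, chunking 64, zone index 32
     */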
761 
762 /*
763  * malloc() - call internal slab allocator
764  */
765 void *
766 __malloc(size_t size)
767 {
768 	void *ptr;
769 
770 	ptr = _slaballoc(size, 0);
771 	if (ptr == NULL)
772 		errno = ENOMEM;
773 	else
774 		UTRACE(0, size, ptr);
775 	return(ptr);
776 }
777 
778 #define MUL_NO_OVERFLOW	(1UL << (sizeof(size_t) * 4))
779 
780 /*
781  * calloc() - call internal slab allocator
782  */
783 void *
784 __calloc(size_t number, size_t size)
785 {
786 	void *ptr;
787 
788 	if ((number >= MUL_NO_OVERFLOW || size >= MUL_NO_OVERFLOW) &&
789 	     number > 0 && SIZE_MAX / number < size) {
790 		errno = ENOMEM;
791 		return(NULL);
792 	}
793 
794 	ptr = _slaballoc(number * size, SAFLAG_ZERO);
795 	if (ptr == NULL)
796 		errno = ENOMEM;
797 	else
798 		UTRACE(0, number * size, ptr);
799 	return(ptr);
800 }
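
    /*
     * The test above is the usual cheap overflow check: MUL_NO_OVERFLOW is
     * 2^32 on LP64 (2^16 on ILP32), so when both operands are below it the
     * product cannot overflow a size_t and the division is skipped.  e.g.
     * calloc(SIZE_MAX, 2) trips the SIZE_MAX / number < size test and
     * fails with ENOMEM instead of wrapping to a tiny request.
     */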
801 
802 /*
803  * realloc() (SLAB ALLOCATOR)
804  *
805  * We do not attempt to optimize this routine beyond reusing the same
806  * pointer if the new size fits within the chunking of the old pointer's
807  * zone.
808  */
809 void *
810 __realloc(void *ptr, size_t size)
811 {
812 	void *ret;
813 	ret = _slabrealloc(ptr, size);
814 	if (ret == NULL)
815 		errno = ENOMEM;
816 	else
817 		UTRACE(ptr, size, ret);
818 	return(ret);
819 }
820 
821 /*
822  * aligned_alloc()
823  *
 824  * Allocate (size) bytes with an alignment of (alignment).
825  */
826 void *
827 __aligned_alloc(size_t alignment, size_t size)
828 {
829 	void *ptr;
830 	int rc;
831 
832 	ptr = NULL;
833 	rc = _slabmemalign(&ptr, alignment, size);
834 	if (rc)
835 		errno = rc;
836 
837 	return (ptr);
838 }
839 
840 /*
841  * posix_memalign()
842  *
 843  * Allocate (size) bytes with an alignment of (alignment), where (alignment)
844  * is a power of 2 >= sizeof(void *).
845  */
846 int
847 __posix_memalign(void **memptr, size_t alignment, size_t size)
848 {
849 	int rc;
850 
851 	/*
852 	 * OpenGroup spec issue 6 check
853 	 */
854 	if (alignment < sizeof(void *)) {
855 		*memptr = NULL;
856 		return(EINVAL);
857 	}
858 
859 	rc = _slabmemalign(memptr, alignment, size);
860 
861 	return (rc);
862 }
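
    /*
     * Illustrative usage (not part of this file):
     *
     *	void *p;
     *
     *	if (posix_memalign(&p, 64, 1000) == 0) {
     *		...use the 64-byte aligned buffer p...
     *		free(p);
     *	}
     */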
863 
864 /*
865  * The slab allocator will allocate on power-of-2 boundaries up to
866  * at least PAGE_SIZE.  We use the zoneindex mechanic to find a
867  * zone matching the requirements, and _vmem_alloc() otherwise.
868  */
869 static int
870 _slabmemalign(void **memptr, size_t alignment, size_t size)
871 {
872 	bigalloc_t *bigp;
873 	bigalloc_t big;
874 	size_t chunking;
875 	int zi __unused;
876 
877 	if (alignment < 1) {
878 		*memptr = NULL;
879 		return(EINVAL);
880 	}
881 
882 	/*
883 	 * OpenGroup spec issue 6 checks
884 	 */
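    	/*
    	 * The test below relies on the identity that for any power of 2 "a",
    	 * (a | (a - 1)) + 1 == (a << 1); e.g. a = 8 gives (8 | 7) + 1 = 16,
    	 * while a non-power-of-2 such as 12 gives (12 | 11) + 1 = 16 != 24.
    	 */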
885 	if ((alignment | (alignment - 1)) + 1 != (alignment << 1)) {
886 		*memptr = NULL;
887 		return(EINVAL);
888 	}
889 
890 	/*
891 	 * Our zone mechanism guarantees same-sized alignment for any
892 	 * power-of-2 allocation.  If size is a power-of-2 and reasonable
893 	 * we can just call _slaballoc() and be done.  We round size up
894 	 * to the nearest alignment boundary to improve our odds of
895 	 * it becoming a power-of-2 if it wasn't before.
896 	 */
897 	if (size <= alignment)
898 		size = alignment;
899 	else
900 		size = (size + alignment - 1) & ~(size_t)(alignment - 1);
901 
902 	/*
903 	 * If we have overflowed above when rounding to the nearest alignment
 904  * boundary, just return ENOMEM; size should be == N * sizeof(void *).
905 	 *
906 	 * Power-of-2 allocations up to 8KB will be aligned to the allocation
 907  * size and _slaballoc() can simply be used.  Please see line 1156
908 	 * for this special case: 'Align the storage in the zone based on
909 	 * the chunking' has a special case for powers of 2.
910 	 */
911 	if (size == 0)
912 		return(ENOMEM);
913 
914 	if (size <= PAGE_SIZE*2 && (size | (size - 1)) + 1 == (size << 1)) {
915 		*memptr = _slaballoc(size, 0);
916 		return(*memptr ? 0 : ENOMEM);
917 	}
918 
919 	/*
920 	 * Otherwise locate a zone with a chunking that matches
921 	 * the requested alignment, within reason.   Consider two cases:
922 	 *
923 	 * (1) A 1K allocation on a 32-byte alignment.  The first zoneindex
924 	 *     we find will be the best fit because the chunking will be
925 	 *     greater or equal to the alignment.
926 	 *
 927 	 * (2) A 513-byte allocation on a 256-byte alignment.  In this case
928 	 *     the first zoneindex we find will be for 576 byte allocations
929 	 *     with a chunking of 64, which is not sufficient.  To fix this
930 	 *     we simply find the nearest power-of-2 >= size and use the
931 	 *     same side-effect of _slaballoc() which guarantees
932 	 *     same-alignment on a power-of-2 allocation.
933 	 */
934 	if (size < PAGE_SIZE) {
935 		zi = zoneindex(&size, &chunking);
936 		if (chunking >= alignment) {
937 			*memptr = _slaballoc(size, 0);
938 			return(*memptr ? 0 : ENOMEM);
939 		}
940 		if (size >= 1024)
941 			alignment = 1024;
942 		if (size >= 16384)
943 			alignment = 16384;
944 		while (alignment < size)
945 			alignment <<= 1;
946 		*memptr = _slaballoc(alignment, 0);
947 		return(*memptr ? 0 : ENOMEM);
948 	}
949 
950 	/*
951 	 * If the slab allocator cannot handle it use vmem_alloc().
952 	 *
953 	 * Alignment must be adjusted up to at least PAGE_SIZE in this case.
954 	 */
955 	if (alignment < PAGE_SIZE)
956 		alignment = PAGE_SIZE;
957 	if (size < alignment)
958 		size = alignment;
959 	size = (size + PAGE_MASK) & ~(size_t)PAGE_MASK;
960 	if (alignment == PAGE_SIZE && size <= BIGCACHE_LIMIT) {
961 		big = bigcache_find_alloc(size);
962 		if (big && big->bytes < size) {
963 			_slabfree(big->base, FASTSLABREALLOC, &big);
964 			big = NULL;
965 		}
966 		if (big) {
967 			*memptr = big->base;
968 			big->active = size;
969 			if (big->active < big->bytes) {
970 				atomic_add_long(&excess_alloc,
971 						big->bytes - big->active);
972 			}
973 			bigp = bigalloc_lock(*memptr);
974 			big->next = *bigp;
975 			*bigp = big;
976 			bigalloc_unlock(*memptr);
977 			handle_excess_big();
978 			return(0);
979 		}
980 	}
981 	*memptr = _vmem_alloc(size, alignment, 0);
982 	if (*memptr == NULL)
983 		return(ENOMEM);
984 
985 	big = _slaballoc(sizeof(struct bigalloc), 0);
986 	if (big == NULL) {
987 		_vmem_free(*memptr, size);
988 		*memptr = NULL;
989 		return(ENOMEM);
990 	}
991 	bigp = bigalloc_lock(*memptr);
992 	big->base = *memptr;
993 	big->active = size;
994 	big->bytes = size;		/* no excess */
995 	big->next = *bigp;
996 	*bigp = big;
997 	bigalloc_unlock(*memptr);
998 
999 	return(0);
1000 }
1001 
1002 /*
1003  * free() (SLAB ALLOCATOR) - do the obvious
1004  */
1005 void
1006 __free(void *ptr)
1007 {
1008 	UTRACE(ptr, 0, 0);
1009 	_slabfree(ptr, 0, NULL);
1010 }
1011 
1012 /*
1013  * _slaballoc()	(SLAB ALLOCATOR)
1014  *
1015  *	Allocate memory via the slab allocator.  If the request is too large,
 1016  *	or if it is page-aligned beyond a certain size, we fall back to the
 1017  *	KMEM subsystem.
1018  */
1019 static void *
1020 _slaballoc(size_t size, int flags)
1021 {
1022 	slzone_t z;
1023 	slchunk_t chunk;
1024 	slglobaldata_t slgd;
1025 	size_t chunking;
1026 	int zi;
1027 	int off;
1028 	void *obj;
1029 
1030 	/*
1031 	 * Handle the degenerate size == 0 case.  Yes, this does happen.
1032 	 * Return a special pointer.  This is to maintain compatibility with
1033 	 * the original malloc implementation.  Certain devices, such as the
1034 	 * adaptec driver, not only allocate 0 bytes, they check for NULL and
1035 	 * also realloc() later on.  Joy.
1036 	 */
1037 	if (size == 0)
1038 		size = 1;
1039 
1040 	/* Capture global flags */
1041 	flags |= g_malloc_flags;
1042 
1043 	/*
1044 	 * Handle large allocations directly.  There should not be very many
1045 	 * of these so performance is not a big issue.
1046 	 *
 1047 	 * The backend allocator is pretty nasty on an SMP system.  Use the
1048 	 * slab allocator for one and two page-sized chunks even though we
1049 	 * lose some efficiency.
1050 	 *
1051 	 * NOTE: Please see posix_memalign around line 864, which assumes
1052 	 *	 that power-of-2 allocations of PAGE_SIZE and PAGE_SIZE*2
1053 	 *	 can use _slaballoc() and be aligned to the same.  The
1054 	 *	 zone cache can be used for this case, bigalloc does not
1055 	 *	 have to be used.
1056 	 */
1057 	if (size >= ZoneLimit ||
1058 	    ((size & PAGE_MASK) == 0 && size > PAGE_SIZE*2)) {
1059 		bigalloc_t big;
1060 		bigalloc_t *bigp;
1061 
1062 		/*
1063 		 * Page-align and cache-color in case of virtually indexed
1064 		 * physically tagged L1 caches (aka SandyBridge).  No sweat
1065 		 * otherwise, so just do it.
1066 		 *
1067 		 * (don't count as excess).
1068 		 */
1069 		size = (size + PAGE_MASK) & ~(size_t)PAGE_MASK;
1070 
1071 		/*
1072 		 * If we have overflowed above when rounding to the page
1073 		 * boundary, something has passed us (size_t)[-PAGE_MASK..-1]
 1074 		 * so just return NULL; size at this point should be >= 0.
 1075 		 */
1076 		if (size == 0)
1077 			return (NULL);
1078 
1079 		if ((size & (PAGE_SIZE * 2 - 1)) == 0)
1080 			size += PAGE_SIZE;
1081 
1082 		/*
1083 		 * Try to reuse a cached big block to avoid mmap'ing.  If it
1084 		 * turns out not to fit our requirements we throw it away
1085 		 * and allocate normally.
1086 		 */
1087 		big = NULL;
1088 		if (size <= BIGCACHE_LIMIT) {
1089 			big = bigcache_find_alloc(size);
1090 			if (big && big->bytes < size) {
1091 				_slabfree(big->base, FASTSLABREALLOC, &big);
1092 				big = NULL;
1093 			}
1094 		}
1095 		if (big) {
1096 			chunk = big->base;
1097 			if (flags & SAFLAG_ZERO)
1098 				bzero(chunk, size);
1099 		} else {
1100 			chunk = _vmem_alloc(size, PAGE_SIZE, flags);
1101 			if (chunk == NULL)
1102 				return(NULL);
1103 
1104 			big = _slaballoc(sizeof(struct bigalloc), 0);
1105 			if (big == NULL) {
1106 				_vmem_free(chunk, size);
1107 				return(NULL);
1108 			}
1109 			big->base = chunk;
1110 			big->bytes = size;
1111 		}
1112 		big->active = size;
1113 
1114 		bigp = bigalloc_lock(chunk);
1115 		if (big->active < big->bytes) {
1116 			atomic_add_long(&excess_alloc,
1117 					big->bytes - big->active);
1118 		}
1119 		big->next = *bigp;
1120 		*bigp = big;
1121 		bigalloc_unlock(chunk);
1122 		handle_excess_big();
1123 
1124 		return(chunk);
1125 	}
1126 
1127 	/* Compute allocation zone; zoneindex will panic on excessive sizes */
1128 	zi = zoneindex(&size, &chunking);
1129 	MASSERT(zi < NZONES);
1130 
1131 	obj = mtmagazine_alloc(zi);
1132 	if (obj != NULL) {
1133 		if (flags & SAFLAG_ZERO)
1134 			bzero(obj, size);
1135 		return (obj);
1136 	}
1137 
1138 	slgd = &SLGlobalData;
1139 	slgd_lock(slgd);
1140 
1141 	/*
1142 	 * Attempt to allocate out of an existing zone.  If all zones are
1143 	 * exhausted pull one off the free list or allocate a new one.
1144 	 */
1145 	if ((z = slgd->ZoneAry[zi]) == NULL) {
1146 		z = zone_alloc(flags);
1147 		if (z == NULL)
1148 			goto fail;
1149 
1150 		/*
1151 		 * How big is the base structure?
1152 		 */
1153 		off = sizeof(struct slzone);
1154 
1155 		/*
1156 		 * Align the storage in the zone based on the chunking.
1157 		 *
1158 		 * Guarantee power-of-2 alignment for power-of-2-sized
1159 		 * chunks.  Otherwise align based on the chunking size
1160 		 * (typically 8 or 16 bytes for small allocations).
1161 		 *
1162 		 * NOTE: Allocations >= ZoneLimit are governed by the
1163 		 * bigalloc code and typically only guarantee page-alignment.
1164 		 *
1165 		 * Set initial conditions for UIndex near the zone header
 1166 		 * to reduce unnecessary page faults, vs semi-randomization
1167 		 * to improve L1 cache saturation.
1168 		 *
1169 		 * NOTE: Please see posix_memalign around line 864-ish, which
1170 		 *	 assumes that power-of-2 allocations of PAGE_SIZE
1171 		 *	 and PAGE_SIZE*2 can use _slaballoc() and be aligned
1172 		 *	 to the same.  The zone cache can be used for this
1173 		 *	 case, bigalloc does not have to be used.
1174 		 *
1175 		 *	 ALL power-of-2 requests that fall through to this
1176 		 *	 code use this rule (conditionals above limit this
 1177 		 *	 to <= PAGE_SIZE*2).
1178 		 */
1179 		if ((size | (size - 1)) + 1 == (size << 1))
1180 			off = roundup2(off, size);
1181 		else
1182 			off = roundup2(off, chunking);
1183 		z->z_Magic = ZALLOC_SLAB_MAGIC;
1184 		z->z_ZoneIndex = zi;
1185 		z->z_NMax = (ZoneSize - off) / size;
1186 		z->z_NFree = z->z_NMax;
1187 		z->z_BasePtr = (char *)z + off;
1188 		z->z_UIndex = z->z_UEndIndex = 0;
1189 		z->z_ChunkSize = size;
1190 		z->z_FirstFreePg = ZonePageCount;
1191 		z->z_Next = slgd->ZoneAry[zi];
1192 		slgd->ZoneAry[zi] = z;
1193 		if ((z->z_Flags & SLZF_UNOTZEROD) == 0) {
1194 			flags &= ~SAFLAG_ZERO;	/* already zero'd */
1195 			flags |= SAFLAG_PASSIVE;
1196 		}
1197 
1198 		/*
1199 		 * Slide the base index for initial allocations out of the
1200 		 * next zone we create so we do not over-weight the lower
1201 		 * part of the cpu memory caches.
1202 		 */
1203 		slgd->JunkIndex = (slgd->JunkIndex + ZALLOC_SLAB_SLIDE)
1204 					& (ZALLOC_MAX_ZONE_SIZE - 1);
1205 	}
1206 
1207 	/*
1208 	 * Ok, we have a zone from which at least one chunk is available.
1209 	 *
1210 	 * Remove us from the ZoneAry[] when we become empty
1211 	 */
1212 	MASSERT(z->z_NFree > 0);
1213 
1214 	if (--z->z_NFree == 0) {
1215 		slgd->ZoneAry[zi] = z->z_Next;
1216 		z->z_Next = NULL;
1217 	}
1218 
1219 	/*
1220 	 * Locate a chunk in a free page.  This attempts to localize
1221 	 * reallocations into earlier pages without us having to sort
1222 	 * the chunk list.  A chunk may still overlap a page boundary.
1223 	 */
1224 	while (z->z_FirstFreePg < ZonePageCount) {
1225 		if ((chunk = z->z_PageAry[z->z_FirstFreePg]) != NULL) {
1226 			MASSERT((uintptr_t)chunk & ZoneMask);
1227 			z->z_PageAry[z->z_FirstFreePg] = chunk->c_Next;
1228 			goto done;
1229 		}
1230 		++z->z_FirstFreePg;
1231 	}
1232 
1233 	/*
1234 	 * No chunks are available but NFree said we had some memory,
1235 	 * so it must be available in the never-before-used-memory
1236 	 * area governed by UIndex.  The consequences are very
 1237 	 * serious if our zone got corrupted, so we use an explicit
 1238 	 * panic rather than a KASSERT.
1239 	 */
1240 	chunk = (slchunk_t)(z->z_BasePtr + z->z_UIndex * size);
1241 
1242 	if (++z->z_UIndex == z->z_NMax)
1243 		z->z_UIndex = 0;
1244 	if (z->z_UIndex == z->z_UEndIndex) {
1245 		if (z->z_NFree != 0)
1246 			_mpanic("slaballoc: corrupted zone");
1247 	}
1248 
1249 	if ((z->z_Flags & SLZF_UNOTZEROD) == 0) {
1250 		flags &= ~SAFLAG_ZERO;
1251 		flags |= SAFLAG_PASSIVE;
1252 	}
1253 
1254 done:
1255 	slgd_unlock(slgd);
1256 	if (flags & SAFLAG_ZERO)
1257 		bzero(chunk, size);
1258 	return(chunk);
1259 fail:
1260 	slgd_unlock(slgd);
1261 	return(NULL);
1262 }
1263 
1264 /*
1265  * Reallocate memory within the chunk
1266  */
1267 static void *
1268 _slabrealloc(void *ptr, size_t size)
1269 {
1270 	bigalloc_t *bigp;
1271 	void *nptr;
1272 	slzone_t z;
1273 	size_t chunking;
1274 
1275 	if (ptr == NULL) {
1276 		return(_slaballoc(size, 0));
1277 	}
1278 
1279 	if (size == 0)
1280 		size = 1;
1281 
1282 	/*
1283 	 * Handle oversized allocations.
1284 	 */
1285 	if ((bigp = bigalloc_check_and_lock(ptr)) != NULL) {
1286 		bigalloc_t big;
1287 		size_t bigbytes;
1288 
1289 		while ((big = *bigp) != NULL) {
1290 			if (big->base == ptr) {
1291 				size = (size + PAGE_MASK) & ~(size_t)PAGE_MASK;
1292 				bigbytes = big->bytes;
1293 
1294 				/*
1295 				 * If it already fits determine if it makes
1296 				 * sense to shrink/reallocate.  Try to optimize
1297 				 * programs which stupidly make incremental
1298 				 * reallocations larger or smaller by scaling
1299 				 * the allocation.  Also deal with potential
1300 				 * coloring.
1301 				 */
1302 				if (size >= (bigbytes >> 1) &&
1303 				    size <= bigbytes) {
1304 					if (big->active != size) {
1305 						atomic_add_long(&excess_alloc,
1306 								big->active -
1307 								size);
1308 					}
1309 					big->active = size;
1310 					bigalloc_unlock(ptr);
1311 					return(ptr);
1312 				}
1313 
1314 				/*
1315 				 * For large reallocations, allocate more space
1316 				 * than we need to try to avoid excessive
1317 				 * reallocations later on.
1318 				 */
1319 				chunking = size + (size >> 3);
1320 				chunking = (chunking + PAGE_MASK) &
1321 					   ~(size_t)PAGE_MASK;
1322 
1323 				/*
1324 				 * Try to allocate adjacently in case the
1325 				 * program is idiotically realloc()ing a
1326 				 * huge memory block just slightly bigger.
1327 				 * (llvm's llc tends to do this a lot).
1328 				 *
1329 				 * (MAP_TRYFIXED forces mmap to fail if there
1330 				 *  is already something at the address).
1331 				 */
1332 				if (chunking > bigbytes) {
1333 					char *addr;
1334 					int errno_save = errno;
1335 
1336 					addr = mmap((char *)ptr + bigbytes,
1337 						    chunking - bigbytes,
1338 						    PROT_READ|PROT_WRITE,
1339 						    MAP_PRIVATE|MAP_ANON|
1340 						    MAP_TRYFIXED,
1341 						    -1, 0);
1342 					errno = errno_save;
1343 					if (addr == (char *)ptr + bigbytes) {
1344 						atomic_add_long(&excess_alloc,
1345 								big->active -
1346 								big->bytes +
1347 								chunking -
1348 								size);
1349 						big->bytes = chunking;
1350 						big->active = size;
1351 						bigalloc_unlock(ptr);
1352 
1353 						return(ptr);
1354 					}
1355 					MASSERT((void *)addr == MAP_FAILED);
1356 				}
1357 
1358 				/*
1359 				 * Failed, unlink big and allocate fresh.
1360 				 * (note that we have to leave (big) intact
1361 				 * in case the slaballoc fails).
1362 				 */
1363 				*bigp = big->next;
1364 				bigalloc_unlock(ptr);
1365 				if ((nptr = _slaballoc(size, 0)) == NULL) {
1366 					/* Relink block */
1367 					bigp = bigalloc_lock(ptr);
1368 					big->next = *bigp;
1369 					*bigp = big;
1370 					bigalloc_unlock(ptr);
1371 					return(NULL);
1372 				}
1373 				if (size > bigbytes)
1374 					size = bigbytes;
1375 				bcopy(ptr, nptr, size);
1376 				atomic_add_long(&excess_alloc, big->active -
1377 							       big->bytes);
1378 				_slabfree(ptr, FASTSLABREALLOC, &big);
1379 
1380 				return(nptr);
1381 			}
1382 			bigp = &big->next;
1383 		}
1384 		bigalloc_unlock(ptr);
1385 		handle_excess_big();
1386 	}
1387 
1388 	/*
1389 	 * Get the original allocation's zone.  If the new request winds
1390 	 * up using the same chunk size we do not have to do anything.
1391 	 *
1392 	 * NOTE: We don't have to lock the globaldata here, the fields we
1393 	 * access here will not change at least as long as we have control
1394 	 * over the allocation.
1395 	 */
1396 	z = (slzone_t)((uintptr_t)ptr & ~(uintptr_t)ZoneMask);
1397 	MASSERT(z->z_Magic == ZALLOC_SLAB_MAGIC);
1398 
1399 	/*
1400 	 * Use zoneindex() to chunk-align the new size, as long as the
1401 	 * new size is not too large.
1402 	 */
1403 	if (size < ZoneLimit) {
1404 		zoneindex(&size, &chunking);
1405 		if (z->z_ChunkSize == size) {
1406 			return(ptr);
1407 		}
1408 	}
1409 
1410 	/*
1411 	 * Allocate memory for the new request size and copy as appropriate.
1412 	 */
1413 	if ((nptr = _slaballoc(size, 0)) != NULL) {
1414 		if (size > z->z_ChunkSize)
1415 			size = z->z_ChunkSize;
1416 		bcopy(ptr, nptr, size);
1417 		_slabfree(ptr, 0, NULL);
1418 	}
1419 
1420 	return(nptr);
1421 }
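
     /*
      * Worked example of the oversized-realloc policy above: shrinking a
      * 4MB block to 3MB stays in place (3MB lies within [2MB, 4MB]), while
      * growing it to 5MB first attempts a MAP_TRYFIXED mmap of the missing
      * ~1.6MB directly after the existing mapping (chunking scales the
      * request to roughly 5MB + 5MB/8) and only then falls back to the
      * allocate/copy/free path.
      */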
1422 
1423 /*
1424  * free (SLAB ALLOCATOR)
1425  *
1426  * Free a memory block previously allocated by malloc.  Note that we do not
 1427  * attempt to update ks_loosememuse as MP races could prevent us from
1428  * checking memory limits in malloc.
1429  *
1430  * flags:
1431  *	FASTSLABREALLOC		Fast call from realloc, *rbigp already
1432  *				unlinked.
1433  *
1434  * MPSAFE
1435  */
1436 static void
1437 _slabfree(void *ptr, int flags, bigalloc_t *rbigp)
1438 {
1439 	slzone_t z;
1440 	slchunk_t chunk;
1441 	bigalloc_t big;
1442 	bigalloc_t *bigp;
1443 	slglobaldata_t slgd;
1444 	size_t size;
1445 	int zi;
1446 	int pgno;
1447 
1448 	/* Fast realloc path for big allocations */
1449 	if (flags & FASTSLABREALLOC) {
1450 		big = *rbigp;
1451 		goto fastslabrealloc;
1452 	}
1453 
1454 	/*
1455 	 * Handle NULL frees and special 0-byte allocations
1456 	 */
1457 	if (ptr == NULL)
1458 		return;
1459 
1460 	/*
1461 	 * Handle oversized allocations.
1462 	 */
1463 	if ((bigp = bigalloc_check_and_lock(ptr)) != NULL) {
1464 		while ((big = *bigp) != NULL) {
1465 			if (big->base == ptr) {
1466 				*bigp = big->next;
1467 				atomic_add_long(&excess_alloc, big->active -
1468 							       big->bytes);
1469 				bigalloc_unlock(ptr);
1470 
1471 				/*
1472 				 * Try to stash the block we are freeing,
1473 				 * potentially receiving another block in
1474 				 * return which must be freed.
1475 				 */
1476 fastslabrealloc:
1477 				if (big->bytes <= BIGCACHE_LIMIT) {
1478 					big = bigcache_find_free(big);
1479 					if (big == NULL)
1480 						return;
1481 				}
1482 				ptr = big->base;	/* reload */
1483 				size = big->bytes;
1484 				_slabfree(big, 0, NULL);
1485 				_vmem_free(ptr, size);
1486 				return;
1487 			}
1488 			bigp = &big->next;
1489 		}
1490 		bigalloc_unlock(ptr);
1491 		handle_excess_big();
1492 	}
1493 
1494 	/*
1495 	 * Zone case.  Figure out the zone based on the fact that it is
1496 	 * ZoneSize aligned.
1497 	 */
1498 	z = (slzone_t)((uintptr_t)ptr & ~(uintptr_t)ZoneMask);
1499 	MASSERT(z->z_Magic == ZALLOC_SLAB_MAGIC);
1500 
1501 	size = z->z_ChunkSize;
1502 	zi = z->z_ZoneIndex;
1503 
1504 	if (g_malloc_flags & SAFLAG_ZERO)
1505 		bzero(ptr, size);
1506 
1507 	if (mtmagazine_free(zi, ptr) == 0)
1508 		return;
1509 
1510 	pgno = ((char *)ptr - (char *)z) >> PAGE_SHIFT;
1511 	chunk = ptr;
1512 	slgd = &SLGlobalData;
1513 	slgd_lock(slgd);
1514 
1515 	/*
1516 	 * Add this free non-zero'd chunk to a linked list for reuse, adjust
1517 	 * z_FirstFreePg.
1518 	 */
1519 	chunk->c_Next = z->z_PageAry[pgno];
1520 	z->z_PageAry[pgno] = chunk;
1521 	if (z->z_FirstFreePg > pgno)
1522 		z->z_FirstFreePg = pgno;
1523 
1524 	/*
1525 	 * Bump the number of free chunks.  If it becomes non-zero the zone
1526 	 * must be added back onto the appropriate list.
1527 	 */
1528 	if (z->z_NFree++ == 0) {
1529 		z->z_Next = slgd->ZoneAry[z->z_ZoneIndex];
1530 		slgd->ZoneAry[z->z_ZoneIndex] = z;
1531 	}
1532 
1533 	/*
1534 	 * If the zone becomes totally free then release it.
1535 	 */
1536 	if (z->z_NFree == z->z_NMax) {
1537 		slzone_t *pz;
1538 
1539 		pz = &slgd->ZoneAry[z->z_ZoneIndex];
1540 		while (z != *pz)
1541 			pz = &(*pz)->z_Next;
1542 		*pz = z->z_Next;
1543 		z->z_Magic = -1;
1544 		z->z_Next = NULL;
1545 		zone_free(z);
1546 		/* slgd lock released */
1547 		return;
1548 	}
1549 	slgd_unlock(slgd);
1550 }
1551 
1552 /*
 1553  * Allocate an object (round) from the given magazine.  NULL is returned
 1554  * and *burst is adjusted if the magazine is empty.
1555  */
1556 static __inline void *
1557 magazine_alloc(struct magazine *mp, int *burst)
1558 {
1559 	void *obj;
1560 
1561 	if (mp == NULL)
1562 		return(NULL);
1563 	if (MAGAZINE_NOTEMPTY(mp)) {
1564 		obj = mp->objects[--mp->rounds];
1565 		return(obj);
1566 	}
1567 
1568 	/*
1569 	 * Return burst factor to caller along with NULL
1570 	 */
1571 	if ((mp->flags & M_BURST) && (burst != NULL)) {
1572 		*burst = mp->burst_factor;
1573 	}
1574 	/* Reduce burst factor by NSCALE; if it hits 1, disable BURST */
1575 	if ((mp->flags & M_BURST) && (mp->flags & M_BURST_EARLY) &&
1576 	    (burst != NULL)) {
1577 		mp->burst_factor -= M_BURST_NSCALE;
1578 		if (mp->burst_factor <= 1) {
1579 			mp->burst_factor = 1;
1580 			mp->flags &= ~(M_BURST);
1581 			mp->flags &= ~(M_BURST_EARLY);
1582 		}
1583 	}
1584 	return (NULL);
1585 }
1586 
1587 static __inline int
1588 magazine_free(struct magazine *mp, void *p)
1589 {
1590 	if (mp != NULL && MAGAZINE_NOTFULL(mp)) {
1591 		mp->objects[mp->rounds++] = p;
1592 		return 0;
1593 	}
1594 
1595 	return -1;
1596 }
1597 
1598 static void *
1599 mtmagazine_alloc(int zi)
1600 {
1601 	thr_mags *tp;
1602 	struct magazine *mp, *emptymag;
1603 	magazine_depot *d;
1604 	void *obj;
1605 
1606 	/*
1607 	 * Do not try to access per-thread magazines while the mtmagazine
1608 	 * is being initialized or destroyed.
1609 	 */
1610 	tp = &thread_mags;
1611 	if (tp->init < 0)
1612 		return(NULL);
1613 
1614 	/*
1615 	 * Primary per-thread allocation loop
1616 	 */
1617 	for (;;) {
1618 		/*
1619 		 * If the loaded magazine has rounds, allocate and return
1620 		 */
1621 		mp = tp->mags[zi].loaded;
1622 		obj = magazine_alloc(mp, NULL);
1623 		if (obj)
1624 			break;
1625 
1626 		/*
1627 		 * If the prev magazine is full, swap with the loaded
1628 		 * magazine and retry.
1629 		 */
1630 		mp = tp->mags[zi].prev;
1631 		if (mp && MAGAZINE_FULL(mp)) {
1632 			MASSERT(mp->rounds != 0);
1633 			swap_mags(&tp->mags[zi]);	/* prev now empty */
1634 			continue;
1635 		}
1636 
1637 		/*
1638 		 * Try to get a full magazine from the depot.  Cycle
1639 		 * through depot(full)->loaded->prev->depot(empty).
1640 		 * Retry if a full magazine was available from the depot.
1641 		 *
1642 		 * Return NULL (caller will fall through) if no magazines
1643 		 * can be found anywhere.
1644 		 */
1645 		d = &depots[zi];
1646 		depot_lock(d);
1647 		emptymag = tp->mags[zi].prev;
1648 		if (emptymag)
1649 			SLIST_INSERT_HEAD(&d->empty, emptymag, nextmagazine);
1650 		tp->mags[zi].prev = tp->mags[zi].loaded;
1651 		mp = SLIST_FIRST(&d->full);	/* loaded magazine */
1652 		tp->mags[zi].loaded = mp;
1653 		if (mp) {
1654 			SLIST_REMOVE_HEAD(&d->full, nextmagazine);
1655 			MASSERT(MAGAZINE_NOTEMPTY(mp));
1656 			depot_unlock(d);
1657 			continue;
1658 		}
1659 		depot_unlock(d);
1660 		break;
1661 	}
1662 
1663 	return (obj);
1664 }
1665 
1666 static int
1667 mtmagazine_free(int zi, void *ptr)
1668 {
1669 	thr_mags *tp;
1670 	struct magazine *mp, *loadedmag;
1671 	magazine_depot *d;
1672 	int rc = -1;
1673 
1674 	/*
1675 	 * Do not try to access per-thread magazines while the mtmagazine
1676 	 * is being initialized or destroyed.
1677 	 */
1678 	tp = &thread_mags;
1679 	if (tp->init < 0)
1680 		return(-1);
1681 
1682 	/*
1683 	 * Primary per-thread freeing loop
1684 	 */
1685 	for (;;) {
1686 		/*
1687 		 * Make sure a new magazine is available in case we have
1688 		 * to use it.  Staging the newmag allows us to avoid
1689 		 * some locking/reentrancy complexity.
1690 		 *
1691 		 * Temporarily disable the per-thread caches for this
1692 		 * allocation to avoid reentrancy and/or to avoid a
1693 		 * stack overflow if the [zi] happens to be the same that
1694 		 * would be used to allocate the new magazine.
1695 		 */
1696 		if (tp->newmag == NULL) {
1697 			tp->init = -1;
1698 			tp->newmag = _slaballoc(sizeof(struct magazine),
1699 						SAFLAG_ZERO);
1700 			tp->init = 1;
1701 			if (tp->newmag == NULL) {
1702 				rc = -1;
1703 				break;
1704 			}
1705 		}
1706 
1707 		/*
1708 		 * If the loaded magazine has space, free directly to it
1709 		 */
1710 		rc = magazine_free(tp->mags[zi].loaded, ptr);
1711 		if (rc == 0)
1712 			break;
1713 
1714 		/*
1715 		 * If the prev magazine is empty, swap with the loaded
1716 		 * magazine and retry.
1717 		 */
1718 		mp = tp->mags[zi].prev;
1719 		if (mp && MAGAZINE_EMPTY(mp)) {
1720 			MASSERT(mp->rounds == 0);
1721 			swap_mags(&tp->mags[zi]);	/* prev now full */
1722 			continue;
1723 		}
1724 
1725 		/*
1726 		 * Try to get an empty magazine from the depot.  Cycle
1727 		 * through depot(empty)->loaded->prev->depot(full).
1728 		 * Retry if an empty magazine was available from the depot.
1729 		 */
1730 		d = &depots[zi];
1731 		depot_lock(d);
1732 
1733 		if ((loadedmag = tp->mags[zi].prev) != NULL)
1734 			SLIST_INSERT_HEAD(&d->full, loadedmag, nextmagazine);
1735 		tp->mags[zi].prev = tp->mags[zi].loaded;
1736 		mp = SLIST_FIRST(&d->empty);
1737 		if (mp) {
1738 			tp->mags[zi].loaded = mp;
1739 			SLIST_REMOVE_HEAD(&d->empty, nextmagazine);
1740 			MASSERT(MAGAZINE_NOTFULL(mp));
1741 		} else {
1742 			mp = tp->newmag;
1743 			tp->newmag = NULL;
1744 			mp->capacity = M_MAX_ROUNDS;
1745 			mp->rounds = 0;
1746 			mp->flags = 0;
1747 			tp->mags[zi].loaded = mp;
1748 		}
1749 		depot_unlock(d);
1750 	}
1751 
1752 	return rc;
1753 }
1754 
1755 static void
1756 mtmagazine_init(void)
1757 {
1758 	int error;
1759 
1760 	error = _pthread_key_create(&thread_mags_key, mtmagazine_destructor);
1761 	if (error)
1762 		abort();
1763 }
1764 
1765 /*
1766  * This function is only used by the thread exit destructor
1767  */
1768 static void
1769 mtmagazine_drain(struct magazine *mp)
1770 {
1771 	void *obj;
1772 
1773 	while (MAGAZINE_NOTEMPTY(mp)) {
1774 		obj = magazine_alloc(mp, NULL);
1775 		_slabfree(obj, 0, NULL);
1776 	}
1777 }
1778 
1779 /*
1780  * mtmagazine_destructor()
1781  *
1782  * When a thread exits, we reclaim all its resources; all its magazines are
1783  * drained and the structures are freed.
1784  *
1785  * WARNING!  The destructor can be called multiple times if the larger user
 1786  *	     program has its own destructors, run after ours, which
1787  *	     allocate or free memory.
1788  */
1789 static void
1790 mtmagazine_destructor(void *thrp)
1791 {
1792 	thr_mags *tp = thrp;
1793 	struct magazine *mp;
1794 	int i;
1795 
1796 	/*
1797 	 * Prevent further use of mtmagazines while we are destructing
1798 	 * them, as well as for any destructors which are run after us
1799 	 * prior to the thread actually being destroyed.
1800 	 */
1801 	tp->init = -1;
1802 
1803 	for (i = 0; i < NZONES; i++) {
1804 		mp = tp->mags[i].loaded;
1805 		tp->mags[i].loaded = NULL;
1806 		if (mp) {
1807 			if (MAGAZINE_NOTEMPTY(mp))
1808 				mtmagazine_drain(mp);
1809 			_slabfree(mp, 0, NULL);
1810 		}
1811 
1812 		mp = tp->mags[i].prev;
1813 		tp->mags[i].prev = NULL;
1814 		if (mp) {
1815 			if (MAGAZINE_NOTEMPTY(mp))
1816 				mtmagazine_drain(mp);
1817 			_slabfree(mp, 0, NULL);
1818 		}
1819 	}
1820 
1821 	if (tp->newmag) {
1822 		mp = tp->newmag;
1823 		tp->newmag = NULL;
1824 		_slabfree(mp, 0, NULL);
1825 	}
1826 }
1827 
1828 /*
1829  * zone_alloc()
1830  *
1831  * Attempt to allocate a zone from the zone magazine; the zone magazine has
1832  * M_BURST_EARLY enabled, so honor the burst request from the magazine.
1833  */
1834 static slzone_t
1835 zone_alloc(int flags)
1836 {
1837 	slglobaldata_t slgd = &SLGlobalData;
1838 	int burst = 1;
1839 	int i, j;
1840 	slzone_t z;
1841 
1842 	zone_magazine_lock();
1843 	slgd_unlock(slgd);
1844 
1845 	z = magazine_alloc(&zone_magazine, &burst);
1846 	if (z == NULL && burst == 1) {
1847 		zone_magazine_unlock();
1848 		z = _vmem_alloc(ZoneSize * burst, ZoneSize, flags);
1849 	} else if (z == NULL) {
1850 		z = _vmem_alloc(ZoneSize * burst, ZoneSize, flags);
1851 		if (z) {
1852 			for (i = 1; i < burst; i++) {
1853 				j = magazine_free(&zone_magazine,
1854 						  (char *) z + (ZoneSize * i));
1855 				MASSERT(j == 0);
1856 			}
1857 		}
1858 		zone_magazine_unlock();
1859 	} else {
1860 		z->z_Flags |= SLZF_UNOTZEROD;
1861 		zone_magazine_unlock();
1862 	}
1863 	slgd_lock(slgd);
1864 	return z;
1865 }
1866 
1867 /*
1868  * zone_free()
1869  *
1870  * Release a zone and unlock the slgd lock.
1871  */
1872 static void
1873 zone_free(void *z)
1874 {
1875 	slglobaldata_t slgd = &SLGlobalData;
1876 	void *excess[M_ZONE_ROUNDS - M_LOW_ROUNDS] = {};
1877 	int i, j;
1878 
1879 	zone_magazine_lock();
1880 	slgd_unlock(slgd);
1881 
1882 	bzero(z, sizeof(struct slzone));
1883 
1884 	if (opt_madvise)
1885 		madvise(z, ZoneSize, MADV_FREE);
1886 
1887 	i = magazine_free(&zone_magazine, z);
1888 
1889 	/*
1890 	 * If we failed to free, collect excess magazines; release the zone
1891 	 * magazine lock, and then free to the system via _vmem_free. Re-enable
1892 	 * BURST mode for the magazine.
1893 	 */
1894 	if (i == -1) {
1895 		j = zone_magazine.rounds - zone_magazine.low_factor;
1896 		for (i = 0; i < j; i++) {
1897 			excess[i] = magazine_alloc(&zone_magazine, NULL);
 1898 			MASSERT(excess[i] != NULL);
1899 		}
1900 
1901 		zone_magazine_unlock();
1902 
1903 		for (i = 0; i < j; i++)
1904 			_vmem_free(excess[i], ZoneSize);
1905 
1906 		_vmem_free(z, ZoneSize);
1907 	} else {
1908 		zone_magazine_unlock();
1909 	}
1910 }
1911 
1912 /*
1913  * _vmem_alloc()
1914  *
1915  *	Directly map memory in PAGE_SIZE'd chunks with the specified
1916  *	alignment.
1917  *
1918  *	Alignment must be a multiple of PAGE_SIZE.
1919  *
1920  *	Size must be >= alignment.
1921  */
1922 static void *
1923 _vmem_alloc(size_t size, size_t align, int flags)
1924 {
1925 	char *addr;
1926 	char *save;
1927 	size_t excess;
1928 
1929 	/*
1930 	 * Map anonymous private memory.
1931 	 */
1932 	addr = mmap(NULL, size, PROT_READ|PROT_WRITE,
1933 		    MAP_PRIVATE|MAP_ANON, -1, 0);
1934 	if (addr == MAP_FAILED)
1935 		return(NULL);
1936 
1937 	/*
1938 	 * Check alignment.  The misaligned offset is also the excess
1939 	 * amount.  If misaligned unmap the excess so we have a chance of
1940 	 * mapping at the next alignment point and recursively try again.
1941 	 *
1942 	 * BBBBBBBBBBB BBBBBBBBBBB BBBBBBBBBBB	block alignment
1943 	 *   aaaaaaaaa aaaaaaaaaaa aa		mis-aligned allocation
1944 	 *   xxxxxxxxx				final excess calculation
1945 	 *   ^ returned address
1946 	 */
1947 	excess = (uintptr_t)addr & (align - 1);
1948 
1949 	if (excess) {
1950 		excess = align - excess;
1951 		save = addr;
1952 
1953 		munmap(save + excess, size - excess);
1954 		addr = _vmem_alloc(size, align, flags);
1955 		munmap(save, excess);
1956 	}
1957 	return((void *)addr);
1958 }
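
     /*
      * Alignment example: for a 64KB request aligned to 64KB, mmap may
      * return an address 16KB past a 64KB boundary.  excess then becomes
      * 64KB - 16KB = 48KB; the 16KB tail from the next boundary onward is
      * unmapped (freeing that range for the recursive attempt) and the
      * 48KB leading fragment is unmapped once the recursion returns.
      */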
1959 
1960 /*
1961  * _vmem_free()
1962  *
1963  *	Free a chunk of memory allocated with _vmem_alloc()
1964  */
1965 static void
1966 _vmem_free(void *ptr, size_t size)
1967 {
1968 	munmap(ptr, size);
1969 }
1970 
1971 /*
1972  * Panic on fatal conditions
1973  */
1974 static void
1975 _mpanic(const char *ctl, ...)
1976 {
1977 	va_list va;
1978 
1979 	if (malloc_panic == 0) {
1980 		malloc_panic = 1;
1981 		va_start(va, ctl);
1982 		vfprintf(stderr, ctl, va);
1983 		fprintf(stderr, "\n");
1984 		fflush(stderr);
1985 		va_end(va);
1986 	}
1987 	abort();
1988 }
1989 
1990 __weak_reference(__aligned_alloc, aligned_alloc);
1991 __weak_reference(__malloc, malloc);
1992 __weak_reference(__calloc, calloc);
1993 __weak_reference(__posix_memalign, posix_memalign);
1994 __weak_reference(__realloc, realloc);
1995 __weak_reference(__free, free);
1996