xref: /dflybsd-src/lib/libc/stdlib/dmalloc.c (revision 171835807871f68c36f75ff84e1d6f7df4683df0)
1  /*
2   * DMALLOC.C	- Dillon's malloc
3   *
4   * Copyright (c) 2011,2017 The DragonFly Project. All rights reserved.
5   *
6   * This code is derived from software contributed to The DragonFly Project
7   * by Matthew Dillon <dillon@backplane.com>.
8   *
9   * Redistribution and use in source and binary forms, with or without
10   * modification, are permitted provided that the following conditions
11   * are met:
12   *
13   * 1. Redistributions of source code must retain the above copyright
14   *    notice, this list of conditions and the following disclaimer.
15   * 2. Redistributions in binary form must reproduce the above copyright
16   *    notice, this list of conditions and the following disclaimer in
17   *    the documentation and/or other materials provided with the
18   *    distribution.
19   * 3. Neither the name of The DragonFly Project nor the names of its
20   *    contributors may be used to endorse or promote products derived
21   *    from this software without specific, prior written permission.
22   *
23   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24   * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25   * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26   * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
27   * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28   * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
29   * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31   * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
33   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34   * SUCH DAMAGE.
35   */
36  /*
37   * This module implements a modified slab allocator as a drop-in replacement
38   * for the libc malloc().  The slab algorithm has been adjusted to support
39   * dynamic sizing of slabs which effectively allows slabs to be used for
 40   * allocations of any size.  Because of this we have neither a small-block
 41   * allocator nor a big-block allocator, and the code paths are simplified.
42   *
 43   * To support dynamic slab sizing, available user virtual memory is broken
 44   * down into ~1024 regions.  Each region has a fixed slab size whose value is
45   * set when the region is opened up for use.  The free() path simply applies
46   * a mask based on the region to the pointer to acquire the base of the
47   * governing slab structure.
48   *
49   * Regions[NREGIONS]	(1024)
50   *
51   * Slab management and locking is done on a per-zone basis.
52   *
53   *	Alloc Size	Chunking        Number of zones
54   *	0-127		8		16
55   *	128-255		16		8
56   *	256-511		32		8
57   *	512-1023	64		8
58   *	1024-2047	128		8
59   *	2048-4095	256		8
60   *	4096-8191	512		8
61   *	8192-16383	1024		8
62   *	16384-32767	2048		8
63   *	32768-65535	4096		8
64   *	... continues forever ...	4 zones
65   *
66   *	For a 2^63 memory space each doubling >= 64K is broken down into
67   *	4 chunking zones, so we support 88 + (48 * 4) = 280 zones.
68   *
69   *			   API FEATURES AND SIDE EFFECTS
70   *
71   *    + power-of-2 sized allocations up to a page will be power-of-2 aligned.
 72   *	Above that, power-of-2 sized allocations are page-aligned.  Non
73   *	power-of-2 sized allocations are aligned the same as the chunk
74   *	size for their zone.
75   *    + ability to allocate arbitrarily large chunks of memory
76   *    + realloc will reuse the passed pointer if possible, within the
77   *	limitations of the zone chunking.
78   *
79   * On top of the slab allocator we also implement a 16-entry-per-thread
80   * magazine cache for allocations <= NOMSLABSIZE.
81   *
82   *				FUTURE FEATURES
83   *
84   *    + [better] garbage collection
85   *    + better initial sizing.
86   *
87   * TUNING
88   *
89   * The value of the environment variable MALLOC_OPTIONS is a character string
 90   * containing various flags to tune nmalloc.  Upper case letters enable
 91   * or increase a feature, lower case letters disable or decrease it.
92   *
93   * U		Enable UTRACE for all operations, observable with ktrace.
 94   *		Disabled by default.
95   *
96   * Z		Zero out allocations, otherwise allocations (except for
97   *		calloc) will contain garbage.
98   *		Disabled by default.
99   *
100   * H		Pass a hint with madvise() about unused pages.
101   *		Disabled by default.
102   *		Not currently implemented.
103   *
104   * F		Disable local per-thread caching.
105   *		Disabled by default.
106   *
107   * C		Increase (decrease) how much excess cache to retain.
108   *		Set to 4 by default.
109   */
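
/*
 * Example (illustrative; exact ktrace/kdump invocations may vary by
 * system): enable utrace records and zeroed allocations for one run of a
 * program and then dump the records:
 *
 *	$ env MALLOC_OPTIONS=UZ ktrace -t u ./prog
 *	$ kdump
 */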
110  
111  /* cc -shared -fPIC -g -O -I/usr/src/lib/libc/include -o dmalloc.so dmalloc.c */
112  
113  #ifndef STANDALONE_DEBUG
114  #include "libc_private.h"
115  #endif
116  
117  #include <sys/param.h>
118  #include <sys/types.h>
119  #include <sys/mman.h>
120  #include <sys/queue.h>
121  #include <sys/ktrace.h>
122  #include <stdio.h>
123  #include <stdint.h>
124  #include <stdlib.h>
125  #include <stdarg.h>
126  #include <stddef.h>
127  #include <unistd.h>
128  #include <string.h>
129  #include <fcntl.h>
130  #include <errno.h>
131  #include <pthread.h>
132  #include <limits.h>
133  
134  #include <machine/atomic.h>
135  #include <machine/cpufunc.h>
136  
137  #ifdef STANDALONE_DEBUG
138  void _nmalloc_thr_init(void);
139  #else
140  #include "spinlock.h"
141  #include "un-namespace.h"
142  #endif
143  
144  #ifndef MAP_SIZEALIGN
145  #define MAP_SIZEALIGN	0
146  #endif
147  
148  #if SSIZE_MAX == 0x7FFFFFFF
149  #define ADDRBITS	32
150  #define UVM_BITS	32	/* worst case */
151  #else
152  #define ADDRBITS	64
153  #define UVM_BITS	48	/* worst case XXX */
154  #endif
155  
156  #if LONG_MAX == 0x7FFFFFFF
157  #define LONG_BITS	32
158  #define LONG_BITS_SHIFT	5
159  #else
160  #define LONG_BITS	64
161  #define LONG_BITS_SHIFT	6
162  #endif
163  
164  #define LOCKEDPTR	((void *)(intptr_t)-1)
165  
166  /*
167   * Regions[]
168   */
169  #define NREGIONS_BITS	10
170  #define NREGIONS	(1 << NREGIONS_BITS)
171  #define NREGIONS_MASK	(NREGIONS - 1)
172  #define NREGIONS_SHIFT	(UVM_BITS - NREGIONS_BITS)
173  #define NREGIONS_SIZE	(1LU << NREGIONS_SHIFT)
174  
175  typedef struct region *region_t;
176  typedef struct slglobaldata *slglobaldata_t;
177  typedef struct slab *slab_t;
178  
179  struct region {
180  	uintptr_t	mask;
181  	slab_t		slab;	/* conditional out of band slab */
182  };
183  
184  static struct region Regions[NREGIONS];
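
/*
 * Illustrative sketch only, mirroring the lookup performed by memfree()
 * and memrealloc() below: given an arbitrary pointer the governing region
 * is indexed by the upper address bits, and the slab header is either out
 * of band (region->slab) or obtained by masking the pointer:
 *
 *	region_t r = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) &
 *			      NREGIONS_MASK];
 *	slab_t   s = r->slab ? r->slab : (slab_t)((uintptr_t)ptr & r->mask);
 */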
185  
186  /*
187   * Number of chunking zones available
188   */
189  #define CHUNKFACTOR	8
190  #if ADDRBITS == 32
191  #define NZONES		(16 + 9 * CHUNKFACTOR + 16 * CHUNKFACTOR)
192  #else
193  #define NZONES		(16 + 9 * CHUNKFACTOR + 48 * CHUNKFACTOR)
194  #endif
195  
196  static int MaxChunks[NZONES];
197  
198  #define NDEPOTS		8		/* must be power of 2 */
199  
200  /*
201   * Maximum number of chunks per slab, governed by the allocation bitmap in
202   * each slab.  The maximum is reduced for large chunk sizes.
203   */
204  #define MAXCHUNKS	(LONG_BITS * LONG_BITS)
205  #define MAXCHUNKS_BITS	(LONG_BITS_SHIFT * LONG_BITS_SHIFT)
206  #define LITSLABSIZE	(32 * 1024)
207  #define NOMSLABSIZE	(2 * 1024 * 1024)
208  #define BIGSLABSIZE	(128 * 1024 * 1024)
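
/*
 * Worked example (LP64, illustrative): LONG_BITS is 64, so MAXCHUNKS is
 * 64 * 64 = 4096 chunks per slab and the per-slab bitmap[] spans 64 longs
 * (512 bytes).  LITSLABSIZE, NOMSLABSIZE and BIGSLABSIZE are the ~32KB,
 * ~2MB and ~128MB sizing targets applied when slabs are created.
 */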
209  
210  #define ZALLOC_SLAB_MAGIC	0x736c6162	/* magic sanity */
211  
212  TAILQ_HEAD(slab_list, slab);
213  
214  /*
215   * A slab structure
216   */
217  struct slab {
218  	struct slab	*next;		/* slabs with available space */
219  	TAILQ_ENTRY(slab) entry;
220  	int32_t		magic;		/* magic number for sanity check */
221  	u_int		navail;		/* number of free elements available */
222  	u_int		nmax;
223  	u_int		free_bit;	/* free hint bitno */
224  	u_int		free_index;	/* free hint index */
225  	u_long		bitmap[LONG_BITS]; /* free chunks */
226  	size_t		slab_size;	/* size of entire slab */
227  	size_t		chunk_size;	/* chunk size for validation */
228  	int		zone_index;
229  	enum { UNKNOWN, AVAIL, EMPTY, FULL } state;
230  	int		flags;
231  	region_t	region;		/* related region */
232  	char		*chunks;	/* chunk base */
233  	slglobaldata_t	slgd;		/* localized to thread else NULL */
234  };
235  
236  /*
237   * per-thread data + global depot
238   *
239   * NOTE: The magazine shortcut is only used for per-thread data.
240   */
241  #define NMAGSHORTCUT	16
242  
243  struct slglobaldata {
244  	spinlock_t	lock;		/* only used by slglobaldepot */
245  	struct zoneinfo {
246  		slab_t	avail_base;
247  		slab_t	empty_base;
248  		int	best_region;
249  		int	mag_index;
250  		int	avail_count;
251  		int	empty_count;
252  		void	*mag_shortcut[NMAGSHORTCUT];
253  	} zone[NZONES];
254  	struct slab_list full_zones;	/* via entry */
255  	int		masked;
256  	int		biggest_index;
257  	size_t		nslabs;
258  };
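
/*
 * Illustrative sketch only: mag_shortcut[] acts as a tiny per-thread LIFO,
 * mirroring the fast paths in memalloc() and memfree() below:
 *
 *	push (free):	if (zinfo->mag_index < NMAGSHORTCUT)
 *				zinfo->mag_shortcut[zinfo->mag_index++] = ptr;
 *	pop (alloc):	if (zinfo->mag_index != 0)
 *				obj = zinfo->mag_shortcut[--zinfo->mag_index];
 */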
259  
260  #define SLAB_ZEROD		0x0001
261  
262  /*
263   * Misc constants.  Note that allocations that are exact multiples of
264   * PAGE_SIZE, or exceed the zone limit, fall through to the kmem module.
265   * IN_SAME_PAGE_MASK is used to sanity-check the per-page free lists.
266   */
267  #define MIN_CHUNK_SIZE		8		/* in bytes */
268  #define MIN_CHUNK_MASK		(MIN_CHUNK_SIZE - 1)
269  
270  #define SAFLAG_ZERO	0x00000001
271  
272  /*
273   * The WEIRD_ADDR is used as known text to copy into free objects to
274   * try to create deterministic failure cases if the data is accessed after
275   * free.
276   *
277   * WARNING: A limited number of spinlocks are available, BIGXSIZE should
 278   *	    not be larger than 64.
279   */
280  #ifdef INVARIANTS
281  #define WEIRD_ADDR      0xdeadc0de
282  #endif
283  
284  /*
285   * Thread control
286   */
287  
288  #define MASSERT(exp)	do { if (__predict_false(!(exp)))	\
289  				_mpanic("assertion: %s in %s",	\
290  				#exp, __func__);		\
291  			    } while (0)
292  
293  /*
 294   * With this attribute set, no function call is required to access
 295   * this variable when the code is compiled -fPIC.
296   *
297   * Must be empty for libc_rtld (similar to __thread)
298   */
299  #if defined(__LIBC_RTLD)
300  #define TLS_ATTRIBUTE
301  #else
302  #define TLS_ATTRIBUTE __attribute__ ((tls_model ("initial-exec")));
303  #endif
304  
305  static __thread struct slglobaldata slglobal TLS_ATTRIBUTE;
306  static pthread_key_t thread_malloc_key;
307  static pthread_once_t thread_malloc_once = PTHREAD_ONCE_INIT;
308  static struct slglobaldata slglobaldepot;
309  
310  static int opt_madvise = 0;
311  static int opt_free = 0;
312  static int opt_cache = 4;
313  static int opt_utrace = 0;
314  static int g_malloc_flags = 0;
315  static int malloc_panic;
316  
317  #ifdef INVARIANTS
318  static const int32_t weirdary[16] = {
319  	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
320  	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
321  	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
322  	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR
323  };
324  #endif
325  
326  static void *memalloc(size_t size, int flags);
327  static void *memrealloc(void *ptr, size_t size);
328  static void memfree(void *ptr, int);
329  static int memalign(void **memptr, size_t alignment, size_t size);
330  static slab_t slaballoc(int zi, size_t chunking, size_t chunk_size);
331  static void slabfree(slab_t slab);
332  static void slabterm(slglobaldata_t slgd, slab_t slab);
333  static void *_vmem_alloc(int ri, size_t slab_size);
334  static void _vmem_free(void *ptr, size_t slab_size);
335  static void _mpanic(const char *ctl, ...) __printflike(1, 2);
336  #ifndef STANDALONE_DEBUG
337  static void malloc_init(void) __constructor(101);
338  #else
339  static void malloc_init(void) __constructor(101);
340  #endif
341  
342  
343  struct nmalloc_utrace {
344  	void *p;
345  	size_t s;
346  	void *r;
347  };
348  
349  #define UTRACE(a, b, c)						\
350  	if (opt_utrace) {					\
351  		struct nmalloc_utrace ut = {			\
352  			.p = (a),				\
353  			.s = (b),				\
354  			.r = (c)				\
355  		};						\
356  		utrace(&ut, sizeof(ut));			\
357  	}
358  
359  #ifdef INVARIANTS
360  /*
361   * If enabled any memory allocated without M_ZERO is initialized to -1.
362   */
363  static int  use_malloc_pattern;
364  #endif
365  
366  static void
367  malloc_init(void)
368  {
369  	const char *p = NULL;
370  
371  	TAILQ_INIT(&slglobal.full_zones);
372  
373  	Regions[0].mask = -1; /* disallow activity in lowest region */
374  
375  	if (issetugid() == 0)
376  		p = getenv("MALLOC_OPTIONS");
377  
378  	for (; p != NULL && *p != '\0'; p++) {
379  		switch(*p) {
380  		case 'u':
381  			opt_utrace = 0;
382  			break;
383  		case 'U':
384  			opt_utrace = 1;
385  			break;
386  		case 'h':
387  			opt_madvise = 0;
388  			break;
389  		case 'H':
390  			opt_madvise = 1;
391  			break;
392  		case 'c':
393  			if (opt_cache > 0)
394  				--opt_cache;
395  			break;
396  		case 'C':
397  			++opt_cache;
398  			break;
399  		case 'f':
400  			opt_free = 0;
401  			break;
402  		case 'F':
403  			opt_free = 1;
404  			break;
405  		case 'z':
406  			g_malloc_flags = 0;
407  			break;
408  		case 'Z':
409  			g_malloc_flags = SAFLAG_ZERO;
410  			break;
411  		default:
412  			break;
413  		}
414  	}
415  
416  	UTRACE((void *) -1, 0, NULL);
417  }
418  
419  /*
420   * We have to install a handler for nmalloc thread teardowns when
421   * the thread is created.  We cannot delay this because destructors in
422   * sophisticated userland programs can call malloc() for the first time
423   * during their thread exit.
424   *
425   * This routine is called directly from pthreads.
426   */
427  static void _nmalloc_thr_init_once(void);
428  static void _nmalloc_thr_destructor(void *thrp);
429  
430  void
431  _nmalloc_thr_init(void)
432  {
433  	static int did_init;
434  
435  	TAILQ_INIT(&slglobal.full_zones);
436  
437  	if (slglobal.masked)
438  		return;
439  
440  	slglobal.masked = 1;
441  	if (did_init == 0) {
442  		did_init = 1;
443  		pthread_once(&thread_malloc_once, _nmalloc_thr_init_once);
444  	}
445  	pthread_setspecific(thread_malloc_key, &slglobal);
446  	slglobal.masked = 0;
447  }
448  
449  void
450  _nmalloc_thr_prepfork(void)
451  {
452  	if (__isthreaded)
453  		_SPINLOCK(&slglobaldepot.lock);
454  }
455  
456  void
457  _nmalloc_thr_parentfork(void)
458  {
459  	if (__isthreaded)
460  		_SPINUNLOCK(&slglobaldepot.lock);
461  }
462  
463  void
464  _nmalloc_thr_childfork(void)
465  {
466  	if (__isthreaded)
467  		_SPINUNLOCK(&slglobaldepot.lock);
468  }
469  
470  /*
471   * Called just once
472   */
473  static void
474  _nmalloc_thr_init_once(void)
475  {
476  	/* ignore error from stub if not threaded */
477  	pthread_key_create(&thread_malloc_key, _nmalloc_thr_destructor);
478  }
479  
480  /*
481   * Called for each thread undergoing exit
482   *
483   * Move all of the thread's slabs into a depot.
484   */
485  static void
486  _nmalloc_thr_destructor(void *thrp)
487  {
488  	slglobaldata_t slgd = thrp;
489  	struct zoneinfo *zinfo;
490  	slab_t slab;
491  	void *ptr;
492  	int i;
493  	int j;
494  
495  	slgd->masked = 1;
496  
497  	for (i = 0; i <= slgd->biggest_index; i++) {
498  		zinfo = &slgd->zone[i];
499  
500  		while ((j = zinfo->mag_index) > 0) {
501  			--j;
502  			ptr = zinfo->mag_shortcut[j];
503  			zinfo->mag_shortcut[j] = NULL;	/* SAFETY */
504  			zinfo->mag_index = j;
505  			memfree(ptr, 0);
506  		}
507  
508  		while ((slab = zinfo->empty_base) != NULL) {
509  			zinfo->empty_base = slab->next;
510  			--zinfo->empty_count;
511  			slabterm(slgd, slab);
512  		}
513  
514  		while ((slab = zinfo->avail_base) != NULL) {
515  			zinfo->avail_base = slab->next;
516  			--zinfo->avail_count;
517  			slabterm(slgd, slab);
518  		}
519  
520  		while ((slab = TAILQ_FIRST(&slgd->full_zones)) != NULL) {
521  			TAILQ_REMOVE(&slgd->full_zones, slab, entry);
522  			slabterm(slgd, slab);
523  		}
524  	}
525  }
526  
527  /*
528   * Calculate the zone index for the allocation request size and set the
529   * allocation request size to that particular zone's chunk size.
530   *
531   * Minimum alignment is 16 bytes for allocations >= 16 bytes to conform
532   * with malloc requirements for intel/amd.
533   */
534  static __inline int
535  zoneindex(size_t *bytes, size_t *chunking)
536  {
537  	size_t n = (size_t)*bytes;
538  	size_t x;
539  	size_t c;
540  	int i;
541  
542  	if (n < 128) {
543  		if (n < 16) {
544  			*bytes = n = (n + 7) & ~7;
545  			*chunking = 8;
546  			return(n / 8 - 1);	/* 8 byte chunks, 2 zones */
547  		} else {
548  			*bytes = n = (n + 15) & ~15;
549  			*chunking = 16;
550  			return(n / 16 + 2);	/* 16 byte chunks, 8 zones */
551  		}
552  	}
553  	if (n < 4096) {
554  		x = 256;
555  		c = x / (CHUNKFACTOR * 2);
556  		i = 16;
557  	} else {
558  		x = 8192;
559  		c = x / (CHUNKFACTOR * 2);
560  		i = 16 + CHUNKFACTOR * 5;  /* 256->512,1024,2048,4096,8192 */
561  	}
562  	while (n >= x) {
563  		x <<= 1;
564  		c <<= 1;
565  		i += CHUNKFACTOR;
566  		if (x == 0)
567  			_mpanic("slaballoc: byte value too high");
568  	}
569  	*bytes = n = roundup2(n, c);
570  	*chunking = c;
571  	return (i + n / c - CHUNKFACTOR);
572  #if 0
573  	*bytes = n = (n + c - 1) & ~(c - 1);
574  	*chunking = c;
575  	return (n / c + i);
576  
577  	if (n < 256) {
578  		*bytes = n = (n + 15) & ~15;
579  		*chunking = 16;
580  		return(n / (CHUNKINGLO*2) + CHUNKINGLO*1 - 1);
581  	}
582  	if (n < 8192) {
583  		if (n < 512) {
584  			*bytes = n = (n + 31) & ~31;
585  			*chunking = 32;
586  			return(n / (CHUNKINGLO*4) + CHUNKINGLO*2 - 1);
587  		}
588  		if (n < 1024) {
589  			*bytes = n = (n + 63) & ~63;
590  			*chunking = 64;
591  			return(n / (CHUNKINGLO*8) + CHUNKINGLO*3 - 1);
592  		}
593  		if (n < 2048) {
594  			*bytes = n = (n + 127) & ~127;
595  			*chunking = 128;
596  			return(n / (CHUNKINGLO*16) + CHUNKINGLO*4 - 1);
597  		}
598  		if (n < 4096) {
599  			*bytes = n = (n + 255) & ~255;
600  			*chunking = 256;
601  			return(n / (CHUNKINGLO*32) + CHUNKINGLO*5 - 1);
602  		}
603  		*bytes = n = (n + 511) & ~511;
604  		*chunking = 512;
605  		return(n / (CHUNKINGLO*64) + CHUNKINGLO*6 - 1);
606  	}
607  	if (n < 16384) {
608  		*bytes = n = (n + 1023) & ~1023;
609  		*chunking = 1024;
610  		return(n / (CHUNKINGLO*128) + CHUNKINGLO*7 - 1);
611  	}
612  	if (n < 32768) {				/* 16384-32767 */
613  		*bytes = n = (n + 2047) & ~2047;
614  		*chunking = 2048;
615  		return(n / (CHUNKINGLO*256) + CHUNKINGLO*8 - 1);
616  	}
617  	if (n < 65536) {
618  		*bytes = n = (n + 4095) & ~4095;	/* 32768-65535 */
619  		*chunking = 4096;
620  		return(n / (CHUNKINGLO*512) + CHUNKINGLO*9 - 1);
621  	}
622  
623  	x = 131072;
624  	c = 8192;
625  	i = CHUNKINGLO*10 - 1;
626  
627  	while (n >= x) {
628  		x <<= 1;
629  		c <<= 1;
630  		i += CHUNKINGHI;
631  		if (x == 0)
632  			_mpanic("slaballoc: byte value too high");
633  	}
634  	*bytes = n = (n + c - 1) & ~(c - 1);
635  	*chunking = c;
636  	return (n / c + i);
637  #endif
638  }
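
/*
 * Worked examples for zoneindex() (illustrative): a 100 byte request is
 * rounded up to 112 with 16 byte chunking and maps to zone 9; a 3000 byte
 * request takes the n < 4096 branch, doubles x up to 4096 (chunking 256,
 * i = 48), is rounded up to 3072, and maps to zone 48 + 3072/256 - 8 = 52.
 */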
639  
640  /*
641   * malloc() - call internal slab allocator
642   */
643  void *
644  __malloc(size_t size)
645  {
646  	void *ptr;
647  
648  	ptr = memalloc(size, 0);
649  	if (ptr == NULL)
650  		errno = ENOMEM;
651  	else
652  		UTRACE(0, size, ptr);
653  	return(ptr);
654  }
655  
656  /*
657   * calloc() - call internal slab allocator
658   */
659  void *
660  __calloc(size_t number, size_t size)
661  {
662  	void *ptr;
663  
664  	ptr = memalloc(number * size, SAFLAG_ZERO);
665  	if (ptr == NULL)
666  		errno = ENOMEM;
667  	else
668  		UTRACE(0, number * size, ptr);
669  	return(ptr);
670  }
671  
672  /*
673   * realloc() (SLAB ALLOCATOR)
674   *
675   * We do not attempt to optimize this routine beyond reusing the same
676   * pointer if the new size fits within the chunking of the old pointer's
677   * zone.
678   */
679  void *
680  __realloc(void *ptr, size_t size)
681  {
682  	void *ret;
683  
684  	if (ptr == NULL)
685  		ret = memalloc(size, 0);
686  	else
687  		ret = memrealloc(ptr, size);
688  	if (ret == NULL)
689  		errno = ENOMEM;
690  	else
691  		UTRACE(ptr, size, ret);
692  	return(ret);
693  }
694  
695  /*
696   * aligned_alloc()
697   *
 698   * Allocate (size) bytes with an alignment of (alignment).
699   */
700  void *
701  __aligned_alloc(size_t alignment, size_t size)
702  {
703  	void *ptr;
704  	int rc;
705  
706  	ptr = NULL;
707  	rc = memalign(&ptr, alignment, size);
708  	if (rc)
709  		errno = rc;
710  
711  	return (ptr);
712  }
713  
714  /*
715   * posix_memalign()
716   *
 717   * Allocate (size) bytes with an alignment of (alignment), where (alignment)
718   * is a power of 2 >= sizeof(void *).
719   */
720  int
721  __posix_memalign(void **memptr, size_t alignment, size_t size)
722  {
723  	int rc;
724  
725  	/*
726  	 * OpenGroup spec issue 6 check
727  	 */
728  	if (alignment < sizeof(void *)) {
729  		*memptr = NULL;
730  		return(EINVAL);
731  	}
732  
733  	rc = memalign(memptr, alignment, size);
734  
735  	return (rc);
736  }
737  
738  /*
739   * The slab allocator will allocate on power-of-2 boundaries up to at least
740   * PAGE_SIZE.  Otherwise we use the zoneindex mechanic to find a zone
741   * matching the requirements.
742   */
743  static int
744  memalign(void **memptr, size_t alignment, size_t size)
745  {
746  
747  	if (alignment < 1) {
748  		*memptr = NULL;
749  		return(EINVAL);
750  	}
751  
752  	/*
 753  	 * OpenGroup spec issue 6 check: alignment must be a power of 2
754  	 */
755  	if ((alignment | (alignment - 1)) + 1 != (alignment << 1)) {
756  		*memptr = NULL;
757  		return(EINVAL);
758  	}
759  
760  	/*
761  	 * XXX for now just find the nearest power of 2 >= size and also
762  	 * >= alignment and allocate that.
763  	 */
764  	while (alignment < size) {
765  		alignment <<= 1;
766  		if (alignment == 0)
767  			_mpanic("posix_memalign: byte value too high");
768  	}
769  	*memptr = memalloc(alignment, 0);
770  	return(*memptr ? 0 : ENOMEM);
771  }
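
/*
 * Usage sketch (illustrative): with the power-of-2 strategy above a request
 * such as
 *
 *	void *p = NULL;
 *	if (posix_memalign(&p, 64, 1000) == 0) {
 *		...		(use p, then release it)
 *		free(p);
 *	}
 *
 * grows the alignment to the next power of 2 >= size and is therefore
 * satisfied by a 1024 byte power-of-2 allocation.
 */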
772  
773  /*
774   * free() (SLAB ALLOCATOR) - do the obvious
775   */
776  void
777  __free(void *ptr)
778  {
779  	if (ptr) {
780  		UTRACE(ptr, 0, 0);
781  		memfree(ptr, 0);
782  	}
783  }
784  
785  /*
786   * memalloc()	(SLAB ALLOCATOR)
787   *
788   *	Allocate memory via the slab allocator.
789   */
790  static void *
791  memalloc(size_t size, int flags)
792  {
793  	slglobaldata_t slgd;
794  	struct zoneinfo *zinfo;
795  	slab_t slab;
796  	size_t chunking;
797  	int bmi;
798  	int bno;
799  	u_long *bmp;
800  	int zi;
801  #ifdef INVARIANTS
802  	int i;
803  #endif
804  	int j;
805  	char *obj;
806  
807  	/*
 808  	 * If 0 bytes is requested we still have to return a unique pointer,
 809  	 * so allocate at least one byte.
810  	 */
811  	if (size == 0)
812  		size = 1;
813  
814  	/* Capture global flags */
815  	flags |= g_malloc_flags;
816  
817  	/* Compute allocation zone; zoneindex will panic on excessive sizes */
818  	zi = zoneindex(&size, &chunking);
819  	MASSERT(zi < NZONES);
820  	if (size == 0)
821  		return(NULL);
822  
823  	/*
824  	 * Try magazine shortcut first
825  	 */
826  	slgd = &slglobal;
827  	zinfo = &slgd->zone[zi];
828  
829  	if ((j = zinfo->mag_index) != 0) {
830  		zinfo->mag_index = --j;
831  		obj = zinfo->mag_shortcut[j];
832  		zinfo->mag_shortcut[j] = NULL;	/* SAFETY */
833  		if (flags & SAFLAG_ZERO)
834  			bzero(obj, size);
835  		return obj;
836  	}
837  
838  	/*
839  	 * Locate a slab with available space.  If no slabs are available
 840  	 * back off to the empty list and, if we still come up dry, allocate
841  	 * a new slab (which will try the depot first).
842  	 */
843  retry:
844  	if ((slab = zinfo->avail_base) == NULL) {
845  		if ((slab = zinfo->empty_base) == NULL) {
846  			/*
847  			 * Still dry
848  			 */
849  			slab = slaballoc(zi, chunking, size);
850  			if (slab == NULL)
851  				return(NULL);
852  			slab->next = zinfo->avail_base;
853  			zinfo->avail_base = slab;
854  			++zinfo->avail_count;
855  			slab->state = AVAIL;
856  			if (slgd->biggest_index < zi)
857  				slgd->biggest_index = zi;
858  			++slgd->nslabs;
859  		} else {
860  			/*
861  			 * Pulled from empty list
862  			 */
863  			zinfo->empty_base = slab->next;
864  			slab->next = zinfo->avail_base;
865  			zinfo->avail_base = slab;
866  			++zinfo->avail_count;
867  			slab->state = AVAIL;
868  			--zinfo->empty_count;
869  		}
870  	}
871  
872  	/*
873  	 * Allocate a chunk out of the slab.  HOT PATH
874  	 *
875  	 * Only the thread owning the slab can allocate out of it.
876  	 *
877  	 * NOTE: The last bit in the bitmap is always marked allocated so
878  	 *	 we cannot overflow here.
879  	 */
880  	bno = slab->free_bit;
881  	bmi = slab->free_index;
882  	bmp = &slab->bitmap[bmi];
883  	if (*bmp & (1LU << bno)) {
884  		atomic_clear_long(bmp, 1LU << bno);
885  		obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) * size;
886  		slab->free_bit = (bno + 1) & (LONG_BITS - 1);
887  		atomic_add_int(&slab->navail, -1);
888  		if (flags & SAFLAG_ZERO)
889  			bzero(obj, size);
890  		return (obj);
891  	}
892  
893  	/*
894  	 * Allocate a chunk out of a slab.  COLD PATH
895  	 */
896  	if (slab->navail == 0) {
897  		zinfo->avail_base = slab->next;
898  		--zinfo->avail_count;
899  		slab->state = FULL;
900  		TAILQ_INSERT_TAIL(&slgd->full_zones, slab, entry);
901  		goto retry;
902  	}
903  
904  	while (bmi < LONG_BITS) {
905  		bmp = &slab->bitmap[bmi];
906  		if (*bmp) {
907  			bno = bsflong(*bmp);
908  			atomic_clear_long(bmp, 1LU << bno);
909  			obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) *
910  					     size;
911  			slab->free_index = bmi;
912  			slab->free_bit = (bno + 1) & (LONG_BITS - 1);
913  			atomic_add_int(&slab->navail, -1);
914  			if (flags & SAFLAG_ZERO)
915  				bzero(obj, size);
916  			return (obj);
917  		}
918  		++bmi;
919  	}
920  	bmi = 0;
921  	while (bmi < LONG_BITS) {
922  		bmp = &slab->bitmap[bmi];
923  		if (*bmp) {
924  			bno = bsflong(*bmp);
925  			atomic_clear_long(bmp, 1LU << bno);
926  			obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) *
927  					     size;
928  			slab->free_index = bmi;
929  			slab->free_bit = (bno + 1) & (LONG_BITS - 1);
930  			atomic_add_int(&slab->navail, -1);
931  			if (flags & SAFLAG_ZERO)
932  				bzero(obj, size);
933  			return (obj);
934  		}
935  		++bmi;
936  	}
937  	_mpanic("slaballoc: corrupted zone: navail %d", slab->navail);
938  	/* not reached */
939  	return NULL;
940  }
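
/*
 * Worked example for the bitmap math above (illustrative): with 64-bit
 * longs, free_index bmi = 2 and free bit bno = 5 select chunk number
 * (2 << 6) + 5 = 133, so the returned object is chunks + 133 * size.
 */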
941  
942  /*
943   * Reallocate memory within the chunk
944   */
945  static void *
946  memrealloc(void *ptr, size_t nsize)
947  {
948  	region_t region;
949  	slab_t slab;
950  	size_t osize;
951  	char *obj;
952  	int flags = 0;
953  
954  	/*
 955  	 * If 0 bytes is requested we still have to return a unique pointer,
 956  	 * so allocate at least one byte.
957  	 */
958  	if (nsize == 0)
959  		nsize = 1;
960  
961  	/* Capture global flags */
962  	flags |= g_malloc_flags;
963  
964  	/*
965  	 * Locate the zone by looking up the dynamic slab size mask based
966  	 * on the memory region the allocation resides in.
967  	 */
968  	region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK];
969  	if ((slab = region->slab) == NULL)
970  		slab = (void *)((uintptr_t)ptr & region->mask);
971  	MASSERT(slab->magic == ZALLOC_SLAB_MAGIC);
972  	osize = slab->chunk_size;
973  	if (nsize <= osize) {
974  		if (osize < 32 || nsize >= osize / 2) {
975  			obj = ptr;
976  			if ((flags & SAFLAG_ZERO) && nsize < osize)
977  				bzero(obj + nsize, osize - nsize);
978  			return(obj);
979  		}
980  	}
981  
982  	/*
983  	 * Otherwise resize the object
984  	 */
985  	obj = memalloc(nsize, 0);
986  	if (obj) {
987  		if (nsize > osize)
988  			nsize = osize;
989  		bcopy(ptr, obj, nsize);
990  		memfree(ptr, 0);
991  	}
992  	return (obj);
993  }
994  
995  /*
996   * free (SLAB ALLOCATOR)
997   *
998   * Free a memory block previously allocated by malloc.
999   *
1000   * MPSAFE
1001   */
1002  static void
1003  memfree(void *ptr, int flags)
1004  {
1005  	region_t region;
1006  	slglobaldata_t slgd;
1007  	slab_t slab;
1008  	slab_t stmp;
1009  	slab_t *slabp;
1010  	int bmi;
1011  	int bno;
1012  	int j;
1013  	u_long *bmp;
1014  
1015  	/*
1016  	 * Locate the zone by looking up the dynamic slab size mask based
1017  	 * on the memory region the allocation resides in.
1018  	 *
1019  	 * WARNING!  The slab may be owned by another thread!
1020  	 */
1021  	region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK];
1022  	if ((slab = region->slab) == NULL)
1023  		slab = (void *)((uintptr_t)ptr & region->mask);
1024  	MASSERT(slab != NULL);
1025  	MASSERT(slab->magic == ZALLOC_SLAB_MAGIC);
1026  
1027  #ifdef INVARIANTS
1028  	/*
1029  	 * Put weird data into the memory to detect modifications after
1030  	 * freeing, illegal pointer use after freeing (we should fault on
1031  	 * the odd address), and so forth.
1032  	 */
1033  	if (slab->chunk_size < sizeof(weirdary))
1034  		bcopy(weirdary, ptr, slab->chunk_size);
1035  	else
1036  		bcopy(weirdary, ptr, sizeof(weirdary));
1037  #endif
1038  	slgd = &slglobal;
1039  
1040  	/*
1041  	 * Use mag_shortcut[] when possible
1042  	 */
1043  	if (slgd->masked == 0 && slab->chunk_size <= NOMSLABSIZE) {
1044  		struct zoneinfo *zinfo;
1045  
1046  		zinfo = &slgd->zone[slab->zone_index];
1047  		j = zinfo->mag_index;
1048  		if (j < NMAGSHORTCUT) {
1049  			zinfo->mag_shortcut[j] = ptr;
1050  			zinfo->mag_index = j + 1;
1051  			return;
1052  		}
1053  	}
1054  
1055  	/*
1056  	 * Free to slab and increment navail.  We can delay incrementing
1057  	 * navail to prevent the slab from being destroyed out from under
1058  	 * us while we do other optimizations.
1059  	 */
1060  	bno = ((uintptr_t)ptr - (uintptr_t)slab->chunks) / slab->chunk_size;
1061  	bmi = bno >> LONG_BITS_SHIFT;
1062  	bno &= (LONG_BITS - 1);
1063  	bmp = &slab->bitmap[bmi];
1064  
1065  	MASSERT(bmi >= 0 && bmi < slab->nmax);
1066  	MASSERT((*bmp & (1LU << bno)) == 0);
1067  	atomic_set_long(bmp, 1LU << bno);
1068  
1069  	if (slab->slgd == slgd) {
1070  		/*
1071  		 * We can only do the following if we own the slab.  Note
1072  		 * that navail can be incremented by any thread even if
1073  		 * we own the slab.
1074  		 */
1075  		struct zoneinfo *zinfo;
1076  
1077  		atomic_add_int(&slab->navail, 1);
1078  		if (slab->free_index > bmi) {
1079  			slab->free_index = bmi;
1080  			slab->free_bit = bno;
1081  		} else if (slab->free_index == bmi &&
1082  			   slab->free_bit > bno) {
1083  			slab->free_bit = bno;
1084  		}
1085  		zinfo = &slgd->zone[slab->zone_index];
1086  
1087  		/*
1088  		 * Freeing an object from a full slab makes it less than
1089  		 * full.  The slab must be moved to the available list.
1090  		 *
1091  		 * If the available list has too many slabs, release some
1092  		 * to the depot.
1093  		 */
1094  		if (slab->state == FULL) {
1095  			TAILQ_REMOVE(&slgd->full_zones, slab, entry);
1096  			slab->state = AVAIL;
1097  			stmp = zinfo->avail_base;
1098  			slab->next = stmp;
1099  			zinfo->avail_base = slab;
1100  			++zinfo->avail_count;
1101  			while (zinfo->avail_count > opt_cache) {
1102  				slab = zinfo->avail_base;
1103  				zinfo->avail_base = slab->next;
1104  				--zinfo->avail_count;
1105  				slabterm(slgd, slab);
1106  			}
1107  			goto done;
1108  		}
1109  
1110  		/*
1111  		 * If the slab becomes completely empty dispose of it in
1112  		 * some manner.  By default each thread caches up to 4
1113  		 * empty slabs.  Only small slabs are cached.
1114  		 */
1115  		if (slab->navail == slab->nmax && slab->state == AVAIL) {
1116  			/*
1117  			 * Remove slab from available queue
1118  			 */
1119  			slabp = &zinfo->avail_base;
1120  			while ((stmp = *slabp) != slab)
1121  				slabp = &stmp->next;
1122  			*slabp = slab->next;
1123  			--zinfo->avail_count;
1124  
1125  			if (opt_free || opt_cache == 0) {
1126  				/*
1127  				 * If local caching is disabled cache the
1128  				 * slab in the depot (or free it).
1129  				 */
1130  				slabterm(slgd, slab);
1131  			} else if (slab->slab_size > BIGSLABSIZE) {
1132  				/*
1133  				 * We do not try to retain large slabs
1134  				 * in per-thread caches.
1135  				 */
1136  				slabterm(slgd, slab);
1137  			} else if (zinfo->empty_count > opt_cache) {
1138  				/*
1139  				 * We have too many slabs cached, but
1140  				 * instead of freeing this one free
1141  				 * an empty slab that's been idle longer.
1142  				 *
1143  				 * (empty_count does not change)
1144  				 */
1145  				stmp = zinfo->empty_base;
1146  				slab->state = EMPTY;
1147  				slab->next = stmp->next;
1148  				zinfo->empty_base = slab;
1149  				slabterm(slgd, stmp);
1150  			} else {
1151  				/*
1152  				 * Cache the empty slab in our thread local
1153  				 * empty list.
1154  				 */
1155  				++zinfo->empty_count;
1156  				slab->state = EMPTY;
1157  				slab->next = zinfo->empty_base;
1158  				zinfo->empty_base = slab;
1159  			}
1160  		}
1161  	} else if (slab->slgd == NULL && slab->navail + 1 == slab->nmax) {
1162  		slglobaldata_t sldepot;
1163  
1164  		/*
1165  		 * If freeing to a slab owned by the global depot, and
1166  		 * the slab becomes completely EMPTY, try to move it to
1167  		 * the correct list.
1168  		 */
1169  		sldepot = &slglobaldepot;
1170  		if (__isthreaded)
1171  			_SPINLOCK(&sldepot->lock);
1172  		if (slab->slgd == NULL && slab->navail + 1 == slab->nmax) {
1173  			struct zoneinfo *zinfo;
1174  
1175  			/*
1176  			 * Move the slab to the empty list
1177  			 */
1178  			MASSERT(slab->state == AVAIL);
1179  			atomic_add_int(&slab->navail, 1);
1180  			zinfo = &sldepot->zone[slab->zone_index];
1181  			slabp = &zinfo->avail_base;
1182  			while (slab != *slabp)
1183  				slabp = &(*slabp)->next;
1184  			*slabp = slab->next;
1185  			--zinfo->avail_count;
1186  
1187  			/*
1188  			 * Clean out excessive empty entries from the
1189  			 * depot.
1190  			 */
1191  			slab->state = EMPTY;
1192  			slab->next = zinfo->empty_base;
1193  			zinfo->empty_base = slab;
1194  			++zinfo->empty_count;
1195  			while (zinfo->empty_count > opt_cache) {
1196  				slab = zinfo->empty_base;
1197  				zinfo->empty_base = slab->next;
1198  				--zinfo->empty_count;
1199  				slab->state = UNKNOWN;
1200  				if (__isthreaded)
1201  					_SPINUNLOCK(&sldepot->lock);
1202  				slabfree(slab);
1203  				if (__isthreaded)
1204  					_SPINLOCK(&sldepot->lock);
1205  			}
1206  		} else {
1207  			atomic_add_int(&slab->navail, 1);
1208  		}
1209  		if (__isthreaded)
1210  			_SPINUNLOCK(&sldepot->lock);
1211  	} else {
1212  		/*
1213  		 * We can't act on the slab other than by adjusting navail
1214  		 * (and the bitmap which we did in the common code at the
1215  		 * top).
1216  		 */
1217  		atomic_add_int(&slab->navail, 1);
1218  	}
1219  done:
1220  	;
1221  }
1222  
1223  /*
1224   * Allocate a new slab holding objects of size chunk_size.
1225   */
1226  static slab_t
1227  slaballoc(int zi, size_t chunking, size_t chunk_size)
1228  {
1229  	slglobaldata_t slgd;
1230  	slglobaldata_t sldepot;
1231  	struct zoneinfo *zinfo;
1232  	region_t region;
1233  	void *save;
1234  	slab_t slab;
1235  	size_t slab_desire;
1236  	size_t slab_size;
1237  	size_t region_mask;
1238  	uintptr_t chunk_offset;
1239  	ssize_t maxchunks;
1240  	ssize_t tmpchunks;
1241  	int ispower2;
1242  	int power;
1243  	int ri;
1244  	int rx;
1245  	int nswath;
1246  	int j;
1247  
1248  	/*
1249  	 * First look in the depot.  Any given zone in the depot may be
1250  	 * locked by being set to -1.  We have to do this instead of simply
1251  	 * removing the entire chain because removing the entire chain can
1252  	 * cause racing threads to allocate local slabs for large objects,
1253  	 * resulting in a large VSZ.
1254  	 */
1255  	slgd = &slglobal;
1256  	sldepot = &slglobaldepot;
1257  	zinfo = &sldepot->zone[zi];
1258  
1259  	if (zinfo->avail_base) {
1260  		if (__isthreaded)
1261  			_SPINLOCK(&sldepot->lock);
1262  		slab = zinfo->avail_base;
1263  		if (slab) {
1264  			MASSERT(slab->slgd == NULL);
1265  			slab->slgd = slgd;
1266  			zinfo->avail_base = slab->next;
1267  			--zinfo->avail_count;
1268  			if (__isthreaded)
1269  				_SPINUNLOCK(&sldepot->lock);
1270  			return slab;
1271  		}
1272  		if (__isthreaded)
1273  			_SPINUNLOCK(&sldepot->lock);
1274  	}
1275  
1276  	/*
1277  	 * Nothing in the depot, allocate a new slab by locating or assigning
1278  	 * a region and then using the system virtual memory allocator.
1279  	 */
1280  	slab = NULL;
1281  
1282  	/*
1283  	 * Calculate the start of the data chunks relative to the start
1284  	 * of the slab.  If chunk_size is a power of 2 we guarantee
1285  	 * power of 2 alignment.  If it is not we guarantee alignment
1286  	 * to the chunk size.
1287  	 */
1288  	if ((chunk_size ^ (chunk_size - 1)) == (chunk_size << 1) - 1) {
1289  		ispower2 = 1;
1290  		chunk_offset = roundup2(sizeof(*slab), chunk_size);
1291  	} else {
1292  		ispower2 = 0;
1293  		chunk_offset = sizeof(*slab) + chunking - 1;
1294  		chunk_offset -= chunk_offset % chunking;
1295  	}
1296  
1297  	/*
1298  	 * Calculate a reasonable number of chunks for the slab.
1299  	 *
1300  	 * Once initialized the MaxChunks[] array can only ever be
1301  	 * reinitialized to the same value.
1302  	 */
1303  	maxchunks = MaxChunks[zi];
1304  	if (maxchunks == 0) {
1305  		/*
1306  		 * First calculate how many chunks would fit in 1/1024
1307  		 * available memory.  This is around 2MB on a 32 bit
1308  		 * system and 128G on a 64-bit (48-bits available) system.
1309  		 */
1310  		maxchunks = (ssize_t)(NREGIONS_SIZE - chunk_offset) /
1311  			    (ssize_t)chunk_size;
1312  		if (maxchunks <= 0)
1313  			maxchunks = 1;
1314  
1315  		/*
1316  		 * A slab cannot handle more than MAXCHUNKS chunks, but
1317  		 * limit us to approximately MAXCHUNKS / 2 here because
1318  		 * we may have to expand maxchunks when we calculate the
1319  		 * actual power-of-2 slab.
1320  		 */
1321  		if (maxchunks > MAXCHUNKS / 2)
1322  			maxchunks = MAXCHUNKS / 2;
1323  
1324  		/*
1325  		 * Try to limit the slabs to BIGSLABSIZE (~128MB).  Larger
1326  		 * slabs will be created if the allocation does not fit.
1327  		 */
1328  		if (chunk_offset + chunk_size * maxchunks > BIGSLABSIZE) {
1329  			tmpchunks = (ssize_t)(BIGSLABSIZE - chunk_offset) /
1330  				    (ssize_t)chunk_size;
1331  			if (tmpchunks <= 0)
1332  				tmpchunks = 1;
1333  			if (maxchunks > tmpchunks)
1334  				maxchunks = tmpchunks;
1335  		}
1336  
1337  		/*
1338  		 * If the slab calculates to greater than 2MB see if we
1339  		 * can cut it down to ~2MB.  This controls VSZ but has
1340  		 * no effect on run-time size or performance.
1341  		 *
1342  		 * This is very important in case you core dump and also
1343  		 * important to reduce unnecessary region allocations.
1344  		 */
1345  		if (chunk_offset + chunk_size * maxchunks > NOMSLABSIZE) {
1346  			tmpchunks = (ssize_t)(NOMSLABSIZE - chunk_offset) /
1347  				    (ssize_t)chunk_size;
1348  			if (tmpchunks < 1)
1349  				tmpchunks = 1;
1350  			if (maxchunks > tmpchunks)
1351  				maxchunks = tmpchunks;
1352  		}
1353  
1354  		/*
1355  		 * If the slab calculates to greater than 128K see if we
1356  		 * can cut it down to ~128K while still maintaining a
1357  		 * reasonably large number of chunks in each slab.  This
1358  		 * controls VSZ but has no effect on run-time size or
1359  		 * performance.
1360  		 *
1361  		 * This is very important in case you core dump and also
1362  		 * important to reduce unnecessary region allocations.
1363  		 */
1364  		if (chunk_offset + chunk_size * maxchunks > LITSLABSIZE) {
1365  			tmpchunks = (ssize_t)(LITSLABSIZE - chunk_offset) /
1366  				    (ssize_t)chunk_size;
1367  			if (tmpchunks < 32)
1368  				tmpchunks = 32;
1369  			if (maxchunks > tmpchunks)
1370  				maxchunks = tmpchunks;
1371  		}
1372  
1373  		MaxChunks[zi] = maxchunks;
1374  	}
1375  	MASSERT(maxchunks > 0 && maxchunks <= MAXCHUNKS);
1376  
1377  	/*
1378  	 * Calculate the actual slab size.  maxchunks will be recalculated
1379  	 * a little later.
1380  	 */
1381  	slab_desire = chunk_offset + chunk_size * maxchunks;
1382  	slab_size = 8 * MAXCHUNKS;
1383  	power = 3 + MAXCHUNKS_BITS;
1384  	while (slab_size < slab_desire) {
1385  		slab_size <<= 1;
1386  		++power;
1387  	}
1388  
1389  	/*
1390  	 * Do a quick recalculation based on the actual slab size but not
1391  	 * yet dealing with whether the slab header is in-band or out-of-band.
1392  	 * The purpose here is to see if we can reasonably reduce slab_size
1393  	 * to a power of 4 to allow more chunk sizes to use the same slab
1394  	 * size.
1395  	 */
1396  	if ((power & 1) && slab_size > 32768) {
1397  		maxchunks = (slab_size - chunk_offset) / chunk_size;
1398  		if (maxchunks >= MAXCHUNKS / 8) {
1399  			slab_size >>= 1;
1400  			--power;
1401  		}
1402  	}
1403  	if ((power & 2) && slab_size > 32768 * 4) {
1404  		maxchunks = (slab_size - chunk_offset) / chunk_size;
1405  		if (maxchunks >= MAXCHUNKS / 4) {
1406  			slab_size >>= 2;
1407  			power -= 2;
1408  		}
1409  	}
1410  	/*
1411  	 * This case occurs when the slab_size is larger than 1/1024 available
1412  	 * UVM.
1413  	 */
1414  	nswath = slab_size / NREGIONS_SIZE;
1415  	if (nswath > NREGIONS)
1416  		return (NULL);
1417  
1418  
1419  	/*
1420  	 * Try to allocate from our current best region for this zi
1421  	 */
1422  	region_mask = ~(slab_size - 1);
1423  	ri = slgd->zone[zi].best_region;
1424  	if (Regions[ri].mask == region_mask) {
1425  		if ((slab = _vmem_alloc(ri, slab_size)) != NULL)
1426  			goto found;
1427  	}
1428  
1429  	/*
1430  	 * Try to find an existing region to allocate from.  The normal
1431  	 * case will be for allocations that are less than 1/1024 available
1432  	 * UVM, which fit into a single Regions[] entry.
1433  	 */
1434  	while (slab_size <= NREGIONS_SIZE) {
1435  		rx = -1;
1436  		for (ri = 0; ri < NREGIONS; ++ri) {
1437  			if (rx < 0 && Regions[ri].mask == 0)
1438  				rx = ri;
1439  			if (Regions[ri].mask == region_mask) {
1440  				slab = _vmem_alloc(ri, slab_size);
1441  				if (slab) {
1442  					slgd->zone[zi].best_region = ri;
1443  					goto found;
1444  				}
1445  			}
1446  		}
1447  
1448  		if (rx < 0)
1449  			return(NULL);
1450  
1451  		/*
1452  		 * This can fail, retry either way
1453  		 */
1454  		atomic_cmpset_ptr((void **)&Regions[rx].mask,
1455  				  NULL,
1456  				  (void *)region_mask);
1457  	}
1458  
1459  	for (;;) {
1460  		rx = -1;
1461  		for (ri = 0; ri < NREGIONS; ri += nswath) {
1462  			if (Regions[ri].mask == region_mask) {
1463  				slab = _vmem_alloc(ri, slab_size);
1464  				if (slab) {
1465  					slgd->zone[zi].best_region = ri;
1466  					goto found;
1467  				}
1468  			}
1469  			if (rx < 0) {
1470  				for (j = nswath - 1; j >= 0; --j) {
1471  					if (Regions[ri+j].mask != 0)
1472  						break;
1473  				}
1474  				if (j < 0)
1475  					rx = ri;
1476  			}
1477  		}
1478  
1479  		/*
1480  		 * We found a candidate, try to allocate it backwards so
1481  		 * another thread racing a slaballoc() does not see the
1482  		 * mask in the base index position until we are done.
1483  		 *
1484  		 * We can safely zero-out any partial allocations because
1485  		 * the mask is only accessed from the base index.  Any other
1486  		 * threads racing us will fail prior to us clearing the mask.
1487  		 */
1488  		if (rx < 0)
1489  			return(NULL);
1490  		for (j = nswath - 1; j >= 0; --j) {
1491  			if (!atomic_cmpset_ptr((void **)&Regions[rx+j].mask,
1492  					       NULL, (void *)region_mask)) {
1493  				while (++j < nswath)
1494  					Regions[rx+j].mask = 0;
1495  				break;
1496  			}
1497  		}
1498  		/* retry */
1499  	}
1500  
1501  	/*
1502  	 * Fill in the new slab in region ri.  If the slab_size completely
1503  	 * fills one or more region slots we move the slab structure out of
1504  	 * band which should optimize the chunking (particularly for a power
1505  	 * of 2).
1506  	 */
1507  found:
1508  	region = &Regions[ri];
1509  	MASSERT(region->slab == NULL);
1510  	if (slab_size >= NREGIONS_SIZE) {
1511  		save = slab;
1512  		slab = memalloc(sizeof(*slab), 0);
1513  		bzero(slab, sizeof(*slab));
1514  		slab->chunks = save;
1515  		for (j = 0; j < nswath; ++j)
1516  			region[j].slab = slab;
1517  		chunk_offset = 0;
1518  	} else {
1519  		bzero(slab, sizeof(*slab));
1520  		slab->chunks = (char *)slab + chunk_offset;
1521  	}
1522  
1523  	/*
1524  	 * Calculate the start of the chunks memory and recalculate the
1525  	 * actual number of chunks the slab can hold.
1526  	 */
1527  	maxchunks = (slab_size - chunk_offset) / chunk_size;
1528  	if (maxchunks > MAXCHUNKS)
1529  		maxchunks = MAXCHUNKS;
1530  
1531  	/*
1532  	 * And fill in the rest
1533  	 */
1534  	slab->magic = ZALLOC_SLAB_MAGIC;
1535  	slab->navail = maxchunks;
1536  	slab->nmax = maxchunks;
1537  	slab->slab_size = slab_size;
1538  	slab->chunk_size = chunk_size;
1539  	slab->zone_index = zi;
1540  	slab->slgd = slgd;
1541  	slab->state = UNKNOWN;
1542  	slab->region = region;
1543  
1544  	for (ri = 0; ri < maxchunks; ri += LONG_BITS) {
1545  		if (ri + LONG_BITS <= maxchunks)
1546  			slab->bitmap[ri >> LONG_BITS_SHIFT] = ULONG_MAX;
1547  		else
1548  			slab->bitmap[ri >> LONG_BITS_SHIFT] =
1549  						(1LU << (maxchunks - ri)) - 1;
1550  	}
1551  	return (slab);
1552  }
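
/*
 * Illustrative example of the bitmap initialization above: for
 * maxchunks = 100 with 64-bit longs, bitmap[0] becomes ULONG_MAX (64 free
 * chunks) and bitmap[1] becomes (1LU << 36) - 1 for the remaining 36.
 */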
1553  
1554  /*
1555   * Free a slab.
1556   */
1557  static void
1558  slabfree(slab_t slab)
1559  {
1560  	int nswath;
1561  	int j;
1562  
1563  	if (slab->region->slab == slab) {
1564  		/*
1565  		 * Out-of-band slab.
1566  		 */
1567  		nswath = slab->slab_size / NREGIONS_SIZE;
1568  		for (j = 0; j < nswath; ++j)
1569  			slab->region[j].slab = NULL;
1570  		slab->magic = 0;
1571  		_vmem_free(slab->chunks, slab->slab_size);
1572  		memfree(slab, 0);
1573  	} else {
1574  		/*
1575  		 * In-band slab.
1576  		 */
1577  		slab->magic = 0;
1578  		_vmem_free(slab, slab->slab_size);
1579  	}
1580  }
1581  
1582  /*
1583   * Terminate a slab's use in the current thread.  The slab may still have
1584   * outstanding allocations and thus not be deallocatable.
1585   */
1586  static void
1587  slabterm(slglobaldata_t slgd, slab_t slab)
1588  {
1589  	slglobaldata_t sldepot;
1590  	struct zoneinfo *zinfo;
1591  	int zi = slab->zone_index;
1592  
1593  	slab->slgd = NULL;
1594  	--slgd->nslabs;
1595  	sldepot = &slglobaldepot;
1596  	zinfo = &sldepot->zone[zi];
1597  
1598  	/*
1599  	 * Move the slab to the avail list or the empty list.
1600  	 */
1601  	if (__isthreaded)
1602  		_SPINLOCK(&sldepot->lock);
1603  	if (slab->navail == slab->nmax) {
1604  		slab->state = EMPTY;
1605  		slab->next = zinfo->empty_base;
1606  		zinfo->empty_base = slab;
1607  		++zinfo->empty_count;
1608  	} else {
1609  		slab->state = AVAIL;
1610  		slab->next = zinfo->avail_base;
1611  		zinfo->avail_base = slab;
1612  		++zinfo->avail_count;
1613  	}
1614  
1615  	/*
1616  	 * Clean extra slabs out of the empty list
1617  	 */
1618  	while (zinfo->empty_count > opt_cache) {
1619  		slab = zinfo->empty_base;
1620  		zinfo->empty_base = slab->next;
1621  		--zinfo->empty_count;
1622  		slab->state = UNKNOWN;
1623  		if (__isthreaded)
1624  			_SPINUNLOCK(&sldepot->lock);
1625  		slabfree(slab);
1626  		if (__isthreaded)
1627  			_SPINLOCK(&sldepot->lock);
1628  	}
1629  	if (__isthreaded)
1630  		_SPINUNLOCK(&sldepot->lock);
1631  }
1632  
1633  /*
1634   * _vmem_alloc()
1635   *
1636   *	Directly map memory in PAGE_SIZE'd chunks with the specified
1637   *	alignment.
1638   *
1639   *	Alignment must be a multiple of PAGE_SIZE.
1640   *
1641   *	Size must be >= alignment.
1642   */
1643  static void *
1644  _vmem_alloc(int ri, size_t slab_size)
1645  {
1646  	char *baddr = (void *)((uintptr_t)ri << NREGIONS_SHIFT);
1647  	char *eaddr;
1648  	char *addr;
1649  	char *save;
1650  	uintptr_t excess;
1651  
1652  	if (slab_size < NREGIONS_SIZE)
1653  		eaddr = baddr + NREGIONS_SIZE;
1654  	else
1655  		eaddr = baddr + slab_size;
1656  
1657  	/*
1658  	 * This usually just works but might not.
1659  	 */
1660  	addr = mmap(baddr, slab_size, PROT_READ|PROT_WRITE,
1661  		    MAP_PRIVATE | MAP_ANON | MAP_SIZEALIGN, -1, 0);
1662  	if (addr == MAP_FAILED) {
1663  		return (NULL);
1664  	}
1665  	if (addr < baddr || addr + slab_size > eaddr) {
1666  		munmap(addr, slab_size);
1667  		return (NULL);
1668  	}
1669  
1670  	/*
1671  	 * Check alignment.  The misaligned offset is also the excess
1672  	 * amount.  If misaligned unmap the excess so we have a chance of
1673  	 * mapping at the next alignment point and recursively try again.
1674  	 *
1675  	 * BBBBBBBBBBB BBBBBBBBBBB BBBBBBBBBBB	block alignment
1676  	 *   aaaaaaaaa aaaaaaaaaaa aa		mis-aligned allocation
1677  	 *   xxxxxxxxx				final excess calculation
1678  	 *   ^ returned address
1679  	 */
1680  	excess = (uintptr_t)addr & (slab_size - 1);
1681  	while (excess) {
1682  		excess = slab_size - excess;
1683  		save = addr;
1684  
1685  		munmap(save + excess, slab_size - excess);
1686  		addr = _vmem_alloc(ri, slab_size);
1687  		munmap(save, excess);
1688  		if (addr == NULL)
1689  			return (NULL);
1690  		if (addr < baddr || addr + slab_size > eaddr) {
1691  			munmap(addr, slab_size);
1692  			return (NULL);
1693  		}
1694  		excess = (uintptr_t)addr & (slab_size - 1);
1695  	}
1696  	return (addr);
1697  }
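
/*
 * Worked example for the alignment fixup above (illustrative): with a
 * 128KB slab_size, if mmap() returns base + 0x8000 the misalignment is
 * 0x8000 and excess becomes 0x20000 - 0x8000 = 0x18000; the 0x8000 byte
 * tail beyond the next alignment point is unmapped, the allocation is
 * retried, and the 0x18000 byte misaligned head is then released.
 */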
1698  
1699  /*
1700   * _vmem_free()
1701   *
1702   *	Free a chunk of memory allocated with _vmem_alloc()
1703   */
1704  static void
1705  _vmem_free(void *ptr, size_t size)
1706  {
1707  	munmap(ptr, size);
1708  }
1709  
1710  /*
1711   * Panic on fatal conditions
1712   */
1713  static void
1714  _mpanic(const char *ctl, ...)
1715  {
1716  	va_list va;
1717  
1718  	if (malloc_panic == 0) {
1719  		malloc_panic = 1;
1720  		va_start(va, ctl);
1721  		vfprintf(stderr, ctl, va);
1722  		fprintf(stderr, "\n");
1723  		fflush(stderr);
1724  		va_end(va);
1725  	}
1726  	abort();
1727  }
1728  
1729  __weak_reference(__aligned_alloc, aligned_alloc);
1730  __weak_reference(__malloc, malloc);
1731  __weak_reference(__calloc, calloc);
1732  __weak_reference(__posix_memalign, posix_memalign);
1733  __weak_reference(__realloc, realloc);
1734  __weak_reference(__free, free);
1735