/*
 * DMALLOC.C	- Dillon's malloc
 *
 * Copyright (c) 2011,2017 The DragonFly Project. All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This module implements a modified slab allocator as a drop-in replacement
 * for the libc malloc().  The slab algorithm has been adjusted to support
 * dynamic sizing of slabs which effectively allows slabs to be used for
 * allocations of any size.  Because of this we have neither a small-block
 * allocator nor a big-block allocator and the code paths are simplified.
 *
 * To support dynamic slab sizing available user virtual memory is broken
 * down into ~1024 regions.  Each region has a fixed slab size whose value
 * is set when the region is opened up for use.  The free() path simply
 * applies a mask based on the region to the pointer to acquire the base
 * of the governing slab structure.
 *
 * Regions[NREGIONS]	(1024)
 *
 * Slab management and locking is done on a per-zone basis.
 *
 *	Alloc Size	Chunking	Number of zones
 *	0-127		8		16
 *	128-255		16		8
 *	256-511		32		8
 *	512-1023	64		8
 *	1024-2047	128		8
 *	2048-4095	256		8
 *	4096-8191	512		8
 *	8192-16383	1024		8
 *	16384-32767	2048		8
 *	32768-65535	4096		8
 *	... continues forever ...	4 zones
 *
 *	For a 2^63 memory space each doubling >= 64K is broken down into
 *	4 chunking zones, so we support 88 + (48 * 4) = 280 zones.
 *
 * API FEATURES AND SIDE EFFECTS
 *
 *    + power-of-2 sized allocations up to a page will be power-of-2 aligned.
 *	Above that power-of-2 sized allocations are page-aligned.  Non
 *	power-of-2 sized allocations are aligned the same as the chunk
 *	size for their zone.
 *    + ability to allocate arbitrarily large chunks of memory
 *    + realloc will reuse the passed pointer if possible, within the
 *	limitations of the zone chunking.
 *
 * On top of the slab allocator we also implement a 16-entry-per-thread
 * magazine cache for allocations <= NOMSLABSIZE.
 *
 * FUTURE FEATURES
 *
 *	+ [better] garbage collection
 *	+ better initial sizing.
 *
 * TUNING
 *
 * The value of the environment variable MALLOC_OPTIONS is a character string
 * containing various flags to tune nmalloc.  Upper case letters enable or
 * increase a feature, lower case letters disable or decrease it.
 *
 * U		Enable UTRACE for all operations, observable with ktrace.
 *		Disabled by default.
 *
 * Z		Zero out allocations, otherwise allocations (except for
 *		calloc) will contain garbage.
 *		Disabled by default.
 *
 * H		Pass a hint with madvise() about unused pages.
 *		Disabled by default.
 *		Not currently implemented.
 *
 * F		Disable local per-thread caching.
 *		Disabled by default.
 *
 * C		Increase (decrease) how much excess cache to retain.
 *		Set to 4 by default.
 */

/* cc -shared -fPIC -g -O -I/usr/src/lib/libc/include -o dmalloc.so dmalloc.c */

#ifndef STANDALONE_DEBUG
#include "libc_private.h"
#endif

#include <sys/param.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/queue.h>
#include <sys/ktrace.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stddef.h>
#include <unistd.h>
#include <string.h>
#include <fcntl.h>
#include <errno.h>
#include <pthread.h>
#include <limits.h>

#include <machine/atomic.h>
#include <machine/cpufunc.h>

#ifdef STANDALONE_DEBUG
void _nmalloc_thr_init(void);
#else
#include "spinlock.h"
#include "un-namespace.h"
#endif

#ifndef MAP_SIZEALIGN
#define MAP_SIZEALIGN	0
#endif

#if SSIZE_MAX == 0x7FFFFFFF
#define ADDRBITS	32
#define UVM_BITS	32	/* worst case */
#else
#define ADDRBITS	64
#define UVM_BITS	48	/* worst case XXX */
#endif

#if LONG_MAX == 0x7FFFFFFF
#define LONG_BITS	32
#define LONG_BITS_SHIFT	5
#else
#define LONG_BITS	64
#define LONG_BITS_SHIFT	6
#endif

#define LOCKEDPTR	((void *)(intptr_t)-1)

/*
 * Regions[]
 */
#define NREGIONS_BITS	10
#define NREGIONS	(1 << NREGIONS_BITS)
#define NREGIONS_MASK	(NREGIONS - 1)
#define NREGIONS_SHIFT	(UVM_BITS - NREGIONS_BITS)
#define NREGIONS_SIZE	(1LU << NREGIONS_SHIFT)

typedef struct region *region_t;
typedef struct slglobaldata *slglobaldata_t;
typedef struct slab *slab_t;

struct region {
	uintptr_t	mask;
	slab_t		slab;	/* conditional out of band slab */
};

static struct region Regions[NREGIONS];

/*
 * Number of chunking zones available
 */
#define CHUNKFACTOR	8
#if ADDRBITS == 32
#define NZONES		(16 + 9 * CHUNKFACTOR + 16 * CHUNKFACTOR)
#else
#define NZONES		(16 + 9 * CHUNKFACTOR + 48 * CHUNKFACTOR)
#endif

static int MaxChunks[NZONES];

#define NDEPOTS		8		/* must be power of 2 */
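
/*
 * Illustrative sketch (not part of the allocator, kept under #if 0 so it
 * is never compiled): how a pointer is mapped back to its governing slab.
 * This mirrors the lookup performed in memfree() and memrealloc() below;
 * the helper name ptr2slab() is hypothetical.  Out-of-band slabs are
 * recorded directly in the region, in-band slabs are found by masking the
 * pointer with the region's slab-size mask.
 */
#if 0
static slab_t
ptr2slab(void *ptr)
{
	region_t region;
	slab_t slab;

	region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK];
	if ((slab = region->slab) == NULL)
		slab = (slab_t)((uintptr_t)ptr & region->mask);
	return (slab);
}
#endif
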
/*
 * Maximum number of chunks per slab, governed by the allocation bitmap in
 * each slab.  The maximum is reduced for large chunk sizes.
 */
#define MAXCHUNKS	(LONG_BITS * LONG_BITS)
#define MAXCHUNKS_BITS	(LONG_BITS_SHIFT * LONG_BITS_SHIFT)
#define LITSLABSIZE	(32 * 1024)
#define NOMSLABSIZE	(2 * 1024 * 1024)
#define BIGSLABSIZE	(128 * 1024 * 1024)

#define ZALLOC_SLAB_MAGIC	0x736c6162	/* magic sanity */

TAILQ_HEAD(slab_list, slab);

/*
 * A slab structure
 */
struct slab {
	struct slab	*next;		/* slabs with available space */
	TAILQ_ENTRY(slab) entry;
	int32_t		magic;		/* magic number for sanity check */
	u_int		navail;		/* number of free elements available */
	u_int		nmax;
	u_int		free_bit;	/* free hint bitno */
	u_int		free_index;	/* free hint index */
	u_long		bitmap[LONG_BITS]; /* free chunks */
	size_t		slab_size;	/* size of entire slab */
	size_t		chunk_size;	/* chunk size for validation */
	int		zone_index;
	enum { UNKNOWN, AVAIL, EMPTY, FULL } state;
	int		flags;
	region_t	region;		/* related region */
	char		*chunks;	/* chunk base */
	slglobaldata_t	slgd;		/* localized to thread else NULL */
};

/*
 * per-thread data + global depot
 *
 * NOTE: The magazine shortcut is only used for per-thread data.
 */
#define NMAGSHORTCUT	16

struct slglobaldata {
	spinlock_t	lock;		/* only used by slglobaldepot */
	struct zoneinfo {
		slab_t	avail_base;
		slab_t	empty_base;
		int	best_region;
		int	mag_index;
		int	avail_count;
		int	empty_count;
		void	*mag_shortcut[NMAGSHORTCUT];
	} zone[NZONES];
	struct slab_list full_zones;	/* via entry */
	int		masked;
	int		biggest_index;
	size_t		nslabs;
};

#define SLAB_ZEROD	0x0001

/*
 * Misc constants.  Note that allocations that are exact multiples of
 * PAGE_SIZE, or exceed the zone limit, fall through to the kmem module.
 * IN_SAME_PAGE_MASK is used to sanity-check the per-page free lists.
 */
#define MIN_CHUNK_SIZE		8		/* in bytes */
#define MIN_CHUNK_MASK		(MIN_CHUNK_SIZE - 1)

#define SAFLAG_ZERO	0x00000001

/*
 * The WEIRD_ADDR is used as known text to copy into free objects to
 * try to create deterministic failure cases if the data is accessed after
 * free.
 *
 * WARNING: A limited number of spinlocks are available, BIGXSIZE should
 *	    not be larger than 64.
 */
#ifdef INVARIANTS
#define WEIRD_ADDR	0xdeadc0de
#endif

/*
 * Thread control
 */

#define MASSERT(exp)	do { if (__predict_false(!(exp)))	\
				_mpanic("assertion: %s in %s",	\
				#exp, __func__);		\
			    } while (0)
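
/*
 * Illustrative sketch (not compiled): how a chunk's index within a slab
 * maps to a word and bit in slab->bitmap[].  The same arithmetic is used
 * by memfree() below; chunk_index2bit() is a hypothetical helper.  With
 * 64-bit longs a slab can track up to MAXCHUNKS (64 * 64 = 4096) chunks.
 */
#if 0
static void
chunk_index2bit(slab_t slab, void *ptr, int *bmip, int *bnop)
{
	int bno;

	bno = ((uintptr_t)ptr - (uintptr_t)slab->chunks) / slab->chunk_size;
	*bmip = bno >> LONG_BITS_SHIFT;		/* which bitmap[] word */
	*bnop = bno & (LONG_BITS - 1);		/* which bit in that word */
}
#endif
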
/*
 * With this attribute set, do not require a function call for accessing
 * this variable when the code is compiled -fPIC.
 *
 * Must be empty for libc_rtld (similar to __thread)
 */
#if defined(__LIBC_RTLD)
#define TLS_ATTRIBUTE
#else
#define TLS_ATTRIBUTE	__attribute__ ((tls_model ("initial-exec")));
#endif

static __thread struct slglobaldata slglobal TLS_ATTRIBUTE;
static pthread_key_t thread_malloc_key;
static pthread_once_t thread_malloc_once = PTHREAD_ONCE_INIT;
static struct slglobaldata slglobaldepot;

static int opt_madvise = 0;
static int opt_free = 0;
static int opt_cache = 4;
static int opt_utrace = 0;
static int g_malloc_flags = 0;
static int malloc_panic;

#ifdef INVARIANTS
static const int32_t weirdary[16] = {
	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR
};
#endif

static void *memalloc(size_t size, int flags);
static void *memrealloc(void *ptr, size_t size);
static void memfree(void *ptr, int);
static int memalign(void **memptr, size_t alignment, size_t size);
static slab_t slaballoc(int zi, size_t chunking, size_t chunk_size);
static void slabfree(slab_t slab);
static void slabterm(slglobaldata_t slgd, slab_t slab);
static void *_vmem_alloc(int ri, size_t slab_size);
static void _vmem_free(void *ptr, size_t slab_size);
static void _mpanic(const char *ctl, ...) __printflike(1, 2);
#ifndef STANDALONE_DEBUG
static void malloc_init(void) __constructor(101);
#else
static void malloc_init(void) __constructor(101);
#endif


struct nmalloc_utrace {
	void *p;
	size_t s;
	void *r;
};

#define UTRACE(a, b, c)						\
	if (opt_utrace) {					\
		struct nmalloc_utrace ut = {			\
			.p = (a),				\
			.s = (b),				\
			.r = (c)				\
		};						\
		utrace(&ut, sizeof(ut));			\
	}

#ifdef INVARIANTS
/*
 * If enabled any memory allocated without M_ZERO is initialized to -1.
 */
static int use_malloc_pattern;
#endif

static void
malloc_init(void)
{
	const char *p = NULL;

	TAILQ_INIT(&slglobal.full_zones);

	Regions[0].mask = -1;	/* disallow activity in lowest region */

	if (issetugid() == 0)
		p = getenv("MALLOC_OPTIONS");

	for (; p != NULL && *p != '\0'; p++) {
		switch(*p) {
		case 'u':
			opt_utrace = 0;
			break;
		case 'U':
			opt_utrace = 1;
			break;
		case 'h':
			opt_madvise = 0;
			break;
		case 'H':
			opt_madvise = 1;
			break;
		case 'c':
			if (opt_cache > 0)
				--opt_cache;
			break;
		case 'C':
			++opt_cache;
			break;
		case 'f':
			opt_free = 0;
			break;
		case 'F':
			opt_free = 1;
			break;
		case 'z':
			g_malloc_flags = 0;
			break;
		case 'Z':
			g_malloc_flags = SAFLAG_ZERO;
			break;
		default:
			break;
		}
	}

	UTRACE((void *) -1, 0, NULL);
}
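
/*
 * Illustrative sketch (not compiled): decoding the utrace records emitted
 * when the 'U' option is enabled.  The interpretation follows the UTRACE()
 * call sites in this file: malloc-style calls log (NULL, size, result),
 * free() logs (ptr, 0, NULL) and realloc() logs (old, size, new).
 * decode_utrace() is a hypothetical consumer, e.g. for a ktrace dump.
 */
#if 0
static void
decode_utrace(const struct nmalloc_utrace *ut)
{
	if (ut->p == NULL && ut->s > 0)
		printf("malloc(%zu) = %p\n", ut->s, ut->r);
	else if (ut->p != NULL && ut->s == 0)
		printf("free(%p)\n", ut->p);
	else if (ut->p != NULL && ut->s > 0)
		printf("realloc(%p, %zu) = %p\n", ut->p, ut->s, ut->r);
}
#endif
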
/*
 * We have to install a handler for nmalloc thread teardowns when
 * the thread is created.  We cannot delay this because destructors in
 * sophisticated userland programs can call malloc() for the first time
 * during their thread exit.
 *
 * This routine is called directly from pthreads.
 */
static void _nmalloc_thr_init_once(void);
static void _nmalloc_thr_destructor(void *thrp);

void
_nmalloc_thr_init(void)
{
	static int did_init;

	TAILQ_INIT(&slglobal.full_zones);

	if (slglobal.masked)
		return;

	slglobal.masked = 1;
	if (did_init == 0) {
		did_init = 1;
		pthread_once(&thread_malloc_once, _nmalloc_thr_init_once);
	}
	pthread_setspecific(thread_malloc_key, &slglobal);
	slglobal.masked = 0;
}

void
_nmalloc_thr_prepfork(void)
{
	if (__isthreaded)
		_SPINLOCK(&slglobaldepot.lock);
}

void
_nmalloc_thr_parentfork(void)
{
	if (__isthreaded)
		_SPINUNLOCK(&slglobaldepot.lock);
}

void
_nmalloc_thr_childfork(void)
{
	if (__isthreaded)
		_SPINUNLOCK(&slglobaldepot.lock);
}

/*
 * Called just once
 */
static void
_nmalloc_thr_init_once(void)
{
	/* ignore error from stub if not threaded */
	pthread_key_create(&thread_malloc_key, _nmalloc_thr_destructor);
}

/*
 * Called for each thread undergoing exit
 *
 * Move all of the thread's slabs into a depot.
 */
static void
_nmalloc_thr_destructor(void *thrp)
{
	slglobaldata_t slgd = thrp;
	struct zoneinfo *zinfo;
	slab_t slab;
	void *ptr;
	int i;
	int j;

	slgd->masked = 1;

	for (i = 0; i <= slgd->biggest_index; i++) {
		zinfo = &slgd->zone[i];

		while ((j = zinfo->mag_index) > 0) {
			--j;
			ptr = zinfo->mag_shortcut[j];
			zinfo->mag_shortcut[j] = NULL;	/* SAFETY */
			zinfo->mag_index = j;
			memfree(ptr, 0);
		}

		while ((slab = zinfo->empty_base) != NULL) {
			zinfo->empty_base = slab->next;
			--zinfo->empty_count;
			slabterm(slgd, slab);
		}

		while ((slab = zinfo->avail_base) != NULL) {
			zinfo->avail_base = slab->next;
			--zinfo->avail_count;
			slabterm(slgd, slab);
		}

		while ((slab = TAILQ_FIRST(&slgd->full_zones)) != NULL) {
			TAILQ_REMOVE(&slgd->full_zones, slab, entry);
			slabterm(slgd, slab);
		}
	}
}

/*
 * Calculate the zone index for the allocation request size and set the
 * allocation request size to that particular zone's chunk size.
 *
 * Minimum alignment is 16 bytes for allocations >= 16 bytes to conform
 * with malloc requirements for intel/amd.
 */
static __inline int
zoneindex(size_t *bytes, size_t *chunking)
{
	size_t n = (size_t)*bytes;
	size_t x;
	size_t c;
	int i;

	if (n < 128) {
		if (n < 16) {
			*bytes = n = (n + 7) & ~7;
			*chunking = 8;
			return(n / 8 - 1);	/* 8 byte chunks, 2 zones */
		} else {
			*bytes = n = (n + 15) & ~15;
			*chunking = 16;
			return(n / 16 + 2);	/* 16 byte chunks, 8 zones */
		}
	}
	if (n < 4096) {
		x = 256;
		c = x / (CHUNKFACTOR * 2);
		i = 16;
	} else {
		x = 8192;
		c = x / (CHUNKFACTOR * 2);
		i = 16 + CHUNKFACTOR * 5;  /* 256->512,1024,2048,4096,8192 */
	}
	while (n >= x) {
		x <<= 1;
		c <<= 1;
		i += CHUNKFACTOR;
		if (x == 0)
			_mpanic("slaballoc: byte value too high");
	}
	*bytes = n = roundup2(n, c);
	*chunking = c;
	return (i + n / c - CHUNKFACTOR);
#if 0
	*bytes = n = (n + c - 1) & ~(c - 1);
	*chunking = c;
	return (n / c + i);

	if (n < 256) {
		*bytes = n = (n + 15) & ~15;
		*chunking = 16;
		return(n / (CHUNKINGLO*2) + CHUNKINGLO*1 - 1);
	}
	if (n < 8192) {
		if (n < 512) {
			*bytes = n = (n + 31) & ~31;
			*chunking = 32;
			return(n / (CHUNKINGLO*4) + CHUNKINGLO*2 - 1);
		}
		if (n < 1024) {
			*bytes = n = (n + 63) & ~63;
			*chunking = 64;
			return(n / (CHUNKINGLO*8) + CHUNKINGLO*3 - 1);
		}
		if (n < 2048) {
			*bytes = n = (n + 127) & ~127;
			*chunking = 128;
			return(n / (CHUNKINGLO*16) + CHUNKINGLO*4 - 1);
		}
		if (n < 4096) {
			*bytes = n = (n + 255) & ~255;
			*chunking = 256;
			return(n / (CHUNKINGLO*32) + CHUNKINGLO*5 - 1);
		}
		*bytes = n = (n + 511) & ~511;
		*chunking = 512;
		return(n / (CHUNKINGLO*64) + CHUNKINGLO*6 - 1);
	}
	if (n < 16384) {
		*bytes = n = (n + 1023) & ~1023;
		*chunking = 1024;
		return(n / (CHUNKINGLO*128) + CHUNKINGLO*7 - 1);
	}
	if (n < 32768) {				/* 16384-32767 */
		*bytes = n = (n + 2047) & ~2047;
		*chunking = 2048;
		return(n / (CHUNKINGLO*256) + CHUNKINGLO*8 - 1);
	}
	if (n < 65536) {
		*bytes = n = (n + 4095) & ~4095;	/* 32768-65535 */
		*chunking = 4096;
		return(n / (CHUNKINGLO*512) + CHUNKINGLO*9 - 1);
	}

	x = 131072;
	c = 8192;
	i = CHUNKINGLO*10 - 1;

	while (n >= x) {
		x <<= 1;
		c <<= 1;
		i += CHUNKINGHI;
		if (x == 0)
			_mpanic("slaballoc: byte value too high");
	}
	*bytes = n = (n + c - 1) & ~(c - 1);
	*chunking = c;
	return (n / c + i);
#endif
}
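
/*
 * Illustrative sketch (not compiled): a couple of worked zoneindex()
 * results with CHUNKFACTOR 8 as defined above.  A 100 byte request is
 * rounded up to 112 (16 byte chunking) and a 1000 byte request is
 * rounded up to 1024 (64 byte chunking).  zoneindex_example() is a
 * hypothetical self-check, not part of the allocator.
 */
#if 0
static void
zoneindex_example(void)
{
	size_t bytes;
	size_t chunking;
	int zi;

	bytes = 100;
	zi = zoneindex(&bytes, &chunking);
	MASSERT(bytes == 112 && chunking == 16 && zi == 9);

	bytes = 1000;
	zi = zoneindex(&bytes, &chunking);
	MASSERT(bytes == 1024 && chunking == 64 && zi == 40);
}
#endif
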
/*
 * malloc() - call internal slab allocator
 */
void *
__malloc(size_t size)
{
	void *ptr;

	ptr = memalloc(size, 0);
	if (ptr == NULL)
		errno = ENOMEM;
	else
		UTRACE(0, size, ptr);
	return(ptr);
}

/*
 * calloc() - call internal slab allocator
 */
void *
__calloc(size_t number, size_t size)
{
	void *ptr;

	ptr = memalloc(number * size, SAFLAG_ZERO);
	if (ptr == NULL)
		errno = ENOMEM;
	else
		UTRACE(0, number * size, ptr);
	return(ptr);
}

/*
 * realloc() (SLAB ALLOCATOR)
 *
 * We do not attempt to optimize this routine beyond reusing the same
 * pointer if the new size fits within the chunking of the old pointer's
 * zone.
 */
void *
__realloc(void *ptr, size_t size)
{
	void *ret;

	if (ptr == NULL)
		ret = memalloc(size, 0);
	else
		ret = memrealloc(ptr, size);
	if (ret == NULL)
		errno = ENOMEM;
	else
		UTRACE(ptr, size, ret);
	return(ret);
}

/*
 * aligned_alloc()
 *
 * Allocate (size) bytes with an alignment of (alignment).
 */
void *
__aligned_alloc(size_t alignment, size_t size)
{
	void *ptr;
	int rc;

	ptr = NULL;
	rc = memalign(&ptr, alignment, size);
	if (rc)
		errno = rc;

	return (ptr);
}

/*
 * posix_memalign()
 *
 * Allocate (size) bytes with an alignment of (alignment), where (alignment)
 * is a power of 2 >= sizeof(void *).
 */
int
__posix_memalign(void **memptr, size_t alignment, size_t size)
{
	int rc;

	/*
	 * OpenGroup spec issue 6 check
	 */
	if (alignment < sizeof(void *)) {
		*memptr = NULL;
		return(EINVAL);
	}

	rc = memalign(memptr, alignment, size);

	return (rc);
}

/*
 * The slab allocator will allocate on power-of-2 boundaries up to at least
 * PAGE_SIZE.  Otherwise we use the zoneindex mechanic to find a zone
 * matching the requirements.
 */
static int
memalign(void **memptr, size_t alignment, size_t size)
{

	if (alignment < 1) {
		*memptr = NULL;
		return(EINVAL);
	}

	/*
	 * OpenGroup spec issue 6 check
	 */
	if ((alignment | (alignment - 1)) + 1 != (alignment << 1)) {
		*memptr = NULL;
		return(EINVAL);
	}

	/*
	 * XXX for now just find the nearest power of 2 >= size and also
	 * >= alignment and allocate that.
	 */
	while (alignment < size) {
		alignment <<= 1;
		if (alignment == 0)
			_mpanic("posix_memalign: byte value too high");
	}
	*memptr = memalloc(alignment, 0);
	return(*memptr ? 0 : ENOMEM);
}
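
/*
 * Illustrative note (not compiled): the power-of-2 test used in memalign()
 * above.  For a power of 2 such as 16, (16 | 15) + 1 == 32 == (16 << 1),
 * so the test passes.  For a non-power-of-2 such as 24, (24 | 23) + 1 == 32
 * while (24 << 1) == 48, so EINVAL is returned.  is_power_of_2() is a
 * hypothetical restatement of that expression.
 */
#if 0
static int
is_power_of_2(size_t alignment)
{
	return ((alignment | (alignment - 1)) + 1 == (alignment << 1));
}
#endif
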
/*
 * free() (SLAB ALLOCATOR) - do the obvious
 */
void
__free(void *ptr)
{
	if (ptr) {
		UTRACE(ptr, 0, 0);
		memfree(ptr, 0);
	}
}

/*
 * memalloc()	(SLAB ALLOCATOR)
 *
 *	Allocate memory via the slab allocator.
 */
static void *
memalloc(size_t size, int flags)
{
	slglobaldata_t slgd;
	struct zoneinfo *zinfo;
	slab_t slab;
	size_t chunking;
	int bmi;
	int bno;
	u_long *bmp;
	int zi;
#ifdef INVARIANTS
	int i;
#endif
	int j;
	char *obj;

	/*
	 * If 0 bytes is requested we have to return a unique pointer, so
	 * allocate at least one byte.
	 */
	if (size == 0)
		size = 1;

	/* Capture global flags */
	flags |= g_malloc_flags;

	/* Compute allocation zone; zoneindex will panic on excessive sizes */
	zi = zoneindex(&size, &chunking);
	MASSERT(zi < NZONES);
	if (size == 0)
		return(NULL);

	/*
	 * Try magazine shortcut first
	 */
	slgd = &slglobal;
	zinfo = &slgd->zone[zi];

	if ((j = zinfo->mag_index) != 0) {
		zinfo->mag_index = --j;
		obj = zinfo->mag_shortcut[j];
		zinfo->mag_shortcut[j] = NULL;	/* SAFETY */
		if (flags & SAFLAG_ZERO)
			bzero(obj, size);
		return obj;
	}

	/*
	 * Locate a slab with available space.  If no slabs are available,
	 * fall back to the empty list and if we still come up dry allocate
	 * a new slab (which will try the depot first).
	 */
retry:
	if ((slab = zinfo->avail_base) == NULL) {
		if ((slab = zinfo->empty_base) == NULL) {
			/*
			 * Still dry
			 */
			slab = slaballoc(zi, chunking, size);
			if (slab == NULL)
				return(NULL);
			slab->next = zinfo->avail_base;
			zinfo->avail_base = slab;
			++zinfo->avail_count;
			slab->state = AVAIL;
			if (slgd->biggest_index < zi)
				slgd->biggest_index = zi;
			++slgd->nslabs;
		} else {
			/*
			 * Pulled from empty list
			 */
			zinfo->empty_base = slab->next;
			slab->next = zinfo->avail_base;
			zinfo->avail_base = slab;
			++zinfo->avail_count;
			slab->state = AVAIL;
			--zinfo->empty_count;
		}
	}

	/*
	 * Allocate a chunk out of the slab.  HOT PATH
	 *
	 * Only the thread owning the slab can allocate out of it.
	 *
	 * NOTE: The last bit in the bitmap is always marked allocated so
	 *	 we cannot overflow here.
	 */
	bno = slab->free_bit;
	bmi = slab->free_index;
	bmp = &slab->bitmap[bmi];
	if (*bmp & (1LU << bno)) {
		atomic_clear_long(bmp, 1LU << bno);
		obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) * size;
		slab->free_bit = (bno + 1) & (LONG_BITS - 1);
		atomic_add_int(&slab->navail, -1);
		if (flags & SAFLAG_ZERO)
			bzero(obj, size);
		return (obj);
	}

	/*
	 * Allocate a chunk out of a slab.  COLD PATH
	 */
	if (slab->navail == 0) {
		zinfo->avail_base = slab->next;
		--zinfo->avail_count;
		slab->state = FULL;
		TAILQ_INSERT_TAIL(&slgd->full_zones, slab, entry);
		goto retry;
	}

	while (bmi < LONG_BITS) {
		bmp = &slab->bitmap[bmi];
		if (*bmp) {
			bno = bsflong(*bmp);
			atomic_clear_long(bmp, 1LU << bno);
			obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) *
					     size;
			slab->free_index = bmi;
			slab->free_bit = (bno + 1) & (LONG_BITS - 1);
			atomic_add_int(&slab->navail, -1);
			if (flags & SAFLAG_ZERO)
				bzero(obj, size);
			return (obj);
		}
		++bmi;
	}
	bmi = 0;
	while (bmi < LONG_BITS) {
		bmp = &slab->bitmap[bmi];
		if (*bmp) {
			bno = bsflong(*bmp);
			atomic_clear_long(bmp, 1LU << bno);
			obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) *
					     size;
			slab->free_index = bmi;
			slab->free_bit = (bno + 1) & (LONG_BITS - 1);
			atomic_add_int(&slab->navail, -1);
			if (flags & SAFLAG_ZERO)
				bzero(obj, size);
			return (obj);
		}
		++bmi;
	}
	_mpanic("slaballoc: corrupted zone: navail %d", slab->navail);
	/* not reached */
	return NULL;
}
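
/*
 * Illustrative sketch (not compiled): the per-thread magazine shortcut is
 * a simple LIFO of up to NMAGSHORTCUT (16) recently freed chunks per zone.
 * memalloc() above pops from it and memfree() below pushes onto it; the
 * helpers here are hypothetical condensations of that logic.
 */
#if 0
static void *
mag_pop(struct zoneinfo *zinfo)
{
	int j;

	if ((j = zinfo->mag_index) == 0)
		return (NULL);
	zinfo->mag_index = --j;
	return (zinfo->mag_shortcut[j]);
}

static int
mag_push(struct zoneinfo *zinfo, void *ptr)
{
	int j;

	if ((j = zinfo->mag_index) >= NMAGSHORTCUT)
		return (0);		/* full, fall through to the slab */
	zinfo->mag_shortcut[j] = ptr;
	zinfo->mag_index = j + 1;
	return (1);
}
#endif
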
/*
 * Reallocate memory within the chunk
 */
static void *
memrealloc(void *ptr, size_t nsize)
{
	region_t region;
	slab_t slab;
	size_t osize;
	char *obj;
	int flags = 0;

	/*
	 * If 0 bytes is requested we have to return a unique pointer, so
	 * allocate at least one byte.
	 */
	if (nsize == 0)
		nsize = 1;

	/* Capture global flags */
	flags |= g_malloc_flags;

	/*
	 * Locate the zone by looking up the dynamic slab size mask based
	 * on the memory region the allocation resides in.
	 */
	region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK];
	if ((slab = region->slab) == NULL)
		slab = (void *)((uintptr_t)ptr & region->mask);
	MASSERT(slab->magic == ZALLOC_SLAB_MAGIC);
	osize = slab->chunk_size;
	if (nsize <= osize) {
		if (osize < 32 || nsize >= osize / 2) {
			obj = ptr;
			if ((flags & SAFLAG_ZERO) && nsize < osize)
				bzero(obj + nsize, osize - nsize);
			return(obj);
		}
	}

	/*
	 * Otherwise resize the object
	 */
	obj = memalloc(nsize, 0);
	if (obj) {
		if (nsize > osize)
			nsize = osize;
		bcopy(ptr, obj, nsize);
		memfree(ptr, 0);
	}
	return (obj);
}
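
/*
 * Illustrative example: a 1000 byte malloc() is rounded up by zoneindex()
 * to a 1024 byte chunk, so a later realloc() of that pointer to any size
 * in the range 512-1024 satisfies the reuse test in memrealloc() above
 * (nsize <= osize and nsize >= osize / 2, with osize == 1024) and simply
 * returns the same pointer.  Shrinking below 512 bytes or growing past
 * 1024 allocates a new chunk and copies.
 */
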
/*
 * free (SLAB ALLOCATOR)
 *
 * Free a memory block previously allocated by malloc.
 *
 * MPSAFE
 */
static void
memfree(void *ptr, int flags)
{
	region_t region;
	slglobaldata_t slgd;
	slab_t slab;
	slab_t stmp;
	slab_t *slabp;
	int bmi;
	int bno;
	int j;
	u_long *bmp;

	/*
	 * Locate the zone by looking up the dynamic slab size mask based
	 * on the memory region the allocation resides in.
	 *
	 * WARNING!  The slab may be owned by another thread!
	 */
	region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK];
	if ((slab = region->slab) == NULL)
		slab = (void *)((uintptr_t)ptr & region->mask);
	MASSERT(slab != NULL);
	MASSERT(slab->magic == ZALLOC_SLAB_MAGIC);

#ifdef INVARIANTS
	/*
	 * Put weird data into the memory to detect modifications after
	 * freeing, illegal pointer use after freeing (we should fault on
	 * the odd address), and so forth.
	 */
	if (slab->chunk_size < sizeof(weirdary))
		bcopy(weirdary, ptr, slab->chunk_size);
	else
		bcopy(weirdary, ptr, sizeof(weirdary));
#endif
	slgd = &slglobal;

	/*
	 * Use mag_shortcut[] when possible
	 */
	if (slgd->masked == 0 && slab->chunk_size <= NOMSLABSIZE) {
		struct zoneinfo *zinfo;

		zinfo = &slgd->zone[slab->zone_index];
		j = zinfo->mag_index;
		if (j < NMAGSHORTCUT) {
			zinfo->mag_shortcut[j] = ptr;
			zinfo->mag_index = j + 1;
			return;
		}
	}

	/*
	 * Free to slab and increment navail.  We can delay incrementing
	 * navail to prevent the slab from being destroyed out from under
	 * us while we do other optimizations.
	 */
	bno = ((uintptr_t)ptr - (uintptr_t)slab->chunks) / slab->chunk_size;
	bmi = bno >> LONG_BITS_SHIFT;
	bno &= (LONG_BITS - 1);
	bmp = &slab->bitmap[bmi];

	MASSERT(bmi >= 0 && bmi < slab->nmax);
	MASSERT((*bmp & (1LU << bno)) == 0);
	atomic_set_long(bmp, 1LU << bno);

	if (slab->slgd == slgd) {
		/*
		 * We can only do the following if we own the slab.  Note
		 * that navail can be incremented by any thread even if
		 * we own the slab.
		 */
		struct zoneinfo *zinfo;

		atomic_add_int(&slab->navail, 1);
		if (slab->free_index > bmi) {
			slab->free_index = bmi;
			slab->free_bit = bno;
		} else if (slab->free_index == bmi &&
			   slab->free_bit > bno) {
			slab->free_bit = bno;
		}
		zinfo = &slgd->zone[slab->zone_index];

		/*
		 * Freeing an object from a full slab makes it less than
		 * full.  The slab must be moved to the available list.
		 *
		 * If the available list has too many slabs, release some
		 * to the depot.
		 */
		if (slab->state == FULL) {
			TAILQ_REMOVE(&slgd->full_zones, slab, entry);
			slab->state = AVAIL;
			stmp = zinfo->avail_base;
			slab->next = stmp;
			zinfo->avail_base = slab;
			++zinfo->avail_count;
			while (zinfo->avail_count > opt_cache) {
				slab = zinfo->avail_base;
				zinfo->avail_base = slab->next;
				--zinfo->avail_count;
				slabterm(slgd, slab);
			}
			goto done;
		}

		/*
		 * If the slab becomes completely empty dispose of it in
		 * some manner.  By default each thread caches up to 4
		 * empty slabs.  Only small slabs are cached.
		 */
		if (slab->navail == slab->nmax && slab->state == AVAIL) {
			/*
			 * Remove slab from available queue
			 */
			slabp = &zinfo->avail_base;
			while ((stmp = *slabp) != slab)
				slabp = &stmp->next;
			*slabp = slab->next;
			--zinfo->avail_count;

			if (opt_free || opt_cache == 0) {
				/*
				 * If local caching is disabled cache the
				 * slab in the depot (or free it).
				 */
				slabterm(slgd, slab);
			} else if (slab->slab_size > BIGSLABSIZE) {
				/*
				 * We do not try to retain large slabs
				 * in per-thread caches.
				 */
				slabterm(slgd, slab);
			} else if (zinfo->empty_count > opt_cache) {
				/*
				 * We have too many slabs cached, but
				 * instead of freeing this one free
				 * an empty slab that's been idle longer.
				 *
				 * (empty_count does not change)
				 */
				stmp = zinfo->empty_base;
				slab->state = EMPTY;
				slab->next = stmp->next;
				zinfo->empty_base = slab;
				slabterm(slgd, stmp);
			} else {
				/*
				 * Cache the empty slab in our thread local
				 * empty list.
				 */
				++zinfo->empty_count;
				slab->state = EMPTY;
				slab->next = zinfo->empty_base;
				zinfo->empty_base = slab;
			}
		}
	} else if (slab->slgd == NULL && slab->navail + 1 == slab->nmax) {
		slglobaldata_t sldepot;

		/*
		 * If freeing to a slab owned by the global depot, and
		 * the slab becomes completely EMPTY, try to move it to
		 * the correct list.
		 */
		sldepot = &slglobaldepot;
		if (__isthreaded)
			_SPINLOCK(&sldepot->lock);
		if (slab->slgd == NULL && slab->navail + 1 == slab->nmax) {
			struct zoneinfo *zinfo;

			/*
			 * Move the slab to the empty list
			 */
			MASSERT(slab->state == AVAIL);
			atomic_add_int(&slab->navail, 1);
			zinfo = &sldepot->zone[slab->zone_index];
			slabp = &zinfo->avail_base;
			while (slab != *slabp)
				slabp = &(*slabp)->next;
			*slabp = slab->next;
			--zinfo->avail_count;

			/*
			 * Clean out excessive empty entries from the
			 * depot.
			 */
			slab->state = EMPTY;
			slab->next = zinfo->empty_base;
			zinfo->empty_base = slab;
			++zinfo->empty_count;
			while (zinfo->empty_count > opt_cache) {
				slab = zinfo->empty_base;
				zinfo->empty_base = slab->next;
				--zinfo->empty_count;
				slab->state = UNKNOWN;
				if (__isthreaded)
					_SPINUNLOCK(&sldepot->lock);
				slabfree(slab);
				if (__isthreaded)
					_SPINLOCK(&sldepot->lock);
			}
		} else {
			atomic_add_int(&slab->navail, 1);
		}
		if (__isthreaded)
			_SPINUNLOCK(&sldepot->lock);
	} else {
		/*
		 * We can't act on the slab other than by adjusting navail
		 * (and the bitmap which we did in the common code at the
		 * top).
		 */
		atomic_add_int(&slab->navail, 1);
	}
done:
	;
}

/*
 * Allocate a new slab holding objects of size chunk_size.
 */
static slab_t
slaballoc(int zi, size_t chunking, size_t chunk_size)
{
	slglobaldata_t slgd;
	slglobaldata_t sldepot;
	struct zoneinfo *zinfo;
	region_t region;
	void *save;
	slab_t slab;
	size_t slab_desire;
	size_t slab_size;
	size_t region_mask;
	uintptr_t chunk_offset;
	ssize_t maxchunks;
	ssize_t tmpchunks;
	int ispower2;
	int power;
	int ri;
	int rx;
	int nswath;
	int j;

	/*
	 * First look in the depot.  Any given zone in the depot may be
	 * locked by being set to -1.  We have to do this instead of simply
	 * removing the entire chain because removing the entire chain can
	 * cause racing threads to allocate local slabs for large objects,
	 * resulting in a large VSZ.
	 */
	slgd = &slglobal;
	sldepot = &slglobaldepot;
	zinfo = &sldepot->zone[zi];

	if (zinfo->avail_base) {
		if (__isthreaded)
			_SPINLOCK(&sldepot->lock);
		slab = zinfo->avail_base;
		if (slab) {
			MASSERT(slab->slgd == NULL);
			slab->slgd = slgd;
			zinfo->avail_base = slab->next;
			--zinfo->avail_count;
			if (__isthreaded)
				_SPINUNLOCK(&sldepot->lock);
			return slab;
		}
		if (__isthreaded)
			_SPINUNLOCK(&sldepot->lock);
	}

	/*
	 * Nothing in the depot, allocate a new slab by locating or assigning
	 * a region and then using the system virtual memory allocator.
	 */
	slab = NULL;

	/*
	 * Calculate the start of the data chunks relative to the start
	 * of the slab.  If chunk_size is a power of 2 we guarantee
	 * power of 2 alignment.  If it is not we guarantee alignment
	 * to the chunk size.
	 */
	if ((chunk_size ^ (chunk_size - 1)) == (chunk_size << 1) - 1) {
		ispower2 = 1;
		chunk_offset = roundup2(sizeof(*slab), chunk_size);
	} else {
		ispower2 = 0;
		chunk_offset = sizeof(*slab) + chunking - 1;
		chunk_offset -= chunk_offset % chunking;
	}

	/*
	 * Calculate a reasonable number of chunks for the slab.
	 *
	 * Once initialized the MaxChunks[] array can only ever be
	 * reinitialized to the same value.
	 */
	maxchunks = MaxChunks[zi];
	if (maxchunks == 0) {
		/*
		 * First calculate how many chunks would fit in 1/1024
		 * available memory.  This is around 2MB on a 32 bit
		 * system and 128G on a 64-bit (48-bits available) system.
		 */
		maxchunks = (ssize_t)(NREGIONS_SIZE - chunk_offset) /
			    (ssize_t)chunk_size;
		if (maxchunks <= 0)
			maxchunks = 1;

		/*
		 * A slab cannot handle more than MAXCHUNKS chunks, but
		 * limit us to approximately MAXCHUNKS / 2 here because
		 * we may have to expand maxchunks when we calculate the
		 * actual power-of-2 slab.
		 */
		if (maxchunks > MAXCHUNKS / 2)
			maxchunks = MAXCHUNKS / 2;

		/*
		 * Try to limit the slabs to BIGSLABSIZE (~128MB).  Larger
		 * slabs will be created if the allocation does not fit.
		 */
		if (chunk_offset + chunk_size * maxchunks > BIGSLABSIZE) {
			tmpchunks = (ssize_t)(BIGSLABSIZE - chunk_offset) /
				    (ssize_t)chunk_size;
			if (tmpchunks <= 0)
				tmpchunks = 1;
			if (maxchunks > tmpchunks)
				maxchunks = tmpchunks;
		}

		/*
		 * If the slab calculates to greater than 2MB see if we
		 * can cut it down to ~2MB.  This controls VSZ but has
		 * no effect on run-time size or performance.
		 *
		 * This is very important in case you core dump and also
		 * important to reduce unnecessary region allocations.
		 */
		if (chunk_offset + chunk_size * maxchunks > NOMSLABSIZE) {
			tmpchunks = (ssize_t)(NOMSLABSIZE - chunk_offset) /
				    (ssize_t)chunk_size;
			if (tmpchunks < 1)
				tmpchunks = 1;
			if (maxchunks > tmpchunks)
				maxchunks = tmpchunks;
		}

		/*
		 * If the slab calculates to greater than 128K see if we
		 * can cut it down to ~128K while still maintaining a
		 * reasonably large number of chunks in each slab.  This
		 * controls VSZ but has no effect on run-time size or
		 * performance.
		 *
		 * This is very important in case you core dump and also
		 * important to reduce unnecessary region allocations.
		 */
		if (chunk_offset + chunk_size * maxchunks > LITSLABSIZE) {
			tmpchunks = (ssize_t)(LITSLABSIZE - chunk_offset) /
				    (ssize_t)chunk_size;
			if (tmpchunks < 32)
				tmpchunks = 32;
			if (maxchunks > tmpchunks)
				maxchunks = tmpchunks;
		}

		MaxChunks[zi] = maxchunks;
	}
	MASSERT(maxchunks > 0 && maxchunks <= MAXCHUNKS);

	/*
	 * Calculate the actual slab size.  maxchunks will be recalculated
	 * a little later.
	 */
	slab_desire = chunk_offset + chunk_size * maxchunks;
	slab_size = 8 * MAXCHUNKS;
	power = 3 + MAXCHUNKS_BITS;
	while (slab_size < slab_desire) {
		slab_size <<= 1;
		++power;
	}

	/*
	 * Do a quick recalculation based on the actual slab size but not
	 * yet dealing with whether the slab header is in-band or out-of-band.
	 * The purpose here is to see if we can reasonably reduce slab_size
	 * to a power of 4 to allow more chunk sizes to use the same slab
	 * size.
	 */
	if ((power & 1) && slab_size > 32768) {
		maxchunks = (slab_size - chunk_offset) / chunk_size;
		if (maxchunks >= MAXCHUNKS / 8) {
			slab_size >>= 1;
			--power;
		}
	}
	if ((power & 2) && slab_size > 32768 * 4) {
		maxchunks = (slab_size - chunk_offset) / chunk_size;
		if (maxchunks >= MAXCHUNKS / 4) {
			slab_size >>= 2;
			power -= 2;
		}
	}
	/*
	 * This case occurs when the slab_size is larger than 1/1024 available
	 * UVM.
	 */
	nswath = slab_size / NREGIONS_SIZE;
	if (nswath > NREGIONS)
		return (NULL);


	/*
	 * Try to allocate from our current best region for this zi
	 */
	region_mask = ~(slab_size - 1);
	ri = slgd->zone[zi].best_region;
	if (Regions[ri].mask == region_mask) {
		if ((slab = _vmem_alloc(ri, slab_size)) != NULL)
			goto found;
	}

	/*
	 * Try to find an existing region to allocate from.  The normal
	 * case will be for allocations that are less than 1/1024 available
	 * UVM, which fit into a single Regions[] entry.
	 */
	while (slab_size <= NREGIONS_SIZE) {
		rx = -1;
		for (ri = 0; ri < NREGIONS; ++ri) {
			if (rx < 0 && Regions[ri].mask == 0)
				rx = ri;
			if (Regions[ri].mask == region_mask) {
				slab = _vmem_alloc(ri, slab_size);
				if (slab) {
					slgd->zone[zi].best_region = ri;
					goto found;
				}
			}
		}

		if (rx < 0)
			return(NULL);

		/*
		 * This can fail, retry either way
		 */
		atomic_cmpset_ptr((void **)&Regions[rx].mask,
				  NULL,
				  (void *)region_mask);
	}

	for (;;) {
		rx = -1;
		for (ri = 0; ri < NREGIONS; ri += nswath) {
			if (Regions[ri].mask == region_mask) {
				slab = _vmem_alloc(ri, slab_size);
				if (slab) {
					slgd->zone[zi].best_region = ri;
					goto found;
				}
			}
			if (rx < 0) {
				for (j = nswath - 1; j >= 0; --j) {
					if (Regions[ri+j].mask != 0)
						break;
				}
				if (j < 0)
					rx = ri;
			}
		}

		/*
		 * We found a candidate, try to allocate it backwards so
		 * another thread racing a slaballoc() does not see the
		 * mask in the base index position until we are done.
		 *
		 * We can safely zero-out any partial allocations because
		 * the mask is only accessed from the base index.  Any other
		 * threads racing us will fail prior to us clearing the mask.
		 */
		if (rx < 0)
			return(NULL);
		for (j = nswath - 1; j >= 0; --j) {
			if (!atomic_cmpset_ptr((void **)&Regions[rx+j].mask,
					       NULL, (void *)region_mask)) {
				while (++j < nswath)
					Regions[rx+j].mask = 0;
				break;
			}
		}
		/* retry */
	}

	/*
	 * Fill in the new slab in region ri.  If the slab_size completely
	 * fills one or more region slots we move the slab structure out of
	 * band which should optimize the chunking (particularly for a power
	 * of 2).
	 */
found:
	region = &Regions[ri];
	MASSERT(region->slab == NULL);
	if (slab_size >= NREGIONS_SIZE) {
		save = slab;
		slab = memalloc(sizeof(*slab), 0);
		bzero(slab, sizeof(*slab));
		slab->chunks = save;
		for (j = 0; j < nswath; ++j)
			region[j].slab = slab;
		chunk_offset = 0;
	} else {
		bzero(slab, sizeof(*slab));
		slab->chunks = (char *)slab + chunk_offset;
	}

	/*
	 * Calculate the start of the chunks memory and recalculate the
	 * actual number of chunks the slab can hold.
	 */
	maxchunks = (slab_size - chunk_offset) / chunk_size;
	if (maxchunks > MAXCHUNKS)
		maxchunks = MAXCHUNKS;

	/*
	 * And fill in the rest
	 */
	slab->magic = ZALLOC_SLAB_MAGIC;
	slab->navail = maxchunks;
	slab->nmax = maxchunks;
	slab->slab_size = slab_size;
	slab->chunk_size = chunk_size;
	slab->zone_index = zi;
	slab->slgd = slgd;
	slab->state = UNKNOWN;
	slab->region = region;

	for (ri = 0; ri < maxchunks; ri += LONG_BITS) {
		if (ri + LONG_BITS <= maxchunks)
			slab->bitmap[ri >> LONG_BITS_SHIFT] = ULONG_MAX;
		else
			slab->bitmap[ri >> LONG_BITS_SHIFT] =
				(1LU << (maxchunks - ri)) - 1;
	}
	return (slab);
}
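
/*
 * Illustrative example: with 64-bit longs and maxchunks = 100 the bitmap
 * initialization loop at the end of slaballoc() above sets bitmap[0] =
 * ULONG_MAX (chunks 0-63 free) and bitmap[1] = (1LU << 36) - 1 (chunks
 * 64-99 free).  The remaining bits stay zero, so the unused tail of the
 * bitmap can never be handed out.
 */
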
/*
 * Free a slab.
 */
static void
slabfree(slab_t slab)
{
	int nswath;
	int j;

	if (slab->region->slab == slab) {
		/*
		 * Out-of-band slab.
		 */
		nswath = slab->slab_size / NREGIONS_SIZE;
		for (j = 0; j < nswath; ++j)
			slab->region[j].slab = NULL;
		slab->magic = 0;
		_vmem_free(slab->chunks, slab->slab_size);
		memfree(slab, 0);
	} else {
		/*
		 * In-band slab.
		 */
		slab->magic = 0;
		_vmem_free(slab, slab->slab_size);
	}
}

/*
 * Terminate a slab's use in the current thread.  The slab may still have
 * outstanding allocations and thus not be deallocatable.
 */
static void
slabterm(slglobaldata_t slgd, slab_t slab)
{
	slglobaldata_t sldepot;
	struct zoneinfo *zinfo;
	int zi = slab->zone_index;

	slab->slgd = NULL;
	--slgd->nslabs;
	sldepot = &slglobaldepot;
	zinfo = &sldepot->zone[zi];

	/*
	 * Move the slab to the avail list or the empty list.
	 */
	if (__isthreaded)
		_SPINLOCK(&sldepot->lock);
	if (slab->navail == slab->nmax) {
		slab->state = EMPTY;
		slab->next = zinfo->empty_base;
		zinfo->empty_base = slab;
		++zinfo->empty_count;
	} else {
		slab->state = AVAIL;
		slab->next = zinfo->avail_base;
		zinfo->avail_base = slab;
		++zinfo->avail_count;
	}

	/*
	 * Clean extra slabs out of the empty list
	 */
	while (zinfo->empty_count > opt_cache) {
		slab = zinfo->empty_base;
		zinfo->empty_base = slab->next;
		--zinfo->empty_count;
		slab->state = UNKNOWN;
		if (__isthreaded)
			_SPINUNLOCK(&sldepot->lock);
		slabfree(slab);
		if (__isthreaded)
			_SPINLOCK(&sldepot->lock);
	}
	if (__isthreaded)
		_SPINUNLOCK(&sldepot->lock);
}

/*
 * _vmem_alloc()
 *
 *	Directly map memory in PAGE_SIZE'd chunks with the specified
 *	alignment.
 *
 *	Alignment must be a multiple of PAGE_SIZE.
 *
 *	Size must be >= alignment.
 */
static void *
_vmem_alloc(int ri, size_t slab_size)
{
	char *baddr = (void *)((uintptr_t)ri << NREGIONS_SHIFT);
	char *eaddr;
	char *addr;
	char *save;
	uintptr_t excess;

	if (slab_size < NREGIONS_SIZE)
		eaddr = baddr + NREGIONS_SIZE;
	else
		eaddr = baddr + slab_size;

	/*
	 * This usually just works but might not.
	 */
	addr = mmap(baddr, slab_size, PROT_READ|PROT_WRITE,
		    MAP_PRIVATE | MAP_ANON | MAP_SIZEALIGN, -1, 0);
	if (addr == MAP_FAILED) {
		return (NULL);
	}
	if (addr < baddr || addr + slab_size > eaddr) {
		munmap(addr, slab_size);
		return (NULL);
	}

	/*
	 * Check alignment.  The misaligned offset is also the excess
	 * amount.  If misaligned unmap the excess so we have a chance of
	 * mapping at the next alignment point and recursively try again.
	 *
	 *   BBBBBBBBBBB BBBBBBBBBBB BBBBBBBBBBB	block alignment
	 *     aaaaaaaaa aaaaaaaaaaa aa			mis-aligned allocation
	 *     xxxxxxxxx				final excess calculation
	 *     ^ returned address
	 */
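	/*
	 * Illustrative example: with a 128KB slab_size, if mmap() returns
	 * an address misaligned by 0x18000 then excess becomes
	 * 0x20000 - 0x18000 = 0x8000.  The 0x18000 bytes of the mapping at
	 * and above the next 128KB boundary are unmapped first, the
	 * allocation is retried (now likely landing on that boundary), and
	 * the remaining 0x8000 byte fragment below the boundary is then
	 * released.
	 */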
	excess = (uintptr_t)addr & (slab_size - 1);
	while (excess) {
		excess = slab_size - excess;
		save = addr;

		munmap(save + excess, slab_size - excess);
		addr = _vmem_alloc(ri, slab_size);
		munmap(save, excess);
		if (addr == NULL)
			return (NULL);
		if (addr < baddr || addr + slab_size > eaddr) {
			munmap(addr, slab_size);
			return (NULL);
		}
		excess = (uintptr_t)addr & (slab_size - 1);
	}
	return (addr);
}

/*
 * _vmem_free()
 *
 *	Free a chunk of memory allocated with _vmem_alloc()
 */
static void
_vmem_free(void *ptr, size_t size)
{
	munmap(ptr, size);
}

/*
 * Panic on fatal conditions
 */
static void
_mpanic(const char *ctl, ...)
{
	va_list va;

	if (malloc_panic == 0) {
		malloc_panic = 1;
		va_start(va, ctl);
		vfprintf(stderr, ctl, va);
		fprintf(stderr, "\n");
		fflush(stderr);
		va_end(va);
	}
	abort();
}

__weak_reference(__aligned_alloc, aligned_alloc);
__weak_reference(__malloc, malloc);
__weak_reference(__calloc, calloc);
__weak_reference(__posix_memalign, posix_memalign);
__weak_reference(__realloc, realloc);
__weak_reference(__free, free);