/*
 * KERN_KMALLOC.C	- Kernel memory allocator
 *
 * Copyright (c) 2021 The DragonFly Project, All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This module implements the kmalloc_obj allocator.  This is a type-stable
 * allocator that uses the same base structures (e.g. malloc_type) plus
 * some extensions to efficiently implement single-type zones.
 *
 * All memory management is zone based.  When a zone is destroyed, all of
 * its memory is returned to the system with no fragmentation.
 *
 * A mini-slab allocator hangs directly off the zone structure (malloc_type).
 * Since the object zones are single-size-only, the slab allocator is very
 * simple and currently utilizes just two per-zone/per-cpu slabs (active and
 * alternate) before kicking up to the per-zone cache.  Beyond that we just
 * have the per-cpu globaldata-based 'free slab' cache to avoid unnecessary
 * kernel_map mappings and unmappings.
 *
 * The advantage of this is that zones don't stomp over each other and cause
 * excessive fragmentation in the slabs.  For example, when you umount a
 * large tmpfs filesystem, most of its memory (all of its kmalloc_obj memory)
 * is returned to the system.
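 *
 * Illustrative call pattern (a hedged sketch, not code taken from this
 * file): a consumer declares a single-size object zone elsewhere (a
 * malloc_type with KSF_OBJSIZE set and ks_objsize equal to the object
 * size) and then allocates and frees fixed-size objects against it.
 * 'struct myobj' and 'M_MYZONE' below are hypothetical names used only
 * for illustration; any consumer-facing wrapper macros are outside the
 * scope of this file.
 *
 *	struct myobj *obj;
 *
 *	obj = _kmalloc_obj(sizeof(struct myobj), M_MYZONE, M_WAITOK);
 *	...
 *	_kfree_obj(obj, M_MYZONE);
 *
 * The size passed to _kmalloc_obj() must always equal the zone's
 * ks_objsize; the allocator asserts this below.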
 */

#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/slaballoc.h>
#include <sys/mbuf.h>
#include <sys/vmmeter.h>
#include <sys/spinlock.h>
#include <sys/lock.h>
#include <sys/thread.h>
#include <sys/globaldata.h>
#include <sys/sysctl.h>
#include <sys/ktr.h>
#include <sys/malloc.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>

#include <machine/cpu.h>

#include <sys/spinlock2.h>
#include <sys/thread2.h>
#include <vm/vm_page2.h>

#define MEMORY_STRING	"ptr=%p type=%p size=%lu flags=%04x"
#define MEMORY_ARGS	void *ptr, void *type, unsigned long size, int flags

#if !defined(KTR_MEMORY)
#define KTR_MEMORY	KTR_ALL
#endif
KTR_INFO_MASTER(mem_obj);
KTR_INFO(KTR_MEMORY, mem_obj, malloc_beg, 0, "kmalloc_obj begin");
KTR_INFO(KTR_MEMORY, mem_obj, malloc_end, 1, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_zero, 2, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_ovsz, 3, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_ovsz_delayed, 4, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_chunk, 5, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_request, 6, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_rem_beg, 7, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_rem_end, 8, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_beg, 9, "kfree_obj begin");
KTR_INFO(KTR_MEMORY, mem_obj, free_end, 10, "kfree_obj end");

#define logmemory(name, ptr, type, size, flags)			\
	KTR_LOG(mem_obj_ ## name, ptr, type, size, flags)
#define logmemory_quick(name)					\
	KTR_LOG(mem_obj_ ## name)

__read_frequently static int KMGDMaxFreeSlabs = KMGD_MAXFREESLABS;
SYSCTL_INT(_kern, OID_AUTO, kzone_cache, CTLFLAG_RW, &KMGDMaxFreeSlabs, 0, "");
__read_frequently static int kzone_debug;
SYSCTL_INT(_kern, OID_AUTO, kzone_debug, CTLFLAG_RW, &kzone_debug, 0, "");

__read_frequently struct kmalloc_slab kslab_dummy;

static void malloc_slab_destroy(struct malloc_type *type,
			struct kmalloc_slab **slabp);

/*
 * Cache a chain of slabs onto their respective cpu slab caches.  Any slabs
 * which we cannot cache will be returned.
 *
 * free_slabs	     - Current structure may only be accessed by current cpu
 * remote_free_slabs - Only atomic swap operations are allowed.
 * free_count	     - Only atomic operations are allowed.
 *
 * If the count is sufficient to cache the entire list, NULL is returned.
 * Otherwise the portion that was not cached is returned.
 */
static __noinline
struct kmalloc_slab *
gslab_cache(struct kmalloc_slab *slab)
{
	struct kmalloc_slab *save;
	struct kmalloc_slab *next;
	struct kmalloc_slab *res;
	struct kmalloc_slab **resp;
	struct kmalloc_slab **slabp;
	globaldata_t rgd;
	size_t count;
	int cpuid;

	res = NULL;
	resp = &res;
	KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);

	/*
	 * Given the slab list, get the cpuid and clip off as many matching
	 * elements as fit in the cache.
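	 *
	 * For example (illustration only): given a chain of three slabs
	 * originally owned by cpu0, cpu0, and cpu1, the leading cpu0 run
	 * is prepended to cpu0's remote_free_slabs (space permitting) and
	 * the outer loop then continues with the cpu1 slab.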
	 */
	while (slab) {
		cpuid = slab->orig_cpuid;
		rgd = globaldata_find(cpuid);

		KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
		/*
		 * Doesn't fit in cache, put on return list.
		 */
		if (rgd->gd_kmslab.free_count >= KMGDMaxFreeSlabs) {
			*resp = slab;
			resp = &slab->next;
			slab = slab->next;
			continue;
		}

		/*
		 * Collect.  We aren't required to match up the original cpu
		 * with the disposal cpu, but it's a good idea to retain
		 * memory locality.
		 *
		 * The slabs we collect are going into the global cache,
		 * remove the type association.
		 */
		KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
		slabp = &slab->next;
		count = 1;
		slab->type = NULL;

		while ((next = *slabp) != NULL &&
		       next->orig_cpuid == cpuid &&
		       rgd->gd_kmslab.free_count + count < KMGDMaxFreeSlabs)
		{
			KKASSERT(((uintptr_t)next & KMALLOC_SLAB_MASK) == 0);
			next->type = NULL;
			++count;
			slabp = &next->next;
		}

		/*
		 * Safety: unhook before next.  next is not included in the
		 * list starting with slab that is being prepended
		 * to remote_free_slabs.
		 */
		*slabp = NULL;

		/*
		 * Now atomically prepend slab...*slabp to remote_free_slabs.
		 * Pump the count first (it's ok if the actual chain length
		 * races the count update).
		 *
		 * NOTE: In the loop, (save) is updated by fcmpset.
		 */
		atomic_add_long(&rgd->gd_kmslab.free_count, count);
		save = rgd->gd_kmslab.remote_free_slabs;
		for (;;) {
			KKASSERT(((uintptr_t)save & KMALLOC_SLAB_MASK) == 0);
			*slabp = save;	/* end of slab list chain to... */
			cpu_ccfence();
			if (atomic_fcmpset_ptr(
				&rgd->gd_kmslab.remote_free_slabs,
				&save, slab))
			{
				break;
			}
		}

		/*
		 * Setup for next loop
		 */
		slab = next;
	}

	/*
	 * Terminate the result list and return it
	 */
	*resp = NULL;

	return res;
}

/*
 * May only be called on current cpu.  Pull a free slab from the
 * pcpu cache.  If we run out, move any slabs that have built-up
 * from remote cpus.
 *
 * We are only allowed to swap the remote_free_slabs head, we cannot
 * manipulate any next pointers while structures are sitting on that list.
 */
static __inline
struct kmalloc_slab *
gslab_alloc(globaldata_t gd)
{
	struct kmalloc_slab *slab;

	slab = gd->gd_kmslab.free_slabs;
	if (slab == NULL) {
		slab = atomic_swap_ptr(
			(volatile void **)&gd->gd_kmslab.remote_free_slabs,
			NULL);
		KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
	}
	if (slab) {
		gd->gd_kmslab.free_slabs = slab->next;
		slab->next = NULL;
		atomic_add_long(&gd->gd_kmslab.free_count, -1);
		KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
	}
	return slab;
}

void
malloc_mgt_init(struct malloc_type *type __unused,
		struct kmalloc_mgt *mgt, size_t size)
{
	size_t offset;
	size_t count;

	bzero(mgt, sizeof(*mgt));
	spin_init(&mgt->spin, "kmmgt");

	/*
	 * Allows us to avoid a conditional.  The dummy slabs are empty
	 * and have no objects.
	 */
	mgt->active = &kslab_dummy;
	mgt->alternate = &kslab_dummy;
	mgt->empty_tailp = &mgt->empty;

	/*
	 * Figure out the count, taking the size of the fobjs[] array into
	 * account by adding the pointer size to the object size.
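	 *
	 * (Layout being computed: the slab starts with the kmalloc_slab
	 *  header, then the cache-line-aligned fobjs[] pointer array, then
	 *  the objects themselves.  Each object therefore consumes
	 *  (size + sizeof(void *)) bytes of slab space, giving roughly
	 *  (KMALLOC_SLAB_SIZE - header) / (size + sizeof(void *)) objects
	 *  per slab before the alignment fixup below.)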
	 */
	offset = offsetof(struct kmalloc_slab, fobjs[0]);
	offset = __VM_CACHELINE_ALIGN(offset);
	count = (KMALLOC_SLAB_SIZE - offset) / (size + sizeof(void *));

	/*
	 * However, the fobj[] array itself must be aligned, so we might
	 * have to reduce the count by 1.  (We can do this because 'size'
	 * is already aligned as well).
	 */
	offset = offsetof(struct kmalloc_slab, fobjs[count]);
	offset = __VM_CACHELINE_ALIGN(offset);

	if (offset + size * count > KMALLOC_SLAB_SIZE) {
		--count;
		offset = offsetof(struct kmalloc_slab, fobjs[count]);
		offset = __VM_CACHELINE_ALIGN(offset);
		KKASSERT(offset + size * count <= KMALLOC_SLAB_SIZE);
	}

	mgt->slab_offset = offset;
	mgt->slab_count = count;
}

void
malloc_mgt_relocate(struct kmalloc_mgt *src, struct kmalloc_mgt *dst)
{
	struct kmalloc_slab **slabp;

	spin_init(&dst->spin, "kmmgt");
	slabp = &dst->empty;

	while (*slabp) {
		slabp = &(*slabp)->next;
	}
	dst->empty_tailp = slabp;
}

void
malloc_mgt_uninit(struct malloc_type *type, struct kmalloc_mgt *mgt)
{
	if (mgt->active != &kslab_dummy)
		malloc_slab_destroy(type, &mgt->active);
	mgt->active = NULL;

	if (mgt->alternate != &kslab_dummy)
		malloc_slab_destroy(type, &mgt->alternate);
	mgt->alternate = NULL;

	malloc_slab_destroy(type, &mgt->partial);
	malloc_slab_destroy(type, &mgt->full);
	malloc_slab_destroy(type, &mgt->empty);
	mgt->npartial = 0;
	mgt->nfull = 0;
	mgt->nempty = 0;
	mgt->empty_tailp = &mgt->empty;

	spin_uninit(&mgt->spin);
}

/*
 * Destroy a list of slabs.  Attempt to cache the slabs on their original
 * (possibly remote) cpu.  This allows slabs that were operating on a
 * particular cpu to be disposed of back to that same cpu.
 */
static void
malloc_slab_destroy(struct malloc_type *type, struct kmalloc_slab **slabp)
{
	struct kmalloc_slab *slab;
	struct kmalloc_slab *base;
	struct kmalloc_slab **basep;
	size_t delta;

	if (*slabp == NULL)
		return;

	/*
	 * Collect all slabs that can actually be destroyed, complain
	 * about the rest.
	 */
	base = NULL;
	basep = &base;
	while ((slab = *slabp) != NULL) {
		KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);

		delta = slab->findex - slab->aindex;
		if (delta == slab->ncount) {
			*slabp = slab->next;	/* unlink */
			*basep = slab;		/* link into base list */
			basep = &slab->next;
		} else {
			kprintf("%s: slab %p %zd objects "
				"were still allocated\n",
				type->ks_shortdesc, slab,
				slab->ncount - delta);
			/* leave link intact and iterate */
			slabp = &slab->next;
		}
	}

	/*
	 * Terminate the base list of slabs that can be destroyed,
	 * then cache as many of them as possible.
	 */
	*basep = NULL;
	if (base == NULL)
		return;
	base = gslab_cache(base);

	/*
	 * Destroy the remainder
	 */
	while ((slab = base) != NULL) {
		base = slab->next;
		slab->next = (void *)(uintptr_t)-1;
		kmem_slab_free(slab, KMALLOC_SLAB_SIZE);
	}
}

/*
 * Poll a limited number of slabs on the empty list and move them
 * to the appropriate full or partial list.  Slabs left on the empty
 * list are rotated to the tail.
 *
 * If gcache is non-zero this function will try to place full slabs into
 * the globaldata cache, if it isn't already too full.
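 * Any full slabs which the globaldata cache cannot absorb are handed back
 * by gslab_cache() and are placed on the zone's full list instead.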
 *
 * The mgt is spin-locked
 *
 * Returns non-zero if the ggm updates possibly made slabs available for
 * allocation.
 */
static int
malloc_mgt_poll_empty_locked(struct kmalloc_mgt *ggm, int count, int gcache)
{
	struct kmalloc_slab *marker;
	struct kmalloc_slab *slab;
	size_t delta;
	int got_something;

	if (ggm->empty == NULL)
		return 0;

	got_something = 0;
	marker = ggm->empty;

	while (count-- && (slab = ggm->empty) != NULL) {
		/*
		 * Unlink from empty
		 */
		ggm->empty = slab->next;
		slab->next = NULL;
		--ggm->nempty;
		if (ggm->empty_tailp == &slab->next)
			ggm->empty_tailp = &ggm->empty;

		/*
		 * Check partial, full, and empty.  We rotate
		 * empty entries to the end of the empty list.
		 *
		 * NOTE: For a fully-freeable slab we also have
		 *	 to check xindex.
		 */
		delta = slab->findex - slab->aindex;
		if (delta == slab->ncount) {
			/*
			 * Full and possibly freeable.
			 */
			if (gcache &&
			    ggm->nfull >= KMALLOC_MAXFREEMAGS &&
			    slab->xindex == slab->findex)
			{
				KKASSERT(slab->next == NULL);
				slab = gslab_cache(slab);
			}
			if (slab) {
				KKASSERT(slab->next == NULL);
				slab->next = ggm->full;
				ggm->full = slab;
				got_something = 1;
				++ggm->nfull;
			}
		} else if (delta) {
			/*
			 * Partially full
			 */
			KKASSERT(slab->next == NULL);
			slab->next = ggm->partial;
			ggm->partial = slab;
			got_something = 1;
			++ggm->npartial;
		} else {
			/*
			 * Empty
			 */
			KKASSERT(slab->next == NULL);
			*ggm->empty_tailp = slab;
			ggm->empty_tailp = &slab->next;
			++ggm->nempty;
			if (ggm->empty == marker)
				break;
		}
	}
	return got_something;
}

/*
 * Called once a second with the zone interlocked against destruction.
 *
 * Returns non-zero to tell the caller to iterate to the next type,
 * else the caller should stay on the current type.
 */
int
malloc_mgt_poll(struct malloc_type *type)
{
	struct kmalloc_mgt *ggm;
	struct kmalloc_slab *slab;
	struct kmalloc_slab **slabp;
	struct kmalloc_slab *base;
	struct kmalloc_slab **basep;
	size_t delta;
	int donext;
	int count;
	int retired;

	if ((type->ks_flags & KSF_OBJSIZE) == 0)
		return 1;

	/*
	 * Check the partial, full, and empty lists for full freeable slabs
	 * in excess of desired caching count.
	 */
	ggm = &type->ks_mgt;
	spin_lock(&ggm->spin);

	/*
	 * Move empty slabs to partial or full as appropriate.  We
	 * don't bother checking partial slabs to see if they are full
	 * for now.
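	 *
	 * (The lists are named by how many free objects a slab holds:
	 *  delta = findex - aindex counts free objects, so delta == ncount
	 *  sends a slab to the full list, a non-zero delta sends it to the
	 *  partial list, and a slab with no free objects keeps rotating on
	 *  the empty list.)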
	 */
	malloc_mgt_poll_empty_locked(ggm, 16, 0);

	/*
	 * Ok, clean out some of the full mags from the full list
	 */
	base = NULL;
	basep = &base;
	count = ggm->nfull;
	retired = 0;
	cpu_ccfence();

	if (count > KMALLOC_MAXFREEMAGS) {
		slabp = &ggm->full;
		count -= KMALLOC_MAXFREEMAGS;
		if (count > 16)
			count = 16;

		while (count && (slab = *slabp) != NULL) {
			delta = slab->findex - slab->aindex;
			if (delta == slab->ncount &&
			    slab->xindex == slab->findex)
			{
				*slabp = slab->next;
				*basep = slab;
				basep = &slab->next;
				--ggm->nfull;
				if (++retired == 4)
					break;
			} else {
				slabp = &slab->next;
			}
			--count;
		}
		*basep = NULL;	/* terminate the retirement list */
		donext = (*slabp == NULL);
	} else {
		donext = 1;
	}
	spin_unlock(&ggm->spin);

	/*
	 * Clean out any slabs that we couldn't stow in the globaldata cache.
	 */
	if (retired) {
		if (kzone_debug) {
			kprintf("kmalloc_poll: %s retire %d\n",
				type->ks_shortdesc, retired);
		}
		base = gslab_cache(base);
		while ((slab = base) != NULL) {
			base = base->next;
			slab->next = NULL;
			kmem_slab_free(slab, KMALLOC_SLAB_SIZE);
		}
	}

	return donext;
}

/*
 * Optional bitmap double-free check.  This is typically turned on by
 * default for safety (sys/_malloc.h).
 */
#ifdef KMALLOC_CHECK_DOUBLE_FREE

static __inline void
bmap_set(struct kmalloc_slab *slab, void *obj)
{
	uint64_t *ptr;
	uint64_t mask;
	size_t i = (((uintptr_t)obj & KMALLOC_SLAB_MASK) - slab->offset) /
		   slab->objsize;

	ptr = &slab->bmap[i >> 6];
	mask = (uint64_t)1U << (i & 63);
	KKASSERT(i < slab->ncount && (*ptr & mask) == 0);
	atomic_set_64(ptr, mask);
}

static __inline void
bmap_clr(struct kmalloc_slab *slab, void *obj)
{
	uint64_t *ptr;
	uint64_t mask;
	size_t i = (((uintptr_t)obj & KMALLOC_SLAB_MASK) - slab->offset) /
		   slab->objsize;

	ptr = &slab->bmap[i >> 6];
	mask = (uint64_t)1U << (i & 63);
	KKASSERT(i < slab->ncount && (*ptr & mask) != 0);
	atomic_clear_64(ptr, mask);
}

#endif

/*
 * Cleanup a mgt structure.
 *
 * Always called from the current cpu, so we can manipulate the various
 * lists freely.
 *
 * WARNING: findex can race, fobjs[n] is updated after findex is incremented,
 *	    and 'full'
 */
#if 0
static void
mgt_cleanup(struct kmalloc_mgt *mgt)
{
#if 0
	struct kmalloc_slab **slabp;
	struct kmalloc_slab *slab;
	size_t delta;
	size_t total;
#endif
}
#endif

#ifdef SLAB_DEBUG
void *
_kmalloc_obj_debug(unsigned long size, struct malloc_type *type, int flags,
		   const char *file, int line)
#else
void *
_kmalloc_obj(unsigned long size, struct malloc_type *type, int flags)
#endif
{
	struct kmalloc_slab *slab;
	struct kmalloc_use *use;
	struct kmalloc_mgt *mgt;
	struct kmalloc_mgt *ggm;
	globaldata_t gd;
	void *obj;
	size_t delta;

	/*
	 * Check limits
	 */
	while (__predict_false(type->ks_loosememuse >= type->ks_limit)) {
		long ttl;
		int n;

		for (n = ttl = 0; n < ncpus; ++n)
			ttl += type->ks_use[n].memuse;
		type->ks_loosememuse = ttl;	/* not MP synchronized */
		if ((ssize_t)ttl < 0)		/* deal with occasional race */
			ttl = 0;
		if (ttl >= type->ks_limit) {
			if (flags & M_NULLOK)
				return(NULL);
			panic("%s: malloc limit exceeded", type->ks_shortdesc);
		}
	}

	/*
	 * Setup
	 */
	crit_enter();
	logmemory_quick(malloc_beg);
	KKASSERT(size == type->ks_objsize);
	gd = mycpu;
	use = &type->ks_use[gd->gd_cpuid];

retry:
	/*
	 * Check active
	 *
	 * NOTE: obj can be NULL if racing a _kfree_obj().
	 */
	mgt = &use->mgt;
	slab = mgt->active;			/* Might be dummy */
	delta = slab->findex - slab->aindex;
	if (__predict_true(delta != 0)) {	/* Cannot be dummy */
		size_t i;

		i = slab->aindex % slab->ncount;
		obj = slab->fobjs[i];
		if (__predict_true(obj != NULL)) {
			slab->fobjs[i] = NULL;
			++slab->aindex;
#ifdef KMALLOC_CHECK_DOUBLE_FREE
			bmap_set(slab, obj);
#endif
			goto found;
		}
	}

	/*
	 * Check alternate.  If we find something, swap it with
	 * the active.
	 *
	 * NOTE: It is possible for exhausted slabs to recover entries
	 *	 via _kfree_obj(), so we just keep swapping until both
	 *	 are empty.
	 *
	 * NOTE: obj can be NULL if racing a _kfree_obj().
	 */
	slab = mgt->alternate;			/* Might be dummy */
	delta = slab->findex - slab->aindex;
	if (__predict_true(delta != 0)) {	/* Cannot be dummy */
		size_t i;

		mgt->alternate = mgt->active;
		mgt->active = slab;
		i = slab->aindex % slab->ncount;
		obj = slab->fobjs[i];
		if (__predict_true(obj != NULL)) {
			slab->fobjs[i] = NULL;
			++slab->aindex;
#ifdef KMALLOC_CHECK_DOUBLE_FREE
			bmap_set(slab, obj);
#endif
			goto found;
		}
	}

	/*
	 * Rotate a slab from the global mgt into the pcpu mgt.
	 *
	 *	G(partial, full) -> active -> alternate -> G(empty)
	 *
	 * We try to exhaust partials first to reduce fragmentation, then
	 * dig into the fulls.
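	 *
	 * (We only get here when neither the pcpu active nor the alternate
	 *  slab yielded an object, so the slab displaced to G(empty) has
	 *  nothing allocatable at the moment, modulo a racing _kfree_obj().)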
	 */
	ggm = &type->ks_mgt;
	spin_lock(&ggm->spin);

rerotate:
	if (ggm->partial) {
		slab = mgt->alternate;		/* Might be dummy */
		mgt->alternate = mgt->active;	/* Might be dummy */
		mgt->active = ggm->partial;
		ggm->partial = ggm->partial->next;
		mgt->active->next = NULL;
		--ggm->npartial;
		if (slab != &kslab_dummy) {
			KKASSERT(slab->next == NULL);
			*ggm->empty_tailp = slab;
			ggm->empty_tailp = &slab->next;
			++ggm->nempty;
		}
		spin_unlock(&ggm->spin);
		goto retry;
	}

	if (ggm->full) {
		slab = mgt->alternate;		/* Might be dummy */
		mgt->alternate = mgt->active;	/* Might be dummy */
		mgt->active = ggm->full;
		ggm->full = ggm->full->next;
		mgt->active->next = NULL;
		--ggm->nfull;
		if (slab != &kslab_dummy) {
			KKASSERT(slab->next == NULL);
			*ggm->empty_tailp = slab;
			ggm->empty_tailp = &slab->next;
			++ggm->nempty;
		}
		spin_unlock(&ggm->spin);
		goto retry;
	}

	/*
	 * We couldn't find anything, scan a limited number of empty entries
	 * looking for something with objects.
	 */
	if (malloc_mgt_poll_empty_locked(ggm, 16, 1))
		goto rerotate;

	/*
	 * Absolutely nothing is available, allocate a new slab and
	 * rotate it in.
	 *
	 * Try to get a slab from the global pcpu slab cache.
	 */
	spin_unlock(&ggm->spin);

	if (gd->gd_kmslab.free_count == 0 || (slab = gslab_alloc(gd)) == NULL) {
		slab = kmem_slab_alloc(KMALLOC_SLAB_SIZE, KMALLOC_SLAB_SIZE,
				       M_WAITOK);
	}

	bzero(slab, sizeof(*slab));
	KKASSERT(offsetof(struct kmalloc_slab, fobjs[use->mgt.slab_count]) <=
		 use->mgt.slab_offset);

	obj = (char *)slab + use->mgt.slab_offset;
	slab->type = type;
	slab->orig_cpuid = gd->gd_cpuid;
	slab->ncount = use->mgt.slab_count;
	slab->offset = use->mgt.slab_offset;
	slab->objsize = type->ks_objsize;
	slab->aindex = 0;
	slab->findex = slab->ncount;
	slab->xindex = slab->ncount;
	for (delta = 0; delta < slab->ncount; ++delta) {
		slab->fobjs[delta] = obj;
		obj = (char *)obj + type->ks_objsize;
	}

	/*
	 * Sanity check, assert that the last byte of the last object is
	 * still in the slab.
	 */
#if 0
	KKASSERT(((((uintptr_t)obj - 1) ^ (uintptr_t)slab) &
		  ~KMALLOC_SLAB_MASK) == 0);
#endif
	KASSERT(((((uintptr_t)obj - 1) ^ (uintptr_t)slab) &
		 ~KMALLOC_SLAB_MASK) == 0,
		("SLAB %p ncount %zd objsize %zd obj=%p\n",
		 slab, slab->ncount, slab->objsize, obj));
	slab->magic = KMALLOC_SLAB_MAGIC;
	spin_init(&slab->spin, "kmslb");

	/*
	 * Rotate it in, then retry.
	 *
	 *	(NEW)slab -> active -> alternate -> G(empty)
	 */
	spin_lock(&ggm->spin);
	if (mgt->alternate != &kslab_dummy) {
		struct kmalloc_slab *slab_tmp;

		slab_tmp = mgt->alternate;
		slab_tmp->next = NULL;
		*ggm->empty_tailp = slab_tmp;
		ggm->empty_tailp = &slab_tmp->next;
		++ggm->nempty;
	}
	mgt->alternate = mgt->active;		/* Might be dummy */
	mgt->active = slab;
	spin_unlock(&ggm->spin);

	goto retry;

	/*
	 * Found object, adjust statistics and return
	 */
found:
	++use->inuse;
	++use->calls;
	use->memuse += size;
	use->loosememuse += size;
	if (__predict_false(use->loosememuse >= KMALLOC_LOOSE_SIZE)) {
		/* not MP synchronized */
		type->ks_loosememuse += use->loosememuse;
		use->loosememuse = 0;
	}

	/*
	 * Handle remaining flags.  M_ZERO is typically not set because
	 * the inline macro deals with zeroing for constant sizes.
	 */
	if (__predict_false(flags & M_ZERO))
		bzero(obj, size);

	crit_exit();
	logmemory(malloc_end, NULL, type, size, flags);

	return(obj);
}

/*
 * Free a type-stable object.  We have the base structure and can
 * calculate the slab, but from this direction we don't know which
 * mgt structure or list the slab might be on.
 */
void
_kfree_obj(void *obj, struct malloc_type *type)
{
	struct kmalloc_slab *slab;
	struct kmalloc_use *use;
	globaldata_t gd;
	size_t delta;
	size_t i;

	logmemory_quick(free_beg);
	gd = mycpu;

	/*
	 * Calculate the slab from the pointer
	 */
	slab = (void *)((uintptr_t)obj & ~KMALLOC_SLAB_MASK);
	delta = slab->findex - slab->aindex;
	KKASSERT(slab->magic == KMALLOC_SLAB_MAGIC && delta != slab->ncount);

	/*
	 * We can only safely adjust the statistics for the current cpu.
	 * Don't try to track down the original cpu.  The statistics will
	 * be collected and fixed up by vmstat -m (etc).
	 */
	use = &slab->type->ks_use[gd->gd_cpuid];
	--use->inuse;
	use->memuse -= slab->objsize;

	/*
	 * There MUST be free space in the slab since we are returning
	 * the obj to the same slab it was allocated from.
	 */
	i = atomic_fetchadd_long(&slab->findex, 1);
	i = i % slab->ncount;
	if (slab->fobjs[i] != NULL) {
		kprintf("_kfree_obj failure %zd/%zd/%zd\n",
			slab->aindex, slab->findex, slab->ncount);
	}
#ifdef KMALLOC_CHECK_DOUBLE_FREE
	bmap_clr(slab, obj);
#endif
	KKASSERT(slab->fobjs[i] == NULL);
	slab->fobjs[i] = obj;
	atomic_add_long(&slab->xindex, 1);	/* synchronizer */

	logmemory_quick(free_end);
}