/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <errno.h>
#include <sys/queue.h>

#include <rte_memory.h>
#include <rte_errno.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
#include <rte_launch.h>
#include <rte_per_lcore.h>
#include <rte_lcore.h>
#include <rte_common.h>
#include <rte_string_fns.h>
#include <rte_spinlock.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_atomic.h>
#include <rte_fbarray.h>

#include "eal_internal_cfg.h"
#include "eal_memalloc.h"
#include "eal_memcfg.h"
#include "eal_private.h"
#include "malloc_elem.h"
#include "malloc_heap.h"
#include "malloc_mp.h"

/* start external socket IDs at a very high number */
#define CONST_MAX(a, b) (a > b ? a : b) /* RTE_MAX is not a constant */
#define EXTERNAL_HEAP_MIN_SOCKET_ID (CONST_MAX((1 << 8), RTE_MAX_NUMA_NODES))

static unsigned
check_hugepage_sz(unsigned flags, uint64_t hugepage_sz)
{
	unsigned check_flag = 0;

	if (!(flags & ~RTE_MEMZONE_SIZE_HINT_ONLY))
		return 1;

	switch (hugepage_sz) {
	case RTE_PGSIZE_256K:
		check_flag = RTE_MEMZONE_256KB;
		break;
	case RTE_PGSIZE_2M:
		check_flag = RTE_MEMZONE_2MB;
		break;
	case RTE_PGSIZE_16M:
		check_flag = RTE_MEMZONE_16MB;
		break;
	case RTE_PGSIZE_256M:
		check_flag = RTE_MEMZONE_256MB;
		break;
	case RTE_PGSIZE_512M:
		check_flag = RTE_MEMZONE_512MB;
		break;
	case RTE_PGSIZE_1G:
		check_flag = RTE_MEMZONE_1GB;
		break;
	case RTE_PGSIZE_4G:
		check_flag = RTE_MEMZONE_4GB;
		break;
	case RTE_PGSIZE_16G:
		check_flag = RTE_MEMZONE_16GB;
	}

	return check_flag & flags;
}

int
malloc_socket_to_heap_id(unsigned int socket_id)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int i;

	for (i = 0; i < RTE_MAX_HEAPS; i++) {
		struct malloc_heap *heap = &mcfg->malloc_heaps[i];

		if (heap->socket_id == socket_id)
			return i;
	}
	return -1;
}

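/*
 * Illustrative sketch, kept out of the build: shows how the RTE_MEMZONE_*
 * page-size flags evaluated by check_hugepage_sz() above reach this code
 * through the public memzone API. The memzone name "example_mz" and the
 * sizes are placeholder values for the example only.
 */
#if 0
#include <rte_memzone.h>
#include <rte_memory.h>

static const struct rte_memzone *
example_reserve_2m(int socket_id)
{
	/* prefer 2 MB pages; the SIZE_HINT_ONLY flag makes the page size a
	 * preference rather than a hard requirement, which is exactly the
	 * fallback check_hugepage_sz() callers implement below.
	 */
	return rte_memzone_reserve("example_mz", 16 << 20, socket_id,
			RTE_MEMZONE_2MB | RTE_MEMZONE_SIZE_HINT_ONLY);
}
#endif
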
/*
 * Expand the heap with a memory area.
 */
static struct malloc_elem *
malloc_heap_add_memory(struct malloc_heap *heap, struct rte_memseg_list *msl,
		void *start, size_t len)
{
	struct malloc_elem *elem = start;

	malloc_elem_init(elem, heap, msl, len, elem, len);

	malloc_elem_insert(elem);

	elem = malloc_elem_join_adjacent_free(elem);

	malloc_elem_free_list_insert(elem);

	return elem;
}

static int
malloc_add_seg(const struct rte_memseg_list *msl,
		const struct rte_memseg *ms, size_t len, void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *found_msl;
	struct malloc_heap *heap;
	int msl_idx, heap_idx;

	if (msl->external)
		return 0;

	heap_idx = malloc_socket_to_heap_id(msl->socket_id);
	if (heap_idx < 0) {
		RTE_LOG(ERR, EAL, "Memseg list has invalid socket id\n");
		return -1;
	}
	heap = &mcfg->malloc_heaps[heap_idx];

	/* msl is const, so find it */
	msl_idx = msl - mcfg->memsegs;

	if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS)
		return -1;

	found_msl = &mcfg->memsegs[msl_idx];

	malloc_heap_add_memory(heap, found_msl, ms->addr, len);

	heap->total_size += len;

	RTE_LOG(DEBUG, EAL, "Added %zuM to heap on socket %i\n", len >> 20,
			msl->socket_id);
	return 0;
}

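/*
 * Illustrative sketch, kept out of the build: malloc_add_seg() above is the
 * callback EAL passes to rte_memseg_contig_walk() when seeding the heaps.
 * Application code can inspect the same backing memory with the public walk
 * API; the function names below are placeholders for the example only.
 */
#if 0
#include <stdio.h>
#include <inttypes.h>
#include <rte_common.h>
#include <rte_memory.h>

static int
example_dump_seg(const struct rte_memseg_list *msl,
		const struct rte_memseg *ms, void *arg __rte_unused)
{
	/* print one memseg per callback invocation */
	printf("addr %p len %zu socket %i page_sz %" PRIu64 "\n",
			ms->addr, ms->len, ms->socket_id, msl->page_sz);
	return 0;
}

static void
example_dump_all_segs(void)
{
	rte_memseg_walk(example_dump_seg, NULL);
}
#endif
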
/*
 * Iterates through the freelist for a heap to find a free element
 * which can store data of the required size and with the requested alignment.
 * If size is 0, find the biggest available elem.
 * Returns null on failure, or pointer to element on success.
 */
static struct malloc_elem *
find_suitable_element(struct malloc_heap *heap, size_t size,
		unsigned int flags, size_t align, size_t bound, bool contig)
{
	size_t idx;
	struct malloc_elem *elem, *alt_elem = NULL;

	for (idx = malloc_elem_free_list_index(size);
			idx < RTE_HEAP_NUM_FREELISTS; idx++) {
		for (elem = LIST_FIRST(&heap->free_head[idx]);
				!!elem; elem = LIST_NEXT(elem, free_list)) {
			if (malloc_elem_can_hold(elem, size, align, bound,
					contig)) {
				if (check_hugepage_sz(flags,
						elem->msl->page_sz))
					return elem;
				if (alt_elem == NULL)
					alt_elem = elem;
			}
		}
	}

	if ((alt_elem != NULL) && (flags & RTE_MEMZONE_SIZE_HINT_ONLY))
		return alt_elem;

	return NULL;
}

/*
 * Iterates through the freelist for a heap to find a free element with the
 * biggest size and requested alignment. Will also set size to whatever element
 * size that was found.
 * Returns null on failure, or pointer to element on success.
 */
static struct malloc_elem *
find_biggest_element(struct malloc_heap *heap, size_t *size,
		unsigned int flags, size_t align, bool contig)
{
	struct malloc_elem *elem, *max_elem = NULL;
	size_t idx, max_size = 0;

	for (idx = 0; idx < RTE_HEAP_NUM_FREELISTS; idx++) {
		for (elem = LIST_FIRST(&heap->free_head[idx]);
				!!elem; elem = LIST_NEXT(elem, free_list)) {
			size_t cur_size;
			if ((flags & RTE_MEMZONE_SIZE_HINT_ONLY) == 0 &&
					!check_hugepage_sz(flags,
						elem->msl->page_sz))
				continue;
			if (contig) {
				cur_size =
					malloc_elem_find_max_iova_contig(elem,
							align);
			} else {
				void *data_start = RTE_PTR_ADD(elem,
						MALLOC_ELEM_HEADER_LEN);
				void *data_end = RTE_PTR_ADD(elem, elem->size -
						MALLOC_ELEM_TRAILER_LEN);
				void *aligned = RTE_PTR_ALIGN_CEIL(data_start,
						align);
				/* check if aligned data start is beyond end */
				if (aligned >= data_end)
					continue;
				cur_size = RTE_PTR_DIFF(data_end, aligned);
			}
			if (cur_size > max_size) {
				max_size = cur_size;
				max_elem = elem;
			}
		}
	}

	*size = max_size;
	return max_elem;
}

/*
 * Main function to allocate a block of memory from the heap.
 * It locks the free list, scans it, and adds a new memseg if the
 * scan fails. Once the new memseg is added, it re-scans and should return
 * the new element after releasing the lock.
 */
static void *
heap_alloc(struct malloc_heap *heap, const char *type __rte_unused, size_t size,
		unsigned int flags, size_t align, size_t bound, bool contig)
{
	struct malloc_elem *elem;

	size = RTE_CACHE_LINE_ROUNDUP(size);
	align = RTE_CACHE_LINE_ROUNDUP(align);

	/* roundup might cause an overflow */
	if (size == 0)
		return NULL;
	elem = find_suitable_element(heap, size, flags, align, bound, contig);
	if (elem != NULL) {
		elem = malloc_elem_alloc(elem, size, align, bound, contig);

		/* increase heap's count of allocated elements */
		heap->alloc_count++;
	}

	return elem == NULL ? NULL : (void *)(&elem[1]);
}

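/*
 * Illustrative sketch, kept out of the build: heap_alloc() rounds both size
 * and alignment up to a cache line, so the data area consumed is always a
 * multiple of RTE_CACHE_LINE_SIZE (64 bytes is assumed in the comment below;
 * the element header and trailer overhead come on top of that).
 */
#if 0
#include <stdio.h>
#include <rte_common.h>

static void
example_roundup(void)
{
	size_t requested = 100;
	/* 128 on a target with 64-byte cache lines */
	size_t rounded = RTE_CACHE_LINE_ROUNDUP(requested);

	printf("requested %zu bytes, data area used %zu bytes\n",
			requested, rounded);
}
#endif
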
static void *
heap_alloc_biggest(struct malloc_heap *heap, const char *type __rte_unused,
		unsigned int flags, size_t align, bool contig)
{
	struct malloc_elem *elem;
	size_t size;

	align = RTE_CACHE_LINE_ROUNDUP(align);

	elem = find_biggest_element(heap, &size, flags, align, contig);
	if (elem != NULL) {
		elem = malloc_elem_alloc(elem, size, align, 0, contig);

		/* increase heap's count of allocated elements */
		heap->alloc_count++;
	}

	return elem == NULL ? NULL : (void *)(&elem[1]);
}

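/*
 * Illustrative sketch, kept out of the build: the "biggest element" path is
 * what a zero-length memzone reservation ends up using, i.e. asking for the
 * largest free chunk on a socket. The memzone name below is a placeholder.
 */
#if 0
#include <rte_memzone.h>

static const struct rte_memzone *
example_reserve_biggest(int socket_id)
{
	/* len == 0 requests the largest available contiguous chunk */
	return rte_memzone_reserve("example_biggest_mz", 0, socket_id, 0);
}
#endif
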
/* this function is exposed in malloc_mp.h */
void
rollback_expand_heap(struct rte_memseg **ms, int n_segs,
		struct malloc_elem *elem, void *map_addr, size_t map_len)
{
	if (elem != NULL) {
		malloc_elem_free_list_remove(elem);
		malloc_elem_hide_region(elem, map_addr, map_len);
	}

	eal_memalloc_free_seg_bulk(ms, n_segs);
}

/* this function is exposed in malloc_mp.h */
struct malloc_elem *
alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
		int socket, unsigned int flags, size_t align, size_t bound,
		bool contig, struct rte_memseg **ms, int n_segs)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *msl;
	struct malloc_elem *elem = NULL;
	size_t alloc_sz;
	int allocd_pages;
	void *ret, *map_addr;

	alloc_sz = (size_t)pg_sz * n_segs;

	/* first, check if we're allowed to allocate this memory */
	if (eal_memalloc_mem_alloc_validate(socket,
			heap->total_size + alloc_sz) < 0) {
		RTE_LOG(DEBUG, EAL, "User has disallowed allocation\n");
		return NULL;
	}

	allocd_pages = eal_memalloc_alloc_seg_bulk(ms, n_segs, pg_sz,
			socket, true);

	/* make sure we've allocated our pages... */
	if (allocd_pages < 0)
		return NULL;

	map_addr = ms[0]->addr;
	msl = rte_mem_virt2memseg_list(map_addr);

	/* check if we wanted contiguous memory but didn't get it */
	if (contig && !eal_memalloc_is_contig(msl, map_addr, alloc_sz)) {
		RTE_LOG(DEBUG, EAL, "%s(): couldn't allocate physically contiguous space\n",
				__func__);
		goto fail;
	}

	/*
	 * Once we have all the memseg lists configured, if there is a dma mask
	 * set, check iova addresses are not out of range. Otherwise the device
	 * setting the dma mask could have problems with the mapped memory.
	 *
	 * There are two situations when this can happen:
	 * 1) memory initialization
	 * 2) dynamic memory allocation
	 *
	 * For 1), an error when checking the dma mask means the app cannot be
	 * executed. For 2), it means the new memory cannot be added.
	 */
	if (mcfg->dma_maskbits &&
			rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) {
		/*
		 * Currently this can only happen if IOMMU is enabled
		 * and the address width supported by the IOMMU hw is
		 * not enough for using the memory mapped IOVAs.
		 *
		 * If IOVA is VA, advise trying '--iova-mode pa',
		 * which could solve some situations where IOVA VA is not
		 * really needed.
		 */
		RTE_LOG(ERR, EAL,
			"%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask\n",
			__func__);

		/*
		 * If IOVA is VA and it is possible to run with IOVA PA,
		 * because the user is root, give advice for solving the
		 * problem.
		 */
		if ((rte_eal_iova_mode() == RTE_IOVA_VA) &&
				rte_eal_using_phys_addrs())
			RTE_LOG(ERR, EAL,
				"%s(): Please try initializing EAL with --iova-mode=pa parameter\n",
				__func__);
		goto fail;
	}

	/* add newly minted memsegs to malloc heap */
	elem = malloc_heap_add_memory(heap, msl, map_addr, alloc_sz);

	/* try once more, as now we have allocated new memory */
	ret = find_suitable_element(heap, elt_size, flags, align, bound,
			contig);

	if (ret == NULL)
		goto fail;

	return elem;

fail:
	rollback_expand_heap(ms, n_segs, elem, map_addr, alloc_sz);
	return NULL;
}

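/*
 * Illustrative sketch, kept out of the build: the "User has disallowed
 * allocation" path above is triggered by a failing allocation validator. An
 * application can register one through the public API to cap heap growth on
 * a socket; the callback name "example-limit" and the 1 GB limit on socket 0
 * are example values, and the cur_limit semantics are assumed to match the
 * limit passed at registration time.
 */
#if 0
#include <rte_common.h>
#include <rte_memory.h>

static int
example_alloc_validator(int socket_id __rte_unused, size_t cur_limit,
		size_t new_len)
{
	/* returning -1 rejects the allocation */
	return new_len > cur_limit ? -1 : 0;
}

static int
example_register_validator(void)
{
	return rte_mem_alloc_validator_register("example-limit",
			example_alloc_validator, 0, 1 << 30);
}
#endif
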
static int
try_expand_heap_primary(struct malloc_heap *heap, uint64_t pg_sz,
		size_t elt_size, int socket, unsigned int flags, size_t align,
		size_t bound, bool contig)
{
	struct malloc_elem *elem;
	struct rte_memseg **ms;
	void *map_addr;
	size_t alloc_sz;
	int n_segs;
	bool callback_triggered = false;

	alloc_sz = RTE_ALIGN_CEIL(align + elt_size +
			MALLOC_ELEM_TRAILER_LEN, pg_sz);
	n_segs = alloc_sz / pg_sz;

	/* we can't know in advance how many pages we'll need, so we malloc */
	ms = malloc(sizeof(*ms) * n_segs);
	if (ms == NULL)
		return -1;
	memset(ms, 0, sizeof(*ms) * n_segs);

	elem = alloc_pages_on_heap(heap, pg_sz, elt_size, socket, flags, align,
			bound, contig, ms, n_segs);

	if (elem == NULL)
		goto free_ms;

	map_addr = ms[0]->addr;

	/* notify user about changes in memory map */
	eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, map_addr, alloc_sz);

	/* notify other processes that this has happened */
	if (request_sync()) {
		/* we couldn't ensure all processes have mapped memory,
		 * so free it back and notify everyone that it's been
		 * freed back.
		 *
		 * technically, we could've avoided adding memory addresses to
		 * the map, but that would've led to inconsistent behavior
		 * between primary and secondary processes, as those get
		 * callbacks during sync. therefore, force primary process to
		 * do alloc-and-rollback syncs as well.
		 */
		callback_triggered = true;
		goto free_elem;
	}
	heap->total_size += alloc_sz;

	RTE_LOG(DEBUG, EAL, "Heap on socket %d was expanded by %zdMB\n",
		socket, alloc_sz >> 20ULL);

	free(ms);

	return 0;

free_elem:
	if (callback_triggered)
		eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE,
				map_addr, alloc_sz);

	rollback_expand_heap(ms, n_segs, elem, map_addr, alloc_sz);

	request_sync();
free_ms:
	free(ms);

	return -1;
}

static int
try_expand_heap_secondary(struct malloc_heap *heap, uint64_t pg_sz,
		size_t elt_size, int socket, unsigned int flags, size_t align,
		size_t bound, bool contig)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct malloc_mp_req req;
	int req_result;

	memset(&req, 0, sizeof(req));

	req.t = REQ_TYPE_ALLOC;
	req.alloc_req.align = align;
	req.alloc_req.bound = bound;
	req.alloc_req.contig = contig;
	req.alloc_req.flags = flags;
	req.alloc_req.elt_size = elt_size;
	req.alloc_req.page_sz = pg_sz;
	req.alloc_req.socket = socket;
	req.alloc_req.malloc_heap_idx = heap - mcfg->malloc_heaps;

	req_result = request_to_primary(&req);

	if (req_result != 0)
		return -1;

	if (req.result != REQ_RESULT_SUCCESS)
		return -1;

	return 0;
}

static int
try_expand_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
		int socket, unsigned int flags, size_t align, size_t bound,
		bool contig)
{
	int ret;

	rte_mcfg_mem_write_lock();

	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
		ret = try_expand_heap_primary(heap, pg_sz, elt_size, socket,
				flags, align, bound, contig);
	} else {
		ret = try_expand_heap_secondary(heap, pg_sz, elt_size, socket,
				flags, align, bound, contig);
	}

	rte_mcfg_mem_write_unlock();
	return ret;
}

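/*
 * Illustrative sketch, kept out of the build: the
 * eal_memalloc_mem_event_notify() calls above fan out to callbacks registered
 * through the public memory event API, which is how drivers learn about pages
 * being mapped in or released. An application can listen the same way; the
 * callback name "example-mem-event" is a placeholder.
 */
#if 0
#include <stdio.h>
#include <rte_common.h>
#include <rte_memory.h>

static void
example_mem_event_cb(enum rte_mem_event event_type, const void *addr,
		size_t len, void *arg __rte_unused)
{
	printf("%s: %zu bytes at %p\n",
			event_type == RTE_MEM_EVENT_ALLOC ? "alloc" : "free",
			len, addr);
}

static int
example_register_mem_event_cb(void)
{
	return rte_mem_event_callback_register("example-mem-event",
			example_mem_event_cb, NULL);
}
#endif
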
static int
compare_pagesz(const void *a, const void *b)
{
	const struct rte_memseg_list * const*mpa = a;
	const struct rte_memseg_list * const*mpb = b;
	const struct rte_memseg_list *msla = *mpa;
	const struct rte_memseg_list *mslb = *mpb;
	uint64_t pg_sz_a = msla->page_sz;
	uint64_t pg_sz_b = mslb->page_sz;

	if (pg_sz_a < pg_sz_b)
		return -1;
	if (pg_sz_a > pg_sz_b)
		return 1;
	return 0;
}

static int
alloc_more_mem_on_socket(struct malloc_heap *heap, size_t size, int socket,
		unsigned int flags, size_t align, size_t bound, bool contig)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *requested_msls[RTE_MAX_MEMSEG_LISTS];
	struct rte_memseg_list *other_msls[RTE_MAX_MEMSEG_LISTS];
	uint64_t requested_pg_sz[RTE_MAX_MEMSEG_LISTS];
	uint64_t other_pg_sz[RTE_MAX_MEMSEG_LISTS];
	uint64_t prev_pg_sz;
	int i, n_other_msls, n_other_pg_sz, n_requested_msls, n_requested_pg_sz;
	bool size_hint = (flags & RTE_MEMZONE_SIZE_HINT_ONLY) > 0;
	unsigned int size_flags = flags & ~RTE_MEMZONE_SIZE_HINT_ONLY;
	void *ret;

	memset(requested_msls, 0, sizeof(requested_msls));
	memset(other_msls, 0, sizeof(other_msls));
	memset(requested_pg_sz, 0, sizeof(requested_pg_sz));
	memset(other_pg_sz, 0, sizeof(other_pg_sz));

	/*
	 * go through memseg list and take note of all the page sizes available,
	 * and if any of them were specifically requested by the user.
	 */
	n_requested_msls = 0;
	n_other_msls = 0;
	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
		struct rte_memseg_list *msl = &mcfg->memsegs[i];

		if (msl->socket_id != socket)
			continue;

		if (msl->base_va == NULL)
			continue;

		/* if pages of specific size were requested */
		if (size_flags != 0 && check_hugepage_sz(size_flags,
				msl->page_sz))
			requested_msls[n_requested_msls++] = msl;
		else if (size_flags == 0 || size_hint)
			other_msls[n_other_msls++] = msl;
	}

	/* sort the lists, smallest first */
	qsort(requested_msls, n_requested_msls, sizeof(requested_msls[0]),
			compare_pagesz);
	qsort(other_msls, n_other_msls, sizeof(other_msls[0]),
			compare_pagesz);

	/* now, extract page sizes we are supposed to try */
	prev_pg_sz = 0;
	n_requested_pg_sz = 0;
	for (i = 0; i < n_requested_msls; i++) {
		uint64_t pg_sz = requested_msls[i]->page_sz;

		if (prev_pg_sz != pg_sz) {
			requested_pg_sz[n_requested_pg_sz++] = pg_sz;
			prev_pg_sz = pg_sz;
		}
	}
	prev_pg_sz = 0;
	n_other_pg_sz = 0;
	for (i = 0; i < n_other_msls; i++) {
		uint64_t pg_sz = other_msls[i]->page_sz;

		if (prev_pg_sz != pg_sz) {
			other_pg_sz[n_other_pg_sz++] = pg_sz;
			prev_pg_sz = pg_sz;
		}
	}

	/* finally, try allocating memory of specified page sizes, starting from
	 * the smallest sizes
	 */
	for (i = 0; i < n_requested_pg_sz; i++) {
		uint64_t pg_sz = requested_pg_sz[i];

		/*
		 * do not pass the size hint here, as user expects other page
		 * sizes first, before resorting to best effort allocation.
		 */
		if (!try_expand_heap(heap, pg_sz, size, socket, size_flags,
				align, bound, contig))
			return 0;
	}
	if (n_other_pg_sz == 0)
		return -1;

	/* now, check if we can reserve anything with size hint */
	ret = find_suitable_element(heap, size, flags, align, bound, contig);
	if (ret != NULL)
		return 0;

	/*
	 * we still couldn't reserve memory, so try expanding heap with other
	 * page sizes, if there are any
	 */
	for (i = 0; i < n_other_pg_sz; i++) {
		uint64_t pg_sz = other_pg_sz[i];

		if (!try_expand_heap(heap, pg_sz, size, socket, flags,
				align, bound, contig))
			return 0;
	}
	return -1;
}

/* this will try lower page sizes first */
static void *
malloc_heap_alloc_on_heap_id(const char *type, size_t size,
		unsigned int heap_id, unsigned int flags, size_t align,
		size_t bound, bool contig)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id];
	unsigned int size_flags = flags & ~RTE_MEMZONE_SIZE_HINT_ONLY;
	int socket_id;
	void *ret;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	rte_spinlock_lock(&(heap->lock));

	align = align == 0 ? 1 : align;

	/* for legacy mode, try once and with all flags */
	if (internal_conf->legacy_mem) {
		ret = heap_alloc(heap, type, size, flags, align, bound, contig);
		goto alloc_unlock;
	}

	/*
	 * we do not pass the size hint here, because even if allocation fails,
	 * we may still be able to allocate memory from appropriate page sizes,
	 * we just need to request more memory first.
	 */

	socket_id = rte_socket_id_by_idx(heap_id);
	/*
	 * if socket ID is negative, we cannot find a socket ID for this heap -
	 * which means it's an external heap. those can have unexpected page
	 * sizes, so if the user asked to allocate from there - assume user
	 * knows what they're doing, and allow allocating from there with any
	 * page size flags.
	 */
	if (socket_id < 0)
		size_flags |= RTE_MEMZONE_SIZE_HINT_ONLY;

	ret = heap_alloc(heap, type, size, size_flags, align, bound, contig);
	if (ret != NULL)
		goto alloc_unlock;

	/* if socket ID is invalid, this is an external heap */
	if (socket_id < 0)
		goto alloc_unlock;

	if (!alloc_more_mem_on_socket(heap, size, socket_id, flags, align,
			bound, contig)) {
		ret = heap_alloc(heap, type, size, flags, align, bound, contig);

		/* this should have succeeded */
		if (ret == NULL)
			RTE_LOG(ERR, EAL, "Error allocating from heap\n");
	}
alloc_unlock:
	rte_spinlock_unlock(&(heap->lock));
	return ret;
}

void *
malloc_heap_alloc(const char *type, size_t size, int socket_arg,
		unsigned int flags, size_t align, size_t bound, bool contig)
{
	int socket, heap_id, i;
	void *ret;

	/* return NULL if size is 0 or alignment is not power-of-2 */
	if (size == 0 || (align && !rte_is_power_of_2(align)))
		return NULL;

	if (!rte_eal_has_hugepages() && socket_arg < RTE_MAX_NUMA_NODES)
		socket_arg = SOCKET_ID_ANY;

	if (socket_arg == SOCKET_ID_ANY)
		socket = malloc_get_numa_socket();
	else
		socket = socket_arg;

	/* turn socket ID into heap ID */
	heap_id = malloc_socket_to_heap_id(socket);
	/* if heap id is negative, socket ID was invalid */
	if (heap_id < 0)
		return NULL;

	ret = malloc_heap_alloc_on_heap_id(type, size, heap_id, flags, align,
			bound, contig);
	if (ret != NULL || socket_arg != SOCKET_ID_ANY)
		return ret;

	/* try other heaps. we are only iterating through native DPDK sockets,
	 * so external heaps won't be included.
	 */
	for (i = 0; i < (int) rte_socket_count(); i++) {
		if (i == heap_id)
			continue;
		ret = malloc_heap_alloc_on_heap_id(type, size, i, flags, align,
				bound, contig);
		if (ret != NULL)
			return ret;
	}
	return NULL;
}

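/*
 * Illustrative sketch, kept out of the build: rte_malloc()/rte_malloc_socket()
 * funnel into malloc_heap_alloc() above. With SOCKET_ID_ANY the current
 * lcore's socket is tried first and the loop above then falls back to the
 * other native sockets; with an explicit socket ID there is no fallback. The
 * type string "example" is a placeholder.
 */
#if 0
#include <rte_malloc.h>
#include <rte_memory.h>

static void *
example_alloc_any_socket(size_t size)
{
	/* align of 0 uses the default (cache line) alignment */
	return rte_malloc_socket("example", size, 0, SOCKET_ID_ANY);
}
#endif
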
static void *
heap_alloc_biggest_on_heap_id(const char *type, unsigned int heap_id,
		unsigned int flags, size_t align, bool contig)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id];
	void *ret;

	rte_spinlock_lock(&(heap->lock));

	align = align == 0 ? 1 : align;

	ret = heap_alloc_biggest(heap, type, flags, align, contig);

	rte_spinlock_unlock(&(heap->lock));

	return ret;
}

void *
malloc_heap_alloc_biggest(const char *type, int socket_arg, unsigned int flags,
		size_t align, bool contig)
{
	int socket, i, cur_socket, heap_id;
	void *ret;

	/* return NULL if align is not power-of-2 */
	if ((align && !rte_is_power_of_2(align)))
		return NULL;

	if (!rte_eal_has_hugepages())
		socket_arg = SOCKET_ID_ANY;

	if (socket_arg == SOCKET_ID_ANY)
		socket = malloc_get_numa_socket();
	else
		socket = socket_arg;

	/* turn socket ID into heap ID */
	heap_id = malloc_socket_to_heap_id(socket);
	/* if heap id is negative, socket ID was invalid */
	if (heap_id < 0)
		return NULL;

	ret = heap_alloc_biggest_on_heap_id(type, heap_id, flags, align,
			contig);
	if (ret != NULL || socket_arg != SOCKET_ID_ANY)
		return ret;

	/* try other heaps */
	for (i = 0; i < (int) rte_socket_count(); i++) {
		cur_socket = rte_socket_id_by_idx(i);
		if (cur_socket == socket)
			continue;
		ret = heap_alloc_biggest_on_heap_id(type, i, flags, align,
				contig);
		if (ret != NULL)
			return ret;
	}
	return NULL;
}

/* this function is exposed in malloc_mp.h */
int
malloc_heap_free_pages(void *aligned_start, size_t aligned_len)
{
	int n_segs, seg_idx, max_seg_idx;
	struct rte_memseg_list *msl;
	size_t page_sz;

	msl = rte_mem_virt2memseg_list(aligned_start);
	if (msl == NULL)
		return -1;

	page_sz = (size_t)msl->page_sz;
	n_segs = aligned_len / page_sz;
	seg_idx = RTE_PTR_DIFF(aligned_start, msl->base_va) / page_sz;
	max_seg_idx = seg_idx + n_segs;

	for (; seg_idx < max_seg_idx; seg_idx++) {
		struct rte_memseg *ms;

		ms = rte_fbarray_get(&msl->memseg_arr, seg_idx);
		eal_memalloc_free_seg(ms);
	}
	return 0;
}

int
malloc_heap_free(struct malloc_elem *elem)
{
	struct malloc_heap *heap;
	void *start, *aligned_start, *end, *aligned_end;
	size_t len, aligned_len, page_sz;
	struct rte_memseg_list *msl;
	unsigned int i, n_segs, before_space, after_space;
	int ret;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
		return -1;

	/* elem may be merged with previous element, so keep heap address */
	heap = elem->heap;
	msl = elem->msl;
	page_sz = (size_t)msl->page_sz;

	rte_spinlock_lock(&(heap->lock));

	/* mark element as free */
	elem->state = ELEM_FREE;

	elem = malloc_elem_free(elem);

	/* anything after this is a bonus */
	ret = 0;

	/* ...of which we can't avail if we are in legacy mode, or if this is an
	 * externally allocated segment.
	 */
	if (internal_conf->legacy_mem || (msl->external > 0))
		goto free_unlock;

	/* check if we can free any memory back to the system */
	if (elem->size < page_sz)
		goto free_unlock;

	/* if user requested to match allocations, the sizes must match - if not,
	 * we will defer freeing these hugepages until the entire original allocation
	 * can be freed
	 */
	if (internal_conf->match_allocations && elem->size != elem->orig_size)
		goto free_unlock;

	/* probably, but let's make sure, as we may not be using up a full page */
	start = elem;
	len = elem->size;
	aligned_start = RTE_PTR_ALIGN_CEIL(start, page_sz);
	end = RTE_PTR_ADD(elem, len);
	aligned_end = RTE_PTR_ALIGN_FLOOR(end, page_sz);

	aligned_len = RTE_PTR_DIFF(aligned_end, aligned_start);

	/* can't free anything */
	if (aligned_len < page_sz)
		goto free_unlock;

	/* we can free something. however, some of these pages may be marked as
	 * unfreeable, so check that as well
	 */
	n_segs = aligned_len / page_sz;
	for (i = 0; i < n_segs; i++) {
		const struct rte_memseg *tmp =
				rte_mem_virt2memseg(aligned_start, msl);

		if (tmp->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) {
			/* this is an unfreeable segment, so move start */
			aligned_start = RTE_PTR_ADD(tmp->addr, tmp->len);
		}
	}

	/* recalculate length and number of segments */
	aligned_len = RTE_PTR_DIFF(aligned_end, aligned_start);
	n_segs = aligned_len / page_sz;

	/* check if we can still free some pages */
	if (n_segs == 0)
		goto free_unlock;

	/* We're not done yet. We also have to check if by freeing space we will
	 * be leaving free elements that are too small to store new elements.
	 * Check if we have enough space in the beginning and at the end, or if
	 * start/end are exactly page aligned.
	 */
	before_space = RTE_PTR_DIFF(aligned_start, elem);
	after_space = RTE_PTR_DIFF(end, aligned_end);
	if (before_space != 0 &&
			before_space < MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
		/* There is not enough space before start, but we may be able to
		 * move the start forward by one page.
		 */
		if (n_segs == 1)
			goto free_unlock;

		/* move start */
		aligned_start = RTE_PTR_ADD(aligned_start, page_sz);
		aligned_len -= page_sz;
		n_segs--;
	}
	if (after_space != 0 && after_space <
			MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
		/* There is not enough space after end, but we may be able to
		 * move the end backwards by one page.
		 */
		if (n_segs == 1)
			goto free_unlock;

		/* move end */
		aligned_end = RTE_PTR_SUB(aligned_end, page_sz);
		aligned_len -= page_sz;
		n_segs--;
	}

	/* now we can finally free us some pages */

	rte_mcfg_mem_write_lock();

	/*
	 * we allow secondary processes to clear the heap of this allocated
	 * memory because it is safe to do so, as even if notifications about
	 * unmapped pages don't make it to other processes, the heap is shared
	 * across all processes, and will become empty of this memory anyway,
	 * and nothing can allocate it back unless the primary process is able
	 * to deliver the allocation message to every single running process.
	 */

	malloc_elem_free_list_remove(elem);

	malloc_elem_hide_region(elem, (void *) aligned_start, aligned_len);

	heap->total_size -= aligned_len;

	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
		/* notify user about changes in memory map */
		eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE,
				aligned_start, aligned_len);

		/* don't care if any of this fails */
		malloc_heap_free_pages(aligned_start, aligned_len);

		request_sync();
	} else {
		struct malloc_mp_req req;

		memset(&req, 0, sizeof(req));

		req.t = REQ_TYPE_FREE;
		req.free_req.addr = aligned_start;
		req.free_req.len = aligned_len;

		/*
		 * we request primary to deallocate pages, but we don't do it
		 * in this thread. instead, we notify primary that we would like
		 * to deallocate pages, and this process will receive another
		 * request (in parallel) that will do it for us on another
		 * thread.
		 *
		 * we also don't really care if this succeeds - the data is
		 * already removed from the heap, so it is, for all intents and
		 * purposes, hidden from the rest of DPDK even if some other
		 * process (including this one) may have these pages mapped.
		 *
		 * notifications about deallocated memory happen during sync.
		 */
		request_to_primary(&req);
	}

	RTE_LOG(DEBUG, EAL, "Heap on socket %d was shrunk by %zdMB\n",
		msl->socket_id, aligned_len >> 20ULL);

	rte_mcfg_mem_write_unlock();
free_unlock:
	rte_spinlock_unlock(&(heap->lock));
	return ret;
}

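/*
 * Illustrative sketch, kept out of the build: a worked example of the page
 * trimming arithmetic above, assuming 2 MB pages. Only the page-aligned
 * middle of a freed element can be handed back to the system, so an element
 * spanning 0x1f00000-0x2500000 yields the aligned range 0x2000000-0x2400000
 * (two 2 MB pages). The addresses are made-up example values.
 */
#if 0
#include <stdio.h>
#include <rte_common.h>
#include <rte_memory.h>

static void
example_page_trim(void)
{
	const size_t page_sz = RTE_PGSIZE_2M;
	void *start = (void *)0x1f00000;
	void *end = (void *)0x2500000;
	void *aligned_start = RTE_PTR_ALIGN_CEIL(start, page_sz);
	void *aligned_end = RTE_PTR_ALIGN_FLOOR(end, page_sz);

	printf("freeable range: %p-%p (%zu bytes)\n", aligned_start,
			aligned_end,
			(size_t)RTE_PTR_DIFF(aligned_end, aligned_start));
}
#endif
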
int
malloc_heap_resize(struct malloc_elem *elem, size_t size)
{
	int ret;

	if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
		return -1;

	rte_spinlock_lock(&(elem->heap->lock));

	ret = malloc_elem_resize(elem, size);

	rte_spinlock_unlock(&(elem->heap->lock));

	return ret;
}

/*
 * Function to retrieve data for a given heap
 */
int
malloc_heap_get_stats(struct malloc_heap *heap,
		struct rte_malloc_socket_stats *socket_stats)
{
	size_t idx;
	struct malloc_elem *elem;

	rte_spinlock_lock(&heap->lock);

	/* Initialise variables for heap */
	socket_stats->free_count = 0;
	socket_stats->heap_freesz_bytes = 0;
	socket_stats->greatest_free_size = 0;

	/* Iterate through free list */
	for (idx = 0; idx < RTE_HEAP_NUM_FREELISTS; idx++) {
		for (elem = LIST_FIRST(&heap->free_head[idx]);
			!!elem; elem = LIST_NEXT(elem, free_list))
		{
			socket_stats->free_count++;
			socket_stats->heap_freesz_bytes += elem->size;
			if (elem->size > socket_stats->greatest_free_size)
				socket_stats->greatest_free_size = elem->size;
		}
	}
	/* Get stats on overall heap and allocated memory on this heap */
	socket_stats->heap_totalsz_bytes = heap->total_size;
	socket_stats->heap_allocsz_bytes = (socket_stats->heap_totalsz_bytes -
			socket_stats->heap_freesz_bytes);
	socket_stats->alloc_count = heap->alloc_count;

	rte_spinlock_unlock(&heap->lock);
	return 0;
}

/*
 * Function to dump the contents of a given heap
 */
void
malloc_heap_dump(struct malloc_heap *heap, FILE *f)
{
	struct malloc_elem *elem;

	rte_spinlock_lock(&heap->lock);

	fprintf(f, "Heap size: 0x%zx\n", heap->total_size);
	fprintf(f, "Heap alloc count: %u\n", heap->alloc_count);

	elem = heap->first;
	while (elem) {
		malloc_elem_dump(elem, f);
		elem = elem->next;
	}

	rte_spinlock_unlock(&heap->lock);
}

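/*
 * Illustrative sketch, kept out of the build: malloc_heap_get_stats() and
 * malloc_heap_dump() back the public rte_malloc_get_socket_stats() and
 * rte_malloc_dump_heaps() calls; this is how an application would typically
 * inspect a heap. The function name is a placeholder.
 */
#if 0
#include <stdio.h>
#include <rte_malloc.h>

static void
example_dump_socket_heap(int socket_id)
{
	struct rte_malloc_socket_stats stats;

	if (rte_malloc_get_socket_stats(socket_id, &stats) == 0)
		printf("socket %d: total %zu, free %zu, biggest free %zu\n",
				socket_id, stats.heap_totalsz_bytes,
				stats.heap_freesz_bytes,
				stats.greatest_free_size);

	/* element-by-element dump of every heap */
	rte_malloc_dump_heaps(stdout);
}
#endif
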
static int
destroy_elem(struct malloc_elem *elem, size_t len)
{
	struct malloc_heap *heap = elem->heap;

	/* notify all subscribers that a memory area is going to be removed */
	eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, elem, len);

	/* this element can be removed */
	malloc_elem_free_list_remove(elem);
	malloc_elem_hide_region(elem, elem, len);

	heap->total_size -= len;

	memset(elem, 0, sizeof(*elem));

	return 0;
}

struct rte_memseg_list *
malloc_heap_create_external_seg(void *va_addr, rte_iova_t iova_addrs[],
		unsigned int n_pages, size_t page_sz, const char *seg_name,
		unsigned int socket_id)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	char fbarray_name[RTE_FBARRAY_NAME_LEN];
	struct rte_memseg_list *msl = NULL;
	struct rte_fbarray *arr;
	size_t seg_len = n_pages * page_sz;
	unsigned int i;

	/* first, find a free memseg list */
	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
		struct rte_memseg_list *tmp = &mcfg->memsegs[i];
		if (tmp->base_va == NULL) {
			msl = tmp;
			break;
		}
	}
	if (msl == NULL) {
		RTE_LOG(ERR, EAL, "Couldn't find empty memseg list\n");
		rte_errno = ENOSPC;
		return NULL;
	}

	snprintf(fbarray_name, sizeof(fbarray_name), "%s_%p",
			seg_name, va_addr);

	/* create the backing fbarray */
	if (rte_fbarray_init(&msl->memseg_arr, fbarray_name, n_pages,
			sizeof(struct rte_memseg)) < 0) {
		RTE_LOG(ERR, EAL, "Couldn't create fbarray backing the memseg list\n");
		return NULL;
	}
	arr = &msl->memseg_arr;

	/* fbarray created, fill it up */
	for (i = 0; i < n_pages; i++) {
		struct rte_memseg *ms;

		rte_fbarray_set_used(arr, i);
		ms = rte_fbarray_get(arr, i);
		ms->addr = RTE_PTR_ADD(va_addr, i * page_sz);
		ms->iova = iova_addrs == NULL ? RTE_BAD_IOVA : iova_addrs[i];
		ms->hugepage_sz = page_sz;
		ms->len = page_sz;
		ms->nchannel = rte_memory_get_nchannel();
		ms->nrank = rte_memory_get_nrank();
		ms->socket_id = socket_id;
	}

	/* set up the memseg list */
	msl->base_va = va_addr;
	msl->page_sz = page_sz;
	msl->socket_id = socket_id;
	msl->len = seg_len;
	msl->version = 0;
	msl->external = 1;

	return msl;
}

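/*
 * Illustrative sketch, kept out of the build: malloc_heap_create_external_seg()
 * is reached through the external heap API. A rough usage flow, assuming the
 * caller already owns a page-aligned buffer of n_pages * page_sz bytes; the
 * heap name "ext_heap" and type string "example" are placeholders, and IOVA
 * addresses are omitted here (so they are left unset).
 */
#if 0
#include <stddef.h>
#include <rte_malloc.h>

static void *
example_alloc_from_external(void *va_addr, size_t page_sz,
		unsigned int n_pages)
{
	int socket;

	/* create an empty named heap and attach the user memory to it */
	if (rte_malloc_heap_create("ext_heap") != 0)
		return NULL;
	if (rte_malloc_heap_memory_add("ext_heap", va_addr,
			n_pages * page_sz, NULL, n_pages, page_sz) != 0)
		return NULL;

	/* external heaps are addressed through their pseudo socket ID */
	socket = rte_malloc_heap_get_socket("ext_heap");
	if (socket < 0)
		return NULL;

	return rte_malloc_socket("example", 4096, 0, socket);
}
#endif
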
struct extseg_walk_arg {
	void *va_addr;
	size_t len;
	struct rte_memseg_list *msl;
};

static int
extseg_walk(const struct rte_memseg_list *msl, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct extseg_walk_arg *wa = arg;

	if (msl->base_va == wa->va_addr && msl->len == wa->len) {
		unsigned int found_idx;

		/* msl is const */
		found_idx = msl - mcfg->memsegs;
		wa->msl = &mcfg->memsegs[found_idx];
		return 1;
	}
	return 0;
}

struct rte_memseg_list *
malloc_heap_find_external_seg(void *va_addr, size_t len)
{
	struct extseg_walk_arg wa;
	int res;

	wa.va_addr = va_addr;
	wa.len = len;

	res = rte_memseg_list_walk_thread_unsafe(extseg_walk, &wa);

	if (res != 1) {
		/* 0 means nothing was found, -1 shouldn't happen */
		if (res == 0)
			rte_errno = ENOENT;
		return NULL;
	}
	return wa.msl;
}

int
malloc_heap_destroy_external_seg(struct rte_memseg_list *msl)
{
	/* destroy the fbarray backing this memory */
	if (rte_fbarray_destroy(&msl->memseg_arr) < 0)
		return -1;

	/* reset the memseg list */
	memset(msl, 0, sizeof(*msl));

	return 0;
}

int
malloc_heap_add_external_memory(struct malloc_heap *heap,
		struct rte_memseg_list *msl)
{
	/* erase contents of new memory */
	memset(msl->base_va, 0, msl->len);

	/* now, add newly minted memory to the malloc heap */
	malloc_heap_add_memory(heap, msl, msl->base_va, msl->len);

	heap->total_size += msl->len;

	/* all done! */
	RTE_LOG(DEBUG, EAL, "Added segment for heap %s starting at %p\n",
			heap->name, msl->base_va);

	/* notify all subscribers that a new memory area has been added */
	eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC,
			msl->base_va, msl->len);

	return 0;
}

int
malloc_heap_remove_external_memory(struct malloc_heap *heap, void *va_addr,
		size_t len)
{
	struct malloc_elem *elem = heap->first;

	/* find element with specified va address */
	while (elem != NULL && elem != va_addr) {
		elem = elem->next;
		/* stop if we've blown past our VA */
		if (elem > (struct malloc_elem *)va_addr) {
			rte_errno = ENOENT;
			return -1;
		}
	}
	/* check if element was found */
	if (elem == NULL || elem->msl->len != len) {
		rte_errno = ENOENT;
		return -1;
	}
	/* if element's size is not equal to segment len, segment is busy */
	if (elem->state == ELEM_BUSY || elem->size != len) {
		rte_errno = EBUSY;
		return -1;
	}
	return destroy_elem(elem, len);
}

int
malloc_heap_create(struct malloc_heap *heap, const char *heap_name)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	uint32_t next_socket_id = mcfg->next_socket_id;

	/* prevent overflow. did you really create 2 billion heaps??? */
	if (next_socket_id > INT32_MAX) {
		RTE_LOG(ERR, EAL, "Cannot assign new socket ID's\n");
		rte_errno = ENOSPC;
		return -1;
	}

	/* initialize empty heap */
	heap->alloc_count = 0;
	heap->first = NULL;
	heap->last = NULL;
	LIST_INIT(heap->free_head);
	rte_spinlock_init(&heap->lock);
	heap->total_size = 0;
	heap->socket_id = next_socket_id;

	/* we hold a global mem hotplug writelock, so it's safe to increment */
	mcfg->next_socket_id++;

	/* set up name */
	strlcpy(heap->name, heap_name, RTE_HEAP_NAME_MAX_LEN);
	return 0;
}

int
malloc_heap_destroy(struct malloc_heap *heap)
{
	if (heap->alloc_count != 0) {
		RTE_LOG(ERR, EAL, "Heap is still in use\n");
		rte_errno = EBUSY;
		return -1;
	}
	if (heap->first != NULL || heap->last != NULL) {
		RTE_LOG(ERR, EAL, "Heap still contains memory segments\n");
		rte_errno = EBUSY;
		return -1;
	}
	if (heap->total_size != 0)
		RTE_LOG(ERR, EAL, "Total size not zero, heap is likely corrupt\n");

	/* after this, the lock will be dropped */
	memset(heap, 0, sizeof(*heap));

	return 0;
}

int
rte_eal_malloc_heap_init(void)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	unsigned int i;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (internal_conf->match_allocations)
		RTE_LOG(DEBUG, EAL, "Hugepages will be freed exactly as allocated.\n");

	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
		/* assign min socket ID to external heaps */
		mcfg->next_socket_id = EXTERNAL_HEAP_MIN_SOCKET_ID;

		/* assign names to default DPDK heaps */
		for (i = 0; i < rte_socket_count(); i++) {
			struct malloc_heap *heap = &mcfg->malloc_heaps[i];
			char heap_name[RTE_HEAP_NAME_MAX_LEN];
			int socket_id = rte_socket_id_by_idx(i);

			snprintf(heap_name, sizeof(heap_name),
					"socket_%i", socket_id);
			strlcpy(heap->name, heap_name, RTE_HEAP_NAME_MAX_LEN);
			heap->socket_id = socket_id;
		}
	}

	if (register_mp_requests()) {
		RTE_LOG(ERR, EAL, "Couldn't register malloc multiprocess actions\n");
		rte_mcfg_mem_read_unlock();
		return -1;
	}

	/* unlock mem hotplug here. it's safe for primary as no requests can
	 * even come before primary itself is fully initialized, and secondaries
	 * do not need to initialize the heap.
	 */
	rte_mcfg_mem_read_unlock();

	/* secondary process does not need to initialize anything */
	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	/* add all IOVA-contiguous areas to the heap */
	return rte_memseg_contig_walk(malloc_add_seg, NULL);
}