/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#include <errno.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>

#include <rte_fbarray.h>
#include <rte_memory.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
#include <rte_eal_paging.h>
#include <rte_errno.h>
#include <rte_log.h>
#ifndef RTE_EXEC_ENV_WINDOWS
#include <rte_telemetry.h>
#endif

#include "eal_memalloc.h"
#include "eal_private.h"
#include "eal_internal_cfg.h"
#include "eal_memcfg.h"
#include "eal_options.h"
#include "malloc_heap.h"

/*
 * Try to reserve a virtual area of *size bytes. If the reservation succeeds,
 * return the pointer to the reserved area and keep *size unmodified.
 * Otherwise, if shrinking is allowed, retry with a smaller zone: decrease
 * *size by page_sz until it reaches 0, in which case return NULL.
 * Note: this function returns an address which is a multiple of the
 * requested page size.
 */

#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i"

static void *next_baseaddr;
static uint64_t system_page_sz;

#define MAX_MMAP_WITH_DEFINED_ADDR_TRIES 5
void *
eal_get_virtual_area(void *requested_addr, size_t *size,
	size_t page_sz, int flags, int reserve_flags)
{
	bool addr_is_hint, allow_shrink, unmap, no_align;
	uint64_t map_sz;
	void *mapped_addr, *aligned_addr;
	uint8_t try = 0;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (system_page_sz == 0)
		system_page_sz = rte_mem_page_size();

	RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size);

	addr_is_hint = (flags & EAL_VIRTUAL_AREA_ADDR_IS_HINT) > 0;
	allow_shrink = (flags & EAL_VIRTUAL_AREA_ALLOW_SHRINK) > 0;
	unmap = (flags & EAL_VIRTUAL_AREA_UNMAP) > 0;

	if (next_baseaddr == NULL && internal_conf->base_virtaddr != 0 &&
			rte_eal_process_type() == RTE_PROC_PRIMARY)
		next_baseaddr = (void *) internal_conf->base_virtaddr;

#ifdef RTE_ARCH_64
	if (next_baseaddr == NULL && internal_conf->base_virtaddr == 0 &&
			rte_eal_process_type() == RTE_PROC_PRIMARY)
		next_baseaddr = (void *) eal_get_baseaddr();
#endif
	if (requested_addr == NULL && next_baseaddr != NULL) {
		requested_addr = next_baseaddr;
		requested_addr = RTE_PTR_ALIGN(requested_addr, page_sz);
		addr_is_hint = true;
	}

	/* we don't need alignment of resulting pointer in the following cases:
	 *
	 * 1. page size is equal to system page size
	 * 2. we have a requested address, and it is page-aligned, and we will
	 *    be discarding the address if we get a different one.
	 *
	 * for all other cases, alignment is potentially necessary.
	 */
	no_align = (requested_addr != NULL &&
		requested_addr == RTE_PTR_ALIGN(requested_addr, page_sz) &&
		!addr_is_hint) ||
		page_sz == system_page_sz;

	do {
		map_sz = no_align ? *size : *size + page_sz;
		if (map_sz > SIZE_MAX) {
			RTE_LOG(ERR, EAL, "Map size too big\n");
			rte_errno = E2BIG;
			return NULL;
		}

		mapped_addr = eal_mem_reserve(
			requested_addr, (size_t)map_sz, reserve_flags);
		if ((mapped_addr == NULL) && allow_shrink)
			*size -= page_sz;

		if ((mapped_addr != NULL) && addr_is_hint &&
				(mapped_addr != requested_addr)) {
			try++;
			next_baseaddr = RTE_PTR_ADD(next_baseaddr, page_sz);
			if (try <= MAX_MMAP_WITH_DEFINED_ADDR_TRIES) {
				/* hint was not used. Try with another offset */
				eal_mem_free(mapped_addr, map_sz);
				mapped_addr = NULL;
				requested_addr = next_baseaddr;
			}
		}
	} while ((allow_shrink || addr_is_hint) &&
		(mapped_addr == NULL) && (*size > 0));

	/* align resulting address - if map failed, we will ignore the value
	 * anyway, so no need to add additional checks.
	 */
	aligned_addr = no_align ? mapped_addr :
			RTE_PTR_ALIGN(mapped_addr, page_sz);

	if (*size == 0) {
		RTE_LOG(ERR, EAL, "Cannot get a virtual area of any size: %s\n",
			rte_strerror(rte_errno));
		return NULL;
	} else if (mapped_addr == NULL) {
		RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n",
			rte_strerror(rte_errno));
		return NULL;
	} else if (requested_addr != NULL && !addr_is_hint &&
			aligned_addr != requested_addr) {
		RTE_LOG(ERR, EAL, "Cannot get a virtual area at requested address: %p (got %p)\n",
			requested_addr, aligned_addr);
		eal_mem_free(mapped_addr, map_sz);
		rte_errno = EADDRNOTAVAIL;
		return NULL;
	} else if (requested_addr != NULL && addr_is_hint &&
			aligned_addr != requested_addr) {
		/*
		 * demote this warning to debug if we did not explicitly request
		 * a base virtual address.
		 */
		if (internal_conf->base_virtaddr != 0) {
			RTE_LOG(WARNING, EAL, "WARNING! Base virtual address hint (%p != %p) not respected!\n",
				requested_addr, aligned_addr);
			RTE_LOG(WARNING, EAL, " This may cause issues with mapping memory into secondary processes\n");
		} else {
			RTE_LOG(DEBUG, EAL, "WARNING! Base virtual address hint (%p != %p) not respected!\n",
				requested_addr, aligned_addr);
			RTE_LOG(DEBUG, EAL, " This may cause issues with mapping memory into secondary processes\n");
		}
	} else if (next_baseaddr != NULL) {
		next_baseaddr = RTE_PTR_ADD(aligned_addr, *size);
	}

	RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n",
		aligned_addr, *size);

	if (unmap) {
		eal_mem_free(mapped_addr, map_sz);
	} else if (!no_align) {
		void *map_end, *aligned_end;
		size_t before_len, after_len;

		/* when we reserve space with alignment, we add alignment to
		 * mapping size. On 32-bit, if 1GB alignment was requested, this
		 * would waste 1GB of address space, which is a luxury we cannot
		 * afford. so, if alignment was performed, check if any unneeded
		 * address space can be unmapped back.
		 */

		map_end = RTE_PTR_ADD(mapped_addr, (size_t)map_sz);
		aligned_end = RTE_PTR_ADD(aligned_addr, *size);

		/* unmap space before aligned mmap address */
		before_len = RTE_PTR_DIFF(aligned_addr, mapped_addr);
		if (before_len > 0)
			eal_mem_free(mapped_addr, before_len);

		/* unmap space after aligned end mmap address */
		after_len = RTE_PTR_DIFF(map_end, aligned_end);
		if (after_len > 0)
			eal_mem_free(aligned_end, after_len);
	}

	if (!unmap) {
		/* Exclude these pages from a core dump. */
		eal_mem_set_dump(aligned_addr, *size, false);
	}

	return aligned_addr;
}

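/*
 * Illustrative sketch (not compiled into EAL) of how internal code might use
 * eal_get_virtual_area() above: reserve a page-aligned region without backing
 * it, treating the base address only as a hint. example_reserve_va() is a
 * hypothetical helper, and passing 0 reserve_flags is an example assumption.
 *
 *	static void *
 *	example_reserve_va(size_t *len, size_t page_sz)
 *	{
 *		int flags = EAL_VIRTUAL_AREA_ADDR_IS_HINT |
 *				EAL_VIRTUAL_AREA_ALLOW_SHRINK;
 *
 *		// NULL request: the EAL picks and advances next_baseaddr itself
 *		return eal_get_virtual_area(NULL, len, page_sz, flags, 0);
 *	}
 *
 * On success the returned pointer is page_sz-aligned and *len holds the size
 * actually reserved (possibly shrunk); eal_mem_free() releases it again.
 */
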
int
eal_memseg_list_init_named(struct rte_memseg_list *msl, const char *name,
		uint64_t page_sz, int n_segs, int socket_id, bool heap)
{
	if (rte_fbarray_init(&msl->memseg_arr, name, n_segs,
			sizeof(struct rte_memseg))) {
		RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n",
			rte_strerror(rte_errno));
		return -1;
	}

	msl->page_sz = page_sz;
	msl->socket_id = socket_id;
	msl->base_va = NULL;
	msl->heap = heap;

	RTE_LOG(DEBUG, EAL,
			"Memseg list allocated at socket %i, page size 0x%"PRIx64"kB\n",
			socket_id, page_sz >> 10);

	return 0;
}

int
eal_memseg_list_init(struct rte_memseg_list *msl, uint64_t page_sz,
		int n_segs, int socket_id, int type_msl_idx, bool heap)
{
	char name[RTE_FBARRAY_NAME_LEN];

	snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id,
		 type_msl_idx);

	return eal_memseg_list_init_named(
		msl, name, page_sz, n_segs, socket_id, heap);
}

int
eal_memseg_list_alloc(struct rte_memseg_list *msl, int reserve_flags)
{
	size_t page_sz, mem_sz;
	void *addr;

	page_sz = msl->page_sz;
	mem_sz = page_sz * msl->memseg_arr.len;

	addr = eal_get_virtual_area(
		msl->base_va, &mem_sz, page_sz, 0, reserve_flags);
	if (addr == NULL) {
#ifndef RTE_EXEC_ENV_WINDOWS
		/* The hint would be misleading on Windows, because address
		 * is by default system-selected (base VA = 0).
		 * However, this function is called from many places,
		 * including common code, so don't duplicate the message.
		 */
		if (rte_errno == EADDRNOTAVAIL)
			RTE_LOG(ERR, EAL, "Cannot reserve %llu bytes at [%p] - "
				"please use '--" OPT_BASE_VIRTADDR "' option\n",
				(unsigned long long)mem_sz, msl->base_va);
#endif
		return -1;
	}
	msl->base_va = addr;
	msl->len = mem_sz;

	RTE_LOG(DEBUG, EAL, "VA reserved for memseg list at %p, size %zx\n",
			addr, mem_sz);

	return 0;
}

void
eal_memseg_list_populate(struct rte_memseg_list *msl, void *addr, int n_segs)
{
	size_t page_sz = msl->page_sz;
	int i;

	for (i = 0; i < n_segs; i++) {
		struct rte_fbarray *arr = &msl->memseg_arr;
		struct rte_memseg *ms = rte_fbarray_get(arr, i);

		if (rte_eal_iova_mode() == RTE_IOVA_VA)
			ms->iova = (uintptr_t)addr;
		else
			ms->iova = RTE_BAD_IOVA;
		ms->addr = addr;
		ms->hugepage_sz = page_sz;
		ms->socket_id = 0;
		ms->len = page_sz;

		rte_fbarray_set_used(arr, i);

		addr = RTE_PTR_ADD(addr, page_sz);
	}
}

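/*
 * Illustrative sketch (not compiled into EAL): the three helpers above are
 * normally used together by EAL init code. A hypothetical caller could set up
 * a fully populated memseg list roughly as follows (error handling trimmed,
 * all parameter values are example assumptions):
 *
 *	static int
 *	example_setup_msl(struct rte_memseg_list *msl, uint64_t page_sz,
 *			int n_segs, int socket_id)
 *	{
 *		if (eal_memseg_list_init(msl, page_sz, n_segs, socket_id,
 *				0, true) < 0)
 *			return -1;
 *		if (eal_memseg_list_alloc(msl, 0) < 0)
 *			return -1;
 *		// mark all n_segs pages as used, starting at the reserved VA
 *		eal_memseg_list_populate(msl, msl->base_va, n_segs);
 *		return 0;
 *	}
 */
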
static struct rte_memseg *
virt2memseg(const void *addr, const struct rte_memseg_list *msl)
{
	const struct rte_fbarray *arr;
	void *start, *end;
	int ms_idx;

	if (msl == NULL)
		return NULL;

	/* a memseg list was specified, check if it's the right one */
	start = msl->base_va;
	end = RTE_PTR_ADD(start, msl->len);

	if (addr < start || addr >= end)
		return NULL;

	/* now, calculate index */
	arr = &msl->memseg_arr;
	ms_idx = RTE_PTR_DIFF(addr, msl->base_va) / msl->page_sz;
	return rte_fbarray_get(arr, ms_idx);
}

static struct rte_memseg_list *
virt2memseg_list(const void *addr)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *msl;
	int msl_idx;

	for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
		void *start, *end;
		msl = &mcfg->memsegs[msl_idx];

		start = msl->base_va;
		end = RTE_PTR_ADD(start, msl->len);
		if (addr >= start && addr < end)
			break;
	}
	/* if we didn't find our memseg list */
	if (msl_idx == RTE_MAX_MEMSEG_LISTS)
		return NULL;
	return msl;
}

struct rte_memseg_list *
rte_mem_virt2memseg_list(const void *addr)
{
	return virt2memseg_list(addr);
}

struct virtiova {
	rte_iova_t iova;
	void *virt;
};
static int
find_virt(const struct rte_memseg_list *msl __rte_unused,
		const struct rte_memseg *ms, void *arg)
{
	struct virtiova *vi = arg;
	if (vi->iova >= ms->iova && vi->iova < (ms->iova + ms->len)) {
		size_t offset = vi->iova - ms->iova;
		vi->virt = RTE_PTR_ADD(ms->addr, offset);
		/* stop the walk */
		return 1;
	}
	return 0;
}
static int
find_virt_legacy(const struct rte_memseg_list *msl __rte_unused,
		const struct rte_memseg *ms, size_t len, void *arg)
{
	struct virtiova *vi = arg;
	if (vi->iova >= ms->iova && vi->iova < (ms->iova + len)) {
		size_t offset = vi->iova - ms->iova;
		vi->virt = RTE_PTR_ADD(ms->addr, offset);
		/* stop the walk */
		return 1;
	}
	return 0;
}

void *
rte_mem_iova2virt(rte_iova_t iova)
{
	struct virtiova vi;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	memset(&vi, 0, sizeof(vi));

	vi.iova = iova;
	/* for legacy mem, we can get away with scanning VA-contiguous segments,
	 * as we know they are PA-contiguous as well
	 */
	if (internal_conf->legacy_mem)
		rte_memseg_contig_walk(find_virt_legacy, &vi);
	else
		rte_memseg_walk(find_virt, &vi);

	return vi.virt;
}

struct rte_memseg *
rte_mem_virt2memseg(const void *addr, const struct rte_memseg_list *msl)
{
	return virt2memseg(addr, msl != NULL ? msl :
			rte_mem_virt2memseg_list(addr));
}

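/*
 * Illustrative sketch (not compiled into EAL): resolving the IOVA of an
 * arbitrary address inside DPDK-managed memory with the lookups above.
 * example_virt2iova() is a hypothetical helper, not an EAL API.
 *
 *	static rte_iova_t
 *	example_virt2iova(const void *addr)
 *	{
 *		const struct rte_memseg *ms;
 *
 *		ms = rte_mem_virt2memseg(addr, NULL);
 *		if (ms == NULL || ms->iova == RTE_BAD_IOVA)
 *			return RTE_BAD_IOVA;
 *		// IOVA of the page plus the offset of addr within it
 *		return ms->iova + RTE_PTR_DIFF(addr, ms->addr);
 *	}
 *
 * The public rte_mem_virt2iova() already covers this case; the sketch only
 * shows how it maps onto the helpers in this file.
 */
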
static int
physmem_size(const struct rte_memseg_list *msl, void *arg)
{
	uint64_t *total_len = arg;

	if (msl->external)
		return 0;

	*total_len += msl->memseg_arr.count * msl->page_sz;

	return 0;
}

/* get the total size of memory */
uint64_t
rte_eal_get_physmem_size(void)
{
	uint64_t total_len = 0;

	rte_memseg_list_walk(physmem_size, &total_len);

	return total_len;
}

static int
dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
		void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int msl_idx, ms_idx, fd;
	FILE *f = arg;

	msl_idx = msl - mcfg->memsegs;
	if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS)
		return -1;

	ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
	if (ms_idx < 0)
		return -1;

	fd = eal_memalloc_get_seg_fd(msl_idx, ms_idx);
	fprintf(f, "Segment %i-%i: IOVA:0x%"PRIx64", len:%zu, "
			"virt:%p, socket_id:%"PRId32", "
			"hugepage_sz:%"PRIu64", nchannel:%"PRIx32", "
			"nrank:%"PRIx32" fd:%i\n",
			msl_idx, ms_idx,
			ms->iova,
			ms->len,
			ms->addr,
			ms->socket_id,
			ms->hugepage_sz,
			ms->nchannel,
			ms->nrank,
			fd);

	return 0;
}

/*
 * Defining here because declared in rte_memory.h, but the actual implementation
 * is in eal_common_memalloc.c, like all other memalloc internals.
 */
int
rte_mem_event_callback_register(const char *name, rte_mem_event_callback_t clb,
		void *arg)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* FreeBSD boots with legacy mem enabled by default */
	if (internal_conf->legacy_mem) {
		RTE_LOG(DEBUG, EAL, "Registering mem event callbacks not supported\n");
		rte_errno = ENOTSUP;
		return -1;
	}
	return eal_memalloc_mem_event_callback_register(name, clb, arg);
}

int
rte_mem_event_callback_unregister(const char *name, void *arg)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* FreeBSD boots with legacy mem enabled by default */
	if (internal_conf->legacy_mem) {
		RTE_LOG(DEBUG, EAL, "Registering mem event callbacks not supported\n");
		rte_errno = ENOTSUP;
		return -1;
	}
	return eal_memalloc_mem_event_callback_unregister(name, arg);
}

int
rte_mem_alloc_validator_register(const char *name,
		rte_mem_alloc_validator_t clb, int socket_id, size_t limit)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* FreeBSD boots with legacy mem enabled by default */
	if (internal_conf->legacy_mem) {
		RTE_LOG(DEBUG, EAL, "Registering mem alloc validators not supported\n");
		rte_errno = ENOTSUP;
		return -1;
	}
	return eal_memalloc_mem_alloc_validator_register(name, clb, socket_id,
			limit);
}

int
rte_mem_alloc_validator_unregister(const char *name, int socket_id)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* FreeBSD boots with legacy mem enabled by default */
	if (internal_conf->legacy_mem) {
		RTE_LOG(DEBUG, EAL, "Registering mem alloc validators not supported\n");
		rte_errno = ENOTSUP;
		return -1;
	}
	return eal_memalloc_mem_alloc_validator_unregister(name, socket_id);
}

/* Dump the physical memory layout on console */
void
rte_dump_physmem_layout(FILE *f)
{
	rte_memseg_walk(dump_memseg, f);
}

static int
check_iova(const struct rte_memseg_list *msl __rte_unused,
		const struct rte_memseg *ms, void *arg)
{
	uint64_t *mask = arg;
	rte_iova_t iova;

	/* highest address within segment */
	iova = (ms->iova + ms->len) - 1;
	if (!(iova & *mask))
		return 0;

	RTE_LOG(DEBUG, EAL, "memseg iova %"PRIx64", len %zx, out of range\n",
			ms->iova, ms->len);

	RTE_LOG(DEBUG, EAL, "\tusing dma mask %"PRIx64"\n", *mask);
	return 1;
}

#define MAX_DMA_MASK_BITS 63

/* check memseg iovas are within the required range based on dma mask */
static int
check_dma_mask(uint8_t maskbits, bool thread_unsafe)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	uint64_t mask;
	int ret;

	/* Sanity check. We only allow widths that can be handled in a 64-bit
	 * variable; any higher value is likely wrong. */
	if (maskbits > MAX_DMA_MASK_BITS) {
		RTE_LOG(ERR, EAL, "wrong dma mask size %u (Max: %u)\n",
				maskbits, MAX_DMA_MASK_BITS);
		return -1;
	}

	/* create dma mask */
	mask = ~((1ULL << maskbits) - 1);

	if (thread_unsafe)
		ret = rte_memseg_walk_thread_unsafe(check_iova, &mask);
	else
		ret = rte_memseg_walk(check_iova, &mask);

	if (ret)
		/*
		 * Dma mask precludes hugepage usage.
		 * This device can not be used and we do not need to keep
		 * the dma mask.
		 */
		return 1;

	/*
	 * we need to keep the more restricted maskbit for checking
	 * potential dynamic memory allocation in the future.
	 */
	mcfg->dma_maskbits = mcfg->dma_maskbits == 0 ? maskbits :
			RTE_MIN(mcfg->dma_maskbits, maskbits);

	return 0;
}

int
rte_mem_check_dma_mask(uint8_t maskbits)
{
	return check_dma_mask(maskbits, false);
}

int
rte_mem_check_dma_mask_thread_unsafe(uint8_t maskbits)
{
	return check_dma_mask(maskbits, true);
}

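/*
 * Illustrative sketch (not compiled into EAL): a driver with a 48-bit DMA
 * addressing limit could validate the current memory layout as follows.
 * The 48-bit width and example_check_48bit_dma() are example assumptions.
 *
 *	static int
 *	example_check_48bit_dma(void)
 *	{
 *		int ret = rte_mem_check_dma_mask(48);
 *
 *		if (ret < 0)
 *			return -1;	// invalid mask width
 *		if (ret > 0)
 *			return -1;	// some hugepage IOVAs exceed the mask
 *		return 0;		// all segments fit within the mask
 *	}
 */
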
/*
 * Set dma mask to use when memory initialization is done.
 *
 * This function should ONLY be used by code executed before the memory
 * initialization. PMDs should use rte_mem_check_dma_mask if the device has
 * addressing limitations.
 */
void
rte_mem_set_dma_mask(uint8_t maskbits)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;

	mcfg->dma_maskbits = mcfg->dma_maskbits == 0 ? maskbits :
			RTE_MIN(mcfg->dma_maskbits, maskbits);
}

/* return the number of memory channels */
unsigned rte_memory_get_nchannel(void)
{
	return rte_eal_get_configuration()->mem_config->nchannel;
}

/* return the number of memory ranks */
unsigned rte_memory_get_nrank(void)
{
	return rte_eal_get_configuration()->mem_config->nrank;
}

static int
rte_eal_memdevice_init(void)
{
	struct rte_config *config;
	const struct internal_config *internal_conf;

	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
		return 0;

	internal_conf = eal_get_internal_configuration();
	config = rte_eal_get_configuration();
	config->mem_config->nchannel = internal_conf->force_nchannel;
	config->mem_config->nrank = internal_conf->force_nrank;

	return 0;
}

/* Lock page in physical memory and prevent from swapping. */
int
rte_mem_lock_page(const void *virt)
{
	uintptr_t virtual = (uintptr_t)virt;
	size_t page_size = rte_mem_page_size();
	uintptr_t aligned = RTE_PTR_ALIGN_FLOOR(virtual, page_size);
	return rte_mem_lock((void *)aligned, page_size);
}

int
rte_memseg_contig_walk_thread_unsafe(rte_memseg_contig_walk_t func, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int i, ms_idx, ret = 0;

	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
		struct rte_memseg_list *msl = &mcfg->memsegs[i];
		const struct rte_memseg *ms;
		struct rte_fbarray *arr;

		if (msl->memseg_arr.count == 0)
			continue;

		arr = &msl->memseg_arr;

		ms_idx = rte_fbarray_find_next_used(arr, 0);
		while (ms_idx >= 0) {
			int n_segs;
			size_t len;

			ms = rte_fbarray_get(arr, ms_idx);

			/* find how many more segments there are, starting with
			 * this one.
			 */
			n_segs = rte_fbarray_find_contig_used(arr, ms_idx);
			len = n_segs * msl->page_sz;

			ret = func(msl, ms, len, arg);
			if (ret)
				return ret;
			ms_idx = rte_fbarray_find_next_used(arr,
					ms_idx + n_segs);
		}
	}
	return 0;
}

int
rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg)
{
	int ret = 0;

	/* do not allow allocations/frees/init while we iterate */
	rte_mcfg_mem_read_lock();
	ret = rte_memseg_contig_walk_thread_unsafe(func, arg);
	rte_mcfg_mem_read_unlock();

	return ret;
}

int
rte_memseg_walk_thread_unsafe(rte_memseg_walk_t func, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int i, ms_idx, ret = 0;

	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
		struct rte_memseg_list *msl = &mcfg->memsegs[i];
		const struct rte_memseg *ms;
		struct rte_fbarray *arr;

		if (msl->memseg_arr.count == 0)
			continue;

		arr = &msl->memseg_arr;

		ms_idx = rte_fbarray_find_next_used(arr, 0);
		while (ms_idx >= 0) {
			ms = rte_fbarray_get(arr, ms_idx);
			ret = func(msl, ms, arg);
			if (ret)
				return ret;
			ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1);
		}
	}
	return 0;
}

int
rte_memseg_walk(rte_memseg_walk_t func, void *arg)
{
	int ret = 0;

	/* do not allow allocations/frees/init while we iterate */
	rte_mcfg_mem_read_lock();
	ret = rte_memseg_walk_thread_unsafe(func, arg);
	rte_mcfg_mem_read_unlock();

	return ret;
}

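/*
 * Illustrative sketch (not compiled into EAL): counting hugepage segments with
 * the walk API above. example_count_cb() and example_count_pages() are
 * hypothetical names.
 *
 *	static int
 *	example_count_cb(const struct rte_memseg_list *msl __rte_unused,
 *			const struct rte_memseg *ms __rte_unused, void *arg)
 *	{
 *		unsigned int *count = arg;
 *
 *		(*count)++;
 *		return 0;	// keep walking; non-zero would stop the walk
 *	}
 *
 *	static unsigned int
 *	example_count_pages(void)
 *	{
 *		unsigned int count = 0;
 *
 *		rte_memseg_walk(example_count_cb, &count);
 *		return count;
 *	}
 */
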
int
rte_memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int i, ret = 0;

	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
		struct rte_memseg_list *msl = &mcfg->memsegs[i];

		if (msl->base_va == NULL)
			continue;

		ret = func(msl, arg);
		if (ret)
			return ret;
	}
	return 0;
}

int
rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg)
{
	int ret = 0;

	/* do not allow allocations/frees/init while we iterate */
	rte_mcfg_mem_read_lock();
	ret = rte_memseg_list_walk_thread_unsafe(func, arg);
	rte_mcfg_mem_read_unlock();

	return ret;
}

int
rte_memseg_get_fd_thread_unsafe(const struct rte_memseg *ms)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *msl;
	struct rte_fbarray *arr;
	int msl_idx, seg_idx, ret;

	if (ms == NULL) {
		rte_errno = EINVAL;
		return -1;
	}

	msl = rte_mem_virt2memseg_list(ms->addr);
	if (msl == NULL) {
		rte_errno = EINVAL;
		return -1;
	}
	arr = &msl->memseg_arr;

	msl_idx = msl - mcfg->memsegs;
	seg_idx = rte_fbarray_find_idx(arr, ms);

	if (!rte_fbarray_is_used(arr, seg_idx)) {
		rte_errno = ENOENT;
		return -1;
	}

	/* segment fd API is not supported for external segments */
	if (msl->external) {
		rte_errno = ENOTSUP;
		return -1;
	}

	ret = eal_memalloc_get_seg_fd(msl_idx, seg_idx);
	if (ret < 0) {
		rte_errno = -ret;
		ret = -1;
	}
	return ret;
}

int
rte_memseg_get_fd(const struct rte_memseg *ms)
{
	int ret;

	rte_mcfg_mem_read_lock();
	ret = rte_memseg_get_fd_thread_unsafe(ms);
	rte_mcfg_mem_read_unlock();

	return ret;
}

int
rte_memseg_get_fd_offset_thread_unsafe(const struct rte_memseg *ms,
		size_t *offset)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *msl;
	struct rte_fbarray *arr;
	int msl_idx, seg_idx, ret;

	if (ms == NULL || offset == NULL) {
		rte_errno = EINVAL;
		return -1;
	}

	msl = rte_mem_virt2memseg_list(ms->addr);
	if (msl == NULL) {
		rte_errno = EINVAL;
		return -1;
	}
	arr = &msl->memseg_arr;

	msl_idx = msl - mcfg->memsegs;
	seg_idx = rte_fbarray_find_idx(arr, ms);

	if (!rte_fbarray_is_used(arr, seg_idx)) {
		rte_errno = ENOENT;
		return -1;
	}

	/* segment fd API is not supported for external segments */
	if (msl->external) {
		rte_errno = ENOTSUP;
		return -1;
	}

	ret = eal_memalloc_get_seg_fd_offset(msl_idx, seg_idx, offset);
	if (ret < 0) {
		rte_errno = -ret;
		ret = -1;
	}
	return ret;
}

int
rte_memseg_get_fd_offset(const struct rte_memseg *ms, size_t *offset)
{
	int ret;

	rte_mcfg_mem_read_lock();
	ret = rte_memseg_get_fd_offset_thread_unsafe(ms, offset);
	rte_mcfg_mem_read_unlock();

	return ret;
}

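/*
 * Illustrative sketch (not compiled into EAL): retrieving the backing file
 * descriptor and offset for the segment that holds a given address, e.g. to
 * pass the mapping to another process. example_get_seg_fd() is hypothetical.
 *
 *	static int
 *	example_get_seg_fd(const void *addr, size_t *offset)
 *	{
 *		const struct rte_memseg *ms;
 *		int fd;
 *
 *		ms = rte_mem_virt2memseg(addr, NULL);
 *		if (ms == NULL)
 *			return -1;
 *		fd = rte_memseg_get_fd(ms);
 *		if (fd < 0)
 *			return -1;	// e.g. rte_errno == ENOTSUP
 *		if (rte_memseg_get_fd_offset(ms, offset) < 0)
 *			return -1;
 *		return fd;
 *	}
 */
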
int
rte_extmem_register(void *va_addr, size_t len, rte_iova_t iova_addrs[],
		unsigned int n_pages, size_t page_sz)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	unsigned int socket_id, n;
	int ret = 0;

	if (va_addr == NULL || page_sz == 0 || len == 0 ||
			!rte_is_power_of_2(page_sz) ||
			RTE_ALIGN(len, page_sz) != len ||
			((len / page_sz) != n_pages && iova_addrs != NULL) ||
			!rte_is_aligned(va_addr, page_sz)) {
		rte_errno = EINVAL;
		return -1;
	}
	rte_mcfg_mem_write_lock();

	/* make sure the segment doesn't already exist */
	if (malloc_heap_find_external_seg(va_addr, len) != NULL) {
		rte_errno = EEXIST;
		ret = -1;
		goto unlock;
	}

	/* get next available socket ID */
	socket_id = mcfg->next_socket_id;
	if (socket_id > INT32_MAX) {
		RTE_LOG(ERR, EAL, "Cannot assign new socket IDs\n");
		rte_errno = ENOSPC;
		ret = -1;
		goto unlock;
	}

	/* we can create a new memseg */
	n = len / page_sz;
	if (malloc_heap_create_external_seg(va_addr, iova_addrs, n,
			page_sz, "extmem", socket_id) == NULL) {
		ret = -1;
		goto unlock;
	}

	/* memseg list successfully created - increment next socket ID */
	mcfg->next_socket_id++;
unlock:
	rte_mcfg_mem_write_unlock();
	return ret;
}

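/*
 * Illustrative sketch (not compiled into EAL): registering an externally
 * allocated, page-aligned buffer with DPDK in the primary process, and the
 * matching teardown. Passing a NULL IOVA table (segments are then marked
 * RTE_BAD_IOVA) is an example assumption; example_register_extmem() is
 * hypothetical.
 *
 *	static int
 *	example_register_extmem(void *buf, size_t len, size_t page_sz)
 *	{
 *		if (rte_extmem_register(buf, len, NULL, 0, page_sz) < 0)
 *			return -1;
 *		// ... use the memory (e.g. attach it to a malloc heap) ...
 *		return rte_extmem_unregister(buf, len);
 *	}
 *
 * A secondary process wanting to use the same area would call
 * rte_extmem_attach(buf, len) and later rte_extmem_detach(buf, len).
 */
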
int
rte_extmem_unregister(void *va_addr, size_t len)
{
	struct rte_memseg_list *msl;
	int ret = 0;

	if (va_addr == NULL || len == 0) {
		rte_errno = EINVAL;
		return -1;
	}
	rte_mcfg_mem_write_lock();

	/* find our segment */
	msl = malloc_heap_find_external_seg(va_addr, len);
	if (msl == NULL) {
		rte_errno = ENOENT;
		ret = -1;
		goto unlock;
	}

	ret = malloc_heap_destroy_external_seg(msl);
unlock:
	rte_mcfg_mem_write_unlock();
	return ret;
}

static int
sync_memory(void *va_addr, size_t len, bool attach)
{
	struct rte_memseg_list *msl;
	int ret = 0;

	if (va_addr == NULL || len == 0) {
		rte_errno = EINVAL;
		return -1;
	}
	rte_mcfg_mem_write_lock();

	/* find our segment */
	msl = malloc_heap_find_external_seg(va_addr, len);
	if (msl == NULL) {
		rte_errno = ENOENT;
		ret = -1;
		goto unlock;
	}
	if (attach)
		ret = rte_fbarray_attach(&msl->memseg_arr);
	else
		ret = rte_fbarray_detach(&msl->memseg_arr);

unlock:
	rte_mcfg_mem_write_unlock();
	return ret;
}

int
rte_extmem_attach(void *va_addr, size_t len)
{
	return sync_memory(va_addr, len, true);
}

int
rte_extmem_detach(void *va_addr, size_t len)
{
	return sync_memory(va_addr, len, false);
}

/* detach all EAL memory */
int
rte_eal_memory_detach(void)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	size_t page_sz = rte_mem_page_size();
	unsigned int i;

	if (internal_conf->in_memory == 1)
		return 0;

	rte_rwlock_write_lock(&mcfg->memory_hotplug_lock);

	/* detach internal memory subsystem data first */
	if (eal_memalloc_cleanup())
		RTE_LOG(ERR, EAL, "Could not release memory subsystem data\n");

	for (i = 0; i < RTE_DIM(mcfg->memsegs); i++) {
		struct rte_memseg_list *msl = &mcfg->memsegs[i];

		/* skip uninitialized segments */
		if (msl->base_va == NULL)
			continue;
		/*
		 * external segments are supposed to be detached at this point,
		 * but if they aren't, we can't really do anything about it,
		 * because if we skip them here, they'll become invalid after
		 * we unmap the memconfig anyway. however, if this is externally
		 * referenced memory, we have no business unmapping it.
		 */
		if (!msl->external)
			if (rte_mem_unmap(msl->base_va, msl->len) != 0)
				RTE_LOG(ERR, EAL, "Could not unmap memory: %s\n",
						rte_strerror(rte_errno));

		/*
		 * we are detaching the fbarray rather than destroying because
		 * other processes might still reference this fbarray, and we
		 * have no way of knowing if they still do.
		 */
		if (rte_fbarray_detach(&msl->memseg_arr))
			RTE_LOG(ERR, EAL, "Could not detach fbarray: %s\n",
					rte_strerror(rte_errno));
	}
	rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock);

	/*
	 * we've detached the memseg lists, so we can unmap the shared mem
	 * config - we can't zero it out because it might still be referenced
	 * by other processes.
	 */
	if (internal_conf->no_shconf == 0 && mcfg->mem_cfg_addr != 0) {
		if (rte_mem_unmap(mcfg, RTE_ALIGN(sizeof(*mcfg), page_sz)) != 0)
			RTE_LOG(ERR, EAL, "Could not unmap shared memory config: %s\n",
					rte_strerror(rte_errno));
	}
	rte_eal_get_configuration()->mem_config = NULL;

	return 0;
}

/* init memory subsystem */
int
rte_eal_memory_init(void)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	int retval;
	RTE_LOG(DEBUG, EAL, "Setting up physically contiguous memory...\n");

	if (!mcfg)
		return -1;

	/* lock mem hotplug here, to prevent races while we init */
	rte_mcfg_mem_read_lock();

	if (rte_eal_memseg_init() < 0)
		goto fail;

	if (eal_memalloc_init() < 0)
		goto fail;

	retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
			rte_eal_hugepage_init() :
			rte_eal_hugepage_attach();
	if (retval < 0)
		goto fail;

	if (internal_conf->no_shconf == 0 && rte_eal_memdevice_init() < 0)
		goto fail;

	return 0;
fail:
	rte_mcfg_mem_read_unlock();
	return -1;
}

#ifndef RTE_EXEC_ENV_WINDOWS
#define EAL_MEMZONE_LIST_REQ "/eal/memzone_list"
#define EAL_MEMZONE_INFO_REQ "/eal/memzone_info"
#define EAL_HEAP_LIST_REQ "/eal/heap_list"
#define EAL_HEAP_INFO_REQ "/eal/heap_info"
#define ADDR_STR 15

/* Telemetry callback handler to return heap stats for requested heap id. */
static int
handle_eal_heap_info_request(const char *cmd __rte_unused, const char *params,
		struct rte_tel_data *d)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_malloc_socket_stats sock_stats;
	struct malloc_heap *heap;
	unsigned int heap_id;

	if (params == NULL || strlen(params) == 0)
		return -1;

	heap_id = (unsigned int)strtoul(params, NULL, 10);

	/* Get the heap stats of user provided heap id */
	heap = &mcfg->malloc_heaps[heap_id];
	malloc_heap_get_stats(heap, &sock_stats);

	rte_tel_data_start_dict(d);
	rte_tel_data_add_dict_int(d, "Heap_id", heap_id);
	rte_tel_data_add_dict_string(d, "Name", heap->name);
	rte_tel_data_add_dict_u64(d, "Heap_size",
			sock_stats.heap_totalsz_bytes);
	rte_tel_data_add_dict_u64(d, "Free_size", sock_stats.heap_freesz_bytes);
	rte_tel_data_add_dict_u64(d, "Alloc_size",
			sock_stats.heap_allocsz_bytes);
	rte_tel_data_add_dict_u64(d, "Greatest_free_size",
			sock_stats.greatest_free_size);
	rte_tel_data_add_dict_u64(d, "Alloc_count", sock_stats.alloc_count);
	rte_tel_data_add_dict_u64(d, "Free_count", sock_stats.free_count);

	return 0;
}

/* Telemetry callback handler to list the heap ids setup. */
static int
handle_eal_heap_list_request(const char *cmd __rte_unused,
		const char *params __rte_unused,
		struct rte_tel_data *d)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_malloc_socket_stats sock_stats;
	unsigned int heap_id;

	rte_tel_data_start_array(d, RTE_TEL_INT_VAL);
	/* Iterate through all initialised heaps */
	for (heap_id = 0; heap_id < RTE_MAX_HEAPS; heap_id++) {
		struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id];

		malloc_heap_get_stats(heap, &sock_stats);
		if (sock_stats.heap_totalsz_bytes != 0)
			rte_tel_data_add_array_int(d, heap_id);
	}

	return 0;
}

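/*
 * Illustrative usage note (not compiled into EAL): the endpoints above can be
 * queried interactively with usertools/dpdk-telemetry.py while an application
 * is running, e.g. by entering "/eal/heap_list" and then "/eal/heap_info,0",
 * where the value after the comma is the heap id parameter.
 */
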
/* Telemetry callback handler to return memzone info for requested index. */
static int
handle_eal_memzone_info_request(const char *cmd __rte_unused,
		const char *params, struct rte_tel_data *d)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *msl = NULL;
	int ms_idx, ms_count = 0;
	void *cur_addr, *mz_end;
	struct rte_memzone *mz;
	struct rte_memseg *ms;
	char addr[ADDR_STR];
	unsigned int mz_idx;
	size_t page_sz;

	if (params == NULL || strlen(params) == 0)
		return -1;

	mz_idx = strtoul(params, NULL, 10);

	/* Get the memzone handle using index */
	mz = rte_fbarray_get(&mcfg->memzones, mz_idx);

	rte_tel_data_start_dict(d);
	rte_tel_data_add_dict_int(d, "Zone", mz_idx);
	rte_tel_data_add_dict_string(d, "Name", mz->name);
	rte_tel_data_add_dict_int(d, "Length", mz->len);
	snprintf(addr, ADDR_STR, "%p", mz->addr);
	rte_tel_data_add_dict_string(d, "Address", addr);
	rte_tel_data_add_dict_int(d, "Socket", mz->socket_id);
	rte_tel_data_add_dict_int(d, "Flags", mz->flags);

	/* go through each page occupied by this memzone */
	msl = rte_mem_virt2memseg_list(mz->addr);
	if (!msl) {
		RTE_LOG(DEBUG, EAL, "Skipping bad memzone\n");
		return -1;
	}
	page_sz = (size_t)mz->hugepage_sz;
	cur_addr = RTE_PTR_ALIGN_FLOOR(mz->addr, page_sz);
	mz_end = RTE_PTR_ADD(cur_addr, mz->len);

	ms_idx = RTE_PTR_DIFF(mz->addr, msl->base_va) / page_sz;
	ms = rte_fbarray_get(&msl->memseg_arr, ms_idx);

	rte_tel_data_add_dict_int(d, "Hugepage_size", page_sz);
	snprintf(addr, ADDR_STR, "%p", ms->addr);
	rte_tel_data_add_dict_string(d, "Hugepage_base", addr);

	do {
		/* advance VA to next page */
		cur_addr = RTE_PTR_ADD(cur_addr, page_sz);

		/* memzones occupy contiguous segments */
		++ms;
		ms_count++;
	} while (cur_addr < mz_end);

	rte_tel_data_add_dict_int(d, "Hugepage_used", ms_count);

	return 0;
}

static void
memzone_list_cb(const struct rte_memzone *mz __rte_unused,
		void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_tel_data *d = arg;
	int mz_idx;

	mz_idx = rte_fbarray_find_idx(&mcfg->memzones, mz);
	rte_tel_data_add_array_int(d, mz_idx);
}


/* Telemetry callback handler to list the memzones reserved. */
static int
handle_eal_memzone_list_request(const char *cmd __rte_unused,
		const char *params __rte_unused,
		struct rte_tel_data *d)
{
	rte_tel_data_start_array(d, RTE_TEL_INT_VAL);
	rte_memzone_walk(memzone_list_cb, d);

	return 0;
}

RTE_INIT(memory_telemetry)
{
	rte_telemetry_register_cmd(
			EAL_MEMZONE_LIST_REQ, handle_eal_memzone_list_request,
			"List of memzone index reserved. Takes no parameters");
	rte_telemetry_register_cmd(
			EAL_MEMZONE_INFO_REQ, handle_eal_memzone_info_request,
			"Returns memzone info. Parameters: int mz_id");
	rte_telemetry_register_cmd(
			EAL_HEAP_LIST_REQ, handle_eal_heap_list_request,
			"List of heap index setup. Takes no parameters");
	rte_telemetry_register_cmd(
			EAL_HEAP_INFO_REQ, handle_eal_heap_info_request,
			"Returns malloc heap stats. Parameters: int heap_id");
}
#endif