/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#include <fcntl.h>
#include <errno.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <unistd.h>
#include <inttypes.h>
#include <sys/queue.h>

#include <rte_fbarray.h>
#include <rte_memory.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
#include <rte_eal_paging.h>
#include <rte_errno.h>
#include <rte_log.h>

#include "eal_memalloc.h"
#include "eal_private.h"
#include "eal_internal_cfg.h"
#include "eal_memcfg.h"
#include "eal_options.h"
#include "malloc_heap.h"

/*
 * Try to reserve a virtual area of *size bytes. If successful, return the
 * pointer to the reserved area and keep *size unmodified. Otherwise, if
 * shrinking is allowed, retry with a smaller zone: decrease *size by
 * page_sz until it reaches 0, in which case return NULL. Note: this
 * function returns an address which is a multiple of the requested page
 * size.
 */

#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i"

static void *next_baseaddr;
static uint64_t system_page_sz;

#define MAX_MMAP_WITH_DEFINED_ADDR_TRIES 5
void *
eal_get_virtual_area(void *requested_addr, size_t *size,
	size_t page_sz, int flags, int reserve_flags)
{
	bool addr_is_hint, allow_shrink, unmap, no_align;
	uint64_t map_sz;
	void *mapped_addr, *aligned_addr;
	uint8_t try = 0;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (system_page_sz == 0)
		system_page_sz = rte_mem_page_size();

	RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size);

	addr_is_hint = (flags & EAL_VIRTUAL_AREA_ADDR_IS_HINT) > 0;
	allow_shrink = (flags & EAL_VIRTUAL_AREA_ALLOW_SHRINK) > 0;
	unmap = (flags & EAL_VIRTUAL_AREA_UNMAP) > 0;

	if (next_baseaddr == NULL && internal_conf->base_virtaddr != 0 &&
			rte_eal_process_type() == RTE_PROC_PRIMARY)
		next_baseaddr = (void *) internal_conf->base_virtaddr;

#ifdef RTE_ARCH_64
	if (next_baseaddr == NULL && internal_conf->base_virtaddr == 0 &&
			rte_eal_process_type() == RTE_PROC_PRIMARY)
		next_baseaddr = (void *) eal_get_baseaddr();
#endif
	if (requested_addr == NULL && next_baseaddr != NULL) {
		requested_addr = next_baseaddr;
		requested_addr = RTE_PTR_ALIGN(requested_addr, page_sz);
		addr_is_hint = true;
	}

	/* we don't need alignment of resulting pointer in the following cases:
	 *
	 * 1. page size is equal to system page size
	 * 2. we have a requested address, and it is page-aligned, and we will
	 *    be discarding the address if we get a different one.
	 *
	 * for all other cases, alignment is potentially necessary.
	 */
	no_align = (requested_addr != NULL &&
		requested_addr == RTE_PTR_ALIGN(requested_addr, page_sz) &&
		!addr_is_hint) ||
		page_sz == system_page_sz;

	do {
		map_sz = no_align ? *size : *size + page_sz;
		if (map_sz > SIZE_MAX) {
			RTE_LOG(ERR, EAL, "Map size too big\n");
			rte_errno = E2BIG;
			return NULL;
		}

		mapped_addr = eal_mem_reserve(
			requested_addr, (size_t)map_sz, reserve_flags);
		if ((mapped_addr == NULL) && allow_shrink)
			*size -= page_sz;

		if ((mapped_addr != NULL) && addr_is_hint &&
				(mapped_addr != requested_addr)) {
			try++;
			next_baseaddr = RTE_PTR_ADD(next_baseaddr, page_sz);
			if (try <= MAX_MMAP_WITH_DEFINED_ADDR_TRIES) {
				/* hint was not used. Try with another offset */
				eal_mem_free(mapped_addr, map_sz);
				mapped_addr = NULL;
				requested_addr = next_baseaddr;
			}
		}
	} while ((allow_shrink || addr_is_hint) &&
		(mapped_addr == NULL) && (*size > 0));

	/* align resulting address - if map failed, we will ignore the value
	 * anyway, so no need to add additional checks.
	 */
	aligned_addr = no_align ? mapped_addr :
			RTE_PTR_ALIGN(mapped_addr, page_sz);

	if (*size == 0) {
		RTE_LOG(ERR, EAL, "Cannot get a virtual area of any size: %s\n",
			rte_strerror(rte_errno));
		return NULL;
	} else if (mapped_addr == NULL) {
		RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n",
			rte_strerror(rte_errno));
		return NULL;
	} else if (requested_addr != NULL && !addr_is_hint &&
			aligned_addr != requested_addr) {
		RTE_LOG(ERR, EAL, "Cannot get a virtual area at requested address: %p (got %p)\n",
			requested_addr, aligned_addr);
		eal_mem_free(mapped_addr, map_sz);
		rte_errno = EADDRNOTAVAIL;
		return NULL;
	} else if (requested_addr != NULL && addr_is_hint &&
			aligned_addr != requested_addr) {
		RTE_LOG(WARNING, EAL, "WARNING! Base virtual address hint (%p != %p) not respected!\n",
			requested_addr, aligned_addr);
		RTE_LOG(WARNING, EAL, " This may cause issues with mapping memory into secondary processes\n");
	} else if (next_baseaddr != NULL) {
		next_baseaddr = RTE_PTR_ADD(aligned_addr, *size);
	}

	RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n",
		aligned_addr, *size);

	if (unmap) {
		eal_mem_free(mapped_addr, map_sz);
	} else if (!no_align) {
		void *map_end, *aligned_end;
		size_t before_len, after_len;

		/* when we reserve space with alignment, we add alignment to
		 * mapping size. On 32-bit, if 1GB alignment was requested, this
		 * would waste 1GB of address space, which is a luxury we cannot
		 * afford. so, if alignment was performed, check if any unneeded
		 * address space can be unmapped back.
		 */

		map_end = RTE_PTR_ADD(mapped_addr, (size_t)map_sz);
		aligned_end = RTE_PTR_ADD(aligned_addr, *size);

		/* unmap space before aligned mmap address */
		before_len = RTE_PTR_DIFF(aligned_addr, mapped_addr);
		if (before_len > 0)
			eal_mem_free(mapped_addr, before_len);

		/* unmap space after aligned end mmap address */
		after_len = RTE_PTR_DIFF(map_end, aligned_end);
		if (after_len > 0)
			eal_mem_free(aligned_end, after_len);
	}

	if (!unmap) {
		/* Exclude these pages from a core dump. */
		eal_mem_set_dump(aligned_addr, *size, false);
	}

	return aligned_addr;
}
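
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * how an EAL-internal caller might use eal_get_virtual_area() to reserve a
 * 2 MB-aligned region, allowing the area to shrink if the full size is not
 * available. The sizes chosen here are assumptions for the example only.
 *
 *	size_t sz = RTE_PGSIZE_1G;
 *	void *va = eal_get_virtual_area(NULL, &sz, RTE_PGSIZE_2M,
 *			EAL_VIRTUAL_AREA_ALLOW_SHRINK, 0);
 *	if (va != NULL) {
 *		// [va, va + sz) stays reserved (UNMAP flag not given);
 *		// release it once it is no longer needed.
 *		eal_mem_free(va, sz);
 *	}
 */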

int
eal_memseg_list_init_named(struct rte_memseg_list *msl, const char *name,
		uint64_t page_sz, int n_segs, int socket_id, bool heap)
{
	if (rte_fbarray_init(&msl->memseg_arr, name, n_segs,
			sizeof(struct rte_memseg))) {
		RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n",
			rte_strerror(rte_errno));
		return -1;
	}

	msl->page_sz = page_sz;
	msl->socket_id = socket_id;
	msl->base_va = NULL;
	msl->heap = heap;

	RTE_LOG(DEBUG, EAL,
		"Memseg list allocated at socket %i, page size 0x%"PRIx64"kB\n",
		socket_id, page_sz >> 10);

	return 0;
}

int
eal_memseg_list_init(struct rte_memseg_list *msl, uint64_t page_sz,
		int n_segs, int socket_id, int type_msl_idx, bool heap)
{
	char name[RTE_FBARRAY_NAME_LEN];

	snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id,
		 type_msl_idx);

	return eal_memseg_list_init_named(
		msl, name, page_sz, n_segs, socket_id, heap);
}

int
eal_memseg_list_alloc(struct rte_memseg_list *msl, int reserve_flags)
{
	size_t page_sz, mem_sz;
	void *addr;

	page_sz = msl->page_sz;
	mem_sz = page_sz * msl->memseg_arr.len;

	addr = eal_get_virtual_area(
		msl->base_va, &mem_sz, page_sz, 0, reserve_flags);
	if (addr == NULL) {
#ifndef RTE_EXEC_ENV_WINDOWS
		/* The hint would be misleading on Windows, because address
		 * is by default system-selected (base VA = 0).
		 * However, this function is called from many places,
		 * including common code, so don't duplicate the message.
		 */
		if (rte_errno == EADDRNOTAVAIL)
			RTE_LOG(ERR, EAL, "Cannot reserve %llu bytes at [%p] - "
				"please use '--" OPT_BASE_VIRTADDR "' option\n",
				(unsigned long long)mem_sz, msl->base_va);
#endif
		return -1;
	}
	msl->base_va = addr;
	msl->len = mem_sz;

	RTE_LOG(DEBUG, EAL, "VA reserved for memseg list at %p, size %zx\n",
			addr, mem_sz);

	return 0;
}

void
eal_memseg_list_populate(struct rte_memseg_list *msl, void *addr, int n_segs)
{
	size_t page_sz = msl->page_sz;
	int i;

	for (i = 0; i < n_segs; i++) {
		struct rte_fbarray *arr = &msl->memseg_arr;
		struct rte_memseg *ms = rte_fbarray_get(arr, i);

		if (rte_eal_iova_mode() == RTE_IOVA_VA)
			ms->iova = (uintptr_t)addr;
		else
			ms->iova = RTE_BAD_IOVA;
		ms->addr = addr;
		ms->hugepage_sz = page_sz;
		ms->socket_id = 0;
		ms->len = page_sz;

		rte_fbarray_set_used(arr, i);

		addr = RTE_PTR_ADD(addr, page_sz);
	}
}
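
/*
 * Illustrative sketch (editor's addition): one possible init/alloc/populate
 * sequence a platform-specific memory backend could follow to describe a
 * reserved VA range with a memseg list. The list index, segment count and
 * page size below are assumptions made for the example.
 *
 *	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
 *	struct rte_memseg_list *msl = &mcfg->memsegs[0];	(arbitrary choice)
 *	int n_segs = 128;					(assumption)
 *
 *	if (eal_memseg_list_init(msl, RTE_PGSIZE_2M, n_segs, 0, 0, true) == 0 &&
 *			eal_memseg_list_alloc(msl, 0) == 0)
 *		eal_memseg_list_populate(msl, msl->base_va, n_segs);
 */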

static struct rte_memseg *
virt2memseg(const void *addr, const struct rte_memseg_list *msl)
{
	const struct rte_fbarray *arr;
	void *start, *end;
	int ms_idx;

	if (msl == NULL)
		return NULL;

	/* a memseg list was specified, check if it's the right one */
	start = msl->base_va;
	end = RTE_PTR_ADD(start, msl->len);

	if (addr < start || addr >= end)
		return NULL;

	/* now, calculate index */
	arr = &msl->memseg_arr;
	ms_idx = RTE_PTR_DIFF(addr, msl->base_va) / msl->page_sz;
	return rte_fbarray_get(arr, ms_idx);
}

static struct rte_memseg_list *
virt2memseg_list(const void *addr)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *msl;
	int msl_idx;

	for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
		void *start, *end;
		msl = &mcfg->memsegs[msl_idx];

		start = msl->base_va;
		end = RTE_PTR_ADD(start, msl->len);
		if (addr >= start && addr < end)
			break;
	}
	/* if we didn't find our memseg list */
	if (msl_idx == RTE_MAX_MEMSEG_LISTS)
		return NULL;
	return msl;
}

struct rte_memseg_list *
rte_mem_virt2memseg_list(const void *addr)
{
	return virt2memseg_list(addr);
}

struct virtiova {
	rte_iova_t iova;
	void *virt;
};
static int
find_virt(const struct rte_memseg_list *msl __rte_unused,
		const struct rte_memseg *ms, void *arg)
{
	struct virtiova *vi = arg;
	if (vi->iova >= ms->iova && vi->iova < (ms->iova + ms->len)) {
		size_t offset = vi->iova - ms->iova;
		vi->virt = RTE_PTR_ADD(ms->addr, offset);
		/* stop the walk */
		return 1;
	}
	return 0;
}
static int
find_virt_legacy(const struct rte_memseg_list *msl __rte_unused,
		const struct rte_memseg *ms, size_t len, void *arg)
{
	struct virtiova *vi = arg;
	if (vi->iova >= ms->iova && vi->iova < (ms->iova + len)) {
		size_t offset = vi->iova - ms->iova;
		vi->virt = RTE_PTR_ADD(ms->addr, offset);
		/* stop the walk */
		return 1;
	}
	return 0;
}

void *
rte_mem_iova2virt(rte_iova_t iova)
{
	struct virtiova vi;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	memset(&vi, 0, sizeof(vi));

	vi.iova = iova;
	/* for legacy mem, we can get away with scanning VA-contiguous segments,
	 * as we know they are PA-contiguous as well
	 */
	if (internal_conf->legacy_mem)
		rte_memseg_contig_walk(find_virt_legacy, &vi);
	else
		rte_memseg_walk(find_virt, &vi);

	return vi.virt;
}

struct rte_memseg *
rte_mem_virt2memseg(const void *addr, const struct rte_memseg_list *msl)
{
	return virt2memseg(addr, msl != NULL ? msl :
			rte_mem_virt2memseg_list(addr));
}
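
/*
 * Illustrative sketch (editor's addition): round-tripping between virtual
 * and IO addresses with the lookups above. "buf" is assumed to point into
 * DPDK-managed memory (e.g. an rte_malloc() allocation).
 *
 *	const struct rte_memseg *ms = rte_mem_virt2memseg(buf, NULL);
 *	rte_iova_t iova = rte_mem_virt2iova(buf);
 *	void *back = rte_mem_iova2virt(iova);
 *	// back == buf when the lookups succeed; the calls return NULL or
 *	// RTE_BAD_IOVA if buf is not backed by DPDK memory.
 */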

static int
physmem_size(const struct rte_memseg_list *msl, void *arg)
{
	uint64_t *total_len = arg;

	if (msl->external)
		return 0;

	*total_len += msl->memseg_arr.count * msl->page_sz;

	return 0;
}

/* get the total size of memory */
uint64_t
rte_eal_get_physmem_size(void)
{
	uint64_t total_len = 0;

	rte_memseg_list_walk(physmem_size, &total_len);

	return total_len;
}

static int
dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
		void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int msl_idx, ms_idx, fd;
	FILE *f = arg;

	msl_idx = msl - mcfg->memsegs;
	if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS)
		return -1;

	ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
	if (ms_idx < 0)
		return -1;

	fd = eal_memalloc_get_seg_fd(msl_idx, ms_idx);
	fprintf(f, "Segment %i-%i: IOVA:0x%"PRIx64", len:%zu, "
			"virt:%p, socket_id:%"PRId32", "
			"hugepage_sz:%"PRIu64", nchannel:%"PRIx32", "
			"nrank:%"PRIx32" fd:%i\n",
			msl_idx, ms_idx,
			ms->iova,
			ms->len,
			ms->addr,
			ms->socket_id,
			ms->hugepage_sz,
			ms->nchannel,
			ms->nrank,
			fd);

	return 0;
}

/*
 * Defined here because it is declared in rte_memory.h, but the actual
 * implementation is in eal_common_memalloc.c, like all other memalloc
 * internals.
 */
int
rte_mem_event_callback_register(const char *name, rte_mem_event_callback_t clb,
		void *arg)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* FreeBSD boots with legacy mem enabled by default */
	if (internal_conf->legacy_mem) {
		RTE_LOG(DEBUG, EAL, "Registering mem event callbacks not supported\n");
		rte_errno = ENOTSUP;
		return -1;
	}
	return eal_memalloc_mem_event_callback_register(name, clb, arg);
}

int
rte_mem_event_callback_unregister(const char *name, void *arg)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* FreeBSD boots with legacy mem enabled by default */
	if (internal_conf->legacy_mem) {
		RTE_LOG(DEBUG, EAL, "Registering mem event callbacks not supported\n");
		rte_errno = ENOTSUP;
		return -1;
	}
	return eal_memalloc_mem_event_callback_unregister(name, arg);
}
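
/*
 * Illustrative sketch (editor's addition): registering a memory event
 * callback to observe hotplug allocations and frees. The callback body and
 * the "my-driver" name are assumptions for the example.
 *
 *	static void
 *	mem_event_cb(enum rte_mem_event type, const void *addr, size_t len,
 *			void *arg __rte_unused)
 *	{
 *		RTE_LOG(DEBUG, EAL, "mem event %s: %p, len %zu\n",
 *			type == RTE_MEM_EVENT_ALLOC ? "alloc" : "free",
 *			addr, len);
 *	}
 *
 *	rte_mem_event_callback_register("my-driver", mem_event_cb, NULL);
 */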

int
rte_mem_alloc_validator_register(const char *name,
		rte_mem_alloc_validator_t clb, int socket_id, size_t limit)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* FreeBSD boots with legacy mem enabled by default */
	if (internal_conf->legacy_mem) {
		RTE_LOG(DEBUG, EAL, "Registering mem alloc validators not supported\n");
		rte_errno = ENOTSUP;
		return -1;
	}
	return eal_memalloc_mem_alloc_validator_register(name, clb, socket_id,
			limit);
}

int
rte_mem_alloc_validator_unregister(const char *name, int socket_id)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* FreeBSD boots with legacy mem enabled by default */
	if (internal_conf->legacy_mem) {
		RTE_LOG(DEBUG, EAL, "Registering mem alloc validators not supported\n");
		rte_errno = ENOTSUP;
		return -1;
	}
	return eal_memalloc_mem_alloc_validator_unregister(name, socket_id);
}

/* Dump the physical memory layout on console */
void
rte_dump_physmem_layout(FILE *f)
{
	rte_memseg_walk(dump_memseg, f);
}

static int
check_iova(const struct rte_memseg_list *msl __rte_unused,
		const struct rte_memseg *ms, void *arg)
{
	uint64_t *mask = arg;
	rte_iova_t iova;

	/* highest address within segment */
	iova = (ms->iova + ms->len) - 1;
	if (!(iova & *mask))
		return 0;

	RTE_LOG(DEBUG, EAL, "memseg iova %"PRIx64", len %zx, out of range\n",
			ms->iova, ms->len);

	RTE_LOG(DEBUG, EAL, "\tusing dma mask %"PRIx64"\n", *mask);
	return 1;
}

#define MAX_DMA_MASK_BITS 63

/* check memseg iovas are within the required range based on dma mask */
static int
check_dma_mask(uint8_t maskbits, bool thread_unsafe)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	uint64_t mask;
	int ret;

	/* Sanity check: only allow mask widths that can be handled in a
	 * 64-bit variable. Any higher value is likely wrong. */
	if (maskbits > MAX_DMA_MASK_BITS) {
		RTE_LOG(ERR, EAL, "wrong dma mask size %u (Max: %u)\n",
				maskbits, MAX_DMA_MASK_BITS);
		return -1;
	}

	/* create dma mask */
	mask = ~((1ULL << maskbits) - 1);

	if (thread_unsafe)
		ret = rte_memseg_walk_thread_unsafe(check_iova, &mask);
	else
		ret = rte_memseg_walk(check_iova, &mask);

	if (ret)
		/*
		 * Dma mask precludes hugepage usage.
		 * This device can not be used and we do not need to keep
		 * the dma mask.
		 */
		return 1;

	/*
	 * we need to keep the more restricted maskbit for checking
	 * potential dynamic memory allocation in the future.
	 */
	mcfg->dma_maskbits = mcfg->dma_maskbits == 0 ? maskbits :
			RTE_MIN(mcfg->dma_maskbits, maskbits);

	return 0;
}

int
rte_mem_check_dma_mask(uint8_t maskbits)
{
	return check_dma_mask(maskbits, false);
}

int
rte_mem_check_dma_mask_thread_unsafe(uint8_t maskbits)
{
	return check_dma_mask(maskbits, true);
}
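
/*
 * Illustrative sketch (editor's addition): a driver with, say, 48-bit DMA
 * addressing could verify at probe time that every hugepage IOVA fits its
 * mask. The 48-bit width and the error handling are assumptions.
 *
 *	if (rte_mem_check_dma_mask(48) != 0) {
 *		// some memory lies outside the 48-bit range (return 1), or
 *		// the check itself failed (return -1); do not use the device
 *		return -ENODEV;
 *	}
 */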

/*
 * Set dma mask to use when memory initialization is done.
 *
 * This function should ONLY be used by code executed before the memory
 * initialization. PMDs should use rte_mem_check_dma_mask if the device has
 * addressing limitations.
 */
void
rte_mem_set_dma_mask(uint8_t maskbits)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;

	mcfg->dma_maskbits = mcfg->dma_maskbits == 0 ? maskbits :
			RTE_MIN(mcfg->dma_maskbits, maskbits);
}

/* return the number of memory channels */
unsigned rte_memory_get_nchannel(void)
{
	return rte_eal_get_configuration()->mem_config->nchannel;
}

/* return the number of memory ranks */
unsigned rte_memory_get_nrank(void)
{
	return rte_eal_get_configuration()->mem_config->nrank;
}

static int
rte_eal_memdevice_init(void)
{
	struct rte_config *config;
	const struct internal_config *internal_conf;

	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
		return 0;

	internal_conf = eal_get_internal_configuration();
	config = rte_eal_get_configuration();
	config->mem_config->nchannel = internal_conf->force_nchannel;
	config->mem_config->nrank = internal_conf->force_nrank;

	return 0;
}

/* Lock page in physical memory and prevent from swapping. */
int
rte_mem_lock_page(const void *virt)
{
	uintptr_t virtual = (uintptr_t)virt;
	size_t page_size = rte_mem_page_size();
	uintptr_t aligned = RTE_PTR_ALIGN_FLOOR(virtual, page_size);
	return rte_mem_lock((void *)aligned, page_size);
}

int
rte_memseg_contig_walk_thread_unsafe(rte_memseg_contig_walk_t func, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int i, ms_idx, ret = 0;

	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
		struct rte_memseg_list *msl = &mcfg->memsegs[i];
		const struct rte_memseg *ms;
		struct rte_fbarray *arr;

		if (msl->memseg_arr.count == 0)
			continue;

		arr = &msl->memseg_arr;

		ms_idx = rte_fbarray_find_next_used(arr, 0);
		while (ms_idx >= 0) {
			int n_segs;
			size_t len;

			ms = rte_fbarray_get(arr, ms_idx);

			/* find how many more segments there are, starting with
			 * this one.
			 */
			n_segs = rte_fbarray_find_contig_used(arr, ms_idx);
			len = n_segs * msl->page_sz;

			ret = func(msl, ms, len, arg);
			if (ret)
				return ret;
			ms_idx = rte_fbarray_find_next_used(arr,
					ms_idx + n_segs);
		}
	}
	return 0;
}

int
rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg)
{
	int ret = 0;

	/* do not allow allocations/frees/init while we iterate */
	rte_mcfg_mem_read_lock();
	ret = rte_memseg_contig_walk_thread_unsafe(func, arg);
	rte_mcfg_mem_read_unlock();

	return ret;
}
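
/*
 * Illustrative sketch (editor's addition): a contiguous-walk callback that
 * sums the length of every VA-contiguous chunk. Callback and variable names
 * are assumptions for the example.
 *
 *	static int
 *	sum_contig(const struct rte_memseg_list *msl __rte_unused,
 *			const struct rte_memseg *ms __rte_unused,
 *			size_t len, void *arg)
 *	{
 *		*(size_t *)arg += len;
 *		return 0;	// 0 continues the walk, non-zero stops it
 *	}
 *
 *	size_t total = 0;
 *	rte_memseg_contig_walk(sum_contig, &total);
 */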

int
rte_memseg_walk_thread_unsafe(rte_memseg_walk_t func, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int i, ms_idx, ret = 0;

	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
		struct rte_memseg_list *msl = &mcfg->memsegs[i];
		const struct rte_memseg *ms;
		struct rte_fbarray *arr;

		if (msl->memseg_arr.count == 0)
			continue;

		arr = &msl->memseg_arr;

		ms_idx = rte_fbarray_find_next_used(arr, 0);
		while (ms_idx >= 0) {
			ms = rte_fbarray_get(arr, ms_idx);
			ret = func(msl, ms, arg);
			if (ret)
				return ret;
			ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1);
		}
	}
	return 0;
}

int
rte_memseg_walk(rte_memseg_walk_t func, void *arg)
{
	int ret = 0;

	/* do not allow allocations/frees/init while we iterate */
	rte_mcfg_mem_read_lock();
	ret = rte_memseg_walk_thread_unsafe(func, arg);
	rte_mcfg_mem_read_unlock();

	return ret;
}

int
rte_memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int i, ret = 0;

	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
		struct rte_memseg_list *msl = &mcfg->memsegs[i];

		if (msl->base_va == NULL)
			continue;

		ret = func(msl, arg);
		if (ret)
			return ret;
	}
	return 0;
}

int
rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg)
{
	int ret = 0;

	/* do not allow allocations/frees/init while we iterate */
	rte_mcfg_mem_read_lock();
	ret = rte_memseg_list_walk_thread_unsafe(func, arg);
	rte_mcfg_mem_read_unlock();

	return ret;
}

int
rte_memseg_get_fd_thread_unsafe(const struct rte_memseg *ms)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *msl;
	struct rte_fbarray *arr;
	int msl_idx, seg_idx, ret;

	if (ms == NULL) {
		rte_errno = EINVAL;
		return -1;
	}

	msl = rte_mem_virt2memseg_list(ms->addr);
	if (msl == NULL) {
		rte_errno = EINVAL;
		return -1;
	}
	arr = &msl->memseg_arr;

	msl_idx = msl - mcfg->memsegs;
	seg_idx = rte_fbarray_find_idx(arr, ms);

	if (!rte_fbarray_is_used(arr, seg_idx)) {
		rte_errno = ENOENT;
		return -1;
	}

	/* segment fd API is not supported for external segments */
	if (msl->external) {
		rte_errno = ENOTSUP;
		return -1;
	}

	ret = eal_memalloc_get_seg_fd(msl_idx, seg_idx);
	if (ret < 0) {
		rte_errno = -ret;
		ret = -1;
	}
	return ret;
}

int
rte_memseg_get_fd(const struct rte_memseg *ms)
{
	int ret;

	rte_mcfg_mem_read_lock();
	ret = rte_memseg_get_fd_thread_unsafe(ms);
	rte_mcfg_mem_read_unlock();

	return ret;
}

int
rte_memseg_get_fd_offset_thread_unsafe(const struct rte_memseg *ms,
		size_t *offset)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *msl;
	struct rte_fbarray *arr;
	int msl_idx, seg_idx, ret;

	if (ms == NULL || offset == NULL) {
		rte_errno = EINVAL;
		return -1;
	}

	msl = rte_mem_virt2memseg_list(ms->addr);
	if (msl == NULL) {
		rte_errno = EINVAL;
		return -1;
	}
	arr = &msl->memseg_arr;

	msl_idx = msl - mcfg->memsegs;
	seg_idx = rte_fbarray_find_idx(arr, ms);

	if (!rte_fbarray_is_used(arr, seg_idx)) {
		rte_errno = ENOENT;
		return -1;
	}

	/* segment fd API is not supported for external segments */
	if (msl->external) {
		rte_errno = ENOTSUP;
		return -1;
	}

	ret = eal_memalloc_get_seg_fd_offset(msl_idx, seg_idx, offset);
	if (ret < 0) {
		rte_errno = -ret;
		ret = -1;
	}
	return ret;
}

int
rte_memseg_get_fd_offset(const struct rte_memseg *ms, size_t *offset)
{
	int ret;

	rte_mcfg_mem_read_lock();
	ret = rte_memseg_get_fd_offset_thread_unsafe(ms, offset);
	rte_mcfg_mem_read_unlock();

	return ret;
}
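
/*
 * Illustrative sketch (editor's addition): retrieving the file descriptor
 * and offset that back the segment holding an address, e.g. to hand the
 * mapping to another process. "addr" is assumed to point into DPDK memory.
 *
 *	const struct rte_memseg *ms = rte_mem_virt2memseg(addr, NULL);
 *	size_t offset;
 *	int fd = rte_memseg_get_fd(ms);
 *
 *	if (fd >= 0 && rte_memseg_get_fd_offset(ms, &offset) == 0) {
 *		// fd/offset identify the hugepage file region backing ms
 *	}
 */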

int
rte_extmem_register(void *va_addr, size_t len, rte_iova_t iova_addrs[],
		unsigned int n_pages, size_t page_sz)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	unsigned int socket_id, n;
	int ret = 0;

	if (va_addr == NULL || page_sz == 0 || len == 0 ||
			!rte_is_power_of_2(page_sz) ||
			RTE_ALIGN(len, page_sz) != len ||
			((len / page_sz) != n_pages && iova_addrs != NULL) ||
			!rte_is_aligned(va_addr, page_sz)) {
		rte_errno = EINVAL;
		return -1;
	}
	rte_mcfg_mem_write_lock();

	/* make sure the segment doesn't already exist */
	if (malloc_heap_find_external_seg(va_addr, len) != NULL) {
		rte_errno = EEXIST;
		ret = -1;
		goto unlock;
	}

	/* get next available socket ID */
	socket_id = mcfg->next_socket_id;
	if (socket_id > INT32_MAX) {
		RTE_LOG(ERR, EAL, "Cannot assign new socket IDs\n");
		rte_errno = ENOSPC;
		ret = -1;
		goto unlock;
	}

	/* we can create a new memseg */
	n = len / page_sz;
	if (malloc_heap_create_external_seg(va_addr, iova_addrs, n,
			page_sz, "extmem", socket_id) == NULL) {
		ret = -1;
		goto unlock;
	}

	/* memseg list successfully created - increment next socket ID */
	mcfg->next_socket_id++;
unlock:
	rte_mcfg_mem_write_unlock();
	return ret;
}

int
rte_extmem_unregister(void *va_addr, size_t len)
{
	struct rte_memseg_list *msl;
	int ret = 0;

	if (va_addr == NULL || len == 0) {
		rte_errno = EINVAL;
		return -1;
	}
	rte_mcfg_mem_write_lock();

	/* find our segment */
	msl = malloc_heap_find_external_seg(va_addr, len);
	if (msl == NULL) {
		rte_errno = ENOENT;
		ret = -1;
		goto unlock;
	}

	ret = malloc_heap_destroy_external_seg(msl);
unlock:
	rte_mcfg_mem_write_unlock();
	return ret;
}

static int
sync_memory(void *va_addr, size_t len, bool attach)
{
	struct rte_memseg_list *msl;
	int ret = 0;

	if (va_addr == NULL || len == 0) {
		rte_errno = EINVAL;
		return -1;
	}
	rte_mcfg_mem_write_lock();

	/* find our segment */
	msl = malloc_heap_find_external_seg(va_addr, len);
	if (msl == NULL) {
		rte_errno = ENOENT;
		ret = -1;
		goto unlock;
	}
	if (attach)
		ret = rte_fbarray_attach(&msl->memseg_arr);
	else
		ret = rte_fbarray_detach(&msl->memseg_arr);

unlock:
	rte_mcfg_mem_write_unlock();
	return ret;
}

int
rte_extmem_attach(void *va_addr, size_t len)
{
	return sync_memory(va_addr, len, true);
}

int
rte_extmem_detach(void *va_addr, size_t len)
{
	return sync_memory(va_addr, len, false);
}
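
/*
 * Illustrative sketch (editor's addition): registering an externally
 * allocated, page-aligned buffer with DPDK. The buffer origin, size and
 * page size are assumptions; passing iova_addrs == NULL leaves the IOVAs
 * unknown. A process that did not do the registration would call
 * rte_extmem_attach() with the same address and length before using the
 * area, and rte_extmem_detach()/rte_extmem_unregister() on teardown.
 *
 *	size_t page_sz = RTE_PGSIZE_2M;
 *	size_t len = 16 * page_sz;
 *	void *ext_mem = ...;	// page_sz-aligned buffer of len bytes
 *
 *	if (rte_extmem_register(ext_mem, len, NULL, 0, page_sz) != 0)
 *		return -1;
 *	...
 *	rte_extmem_unregister(ext_mem, len);
 */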

/* detach all EAL memory */
int
rte_eal_memory_detach(void)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	size_t page_sz = rte_mem_page_size();
	unsigned int i;

	if (internal_conf->in_memory == 1)
		return 0;

	rte_rwlock_write_lock(&mcfg->memory_hotplug_lock);

	/* detach internal memory subsystem data first */
	if (eal_memalloc_cleanup())
		RTE_LOG(ERR, EAL, "Could not release memory subsystem data\n");

	for (i = 0; i < RTE_DIM(mcfg->memsegs); i++) {
		struct rte_memseg_list *msl = &mcfg->memsegs[i];

		/* skip uninitialized segments */
		if (msl->base_va == NULL)
			continue;
		/*
		 * external segments are supposed to be detached at this point,
		 * but if they aren't, we can't really do anything about it,
		 * because if we skip them here, they'll become invalid after
		 * we unmap the memconfig anyway. however, if this is externally
		 * referenced memory, we have no business unmapping it.
		 */
		if (!msl->external)
			if (rte_mem_unmap(msl->base_va, msl->len) != 0)
				RTE_LOG(ERR, EAL, "Could not unmap memory: %s\n",
						rte_strerror(rte_errno));

		/*
		 * we are detaching the fbarray rather than destroying because
		 * other processes might still reference this fbarray, and we
		 * have no way of knowing if they still do.
		 */
		if (rte_fbarray_detach(&msl->memseg_arr))
			RTE_LOG(ERR, EAL, "Could not detach fbarray: %s\n",
					rte_strerror(rte_errno));
	}
	rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock);

	/*
	 * we've detached the memseg lists, so we can unmap the shared mem
	 * config - we can't zero it out because it might still be referenced
	 * by other processes.
	 */
	if (internal_conf->no_shconf == 0 && mcfg->mem_cfg_addr != 0) {
		if (rte_mem_unmap(mcfg, RTE_ALIGN(sizeof(*mcfg), page_sz)) != 0)
			RTE_LOG(ERR, EAL, "Could not unmap shared memory config: %s\n",
					rte_strerror(rte_errno));
	}
	rte_eal_get_configuration()->mem_config = NULL;

	return 0;
}

/* init memory subsystem */
int
rte_eal_memory_init(void)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	int retval;
	RTE_LOG(DEBUG, EAL, "Setting up physically contiguous memory...\n");

	if (!mcfg)
		return -1;

	/* lock mem hotplug here, to prevent races while we init */
	rte_mcfg_mem_read_lock();

	if (rte_eal_memseg_init() < 0)
		goto fail;

	if (eal_memalloc_init() < 0)
		goto fail;

	retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
			rte_eal_hugepage_init() :
			rte_eal_hugepage_attach();
	if (retval < 0)
		goto fail;

	if (internal_conf->no_shconf == 0 && rte_eal_memdevice_init() < 0)
		goto fail;

	return 0;
fail:
	rte_mcfg_mem_read_unlock();
	return -1;
}