/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2017-2018 Intel Corporation
 */

#include <errno.h>
#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/file.h>
#include <unistd.h>
#include <limits.h>
#include <fcntl.h>
#include <signal.h>
#include <setjmp.h>
#ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */
#include <linux/memfd.h>
#define MEMFD_SUPPORTED
#endif
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
#include <numa.h>
#include <numaif.h>
#endif
#include <linux/falloc.h>
#include <linux/mman.h> /* for hugetlb-related mmap flags */

#include <rte_common.h>
#include <rte_log.h>
#include <rte_eal.h>
#include <rte_memory.h>

#include "eal_filesystem.h"
#include "eal_internal_cfg.h"
#include "eal_memalloc.h"
#include "eal_memcfg.h"
#include "eal_private.h"

const int anonymous_hugepages_supported =
#ifdef MAP_HUGE_SHIFT
		1;
#define RTE_MAP_HUGE_SHIFT MAP_HUGE_SHIFT
#else
		0;
#define RTE_MAP_HUGE_SHIFT 26
#endif

/*
 * we've already checked memfd support at compile-time, but we also need to
 * check if we can create hugepage files with memfd.
 *
 * also, this is not a constant, because while we may be *compiled* with memfd
 * hugetlbfs support, we might not be *running* on a system that supports memfd
 * and/or memfd with hugetlbfs, so we need to be able to adjust this flag at
 * runtime, and fall back to anonymous memory.
 */
static int memfd_create_supported =
#ifdef MFD_HUGETLB
		1;
#define RTE_MFD_HUGETLB MFD_HUGETLB
#else
		0;
#define RTE_MFD_HUGETLB 4U
#endif

/*
 * not all kernel versions support fallocate on hugetlbfs, so fall back to
 * ftruncate and disallow deallocation if fallocate is not supported.
 */
static int fallocate_supported = -1; /* unknown */

/*
 * we have two modes - single file segments, and file-per-page mode.
 *
 * for single-file segments, we use memseg_list_fd to store the segment fd,
 * while the fds[] will not be allocated, and len will be set to 0.
 *
 * for file-per-page mode, each page will have its own fd, so 'memseg_list_fd'
 * will be invalid (set to -1), and we'll use 'fds' to keep track of page fd's.
 *
 * we cannot know how many pages a system will have in advance, but we do know
 * that they come in lists, and we know lengths of these lists. so, simply store
 * a malloc'd array of fd's indexed by list and segment index.
 *
 * they will be initialized at startup, and filled as we allocate/deallocate
 * segments.
 */
static struct {
	int *fds; /**< dynamically allocated array of segment lock fd's */
	int memseg_list_fd; /**< memseg list fd */
	int len; /**< total length of the array */
	int count; /**< entries used in the array */
} fd_list[RTE_MAX_MEMSEG_LISTS];

/** local copy of a memory map, used to synchronize memory hotplug in MP */
static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS];

static sigjmp_buf huge_jmpenv;

static void huge_sigbus_handler(int signo __rte_unused)
{
	siglongjmp(huge_jmpenv, 1);
}

/* Put sigsetjmp into a wrapper function to avoid a compilation error: any
 * non-volatile, non-static local variable in the stack frame calling
 * sigsetjmp might be clobbered by a call to longjmp.
110 */ 111 static int huge_wrap_sigsetjmp(void) 112 { 113 return sigsetjmp(huge_jmpenv, 1); 114 } 115 116 static struct sigaction huge_action_old; 117 static int huge_need_recover; 118 119 static void 120 huge_register_sigbus(void) 121 { 122 sigset_t mask; 123 struct sigaction action; 124 125 sigemptyset(&mask); 126 sigaddset(&mask, SIGBUS); 127 action.sa_flags = 0; 128 action.sa_mask = mask; 129 action.sa_handler = huge_sigbus_handler; 130 131 huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old); 132 } 133 134 static void 135 huge_recover_sigbus(void) 136 { 137 if (huge_need_recover) { 138 sigaction(SIGBUS, &huge_action_old, NULL); 139 huge_need_recover = 0; 140 } 141 } 142 143 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES 144 static bool 145 check_numa(void) 146 { 147 bool ret = true; 148 /* Check if kernel supports NUMA. */ 149 if (numa_available() != 0) { 150 EAL_LOG(DEBUG, "NUMA is not supported."); 151 ret = false; 152 } 153 return ret; 154 } 155 156 static void 157 prepare_numa(int *oldpolicy, struct bitmask *oldmask, int socket_id) 158 { 159 EAL_LOG(DEBUG, "Trying to obtain current memory policy."); 160 if (get_mempolicy(oldpolicy, oldmask->maskp, 161 oldmask->size + 1, 0, 0) < 0) { 162 EAL_LOG(ERR, 163 "Failed to get current mempolicy: %s. " 164 "Assuming MPOL_DEFAULT.", strerror(errno)); 165 *oldpolicy = MPOL_DEFAULT; 166 } 167 EAL_LOG(DEBUG, 168 "Setting policy MPOL_PREFERRED for socket %d", 169 socket_id); 170 numa_set_preferred(socket_id); 171 } 172 173 static void 174 restore_numa(int *oldpolicy, struct bitmask *oldmask) 175 { 176 EAL_LOG(DEBUG, 177 "Restoring previous memory policy: %d", *oldpolicy); 178 if (*oldpolicy == MPOL_DEFAULT) { 179 numa_set_localalloc(); 180 } else if (set_mempolicy(*oldpolicy, oldmask->maskp, 181 oldmask->size + 1) < 0) { 182 EAL_LOG(ERR, "Failed to restore mempolicy: %s", 183 strerror(errno)); 184 numa_set_localalloc(); 185 } 186 numa_free_cpumask(oldmask); 187 } 188 #endif 189 190 /* 191 * uses fstat to report the size of a file on disk 192 */ 193 static off_t 194 get_file_size(int fd) 195 { 196 struct stat st; 197 if (fstat(fd, &st) < 0) 198 return 0; 199 return st.st_size; 200 } 201 202 static int 203 pagesz_flags(uint64_t page_sz) 204 { 205 /* as per mmap() manpage, all page sizes are log2 of page size 206 * shifted by MAP_HUGE_SHIFT 207 */ 208 int log2 = rte_log2_u64(page_sz); 209 return log2 << RTE_MAP_HUGE_SHIFT; 210 } 211 212 /* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */ 213 static int lock(int fd, int type) 214 { 215 int ret; 216 217 /* flock may be interrupted */ 218 do { 219 ret = flock(fd, type | LOCK_NB); 220 } while (ret && errno == EINTR); 221 222 if (ret && errno == EWOULDBLOCK) { 223 /* couldn't lock */ 224 return 0; 225 } else if (ret) { 226 EAL_LOG(ERR, "%s(): error calling flock(): %s", 227 __func__, strerror(errno)); 228 return -1; 229 } 230 /* lock was successful */ 231 return 1; 232 } 233 234 static int 235 get_seg_memfd(struct hugepage_info *hi __rte_unused, 236 unsigned int list_idx __rte_unused, 237 unsigned int seg_idx __rte_unused) 238 { 239 #ifdef MEMFD_SUPPORTED 240 int fd; 241 char segname[250]; /* as per manpage, limit is 249 bytes plus null */ 242 243 int flags = RTE_MFD_HUGETLB | pagesz_flags(hi->hugepage_sz); 244 const struct internal_config *internal_conf = 245 eal_get_internal_configuration(); 246 247 if (internal_conf->single_file_segments) { 248 fd = fd_list[list_idx].memseg_list_fd; 249 250 if (fd < 0) { 251 snprintf(segname, sizeof(segname), "seg_%i", list_idx); 252 fd = 
memfd_create(segname, flags); 253 if (fd < 0) { 254 EAL_LOG(DEBUG, "%s(): memfd create failed: %s", 255 __func__, strerror(errno)); 256 return -1; 257 } 258 fd_list[list_idx].memseg_list_fd = fd; 259 } 260 } else { 261 fd = fd_list[list_idx].fds[seg_idx]; 262 263 if (fd < 0) { 264 snprintf(segname, sizeof(segname), "seg_%i-%i", 265 list_idx, seg_idx); 266 fd = memfd_create(segname, flags); 267 if (fd < 0) { 268 EAL_LOG(DEBUG, "%s(): memfd create failed: %s", 269 __func__, strerror(errno)); 270 return -1; 271 } 272 fd_list[list_idx].fds[seg_idx] = fd; 273 } 274 } 275 return fd; 276 #endif 277 return -1; 278 } 279 280 static int 281 get_seg_fd(char *path, int buflen, struct hugepage_info *hi, 282 unsigned int list_idx, unsigned int seg_idx, 283 bool *dirty) 284 { 285 int fd; 286 int *out_fd; 287 struct stat st; 288 int ret; 289 const struct internal_config *internal_conf = 290 eal_get_internal_configuration(); 291 292 if (dirty != NULL) 293 *dirty = false; 294 295 /* for in-memory mode, we only make it here when we're sure we support 296 * memfd, and this is a special case. 297 */ 298 if (internal_conf->in_memory) 299 return get_seg_memfd(hi, list_idx, seg_idx); 300 301 if (internal_conf->single_file_segments) { 302 out_fd = &fd_list[list_idx].memseg_list_fd; 303 eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx); 304 } else { 305 out_fd = &fd_list[list_idx].fds[seg_idx]; 306 eal_get_hugefile_path(path, buflen, hi->hugedir, 307 list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx); 308 } 309 fd = *out_fd; 310 if (fd >= 0) 311 return fd; 312 313 /* 314 * There is no TOCTOU between stat() and unlink()/open() 315 * because the hugepage directory is locked. 316 */ 317 ret = stat(path, &st); 318 if (ret < 0 && errno != ENOENT) { 319 EAL_LOG(DEBUG, "%s(): stat() for '%s' failed: %s", 320 __func__, path, strerror(errno)); 321 return -1; 322 } 323 if (!internal_conf->hugepage_file.unlink_existing && ret == 0 && 324 dirty != NULL) 325 *dirty = true; 326 327 /* 328 * The kernel clears a hugepage only when it is mapped 329 * from a particular file for the first time. 330 * If the file already exists, the old content will be mapped. 331 * If the memory manager assumes all mapped pages to be clean, 332 * the file must be removed and created anew. 333 * Otherwise, the primary caller must be notified 334 * that mapped pages will be dirty 335 * (secondary callers receive the segment state from the primary one). 336 * When multiple hugepages are mapped from the same file, 337 * whether they will be dirty depends on the part that is mapped. 338 */ 339 if (!internal_conf->single_file_segments && 340 internal_conf->hugepage_file.unlink_existing && 341 rte_eal_process_type() == RTE_PROC_PRIMARY && 342 ret == 0) { 343 /* coverity[toctou] */ 344 if (unlink(path) < 0) { 345 EAL_LOG(DEBUG, "%s(): could not remove '%s': %s", 346 __func__, path, strerror(errno)); 347 return -1; 348 } 349 } 350 351 /* coverity[toctou] */ 352 fd = open(path, O_CREAT | O_RDWR, 0600); 353 if (fd < 0) { 354 EAL_LOG(ERR, "%s(): open '%s' failed: %s", 355 __func__, path, strerror(errno)); 356 return -1; 357 } 358 /* take out a read lock */ 359 if (lock(fd, LOCK_SH) < 0) { 360 EAL_LOG(ERR, "%s(): lock '%s' failed: %s", 361 __func__, path, strerror(errno)); 362 close(fd); 363 return -1; 364 } 365 *out_fd = fd; 366 return fd; 367 } 368 369 static int 370 resize_hugefile_in_memory(int fd, uint64_t fa_offset, 371 uint64_t page_sz, bool grow) 372 { 373 int flags = grow ? 
0 : FALLOC_FL_PUNCH_HOLE | 374 FALLOC_FL_KEEP_SIZE; 375 int ret; 376 377 /* grow or shrink the file */ 378 ret = fallocate(fd, flags, fa_offset, page_sz); 379 380 if (ret < 0) { 381 EAL_LOG(DEBUG, "%s(): fallocate() failed: %s", 382 __func__, 383 strerror(errno)); 384 return -1; 385 } 386 return 0; 387 } 388 389 static int 390 resize_hugefile_in_filesystem(int fd, uint64_t fa_offset, uint64_t page_sz, 391 bool grow, bool *dirty) 392 { 393 const struct internal_config *internal_conf = 394 eal_get_internal_configuration(); 395 bool again = false; 396 397 do { 398 if (fallocate_supported == 0) { 399 /* we cannot deallocate memory if fallocate() is not 400 * supported, and hugepage file is already locked at 401 * creation, so no further synchronization needed. 402 */ 403 404 if (!grow) { 405 EAL_LOG(DEBUG, "%s(): fallocate not supported, not freeing page back to the system", 406 __func__); 407 return -1; 408 } 409 uint64_t new_size = fa_offset + page_sz; 410 uint64_t cur_size = get_file_size(fd); 411 412 /* fallocate isn't supported, fall back to ftruncate */ 413 if (dirty != NULL) 414 *dirty = new_size <= cur_size; 415 if (new_size > cur_size && 416 ftruncate(fd, new_size) < 0) { 417 EAL_LOG(DEBUG, "%s(): ftruncate() failed: %s", 418 __func__, strerror(errno)); 419 return -1; 420 } 421 } else { 422 int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE | 423 FALLOC_FL_KEEP_SIZE; 424 int ret; 425 426 /* 427 * technically, it is perfectly safe for both primary 428 * and secondary to grow and shrink the page files: 429 * growing the file repeatedly has no effect because 430 * a page can only be allocated once, while mmap ensures 431 * that secondaries hold on to the page even after the 432 * page itself is removed from the filesystem. 433 * 434 * however, leaving growing/shrinking to the primary 435 * tends to expose bugs in fdlist page count handling, 436 * so leave this here just in case. 437 */ 438 if (rte_eal_process_type() != RTE_PROC_PRIMARY) 439 return 0; 440 441 /* grow or shrink the file */ 442 ret = fallocate(fd, flags, fa_offset, page_sz); 443 444 if (ret < 0) { 445 if (fallocate_supported == -1 && 446 errno == ENOTSUP) { 447 EAL_LOG(ERR, "%s(): fallocate() not supported, hugepage deallocation will be disabled", 448 __func__); 449 again = true; 450 fallocate_supported = 0; 451 } else { 452 EAL_LOG(DEBUG, "%s(): fallocate() failed: %s", 453 __func__, 454 strerror(errno)); 455 return -1; 456 } 457 } else { 458 fallocate_supported = 1; 459 /* 460 * It is unknown which portions of an existing 461 * hugepage file were allocated previously, 462 * so all pages within the file are considered 463 * dirty, unless the file is a fresh one. 464 */ 465 if (dirty != NULL) 466 *dirty &= !internal_conf->hugepage_file.unlink_existing; 467 } 468 } 469 } while (again); 470 471 return 0; 472 } 473 474 static void 475 close_hugefile(int fd, char *path, int list_idx) 476 { 477 const struct internal_config *internal_conf = 478 eal_get_internal_configuration(); 479 /* 480 * primary process must unlink the file, but only when not in in-memory 481 * mode (as in that case there is no file to unlink). 
482 */ 483 if (!internal_conf->in_memory && 484 rte_eal_process_type() == RTE_PROC_PRIMARY && 485 unlink(path)) 486 EAL_LOG(ERR, "%s(): unlinking '%s' failed: %s", 487 __func__, path, strerror(errno)); 488 489 close(fd); 490 fd_list[list_idx].memseg_list_fd = -1; 491 } 492 493 static int 494 resize_hugefile(int fd, uint64_t fa_offset, uint64_t page_sz, bool grow, 495 bool *dirty) 496 { 497 /* in-memory mode is a special case, because we can be sure that 498 * fallocate() is supported. 499 */ 500 const struct internal_config *internal_conf = 501 eal_get_internal_configuration(); 502 503 if (internal_conf->in_memory) { 504 if (dirty != NULL) 505 *dirty = false; 506 return resize_hugefile_in_memory(fd, fa_offset, 507 page_sz, grow); 508 } 509 510 return resize_hugefile_in_filesystem(fd, fa_offset, page_sz, 511 grow, dirty); 512 } 513 514 static int 515 alloc_seg(struct rte_memseg *ms, void *addr, int socket_id, 516 struct hugepage_info *hi, unsigned int list_idx, 517 unsigned int seg_idx) 518 { 519 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES 520 int cur_socket_id = 0; 521 #endif 522 uint64_t map_offset; 523 rte_iova_t iova; 524 void *va; 525 char path[PATH_MAX]; 526 int ret = 0; 527 int fd; 528 bool dirty; 529 size_t alloc_sz; 530 int flags; 531 void *new_addr; 532 const struct internal_config *internal_conf = 533 eal_get_internal_configuration(); 534 535 alloc_sz = hi->hugepage_sz; 536 537 /* these are checked at init, but code analyzers don't know that */ 538 if (internal_conf->in_memory && !anonymous_hugepages_supported) { 539 EAL_LOG(ERR, "Anonymous hugepages not supported, in-memory mode cannot allocate memory"); 540 return -1; 541 } 542 if (internal_conf->in_memory && !memfd_create_supported && 543 internal_conf->single_file_segments) { 544 EAL_LOG(ERR, "Single-file segments are not supported without memfd support"); 545 return -1; 546 } 547 548 /* in-memory without memfd is a special case */ 549 int mmap_flags; 550 551 if (internal_conf->in_memory && !memfd_create_supported) { 552 const int in_memory_flags = MAP_HUGETLB | MAP_FIXED | 553 MAP_PRIVATE | MAP_ANONYMOUS; 554 int pagesz_flag; 555 556 pagesz_flag = pagesz_flags(alloc_sz); 557 fd = -1; 558 dirty = false; 559 mmap_flags = in_memory_flags | pagesz_flag; 560 561 /* single-file segments codepath will never be active 562 * here because in-memory mode is incompatible with the 563 * fallback path, and it's stopped at EAL initialization 564 * stage. 
 */
		map_offset = 0;
	} else {
		/* takes out a read lock on segment or segment list */
		fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx,
				&dirty);
		if (fd < 0) {
			EAL_LOG(ERR, "Couldn't get fd on hugepage file");
			return -1;
		}

		if (internal_conf->single_file_segments) {
			map_offset = seg_idx * alloc_sz;
			ret = resize_hugefile(fd, map_offset, alloc_sz, true,
					&dirty);
			if (ret < 0)
				goto resized;

			fd_list[list_idx].count++;
		} else {
			map_offset = 0;
			if (ftruncate(fd, alloc_sz) < 0) {
				EAL_LOG(DEBUG, "%s(): ftruncate() failed: %s",
					__func__, strerror(errno));
				goto resized;
			}
			if (internal_conf->hugepage_file.unlink_before_mapping &&
					!internal_conf->in_memory) {
				if (unlink(path)) {
					EAL_LOG(DEBUG, "%s(): unlink() failed: %s",
						__func__, strerror(errno));
					goto resized;
				}
			}
		}
		mmap_flags = MAP_SHARED | MAP_POPULATE | MAP_FIXED;
	}

	huge_register_sigbus();

	/*
	 * map the segment and populate page tables; the kernel fills
	 * this segment with zeros if it's a new page.
	 */
	va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, mmap_flags, fd,
			map_offset);

	if (va == MAP_FAILED) {
		EAL_LOG(DEBUG, "%s(): mmap() failed: %s", __func__,
			strerror(errno));
		/* mmap failed, but the previous region might have been
		 * unmapped anyway. try to remap it
		 */
		goto unmapped;
	}
	if (va != addr) {
		EAL_LOG(DEBUG, "%s(): wrong mmap() address", __func__);
		munmap(va, alloc_sz);
		goto resized;
	}

	/* In Linux, hugetlb limitations, such as cgroup limits, are
	 * enforced at fault time instead of at mmap(), even with
	 * MAP_POPULATE. The kernel will send a SIGBUS signal. To avoid
	 * being killed, save the stack environment here; if SIGBUS
	 * happens, we can jump back to it.
	 */
	if (huge_wrap_sigsetjmp()) {
		EAL_LOG(DEBUG, "SIGBUS: Cannot mmap more hugepages of size %uMB",
			(unsigned int)(alloc_sz >> 20));
		goto mapped;
	}

	/* we need to trigger a write to the page to enforce the page fault and
	 * ensure that the page is accessible to us, but we can't overwrite the
	 * value that is already there, so read the old value and write it back.
	 * the kernel populates the page with zeroes initially.
	 */
	*(volatile int *)addr = *(volatile int *)addr;

	iova = rte_mem_virt2iova(addr);
	if (iova == RTE_BAD_PHYS_ADDR) {
		EAL_LOG(DEBUG, "%s(): can't get IOVA addr",
			__func__);
		goto mapped;
	}

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	/*
	 * If the kernel has been built without NUMA support, get_mempolicy()
	 * will return an error. If check_numa() returns false, memory
	 * allocation is not NUMA aware and the socket_id should not be
	 * checked.
659 */ 660 if (check_numa()) { 661 ret = get_mempolicy(&cur_socket_id, NULL, 0, addr, 662 MPOL_F_NODE | MPOL_F_ADDR); 663 if (ret < 0) { 664 EAL_LOG(DEBUG, "%s(): get_mempolicy: %s", 665 __func__, strerror(errno)); 666 goto mapped; 667 } else if (cur_socket_id != socket_id) { 668 EAL_LOG(DEBUG, 669 "%s(): allocation happened on wrong socket (wanted %d, got %d)", 670 __func__, socket_id, cur_socket_id); 671 goto mapped; 672 } 673 } 674 #else 675 if (rte_socket_count() > 1) 676 EAL_LOG(DEBUG, "%s(): not checking hugepage NUMA node.", 677 __func__); 678 #endif 679 680 huge_recover_sigbus(); 681 682 ms->addr = addr; 683 ms->hugepage_sz = alloc_sz; 684 ms->len = alloc_sz; 685 ms->nchannel = rte_memory_get_nchannel(); 686 ms->nrank = rte_memory_get_nrank(); 687 ms->iova = iova; 688 ms->socket_id = socket_id; 689 ms->flags = dirty ? RTE_MEMSEG_FLAG_DIRTY : 0; 690 691 return 0; 692 693 mapped: 694 munmap(addr, alloc_sz); 695 unmapped: 696 huge_recover_sigbus(); 697 flags = EAL_RESERVE_FORCE_ADDRESS; 698 new_addr = eal_get_virtual_area(addr, &alloc_sz, alloc_sz, 0, flags); 699 if (new_addr != addr) { 700 if (new_addr != NULL) 701 munmap(new_addr, alloc_sz); 702 /* we're leaving a hole in our virtual address space. if 703 * somebody else maps this hole now, we could accidentally 704 * override it in the future. 705 */ 706 EAL_LOG(CRIT, "Can't mmap holes in our virtual address space"); 707 } 708 /* roll back the ref count */ 709 if (internal_conf->single_file_segments) 710 fd_list[list_idx].count--; 711 resized: 712 /* some codepaths will return negative fd, so exit early */ 713 if (fd < 0) 714 return -1; 715 716 if (internal_conf->single_file_segments) { 717 resize_hugefile(fd, map_offset, alloc_sz, false, NULL); 718 /* ignore failure, can't make it any worse */ 719 720 /* if refcount is at zero, close the file */ 721 if (fd_list[list_idx].count == 0) 722 close_hugefile(fd, path, list_idx); 723 } else { 724 /* only remove file if we can take out a write lock */ 725 if (!internal_conf->hugepage_file.unlink_before_mapping && 726 internal_conf->in_memory == 0 && 727 lock(fd, LOCK_EX) == 1) 728 unlink(path); 729 close(fd); 730 fd_list[list_idx].fds[seg_idx] = -1; 731 } 732 return -1; 733 } 734 735 static int 736 free_seg(struct rte_memseg *ms, struct hugepage_info *hi, 737 unsigned int list_idx, unsigned int seg_idx) 738 { 739 uint64_t map_offset; 740 char path[PATH_MAX]; 741 int fd, ret = 0; 742 const struct internal_config *internal_conf = 743 eal_get_internal_configuration(); 744 745 /* erase page data */ 746 memset(ms->addr, 0, ms->len); 747 748 if (mmap(ms->addr, ms->len, PROT_NONE, 749 MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) == 750 MAP_FAILED) { 751 EAL_LOG(DEBUG, "couldn't unmap page"); 752 return -1; 753 } 754 755 eal_mem_set_dump(ms->addr, ms->len, false); 756 757 /* if we're using anonymous hugepages, nothing to be done */ 758 if (internal_conf->in_memory && !memfd_create_supported) { 759 memset(ms, 0, sizeof(*ms)); 760 return 0; 761 } 762 763 /* if we are not in single file segments mode, we're going to unmap the 764 * segment and thus drop the lock on original fd, but hugepage dir is 765 * now locked so we can take out another one without races. 
766 */ 767 fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx, NULL); 768 if (fd < 0) 769 return -1; 770 771 if (internal_conf->single_file_segments) { 772 map_offset = seg_idx * ms->len; 773 if (resize_hugefile(fd, map_offset, ms->len, false, NULL)) 774 return -1; 775 776 if (--(fd_list[list_idx].count) == 0) 777 close_hugefile(fd, path, list_idx); 778 779 ret = 0; 780 } else { 781 /* if we're able to take out a write lock, we're the last one 782 * holding onto this page. 783 */ 784 if (!internal_conf->in_memory && 785 internal_conf->hugepage_file.unlink_existing && 786 !internal_conf->hugepage_file.unlink_before_mapping) { 787 ret = lock(fd, LOCK_EX); 788 if (ret >= 0) { 789 /* no one else is using this page */ 790 if (ret == 1) 791 unlink(path); 792 } 793 } 794 /* closing fd will drop the lock */ 795 close(fd); 796 fd_list[list_idx].fds[seg_idx] = -1; 797 } 798 799 memset(ms, 0, sizeof(*ms)); 800 801 return ret < 0 ? -1 : 0; 802 } 803 804 struct alloc_walk_param { 805 struct hugepage_info *hi; 806 struct rte_memseg **ms; 807 size_t page_sz; 808 unsigned int segs_allocated; 809 unsigned int n_segs; 810 int socket; 811 bool exact; 812 }; 813 static int 814 alloc_seg_walk(const struct rte_memseg_list *msl, void *arg) 815 { 816 struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; 817 struct alloc_walk_param *wa = arg; 818 struct rte_memseg_list *cur_msl; 819 size_t page_sz; 820 int cur_idx, start_idx, j, dir_fd = -1; 821 unsigned int msl_idx, need, i; 822 const struct internal_config *internal_conf = 823 eal_get_internal_configuration(); 824 825 if (msl->page_sz != wa->page_sz) 826 return 0; 827 if (msl->socket_id != wa->socket) 828 return 0; 829 830 page_sz = (size_t)msl->page_sz; 831 832 msl_idx = msl - mcfg->memsegs; 833 cur_msl = &mcfg->memsegs[msl_idx]; 834 835 need = wa->n_segs; 836 837 /* try finding space in memseg list */ 838 if (wa->exact) { 839 /* if we require exact number of pages in a list, find them */ 840 cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0, 841 need); 842 if (cur_idx < 0) 843 return 0; 844 start_idx = cur_idx; 845 } else { 846 int cur_len; 847 848 /* we don't require exact number of pages, so we're going to go 849 * for best-effort allocation. that means finding the biggest 850 * unused block, and going with that. 851 */ 852 cur_idx = rte_fbarray_find_biggest_free(&cur_msl->memseg_arr, 853 0); 854 if (cur_idx < 0) 855 return 0; 856 start_idx = cur_idx; 857 /* adjust the size to possibly be smaller than original 858 * request, but do not allow it to be bigger. 859 */ 860 cur_len = rte_fbarray_find_contig_free(&cur_msl->memseg_arr, 861 cur_idx); 862 need = RTE_MIN(need, (unsigned int)cur_len); 863 } 864 865 /* do not allow any page allocations during the time we're allocating, 866 * because file creation and locking operations are not atomic, 867 * and we might be the first or the last ones to use a particular page, 868 * so we need to ensure atomicity of every operation. 869 * 870 * during init, we already hold a write lock, so don't try to take out 871 * another one. 
872 */ 873 if (wa->hi->lock_descriptor == -1 && !internal_conf->in_memory) { 874 dir_fd = open(wa->hi->hugedir, O_RDONLY); 875 if (dir_fd < 0) { 876 EAL_LOG(ERR, "%s(): Cannot open '%s': %s", 877 __func__, wa->hi->hugedir, strerror(errno)); 878 return -1; 879 } 880 /* blocking writelock */ 881 if (flock(dir_fd, LOCK_EX)) { 882 EAL_LOG(ERR, "%s(): Cannot lock '%s': %s", 883 __func__, wa->hi->hugedir, strerror(errno)); 884 close(dir_fd); 885 return -1; 886 } 887 } 888 889 for (i = 0; i < need; i++, cur_idx++) { 890 struct rte_memseg *cur; 891 void *map_addr; 892 893 cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx); 894 map_addr = RTE_PTR_ADD(cur_msl->base_va, 895 cur_idx * page_sz); 896 897 if (alloc_seg(cur, map_addr, wa->socket, wa->hi, 898 msl_idx, cur_idx)) { 899 EAL_LOG(DEBUG, "attempted to allocate %i segments, but only %i were allocated", 900 need, i); 901 902 /* if exact number wasn't requested, stop */ 903 if (!wa->exact) 904 goto out; 905 906 /* clean up */ 907 for (j = start_idx; j < cur_idx; j++) { 908 struct rte_memseg *tmp; 909 struct rte_fbarray *arr = 910 &cur_msl->memseg_arr; 911 912 tmp = rte_fbarray_get(arr, j); 913 rte_fbarray_set_free(arr, j); 914 915 /* free_seg may attempt to create a file, which 916 * may fail. 917 */ 918 if (free_seg(tmp, wa->hi, msl_idx, j)) 919 EAL_LOG(DEBUG, "Cannot free page"); 920 } 921 /* clear the list */ 922 if (wa->ms) 923 memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs); 924 925 if (dir_fd >= 0) 926 close(dir_fd); 927 return -1; 928 } 929 if (wa->ms) 930 wa->ms[i] = cur; 931 932 rte_fbarray_set_used(&cur_msl->memseg_arr, cur_idx); 933 } 934 out: 935 wa->segs_allocated = i; 936 if (i > 0) 937 cur_msl->version++; 938 if (dir_fd >= 0) 939 close(dir_fd); 940 /* if we didn't allocate any segments, move on to the next list */ 941 return i > 0; 942 } 943 944 struct free_walk_param { 945 struct hugepage_info *hi; 946 struct rte_memseg *ms; 947 }; 948 static int 949 free_seg_walk(const struct rte_memseg_list *msl, void *arg) 950 { 951 struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; 952 struct rte_memseg_list *found_msl; 953 struct free_walk_param *wa = arg; 954 uintptr_t start_addr, end_addr; 955 int msl_idx, seg_idx, ret, dir_fd = -1; 956 const struct internal_config *internal_conf = 957 eal_get_internal_configuration(); 958 959 start_addr = (uintptr_t) msl->base_va; 960 end_addr = start_addr + msl->len; 961 962 if ((uintptr_t)wa->ms->addr < start_addr || 963 (uintptr_t)wa->ms->addr >= end_addr) 964 return 0; 965 966 msl_idx = msl - mcfg->memsegs; 967 seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz; 968 969 /* msl is const */ 970 found_msl = &mcfg->memsegs[msl_idx]; 971 972 /* do not allow any page allocations during the time we're freeing, 973 * because file creation and locking operations are not atomic, 974 * and we might be the first or the last ones to use a particular page, 975 * so we need to ensure atomicity of every operation. 976 * 977 * during init, we already hold a write lock, so don't try to take out 978 * another one. 
979 */ 980 if (wa->hi->lock_descriptor == -1 && !internal_conf->in_memory) { 981 dir_fd = open(wa->hi->hugedir, O_RDONLY); 982 if (dir_fd < 0) { 983 EAL_LOG(ERR, "%s(): Cannot open '%s': %s", 984 __func__, wa->hi->hugedir, strerror(errno)); 985 return -1; 986 } 987 /* blocking writelock */ 988 if (flock(dir_fd, LOCK_EX)) { 989 EAL_LOG(ERR, "%s(): Cannot lock '%s': %s", 990 __func__, wa->hi->hugedir, strerror(errno)); 991 close(dir_fd); 992 return -1; 993 } 994 } 995 996 found_msl->version++; 997 998 rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx); 999 1000 ret = free_seg(wa->ms, wa->hi, msl_idx, seg_idx); 1001 1002 if (dir_fd >= 0) 1003 close(dir_fd); 1004 1005 if (ret < 0) 1006 return -1; 1007 1008 return 1; 1009 } 1010 1011 int 1012 eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz, 1013 int socket, bool exact) 1014 { 1015 int i, ret = -1; 1016 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES 1017 bool have_numa = false; 1018 int oldpolicy; 1019 struct bitmask *oldmask; 1020 #endif 1021 struct alloc_walk_param wa; 1022 struct hugepage_info *hi = NULL; 1023 struct internal_config *internal_conf = 1024 eal_get_internal_configuration(); 1025 1026 memset(&wa, 0, sizeof(wa)); 1027 1028 /* dynamic allocation not supported in legacy mode */ 1029 if (internal_conf->legacy_mem) 1030 return -1; 1031 1032 for (i = 0; i < (int) RTE_DIM(internal_conf->hugepage_info); i++) { 1033 if (page_sz == 1034 internal_conf->hugepage_info[i].hugepage_sz) { 1035 hi = &internal_conf->hugepage_info[i]; 1036 break; 1037 } 1038 } 1039 if (!hi) { 1040 EAL_LOG(ERR, "%s(): can't find relevant hugepage_info entry", 1041 __func__); 1042 return -1; 1043 } 1044 1045 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES 1046 if (check_numa()) { 1047 oldmask = numa_allocate_nodemask(); 1048 prepare_numa(&oldpolicy, oldmask, socket); 1049 have_numa = true; 1050 } 1051 #endif 1052 1053 wa.exact = exact; 1054 wa.hi = hi; 1055 wa.ms = ms; 1056 wa.n_segs = n_segs; 1057 wa.page_sz = page_sz; 1058 wa.socket = socket; 1059 wa.segs_allocated = 0; 1060 1061 /* memalloc is locked, so it's safe to use thread-unsafe version */ 1062 ret = rte_memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa); 1063 if (ret == 0) { 1064 EAL_LOG(ERR, "%s(): couldn't find suitable memseg_list", 1065 __func__); 1066 ret = -1; 1067 } else if (ret > 0) { 1068 ret = (int)wa.segs_allocated; 1069 } 1070 1071 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES 1072 if (have_numa) 1073 restore_numa(&oldpolicy, oldmask); 1074 #endif 1075 return ret; 1076 } 1077 1078 struct rte_memseg * 1079 eal_memalloc_alloc_seg(size_t page_sz, int socket) 1080 { 1081 struct rte_memseg *ms; 1082 if (eal_memalloc_alloc_seg_bulk(&ms, 1, page_sz, socket, true) < 0) 1083 return NULL; 1084 /* return pointer to newly allocated memseg */ 1085 return ms; 1086 } 1087 1088 int 1089 eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs) 1090 { 1091 int seg, ret = 0; 1092 struct internal_config *internal_conf = 1093 eal_get_internal_configuration(); 1094 1095 /* dynamic free not supported in legacy mode */ 1096 if (internal_conf->legacy_mem) 1097 return -1; 1098 1099 for (seg = 0; seg < n_segs; seg++) { 1100 struct rte_memseg *cur = ms[seg]; 1101 struct hugepage_info *hi = NULL; 1102 struct free_walk_param wa; 1103 int i, walk_res; 1104 1105 /* if this page is marked as unfreeable, fail */ 1106 if (cur->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) { 1107 EAL_LOG(DEBUG, "Page is not allowed to be freed"); 1108 ret = -1; 1109 continue; 1110 } 1111 1112 memset(&wa, 0, sizeof(wa)); 1113 1114 for (i = 0; i < 
(int)RTE_DIM(internal_conf->hugepage_info); 1115 i++) { 1116 hi = &internal_conf->hugepage_info[i]; 1117 if (cur->hugepage_sz == hi->hugepage_sz) 1118 break; 1119 } 1120 if (i == (int)RTE_DIM(internal_conf->hugepage_info)) { 1121 EAL_LOG(ERR, "Can't find relevant hugepage_info entry"); 1122 ret = -1; 1123 continue; 1124 } 1125 1126 wa.ms = cur; 1127 wa.hi = hi; 1128 1129 /* memalloc is locked, so it's safe to use thread-unsafe version 1130 */ 1131 walk_res = rte_memseg_list_walk_thread_unsafe(free_seg_walk, 1132 &wa); 1133 if (walk_res == 1) 1134 continue; 1135 if (walk_res == 0) 1136 EAL_LOG(ERR, "Couldn't find memseg list"); 1137 ret = -1; 1138 } 1139 return ret; 1140 } 1141 1142 int 1143 eal_memalloc_free_seg(struct rte_memseg *ms) 1144 { 1145 const struct internal_config *internal_conf = 1146 eal_get_internal_configuration(); 1147 1148 /* dynamic free not supported in legacy mode */ 1149 if (internal_conf->legacy_mem) 1150 return -1; 1151 1152 return eal_memalloc_free_seg_bulk(&ms, 1); 1153 } 1154 1155 static int 1156 sync_chunk(struct rte_memseg_list *primary_msl, 1157 struct rte_memseg_list *local_msl, struct hugepage_info *hi, 1158 unsigned int msl_idx, bool used, int start, int end) 1159 { 1160 struct rte_fbarray *l_arr, *p_arr; 1161 int i, ret, chunk_len, diff_len; 1162 1163 l_arr = &local_msl->memseg_arr; 1164 p_arr = &primary_msl->memseg_arr; 1165 1166 /* we need to aggregate allocations/deallocations into bigger chunks, 1167 * as we don't want to spam the user with per-page callbacks. 1168 * 1169 * to avoid any potential issues, we also want to trigger 1170 * deallocation callbacks *before* we actually deallocate 1171 * memory, so that the user application could wrap up its use 1172 * before it goes away. 1173 */ 1174 1175 chunk_len = end - start; 1176 1177 /* find how many contiguous pages we can map/unmap for this chunk */ 1178 diff_len = used ? 
1179 rte_fbarray_find_contig_free(l_arr, start) : 1180 rte_fbarray_find_contig_used(l_arr, start); 1181 1182 /* has to be at least one page */ 1183 if (diff_len < 1) 1184 return -1; 1185 1186 diff_len = RTE_MIN(chunk_len, diff_len); 1187 1188 /* if we are freeing memory, notify the application */ 1189 if (!used) { 1190 struct rte_memseg *ms; 1191 void *start_va; 1192 size_t len, page_sz; 1193 1194 ms = rte_fbarray_get(l_arr, start); 1195 start_va = ms->addr; 1196 page_sz = (size_t)primary_msl->page_sz; 1197 len = page_sz * diff_len; 1198 1199 eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, 1200 start_va, len); 1201 } 1202 1203 for (i = 0; i < diff_len; i++) { 1204 struct rte_memseg *p_ms, *l_ms; 1205 int seg_idx = start + i; 1206 1207 l_ms = rte_fbarray_get(l_arr, seg_idx); 1208 p_ms = rte_fbarray_get(p_arr, seg_idx); 1209 1210 if (l_ms == NULL || p_ms == NULL) 1211 return -1; 1212 1213 if (used) { 1214 ret = alloc_seg(l_ms, p_ms->addr, 1215 p_ms->socket_id, hi, 1216 msl_idx, seg_idx); 1217 if (ret < 0) 1218 return -1; 1219 rte_fbarray_set_used(l_arr, seg_idx); 1220 } else { 1221 ret = free_seg(l_ms, hi, msl_idx, seg_idx); 1222 rte_fbarray_set_free(l_arr, seg_idx); 1223 if (ret < 0) 1224 return -1; 1225 } 1226 } 1227 1228 /* if we just allocated memory, notify the application */ 1229 if (used) { 1230 struct rte_memseg *ms; 1231 void *start_va; 1232 size_t len, page_sz; 1233 1234 ms = rte_fbarray_get(l_arr, start); 1235 start_va = ms->addr; 1236 page_sz = (size_t)primary_msl->page_sz; 1237 len = page_sz * diff_len; 1238 1239 eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, 1240 start_va, len); 1241 } 1242 1243 /* calculate how much we can advance until next chunk */ 1244 diff_len = used ? 1245 rte_fbarray_find_contig_used(l_arr, start) : 1246 rte_fbarray_find_contig_free(l_arr, start); 1247 ret = RTE_MIN(chunk_len, diff_len); 1248 1249 return ret; 1250 } 1251 1252 static int 1253 sync_status(struct rte_memseg_list *primary_msl, 1254 struct rte_memseg_list *local_msl, struct hugepage_info *hi, 1255 unsigned int msl_idx, bool used) 1256 { 1257 struct rte_fbarray *l_arr, *p_arr; 1258 int p_idx, l_chunk_len, p_chunk_len, ret; 1259 int start, end; 1260 1261 /* this is a little bit tricky, but the basic idea is - walk both lists 1262 * and spot any places where there are discrepancies. walking both lists 1263 * and noting discrepancies in a single go is a hard problem, so we do 1264 * it in two passes - first we spot any places where allocated segments 1265 * mismatch (i.e. ensure that everything that's allocated in the primary 1266 * is also allocated in the secondary), and then we do it by looking at 1267 * free segments instead. 1268 * 1269 * we also need to aggregate changes into chunks, as we have to call 1270 * callbacks per allocation, not per page. 
1271 */ 1272 l_arr = &local_msl->memseg_arr; 1273 p_arr = &primary_msl->memseg_arr; 1274 1275 if (used) 1276 p_idx = rte_fbarray_find_next_used(p_arr, 0); 1277 else 1278 p_idx = rte_fbarray_find_next_free(p_arr, 0); 1279 1280 while (p_idx >= 0) { 1281 int next_chunk_search_idx; 1282 1283 if (used) { 1284 p_chunk_len = rte_fbarray_find_contig_used(p_arr, 1285 p_idx); 1286 l_chunk_len = rte_fbarray_find_contig_used(l_arr, 1287 p_idx); 1288 } else { 1289 p_chunk_len = rte_fbarray_find_contig_free(p_arr, 1290 p_idx); 1291 l_chunk_len = rte_fbarray_find_contig_free(l_arr, 1292 p_idx); 1293 } 1294 /* best case scenario - no differences (or bigger, which will be 1295 * fixed during next iteration), look for next chunk 1296 */ 1297 if (l_chunk_len >= p_chunk_len) { 1298 next_chunk_search_idx = p_idx + p_chunk_len; 1299 goto next_chunk; 1300 } 1301 1302 /* if both chunks start at the same point, skip parts we know 1303 * are identical, and sync the rest. each call to sync_chunk 1304 * will only sync contiguous segments, so we need to call this 1305 * until we are sure there are no more differences in this 1306 * chunk. 1307 */ 1308 start = p_idx + l_chunk_len; 1309 end = p_idx + p_chunk_len; 1310 do { 1311 ret = sync_chunk(primary_msl, local_msl, hi, msl_idx, 1312 used, start, end); 1313 start += ret; 1314 } while (start < end && ret >= 0); 1315 /* if ret is negative, something went wrong */ 1316 if (ret < 0) 1317 return -1; 1318 1319 next_chunk_search_idx = p_idx + p_chunk_len; 1320 next_chunk: 1321 /* skip to end of this chunk */ 1322 if (used) { 1323 p_idx = rte_fbarray_find_next_used(p_arr, 1324 next_chunk_search_idx); 1325 } else { 1326 p_idx = rte_fbarray_find_next_free(p_arr, 1327 next_chunk_search_idx); 1328 } 1329 } 1330 return 0; 1331 } 1332 1333 static int 1334 sync_existing(struct rte_memseg_list *primary_msl, 1335 struct rte_memseg_list *local_msl, struct hugepage_info *hi, 1336 unsigned int msl_idx) 1337 { 1338 int ret, dir_fd; 1339 1340 /* do not allow any page allocations during the time we're allocating, 1341 * because file creation and locking operations are not atomic, 1342 * and we might be the first or the last ones to use a particular page, 1343 * so we need to ensure atomicity of every operation. 
1344 */ 1345 dir_fd = open(hi->hugedir, O_RDONLY); 1346 if (dir_fd < 0) { 1347 EAL_LOG(ERR, "%s(): Cannot open '%s': %s", __func__, 1348 hi->hugedir, strerror(errno)); 1349 return -1; 1350 } 1351 /* blocking writelock */ 1352 if (flock(dir_fd, LOCK_EX)) { 1353 EAL_LOG(ERR, "%s(): Cannot lock '%s': %s", __func__, 1354 hi->hugedir, strerror(errno)); 1355 close(dir_fd); 1356 return -1; 1357 } 1358 1359 /* ensure all allocated space is the same in both lists */ 1360 ret = sync_status(primary_msl, local_msl, hi, msl_idx, true); 1361 if (ret < 0) 1362 goto fail; 1363 1364 /* ensure all unallocated space is the same in both lists */ 1365 ret = sync_status(primary_msl, local_msl, hi, msl_idx, false); 1366 if (ret < 0) 1367 goto fail; 1368 1369 /* update version number */ 1370 local_msl->version = primary_msl->version; 1371 1372 close(dir_fd); 1373 1374 return 0; 1375 fail: 1376 close(dir_fd); 1377 return -1; 1378 } 1379 1380 static int 1381 sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused) 1382 { 1383 struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; 1384 struct rte_memseg_list *primary_msl, *local_msl; 1385 struct hugepage_info *hi = NULL; 1386 unsigned int i; 1387 int msl_idx; 1388 struct internal_config *internal_conf = 1389 eal_get_internal_configuration(); 1390 1391 if (msl->external) 1392 return 0; 1393 1394 msl_idx = msl - mcfg->memsegs; 1395 primary_msl = &mcfg->memsegs[msl_idx]; 1396 local_msl = &local_memsegs[msl_idx]; 1397 1398 for (i = 0; i < RTE_DIM(internal_conf->hugepage_info); i++) { 1399 uint64_t cur_sz = 1400 internal_conf->hugepage_info[i].hugepage_sz; 1401 uint64_t msl_sz = primary_msl->page_sz; 1402 if (msl_sz == cur_sz) { 1403 hi = &internal_conf->hugepage_info[i]; 1404 break; 1405 } 1406 } 1407 if (!hi) { 1408 EAL_LOG(ERR, "Can't find relevant hugepage_info entry"); 1409 return -1; 1410 } 1411 1412 /* if versions don't match, synchronize everything */ 1413 if (local_msl->version != primary_msl->version && 1414 sync_existing(primary_msl, local_msl, hi, msl_idx)) 1415 return -1; 1416 return 0; 1417 } 1418 1419 1420 int 1421 eal_memalloc_sync_with_primary(void) 1422 { 1423 /* nothing to be done in primary */ 1424 if (rte_eal_process_type() == RTE_PROC_PRIMARY) 1425 return 0; 1426 1427 /* memalloc is locked, so it's safe to call thread-unsafe version */ 1428 if (rte_memseg_list_walk_thread_unsafe(sync_walk, NULL)) 1429 return -1; 1430 return 0; 1431 } 1432 1433 static int 1434 secondary_msl_create_walk(const struct rte_memseg_list *msl, 1435 void *arg __rte_unused) 1436 { 1437 struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; 1438 struct rte_memseg_list *primary_msl, *local_msl; 1439 char name[PATH_MAX]; 1440 int msl_idx, ret; 1441 1442 if (msl->external) 1443 return 0; 1444 1445 msl_idx = msl - mcfg->memsegs; 1446 primary_msl = &mcfg->memsegs[msl_idx]; 1447 local_msl = &local_memsegs[msl_idx]; 1448 1449 /* create distinct fbarrays for each secondary */ 1450 snprintf(name, RTE_FBARRAY_NAME_LEN, "%s_%i", 1451 primary_msl->memseg_arr.name, getpid()); 1452 1453 ret = rte_fbarray_init(&local_msl->memseg_arr, name, 1454 primary_msl->memseg_arr.len, 1455 primary_msl->memseg_arr.elt_sz); 1456 if (ret < 0) { 1457 EAL_LOG(ERR, "Cannot initialize local memory map"); 1458 return -1; 1459 } 1460 local_msl->base_va = primary_msl->base_va; 1461 local_msl->len = primary_msl->len; 1462 1463 return 0; 1464 } 1465 1466 static int 1467 secondary_msl_destroy_walk(const struct rte_memseg_list *msl, 1468 void *arg __rte_unused) 1469 { 1470 
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; 1471 struct rte_memseg_list *local_msl; 1472 int msl_idx, ret; 1473 1474 if (msl->external) 1475 return 0; 1476 1477 msl_idx = msl - mcfg->memsegs; 1478 local_msl = &local_memsegs[msl_idx]; 1479 1480 ret = rte_fbarray_destroy(&local_msl->memseg_arr); 1481 if (ret < 0) { 1482 EAL_LOG(ERR, "Cannot destroy local memory map"); 1483 return -1; 1484 } 1485 local_msl->base_va = NULL; 1486 local_msl->len = 0; 1487 1488 return 0; 1489 } 1490 1491 static int 1492 alloc_list(int list_idx, int len) 1493 { 1494 int *data; 1495 int i; 1496 const struct internal_config *internal_conf = 1497 eal_get_internal_configuration(); 1498 1499 /* single-file segments mode does not need fd list */ 1500 if (!internal_conf->single_file_segments) { 1501 /* ensure we have space to store fd per each possible segment */ 1502 data = malloc(sizeof(int) * len); 1503 if (data == NULL) { 1504 EAL_LOG(ERR, "Unable to allocate space for file descriptors"); 1505 return -1; 1506 } 1507 /* set all fd's as invalid */ 1508 for (i = 0; i < len; i++) 1509 data[i] = -1; 1510 fd_list[list_idx].fds = data; 1511 fd_list[list_idx].len = len; 1512 } else { 1513 fd_list[list_idx].fds = NULL; 1514 fd_list[list_idx].len = 0; 1515 } 1516 1517 fd_list[list_idx].count = 0; 1518 fd_list[list_idx].memseg_list_fd = -1; 1519 1520 return 0; 1521 } 1522 1523 static int 1524 destroy_list(int list_idx) 1525 { 1526 const struct internal_config *internal_conf = 1527 eal_get_internal_configuration(); 1528 1529 /* single-file segments mode does not need fd list */ 1530 if (!internal_conf->single_file_segments) { 1531 int *fds = fd_list[list_idx].fds; 1532 int i; 1533 /* go through each fd and ensure it's closed */ 1534 for (i = 0; i < fd_list[list_idx].len; i++) { 1535 if (fds[i] >= 0) { 1536 close(fds[i]); 1537 fds[i] = -1; 1538 } 1539 } 1540 free(fds); 1541 fd_list[list_idx].fds = NULL; 1542 fd_list[list_idx].len = 0; 1543 } else if (fd_list[list_idx].memseg_list_fd >= 0) { 1544 close(fd_list[list_idx].memseg_list_fd); 1545 fd_list[list_idx].count = 0; 1546 fd_list[list_idx].memseg_list_fd = -1; 1547 } 1548 return 0; 1549 } 1550 1551 static int 1552 fd_list_create_walk(const struct rte_memseg_list *msl, 1553 void *arg __rte_unused) 1554 { 1555 struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; 1556 unsigned int len; 1557 int msl_idx; 1558 1559 if (msl->external) 1560 return 0; 1561 1562 msl_idx = msl - mcfg->memsegs; 1563 len = msl->memseg_arr.len; 1564 1565 return alloc_list(msl_idx, len); 1566 } 1567 1568 static int 1569 fd_list_destroy_walk(const struct rte_memseg_list *msl, void *arg __rte_unused) 1570 { 1571 struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; 1572 int msl_idx; 1573 1574 if (msl->external) 1575 return 0; 1576 1577 msl_idx = msl - mcfg->memsegs; 1578 1579 return destroy_list(msl_idx); 1580 } 1581 1582 int 1583 eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd) 1584 { 1585 struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; 1586 const struct internal_config *internal_conf = 1587 eal_get_internal_configuration(); 1588 1589 /* single file segments mode doesn't support individual segment fd's */ 1590 if (internal_conf->single_file_segments) 1591 return -ENOTSUP; 1592 1593 /* if list is not allocated, allocate it */ 1594 if (fd_list[list_idx].len == 0) { 1595 int len = mcfg->memsegs[list_idx].memseg_arr.len; 1596 1597 if (alloc_list(list_idx, len) < 0) 1598 return -ENOMEM; 1599 } 1600 
fd_list[list_idx].fds[seg_idx] = fd; 1601 1602 return 0; 1603 } 1604 1605 int 1606 eal_memalloc_set_seg_list_fd(int list_idx, int fd) 1607 { 1608 const struct internal_config *internal_conf = 1609 eal_get_internal_configuration(); 1610 1611 /* non-single file segment mode doesn't support segment list fd's */ 1612 if (!internal_conf->single_file_segments) 1613 return -ENOTSUP; 1614 1615 fd_list[list_idx].memseg_list_fd = fd; 1616 1617 return 0; 1618 } 1619 1620 int 1621 eal_memalloc_get_seg_fd(int list_idx, int seg_idx) 1622 { 1623 int fd; 1624 const struct internal_config *internal_conf = 1625 eal_get_internal_configuration(); 1626 1627 if (internal_conf->in_memory || internal_conf->no_hugetlbfs) { 1628 #ifndef MEMFD_SUPPORTED 1629 /* in in-memory or no-huge mode, we rely on memfd support */ 1630 return -ENOTSUP; 1631 #endif 1632 /* memfd supported, but hugetlbfs memfd may not be */ 1633 if (!internal_conf->no_hugetlbfs && !memfd_create_supported) 1634 return -ENOTSUP; 1635 } 1636 1637 if (internal_conf->single_file_segments) { 1638 fd = fd_list[list_idx].memseg_list_fd; 1639 } else if (fd_list[list_idx].len == 0) { 1640 /* list not initialized */ 1641 fd = -1; 1642 } else { 1643 fd = fd_list[list_idx].fds[seg_idx]; 1644 } 1645 if (fd < 0) 1646 return -ENODEV; 1647 return fd; 1648 } 1649 1650 static int 1651 test_memfd_create(void) 1652 { 1653 #ifdef MEMFD_SUPPORTED 1654 const struct internal_config *internal_conf = 1655 eal_get_internal_configuration(); 1656 unsigned int i; 1657 for (i = 0; i < internal_conf->num_hugepage_sizes; i++) { 1658 uint64_t pagesz = internal_conf->hugepage_info[i].hugepage_sz; 1659 int pagesz_flag = pagesz_flags(pagesz); 1660 int flags; 1661 1662 flags = pagesz_flag | RTE_MFD_HUGETLB; 1663 int fd = memfd_create("test", flags); 1664 if (fd < 0) { 1665 /* we failed - let memalloc know this isn't working */ 1666 if (errno == EINVAL) { 1667 memfd_create_supported = 0; 1668 return 0; /* not supported */ 1669 } 1670 1671 /* we got other error - something's wrong */ 1672 return -1; /* error */ 1673 } 1674 close(fd); 1675 return 1; /* supported */ 1676 } 1677 #endif 1678 return 0; /* not supported */ 1679 } 1680 1681 int 1682 eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset) 1683 { 1684 struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; 1685 const struct internal_config *internal_conf = 1686 eal_get_internal_configuration(); 1687 1688 if (internal_conf->in_memory || internal_conf->no_hugetlbfs) { 1689 #ifndef MEMFD_SUPPORTED 1690 /* in in-memory or no-huge mode, we rely on memfd support */ 1691 return -ENOTSUP; 1692 #endif 1693 /* memfd supported, but hugetlbfs memfd may not be */ 1694 if (!internal_conf->no_hugetlbfs && !memfd_create_supported) 1695 return -ENOTSUP; 1696 } 1697 1698 if (internal_conf->single_file_segments) { 1699 size_t pgsz = mcfg->memsegs[list_idx].page_sz; 1700 1701 /* segment not active? */ 1702 if (fd_list[list_idx].memseg_list_fd < 0) 1703 return -ENOENT; 1704 *offset = pgsz * seg_idx; 1705 } else { 1706 /* fd_list not initialized? */ 1707 if (fd_list[list_idx].len == 0) 1708 return -ENODEV; 1709 1710 /* segment not active? 
 */
		if (fd_list[list_idx].fds[seg_idx] < 0)
			return -ENOENT;
		*offset = 0;
	}
	return 0;
}

int
eal_memalloc_cleanup(void)
{
	/* close all remaining fd's - these are per-process, so it's safe */
	if (rte_memseg_list_walk_thread_unsafe(fd_list_destroy_walk, NULL))
		return -1;

	/* destroy the shadow page table if we're a secondary process */
	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
		return 0;

	if (rte_memseg_list_walk_thread_unsafe(secondary_msl_destroy_walk,
			NULL))
		return -1;

	return 0;
}

int
eal_memalloc_init(void)
{
	const struct internal_config *internal_conf =
			eal_get_internal_configuration();

	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
		/* memory_hotplug_lock is held during initialization, so it's
		 * safe to call thread-unsafe version.
		 */
		if (rte_memseg_list_walk_thread_unsafe(secondary_msl_create_walk, NULL) < 0)
			return -1;
	if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
			internal_conf->in_memory) {
		int mfd_res = test_memfd_create();

		if (mfd_res < 0) {
			EAL_LOG(ERR, "Unable to check if memfd is supported");
			return -1;
		}
		if (mfd_res == 1)
			EAL_LOG(DEBUG, "Using memfd for anonymous memory");
		else
			EAL_LOG(INFO, "Using memfd is not supported, falling back to anonymous hugepages");

		/* we only support single-file segments mode with in-memory mode
		 * if we support hugetlbfs with memfd_create. this code will
		 * test if we do.
		 */
		if (internal_conf->single_file_segments &&
				mfd_res != 1) {
			EAL_LOG(ERR, "Single-file segments mode cannot be used without memfd support");
			return -1;
		}
		/* this cannot ever happen but better safe than sorry */
		if (!anonymous_hugepages_supported) {
			EAL_LOG(ERR, "Using anonymous memory is not supported");
			return -1;
		}
		/* safety net, should be impossible to configure */
		if (internal_conf->hugepage_file.unlink_before_mapping &&
				!internal_conf->hugepage_file.unlink_existing) {
			EAL_LOG(ERR, "Unlinking existing hugepage files is prohibited, cannot unlink them before mapping.");
			return -1;
		}
	}

	/* initialize all of the fd lists */
	if (rte_memseg_list_walk_thread_unsafe(fd_list_create_walk, NULL))
		return -1;
	return 0;
}
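
/*
 * Illustrative sketch (not part of the original file): a minimal example of
 * how a caller might drive the allocator entry points defined above, assuming
 * EAL has been initialized, dynamic memory mode is in use (not legacy mem),
 * and 2 MB hugepages are available. The function name example_alloc_and_free
 * is hypothetical; the block is kept under #if 0 so it has no effect on the
 * build.
 */
#if 0
static void
example_alloc_and_free(void)
{
	/* allocate a single 2 MB hugepage-backed segment on socket 0 */
	struct rte_memseg *ms = eal_memalloc_alloc_seg(RTE_PGSIZE_2M, 0);

	if (ms == NULL)
		return;

	/* the segment is now mapped at ms->addr, with IOVA ms->iova */

	/* return the page to the system */
	eal_memalloc_free_seg(ms);
}
#endif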