/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2017-2018 Intel Corporation
 */

#include <errno.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/queue.h>
#include <sys/file.h>
#include <unistd.h>
#include <limits.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <signal.h>
#include <setjmp.h>
#ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */
#include <linux/memfd.h>
#define MEMFD_SUPPORTED
#endif
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
#include <numa.h>
#include <numaif.h>
#endif
#include <linux/falloc.h>
#include <linux/mman.h> /* for hugetlb-related mmap flags */

#include <rte_common.h>
#include <rte_log.h>
#include <rte_eal.h>
#include <rte_errno.h>
#include <rte_memory.h>
#include <rte_spinlock.h>

#include "eal_filesystem.h"
#include "eal_internal_cfg.h"
#include "eal_memalloc.h"
#include "eal_memcfg.h"
#include "eal_private.h"

const int anonymous_hugepages_supported =
#ifdef MAP_HUGE_SHIFT
		1;
#define RTE_MAP_HUGE_SHIFT MAP_HUGE_SHIFT
#else
		0;
#define RTE_MAP_HUGE_SHIFT 26
#endif

/*
 * we've already checked memfd support at compile-time, but we also need to
 * check if we can create hugepage files with memfd.
 *
 * also, this is not a constant, because while we may be *compiled* with memfd
 * hugetlbfs support, we might not be *running* on a system that supports memfd
 * and/or memfd with hugetlbfs, so we need to be able to adjust this flag at
 * runtime, and fall back to anonymous memory.
 */
static int memfd_create_supported =
#ifdef MFD_HUGETLB
		1;
#define RTE_MFD_HUGETLB MFD_HUGETLB
#else
		0;
#define RTE_MFD_HUGETLB 4U
#endif

/*
 * not all kernel versions support fallocate on hugetlbfs, so fall back to
 * ftruncate and disallow deallocation if fallocate is not supported.
 */
static int fallocate_supported = -1; /* unknown */

/*
 * we have two modes - single file segments, and file-per-page mode.
 *
 * for single-file segments, we use memseg_list_fd to store the segment fd,
 * while the fds[] will not be allocated, and len will be set to 0.
 *
 * for file-per-page mode, each page will have its own fd, so 'memseg_list_fd'
 * will be invalid (set to -1), and we'll use 'fds' to keep track of page fd's.
 *
 * we cannot know how many pages a system will have in advance, but we do know
 * that they come in lists, and we know lengths of these lists. so, simply store
 * a malloc'd array of fd's indexed by list and segment index.
 *
 * they will be initialized at startup, and filled as we allocate/deallocate
 * segments.
 */
static struct {
	int *fds; /**< dynamically allocated array of segment lock fd's */
	int memseg_list_fd; /**< memseg list fd */
	int len; /**< total length of the array */
	int count; /**< entries used in the array */
} fd_list[RTE_MAX_MEMSEG_LISTS];

/** local copy of a memory map, used to synchronize memory hotplug in MP */
static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS];

static sigjmp_buf huge_jmpenv;

static void __rte_unused huge_sigbus_handler(int signo __rte_unused)
{
	siglongjmp(huge_jmpenv, 1);
}

/* Put setjmp into a wrapper function to avoid a compile error. Any
 * non-volatile, non-static local variable in the stack frame calling
 * sigsetjmp might be clobbered by a call to longjmp.
 */
static int __rte_unused huge_wrap_sigsetjmp(void)
{
	return sigsetjmp(huge_jmpenv, 1);
}

static struct sigaction huge_action_old;
static int huge_need_recover;

static void __rte_unused
huge_register_sigbus(void)
{
	sigset_t mask;
	struct sigaction action;

	sigemptyset(&mask);
	sigaddset(&mask, SIGBUS);
	action.sa_flags = 0;
	action.sa_mask = mask;
	action.sa_handler = huge_sigbus_handler;

	huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
}

static void __rte_unused
huge_recover_sigbus(void)
{
	if (huge_need_recover) {
		sigaction(SIGBUS, &huge_action_old, NULL);
		huge_need_recover = 0;
	}
}

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
static bool
check_numa(void)
{
	bool ret = true;
	/* Check if kernel supports NUMA. */
	if (numa_available() != 0) {
		RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n");
		ret = false;
	}
	return ret;
}

static void
prepare_numa(int *oldpolicy, struct bitmask *oldmask, int socket_id)
{
	RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
	if (get_mempolicy(oldpolicy, oldmask->maskp,
			oldmask->size + 1, 0, 0) < 0) {
		RTE_LOG(ERR, EAL,
			"Failed to get current mempolicy: %s. "
" 172 "Assuming MPOL_DEFAULT.\n", strerror(errno)); 173 *oldpolicy = MPOL_DEFAULT; 174 } 175 RTE_LOG(DEBUG, EAL, 176 "Setting policy MPOL_PREFERRED for socket %d\n", 177 socket_id); 178 numa_set_preferred(socket_id); 179 } 180 181 static void 182 restore_numa(int *oldpolicy, struct bitmask *oldmask) 183 { 184 RTE_LOG(DEBUG, EAL, 185 "Restoring previous memory policy: %d\n", *oldpolicy); 186 if (*oldpolicy == MPOL_DEFAULT) { 187 numa_set_localalloc(); 188 } else if (set_mempolicy(*oldpolicy, oldmask->maskp, 189 oldmask->size + 1) < 0) { 190 RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n", 191 strerror(errno)); 192 numa_set_localalloc(); 193 } 194 numa_free_cpumask(oldmask); 195 } 196 #endif 197 198 /* 199 * uses fstat to report the size of a file on disk 200 */ 201 static off_t 202 get_file_size(int fd) 203 { 204 struct stat st; 205 if (fstat(fd, &st) < 0) 206 return 0; 207 return st.st_size; 208 } 209 210 static int 211 pagesz_flags(uint64_t page_sz) 212 { 213 /* as per mmap() manpage, all page sizes are log2 of page size 214 * shifted by MAP_HUGE_SHIFT 215 */ 216 int log2 = rte_log2_u64(page_sz); 217 return log2 << RTE_MAP_HUGE_SHIFT; 218 } 219 220 /* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */ 221 static int lock(int fd, int type) 222 { 223 int ret; 224 225 /* flock may be interrupted */ 226 do { 227 ret = flock(fd, type | LOCK_NB); 228 } while (ret && errno == EINTR); 229 230 if (ret && errno == EWOULDBLOCK) { 231 /* couldn't lock */ 232 return 0; 233 } else if (ret) { 234 RTE_LOG(ERR, EAL, "%s(): error calling flock(): %s\n", 235 __func__, strerror(errno)); 236 return -1; 237 } 238 /* lock was successful */ 239 return 1; 240 } 241 242 static int 243 get_seg_memfd(struct hugepage_info *hi __rte_unused, 244 unsigned int list_idx __rte_unused, 245 unsigned int seg_idx __rte_unused) 246 { 247 #ifdef MEMFD_SUPPORTED 248 int fd; 249 char segname[250]; /* as per manpage, limit is 249 bytes plus null */ 250 251 int flags = RTE_MFD_HUGETLB | pagesz_flags(hi->hugepage_sz); 252 const struct internal_config *internal_conf = 253 eal_get_internal_configuration(); 254 255 if (internal_conf->single_file_segments) { 256 fd = fd_list[list_idx].memseg_list_fd; 257 258 if (fd < 0) { 259 snprintf(segname, sizeof(segname), "seg_%i", list_idx); 260 fd = memfd_create(segname, flags); 261 if (fd < 0) { 262 RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n", 263 __func__, strerror(errno)); 264 return -1; 265 } 266 fd_list[list_idx].memseg_list_fd = fd; 267 } 268 } else { 269 fd = fd_list[list_idx].fds[seg_idx]; 270 271 if (fd < 0) { 272 snprintf(segname, sizeof(segname), "seg_%i-%i", 273 list_idx, seg_idx); 274 fd = memfd_create(segname, flags); 275 if (fd < 0) { 276 RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n", 277 __func__, strerror(errno)); 278 return -1; 279 } 280 fd_list[list_idx].fds[seg_idx] = fd; 281 } 282 } 283 return fd; 284 #endif 285 return -1; 286 } 287 288 static int 289 get_seg_fd(char *path, int buflen, struct hugepage_info *hi, 290 unsigned int list_idx, unsigned int seg_idx) 291 { 292 int fd; 293 const struct internal_config *internal_conf = 294 eal_get_internal_configuration(); 295 296 /* for in-memory mode, we only make it here when we're sure we support 297 * memfd, and this is a special case. 
	 */
	if (internal_conf->in_memory)
		return get_seg_memfd(hi, list_idx, seg_idx);

	if (internal_conf->single_file_segments) {
		/* create a hugepage file path */
		eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx);

		fd = fd_list[list_idx].memseg_list_fd;

		if (fd < 0) {
			fd = open(path, O_CREAT | O_RDWR, 0600);
			if (fd < 0) {
				RTE_LOG(ERR, EAL, "%s(): open failed: %s\n",
					__func__, strerror(errno));
				return -1;
			}
			/* take out a read lock and keep it indefinitely */
			if (lock(fd, LOCK_SH) < 0) {
				RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n",
					__func__, strerror(errno));
				close(fd);
				return -1;
			}
			fd_list[list_idx].memseg_list_fd = fd;
		}
	} else {
		/* create a hugepage file path */
		eal_get_hugefile_path(path, buflen, hi->hugedir,
				list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);

		fd = fd_list[list_idx].fds[seg_idx];

		if (fd < 0) {
			/* A primary process is the only one creating these
			 * files. If there is a leftover that was not cleaned
			 * by clear_hugedir(), we must *now* make sure to drop
			 * the file or we will remap old stuff while the rest
			 * of the code is built on the assumption that a new
			 * page is clean.
			 */
			if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
					unlink(path) == -1 &&
					errno != ENOENT) {
				RTE_LOG(DEBUG, EAL, "%s(): could not remove '%s': %s\n",
					__func__, path, strerror(errno));
				return -1;
			}

			fd = open(path, O_CREAT | O_RDWR, 0600);
			if (fd < 0) {
				RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n",
					__func__, strerror(errno));
				return -1;
			}
			/* take out a read lock */
			if (lock(fd, LOCK_SH) < 0) {
				RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n",
					__func__, strerror(errno));
				close(fd);
				return -1;
			}
			fd_list[list_idx].fds[seg_idx] = fd;
		}
	}
	return fd;
}

static int
resize_hugefile_in_memory(int fd, uint64_t fa_offset,
		uint64_t page_sz, bool grow)
{
	int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
			FALLOC_FL_KEEP_SIZE;
	int ret;

	/* grow or shrink the file */
	ret = fallocate(fd, flags, fa_offset, page_sz);

	if (ret < 0) {
		RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
				__func__,
				strerror(errno));
		return -1;
	}
	return 0;
}

static int
resize_hugefile_in_filesystem(int fd, uint64_t fa_offset, uint64_t page_sz,
		bool grow)
{
	bool again = false;

	do {
		if (fallocate_supported == 0) {
			/* we cannot deallocate memory if fallocate() is not
			 * supported, and hugepage file is already locked at
			 * creation, so no further synchronization needed.
			 */

			if (!grow) {
				RTE_LOG(DEBUG, EAL, "%s(): fallocate not supported, not freeing page back to the system\n",
					__func__);
				return -1;
			}
			uint64_t new_size = fa_offset + page_sz;
			uint64_t cur_size = get_file_size(fd);

			/* fallocate isn't supported, fall back to ftruncate */
			if (new_size > cur_size &&
					ftruncate(fd, new_size) < 0) {
				RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
					__func__, strerror(errno));
				return -1;
			}
		} else {
			int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
					FALLOC_FL_KEEP_SIZE;
			int ret;

			/*
			 * technically, it is perfectly safe for both primary
			 * and secondary to grow and shrink the page files:
			 * growing the file repeatedly has no effect because
			 * a page can only be allocated once, while mmap ensures
			 * that secondaries hold on to the page even after the
			 * page itself is removed from the filesystem.
			 *
			 * however, leaving growing/shrinking to the primary
			 * tends to expose bugs in fdlist page count handling,
			 * so leave this here just in case.
			 */
			if (rte_eal_process_type() != RTE_PROC_PRIMARY)
				return 0;

			/* grow or shrink the file */
			ret = fallocate(fd, flags, fa_offset, page_sz);

			if (ret < 0) {
				if (fallocate_supported == -1 &&
						errno == ENOTSUP) {
					RTE_LOG(ERR, EAL, "%s(): fallocate() not supported, hugepage deallocation will be disabled\n",
						__func__);
					again = true;
					fallocate_supported = 0;
				} else {
					RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
						__func__,
						strerror(errno));
					return -1;
				}
			} else
				fallocate_supported = 1;
		}
	} while (again);

	return 0;
}

static void
close_hugefile(int fd, char *path, int list_idx)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();
	/*
	 * primary process must unlink the file, but only when not in in-memory
	 * mode (as in that case there is no file to unlink).
	 */
	if (!internal_conf->in_memory &&
			rte_eal_process_type() == RTE_PROC_PRIMARY &&
			unlink(path))
		RTE_LOG(ERR, EAL, "%s(): unlinking '%s' failed: %s\n",
			__func__, path, strerror(errno));

	close(fd);
	fd_list[list_idx].memseg_list_fd = -1;
}

static int
resize_hugefile(int fd, uint64_t fa_offset, uint64_t page_sz, bool grow)
{
	/* in-memory mode is a special case, because we can be sure that
	 * fallocate() is supported.
	 */
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (internal_conf->in_memory)
		return resize_hugefile_in_memory(fd, fa_offset,
				page_sz, grow);

	return resize_hugefile_in_filesystem(fd, fa_offset, page_sz,
			grow);
}

static int
alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
		struct hugepage_info *hi, unsigned int list_idx,
		unsigned int seg_idx)
{
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	int cur_socket_id = 0;
#endif
	uint64_t map_offset;
	rte_iova_t iova;
	void *va;
	char path[PATH_MAX];
	int ret = 0;
	int fd;
	size_t alloc_sz;
	int flags;
	void *new_addr;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	alloc_sz = hi->hugepage_sz;

	/* these are checked at init, but code analyzers don't know that */
	if (internal_conf->in_memory && !anonymous_hugepages_supported) {
		RTE_LOG(ERR, EAL, "Anonymous hugepages not supported, in-memory mode cannot allocate memory\n");
		return -1;
	}
	if (internal_conf->in_memory && !memfd_create_supported &&
			internal_conf->single_file_segments) {
		RTE_LOG(ERR, EAL, "Single-file segments are not supported without memfd support\n");
		return -1;
	}

	/* in-memory without memfd is a special case */
	int mmap_flags;

	if (internal_conf->in_memory && !memfd_create_supported) {
		const int in_memory_flags = MAP_HUGETLB | MAP_FIXED |
				MAP_PRIVATE | MAP_ANONYMOUS;
		int pagesz_flag;

		pagesz_flag = pagesz_flags(alloc_sz);
		fd = -1;
		mmap_flags = in_memory_flags | pagesz_flag;

		/* single-file segments codepath will never be active
		 * here because in-memory mode is incompatible with the
		 * fallback path, and it's stopped at EAL initialization
		 * stage.
		 */
		map_offset = 0;
	} else {
		/* takes out a read lock on segment or segment list */
		fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
		if (fd < 0) {
			RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n");
			return -1;
		}

		if (internal_conf->single_file_segments) {
			map_offset = seg_idx * alloc_sz;
			ret = resize_hugefile(fd, map_offset, alloc_sz, true);
			if (ret < 0)
				goto resized;

			fd_list[list_idx].count++;
		} else {
			map_offset = 0;
			if (ftruncate(fd, alloc_sz) < 0) {
				RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
					__func__, strerror(errno));
				goto resized;
			}
			if (internal_conf->hugepage_unlink &&
					!internal_conf->in_memory) {
				if (unlink(path)) {
					RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n",
						__func__, strerror(errno));
					goto resized;
				}
			}
		}
		mmap_flags = MAP_SHARED | MAP_POPULATE | MAP_FIXED;
	}

	/*
	 * map the segment, and populate page tables, the kernel fills
	 * this segment with zeros if it's a new page.
	 */
	va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, mmap_flags, fd,
			map_offset);

	if (va == MAP_FAILED) {
		RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
			strerror(errno));
		/* mmap failed, but the previous region might have been
		 * unmapped anyway. try to remap it
		 */
		goto unmapped;
	}
	if (va != addr) {
		RTE_LOG(DEBUG, EAL, "%s(): wrong mmap() address\n", __func__);
		munmap(va, alloc_sz);
		goto resized;
	}

	/* In Linux, hugetlb limitations, like cgroup, are
	 * enforced at fault time instead of mmap(), even
	 * with the option of MAP_POPULATE. The kernel will send
	 * a SIGBUS signal. To avoid being killed, save the stack
	 * environment here; if SIGBUS happens, we can jump
	 * back here.
	 */
	if (huge_wrap_sigsetjmp()) {
		RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more hugepages of size %uMB\n",
			(unsigned int)(alloc_sz >> 20));
		goto mapped;
	}

	/* we need to trigger a write to the page to enforce page fault and
	 * ensure that page is accessible to us, but we can't overwrite value
	 * that is already there, so read the old value and write it back.
	 * kernel populates the page with zeroes initially.
	 */
	*(volatile int *)addr = *(volatile int *)addr;

	iova = rte_mem_virt2iova(addr);
	if (iova == RTE_BAD_PHYS_ADDR) {
		RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n",
			__func__);
		goto mapped;
	}

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	/*
	 * If the kernel has been built without NUMA support, get_mempolicy()
	 * will return an error. If check_numa() returns false, memory
	 * allocation is not NUMA aware and the socket_id should not be
	 * checked.
	 */
	if (check_numa()) {
		ret = get_mempolicy(&cur_socket_id, NULL, 0, addr,
					MPOL_F_NODE | MPOL_F_ADDR);
		if (ret < 0) {
			RTE_LOG(DEBUG, EAL, "%s(): get_mempolicy: %s\n",
				__func__, strerror(errno));
			goto mapped;
		} else if (cur_socket_id != socket_id) {
			RTE_LOG(DEBUG, EAL,
					"%s(): allocation happened on wrong socket (wanted %d, got %d)\n",
					__func__, socket_id, cur_socket_id);
			goto mapped;
		}
	}
#else
	if (rte_socket_count() > 1)
		RTE_LOG(DEBUG, EAL, "%s(): not checking hugepage NUMA node.\n",
				__func__);
#endif

	ms->addr = addr;
	ms->hugepage_sz = alloc_sz;
	ms->len = alloc_sz;
	ms->nchannel = rte_memory_get_nchannel();
	ms->nrank = rte_memory_get_nrank();
	ms->iova = iova;
	ms->socket_id = socket_id;

	return 0;

mapped:
	munmap(addr, alloc_sz);
unmapped:
	flags = EAL_RESERVE_FORCE_ADDRESS;
	new_addr = eal_get_virtual_area(addr, &alloc_sz, alloc_sz, 0, flags);
	if (new_addr != addr) {
		if (new_addr != NULL)
			munmap(new_addr, alloc_sz);
		/* we're leaving a hole in our virtual address space. if
		 * somebody else maps this hole now, we could accidentally
		 * overwrite it in the future.
		 */
		RTE_LOG(CRIT, EAL, "Can't mmap holes in our virtual address space\n");
	}
	/* roll back the ref count */
	if (internal_conf->single_file_segments)
		fd_list[list_idx].count--;
resized:
	/* some codepaths will return negative fd, so exit early */
	if (fd < 0)
		return -1;

	if (internal_conf->single_file_segments) {
		resize_hugefile(fd, map_offset, alloc_sz, false);
		/* ignore failure, can't make it any worse */

		/* if refcount is at zero, close the file */
		if (fd_list[list_idx].count == 0)
			close_hugefile(fd, path, list_idx);
	} else {
		/* only remove file if we can take out a write lock */
		if (internal_conf->hugepage_unlink == 0 &&
				internal_conf->in_memory == 0 &&
				lock(fd, LOCK_EX) == 1)
			unlink(path);
		close(fd);
		fd_list[list_idx].fds[seg_idx] = -1;
	}
	return -1;
}

static int
free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
		unsigned int list_idx, unsigned int seg_idx)
{
	uint64_t map_offset;
	char path[PATH_MAX];
	int fd, ret = 0;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* erase page data */
	memset(ms->addr, 0, ms->len);

	if (mmap(ms->addr, ms->len, PROT_NONE,
			MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) ==
				MAP_FAILED) {
		RTE_LOG(DEBUG, EAL, "couldn't unmap page\n");
		return -1;
	}

	eal_mem_set_dump(ms->addr, ms->len, false);

	/* if we're using anonymous hugepages, nothing to be done */
	if (internal_conf->in_memory && !memfd_create_supported) {
		memset(ms, 0, sizeof(*ms));
		return 0;
	}

	/* if we are not in single file segments mode, we're going to unmap the
	 * segment and thus drop the lock on original fd, but hugepage dir is
	 * now locked so we can take out another one without races.
	 */
	fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
	if (fd < 0)
		return -1;

	if (internal_conf->single_file_segments) {
		map_offset = seg_idx * ms->len;
		if (resize_hugefile(fd, map_offset, ms->len, false))
			return -1;

		if (--(fd_list[list_idx].count) == 0)
			close_hugefile(fd, path, list_idx);

		ret = 0;
	} else {
		/* if we're able to take out a write lock, we're the last one
		 * holding onto this page.
		 */
		if (!internal_conf->in_memory && !internal_conf->hugepage_unlink) {
			ret = lock(fd, LOCK_EX);
			if (ret >= 0) {
				/* no one else is using this page */
				if (ret == 1)
					unlink(path);
			}
		}
		/* closing fd will drop the lock */
		close(fd);
		fd_list[list_idx].fds[seg_idx] = -1;
	}

	memset(ms, 0, sizeof(*ms));

	return ret < 0 ? -1 : 0;
}

struct alloc_walk_param {
	struct hugepage_info *hi;
	struct rte_memseg **ms;
	size_t page_sz;
	unsigned int segs_allocated;
	unsigned int n_segs;
	int socket;
	bool exact;
};
static int
alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct alloc_walk_param *wa = arg;
	struct rte_memseg_list *cur_msl;
	size_t page_sz;
	int cur_idx, start_idx, j, dir_fd = -1;
	unsigned int msl_idx, need, i;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (msl->page_sz != wa->page_sz)
		return 0;
	if (msl->socket_id != wa->socket)
		return 0;

	page_sz = (size_t)msl->page_sz;

	msl_idx = msl - mcfg->memsegs;
	cur_msl = &mcfg->memsegs[msl_idx];

	need = wa->n_segs;

	/* try finding space in memseg list */
	if (wa->exact) {
		/* if we require exact number of pages in a list, find them */
		cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0,
				need);
		if (cur_idx < 0)
			return 0;
		start_idx = cur_idx;
	} else {
		int cur_len;

		/* we don't require exact number of pages, so we're going to go
		 * for best-effort allocation. that means finding the biggest
		 * unused block, and going with that.
		 */
		cur_idx = rte_fbarray_find_biggest_free(&cur_msl->memseg_arr,
				0);
		if (cur_idx < 0)
			return 0;
		start_idx = cur_idx;
		/* adjust the size to possibly be smaller than original
		 * request, but do not allow it to be bigger.
		 */
		cur_len = rte_fbarray_find_contig_free(&cur_msl->memseg_arr,
				cur_idx);
		need = RTE_MIN(need, (unsigned int)cur_len);
	}

	/* do not allow any page allocations during the time we're allocating,
	 * because file creation and locking operations are not atomic,
	 * and we might be the first or the last ones to use a particular page,
	 * so we need to ensure atomicity of every operation.
	 *
	 * during init, we already hold a write lock, so don't try to take out
	 * another one.
	 */
	if (wa->hi->lock_descriptor == -1 && !internal_conf->in_memory) {
		dir_fd = open(wa->hi->hugedir, O_RDONLY);
		if (dir_fd < 0) {
			RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
				__func__, wa->hi->hugedir, strerror(errno));
			return -1;
		}
		/* blocking writelock */
		if (flock(dir_fd, LOCK_EX)) {
			RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n",
				__func__, wa->hi->hugedir, strerror(errno));
			close(dir_fd);
			return -1;
		}
	}

	for (i = 0; i < need; i++, cur_idx++) {
		struct rte_memseg *cur;
		void *map_addr;

		cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx);
		map_addr = RTE_PTR_ADD(cur_msl->base_va,
				cur_idx * page_sz);

		if (alloc_seg(cur, map_addr, wa->socket, wa->hi,
				msl_idx, cur_idx)) {
			RTE_LOG(DEBUG, EAL, "attempted to allocate %i segments, but only %i were allocated\n",
				need, i);

			/* if exact number wasn't requested, stop */
			if (!wa->exact)
				goto out;

			/* clean up */
			for (j = start_idx; j < cur_idx; j++) {
				struct rte_memseg *tmp;
				struct rte_fbarray *arr =
						&cur_msl->memseg_arr;

				tmp = rte_fbarray_get(arr, j);
				rte_fbarray_set_free(arr, j);

				/* free_seg may attempt to create a file, which
				 * may fail.
				 */
				if (free_seg(tmp, wa->hi, msl_idx, j))
					RTE_LOG(DEBUG, EAL, "Cannot free page\n");
			}
			/* clear the list */
			if (wa->ms)
				memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs);

			if (dir_fd >= 0)
				close(dir_fd);
			return -1;
		}
		if (wa->ms)
			wa->ms[i] = cur;

		rte_fbarray_set_used(&cur_msl->memseg_arr, cur_idx);
	}
out:
	wa->segs_allocated = i;
	if (i > 0)
		cur_msl->version++;
	if (dir_fd >= 0)
		close(dir_fd);
	/* if we didn't allocate any segments, move on to the next list */
	return i > 0;
}

struct free_walk_param {
	struct hugepage_info *hi;
	struct rte_memseg *ms;
};
static int
free_seg_walk(const struct rte_memseg_list *msl, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *found_msl;
	struct free_walk_param *wa = arg;
	uintptr_t start_addr, end_addr;
	int msl_idx, seg_idx, ret, dir_fd = -1;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	start_addr = (uintptr_t) msl->base_va;
	end_addr = start_addr + msl->len;

	if ((uintptr_t)wa->ms->addr < start_addr ||
			(uintptr_t)wa->ms->addr >= end_addr)
		return 0;

	msl_idx = msl - mcfg->memsegs;
	seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz;

	/* msl is const */
	found_msl = &mcfg->memsegs[msl_idx];

	/* do not allow any page allocations during the time we're freeing,
	 * because file creation and locking operations are not atomic,
	 * and we might be the first or the last ones to use a particular page,
	 * so we need to ensure atomicity of every operation.
	 *
	 * during init, we already hold a write lock, so don't try to take out
	 * another one.
	 */
	if (wa->hi->lock_descriptor == -1 && !internal_conf->in_memory) {
		dir_fd = open(wa->hi->hugedir, O_RDONLY);
		if (dir_fd < 0) {
			RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
				__func__, wa->hi->hugedir, strerror(errno));
			return -1;
		}
		/* blocking writelock */
		if (flock(dir_fd, LOCK_EX)) {
			RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n",
				__func__, wa->hi->hugedir, strerror(errno));
			close(dir_fd);
			return -1;
		}
	}

	found_msl->version++;

	rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx);

	ret = free_seg(wa->ms, wa->hi, msl_idx, seg_idx);

	if (dir_fd >= 0)
		close(dir_fd);

	if (ret < 0)
		return -1;

	return 1;
}

int
eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz,
		int socket, bool exact)
{
	int i, ret = -1;
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	bool have_numa = false;
	int oldpolicy;
	struct bitmask *oldmask;
#endif
	struct alloc_walk_param wa;
	struct hugepage_info *hi = NULL;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	memset(&wa, 0, sizeof(wa));

	/* dynamic allocation not supported in legacy mode */
	if (internal_conf->legacy_mem)
		return -1;

	for (i = 0; i < (int) RTE_DIM(internal_conf->hugepage_info); i++) {
		if (page_sz ==
				internal_conf->hugepage_info[i].hugepage_sz) {
			hi = &internal_conf->hugepage_info[i];
			break;
		}
	}
	if (!hi) {
		RTE_LOG(ERR, EAL, "%s(): can't find relevant hugepage_info entry\n",
			__func__);
		return -1;
	}

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	if (check_numa()) {
		oldmask = numa_allocate_nodemask();
		prepare_numa(&oldpolicy, oldmask, socket);
		have_numa = true;
	}
#endif

	wa.exact = exact;
	wa.hi = hi;
	wa.ms = ms;
	wa.n_segs = n_segs;
	wa.page_sz = page_sz;
	wa.socket = socket;
	wa.segs_allocated = 0;

	/* memalloc is locked, so it's safe to use thread-unsafe version */
	ret = rte_memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa);
	if (ret == 0) {
		RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n",
			__func__);
		ret = -1;
	} else if (ret > 0) {
		ret = (int)wa.segs_allocated;
	}

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	if (have_numa)
		restore_numa(&oldpolicy, oldmask);
#endif
	return ret;
}

struct rte_memseg *
eal_memalloc_alloc_seg(size_t page_sz, int socket)
{
	struct rte_memseg *ms;
	if (eal_memalloc_alloc_seg_bulk(&ms, 1, page_sz, socket, true) < 0)
		return NULL;
	/* return pointer to newly allocated memseg */
	return ms;
}

int
eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs)
{
	int seg, ret = 0;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* dynamic free not supported in legacy mode */
	if (internal_conf->legacy_mem)
		return -1;

	for (seg = 0; seg < n_segs; seg++) {
		struct rte_memseg *cur = ms[seg];
		struct hugepage_info *hi = NULL;
		struct free_walk_param wa;
		int i, walk_res;

		/* if this page is marked as unfreeable, fail */
		if (cur->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) {
			RTE_LOG(DEBUG, EAL, "Page is not allowed to be freed\n");
			ret = -1;
			continue;
		}

		memset(&wa, 0, sizeof(wa));

		for (i = 0; i < (int)RTE_DIM(internal_conf->hugepage_info);
				i++) {
			hi = &internal_conf->hugepage_info[i];
			if (cur->hugepage_sz == hi->hugepage_sz)
				break;
		}
		if (i == (int)RTE_DIM(internal_conf->hugepage_info)) {
			RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
			ret = -1;
			continue;
		}

		wa.ms = cur;
		wa.hi = hi;

		/* memalloc is locked, so it's safe to use thread-unsafe version
		 */
		walk_res = rte_memseg_list_walk_thread_unsafe(free_seg_walk,
				&wa);
		if (walk_res == 1)
			continue;
		if (walk_res == 0)
			RTE_LOG(ERR, EAL, "Couldn't find memseg list\n");
		ret = -1;
	}
	return ret;
}

int
eal_memalloc_free_seg(struct rte_memseg *ms)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* dynamic free not supported in legacy mode */
	if (internal_conf->legacy_mem)
		return -1;

	return eal_memalloc_free_seg_bulk(&ms, 1);
}

static int
sync_chunk(struct rte_memseg_list *primary_msl,
		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
		unsigned int msl_idx, bool used, int start, int end)
{
	struct rte_fbarray *l_arr, *p_arr;
	int i, ret, chunk_len, diff_len;

	l_arr = &local_msl->memseg_arr;
	p_arr = &primary_msl->memseg_arr;

	/* we need to aggregate allocations/deallocations into bigger chunks,
	 * as we don't want to spam the user with per-page callbacks.
	 *
	 * to avoid any potential issues, we also want to trigger
	 * deallocation callbacks *before* we actually deallocate
	 * memory, so that the user application could wrap up its use
	 * before it goes away.
	 */

	chunk_len = end - start;

	/* find how many contiguous pages we can map/unmap for this chunk */
	diff_len = used ?
			rte_fbarray_find_contig_free(l_arr, start) :
			rte_fbarray_find_contig_used(l_arr, start);

	/* has to be at least one page */
	if (diff_len < 1)
		return -1;

	diff_len = RTE_MIN(chunk_len, diff_len);

	/* if we are freeing memory, notify the application */
	if (!used) {
		struct rte_memseg *ms;
		void *start_va;
		size_t len, page_sz;

		ms = rte_fbarray_get(l_arr, start);
		start_va = ms->addr;
		page_sz = (size_t)primary_msl->page_sz;
		len = page_sz * diff_len;

		eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE,
				start_va, len);
	}

	for (i = 0; i < diff_len; i++) {
		struct rte_memseg *p_ms, *l_ms;
		int seg_idx = start + i;

		l_ms = rte_fbarray_get(l_arr, seg_idx);
		p_ms = rte_fbarray_get(p_arr, seg_idx);

		if (l_ms == NULL || p_ms == NULL)
			return -1;

		if (used) {
			ret = alloc_seg(l_ms, p_ms->addr,
					p_ms->socket_id, hi,
					msl_idx, seg_idx);
			if (ret < 0)
				return -1;
			rte_fbarray_set_used(l_arr, seg_idx);
		} else {
			ret = free_seg(l_ms, hi, msl_idx, seg_idx);
			rte_fbarray_set_free(l_arr, seg_idx);
			if (ret < 0)
				return -1;
		}
	}

	/* if we just allocated memory, notify the application */
	if (used) {
		struct rte_memseg *ms;
		void *start_va;
		size_t len, page_sz;

		ms = rte_fbarray_get(l_arr, start);
		start_va = ms->addr;
		page_sz = (size_t)primary_msl->page_sz;
		len = page_sz * diff_len;

		eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC,
				start_va, len);
	}

	/* calculate how much we can advance until next chunk */
	diff_len = used ?
			rte_fbarray_find_contig_used(l_arr, start) :
			rte_fbarray_find_contig_free(l_arr, start);
	ret = RTE_MIN(chunk_len, diff_len);

	return ret;
}

static int
sync_status(struct rte_memseg_list *primary_msl,
		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
		unsigned int msl_idx, bool used)
{
	struct rte_fbarray *l_arr, *p_arr;
	int p_idx, l_chunk_len, p_chunk_len, ret;
	int start, end;

	/* this is a little bit tricky, but the basic idea is - walk both lists
	 * and spot any places where there are discrepancies. walking both lists
	 * and noting discrepancies in a single go is a hard problem, so we do
	 * it in two passes - first we spot any places where allocated segments
	 * mismatch (i.e. ensure that everything that's allocated in the primary
	 * is also allocated in the secondary), and then we do it by looking at
	 * free segments instead.
	 *
	 * we also need to aggregate changes into chunks, as we have to call
	 * callbacks per allocation, not per page.
	 */
	l_arr = &local_msl->memseg_arr;
	p_arr = &primary_msl->memseg_arr;

	if (used)
		p_idx = rte_fbarray_find_next_used(p_arr, 0);
	else
		p_idx = rte_fbarray_find_next_free(p_arr, 0);

	while (p_idx >= 0) {
		int next_chunk_search_idx;

		if (used) {
			p_chunk_len = rte_fbarray_find_contig_used(p_arr,
					p_idx);
			l_chunk_len = rte_fbarray_find_contig_used(l_arr,
					p_idx);
		} else {
			p_chunk_len = rte_fbarray_find_contig_free(p_arr,
					p_idx);
			l_chunk_len = rte_fbarray_find_contig_free(l_arr,
					p_idx);
		}
		/* best case scenario - no differences (or bigger, which will be
		 * fixed during next iteration), look for next chunk
		 */
		if (l_chunk_len >= p_chunk_len) {
			next_chunk_search_idx = p_idx + p_chunk_len;
			goto next_chunk;
		}

		/* if both chunks start at the same point, skip parts we know
		 * are identical, and sync the rest. each call to sync_chunk
		 * will only sync contiguous segments, so we need to call this
		 * until we are sure there are no more differences in this
		 * chunk.
		 */
		start = p_idx + l_chunk_len;
		end = p_idx + p_chunk_len;
		do {
			ret = sync_chunk(primary_msl, local_msl, hi, msl_idx,
					used, start, end);
			start += ret;
		} while (start < end && ret >= 0);
		/* if ret is negative, something went wrong */
		if (ret < 0)
			return -1;

		next_chunk_search_idx = p_idx + p_chunk_len;
next_chunk:
		/* skip to end of this chunk */
		if (used) {
			p_idx = rte_fbarray_find_next_used(p_arr,
					next_chunk_search_idx);
		} else {
			p_idx = rte_fbarray_find_next_free(p_arr,
					next_chunk_search_idx);
		}
	}
	return 0;
}

static int
sync_existing(struct rte_memseg_list *primary_msl,
		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
		unsigned int msl_idx)
{
	int ret, dir_fd;

	/* do not allow any page allocations during the time we're allocating,
	 * because file creation and locking operations are not atomic,
	 * and we might be the first or the last ones to use a particular page,
	 * so we need to ensure atomicity of every operation.
	 */
	dir_fd = open(hi->hugedir, O_RDONLY);
	if (dir_fd < 0) {
		RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", __func__,
			hi->hugedir, strerror(errno));
		return -1;
	}
	/* blocking writelock */
	if (flock(dir_fd, LOCK_EX)) {
		RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", __func__,
			hi->hugedir, strerror(errno));
		close(dir_fd);
		return -1;
	}

	/* ensure all allocated space is the same in both lists */
	ret = sync_status(primary_msl, local_msl, hi, msl_idx, true);
	if (ret < 0)
		goto fail;

	/* ensure all unallocated space is the same in both lists */
	ret = sync_status(primary_msl, local_msl, hi, msl_idx, false);
	if (ret < 0)
		goto fail;

	/* update version number */
	local_msl->version = primary_msl->version;

	close(dir_fd);

	return 0;
fail:
	close(dir_fd);
	return -1;
}

static int
sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *primary_msl, *local_msl;
	struct hugepage_info *hi = NULL;
	unsigned int i;
	int msl_idx;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (msl->external)
		return 0;

	msl_idx = msl - mcfg->memsegs;
	primary_msl = &mcfg->memsegs[msl_idx];
	local_msl = &local_memsegs[msl_idx];

	for (i = 0; i < RTE_DIM(internal_conf->hugepage_info); i++) {
		uint64_t cur_sz =
			internal_conf->hugepage_info[i].hugepage_sz;
		uint64_t msl_sz = primary_msl->page_sz;
		if (msl_sz == cur_sz) {
			hi = &internal_conf->hugepage_info[i];
			break;
		}
	}
	if (!hi) {
		RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
		return -1;
	}

	/* if versions don't match, synchronize everything */
	if (local_msl->version != primary_msl->version &&
			sync_existing(primary_msl, local_msl, hi, msl_idx))
		return -1;
	return 0;
}


int
eal_memalloc_sync_with_primary(void)
{
	/* nothing to be done in primary */
	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
		return 0;

	/* memalloc is locked, so it's safe to call thread-unsafe version */
	if (rte_memseg_list_walk_thread_unsafe(sync_walk, NULL))
		return -1;
	return 0;
}

static int
secondary_msl_create_walk(const struct rte_memseg_list *msl,
		void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *primary_msl, *local_msl;
	char name[PATH_MAX];
	int msl_idx, ret;

	if (msl->external)
		return 0;

	msl_idx = msl - mcfg->memsegs;
	primary_msl = &mcfg->memsegs[msl_idx];
	local_msl = &local_memsegs[msl_idx];

	/* create distinct fbarrays for each secondary */
	snprintf(name, RTE_FBARRAY_NAME_LEN, "%s_%i",
		primary_msl->memseg_arr.name, getpid());

	ret = rte_fbarray_init(&local_msl->memseg_arr, name,
		primary_msl->memseg_arr.len,
		primary_msl->memseg_arr.elt_sz);
	if (ret < 0) {
		RTE_LOG(ERR, EAL, "Cannot initialize local memory map\n");
		return -1;
	}
	local_msl->base_va = primary_msl->base_va;
	local_msl->len = primary_msl->len;

	return 0;
}

static int
secondary_msl_destroy_walk(const struct rte_memseg_list *msl,
		void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *local_msl;
	int msl_idx, ret;

	if (msl->external)
		return 0;

	msl_idx = msl - mcfg->memsegs;
	local_msl = &local_memsegs[msl_idx];

	ret = rte_fbarray_destroy(&local_msl->memseg_arr);
	if (ret < 0) {
		RTE_LOG(ERR, EAL, "Cannot destroy local memory map\n");
		return -1;
	}
	local_msl->base_va = NULL;
	local_msl->len = 0;

	return 0;
}

static int
alloc_list(int list_idx, int len)
{
	int *data;
	int i;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* single-file segments mode does not need fd list */
	if (!internal_conf->single_file_segments) {
		/* ensure we have space to store fd per each possible segment */
		data = malloc(sizeof(int) * len);
		if (data == NULL) {
			RTE_LOG(ERR, EAL, "Unable to allocate space for file descriptors\n");
			return -1;
		}
		/* set all fd's as invalid */
		for (i = 0; i < len; i++)
			data[i] = -1;
		fd_list[list_idx].fds = data;
		fd_list[list_idx].len = len;
	} else {
		fd_list[list_idx].fds = NULL;
		fd_list[list_idx].len = 0;
	}

	fd_list[list_idx].count = 0;
	fd_list[list_idx].memseg_list_fd = -1;

	return 0;
}

static int
destroy_list(int list_idx)
{
	const struct internal_config *internal_conf =
			eal_get_internal_configuration();

	/* single-file segments mode does not need fd list */
	if (!internal_conf->single_file_segments) {
		int *fds = fd_list[list_idx].fds;
		int i;
		/* go through each fd and ensure it's closed */
		for (i = 0; i < fd_list[list_idx].len; i++) {
			if (fds[i] >= 0) {
				close(fds[i]);
				fds[i] = -1;
			}
		}
		free(fds);
		fd_list[list_idx].fds = NULL;
		fd_list[list_idx].len = 0;
	} else if (fd_list[list_idx].memseg_list_fd >= 0) {
		close(fd_list[list_idx].memseg_list_fd);
		fd_list[list_idx].count = 0;
		fd_list[list_idx].memseg_list_fd = -1;
	}
	return 0;
}

static int
fd_list_create_walk(const struct rte_memseg_list *msl,
		void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	unsigned int len;
	int msl_idx;

	if (msl->external)
		return 0;

	msl_idx = msl - mcfg->memsegs;
	len = msl->memseg_arr.len;

	return alloc_list(msl_idx, len);
}

static int
fd_list_destroy_walk(const struct rte_memseg_list *msl, void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int msl_idx;

	if (msl->external)
		return 0;

	msl_idx = msl - mcfg->memsegs;

	return destroy_list(msl_idx);
}

int
eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* single file segments mode doesn't support individual segment fd's */
	if (internal_conf->single_file_segments)
		return -ENOTSUP;

	/* if list is not allocated, allocate it */
	if (fd_list[list_idx].len == 0) {
		int len = mcfg->memsegs[list_idx].memseg_arr.len;

		if (alloc_list(list_idx, len) < 0)
			return -ENOMEM;
	}
	fd_list[list_idx].fds[seg_idx] = fd;

	return 0;
}

int
eal_memalloc_set_seg_list_fd(int list_idx, int fd)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* non-single file segment mode doesn't support segment list fd's */
	if (!internal_conf->single_file_segments)
		return -ENOTSUP;

	fd_list[list_idx].memseg_list_fd = fd;

	return 0;
}

int
eal_memalloc_get_seg_fd(int list_idx, int seg_idx)
{
	int fd;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (internal_conf->in_memory || internal_conf->no_hugetlbfs) {
#ifndef MEMFD_SUPPORTED
		/* in in-memory or no-huge mode, we rely on memfd support */
		return -ENOTSUP;
#endif
		/* memfd supported, but hugetlbfs memfd may not be */
		if (!internal_conf->no_hugetlbfs && !memfd_create_supported)
			return -ENOTSUP;
	}

	if (internal_conf->single_file_segments) {
		fd = fd_list[list_idx].memseg_list_fd;
	} else if (fd_list[list_idx].len == 0) {
		/* list not initialized */
		fd = -1;
	} else {
		fd = fd_list[list_idx].fds[seg_idx];
	}
	if (fd < 0)
		return -ENODEV;
	return fd;
}

static int
test_memfd_create(void)
{
#ifdef MEMFD_SUPPORTED
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();
	unsigned int i;
	for (i = 0; i < internal_conf->num_hugepage_sizes; i++) {
		uint64_t pagesz = internal_conf->hugepage_info[i].hugepage_sz;
		int pagesz_flag = pagesz_flags(pagesz);
		int flags;

		flags = pagesz_flag | RTE_MFD_HUGETLB;
		int fd = memfd_create("test", flags);
		if (fd < 0) {
			/* we failed - let memalloc know this isn't working */
			if (errno == EINVAL) {
				memfd_create_supported = 0;
				return 0; /* not supported */
			}

			/* we got other error - something's wrong */
			return -1; /* error */
		}
		close(fd);
		return 1; /* supported */
	}
#endif
	return 0; /* not supported */
}

int
eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (internal_conf->in_memory || internal_conf->no_hugetlbfs) {
#ifndef MEMFD_SUPPORTED
		/* in in-memory or no-huge mode, we rely on memfd support */
		return -ENOTSUP;
#endif
		/* memfd supported, but hugetlbfs memfd may not be */
		if (!internal_conf->no_hugetlbfs && !memfd_create_supported)
			return -ENOTSUP;
	}

	if (internal_conf->single_file_segments) {
		size_t pgsz = mcfg->memsegs[list_idx].page_sz;

		/* segment not active? */
		if (fd_list[list_idx].memseg_list_fd < 0)
			return -ENOENT;
		*offset = pgsz * seg_idx;
	} else {
		/* fd_list not initialized? */
		if (fd_list[list_idx].len == 0)
			return -ENODEV;

		/* segment not active? */
		if (fd_list[list_idx].fds[seg_idx] < 0)
			return -ENOENT;
		*offset = 0;
	}
	return 0;
}

int
eal_memalloc_cleanup(void)
{
	/* close all remaining fd's - these are per-process, so it's safe */
	if (rte_memseg_list_walk_thread_unsafe(fd_list_destroy_walk, NULL))
		return -1;

	/* destroy the shadow page table if we're a secondary process */
	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
		return 0;

	if (rte_memseg_list_walk_thread_unsafe(secondary_msl_destroy_walk,
			NULL))
		return -1;

	return 0;
}

int
eal_memalloc_init(void)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
		if (rte_memseg_list_walk(secondary_msl_create_walk, NULL) < 0)
			return -1;
	if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
			internal_conf->in_memory) {
		int mfd_res = test_memfd_create();

		if (mfd_res < 0) {
			RTE_LOG(ERR, EAL, "Unable to check if memfd is supported\n");
			return -1;
		}
		if (mfd_res == 1)
			RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n");
		else
			RTE_LOG(INFO, EAL, "Using memfd is not supported, falling back to anonymous hugepages\n");

		/* we only support single-file segments mode with in-memory mode
		 * if we support hugetlbfs with memfd_create. this code will
		 * test if we do.
		 */
		if (internal_conf->single_file_segments &&
				mfd_res != 1) {
			RTE_LOG(ERR, EAL, "Single-file segments mode cannot be used without memfd support\n");
			return -1;
		}
		/* this cannot ever happen but better safe than sorry */
		if (!anonymous_hugepages_supported) {
			RTE_LOG(ERR, EAL, "Using anonymous memory is not supported\n");
			return -1;
		}
	}

	/* initialize all of the fd lists */
	if (rte_memseg_list_walk(fd_list_create_walk, NULL))
		return -1;
	return 0;
}
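
/*
 * Illustrative sketch only, kept out of the build with #if 0: a rough idea of
 * how the bulk allocator above might be driven. The function name, segment
 * count and socket below are hypothetical; in real DPDK these entry points are
 * exercised by the malloc heap and memory hotplug code, not by applications.
 */
#if 0
static void
example_alloc_and_free(void)
{
	struct rte_memseg *segs[4];
	int n;

	/* ask for up to four 2 MB pages on socket 0; 'exact' is false, so
	 * fewer pages than requested may be returned.
	 */
	n = eal_memalloc_alloc_seg_bulk(segs, 4, RTE_PGSIZE_2M, 0, false);
	if (n <= 0)
		return;

	/* ... use segs[0..n-1]->addr here ... */

	/* return the pages; this fails with -1 in legacy memory mode */
	eal_memalloc_free_seg_bulk(segs, n);
}
#endif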