/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2017-2018 Intel Corporation
 */

#include <errno.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/queue.h>
#include <sys/file.h>
#include <unistd.h>
#include <limits.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <signal.h>
#include <setjmp.h>
#ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */
#include <linux/memfd.h>
#define MEMFD_SUPPORTED
#endif
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
#include <numa.h>
#include <numaif.h>
#endif
#include <linux/falloc.h>
#include <linux/mman.h> /* for hugetlb-related mmap flags */

#include <rte_common.h>
#include <rte_log.h>
#include <rte_eal.h>
#include <rte_errno.h>
#include <rte_memory.h>
#include <rte_spinlock.h>

#include "eal_filesystem.h"
#include "eal_internal_cfg.h"
#include "eal_memalloc.h"
#include "eal_memcfg.h"
#include "eal_private.h"

const int anonymous_hugepages_supported =
#ifdef MAP_HUGE_SHIFT
		1;
#define RTE_MAP_HUGE_SHIFT MAP_HUGE_SHIFT
#else
		0;
#define RTE_MAP_HUGE_SHIFT 26
#endif

/*
 * we've already checked memfd support at compile-time, but we also need to
 * check if we can create hugepage files with memfd.
 *
 * also, this is not a constant, because while we may be *compiled* with memfd
 * hugetlbfs support, we might not be *running* on a system that supports memfd
 * and/or memfd with hugetlbfs, so we need to be able to adjust this flag at
 * runtime, and fall back to anonymous memory.
 */
static int memfd_create_supported =
#ifdef MFD_HUGETLB
		1;
#define RTE_MFD_HUGETLB MFD_HUGETLB
#else
		0;
#define RTE_MFD_HUGETLB 4U
#endif

/*
 * not all kernel versions support fallocate on hugetlbfs, so fall back to
 * ftruncate and disallow deallocation if fallocate is not supported.
 */
static int fallocate_supported = -1; /* unknown */

/*
 * we have two modes - single file segments, and file-per-page mode.
 *
 * for single-file segments, we use memseg_list_fd to store the segment fd,
 * while the fds[] will not be allocated, and len will be set to 0.
 *
 * for file-per-page mode, each page will have its own fd, so 'memseg_list_fd'
 * will be invalid (set to -1), and we'll use 'fds' to keep track of page fd's.
 *
 * we cannot know how many pages a system will have in advance, but we do know
 * that they come in lists, and we know lengths of these lists. so, simply store
 * a malloc'd array of fd's indexed by list and segment index.
 *
 * they will be initialized at startup, and filled as we allocate/deallocate
 * segments.
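 *
 * to summarize, for a given list_idx/seg_idx pair:
 *   - single-file segments: use fd_list[list_idx].memseg_list_fd
 *     (fds is NULL and len is 0)
 *   - file-per-page: use fd_list[list_idx].fds[seg_idx]
 *     (memseg_list_fd stays -1)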
 */
static struct {
	int *fds; /**< dynamically allocated array of segment lock fd's */
	int memseg_list_fd; /**< memseg list fd */
	int len; /**< total length of the array */
	int count; /**< entries used in the array */
} fd_list[RTE_MAX_MEMSEG_LISTS];

/** local copy of a memory map, used to synchronize memory hotplug in MP */
static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS];

static sigjmp_buf huge_jmpenv;

static void huge_sigbus_handler(int signo __rte_unused)
{
	siglongjmp(huge_jmpenv, 1);
}

/* Put sigsetjmp into a wrapper function to avoid a compilation error. Any
 * non-volatile, non-static local variable in the stack frame calling sigsetjmp
 * might be clobbered by a call to longjmp.
 */
static int huge_wrap_sigsetjmp(void)
{
	return sigsetjmp(huge_jmpenv, 1);
}

static struct sigaction huge_action_old;
static int huge_need_recover;

static void
huge_register_sigbus(void)
{
	sigset_t mask;
	struct sigaction action;

	sigemptyset(&mask);
	sigaddset(&mask, SIGBUS);
	action.sa_flags = 0;
	action.sa_mask = mask;
	action.sa_handler = huge_sigbus_handler;

	huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
}

static void
huge_recover_sigbus(void)
{
	if (huge_need_recover) {
		sigaction(SIGBUS, &huge_action_old, NULL);
		huge_need_recover = 0;
	}
}

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
static bool
check_numa(void)
{
	bool ret = true;
	/* Check if kernel supports NUMA. */
	if (numa_available() != 0) {
		RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n");
		ret = false;
	}
	return ret;
}

static void
prepare_numa(int *oldpolicy, struct bitmask *oldmask, int socket_id)
{
	RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
	if (get_mempolicy(oldpolicy, oldmask->maskp,
			oldmask->size + 1, 0, 0) < 0) {
		RTE_LOG(ERR, EAL,
			"Failed to get current mempolicy: %s. "
			"Assuming MPOL_DEFAULT.\n", strerror(errno));
		*oldpolicy = MPOL_DEFAULT;
	}
	RTE_LOG(DEBUG, EAL,
		"Setting policy MPOL_PREFERRED for socket %d\n",
		socket_id);
	numa_set_preferred(socket_id);
}

static void
restore_numa(int *oldpolicy, struct bitmask *oldmask)
{
	RTE_LOG(DEBUG, EAL,
		"Restoring previous memory policy: %d\n", *oldpolicy);
	if (*oldpolicy == MPOL_DEFAULT) {
		numa_set_localalloc();
	} else if (set_mempolicy(*oldpolicy, oldmask->maskp,
			oldmask->size + 1) < 0) {
		RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n",
			strerror(errno));
		numa_set_localalloc();
	}
	numa_free_cpumask(oldmask);
}
#endif

/*
 * uses fstat to report the size of a file on disk
 */
static off_t
get_file_size(int fd)
{
	struct stat st;
	if (fstat(fd, &st) < 0)
		return 0;
	return st.st_size;
}

static int
pagesz_flags(uint64_t page_sz)
{
	/* as per mmap() manpage, huge page size is encoded as log2 of the page
	 * size, shifted by MAP_HUGE_SHIFT
	 */
	int log2 = rte_log2_u64(page_sz);
	return log2 << RTE_MAP_HUGE_SHIFT;
}

/* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */
static int lock(int fd, int type)
{
	int ret;

	/* flock may be interrupted */
	do {
		ret = flock(fd, type | LOCK_NB);
	} while (ret && errno == EINTR);

	if (ret && errno == EWOULDBLOCK) {
		/* couldn't lock */
		return 0;
	} else if (ret) {
		RTE_LOG(ERR, EAL, "%s(): error calling flock(): %s\n",
			__func__, strerror(errno));
		return -1;
	}
	/* lock was successful */
	return 1;
}

static int
get_seg_memfd(struct hugepage_info *hi __rte_unused,
		unsigned int list_idx __rte_unused,
		unsigned int seg_idx __rte_unused)
{
#ifdef MEMFD_SUPPORTED
	int fd;
	char segname[250]; /* as per manpage, limit is 249 bytes plus null */

	int flags = RTE_MFD_HUGETLB | pagesz_flags(hi->hugepage_sz);
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (internal_conf->single_file_segments) {
		fd = fd_list[list_idx].memseg_list_fd;

		if (fd < 0) {
			snprintf(segname, sizeof(segname), "seg_%i", list_idx);
			fd = memfd_create(segname, flags);
			if (fd < 0) {
				RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n",
					__func__, strerror(errno));
				return -1;
			}
			fd_list[list_idx].memseg_list_fd = fd;
		}
	} else {
		fd = fd_list[list_idx].fds[seg_idx];

		if (fd < 0) {
			snprintf(segname, sizeof(segname), "seg_%i-%i",
					list_idx, seg_idx);
			fd = memfd_create(segname, flags);
			if (fd < 0) {
				RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n",
					__func__, strerror(errno));
				return -1;
			}
			fd_list[list_idx].fds[seg_idx] = fd;
		}
	}
	return fd;
#endif
	return -1;
}

static int
get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
		unsigned int list_idx, unsigned int seg_idx)
{
	int fd;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* for in-memory mode, we only make it here when we're sure we support
	 * memfd, and this is a special case.
	 */
	if (internal_conf->in_memory)
		return get_seg_memfd(hi, list_idx, seg_idx);

	if (internal_conf->single_file_segments) {
		/* create a hugepage file path */
		eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx);

		fd = fd_list[list_idx].memseg_list_fd;

		if (fd < 0) {
			fd = open(path, O_CREAT | O_RDWR, 0600);
			if (fd < 0) {
				RTE_LOG(ERR, EAL, "%s(): open failed: %s\n",
					__func__, strerror(errno));
				return -1;
			}
			/* take out a read lock and keep it indefinitely */
			if (lock(fd, LOCK_SH) < 0) {
				RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n",
					__func__, strerror(errno));
				close(fd);
				return -1;
			}
			fd_list[list_idx].memseg_list_fd = fd;
		}
	} else {
		/* create a hugepage file path */
		eal_get_hugefile_path(path, buflen, hi->hugedir,
				list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);

		fd = fd_list[list_idx].fds[seg_idx];

		if (fd < 0) {
			/* A primary process is the only one creating these
			 * files. If there is a leftover that was not cleaned
			 * by clear_hugedir(), we must *now* make sure to drop
			 * the file or we will remap old stuff while the rest
			 * of the code is built on the assumption that a new
			 * page is clean.
			 */
			if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
					unlink(path) == -1 &&
					errno != ENOENT) {
				RTE_LOG(DEBUG, EAL, "%s(): could not remove '%s': %s\n",
					__func__, path, strerror(errno));
				return -1;
			}

			fd = open(path, O_CREAT | O_RDWR, 0600);
			if (fd < 0) {
				RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n",
					__func__, strerror(errno));
				return -1;
			}
			/* take out a read lock */
			if (lock(fd, LOCK_SH) < 0) {
				RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n",
					__func__, strerror(errno));
				close(fd);
				return -1;
			}
			fd_list[list_idx].fds[seg_idx] = fd;
		}
	}
	return fd;
}

static int
resize_hugefile_in_memory(int fd, uint64_t fa_offset,
		uint64_t page_sz, bool grow)
{
	int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
			FALLOC_FL_KEEP_SIZE;
	int ret;

	/* grow or shrink the file */
	ret = fallocate(fd, flags, fa_offset, page_sz);

	if (ret < 0) {
		RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
				__func__,
				strerror(errno));
		return -1;
	}
	return 0;
}

static int
resize_hugefile_in_filesystem(int fd, uint64_t fa_offset, uint64_t page_sz,
		bool grow)
{
	bool again = false;

	do {
		if (fallocate_supported == 0) {
			/* we cannot deallocate memory if fallocate() is not
			 * supported, and hugepage file is already locked at
			 * creation, so no further synchronization needed.
			 */

			if (!grow) {
				RTE_LOG(DEBUG, EAL, "%s(): fallocate not supported, not freeing page back to the system\n",
					__func__);
				return -1;
			}
			uint64_t new_size = fa_offset + page_sz;
			uint64_t cur_size = get_file_size(fd);

			/* fallocate isn't supported, fall back to ftruncate */
			if (new_size > cur_size &&
					ftruncate(fd, new_size) < 0) {
				RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
					__func__, strerror(errno));
				return -1;
			}
		} else {
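			/* note: growing uses fallocate() with no flags (which
			 * allocates the range), while shrinking punches a hole
			 * in the file without changing its apparent size.
			 */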
			int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
					FALLOC_FL_KEEP_SIZE;
			int ret;

			/*
			 * technically, it is perfectly safe for both primary
			 * and secondary to grow and shrink the page files:
			 * growing the file repeatedly has no effect because
			 * a page can only be allocated once, while mmap ensures
			 * that secondaries hold on to the page even after the
			 * page itself is removed from the filesystem.
			 *
			 * however, leaving growing/shrinking to the primary
			 * tends to expose bugs in fdlist page count handling,
			 * so leave this here just in case.
			 */
			if (rte_eal_process_type() != RTE_PROC_PRIMARY)
				return 0;

			/* grow or shrink the file */
			ret = fallocate(fd, flags, fa_offset, page_sz);

			if (ret < 0) {
				if (fallocate_supported == -1 &&
						errno == ENOTSUP) {
					RTE_LOG(ERR, EAL, "%s(): fallocate() not supported, hugepage deallocation will be disabled\n",
						__func__);
					again = true;
					fallocate_supported = 0;
				} else {
					RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
						__func__,
						strerror(errno));
					return -1;
				}
			} else
				fallocate_supported = 1;
		}
	} while (again);

	return 0;
}

static void
close_hugefile(int fd, char *path, int list_idx)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();
	/*
	 * primary process must unlink the file, but only when not in in-memory
	 * mode (as in that case there is no file to unlink).
	 */
	if (!internal_conf->in_memory &&
			rte_eal_process_type() == RTE_PROC_PRIMARY &&
			unlink(path))
		RTE_LOG(ERR, EAL, "%s(): unlinking '%s' failed: %s\n",
			__func__, path, strerror(errno));

	close(fd);
	fd_list[list_idx].memseg_list_fd = -1;
}

static int
resize_hugefile(int fd, uint64_t fa_offset, uint64_t page_sz, bool grow)
{
	/* in-memory mode is a special case, because we can be sure that
	 * fallocate() is supported.
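	 * (in-memory segment files are created with memfd_create(), see
	 * get_seg_memfd(), rather than living on a hugetlbfs mount)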
	 */
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (internal_conf->in_memory)
		return resize_hugefile_in_memory(fd, fa_offset,
				page_sz, grow);

	return resize_hugefile_in_filesystem(fd, fa_offset, page_sz,
			grow);
}

static int
alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
		struct hugepage_info *hi, unsigned int list_idx,
		unsigned int seg_idx)
{
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	int cur_socket_id = 0;
#endif
	uint64_t map_offset;
	rte_iova_t iova;
	void *va;
	char path[PATH_MAX];
	int ret = 0;
	int fd;
	size_t alloc_sz;
	int flags;
	void *new_addr;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	alloc_sz = hi->hugepage_sz;

	/* these are checked at init, but code analyzers don't know that */
	if (internal_conf->in_memory && !anonymous_hugepages_supported) {
		RTE_LOG(ERR, EAL, "Anonymous hugepages not supported, in-memory mode cannot allocate memory\n");
		return -1;
	}
	if (internal_conf->in_memory && !memfd_create_supported &&
			internal_conf->single_file_segments) {
		RTE_LOG(ERR, EAL, "Single-file segments are not supported without memfd support\n");
		return -1;
	}

	/* in-memory without memfd is a special case */
	int mmap_flags;

	if (internal_conf->in_memory && !memfd_create_supported) {
		const int in_memory_flags = MAP_HUGETLB | MAP_FIXED |
				MAP_PRIVATE | MAP_ANONYMOUS;
		int pagesz_flag;

		pagesz_flag = pagesz_flags(alloc_sz);
		fd = -1;
		mmap_flags = in_memory_flags | pagesz_flag;

		/* single-file segments codepath will never be active
		 * here because in-memory mode is incompatible with the
		 * fallback path, and it's stopped at EAL initialization
		 * stage.
		 */
		map_offset = 0;
	} else {
		/* takes out a read lock on segment or segment list */
		fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
		if (fd < 0) {
			RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n");
			return -1;
		}

		if (internal_conf->single_file_segments) {
			map_offset = seg_idx * alloc_sz;
			ret = resize_hugefile(fd, map_offset, alloc_sz, true);
			if (ret < 0)
				goto resized;

			fd_list[list_idx].count++;
		} else {
			map_offset = 0;
			if (ftruncate(fd, alloc_sz) < 0) {
				RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
					__func__, strerror(errno));
				goto resized;
			}
			if (internal_conf->hugepage_unlink &&
					!internal_conf->in_memory) {
				if (unlink(path)) {
					RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n",
						__func__, strerror(errno));
					goto resized;
				}
			}
		}
		mmap_flags = MAP_SHARED | MAP_POPULATE | MAP_FIXED;
	}

	huge_register_sigbus();

	/*
	 * map the segment and populate page tables; the kernel fills
	 * this segment with zeros if it's a new page.
	 */
	va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, mmap_flags, fd,
			map_offset);

	if (va == MAP_FAILED) {
		RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
			strerror(errno));
		/* mmap failed, but the previous region might have been
		 * unmapped anyway.
		 * try to remap it.
		 */
		goto unmapped;
	}
	if (va != addr) {
		RTE_LOG(DEBUG, EAL, "%s(): wrong mmap() address\n", __func__);
		munmap(va, alloc_sz);
		goto resized;
	}

	/* In Linux, hugetlb limitations, like cgroup, are
	 * enforced at fault time instead of mmap(), even
	 * with the option of MAP_POPULATE. The kernel will
	 * send a SIGBUS signal. To avoid being killed, save
	 * the stack environment here; if SIGBUS happens, we
	 * can jump back to it.
	 */
	if (huge_wrap_sigsetjmp()) {
		RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more hugepages of size %uMB\n",
			(unsigned int)(alloc_sz >> 20));
		goto mapped;
	}

	/* we need to trigger a write to the page to force a page fault and
	 * ensure that the page is accessible to us, but we can't overwrite a
	 * value that is already there, so read the old value, and write it
	 * back. kernel populates the page with zeroes initially.
	 */
	*(volatile int *)addr = *(volatile int *)addr;

	iova = rte_mem_virt2iova(addr);
	if (iova == RTE_BAD_PHYS_ADDR) {
		RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n",
			__func__);
		goto mapped;
	}

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	/*
	 * If the kernel has been built without NUMA support, get_mempolicy()
	 * will return an error. If check_numa() returns false, memory
	 * allocation is not NUMA aware and the socket_id should not be
	 * checked.
	 */
	if (check_numa()) {
		ret = get_mempolicy(&cur_socket_id, NULL, 0, addr,
					MPOL_F_NODE | MPOL_F_ADDR);
		if (ret < 0) {
			RTE_LOG(DEBUG, EAL, "%s(): get_mempolicy: %s\n",
				__func__, strerror(errno));
			goto mapped;
		} else if (cur_socket_id != socket_id) {
			RTE_LOG(DEBUG, EAL,
				"%s(): allocation happened on wrong socket (wanted %d, got %d)\n",
				__func__, socket_id, cur_socket_id);
			goto mapped;
		}
	}
#else
	if (rte_socket_count() > 1)
		RTE_LOG(DEBUG, EAL, "%s(): not checking hugepage NUMA node.\n",
				__func__);
#endif

	huge_recover_sigbus();

	ms->addr = addr;
	ms->hugepage_sz = alloc_sz;
	ms->len = alloc_sz;
	ms->nchannel = rte_memory_get_nchannel();
	ms->nrank = rte_memory_get_nrank();
	ms->iova = iova;
	ms->socket_id = socket_id;

	return 0;

mapped:
	munmap(addr, alloc_sz);
unmapped:
	huge_recover_sigbus();
	flags = EAL_RESERVE_FORCE_ADDRESS;
	new_addr = eal_get_virtual_area(addr, &alloc_sz, alloc_sz, 0, flags);
	if (new_addr != addr) {
		if (new_addr != NULL)
			munmap(new_addr, alloc_sz);
		/* we're leaving a hole in our virtual address space. if
		 * somebody else maps this hole now, we could accidentally
		 * overwrite it in the future.
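		 * (later allocations mmap() with MAP_FIXED at addresses taken
		 * from the memseg list, so they would silently clobber any
		 * foreign mapping that ends up inside this hole)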
		 */
		RTE_LOG(CRIT, EAL, "Can't mmap holes in our virtual address space\n");
	}
	/* roll back the ref count */
	if (internal_conf->single_file_segments)
		fd_list[list_idx].count--;
resized:
	/* some codepaths will return negative fd, so exit early */
	if (fd < 0)
		return -1;

	if (internal_conf->single_file_segments) {
		resize_hugefile(fd, map_offset, alloc_sz, false);
		/* ignore failure, can't make it any worse */

		/* if refcount is at zero, close the file */
		if (fd_list[list_idx].count == 0)
			close_hugefile(fd, path, list_idx);
	} else {
		/* only remove file if we can take out a write lock */
		if (internal_conf->hugepage_unlink == 0 &&
				internal_conf->in_memory == 0 &&
				lock(fd, LOCK_EX) == 1)
			unlink(path);
		close(fd);
		fd_list[list_idx].fds[seg_idx] = -1;
	}
	return -1;
}

static int
free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
		unsigned int list_idx, unsigned int seg_idx)
{
	uint64_t map_offset;
	char path[PATH_MAX];
	int fd, ret = 0;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* erase page data */
	memset(ms->addr, 0, ms->len);

	if (mmap(ms->addr, ms->len, PROT_NONE,
			MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) ==
				MAP_FAILED) {
		RTE_LOG(DEBUG, EAL, "couldn't unmap page\n");
		return -1;
	}

	eal_mem_set_dump(ms->addr, ms->len, false);

	/* if we're using anonymous hugepages, nothing to be done */
	if (internal_conf->in_memory && !memfd_create_supported) {
		memset(ms, 0, sizeof(*ms));
		return 0;
	}

	/* if we are not in single file segments mode, we're going to unmap the
	 * segment and thus drop the lock on original fd, but hugepage dir is
	 * now locked so we can take out another one without races.
	 */
	fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
	if (fd < 0)
		return -1;

	if (internal_conf->single_file_segments) {
		map_offset = seg_idx * ms->len;
		if (resize_hugefile(fd, map_offset, ms->len, false))
			return -1;

		if (--(fd_list[list_idx].count) == 0)
			close_hugefile(fd, path, list_idx);

		ret = 0;
	} else {
		/* if we're able to take out a write lock, we're the last one
		 * holding onto this page.
		 */
		if (!internal_conf->in_memory && !internal_conf->hugepage_unlink) {
			ret = lock(fd, LOCK_EX);
			if (ret >= 0) {
				/* no one else is using this page */
				if (ret == 1)
					unlink(path);
			}
		}
		/* closing fd will drop the lock */
		close(fd);
		fd_list[list_idx].fds[seg_idx] = -1;
	}

	memset(ms, 0, sizeof(*ms));

	return ret < 0 ? -1 : 0;
}

struct alloc_walk_param {
	struct hugepage_info *hi;
	struct rte_memseg **ms;
	size_t page_sz;
	unsigned int segs_allocated;
	unsigned int n_segs;
	int socket;
	bool exact;
};
static int
alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct alloc_walk_param *wa = arg;
	struct rte_memseg_list *cur_msl;
	size_t page_sz;
	int cur_idx, start_idx, j, dir_fd = -1;
	unsigned int msl_idx, need, i;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (msl->page_sz != wa->page_sz)
		return 0;
	if (msl->socket_id != wa->socket)
		return 0;

	page_sz = (size_t)msl->page_sz;

	msl_idx = msl - mcfg->memsegs;
	cur_msl = &mcfg->memsegs[msl_idx];

	need = wa->n_segs;

	/* try finding space in memseg list */
	if (wa->exact) {
		/* if we require exact number of pages in a list, find them */
		cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0,
				need);
		if (cur_idx < 0)
			return 0;
		start_idx = cur_idx;
	} else {
		int cur_len;

		/* we don't require exact number of pages, so we're going to go
		 * for best-effort allocation. that means finding the biggest
		 * unused block, and going with that.
		 */
		cur_idx = rte_fbarray_find_biggest_free(&cur_msl->memseg_arr,
				0);
		if (cur_idx < 0)
			return 0;
		start_idx = cur_idx;
		/* adjust the size to possibly be smaller than original
		 * request, but do not allow it to be bigger.
		 */
		cur_len = rte_fbarray_find_contig_free(&cur_msl->memseg_arr,
				cur_idx);
		need = RTE_MIN(need, (unsigned int)cur_len);
	}

	/* do not allow any page allocations during the time we're allocating,
	 * because file creation and locking operations are not atomic,
	 * and we might be the first or the last ones to use a particular page,
	 * so we need to ensure atomicity of every operation.
	 *
	 * during init, we already hold a write lock, so don't try to take out
	 * another one.
	 */
	if (wa->hi->lock_descriptor == -1 && !internal_conf->in_memory) {
		dir_fd = open(wa->hi->hugedir, O_RDONLY);
		if (dir_fd < 0) {
			RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
				__func__, wa->hi->hugedir, strerror(errno));
			return -1;
		}
		/* blocking writelock */
		if (flock(dir_fd, LOCK_EX)) {
			RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n",
				__func__, wa->hi->hugedir, strerror(errno));
			close(dir_fd);
			return -1;
		}
	}

	for (i = 0; i < need; i++, cur_idx++) {
		struct rte_memseg *cur;
		void *map_addr;

		cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx);
		map_addr = RTE_PTR_ADD(cur_msl->base_va,
				cur_idx * page_sz);

		if (alloc_seg(cur, map_addr, wa->socket, wa->hi,
				msl_idx, cur_idx)) {
			RTE_LOG(DEBUG, EAL, "attempted to allocate %i segments, but only %i were allocated\n",
				need, i);

			/* if exact number wasn't requested, stop */
			if (!wa->exact)
				goto out;

			/* clean up */
			for (j = start_idx; j < cur_idx; j++) {
				struct rte_memseg *tmp;
				struct rte_fbarray *arr =
						&cur_msl->memseg_arr;

				tmp = rte_fbarray_get(arr, j);
				rte_fbarray_set_free(arr, j);

				/* free_seg may attempt to create a file, which
				 * may fail.
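				 * (free_seg() goes through get_seg_fd(), which
				 * opens the hugepage file with O_CREAT)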
				 */
				if (free_seg(tmp, wa->hi, msl_idx, j))
					RTE_LOG(DEBUG, EAL, "Cannot free page\n");
			}
			/* clear the list */
			if (wa->ms)
				memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs);

			if (dir_fd >= 0)
				close(dir_fd);
			return -1;
		}
		if (wa->ms)
			wa->ms[i] = cur;

		rte_fbarray_set_used(&cur_msl->memseg_arr, cur_idx);
	}
out:
	wa->segs_allocated = i;
	if (i > 0)
		cur_msl->version++;
	if (dir_fd >= 0)
		close(dir_fd);
	/* if we didn't allocate any segments, move on to the next list */
	return i > 0;
}

struct free_walk_param {
	struct hugepage_info *hi;
	struct rte_memseg *ms;
};
static int
free_seg_walk(const struct rte_memseg_list *msl, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *found_msl;
	struct free_walk_param *wa = arg;
	uintptr_t start_addr, end_addr;
	int msl_idx, seg_idx, ret, dir_fd = -1;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	start_addr = (uintptr_t) msl->base_va;
	end_addr = start_addr + msl->len;

	if ((uintptr_t)wa->ms->addr < start_addr ||
			(uintptr_t)wa->ms->addr >= end_addr)
		return 0;

	msl_idx = msl - mcfg->memsegs;
	seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz;

	/* msl is const */
	found_msl = &mcfg->memsegs[msl_idx];

	/* do not allow any page allocations during the time we're freeing,
	 * because file creation and locking operations are not atomic,
	 * and we might be the first or the last ones to use a particular page,
	 * so we need to ensure atomicity of every operation.
	 *
	 * during init, we already hold a write lock, so don't try to take out
	 * another one.
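	 *
	 * (a lock_descriptor of -1 below means init is not holding the
	 * hugepage directory lock; in in-memory mode there is no directory
	 * to lock at all)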
	 */
	if (wa->hi->lock_descriptor == -1 && !internal_conf->in_memory) {
		dir_fd = open(wa->hi->hugedir, O_RDONLY);
		if (dir_fd < 0) {
			RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
				__func__, wa->hi->hugedir, strerror(errno));
			return -1;
		}
		/* blocking writelock */
		if (flock(dir_fd, LOCK_EX)) {
			RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n",
				__func__, wa->hi->hugedir, strerror(errno));
			close(dir_fd);
			return -1;
		}
	}

	found_msl->version++;

	rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx);

	ret = free_seg(wa->ms, wa->hi, msl_idx, seg_idx);

	if (dir_fd >= 0)
		close(dir_fd);

	if (ret < 0)
		return -1;

	return 1;
}

int
eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz,
		int socket, bool exact)
{
	int i, ret = -1;
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	bool have_numa = false;
	int oldpolicy;
	struct bitmask *oldmask;
#endif
	struct alloc_walk_param wa;
	struct hugepage_info *hi = NULL;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	memset(&wa, 0, sizeof(wa));

	/* dynamic allocation not supported in legacy mode */
	if (internal_conf->legacy_mem)
		return -1;

	for (i = 0; i < (int) RTE_DIM(internal_conf->hugepage_info); i++) {
		if (page_sz ==
				internal_conf->hugepage_info[i].hugepage_sz) {
			hi = &internal_conf->hugepage_info[i];
			break;
		}
	}
	if (!hi) {
		RTE_LOG(ERR, EAL, "%s(): can't find relevant hugepage_info entry\n",
			__func__);
		return -1;
	}

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	if (check_numa()) {
		oldmask = numa_allocate_nodemask();
		prepare_numa(&oldpolicy, oldmask, socket);
		have_numa = true;
	}
#endif

	wa.exact = exact;
	wa.hi = hi;
	wa.ms = ms;
	wa.n_segs = n_segs;
	wa.page_sz = page_sz;
	wa.socket = socket;
	wa.segs_allocated = 0;

	/* memalloc is locked, so it's safe to use thread-unsafe version */
	ret = rte_memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa);
	if (ret == 0) {
		RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n",
			__func__);
		ret = -1;
	} else if (ret > 0) {
		ret = (int)wa.segs_allocated;
	}

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	if (have_numa)
		restore_numa(&oldpolicy, oldmask);
#endif
	return ret;
}

struct rte_memseg *
eal_memalloc_alloc_seg(size_t page_sz, int socket)
{
	struct rte_memseg *ms;
	if (eal_memalloc_alloc_seg_bulk(&ms, 1, page_sz, socket, true) < 0)
		return NULL;
	/* return pointer to newly allocated memseg */
	return ms;
}

int
eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs)
{
	int seg, ret = 0;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* dynamic free not supported in legacy mode */
	if (internal_conf->legacy_mem)
		return -1;

	for (seg = 0; seg < n_segs; seg++) {
		struct rte_memseg *cur = ms[seg];
		struct hugepage_info *hi = NULL;
		struct free_walk_param wa;
		int i, walk_res;

		/* if this page is marked as unfreeable, fail */
		if (cur->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) {
			RTE_LOG(DEBUG, EAL, "Page is not allowed to be freed\n");
			ret = -1;
			continue;
		}

		memset(&wa, 0, sizeof(wa));

		for (i = 0; i < (int)RTE_DIM(internal_conf->hugepage_info);
				i++) {
			hi = &internal_conf->hugepage_info[i];
			if (cur->hugepage_sz == hi->hugepage_sz)
				break;
		}
		if (i == (int)RTE_DIM(internal_conf->hugepage_info)) {
			RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
			ret = -1;
			continue;
		}

		wa.ms = cur;
		wa.hi = hi;

		/* memalloc is locked, so it's safe to use thread-unsafe version
		 */
		walk_res = rte_memseg_list_walk_thread_unsafe(free_seg_walk,
				&wa);
		if (walk_res == 1)
			continue;
		if (walk_res == 0)
			RTE_LOG(ERR, EAL, "Couldn't find memseg list\n");
		ret = -1;
	}
	return ret;
}

int
eal_memalloc_free_seg(struct rte_memseg *ms)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* dynamic free not supported in legacy mode */
	if (internal_conf->legacy_mem)
		return -1;

	return eal_memalloc_free_seg_bulk(&ms, 1);
}

static int
sync_chunk(struct rte_memseg_list *primary_msl,
		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
		unsigned int msl_idx, bool used, int start, int end)
{
	struct rte_fbarray *l_arr, *p_arr;
	int i, ret, chunk_len, diff_len;

	l_arr = &local_msl->memseg_arr;
	p_arr = &primary_msl->memseg_arr;

	/* we need to aggregate allocations/deallocations into bigger chunks,
	 * as we don't want to spam the user with per-page callbacks.
	 *
	 * to avoid any potential issues, we also want to trigger
	 * deallocation callbacks *before* we actually deallocate
	 * memory, so that the user application could wrap up its use
	 * before it goes away.
	 */

	chunk_len = end - start;

	/* find how many contiguous pages we can map/unmap for this chunk */
	diff_len = used ?
			rte_fbarray_find_contig_free(l_arr, start) :
			rte_fbarray_find_contig_used(l_arr, start);

	/* has to be at least one page */
	if (diff_len < 1)
		return -1;

	diff_len = RTE_MIN(chunk_len, diff_len);

	/* if we are freeing memory, notify the application */
	if (!used) {
		struct rte_memseg *ms;
		void *start_va;
		size_t len, page_sz;

		ms = rte_fbarray_get(l_arr, start);
		start_va = ms->addr;
		page_sz = (size_t)primary_msl->page_sz;
		len = page_sz * diff_len;

		eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE,
				start_va, len);
	}

	for (i = 0; i < diff_len; i++) {
		struct rte_memseg *p_ms, *l_ms;
		int seg_idx = start + i;

		l_ms = rte_fbarray_get(l_arr, seg_idx);
		p_ms = rte_fbarray_get(p_arr, seg_idx);

		if (l_ms == NULL || p_ms == NULL)
			return -1;

		if (used) {
			ret = alloc_seg(l_ms, p_ms->addr,
					p_ms->socket_id, hi,
					msl_idx, seg_idx);
			if (ret < 0)
				return -1;
			rte_fbarray_set_used(l_arr, seg_idx);
		} else {
			ret = free_seg(l_ms, hi, msl_idx, seg_idx);
			rte_fbarray_set_free(l_arr, seg_idx);
			if (ret < 0)
				return -1;
		}
	}

	/* if we just allocated memory, notify the application */
	if (used) {
		struct rte_memseg *ms;
		void *start_va;
		size_t len, page_sz;

		ms = rte_fbarray_get(l_arr, start);
		start_va = ms->addr;
		page_sz = (size_t)primary_msl->page_sz;
		len = page_sz * diff_len;

		eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC,
				start_va, len);
	}

	/* calculate how much we can advance until next chunk */
	diff_len = used ?
			rte_fbarray_find_contig_used(l_arr, start) :
			rte_fbarray_find_contig_free(l_arr, start);
	ret = RTE_MIN(chunk_len, diff_len);

	return ret;
}

static int
sync_status(struct rte_memseg_list *primary_msl,
		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
		unsigned int msl_idx, bool used)
{
	struct rte_fbarray *l_arr, *p_arr;
	int p_idx, l_chunk_len, p_chunk_len, ret;
	int start, end;

	/* this is a little bit tricky, but the basic idea is - walk both lists
	 * and spot any places where there are discrepancies. walking both lists
	 * and noting discrepancies in a single go is a hard problem, so we do
	 * it in two passes - first we spot any places where allocated segments
	 * mismatch (i.e. ensure that everything that's allocated in the primary
	 * is also allocated in the secondary), and then we do it by looking at
	 * free segments instead.
	 *
	 * we also need to aggregate changes into chunks, as we have to call
	 * callbacks per allocation, not per page.
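	 *
	 * for example, if the primary has segments 0-7 in use but we only have
	 * 0-3, the first ("used") pass allocates local segments 4-7 in a
	 * single sync_chunk() call and fires one RTE_MEM_EVENT_ALLOC callback
	 * covering the whole range.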
	 */
	l_arr = &local_msl->memseg_arr;
	p_arr = &primary_msl->memseg_arr;

	if (used)
		p_idx = rte_fbarray_find_next_used(p_arr, 0);
	else
		p_idx = rte_fbarray_find_next_free(p_arr, 0);

	while (p_idx >= 0) {
		int next_chunk_search_idx;

		if (used) {
			p_chunk_len = rte_fbarray_find_contig_used(p_arr,
					p_idx);
			l_chunk_len = rte_fbarray_find_contig_used(l_arr,
					p_idx);
		} else {
			p_chunk_len = rte_fbarray_find_contig_free(p_arr,
					p_idx);
			l_chunk_len = rte_fbarray_find_contig_free(l_arr,
					p_idx);
		}
		/* best case scenario - no differences (or bigger, which will be
		 * fixed during next iteration), look for next chunk
		 */
		if (l_chunk_len >= p_chunk_len) {
			next_chunk_search_idx = p_idx + p_chunk_len;
			goto next_chunk;
		}

		/* if both chunks start at the same point, skip parts we know
		 * are identical, and sync the rest. each call to sync_chunk
		 * will only sync contiguous segments, so we need to call this
		 * until we are sure there are no more differences in this
		 * chunk.
		 */
		start = p_idx + l_chunk_len;
		end = p_idx + p_chunk_len;
		do {
			ret = sync_chunk(primary_msl, local_msl, hi, msl_idx,
					used, start, end);
			start += ret;
		} while (start < end && ret >= 0);
		/* if ret is negative, something went wrong */
		if (ret < 0)
			return -1;

		next_chunk_search_idx = p_idx + p_chunk_len;
next_chunk:
		/* skip to end of this chunk */
		if (used) {
			p_idx = rte_fbarray_find_next_used(p_arr,
					next_chunk_search_idx);
		} else {
			p_idx = rte_fbarray_find_next_free(p_arr,
					next_chunk_search_idx);
		}
	}
	return 0;
}

static int
sync_existing(struct rte_memseg_list *primary_msl,
		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
		unsigned int msl_idx)
{
	int ret, dir_fd;

	/* do not allow any page allocations during the time we're allocating,
	 * because file creation and locking operations are not atomic,
	 * and we might be the first or the last ones to use a particular page,
	 * so we need to ensure atomicity of every operation.
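	 *
	 * (this is the same hugepage directory flock that alloc_seg_walk()
	 * and free_seg_walk() take, so allocations and frees in other
	 * processes cannot race with this synchronization)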
	 */
	dir_fd = open(hi->hugedir, O_RDONLY);
	if (dir_fd < 0) {
		RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", __func__,
			hi->hugedir, strerror(errno));
		return -1;
	}
	/* blocking writelock */
	if (flock(dir_fd, LOCK_EX)) {
		RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", __func__,
			hi->hugedir, strerror(errno));
		close(dir_fd);
		return -1;
	}

	/* ensure all allocated space is the same in both lists */
	ret = sync_status(primary_msl, local_msl, hi, msl_idx, true);
	if (ret < 0)
		goto fail;

	/* ensure all unallocated space is the same in both lists */
	ret = sync_status(primary_msl, local_msl, hi, msl_idx, false);
	if (ret < 0)
		goto fail;

	/* update version number */
	local_msl->version = primary_msl->version;

	close(dir_fd);

	return 0;
fail:
	close(dir_fd);
	return -1;
}

static int
sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *primary_msl, *local_msl;
	struct hugepage_info *hi = NULL;
	unsigned int i;
	int msl_idx;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (msl->external)
		return 0;

	msl_idx = msl - mcfg->memsegs;
	primary_msl = &mcfg->memsegs[msl_idx];
	local_msl = &local_memsegs[msl_idx];

	for (i = 0; i < RTE_DIM(internal_conf->hugepage_info); i++) {
		uint64_t cur_sz =
			internal_conf->hugepage_info[i].hugepage_sz;
		uint64_t msl_sz = primary_msl->page_sz;
		if (msl_sz == cur_sz) {
			hi = &internal_conf->hugepage_info[i];
			break;
		}
	}
	if (!hi) {
		RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
		return -1;
	}

	/* if versions don't match, synchronize everything */
	if (local_msl->version != primary_msl->version &&
			sync_existing(primary_msl, local_msl, hi, msl_idx))
		return -1;
	return 0;
}


int
eal_memalloc_sync_with_primary(void)
{
	/* nothing to be done in primary */
	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
		return 0;

	/* memalloc is locked, so it's safe to call thread-unsafe version */
	if (rte_memseg_list_walk_thread_unsafe(sync_walk, NULL))
		return -1;
	return 0;
}

static int
secondary_msl_create_walk(const struct rte_memseg_list *msl,
		void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *primary_msl, *local_msl;
	char name[PATH_MAX];
	int msl_idx, ret;

	if (msl->external)
		return 0;

	msl_idx = msl - mcfg->memsegs;
	primary_msl = &mcfg->memsegs[msl_idx];
	local_msl = &local_memsegs[msl_idx];

	/* create distinct fbarrays for each secondary */
	snprintf(name, RTE_FBARRAY_NAME_LEN, "%s_%i",
		primary_msl->memseg_arr.name, getpid());

	ret = rte_fbarray_init(&local_msl->memseg_arr, name,
		primary_msl->memseg_arr.len,
		primary_msl->memseg_arr.elt_sz);
	if (ret < 0) {
		RTE_LOG(ERR, EAL, "Cannot initialize local memory map\n");
		return -1;
	}
	local_msl->base_va = primary_msl->base_va;
	local_msl->len = primary_msl->len;

	return 0;
}

static int
secondary_msl_destroy_walk(const struct rte_memseg_list *msl,
		void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *local_msl;
	int msl_idx, ret;

	if (msl->external)
		return 0;

	msl_idx = msl - mcfg->memsegs;
	local_msl = &local_memsegs[msl_idx];

	ret = rte_fbarray_destroy(&local_msl->memseg_arr);
	if (ret < 0) {
		RTE_LOG(ERR, EAL, "Cannot destroy local memory map\n");
		return -1;
	}
	local_msl->base_va = NULL;
	local_msl->len = 0;

	return 0;
}

static int
alloc_list(int list_idx, int len)
{
	int *data;
	int i;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* single-file segments mode does not need fd list */
	if (!internal_conf->single_file_segments) {
		/* ensure we have space to store fd per each possible segment */
		data = malloc(sizeof(int) * len);
		if (data == NULL) {
			RTE_LOG(ERR, EAL, "Unable to allocate space for file descriptors\n");
			return -1;
		}
		/* set all fd's as invalid */
		for (i = 0; i < len; i++)
			data[i] = -1;
		fd_list[list_idx].fds = data;
		fd_list[list_idx].len = len;
	} else {
		fd_list[list_idx].fds = NULL;
		fd_list[list_idx].len = 0;
	}

	fd_list[list_idx].count = 0;
	fd_list[list_idx].memseg_list_fd = -1;

	return 0;
}

static int
destroy_list(int list_idx)
{
	const struct internal_config *internal_conf =
			eal_get_internal_configuration();

	/* single-file segments mode does not need fd list */
	if (!internal_conf->single_file_segments) {
		int *fds = fd_list[list_idx].fds;
		int i;
		/* go through each fd and ensure it's closed */
		for (i = 0; i < fd_list[list_idx].len; i++) {
			if (fds[i] >= 0) {
				close(fds[i]);
				fds[i] = -1;
			}
		}
		free(fds);
		fd_list[list_idx].fds = NULL;
		fd_list[list_idx].len = 0;
	} else if (fd_list[list_idx].memseg_list_fd >= 0) {
		close(fd_list[list_idx].memseg_list_fd);
		fd_list[list_idx].count = 0;
		fd_list[list_idx].memseg_list_fd = -1;
	}
	return 0;
}

static int
fd_list_create_walk(const struct rte_memseg_list *msl,
		void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	unsigned int len;
	int msl_idx;

	if (msl->external)
		return 0;

	msl_idx = msl - mcfg->memsegs;
	len = msl->memseg_arr.len;

	return alloc_list(msl_idx, len);
}

static int
fd_list_destroy_walk(const struct rte_memseg_list *msl, void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int msl_idx;

	if (msl->external)
		return 0;

	msl_idx = msl - mcfg->memsegs;

	return destroy_list(msl_idx);
}

int
eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* single file segments mode doesn't support individual segment fd's */
	if (internal_conf->single_file_segments)
		return -ENOTSUP;

	/* if list is not allocated, allocate it */
	if (fd_list[list_idx].len == 0) {
		int len = mcfg->memsegs[list_idx].memseg_arr.len;

		if (alloc_list(list_idx, len) < 0)
			return -ENOMEM;
	}
	fd_list[list_idx].fds[seg_idx] = fd;

	return 0;
}

int
eal_memalloc_set_seg_list_fd(int list_idx, int fd)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* non-single file segment mode doesn't support segment list fd's */
	if (!internal_conf->single_file_segments)
		return -ENOTSUP;

	fd_list[list_idx].memseg_list_fd = fd;

	return 0;
}

int
eal_memalloc_get_seg_fd(int list_idx, int seg_idx)
{
	int fd;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (internal_conf->in_memory || internal_conf->no_hugetlbfs) {
#ifndef MEMFD_SUPPORTED
		/* in in-memory or no-huge mode, we rely on memfd support */
		return -ENOTSUP;
#endif
		/* memfd supported, but hugetlbfs memfd may not be */
		if (!internal_conf->no_hugetlbfs && !memfd_create_supported)
			return -ENOTSUP;
	}

	if (internal_conf->single_file_segments) {
		fd = fd_list[list_idx].memseg_list_fd;
	} else if (fd_list[list_idx].len == 0) {
		/* list not initialized */
		fd = -1;
	} else {
		fd = fd_list[list_idx].fds[seg_idx];
	}
	if (fd < 0)
		return -ENODEV;
	return fd;
}

static int
test_memfd_create(void)
{
#ifdef MEMFD_SUPPORTED
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();
	unsigned int i;
	for (i = 0; i < internal_conf->num_hugepage_sizes; i++) {
		uint64_t pagesz = internal_conf->hugepage_info[i].hugepage_sz;
		int pagesz_flag = pagesz_flags(pagesz);
		int flags;

		flags = pagesz_flag | RTE_MFD_HUGETLB;
		int fd = memfd_create("test", flags);
		if (fd < 0) {
			/* we failed - let memalloc know this isn't working */
			if (errno == EINVAL) {
				memfd_create_supported = 0;
				return 0; /* not supported */
			}

			/* we got other error - something's wrong */
			return -1; /* error */
		}
		close(fd);
		return 1; /* supported */
	}
#endif
	return 0; /* not supported */
}

int
eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (internal_conf->in_memory || internal_conf->no_hugetlbfs) {
#ifndef MEMFD_SUPPORTED
		/* in in-memory or no-huge mode, we rely on memfd support */
		return -ENOTSUP;
#endif
		/* memfd supported, but hugetlbfs memfd may not be */
		if (!internal_conf->no_hugetlbfs && !memfd_create_supported)
			return -ENOTSUP;
	}

	if (internal_conf->single_file_segments) {
		size_t pgsz = mcfg->memsegs[list_idx].page_sz;

		/* segment not active? */
		if (fd_list[list_idx].memseg_list_fd < 0)
			return -ENOENT;
		*offset = pgsz * seg_idx;
	} else {
		/* fd_list not initialized? */
		if (fd_list[list_idx].len == 0)
			return -ENODEV;

		/* segment not active? */
		if (fd_list[list_idx].fds[seg_idx] < 0)
			return -ENOENT;
		*offset = 0;
	}
	return 0;
}

int
eal_memalloc_cleanup(void)
{
	/* close all remaining fd's - these are per-process, so it's safe */
	if (rte_memseg_list_walk_thread_unsafe(fd_list_destroy_walk, NULL))
		return -1;

	/* destroy the shadow page table if we're a secondary process */
	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
		return 0;

	if (rte_memseg_list_walk_thread_unsafe(secondary_msl_destroy_walk,
			NULL))
		return -1;

	return 0;
}

int
eal_memalloc_init(void)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
		if (rte_memseg_list_walk(secondary_msl_create_walk, NULL) < 0)
			return -1;
	if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
			internal_conf->in_memory) {
		int mfd_res = test_memfd_create();

		if (mfd_res < 0) {
			RTE_LOG(ERR, EAL, "Unable to check if memfd is supported\n");
			return -1;
		}
		if (mfd_res == 1)
			RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n");
		else
			RTE_LOG(INFO, EAL, "Using memfd is not supported, falling back to anonymous hugepages\n");

		/* we only support single-file segments mode with in-memory mode
		 * if we support hugetlbfs with memfd_create. this code will
		 * test if we do.
		 */
		if (internal_conf->single_file_segments &&
				mfd_res != 1) {
			RTE_LOG(ERR, EAL, "Single-file segments mode cannot be used without memfd support\n");
			return -1;
		}
		/* this cannot ever happen but better safe than sorry */
		if (!anonymous_hugepages_supported) {
			RTE_LOG(ERR, EAL, "Using anonymous memory is not supported\n");
			return -1;
		}
	}

	/* initialize all of the fd lists */
	if (rte_memseg_list_walk(fd_list_create_walk, NULL))
		return -1;
	return 0;
}