/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2017-2018 Intel Corporation
 */

#include <errno.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/queue.h>
#include <sys/file.h>
#include <unistd.h>
#include <limits.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <signal.h>
#include <setjmp.h>
#ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */
#include <linux/memfd.h>
#define MEMFD_SUPPORTED
#endif
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
#include <numa.h>
#include <numaif.h>
#endif
#include <linux/falloc.h>
#include <linux/mman.h> /* for hugetlb-related mmap flags */

#include <rte_common.h>
#include <rte_log.h>
#include <rte_eal.h>
#include <rte_errno.h>
#include <rte_memory.h>
#include <rte_spinlock.h>

#include "eal_filesystem.h"
#include "eal_internal_cfg.h"
#include "eal_memalloc.h"
#include "eal_memcfg.h"
#include "eal_private.h"

const int anonymous_hugepages_supported =
#ifdef MAP_HUGE_SHIFT
		1;
#define RTE_MAP_HUGE_SHIFT MAP_HUGE_SHIFT
#else
		0;
#define RTE_MAP_HUGE_SHIFT 26
#endif

/*
 * we've already checked memfd support at compile-time, but we also need to
 * check if we can create hugepage files with memfd.
 *
 * also, this is not a constant, because while we may be *compiled* with memfd
 * hugetlbfs support, we might not be *running* on a system that supports memfd
 * and/or memfd with hugetlbfs, so we need to be able to adjust this flag at
 * runtime, and fall back to anonymous memory.
 */
static int memfd_create_supported =
#ifdef MFD_HUGETLB
		1;
#define RTE_MFD_HUGETLB MFD_HUGETLB
#else
		0;
#define RTE_MFD_HUGETLB 4U
#endif

/*
 * not all kernel versions support fallocate on hugetlbfs, so fall back to
 * ftruncate and disallow deallocation if fallocate is not supported.
 */
static int fallocate_supported = -1; /* unknown */

/*
 * we have two modes - single file segments, and file-per-page mode.
 *
 * for single-file segments, we use memseg_list_fd to store the segment fd,
 * while the fds[] will not be allocated, and len will be set to 0.
 *
 * for file-per-page mode, each page will have its own fd, so 'memseg_list_fd'
 * will be invalid (set to -1), and we'll use 'fds' to keep track of page fd's.
 *
 * we cannot know how many pages a system will have in advance, but we do know
 * that they come in lists, and we know lengths of these lists. so, simply store
 * a malloc'd array of fd's indexed by list and segment index.
 *
 * they will be initialized at startup, and filled as we allocate/deallocate
 * segments.
 */
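
/*
 * Illustrative sketch (not part of the original code): with the bookkeeping
 * below, looking up the fd backing segment 'seg_idx' of list 'list_idx'
 * would roughly be
 *
 *   int fd = single_file_segments ?
 *           fd_list[list_idx].memseg_list_fd :
 *           fd_list[list_idx].fds[seg_idx];
 *
 * which is what get_seg_fd() and eal_memalloc_get_seg_fd() do further down.
 */
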
static struct {
	int *fds; /**< dynamically allocated array of segment lock fd's */
	int memseg_list_fd; /**< memseg list fd */
	int len; /**< total length of the array */
	int count; /**< entries used in the array */
} fd_list[RTE_MAX_MEMSEG_LISTS];

/** local copy of a memory map, used to synchronize memory hotplug in MP */
static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS];

static sigjmp_buf huge_jmpenv;

static void huge_sigbus_handler(int signo __rte_unused)
{
	siglongjmp(huge_jmpenv, 1);
}

/* Wrap sigsetjmp in a helper function to avoid compilation errors. Any
 * non-volatile, non-static local variable in the stack frame calling
 * sigsetjmp might be clobbered by a call to longjmp.
 */
static int huge_wrap_sigsetjmp(void)
{
	return sigsetjmp(huge_jmpenv, 1);
}

static struct sigaction huge_action_old;
static int huge_need_recover;

static void
huge_register_sigbus(void)
{
	sigset_t mask;
	struct sigaction action;

	sigemptyset(&mask);
	sigaddset(&mask, SIGBUS);
	action.sa_flags = 0;
	action.sa_mask = mask;
	action.sa_handler = huge_sigbus_handler;

	huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
}

static void
huge_recover_sigbus(void)
{
	if (huge_need_recover) {
		sigaction(SIGBUS, &huge_action_old, NULL);
		huge_need_recover = 0;
	}
}

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
static bool
check_numa(void)
{
	bool ret = true;
	/* Check if kernel supports NUMA. */
	if (numa_available() != 0) {
		RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n");
		ret = false;
	}
	return ret;
}

static void
prepare_numa(int *oldpolicy, struct bitmask *oldmask, int socket_id)
{
	RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
	if (get_mempolicy(oldpolicy, oldmask->maskp,
			oldmask->size + 1, 0, 0) < 0) {
		RTE_LOG(ERR, EAL,
			"Failed to get current mempolicy: %s. "
			"Assuming MPOL_DEFAULT.\n", strerror(errno));
		*oldpolicy = MPOL_DEFAULT;
	}
	RTE_LOG(DEBUG, EAL,
		"Setting policy MPOL_PREFERRED for socket %d\n",
		socket_id);
	numa_set_preferred(socket_id);
}

static void
restore_numa(int *oldpolicy, struct bitmask *oldmask)
{
	RTE_LOG(DEBUG, EAL,
		"Restoring previous memory policy: %d\n", *oldpolicy);
	if (*oldpolicy == MPOL_DEFAULT) {
		numa_set_localalloc();
	} else if (set_mempolicy(*oldpolicy, oldmask->maskp,
			oldmask->size + 1) < 0) {
		RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n",
			strerror(errno));
		numa_set_localalloc();
	}
	numa_free_cpumask(oldmask);
}
#endif

/*
 * uses fstat to report the size of a file on disk
 */
static off_t
get_file_size(int fd)
{
	struct stat st;
	if (fstat(fd, &st) < 0)
		return 0;
	return st.st_size;
}

static int
pagesz_flags(uint64_t page_sz)
{
	/* as per mmap() manpage, all page sizes are log2 of page size
	 * shifted by MAP_HUGE_SHIFT
	 */
	int log2 = rte_log2_u64(page_sz);
	return log2 << RTE_MAP_HUGE_SHIFT;
}

/* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */
static int lock(int fd, int type)
{
	int ret;

	/* flock may be interrupted */
	do {
		ret = flock(fd, type | LOCK_NB);
	} while (ret && errno == EINTR);

	if (ret && errno == EWOULDBLOCK) {
		/* couldn't lock */
		return 0;
	} else if (ret) {
		RTE_LOG(ERR, EAL, "%s(): error calling flock(): %s\n",
			__func__, strerror(errno));
		return -1;
	}
	/* lock was successful */
	return 1;
}

static int
get_seg_memfd(struct hugepage_info *hi __rte_unused,
		unsigned int list_idx __rte_unused,
		unsigned int seg_idx __rte_unused)
{
#ifdef MEMFD_SUPPORTED
	int fd;
	char segname[250]; /* as per manpage, limit is 249 bytes plus null */

	int flags = RTE_MFD_HUGETLB | pagesz_flags(hi->hugepage_sz);
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (internal_conf->single_file_segments) {
		fd = fd_list[list_idx].memseg_list_fd;

		if (fd < 0) {
			snprintf(segname, sizeof(segname), "seg_%i", list_idx);
			fd = memfd_create(segname, flags);
			if (fd < 0) {
				RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n",
					__func__, strerror(errno));
				return -1;
			}
			fd_list[list_idx].memseg_list_fd = fd;
		}
	} else {
		fd = fd_list[list_idx].fds[seg_idx];

		if (fd < 0) {
			snprintf(segname, sizeof(segname), "seg_%i-%i",
					list_idx, seg_idx);
			fd = memfd_create(segname, flags);
			if (fd < 0) {
				RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n",
					__func__, strerror(errno));
				return -1;
			}
			fd_list[list_idx].fds[seg_idx] = fd;
		}
	}
	return fd;
#endif
	return -1;
}
static int
get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
		unsigned int list_idx, unsigned int seg_idx,
		bool *dirty)
{
	int fd;
	int *out_fd;
	struct stat st;
	int ret;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (dirty != NULL)
		*dirty = false;

	/* for in-memory mode, we only make it here when we're sure we support
	 * memfd, and this is a special case.
	 */
	if (internal_conf->in_memory)
		return get_seg_memfd(hi, list_idx, seg_idx);

	if (internal_conf->single_file_segments) {
		out_fd = &fd_list[list_idx].memseg_list_fd;
		eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx);
	} else {
		out_fd = &fd_list[list_idx].fds[seg_idx];
		eal_get_hugefile_path(path, buflen, hi->hugedir,
				list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
	}
	fd = *out_fd;
	if (fd >= 0)
		return fd;

	/*
	 * There is no TOCTOU between stat() and unlink()/open()
	 * because the hugepage directory is locked.
	 */
	ret = stat(path, &st);
	if (ret < 0 && errno != ENOENT) {
		RTE_LOG(DEBUG, EAL, "%s(): stat() for '%s' failed: %s\n",
			__func__, path, strerror(errno));
		return -1;
	}
	if (!internal_conf->hugepage_file.unlink_existing && ret == 0 &&
			dirty != NULL)
		*dirty = true;

	/*
	 * The kernel clears a hugepage only when it is mapped
	 * from a particular file for the first time.
	 * If the file already exists, the old content will be mapped.
	 * If the memory manager assumes all mapped pages to be clean,
	 * the file must be removed and created anew.
	 * Otherwise, the primary caller must be notified
	 * that mapped pages will be dirty
	 * (secondary callers receive the segment state from the primary one).
	 * When multiple hugepages are mapped from the same file,
	 * whether they will be dirty depends on the part that is mapped.
	 */
	if (!internal_conf->single_file_segments &&
			internal_conf->hugepage_file.unlink_existing &&
			rte_eal_process_type() == RTE_PROC_PRIMARY &&
			ret == 0) {
		/* coverity[toctou] */
		if (unlink(path) < 0) {
			RTE_LOG(DEBUG, EAL, "%s(): could not remove '%s': %s\n",
				__func__, path, strerror(errno));
			return -1;
		}
	}

	/* coverity[toctou] */
	fd = open(path, O_CREAT | O_RDWR, 0600);
	if (fd < 0) {
		RTE_LOG(ERR, EAL, "%s(): open '%s' failed: %s\n",
			__func__, path, strerror(errno));
		return -1;
	}
	/* take out a read lock */
	if (lock(fd, LOCK_SH) < 0) {
		RTE_LOG(ERR, EAL, "%s(): lock '%s' failed: %s\n",
			__func__, path, strerror(errno));
		close(fd);
		return -1;
	}
	*out_fd = fd;
	return fd;
}

static int
resize_hugefile_in_memory(int fd, uint64_t fa_offset,
		uint64_t page_sz, bool grow)
{
	int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
			FALLOC_FL_KEEP_SIZE;
	int ret;

	/* grow or shrink the file */
	ret = fallocate(fd, flags, fa_offset, page_sz);

	if (ret < 0) {
		RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
				__func__,
				strerror(errno));
		return -1;
	}
	return 0;
}
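
/*
 * Illustrative note (not part of the original code): in single-file segments
 * mode, the page at index 'seg_idx' occupies the byte range
 * [seg_idx * page_sz, (seg_idx + 1) * page_sz) of the segment file, so the
 * resize helpers above and below grow it with
 * fallocate(fd, 0, offset, page_sz) and shrink it with
 * fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, page_sz).
 */
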
static int
resize_hugefile_in_filesystem(int fd, uint64_t fa_offset, uint64_t page_sz,
		bool grow, bool *dirty)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();
	bool again = false;

	do {
		if (fallocate_supported == 0) {
			/* we cannot deallocate memory if fallocate() is not
			 * supported, and hugepage file is already locked at
			 * creation, so no further synchronization needed.
			 */

			if (!grow) {
				RTE_LOG(DEBUG, EAL, "%s(): fallocate not supported, not freeing page back to the system\n",
					__func__);
				return -1;
			}
			uint64_t new_size = fa_offset + page_sz;
			uint64_t cur_size = get_file_size(fd);

			/* fallocate isn't supported, fall back to ftruncate */
			if (dirty != NULL)
				*dirty = new_size <= cur_size;
			if (new_size > cur_size &&
					ftruncate(fd, new_size) < 0) {
				RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
					__func__, strerror(errno));
				return -1;
			}
		} else {
			int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
					FALLOC_FL_KEEP_SIZE;
			int ret;

			/*
			 * technically, it is perfectly safe for both primary
			 * and secondary to grow and shrink the page files:
			 * growing the file repeatedly has no effect because
			 * a page can only be allocated once, while mmap ensures
			 * that secondaries hold on to the page even after the
			 * page itself is removed from the filesystem.
			 *
			 * however, letting anyone but the primary grow or
			 * shrink the files tends to expose bugs in fd list
			 * page count handling, so restrict this to the
			 * primary just in case.
			 */
			if (rte_eal_process_type() != RTE_PROC_PRIMARY)
				return 0;

			/* grow or shrink the file */
			ret = fallocate(fd, flags, fa_offset, page_sz);

			if (ret < 0) {
				if (fallocate_supported == -1 &&
						errno == ENOTSUP) {
					RTE_LOG(ERR, EAL, "%s(): fallocate() not supported, hugepage deallocation will be disabled\n",
						__func__);
					again = true;
					fallocate_supported = 0;
				} else {
					RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
						__func__,
						strerror(errno));
					return -1;
				}
			} else {
				fallocate_supported = 1;
				/*
				 * It is unknown which portions of an existing
				 * hugepage file were allocated previously,
				 * so all pages within the file are considered
				 * dirty, unless the file is a fresh one.
				 */
				if (dirty != NULL)
					*dirty &= !internal_conf->hugepage_file.unlink_existing;
			}
		}
	} while (again);

	return 0;
}

static void
close_hugefile(int fd, char *path, int list_idx)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();
	/*
	 * primary process must unlink the file, but only when not in in-memory
	 * mode (as in that case there is no file to unlink).
	 */
	if (!internal_conf->in_memory &&
			rte_eal_process_type() == RTE_PROC_PRIMARY &&
			unlink(path))
		RTE_LOG(ERR, EAL, "%s(): unlinking '%s' failed: %s\n",
			__func__, path, strerror(errno));

	close(fd);
	fd_list[list_idx].memseg_list_fd = -1;
}
static int
resize_hugefile(int fd, uint64_t fa_offset, uint64_t page_sz, bool grow,
		bool *dirty)
{
	/* in-memory mode is a special case, because we can be sure that
	 * fallocate() is supported.
	 */
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (internal_conf->in_memory) {
		if (dirty != NULL)
			*dirty = false;
		return resize_hugefile_in_memory(fd, fa_offset,
				page_sz, grow);
	}

	return resize_hugefile_in_filesystem(fd, fa_offset, page_sz,
			grow, dirty);
}

static int
alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
		struct hugepage_info *hi, unsigned int list_idx,
		unsigned int seg_idx)
{
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	int cur_socket_id = 0;
#endif
	uint64_t map_offset;
	rte_iova_t iova;
	void *va;
	char path[PATH_MAX];
	int ret = 0;
	int fd;
	bool dirty;
	size_t alloc_sz;
	int flags;
	void *new_addr;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	alloc_sz = hi->hugepage_sz;

	/* these are checked at init, but code analyzers don't know that */
	if (internal_conf->in_memory && !anonymous_hugepages_supported) {
		RTE_LOG(ERR, EAL, "Anonymous hugepages not supported, in-memory mode cannot allocate memory\n");
		return -1;
	}
	if (internal_conf->in_memory && !memfd_create_supported &&
			internal_conf->single_file_segments) {
		RTE_LOG(ERR, EAL, "Single-file segments are not supported without memfd support\n");
		return -1;
	}

	/* in-memory without memfd is a special case */
	int mmap_flags;

	if (internal_conf->in_memory && !memfd_create_supported) {
		const int in_memory_flags = MAP_HUGETLB | MAP_FIXED |
				MAP_PRIVATE | MAP_ANONYMOUS;
		int pagesz_flag;

		pagesz_flag = pagesz_flags(alloc_sz);
		fd = -1;
		dirty = false;
		mmap_flags = in_memory_flags | pagesz_flag;

		/* single-file segments codepath will never be active
		 * here because in-memory mode is incompatible with the
		 * fallback path, and it's stopped at EAL initialization
		 * stage.
		 */
		map_offset = 0;
	} else {
		/* takes out a read lock on segment or segment list */
		fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx,
				&dirty);
		if (fd < 0) {
			RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n");
			return -1;
		}

		if (internal_conf->single_file_segments) {
			map_offset = seg_idx * alloc_sz;
			ret = resize_hugefile(fd, map_offset, alloc_sz, true,
					&dirty);
			if (ret < 0)
				goto resized;

			fd_list[list_idx].count++;
		} else {
			map_offset = 0;
			if (ftruncate(fd, alloc_sz) < 0) {
				RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
					__func__, strerror(errno));
				goto resized;
			}
			if (internal_conf->hugepage_file.unlink_before_mapping &&
					!internal_conf->in_memory) {
				if (unlink(path)) {
					RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n",
						__func__, strerror(errno));
					goto resized;
				}
			}
		}
		mmap_flags = MAP_SHARED | MAP_POPULATE | MAP_FIXED;
	}

	huge_register_sigbus();

	/*
	 * map the segment and populate page tables; the kernel fills
	 * this segment with zeros if it's a new page.
	 */
	va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, mmap_flags, fd,
			map_offset);

	if (va == MAP_FAILED) {
		RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
			strerror(errno));
		/* mmap failed, but the previous region might have been
		 * unmapped anyway. try to remap it
		 */
		goto unmapped;
	}
	if (va != addr) {
		RTE_LOG(DEBUG, EAL, "%s(): wrong mmap() address\n", __func__);
		munmap(va, alloc_sz);
		goto resized;
	}

	/* In Linux, hugetlb limitations like cgroups are
	 * enforced at fault time instead of mmap(), even
	 * with the option of MAP_POPULATE. The kernel will send
	 * a SIGBUS signal. To avoid being killed, save the stack
	 * environment here; if SIGBUS happens, we can jump
	 * back here.
	 */
	if (huge_wrap_sigsetjmp()) {
		RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more hugepages of size %uMB\n",
			(unsigned int)(alloc_sz >> 20));
		goto mapped;
	}

	/* we need to trigger a write to the page to enforce page fault and
	 * ensure that page is accessible to us, but we can't overwrite value
	 * that is already there, so read the old value, and write it back.
	 * kernel populates the page with zeroes initially.
	 */
	*(volatile int *)addr = *(volatile int *)addr;

	iova = rte_mem_virt2iova(addr);
	if (iova == RTE_BAD_PHYS_ADDR) {
		RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n",
			__func__);
		goto mapped;
	}

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	/*
	 * If the kernel has been built without NUMA support, get_mempolicy()
	 * will return an error. If check_numa() returns false, memory
	 * allocation is not NUMA aware and the socket_id should not be
	 * checked.
	 */
	if (check_numa()) {
		ret = get_mempolicy(&cur_socket_id, NULL, 0, addr,
					MPOL_F_NODE | MPOL_F_ADDR);
		if (ret < 0) {
			RTE_LOG(DEBUG, EAL, "%s(): get_mempolicy: %s\n",
				__func__, strerror(errno));
			goto mapped;
		} else if (cur_socket_id != socket_id) {
			RTE_LOG(DEBUG, EAL,
					"%s(): allocation happened on wrong socket (wanted %d, got %d)\n",
					__func__, socket_id, cur_socket_id);
			goto mapped;
		}
	}
#else
	if (rte_socket_count() > 1)
		RTE_LOG(DEBUG, EAL, "%s(): not checking hugepage NUMA node.\n",
				__func__);
#endif

	huge_recover_sigbus();

	ms->addr = addr;
	ms->hugepage_sz = alloc_sz;
	ms->len = alloc_sz;
	ms->nchannel = rte_memory_get_nchannel();
	ms->nrank = rte_memory_get_nrank();
	ms->iova = iova;
	ms->socket_id = socket_id;
	ms->flags = dirty ? RTE_MEMSEG_FLAG_DIRTY : 0;

	return 0;

mapped:
	munmap(addr, alloc_sz);
unmapped:
	huge_recover_sigbus();
	flags = EAL_RESERVE_FORCE_ADDRESS;
	new_addr = eal_get_virtual_area(addr, &alloc_sz, alloc_sz, 0, flags);
	if (new_addr != addr) {
		if (new_addr != NULL)
			munmap(new_addr, alloc_sz);
		/* we're leaving a hole in our virtual address space. if
		 * somebody else maps this hole now, we could accidentally
		 * overwrite it in the future.
		 */
		RTE_LOG(CRIT, EAL, "Can't mmap holes in our virtual address space\n");
	}
	/* roll back the ref count */
	if (internal_conf->single_file_segments)
		fd_list[list_idx].count--;
resized:
	/* some codepaths will return negative fd, so exit early */
	if (fd < 0)
		return -1;

	if (internal_conf->single_file_segments) {
		resize_hugefile(fd, map_offset, alloc_sz, false, NULL);
		/* ignore failure, can't make it any worse */

		/* if refcount is at zero, close the file */
		if (fd_list[list_idx].count == 0)
			close_hugefile(fd, path, list_idx);
	} else {
		/* only remove file if we can take out a write lock */
		if (!internal_conf->hugepage_file.unlink_before_mapping &&
				internal_conf->in_memory == 0 &&
				lock(fd, LOCK_EX) == 1)
			unlink(path);
		close(fd);
		fd_list[list_idx].fds[seg_idx] = -1;
	}
	return -1;
}

static int
free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
		unsigned int list_idx, unsigned int seg_idx)
{
	uint64_t map_offset;
	char path[PATH_MAX];
	int fd, ret = 0;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* erase page data */
	memset(ms->addr, 0, ms->len);

	if (mmap(ms->addr, ms->len, PROT_NONE,
			MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) ==
				MAP_FAILED) {
		RTE_LOG(DEBUG, EAL, "couldn't unmap page\n");
		return -1;
	}

	eal_mem_set_dump(ms->addr, ms->len, false);

	/* if we're using anonymous hugepages, nothing to be done */
	if (internal_conf->in_memory && !memfd_create_supported) {
		memset(ms, 0, sizeof(*ms));
		return 0;
	}

	/* if we are not in single file segments mode, we're going to unmap the
	 * segment and thus drop the lock on original fd, but hugepage dir is
	 * now locked so we can take out another one without races.
	 */
	fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx, NULL);
	if (fd < 0)
		return -1;

	if (internal_conf->single_file_segments) {
		map_offset = seg_idx * ms->len;
		if (resize_hugefile(fd, map_offset, ms->len, false, NULL))
			return -1;

		if (--(fd_list[list_idx].count) == 0)
			close_hugefile(fd, path, list_idx);

		ret = 0;
	} else {
		/* if we're able to take out a write lock, we're the last one
		 * holding onto this page.
		 */
		if (!internal_conf->in_memory &&
				internal_conf->hugepage_file.unlink_existing &&
				!internal_conf->hugepage_file.unlink_before_mapping) {
			ret = lock(fd, LOCK_EX);
			if (ret >= 0) {
				/* no one else is using this page */
				if (ret == 1)
					unlink(path);
			}
		}
		/* closing fd will drop the lock */
		close(fd);
		fd_list[list_idx].fds[seg_idx] = -1;
	}

	memset(ms, 0, sizeof(*ms));

	return ret < 0 ? -1 : 0;
}
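
/*
 * Illustrative note (not part of the original code): the *_walk() callbacks
 * below follow the usual rte_memseg_list_walk() convention - return 0 to
 * continue iterating, a positive value to stop successfully, or a negative
 * value to stop with an error - which is why alloc_seg_walk() returns
 * 'i > 0' once it has allocated something.
 */
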
struct alloc_walk_param {
	struct hugepage_info *hi;
	struct rte_memseg **ms;
	size_t page_sz;
	unsigned int segs_allocated;
	unsigned int n_segs;
	int socket;
	bool exact;
};
static int
alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct alloc_walk_param *wa = arg;
	struct rte_memseg_list *cur_msl;
	size_t page_sz;
	int cur_idx, start_idx, j, dir_fd = -1;
	unsigned int msl_idx, need, i;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (msl->page_sz != wa->page_sz)
		return 0;
	if (msl->socket_id != wa->socket)
		return 0;

	page_sz = (size_t)msl->page_sz;

	msl_idx = msl - mcfg->memsegs;
	cur_msl = &mcfg->memsegs[msl_idx];

	need = wa->n_segs;

	/* try finding space in memseg list */
	if (wa->exact) {
		/* if we require exact number of pages in a list, find them */
		cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0,
				need);
		if (cur_idx < 0)
			return 0;
		start_idx = cur_idx;
	} else {
		int cur_len;

		/* we don't require exact number of pages, so we're going to go
		 * for best-effort allocation. that means finding the biggest
		 * unused block, and going with that.
		 */
		cur_idx = rte_fbarray_find_biggest_free(&cur_msl->memseg_arr,
				0);
		if (cur_idx < 0)
			return 0;
		start_idx = cur_idx;
		/* adjust the size to possibly be smaller than original
		 * request, but do not allow it to be bigger.
		 */
		cur_len = rte_fbarray_find_contig_free(&cur_msl->memseg_arr,
				cur_idx);
		need = RTE_MIN(need, (unsigned int)cur_len);
	}

	/* do not allow any page allocations during the time we're allocating,
	 * because file creation and locking operations are not atomic,
	 * and we might be the first or the last ones to use a particular page,
	 * so we need to ensure atomicity of every operation.
	 *
	 * during init, we already hold a write lock, so don't try to take out
	 * another one.
	 */
	if (wa->hi->lock_descriptor == -1 && !internal_conf->in_memory) {
		dir_fd = open(wa->hi->hugedir, O_RDONLY);
		if (dir_fd < 0) {
			RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
				__func__, wa->hi->hugedir, strerror(errno));
			return -1;
		}
		/* blocking writelock */
		if (flock(dir_fd, LOCK_EX)) {
			RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n",
				__func__, wa->hi->hugedir, strerror(errno));
			close(dir_fd);
			return -1;
		}
	}

	for (i = 0; i < need; i++, cur_idx++) {
		struct rte_memseg *cur;
		void *map_addr;

		cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx);
		map_addr = RTE_PTR_ADD(cur_msl->base_va,
				cur_idx * page_sz);

		if (alloc_seg(cur, map_addr, wa->socket, wa->hi,
				msl_idx, cur_idx)) {
			RTE_LOG(DEBUG, EAL, "attempted to allocate %i segments, but only %i were allocated\n",
				need, i);

			/* if exact number wasn't requested, stop */
			if (!wa->exact)
				goto out;

			/* clean up */
			for (j = start_idx; j < cur_idx; j++) {
				struct rte_memseg *tmp;
				struct rte_fbarray *arr =
						&cur_msl->memseg_arr;

				tmp = rte_fbarray_get(arr, j);
				rte_fbarray_set_free(arr, j);

				/* free_seg may attempt to create a file, which
				 * may fail.
				 */
				if (free_seg(tmp, wa->hi, msl_idx, j))
					RTE_LOG(DEBUG, EAL, "Cannot free page\n");
			}
			/* clear the list */
			if (wa->ms)
				memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs);

			if (dir_fd >= 0)
				close(dir_fd);
			return -1;
		}
		if (wa->ms)
			wa->ms[i] = cur;

		rte_fbarray_set_used(&cur_msl->memseg_arr, cur_idx);
	}
out:
	wa->segs_allocated = i;
	if (i > 0)
		cur_msl->version++;
	if (dir_fd >= 0)
		close(dir_fd);
	/* if we didn't allocate any segments, move on to the next list */
	return i > 0;
}

struct free_walk_param {
	struct hugepage_info *hi;
	struct rte_memseg *ms;
};
static int
free_seg_walk(const struct rte_memseg_list *msl, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *found_msl;
	struct free_walk_param *wa = arg;
	uintptr_t start_addr, end_addr;
	int msl_idx, seg_idx, ret, dir_fd = -1;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	start_addr = (uintptr_t) msl->base_va;
	end_addr = start_addr + msl->len;

	if ((uintptr_t)wa->ms->addr < start_addr ||
			(uintptr_t)wa->ms->addr >= end_addr)
		return 0;

	msl_idx = msl - mcfg->memsegs;
	seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz;

	/* msl is const */
	found_msl = &mcfg->memsegs[msl_idx];

	/* do not allow any page allocations during the time we're freeing,
	 * because file creation and locking operations are not atomic,
	 * and we might be the first or the last ones to use a particular page,
	 * so we need to ensure atomicity of every operation.
	 *
	 * during init, we already hold a write lock, so don't try to take out
	 * another one.
	 */
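	/*
	 * Illustrative sketch (not part of the original code) of the hugepage
	 * directory lock protocol used below, in alloc_seg_walk() and in
	 * sync_existing():
	 *
	 *   dir_fd = open(hugedir, O_RDONLY);
	 *   flock(dir_fd, LOCK_EX);    // block out concurrent alloc/free
	 *   ... allocate or free pages ...
	 *   close(dir_fd);             // releases the lock
	 */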
	if (wa->hi->lock_descriptor == -1 && !internal_conf->in_memory) {
		dir_fd = open(wa->hi->hugedir, O_RDONLY);
		if (dir_fd < 0) {
			RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
				__func__, wa->hi->hugedir, strerror(errno));
			return -1;
		}
		/* blocking writelock */
		if (flock(dir_fd, LOCK_EX)) {
			RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n",
				__func__, wa->hi->hugedir, strerror(errno));
			close(dir_fd);
			return -1;
		}
	}

	found_msl->version++;

	rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx);

	ret = free_seg(wa->ms, wa->hi, msl_idx, seg_idx);

	if (dir_fd >= 0)
		close(dir_fd);

	if (ret < 0)
		return -1;

	return 1;
}

int
eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz,
		int socket, bool exact)
{
	int i, ret = -1;
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	bool have_numa = false;
	int oldpolicy;
	struct bitmask *oldmask;
#endif
	struct alloc_walk_param wa;
	struct hugepage_info *hi = NULL;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	memset(&wa, 0, sizeof(wa));

	/* dynamic allocation not supported in legacy mode */
	if (internal_conf->legacy_mem)
		return -1;

	for (i = 0; i < (int) RTE_DIM(internal_conf->hugepage_info); i++) {
		if (page_sz ==
				internal_conf->hugepage_info[i].hugepage_sz) {
			hi = &internal_conf->hugepage_info[i];
			break;
		}
	}
	if (!hi) {
		RTE_LOG(ERR, EAL, "%s(): can't find relevant hugepage_info entry\n",
			__func__);
		return -1;
	}

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	if (check_numa()) {
		oldmask = numa_allocate_nodemask();
		prepare_numa(&oldpolicy, oldmask, socket);
		have_numa = true;
	}
#endif

	wa.exact = exact;
	wa.hi = hi;
	wa.ms = ms;
	wa.n_segs = n_segs;
	wa.page_sz = page_sz;
	wa.socket = socket;
	wa.segs_allocated = 0;

	/* memalloc is locked, so it's safe to use thread-unsafe version */
	ret = rte_memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa);
	if (ret == 0) {
		RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n",
			__func__);
		ret = -1;
	} else if (ret > 0) {
		ret = (int)wa.segs_allocated;
	}

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	if (have_numa)
		restore_numa(&oldpolicy, oldmask);
#endif
	return ret;
}

struct rte_memseg *
eal_memalloc_alloc_seg(size_t page_sz, int socket)
{
	struct rte_memseg *ms;
	if (eal_memalloc_alloc_seg_bulk(&ms, 1, page_sz, socket, true) < 0)
		return NULL;
	/* return pointer to newly allocated memseg */
	return ms;
}

int
eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs)
{
	int seg, ret = 0;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* dynamic free not supported in legacy mode */
	if (internal_conf->legacy_mem)
		return -1;

	for (seg = 0; seg < n_segs; seg++) {
		struct rte_memseg *cur = ms[seg];
		struct hugepage_info *hi = NULL;
		struct free_walk_param wa;
		int i, walk_res;

		/* if this page is marked as unfreeable, fail */
		if (cur->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) {
			RTE_LOG(DEBUG, EAL, "Page is not allowed to be freed\n");
			ret = -1;
			continue;
		}

		memset(&wa, 0, sizeof(wa));
		for (i = 0; i < (int)RTE_DIM(internal_conf->hugepage_info);
				i++) {
			hi = &internal_conf->hugepage_info[i];
			if (cur->hugepage_sz == hi->hugepage_sz)
				break;
		}
		if (i == (int)RTE_DIM(internal_conf->hugepage_info)) {
			RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
			ret = -1;
			continue;
		}

		wa.ms = cur;
		wa.hi = hi;

		/* memalloc is locked, so it's safe to use thread-unsafe version
		 */
		walk_res = rte_memseg_list_walk_thread_unsafe(free_seg_walk,
				&wa);
		if (walk_res == 1)
			continue;
		if (walk_res == 0)
			RTE_LOG(ERR, EAL, "Couldn't find memseg list\n");
		ret = -1;
	}
	return ret;
}

int
eal_memalloc_free_seg(struct rte_memseg *ms)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* dynamic free not supported in legacy mode */
	if (internal_conf->legacy_mem)
		return -1;

	return eal_memalloc_free_seg_bulk(&ms, 1);
}

static int
sync_chunk(struct rte_memseg_list *primary_msl,
		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
		unsigned int msl_idx, bool used, int start, int end)
{
	struct rte_fbarray *l_arr, *p_arr;
	int i, ret, chunk_len, diff_len;

	l_arr = &local_msl->memseg_arr;
	p_arr = &primary_msl->memseg_arr;

	/* we need to aggregate allocations/deallocations into bigger chunks,
	 * as we don't want to spam the user with per-page callbacks.
	 *
	 * to avoid any potential issues, we also want to trigger
	 * deallocation callbacks *before* we actually deallocate
	 * memory, so that the user application could wrap up its use
	 * before it goes away.
	 */

	chunk_len = end - start;

	/* find how many contiguous pages we can map/unmap for this chunk */
	diff_len = used ?
			rte_fbarray_find_contig_free(l_arr, start) :
			rte_fbarray_find_contig_used(l_arr, start);

	/* has to be at least one page */
	if (diff_len < 1)
		return -1;

	diff_len = RTE_MIN(chunk_len, diff_len);

	/* if we are freeing memory, notify the application */
	if (!used) {
		struct rte_memseg *ms;
		void *start_va;
		size_t len, page_sz;

		ms = rte_fbarray_get(l_arr, start);
		start_va = ms->addr;
		page_sz = (size_t)primary_msl->page_sz;
		len = page_sz * diff_len;

		eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE,
				start_va, len);
	}

	for (i = 0; i < diff_len; i++) {
		struct rte_memseg *p_ms, *l_ms;
		int seg_idx = start + i;

		l_ms = rte_fbarray_get(l_arr, seg_idx);
		p_ms = rte_fbarray_get(p_arr, seg_idx);

		if (l_ms == NULL || p_ms == NULL)
			return -1;

		if (used) {
			ret = alloc_seg(l_ms, p_ms->addr,
					p_ms->socket_id, hi,
					msl_idx, seg_idx);
			if (ret < 0)
				return -1;
			rte_fbarray_set_used(l_arr, seg_idx);
		} else {
			ret = free_seg(l_ms, hi, msl_idx, seg_idx);
			rte_fbarray_set_free(l_arr, seg_idx);
			if (ret < 0)
				return -1;
		}
	}

	/* if we just allocated memory, notify the application */
	if (used) {
		struct rte_memseg *ms;
		void *start_va;
		size_t len, page_sz;

		ms = rte_fbarray_get(l_arr, start);
		start_va = ms->addr;
		page_sz = (size_t)primary_msl->page_sz;
		len = page_sz * diff_len;

		eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC,
				start_va, len);
	}

	/* calculate how much we can advance until next chunk */
	diff_len = used ?
			rte_fbarray_find_contig_used(l_arr, start) :
			rte_fbarray_find_contig_free(l_arr, start);
	ret = RTE_MIN(chunk_len, diff_len);

	return ret;
}

static int
sync_status(struct rte_memseg_list *primary_msl,
		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
		unsigned int msl_idx, bool used)
{
	struct rte_fbarray *l_arr, *p_arr;
	int p_idx, l_chunk_len, p_chunk_len, ret;
	int start, end;

	/* this is a little bit tricky, but the basic idea is - walk both lists
	 * and spot any places where there are discrepancies. walking both lists
	 * and noting discrepancies in a single go is a hard problem, so we do
	 * it in two passes - first we spot any places where allocated segments
	 * mismatch (i.e. ensure that everything that's allocated in the primary
	 * is also allocated in the secondary), and then we do it by looking at
	 * free segments instead.
	 *
	 * we also need to aggregate changes into chunks, as we have to call
	 * callbacks per allocation, not per page.
	 */
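	/*
	 * Illustrative example (not part of the original code): if the primary
	 * map is [U U U F F] and the local map is [U F F F F] (U = used,
	 * F = free), the first pass (used) spots the mismatch at index 1 and
	 * calls sync_chunk() to map segments 1-2 locally; the second pass
	 * (free) then finds no differences, since everything free in the
	 * primary is already free locally.
	 */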
	l_arr = &local_msl->memseg_arr;
	p_arr = &primary_msl->memseg_arr;

	if (used)
		p_idx = rte_fbarray_find_next_used(p_arr, 0);
	else
		p_idx = rte_fbarray_find_next_free(p_arr, 0);

	while (p_idx >= 0) {
		int next_chunk_search_idx;

		if (used) {
			p_chunk_len = rte_fbarray_find_contig_used(p_arr,
					p_idx);
			l_chunk_len = rte_fbarray_find_contig_used(l_arr,
					p_idx);
		} else {
			p_chunk_len = rte_fbarray_find_contig_free(p_arr,
					p_idx);
			l_chunk_len = rte_fbarray_find_contig_free(l_arr,
					p_idx);
		}
		/* best case scenario - no differences (or bigger, which will be
		 * fixed during next iteration), look for next chunk
		 */
		if (l_chunk_len >= p_chunk_len) {
			next_chunk_search_idx = p_idx + p_chunk_len;
			goto next_chunk;
		}

		/* if both chunks start at the same point, skip parts we know
		 * are identical, and sync the rest. each call to sync_chunk
		 * will only sync contiguous segments, so we need to call this
		 * until we are sure there are no more differences in this
		 * chunk.
		 */
		start = p_idx + l_chunk_len;
		end = p_idx + p_chunk_len;
		do {
			ret = sync_chunk(primary_msl, local_msl, hi, msl_idx,
					used, start, end);
			start += ret;
		} while (start < end && ret >= 0);
		/* if ret is negative, something went wrong */
		if (ret < 0)
			return -1;

		next_chunk_search_idx = p_idx + p_chunk_len;
next_chunk:
		/* skip to end of this chunk */
		if (used) {
			p_idx = rte_fbarray_find_next_used(p_arr,
					next_chunk_search_idx);
		} else {
			p_idx = rte_fbarray_find_next_free(p_arr,
					next_chunk_search_idx);
		}
	}
	return 0;
}

static int
sync_existing(struct rte_memseg_list *primary_msl,
		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
		unsigned int msl_idx)
{
	int ret, dir_fd;

	/* do not allow any page allocations during the time we're allocating,
	 * because file creation and locking operations are not atomic,
	 * and we might be the first or the last ones to use a particular page,
	 * so we need to ensure atomicity of every operation.
	 */
	dir_fd = open(hi->hugedir, O_RDONLY);
	if (dir_fd < 0) {
		RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", __func__,
			hi->hugedir, strerror(errno));
		return -1;
	}
	/* blocking writelock */
	if (flock(dir_fd, LOCK_EX)) {
		RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", __func__,
			hi->hugedir, strerror(errno));
		close(dir_fd);
		return -1;
	}

	/* ensure all allocated space is the same in both lists */
	ret = sync_status(primary_msl, local_msl, hi, msl_idx, true);
	if (ret < 0)
		goto fail;

	/* ensure all unallocated space is the same in both lists */
	ret = sync_status(primary_msl, local_msl, hi, msl_idx, false);
	if (ret < 0)
		goto fail;

	/* update version number */
	local_msl->version = primary_msl->version;

	close(dir_fd);

	return 0;
fail:
	close(dir_fd);
	return -1;
}

static int
sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *primary_msl, *local_msl;
	struct hugepage_info *hi = NULL;
	unsigned int i;
	int msl_idx;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (msl->external)
		return 0;

	msl_idx = msl - mcfg->memsegs;
	primary_msl = &mcfg->memsegs[msl_idx];
	local_msl = &local_memsegs[msl_idx];

	for (i = 0; i < RTE_DIM(internal_conf->hugepage_info); i++) {
		uint64_t cur_sz =
			internal_conf->hugepage_info[i].hugepage_sz;
		uint64_t msl_sz = primary_msl->page_sz;
		if (msl_sz == cur_sz) {
			hi = &internal_conf->hugepage_info[i];
			break;
		}
	}
	if (!hi) {
		RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
		return -1;
	}

	/* if versions don't match, synchronize everything */
	if (local_msl->version != primary_msl->version &&
			sync_existing(primary_msl, local_msl, hi, msl_idx))
		return -1;
	return 0;
}

int
eal_memalloc_sync_with_primary(void)
{
	/* nothing to be done in primary */
	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
		return 0;

	/* memalloc is locked, so it's safe to call thread-unsafe version */
	if (rte_memseg_list_walk_thread_unsafe(sync_walk, NULL))
		return -1;
	return 0;
}

static int
secondary_msl_create_walk(const struct rte_memseg_list *msl,
		void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *primary_msl, *local_msl;
	char name[PATH_MAX];
	int msl_idx, ret;

	if (msl->external)
		return 0;

	msl_idx = msl - mcfg->memsegs;
	primary_msl = &mcfg->memsegs[msl_idx];
	local_msl = &local_memsegs[msl_idx];

	/* create distinct fbarrays for each secondary */
	snprintf(name, RTE_FBARRAY_NAME_LEN, "%s_%i",
		primary_msl->memseg_arr.name, getpid());

	ret = rte_fbarray_init(&local_msl->memseg_arr, name,
		primary_msl->memseg_arr.len,
		primary_msl->memseg_arr.elt_sz);
	if (ret < 0) {
		RTE_LOG(ERR, EAL, "Cannot initialize local memory map\n");
		return -1;
	}
	local_msl->base_va = primary_msl->base_va;
	local_msl->len = primary_msl->len;

	return 0;
}
static int
secondary_msl_destroy_walk(const struct rte_memseg_list *msl,
		void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *local_msl;
	int msl_idx, ret;

	if (msl->external)
		return 0;

	msl_idx = msl - mcfg->memsegs;
	local_msl = &local_memsegs[msl_idx];

	ret = rte_fbarray_destroy(&local_msl->memseg_arr);
	if (ret < 0) {
		RTE_LOG(ERR, EAL, "Cannot destroy local memory map\n");
		return -1;
	}
	local_msl->base_va = NULL;
	local_msl->len = 0;

	return 0;
}

static int
alloc_list(int list_idx, int len)
{
	int *data;
	int i;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* single-file segments mode does not need fd list */
	if (!internal_conf->single_file_segments) {
		/* ensure we have space to store fd per each possible segment */
		data = malloc(sizeof(int) * len);
		if (data == NULL) {
			RTE_LOG(ERR, EAL, "Unable to allocate space for file descriptors\n");
			return -1;
		}
		/* set all fd's as invalid */
		for (i = 0; i < len; i++)
			data[i] = -1;
		fd_list[list_idx].fds = data;
		fd_list[list_idx].len = len;
	} else {
		fd_list[list_idx].fds = NULL;
		fd_list[list_idx].len = 0;
	}

	fd_list[list_idx].count = 0;
	fd_list[list_idx].memseg_list_fd = -1;

	return 0;
}

static int
destroy_list(int list_idx)
{
	const struct internal_config *internal_conf =
			eal_get_internal_configuration();

	/* single-file segments mode does not need fd list */
	if (!internal_conf->single_file_segments) {
		int *fds = fd_list[list_idx].fds;
		int i;
		/* go through each fd and ensure it's closed */
		for (i = 0; i < fd_list[list_idx].len; i++) {
			if (fds[i] >= 0) {
				close(fds[i]);
				fds[i] = -1;
			}
		}
		free(fds);
		fd_list[list_idx].fds = NULL;
		fd_list[list_idx].len = 0;
	} else if (fd_list[list_idx].memseg_list_fd >= 0) {
		close(fd_list[list_idx].memseg_list_fd);
		fd_list[list_idx].count = 0;
		fd_list[list_idx].memseg_list_fd = -1;
	}
	return 0;
}

static int
fd_list_create_walk(const struct rte_memseg_list *msl,
		void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	unsigned int len;
	int msl_idx;

	if (msl->external)
		return 0;

	msl_idx = msl - mcfg->memsegs;
	len = msl->memseg_arr.len;

	return alloc_list(msl_idx, len);
}

static int
fd_list_destroy_walk(const struct rte_memseg_list *msl, void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int msl_idx;

	if (msl->external)
		return 0;

	msl_idx = msl - mcfg->memsegs;

	return destroy_list(msl_idx);
}

int
eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* single file segments mode doesn't support individual segment fd's */
	if (internal_conf->single_file_segments)
		return -ENOTSUP;

	/* if list is not allocated, allocate it */
	if (fd_list[list_idx].len == 0) {
		int len = mcfg->memsegs[list_idx].memseg_arr.len;

		if (alloc_list(list_idx, len) < 0)
			return -ENOMEM;
	}
	fd_list[list_idx].fds[seg_idx] = fd;

	return 0;
}
int
eal_memalloc_set_seg_list_fd(int list_idx, int fd)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* non-single file segment mode doesn't support segment list fd's */
	if (!internal_conf->single_file_segments)
		return -ENOTSUP;

	fd_list[list_idx].memseg_list_fd = fd;

	return 0;
}

int
eal_memalloc_get_seg_fd(int list_idx, int seg_idx)
{
	int fd;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (internal_conf->in_memory || internal_conf->no_hugetlbfs) {
#ifndef MEMFD_SUPPORTED
		/* in in-memory or no-huge mode, we rely on memfd support */
		return -ENOTSUP;
#endif
		/* memfd supported, but hugetlbfs memfd may not be */
		if (!internal_conf->no_hugetlbfs && !memfd_create_supported)
			return -ENOTSUP;
	}

	if (internal_conf->single_file_segments) {
		fd = fd_list[list_idx].memseg_list_fd;
	} else if (fd_list[list_idx].len == 0) {
		/* list not initialized */
		fd = -1;
	} else {
		fd = fd_list[list_idx].fds[seg_idx];
	}
	if (fd < 0)
		return -ENODEV;
	return fd;
}

static int
test_memfd_create(void)
{
#ifdef MEMFD_SUPPORTED
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();
	unsigned int i;
	for (i = 0; i < internal_conf->num_hugepage_sizes; i++) {
		uint64_t pagesz = internal_conf->hugepage_info[i].hugepage_sz;
		int pagesz_flag = pagesz_flags(pagesz);
		int flags;

		flags = pagesz_flag | RTE_MFD_HUGETLB;
		int fd = memfd_create("test", flags);
		if (fd < 0) {
			/* we failed - let memalloc know this isn't working */
			if (errno == EINVAL) {
				memfd_create_supported = 0;
				return 0; /* not supported */
			}

			/* we got other error - something's wrong */
			return -1; /* error */
		}
		close(fd);
		return 1; /* supported */
	}
#endif
	return 0; /* not supported */
}

int
eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (internal_conf->in_memory || internal_conf->no_hugetlbfs) {
#ifndef MEMFD_SUPPORTED
		/* in in-memory or no-huge mode, we rely on memfd support */
		return -ENOTSUP;
#endif
		/* memfd supported, but hugetlbfs memfd may not be */
		if (!internal_conf->no_hugetlbfs && !memfd_create_supported)
			return -ENOTSUP;
	}

	if (internal_conf->single_file_segments) {
		size_t pgsz = mcfg->memsegs[list_idx].page_sz;

		/* segment not active? */
		if (fd_list[list_idx].memseg_list_fd < 0)
			return -ENOENT;
		*offset = pgsz * seg_idx;
	} else {
		/* fd_list not initialized? */
		if (fd_list[list_idx].len == 0)
			return -ENODEV;

		/* segment not active? */
		if (fd_list[list_idx].fds[seg_idx] < 0)
			return -ENOENT;
		*offset = 0;
	}
	return 0;
}
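
/*
 * Illustrative usage sketch (not part of the original code): a caller that
 * wants to re-map a segment from its backing file would typically combine
 * the two lookups above, e.g.:
 *
 *   size_t offset;
 *   int fd = eal_memalloc_get_seg_fd(list_idx, seg_idx);
 *   if (fd >= 0 &&
 *           eal_memalloc_get_seg_fd_offset(list_idx, seg_idx, &offset) == 0)
 *           ... mmap(addr, page_sz, ..., MAP_SHARED, fd, offset) ...
 */
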
int
eal_memalloc_cleanup(void)
{
	/* close all remaining fd's - these are per-process, so it's safe */
	if (rte_memseg_list_walk_thread_unsafe(fd_list_destroy_walk, NULL))
		return -1;

	/* destroy the shadow memseg lists if we're a secondary process */
	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
		return 0;

	if (rte_memseg_list_walk_thread_unsafe(secondary_msl_destroy_walk,
			NULL))
		return -1;

	return 0;
}

int
eal_memalloc_init(void)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
		if (rte_memseg_list_walk(secondary_msl_create_walk, NULL) < 0)
			return -1;
	if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
			internal_conf->in_memory) {
		int mfd_res = test_memfd_create();

		if (mfd_res < 0) {
			RTE_LOG(ERR, EAL, "Unable to check if memfd is supported\n");
			return -1;
		}
		if (mfd_res == 1)
			RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n");
		else
			RTE_LOG(INFO, EAL, "Using memfd is not supported, falling back to anonymous hugepages\n");

		/* we only support single-file segments mode with in-memory mode
		 * if we support hugetlbfs with memfd_create. this code will
		 * test if we do.
		 */
		if (internal_conf->single_file_segments &&
				mfd_res != 1) {
			RTE_LOG(ERR, EAL, "Single-file segments mode cannot be used without memfd support\n");
			return -1;
		}
		/* this cannot ever happen but better safe than sorry */
		if (!anonymous_hugepages_supported) {
			RTE_LOG(ERR, EAL, "Using anonymous memory is not supported\n");
			return -1;
		}
		/* safety net, should be impossible to configure */
		if (internal_conf->hugepage_file.unlink_before_mapping &&
				!internal_conf->hugepage_file.unlink_existing) {
			RTE_LOG(ERR, EAL, "Unlinking existing hugepage files is prohibited, cannot unlink them before mapping.\n");
			return -1;
		}
	}

	/* initialize all of the fd lists */
	if (rte_memseg_list_walk(fd_list_create_walk, NULL))
		return -1;
	return 0;
}