199a2dd95SBruce Richardson /* SPDX-License-Identifier: BSD-3-Clause 299a2dd95SBruce Richardson * Copyright(c) 2017-2018 Intel Corporation 399a2dd95SBruce Richardson */ 499a2dd95SBruce Richardson 599a2dd95SBruce Richardson #include <errno.h> 699a2dd95SBruce Richardson #include <stdarg.h> 799a2dd95SBruce Richardson #include <stdbool.h> 899a2dd95SBruce Richardson #include <stdlib.h> 999a2dd95SBruce Richardson #include <stdio.h> 1099a2dd95SBruce Richardson #include <stdint.h> 1199a2dd95SBruce Richardson #include <inttypes.h> 1299a2dd95SBruce Richardson #include <string.h> 1399a2dd95SBruce Richardson #include <sys/mman.h> 1499a2dd95SBruce Richardson #include <sys/types.h> 1599a2dd95SBruce Richardson #include <sys/stat.h> 1699a2dd95SBruce Richardson #include <sys/queue.h> 1799a2dd95SBruce Richardson #include <sys/file.h> 1899a2dd95SBruce Richardson #include <unistd.h> 1999a2dd95SBruce Richardson #include <limits.h> 2099a2dd95SBruce Richardson #include <fcntl.h> 2199a2dd95SBruce Richardson #include <sys/ioctl.h> 2299a2dd95SBruce Richardson #include <sys/time.h> 2399a2dd95SBruce Richardson #include <signal.h> 2499a2dd95SBruce Richardson #include <setjmp.h> 2599a2dd95SBruce Richardson #ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */ 2699a2dd95SBruce Richardson #include <linux/memfd.h> 2799a2dd95SBruce Richardson #define MEMFD_SUPPORTED 2899a2dd95SBruce Richardson #endif 2999a2dd95SBruce Richardson #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES 3099a2dd95SBruce Richardson #include <numa.h> 3199a2dd95SBruce Richardson #include <numaif.h> 3299a2dd95SBruce Richardson #endif 3399a2dd95SBruce Richardson #include <linux/falloc.h> 3499a2dd95SBruce Richardson #include <linux/mman.h> /* for hugetlb-related mmap flags */ 3599a2dd95SBruce Richardson 3699a2dd95SBruce Richardson #include <rte_common.h> 3799a2dd95SBruce Richardson #include <rte_log.h> 3899a2dd95SBruce Richardson #include <rte_eal.h> 3999a2dd95SBruce Richardson #include <rte_errno.h> 4099a2dd95SBruce Richardson 
#include <rte_memory.h>
#include <rte_spinlock.h>

#include "eal_filesystem.h"
#include "eal_internal_cfg.h"
#include "eal_memalloc.h"
#include "eal_memcfg.h"
#include "eal_private.h"

/*
 * Whether anonymous hugepage mappings are possible at all: requires the
 * MAP_HUGE_SHIFT mmap() flag machinery, detected at compile time.
 */
const int anonymous_hugepages_supported =
#ifdef MAP_HUGE_SHIFT
		1;
#define RTE_MAP_HUGE_SHIFT MAP_HUGE_SHIFT
#else
		0;
/* fallback shift used when the header lacks MAP_HUGE_SHIFT;
 * presumably matches the kernel's value — TODO confirm
 */
#define RTE_MAP_HUGE_SHIFT 26
#endif

/*
 * we've already checked memfd support at compile-time, but we also need to
 * check if we can create hugepage files with memfd.
 *
 * also, this is not a constant, because while we may be *compiled* with memfd
 * hugetlbfs support, we might not be *running* on a system that supports memfd
 * and/or memfd with hugetlbfs, so we need to be able to adjust this flag at
 * runtime, and fall back to anonymous memory.
 */
static int memfd_create_supported =
#ifdef MFD_HUGETLB
		1;
#define RTE_MFD_HUGETLB MFD_HUGETLB
#else
		0;
/* fallback flag value used when the header lacks MFD_HUGETLB;
 * presumably matches the kernel's bit — TODO confirm
 */
#define RTE_MFD_HUGETLB 4U
#endif

/*
 * not all kernel version support fallocate on hugetlbfs, so fall back to
 * ftruncate and disallow deallocation if fallocate is not supported.
 */
static int fallocate_supported = -1; /* unknown */

/*
 * we have two modes - single file segments, and file-per-page mode.
 *
 * for single-file segments, we use memseg_list_fd to store the segment fd,
 * while the fds[] will not be allocated, and len will be set to 0.
 *
 * for file-per-page mode, each page will have its own fd, so 'memseg_list_fd'
 * will be invalid (set to -1), and we'll use 'fds' to keep track of page fd's.
 *
 * we cannot know how many pages a system will have in advance, but we do know
 * that they come in lists, and we know lengths of these lists. so, simply store
 * a malloc'd array of fd's indexed by list and segment index.
 *
 * they will be initialized at startup, and filled as we allocate/deallocate
 * segments.
 */
static struct {
	int *fds; /**< dynamically allocated array of segment lock fd's */
	int memseg_list_fd; /**< memseg list fd */
	int len; /**< total length of the array */
	int count; /**< entries used in an array */
} fd_list[RTE_MAX_MEMSEG_LISTS];

/** local copy of a memory map, used to synchronize memory hotplug in MP */
static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS];

/* jump target used to escape a SIGBUS raised while touching hugepages */
static sigjmp_buf huge_jmpenv;

/* SIGBUS handler: non-locally return to the sigsetjmp point in huge_jmpenv */
static void huge_sigbus_handler(int signo __rte_unused)
{
	siglongjmp(huge_jmpenv, 1);
}

/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
 * non-static local variable in the stack frame calling sigsetjmp might be
 * clobbered by a call to longjmp.
 */
static int huge_wrap_sigsetjmp(void)
{
	return sigsetjmp(huge_jmpenv, 1);
}

/* previous SIGBUS disposition, saved so it can be restored later */
static struct sigaction huge_action_old;
/* nonzero when huge_action_old holds a disposition that must be restored */
static int huge_need_recover;

/*
 * Install huge_sigbus_handler for SIGBUS and remember the previous
 * disposition in huge_action_old so huge_recover_sigbus() can undo this.
 */
static void
huge_register_sigbus(void)
{
	sigset_t mask;
	struct sigaction action;

	sigemptyset(&mask);
	sigaddset(&mask, SIGBUS);
	action.sa_flags = 0;
	action.sa_mask = mask;
	action.sa_handler = huge_sigbus_handler;

	/* only mark for recovery if installing the handler succeeded */
	huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
}

/* Restore the SIGBUS disposition saved by huge_register_sigbus(), if any. */
static void
huge_recover_sigbus(void)
{
	if (huge_need_recover) {
		sigaction(SIGBUS, &huge_action_old, NULL);
		huge_need_recover = 0;
	}
}

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
/* Return true if the running kernel supports NUMA (per numa_available()). */
static bool
check_numa(void)
{
	bool ret = true;
	/* Check if kernel supports NUMA. */
	if (numa_available() != 0) {
		RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n");
		ret = false;
	}
	return ret;
}

/*
 * Save the current memory policy into *oldpolicy/oldmask and switch to
 * MPOL_PREFERRED for @socket_id. Pair with restore_numa(). If the current
 * policy cannot be read, MPOL_DEFAULT is assumed.
 */
static void
prepare_numa(int *oldpolicy, struct bitmask *oldmask, int socket_id)
{
	RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
	if (get_mempolicy(oldpolicy, oldmask->maskp,
			oldmask->size + 1, 0, 0) < 0) {
		RTE_LOG(ERR, EAL,
			"Failed to get current mempolicy: %s. "
			"Assuming MPOL_DEFAULT.\n", strerror(errno));
		*oldpolicy = MPOL_DEFAULT;
	}
	RTE_LOG(DEBUG, EAL,
		"Setting policy MPOL_PREFERRED for socket %d\n",
		socket_id);
	numa_set_preferred(socket_id);
}

/*
 * Restore the memory policy saved by prepare_numa() and free oldmask.
 * Falls back to local allocation when the old policy was MPOL_DEFAULT
 * or cannot be restored.
 */
static void
restore_numa(int *oldpolicy, struct bitmask *oldmask)
{
	RTE_LOG(DEBUG, EAL,
		"Restoring previous memory policy: %d\n", *oldpolicy);
	if (*oldpolicy == MPOL_DEFAULT) {
		numa_set_localalloc();
	} else if (set_mempolicy(*oldpolicy, oldmask->maskp,
				oldmask->size + 1) < 0) {
		RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n",
			strerror(errno));
		numa_set_localalloc();
	}
	numa_free_cpumask(oldmask);
}
#endif

/*
 * uses fstat to report the size of a file on disk
 * (returns 0 on fstat() failure, so callers cannot distinguish an empty
 * file from an error)
 */
static off_t
get_file_size(int fd)
{
	struct stat st;
	if (fstat(fd, &st) < 0)
		return 0;
	return st.st_size;
}

/* Encode a hugepage size into the corresponding mmap()/memfd page-size flag. */
static int
pagesz_flags(uint64_t page_sz)
{
	/* as per mmap() manpage, all page sizes are log2 of page size
	 * shifted by MAP_HUGE_SHIFT
	 */
	int log2 = rte_log2_u64(page_sz);
	return log2 << RTE_MAP_HUGE_SHIFT;
}

/* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */
static int lock(int fd, int type)
{
	int ret;

	/* flock may be interrupted */
	do {
		ret = flock(fd, type | LOCK_NB);
	} while (ret && errno == EINTR);

	if (ret && errno == EWOULDBLOCK) {
		/* couldn't lock */
		return 0;
	} else if (ret) {
		RTE_LOG(ERR, EAL, "%s(): error calling flock(): %s\n",
			__func__, strerror(errno));
		return -1;
	}
	/* lock was successful */
	return 1;
}

/*
 * Return a memfd backing the given segment (or the whole segment list in
 * single-file-segments mode), creating and caching it in fd_list on first
 * use. Returns -1 on failure or when memfd support is compiled out.
 */
static int
get_seg_memfd(struct hugepage_info *hi __rte_unused,
		unsigned int list_idx __rte_unused,
		unsigned int seg_idx __rte_unused)
{
#ifdef MEMFD_SUPPORTED
	int fd;
	char segname[250]; /* as per manpage, limit is 249 bytes plus null */

	int flags = RTE_MFD_HUGETLB | pagesz_flags(hi->hugepage_sz);
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (internal_conf->single_file_segments) {
		/* one memfd per memseg list, cached after first creation */
		fd = fd_list[list_idx].memseg_list_fd;

		if (fd < 0) {
			snprintf(segname, sizeof(segname), "seg_%i", list_idx);
			fd = memfd_create(segname, flags);
			if (fd < 0) {
				RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n",
					__func__, strerror(errno));
				return -1;
			}
			fd_list[list_idx].memseg_list_fd = fd;
		}
	} else {
		/* one memfd per page, cached after first creation */
		fd = fd_list[list_idx].fds[seg_idx];

		if (fd < 0) {
			snprintf(segname, sizeof(segname), "seg_%i-%i",
					list_idx, seg_idx);
			fd = memfd_create(segname, flags);
			if (fd < 0) {
				RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n",
					__func__, strerror(errno));
				return -1;
			}
			fd_list[list_idx].fds[seg_idx] = fd;
		}
	}
	return fd;
#endif
	/* only reached when memfd support was compiled out */
	return -1;
}

/*
 * Return an open fd for the hugepage file backing the given segment (or
 * segment list, in single-file-segments mode), creating the file and taking
 * a shared flock() on it if needed; the fd is cached in fd_list. Writes the
 * hugefile path into @path (@buflen bytes). If @dirty is non-NULL, *dirty is
 * set to true when a pre-existing file is reused without being unlinked, i.e.
 * mapped pages may contain old data. Returns the fd, or -1 on failure.
 */
static int
get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
		unsigned int list_idx, unsigned int seg_idx,
		bool *dirty)
{
	int fd;
	int *out_fd;
	struct stat st;
	int ret;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (dirty != NULL)
		*dirty = false;

	/* for in-memory mode, we only make it here when we're sure we support
	 * memfd, and this is a special case.
	 */
	if (internal_conf->in_memory)
		return get_seg_memfd(hi, list_idx, seg_idx);

	if (internal_conf->single_file_segments) {
		out_fd = &fd_list[list_idx].memseg_list_fd;
		eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx);
	} else {
		out_fd = &fd_list[list_idx].fds[seg_idx];
		eal_get_hugefile_path(path, buflen, hi->hugedir,
				list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
	}
	/* return the cached fd if this file was already opened */
	fd = *out_fd;
	if (fd >= 0)
		return fd;

	/*
	 * There is no TOCTOU between stat() and unlink()/open()
	 * because the hugepage directory is locked.
	 */
	ret = stat(path, &st);
	if (ret < 0 && errno != ENOENT) {
		RTE_LOG(DEBUG, EAL, "%s(): stat() for '%s' failed: %s\n",
			__func__, path, strerror(errno));
		return -1;
	}
	/* file exists and will be kept: mapped pages may hold old data */
	if (!internal_conf->hugepage_file.unlink_existing && ret == 0 &&
			dirty != NULL)
		*dirty = true;

	/*
	 * The kernel clears a hugepage only when it is mapped
	 * from a particular file for the first time.
	 * If the file already exists, the old content will be mapped.
	 * If the memory manager assumes all mapped pages to be clean,
	 * the file must be removed and created anew.
	 * Otherwise, the primary caller must be notified
	 * that mapped pages will be dirty
	 * (secondary callers receive the segment state from the primary one).
	 * When multiple hugepages are mapped from the same file,
	 * whether they will be dirty depends on the part that is mapped.
	 */
	if (!internal_conf->single_file_segments &&
			internal_conf->hugepage_file.unlink_existing &&
			rte_eal_process_type() == RTE_PROC_PRIMARY &&
			ret == 0) {
		/* coverity[toctou] */
		if (unlink(path) < 0) {
			RTE_LOG(DEBUG, EAL, "%s(): could not remove '%s': %s\n",
				__func__, path, strerror(errno));
			return -1;
		}
	}

	/* coverity[toctou] */
	fd = open(path, O_CREAT | O_RDWR, 0600);
	if (fd < 0) {
		RTE_LOG(ERR, EAL, "%s(): open '%s' failed: %s\n",
			__func__, path, strerror(errno));
		return -1;
	}
	/* take out a read lock */
	if (lock(fd, LOCK_SH) < 0) {
		RTE_LOG(ERR, EAL, "%s(): lock '%s' failed: %s\n",
			__func__, path, strerror(errno));
		close(fd);
		return -1;
	}
	*out_fd = fd;
	return fd;
}

/*
 * Grow (fallocate) or shrink (punch a hole in) one page-sized region of a
 * memfd-backed file; in-memory mode can rely on fallocate() being available
 * (see resize_hugefile()). Returns 0 on success, -1 on failure.
 */
static int
resize_hugefile_in_memory(int fd, uint64_t fa_offset,
		uint64_t page_sz, bool grow)
{
	int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
			FALLOC_FL_KEEP_SIZE;
	int ret;

	/* grow or shrink the file */
	ret = fallocate(fd, flags, fa_offset, page_sz);

	if (ret < 0) {
		RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
				__func__,
				strerror(errno));
		return -1;
	}
	return 0;
}

/*
 * Grow or shrink the [fa_offset, fa_offset + page_sz) region of a
 * hugetlbfs-backed file. Probes fallocate() support on first use (cached
 * in the fallocate_supported global) and falls back to ftruncate() —
 * growing only — when unsupported. If @dirty is non-NULL, it is updated to
 * reflect whether the resulting pages may contain old data.
 * Returns 0 on success, -1 on failure.
 */
static int
resize_hugefile_in_filesystem(int fd, uint64_t fa_offset, uint64_t page_sz,
		bool grow, bool *dirty)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();
	/* set when the fallocate probe fails with ENOTSUP, to retry once
	 * via the ftruncate fallback path
	 */
	bool again = false;

	do {
		if (fallocate_supported == 0) {
			/* we cannot deallocate memory if fallocate() is not
			 * supported, and hugepage file is already locked at
			 * creation, so no further synchronization needed.
			 */

			if (!grow) {
				RTE_LOG(DEBUG, EAL, "%s(): fallocate not supported, not freeing page back to the system\n",
					__func__);
				return -1;
			}
			uint64_t new_size = fa_offset + page_sz;
			uint64_t cur_size = get_file_size(fd);

			/* fallocate isn't supported, fall back to ftruncate */
			if (dirty != NULL)
				*dirty = new_size <= cur_size;
			if (new_size > cur_size &&
					ftruncate(fd, new_size) < 0) {
				RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
					__func__, strerror(errno));
				return -1;
			}
		} else {
			int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
					FALLOC_FL_KEEP_SIZE;
			int ret;

			/*
			 * technically, it is perfectly safe for both primary
			 * and secondary to grow and shrink the page files:
			 * growing the file repeatedly has no effect because
			 * a page can only be allocated once, while mmap ensures
			 * that secondaries hold on to the page even after the
			 * page itself is removed from the filesystem.
			 *
			 * however, leaving growing/shrinking to the primary
			 * tends to expose bugs in fdlist page count handling,
			 * so leave this here just in case.
			 */
			if (rte_eal_process_type() != RTE_PROC_PRIMARY)
				return 0;

			/* grow or shrink the file */
			ret = fallocate(fd, flags, fa_offset, page_sz);

			if (ret < 0) {
				if (fallocate_supported == -1 &&
						errno == ENOTSUP) {
					RTE_LOG(ERR, EAL, "%s(): fallocate() not supported, hugepage deallocation will be disabled\n",
						__func__);
					again = true;
					fallocate_supported = 0;
				} else {
					RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
						__func__,
						strerror(errno));
					return -1;
				}
			} else {
				fallocate_supported = 1;
				/*
				 * It is unknown which portions of an existing
				 * hugepage file were allocated previously,
				 * so all pages within the file are considered
				 * dirty, unless the file is a fresh one.
				 */
				if (dirty != NULL)
					*dirty &= !internal_conf->hugepage_file.unlink_existing;
			}
		}
	} while (again);

	return 0;
}

/*
 * Close a single-file-segments fd and clear its fd_list cache entry;
 * the primary process also unlinks the file (except in in-memory mode,
 * where there is no file on disk).
 */
static void
close_hugefile(int fd, char *path, int list_idx)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();
	/*
	 * primary process must unlink the file, but only when not in in-memory
	 * mode (as in that case there is no file to unlink).
	 */
	if (!internal_conf->in_memory &&
			rte_eal_process_type() == RTE_PROC_PRIMARY &&
			unlink(path))
		RTE_LOG(ERR, EAL, "%s(): unlinking '%s' failed: %s\n",
			__func__, path, strerror(errno));

	close(fd);
	fd_list[list_idx].memseg_list_fd = -1;
}

/*
 * Dispatch a hugefile resize to the in-memory (memfd) or filesystem
 * implementation; see those helpers for the @dirty semantics.
 */
static int
resize_hugefile(int fd, uint64_t fa_offset, uint64_t page_sz, bool grow,
		bool *dirty)
{
	/* in-memory mode is a special case, because we can be sure that
	 * fallocate() is supported.
	 */
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (internal_conf->in_memory) {
		/* memfd-backed pages are always freshly zeroed */
		if (dirty != NULL)
			*dirty = false;
		return resize_hugefile_in_memory(fd, fa_offset,
				page_sz, grow);
	}

	return resize_hugefile_in_filesystem(fd, fa_offset, page_sz,
			grow, dirty);
}

static int
alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
		struct hugepage_info *hi, unsigned int list_idx,
		unsigned int seg_idx)
{
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	int cur_socket_id = 0;
#endif
	uint64_t map_offset;
	rte_iova_t iova;
	void *va;
	char path[PATH_MAX];
	int ret = 0;
	int fd;
	bool dirty;
	size_t alloc_sz;
	int flags;
	void *new_addr;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	alloc_sz = hi->hugepage_sz;

	/* these are checked at init, but code analyzers don't know that */
	if
(internal_conf->in_memory && !anonymous_hugepages_supported) {
		RTE_LOG(ERR, EAL, "Anonymous hugepages not supported, in-memory mode cannot allocate memory\n");
		return -1;
	}
	if (internal_conf->in_memory && !memfd_create_supported &&
			internal_conf->single_file_segments) {
		RTE_LOG(ERR, EAL, "Single-file segments are not supported without memfd support\n");
		return -1;
	}

	/* in-memory without memfd is a special case */
	int mmap_flags;

	if (internal_conf->in_memory && !memfd_create_supported) {
		/* no backing file at all: map anonymous hugepages at the
		 * pre-reserved address, with the page-size encoded in flags.
		 */
		const int in_memory_flags = MAP_HUGETLB | MAP_FIXED |
				MAP_PRIVATE | MAP_ANONYMOUS;
		int pagesz_flag;

		pagesz_flag = pagesz_flags(alloc_sz);
		fd = -1;
		/* fresh anonymous memory is always zeroed by the kernel */
		dirty = false;
		mmap_flags = in_memory_flags | pagesz_flag;

		/* single-file segments codepath will never be active
		 * here because in-memory mode is incompatible with the
		 * fallback path, and it's stopped at EAL initialization
		 * stage.
		 */
		map_offset = 0;
	} else {
		/* takes out a read lock on segment or segment list */
		fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx,
				&dirty);
		if (fd < 0) {
			RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n");
			return -1;
		}

		if (internal_conf->single_file_segments) {
			/* grow the shared file to cover this segment's slot */
			map_offset = seg_idx * alloc_sz;
			ret = resize_hugefile(fd, map_offset, alloc_sz, true,
					&dirty);
			if (ret < 0)
				goto resized;

			fd_list[list_idx].count++;
		} else {
			map_offset = 0;
			if (ftruncate(fd, alloc_sz) < 0) {
				RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
					__func__, strerror(errno));
				goto resized;
			}
			if (internal_conf->hugepage_file.unlink_before_mapping &&
					!internal_conf->in_memory) {
				if (unlink(path)) {
					RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n",
						__func__, strerror(errno));
					goto resized;
				}
			}
		}
		mmap_flags = MAP_SHARED | MAP_POPULATE | MAP_FIXED;
	}

	huge_register_sigbus();

	/*
	 * map the segment, and populate page tables, the kernel fills
	 * this segment with zeros if it's a new page.
	 */
	va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, mmap_flags, fd,
			map_offset);

	if (va == MAP_FAILED) {
		RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
			strerror(errno));
		/* mmap failed, but the previous region might have been
		 * unmapped anyway. try to remap it
		 */
		goto unmapped;
	}
	if (va != addr) {
		/* MAP_FIXED should have guaranteed this; treat as failure */
		RTE_LOG(DEBUG, EAL, "%s(): wrong mmap() address\n", __func__);
		munmap(va, alloc_sz);
		goto resized;
	}

	/* In linux, hugetlb limitations, like cgroup, are
	 * enforced at fault time instead of mmap(), even
	 * with the option of MAP_POPULATE. Kernel will send
	 * a SIGBUS signal. To avoid to be killed, save stack
	 * environment here, if SIGBUS happens, we can jump
	 * back here.
	 */
	if (huge_wrap_sigsetjmp()) {
		RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more hugepages of size %uMB\n",
			(unsigned int)(alloc_sz >> 20));
		goto mapped;
	}

	/* we need to trigger a write to the page to enforce page fault and
	 * ensure that page is accessible to us, but we can't overwrite value
	 * that is already there, so read the old value, and write it back.
	 * kernel populates the page with zeroes initially.
	 */
	*(volatile int *)addr = *(volatile int *)addr;

	iova = rte_mem_virt2iova(addr);
	if (iova == RTE_BAD_PHYS_ADDR) {
		RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n",
			__func__);
		goto mapped;
	}

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	/*
	 * If the kernel has been built without NUMA support, get_mempolicy()
	 * will return an error. If check_numa() returns false, memory
	 * allocation is not NUMA aware and the socket_id should not be
	 * checked.
	 */
	if (check_numa()) {
		ret = get_mempolicy(&cur_socket_id, NULL, 0, addr,
				MPOL_F_NODE | MPOL_F_ADDR);
		if (ret < 0) {
			RTE_LOG(DEBUG, EAL, "%s(): get_mempolicy: %s\n",
				__func__, strerror(errno));
			goto mapped;
		} else if (cur_socket_id != socket_id) {
			RTE_LOG(DEBUG, EAL,
				"%s(): allocation happened on wrong socket (wanted %d, got %d)\n",
				__func__, socket_id, cur_socket_id);
			goto mapped;
		}
	}
#else
	if (rte_socket_count() > 1)
		RTE_LOG(DEBUG, EAL, "%s(): not checking hugepage NUMA node.\n",
			__func__);
#endif

	huge_recover_sigbus();

	/* success: fill in the memseg descriptor */
	ms->addr = addr;
	ms->hugepage_sz = alloc_sz;
	ms->len = alloc_sz;
	ms->nchannel = rte_memory_get_nchannel();
	ms->nrank = rte_memory_get_nrank();
	ms->iova = iova;
	ms->socket_id = socket_id;
	ms->flags = dirty ? RTE_MEMSEG_FLAG_DIRTY : 0;

	return 0;

mapped:
	munmap(addr, alloc_sz);
unmapped:
	huge_recover_sigbus();
	/* re-reserve the VA hole left by munmap so future allocations
	 * can still assume the region belongs to us.
	 */
	flags = EAL_RESERVE_FORCE_ADDRESS;
	new_addr = eal_get_virtual_area(addr, &alloc_sz, alloc_sz, 0, flags);
	if (new_addr != addr) {
		if (new_addr != NULL)
			munmap(new_addr, alloc_sz);
		/* we're leaving a hole in our virtual address space. if
		 * somebody else maps this hole now, we could accidentally
		 * override it in the future.
		 */
		RTE_LOG(CRIT, EAL, "Can't mmap holes in our virtual address space\n");
	}
	/* roll back the ref count */
	if (internal_conf->single_file_segments)
		fd_list[list_idx].count--;
resized:
	/* some codepaths will return negative fd, so exit early */
	if (fd < 0)
		return -1;

	if (internal_conf->single_file_segments) {
		resize_hugefile(fd, map_offset, alloc_sz, false, NULL);
		/* ignore failure, can't make it any worse */

		/* if refcount is at zero, close the file */
		if (fd_list[list_idx].count == 0)
			close_hugefile(fd, path, list_idx);
	} else {
		/* only remove file if we can
take out a write lock */
		if (!internal_conf->hugepage_file.unlink_before_mapping &&
				internal_conf->in_memory == 0 &&
				lock(fd, LOCK_EX) == 1)
			unlink(path);
		close(fd);
		fd_list[list_idx].fds[seg_idx] = -1;
	}
	return -1;
}

/* Free one segment: wipe its contents, replace the mapping with an
 * inaccessible anonymous one (keeping the VA range reserved), release or
 * shrink the backing file, and zero out the memseg descriptor.
 * Returns 0 on success, -1 on failure.
 */
static int
free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
		unsigned int list_idx, unsigned int seg_idx)
{
	uint64_t map_offset;
	char path[PATH_MAX];
	int fd, ret = 0;
	const struct internal_config *internal_conf =
			eal_get_internal_configuration();

	/* erase page data */
	memset(ms->addr, 0, ms->len);

	if (mmap(ms->addr, ms->len, PROT_NONE,
			MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) ==
				MAP_FAILED) {
		RTE_LOG(DEBUG, EAL, "couldn't unmap page\n");
		return -1;
	}

	eal_mem_set_dump(ms->addr, ms->len, false);

	/* if we're using anonymous hugepages, nothing to be done */
	if (internal_conf->in_memory && !memfd_create_supported) {
		memset(ms, 0, sizeof(*ms));
		return 0;
	}

	/* if we are not in single file segments mode, we're going to unmap the
	 * segment and thus drop the lock on original fd, but hugepage dir is
	 * now locked so we can take out another one without races.
	 */
	fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx, NULL);
	if (fd < 0)
		return -1;

	if (internal_conf->single_file_segments) {
		/* punch a hole in the shared file for this segment's slot */
		map_offset = seg_idx * ms->len;
		if (resize_hugefile(fd, map_offset, ms->len, false, NULL))
			return -1;

		if (--(fd_list[list_idx].count) == 0)
			close_hugefile(fd, path, list_idx);

		ret = 0;
	} else {
		/* if we're able to take out a write lock, we're the last one
		 * holding onto this page.
		 */
		if (!internal_conf->in_memory &&
				internal_conf->hugepage_file.unlink_existing &&
				!internal_conf->hugepage_file.unlink_before_mapping) {
			ret = lock(fd, LOCK_EX);
			if (ret >= 0) {
				/* no one else is using this page */
				if (ret == 1)
					unlink(path);
			}
		}
		/* closing fd will drop the lock */
		close(fd);
		fd_list[list_idx].fds[seg_idx] = -1;
	}

	memset(ms, 0, sizeof(*ms));

	return ret < 0 ? -1 : 0;
}

/* Arguments for alloc_seg_walk(): where to allocate, how many pages,
 * and whether the exact count is mandatory.
 */
struct alloc_walk_param {
	struct hugepage_info *hi;
	struct rte_memseg **ms;
	size_t page_sz;
	unsigned int segs_allocated;
	unsigned int n_segs;
	int socket;
	bool exact;
};
/* Memseg-list walk callback: try to allocate wa->n_segs pages from a list
 * matching the requested page size and socket. Returns 1 when pages were
 * allocated (stops the walk), 0 to continue, -1 on fatal error.
 */
static int
alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct alloc_walk_param *wa = arg;
	struct rte_memseg_list *cur_msl;
	size_t page_sz;
	int cur_idx, start_idx, j, dir_fd = -1;
	unsigned int msl_idx, need, i;
	const struct internal_config *internal_conf =
			eal_get_internal_configuration();

	/* skip lists that don't match the requested page size / socket */
	if (msl->page_sz != wa->page_sz)
		return 0;
	if (msl->socket_id != wa->socket)
		return 0;

	page_sz = (size_t)msl->page_sz;

	msl_idx = msl - mcfg->memsegs;
	cur_msl = &mcfg->memsegs[msl_idx];

	need = wa->n_segs;

	/* try finding space in memseg list */
	if (wa->exact) {
		/* if we require exact number of pages in a list, find them */
		cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0,
				need);
		if (cur_idx < 0)
			return 0;
		start_idx = cur_idx;
	} else {
		int cur_len;

		/* we don't require exact number of pages, so we're going to go
		 * for best-effort allocation. that means finding the biggest
		 * unused block, and going with that.
		 */
		cur_idx = rte_fbarray_find_biggest_free(&cur_msl->memseg_arr,
				0);
		if (cur_idx < 0)
			return 0;
		start_idx = cur_idx;
		/* adjust the size to possibly be smaller than original
		 * request, but do not allow it to be bigger.
		 */
		cur_len = rte_fbarray_find_contig_free(&cur_msl->memseg_arr,
				cur_idx);
		need = RTE_MIN(need, (unsigned int)cur_len);
	}

	/* do not allow any page allocations during the time we're allocating,
	 * because file creation and locking operations are not atomic,
	 * and we might be the first or the last ones to use a particular page,
	 * so we need to ensure atomicity of every operation.
	 *
	 * during init, we already hold a write lock, so don't try to take out
	 * another one.
	 */
	if (wa->hi->lock_descriptor == -1 && !internal_conf->in_memory) {
		dir_fd = open(wa->hi->hugedir, O_RDONLY);
		if (dir_fd < 0) {
			RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
				__func__, wa->hi->hugedir, strerror(errno));
			return -1;
		}
		/* blocking writelock */
		if (flock(dir_fd, LOCK_EX)) {
			RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n",
				__func__, wa->hi->hugedir, strerror(errno));
			close(dir_fd);
			return -1;
		}
	}

	/* allocate pages one by one; on failure, either stop (best-effort)
	 * or roll back everything allocated so far (exact mode).
	 */
	for (i = 0; i < need; i++, cur_idx++) {
		struct rte_memseg *cur;
		void *map_addr;

		cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx);
		map_addr = RTE_PTR_ADD(cur_msl->base_va,
				cur_idx * page_sz);

		if (alloc_seg(cur, map_addr, wa->socket, wa->hi,
				msl_idx, cur_idx)) {
			RTE_LOG(DEBUG, EAL, "attempted to allocate %i segments, but only %i were allocated\n",
				need, i);

			/* if exact number wasn't requested, stop */
			if (!wa->exact)
				goto out;

			/* clean up */
			for (j = start_idx; j < cur_idx; j++) {
				struct rte_memseg *tmp;
				struct rte_fbarray *arr =
						&cur_msl->memseg_arr;

				tmp = rte_fbarray_get(arr, j);
				rte_fbarray_set_free(arr, j);

				/* free_seg may attempt to create a file, which
				 * may fail.
				 */
				if (free_seg(tmp, wa->hi, msl_idx, j))
					RTE_LOG(DEBUG, EAL, "Cannot free page\n");
			}
			/* clear the list */
			if (wa->ms)
				memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs);

			if (dir_fd >= 0)
				close(dir_fd);
			return -1;
		}
		if (wa->ms)
			wa->ms[i] = cur;

		rte_fbarray_set_used(&cur_msl->memseg_arr, cur_idx);
	}
out:
	wa->segs_allocated = i;
	if (i > 0)
		cur_msl->version++;
	if (dir_fd >= 0)
		close(dir_fd);
	/* if we didn't allocate any segments, move on to the next list */
	return i > 0;
}

/* Arguments for free_seg_walk(): hugepage info and the memseg to free. */
struct free_walk_param {
	struct hugepage_info *hi;
	struct rte_memseg *ms;
};
/* Memseg-list walk callback: free wa->ms if it belongs to this list.
 * Returns 1 when the segment was found and freed (stops the walk),
 * 0 to continue, -1 on failure.
 */
static int
free_seg_walk(const struct rte_memseg_list *msl, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *found_msl;
	struct free_walk_param *wa = arg;
	uintptr_t start_addr, end_addr;
	int msl_idx, seg_idx, ret, dir_fd = -1;
	const struct internal_config *internal_conf =
			eal_get_internal_configuration();

	/* does the segment's address fall within this list's VA range? */
	start_addr = (uintptr_t) msl->base_va;
	end_addr = start_addr + msl->len;

	if ((uintptr_t)wa->ms->addr < start_addr ||
			(uintptr_t)wa->ms->addr >= end_addr)
		return 0;

	msl_idx = msl - mcfg->memsegs;
	seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz;

	/* msl is const */
	found_msl = &mcfg->memsegs[msl_idx];

	/* do not allow any page allocations during the time we're freeing,
	 * because file creation and locking operations are not atomic,
	 * and we might be the first or the last ones to use a particular page,
	 * so we need to ensure atomicity of every operation.
	 *
	 * during init, we already hold a write lock, so don't try to take out
	 * another one.
	 */
	if (wa->hi->lock_descriptor == -1 && !internal_conf->in_memory) {
		dir_fd = open(wa->hi->hugedir, O_RDONLY);
		if (dir_fd < 0) {
			RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
				__func__, wa->hi->hugedir, strerror(errno));
			return -1;
		}
		/* blocking writelock */
		if (flock(dir_fd, LOCK_EX)) {
			RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n",
				__func__, wa->hi->hugedir, strerror(errno));
			close(dir_fd);
			return -1;
		}
	}

	found_msl->version++;

	rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx);

	ret = free_seg(wa->ms, wa->hi, msl_idx, seg_idx);

	if (dir_fd >= 0)
		close(dir_fd);

	if (ret < 0)
		return -1;

	return 1;
}

/* Allocate n_segs pages of page_sz bytes on the given socket.
 * If @exact is true, either all pages are allocated or none;
 * otherwise allocation is best-effort. Returns the number of pages
 * allocated (stored via @ms if non-NULL), or -1 on failure.
 */
int
eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz,
		int socket, bool exact)
{
	int i, ret = -1;
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	bool have_numa = false;
	int
 oldpolicy;
	struct bitmask *oldmask;
#endif
	struct alloc_walk_param wa;
	struct hugepage_info *hi = NULL;
	struct internal_config *internal_conf =
			eal_get_internal_configuration();

	memset(&wa, 0, sizeof(wa));

	/* dynamic allocation not supported in legacy mode */
	if (internal_conf->legacy_mem)
		return -1;

	/* find hugepage info entry matching the requested page size */
	for (i = 0; i < (int) RTE_DIM(internal_conf->hugepage_info); i++) {
		if (page_sz ==
				internal_conf->hugepage_info[i].hugepage_sz) {
			hi = &internal_conf->hugepage_info[i];
			break;
		}
	}
	if (!hi) {
		RTE_LOG(ERR, EAL, "%s(): can't find relevant hugepage_info entry\n",
			__func__);
		return -1;
	}

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	/* temporarily bind allocations to the requested NUMA node */
	if (check_numa()) {
		oldmask = numa_allocate_nodemask();
		prepare_numa(&oldpolicy, oldmask, socket);
		have_numa = true;
	}
#endif

	wa.exact = exact;
	wa.hi = hi;
	wa.ms = ms;
	wa.n_segs = n_segs;
	wa.page_sz = page_sz;
	wa.socket = socket;
	wa.segs_allocated = 0;

	/* memalloc is locked, so it's safe to use thread-unsafe version */
	ret = rte_memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa);
	if (ret == 0) {
		RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n",
			__func__);
		ret = -1;
	} else if (ret > 0) {
		ret = (int)wa.segs_allocated;
	}

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	if (have_numa)
		restore_numa(&oldpolicy, oldmask);
#endif
	return ret;
}

/* Allocate a single page of page_sz bytes on the given socket.
 * Returns the memseg, or NULL on failure.
 */
struct rte_memseg *
eal_memalloc_alloc_seg(size_t page_sz, int socket)
{
	struct rte_memseg *ms;
	if (eal_memalloc_alloc_seg_bulk(&ms, 1, page_sz, socket, true) < 0)
		return NULL;
	/* return pointer to newly allocated memseg */
	return ms;
}

/* Free n_segs previously allocated pages. Best-effort: continues past
 * individual failures and returns -1 if any page could not be freed.
 */
int
eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs)
{
	int seg, ret = 0;
	struct internal_config *internal_conf =
			eal_get_internal_configuration();

	/* dynamic free
 not supported in legacy mode */
	if (internal_conf->legacy_mem)
		return -1;

	for (seg = 0; seg < n_segs; seg++) {
		struct rte_memseg *cur = ms[seg];
		struct hugepage_info *hi = NULL;
		struct free_walk_param wa;
		int i, walk_res;

		/* if this page is marked as unfreeable, fail */
		if (cur->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) {
			RTE_LOG(DEBUG, EAL, "Page is not allowed to be freed\n");
			ret = -1;
			continue;
		}

		memset(&wa, 0, sizeof(wa));

		/* find hugepage info entry matching this segment's page size */
		for (i = 0; i < (int)RTE_DIM(internal_conf->hugepage_info);
				i++) {
			hi = &internal_conf->hugepage_info[i];
			if (cur->hugepage_sz == hi->hugepage_sz)
				break;
		}
		if (i == (int)RTE_DIM(internal_conf->hugepage_info)) {
			RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
			ret = -1;
			continue;
		}

		wa.ms = cur;
		wa.hi = hi;

		/* memalloc is locked, so it's safe to use thread-unsafe version
		 */
		walk_res = rte_memseg_list_walk_thread_unsafe(free_seg_walk,
				&wa);
		if (walk_res == 1)
			continue;
		if (walk_res == 0)
			RTE_LOG(ERR, EAL, "Couldn't find memseg list\n");
		ret = -1;
	}
	return ret;
}

/* Free a single previously allocated page. Returns 0 on success, -1 on
 * failure (including legacy mode, where dynamic free is unsupported).
 */
int
eal_memalloc_free_seg(struct rte_memseg *ms)
{
	const struct internal_config *internal_conf =
			eal_get_internal_configuration();

	/* dynamic free not supported in legacy mode */
	if (internal_conf->legacy_mem)
		return -1;

	return eal_memalloc_free_seg_bulk(&ms, 1);
}

/* Secondary-process sync helper: bring the local memseg list in line with
 * the primary's over the range [start, end), allocating (@used) or freeing
 * (!@used) one contiguous run of pages. Returns the number of pages
 * processed, or -1 on failure.
 */
static int
sync_chunk(struct rte_memseg_list *primary_msl,
		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
		unsigned int msl_idx, bool used, int start, int end)
{
	struct rte_fbarray *l_arr, *p_arr;
	int i, ret, chunk_len, diff_len;

	l_arr = &local_msl->memseg_arr;
	p_arr = &primary_msl->memseg_arr;

	/* we need to aggregate allocations/deallocations into bigger chunks,
	 * as we don't want to spam the user with per-page callbacks.
	 *
	 * to avoid any potential issues, we also want to trigger
	 * deallocation callbacks *before* we actually deallocate
	 * memory, so that the user application could wrap up its use
	 * before it goes away.
	 */

	chunk_len = end - start;

	/* find how many contiguous pages we can map/unmap for this chunk */
	diff_len = used ?
			rte_fbarray_find_contig_free(l_arr, start) :
			rte_fbarray_find_contig_used(l_arr, start);

	/* has to be at least one page */
	if (diff_len < 1)
		return -1;

	diff_len = RTE_MIN(chunk_len, diff_len);

	/* if we are freeing memory, notify the application */
	if (!used) {
		struct rte_memseg *ms;
		void *start_va;
		size_t len, page_sz;

		ms = rte_fbarray_get(l_arr, start);
		start_va = ms->addr;
		page_sz = (size_t)primary_msl->page_sz;
		len = page_sz * diff_len;

		eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE,
				start_va, len);
	}

	for (i = 0; i < diff_len; i++) {
		struct rte_memseg *p_ms, *l_ms;
		int seg_idx = start + i;

		l_ms = rte_fbarray_get(l_arr, seg_idx);
		p_ms = rte_fbarray_get(p_arr, seg_idx);

		if (l_ms == NULL || p_ms == NULL)
			return -1;

		if (used) {
			/* mirror the primary's allocation at the same VA */
			ret = alloc_seg(l_ms, p_ms->addr,
					p_ms->socket_id, hi,
					msl_idx, seg_idx);
			if (ret < 0)
				return -1;
			rte_fbarray_set_used(l_arr, seg_idx);
		} else {
			ret = free_seg(l_ms, hi, msl_idx, seg_idx);
			rte_fbarray_set_free(l_arr, seg_idx);
			if (ret < 0)
				return -1;
		}
	}

	/* if we just allocated memory, notify the application */
	if (used) {
		struct rte_memseg *ms;
		void *start_va;
		size_t len, page_sz;

		ms = rte_fbarray_get(l_arr, start);
		start_va = ms->addr;
		page_sz = (size_t)primary_msl->page_sz;
		len = page_sz * diff_len;

		eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC,
				start_va, len);
	}

	/* calculate how much we can advance until next chunk */
	diff_len = used ?
			rte_fbarray_find_contig_used(l_arr, start) :
			rte_fbarray_find_contig_free(l_arr, start);
	ret = RTE_MIN(chunk_len, diff_len);

	return ret;
}

static int
sync_status(struct rte_memseg_list *primary_msl,
		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
		unsigned int msl_idx, bool used)
{
	struct rte_fbarray *l_arr, *p_arr;
	int p_idx, l_chunk_len, p_chunk_len, ret;
	int start, end;

	/* this is a little bit tricky, but the basic idea is - walk both lists
	 * and spot any places where there are discrepancies. walking both lists
	 * and noting discrepancies in a single go is a hard problem, so we do
	 * it in two passes - first we spot any places where allocated segments
	 * mismatch (i.e. ensure that everything that's allocated in the primary
	 * is also allocated in the secondary), and then we do it by looking at
	 * free segments instead.
	 *
	 * we also need to aggregate changes into chunks, as we have to call
	 * callbacks per allocation, not per page.
127999a2dd95SBruce Richardson */ 128099a2dd95SBruce Richardson l_arr = &local_msl->memseg_arr; 128199a2dd95SBruce Richardson p_arr = &primary_msl->memseg_arr; 128299a2dd95SBruce Richardson 128399a2dd95SBruce Richardson if (used) 128499a2dd95SBruce Richardson p_idx = rte_fbarray_find_next_used(p_arr, 0); 128599a2dd95SBruce Richardson else 128699a2dd95SBruce Richardson p_idx = rte_fbarray_find_next_free(p_arr, 0); 128799a2dd95SBruce Richardson 128899a2dd95SBruce Richardson while (p_idx >= 0) { 128999a2dd95SBruce Richardson int next_chunk_search_idx; 129099a2dd95SBruce Richardson 129199a2dd95SBruce Richardson if (used) { 129299a2dd95SBruce Richardson p_chunk_len = rte_fbarray_find_contig_used(p_arr, 129399a2dd95SBruce Richardson p_idx); 129499a2dd95SBruce Richardson l_chunk_len = rte_fbarray_find_contig_used(l_arr, 129599a2dd95SBruce Richardson p_idx); 129699a2dd95SBruce Richardson } else { 129799a2dd95SBruce Richardson p_chunk_len = rte_fbarray_find_contig_free(p_arr, 129899a2dd95SBruce Richardson p_idx); 129999a2dd95SBruce Richardson l_chunk_len = rte_fbarray_find_contig_free(l_arr, 130099a2dd95SBruce Richardson p_idx); 130199a2dd95SBruce Richardson } 130299a2dd95SBruce Richardson /* best case scenario - no differences (or bigger, which will be 130399a2dd95SBruce Richardson * fixed during next iteration), look for next chunk 130499a2dd95SBruce Richardson */ 130599a2dd95SBruce Richardson if (l_chunk_len >= p_chunk_len) { 130699a2dd95SBruce Richardson next_chunk_search_idx = p_idx + p_chunk_len; 130799a2dd95SBruce Richardson goto next_chunk; 130899a2dd95SBruce Richardson } 130999a2dd95SBruce Richardson 131099a2dd95SBruce Richardson /* if both chunks start at the same point, skip parts we know 131199a2dd95SBruce Richardson * are identical, and sync the rest. 
each call to sync_chunk 131299a2dd95SBruce Richardson * will only sync contiguous segments, so we need to call this 131399a2dd95SBruce Richardson * until we are sure there are no more differences in this 131499a2dd95SBruce Richardson * chunk. 131599a2dd95SBruce Richardson */ 131699a2dd95SBruce Richardson start = p_idx + l_chunk_len; 131799a2dd95SBruce Richardson end = p_idx + p_chunk_len; 131899a2dd95SBruce Richardson do { 131999a2dd95SBruce Richardson ret = sync_chunk(primary_msl, local_msl, hi, msl_idx, 132099a2dd95SBruce Richardson used, start, end); 132199a2dd95SBruce Richardson start += ret; 132299a2dd95SBruce Richardson } while (start < end && ret >= 0); 132399a2dd95SBruce Richardson /* if ret is negative, something went wrong */ 132499a2dd95SBruce Richardson if (ret < 0) 132599a2dd95SBruce Richardson return -1; 132699a2dd95SBruce Richardson 132799a2dd95SBruce Richardson next_chunk_search_idx = p_idx + p_chunk_len; 132899a2dd95SBruce Richardson next_chunk: 132999a2dd95SBruce Richardson /* skip to end of this chunk */ 133099a2dd95SBruce Richardson if (used) { 133199a2dd95SBruce Richardson p_idx = rte_fbarray_find_next_used(p_arr, 133299a2dd95SBruce Richardson next_chunk_search_idx); 133399a2dd95SBruce Richardson } else { 133499a2dd95SBruce Richardson p_idx = rte_fbarray_find_next_free(p_arr, 133599a2dd95SBruce Richardson next_chunk_search_idx); 133699a2dd95SBruce Richardson } 133799a2dd95SBruce Richardson } 133899a2dd95SBruce Richardson return 0; 133999a2dd95SBruce Richardson } 134099a2dd95SBruce Richardson 134199a2dd95SBruce Richardson static int 134299a2dd95SBruce Richardson sync_existing(struct rte_memseg_list *primary_msl, 134399a2dd95SBruce Richardson struct rte_memseg_list *local_msl, struct hugepage_info *hi, 134499a2dd95SBruce Richardson unsigned int msl_idx) 134599a2dd95SBruce Richardson { 134699a2dd95SBruce Richardson int ret, dir_fd; 134799a2dd95SBruce Richardson 134899a2dd95SBruce Richardson /* do not allow any page allocations during the time 
we're allocating, 134999a2dd95SBruce Richardson * because file creation and locking operations are not atomic, 135099a2dd95SBruce Richardson * and we might be the first or the last ones to use a particular page, 135199a2dd95SBruce Richardson * so we need to ensure atomicity of every operation. 135299a2dd95SBruce Richardson */ 135399a2dd95SBruce Richardson dir_fd = open(hi->hugedir, O_RDONLY); 135499a2dd95SBruce Richardson if (dir_fd < 0) { 135599a2dd95SBruce Richardson RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", __func__, 135699a2dd95SBruce Richardson hi->hugedir, strerror(errno)); 135799a2dd95SBruce Richardson return -1; 135899a2dd95SBruce Richardson } 135999a2dd95SBruce Richardson /* blocking writelock */ 136099a2dd95SBruce Richardson if (flock(dir_fd, LOCK_EX)) { 136199a2dd95SBruce Richardson RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", __func__, 136299a2dd95SBruce Richardson hi->hugedir, strerror(errno)); 136399a2dd95SBruce Richardson close(dir_fd); 136499a2dd95SBruce Richardson return -1; 136599a2dd95SBruce Richardson } 136699a2dd95SBruce Richardson 136799a2dd95SBruce Richardson /* ensure all allocated space is the same in both lists */ 136899a2dd95SBruce Richardson ret = sync_status(primary_msl, local_msl, hi, msl_idx, true); 136999a2dd95SBruce Richardson if (ret < 0) 137099a2dd95SBruce Richardson goto fail; 137199a2dd95SBruce Richardson 137299a2dd95SBruce Richardson /* ensure all unallocated space is the same in both lists */ 137399a2dd95SBruce Richardson ret = sync_status(primary_msl, local_msl, hi, msl_idx, false); 137499a2dd95SBruce Richardson if (ret < 0) 137599a2dd95SBruce Richardson goto fail; 137699a2dd95SBruce Richardson 137799a2dd95SBruce Richardson /* update version number */ 137899a2dd95SBruce Richardson local_msl->version = primary_msl->version; 137999a2dd95SBruce Richardson 138099a2dd95SBruce Richardson close(dir_fd); 138199a2dd95SBruce Richardson 138299a2dd95SBruce Richardson return 0; 138399a2dd95SBruce Richardson fail: 
	close(dir_fd);
	return -1;
}

/* rte_memseg_list_walk() callback: synchronize one (internal) memseg list
 * with the primary if the version numbers differ. External lists are skipped.
 */
static int
sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *primary_msl, *local_msl;
	struct hugepage_info *hi = NULL;
	unsigned int i;
	int msl_idx;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (msl->external)
		return 0;

	msl_idx = msl - mcfg->memsegs;
	primary_msl = &mcfg->memsegs[msl_idx];
	local_msl = &local_memsegs[msl_idx];

	/* find the hugepage_info entry matching this list's page size */
	for (i = 0; i < RTE_DIM(internal_conf->hugepage_info); i++) {
		uint64_t cur_sz =
			internal_conf->hugepage_info[i].hugepage_sz;
		uint64_t msl_sz = primary_msl->page_sz;
		if (msl_sz == cur_sz) {
			hi = &internal_conf->hugepage_info[i];
			break;
		}
	}
	if (!hi) {
		RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
		return -1;
	}

	/* if versions don't match, synchronize everything */
	if (local_msl->version != primary_msl->version &&
			sync_existing(primary_msl, local_msl, hi, msl_idx))
		return -1;
	return 0;
}

/* Secondary-process entry point: replay the primary's allocation state into
 * this process's local memseg lists. No-op in the primary.
 */
int
eal_memalloc_sync_with_primary(void)
{
	/* nothing to be done in primary */
	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
		return 0;

	/* memalloc is locked, so it's safe to call thread-unsafe version */
	if (rte_memseg_list_walk_thread_unsafe(sync_walk, NULL))
		return -1;
	return 0;
}

/* Walk callback: create this secondary's private shadow fbarray for one
 * memseg list, mirroring the primary's geometry (len, elt_sz, base_va).
 */
static int
secondary_msl_create_walk(const struct rte_memseg_list *msl,
		void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *primary_msl, *local_msl;
	char name[PATH_MAX];
	int msl_idx, ret;

	if (msl->external)
		return 0;

	msl_idx = msl - mcfg->memsegs;
	primary_msl = &mcfg->memsegs[msl_idx];
	local_msl = &local_memsegs[msl_idx];

	/* create distinct fbarrays for each secondary */
	snprintf(name, RTE_FBARRAY_NAME_LEN, "%s_%i",
		primary_msl->memseg_arr.name, getpid());

	ret = rte_fbarray_init(&local_msl->memseg_arr, name,
		primary_msl->memseg_arr.len,
		primary_msl->memseg_arr.elt_sz);
	if (ret < 0) {
		RTE_LOG(ERR, EAL, "Cannot initialize local memory map\n");
		return -1;
	}
	local_msl->base_va = primary_msl->base_va;
	local_msl->len = primary_msl->len;

	return 0;
}

/* Walk callback: tear down this secondary's shadow fbarray for one list. */
static int
secondary_msl_destroy_walk(const struct rte_memseg_list *msl,
		void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *local_msl;
	int msl_idx, ret;

	if (msl->external)
		return 0;

	msl_idx = msl - mcfg->memsegs;
	local_msl = &local_memsegs[msl_idx];

	ret = rte_fbarray_destroy(&local_msl->memseg_arr);
	if (ret < 0) {
		RTE_LOG(ERR, EAL, "Cannot destroy local memory map\n");
		return -1;
	}
	local_msl->base_va = NULL;
	local_msl->len = 0;

	return 0;
}

/* Allocate the per-list fd bookkeeping in the file-scope fd_list table:
 * one fd slot per possible segment (all initialized to -1), or no per-segment
 * array at all in single-file-segments mode. Returns 0 or -1 on OOM.
 */
static int
alloc_list(int list_idx, int len)
{
	int *data;
	int i;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* single-file segments mode does not need fd list */
	if (!internal_conf->single_file_segments) {
		/* ensure we have space to store fd per each possible segment */
		data = malloc(sizeof(int) * len);
		if (data == NULL) {
			RTE_LOG(ERR, EAL, "Unable to allocate space for file descriptors\n");
			return -1;
		}
		/* set all fd's as invalid */
		for (i = 0; i < len; i++)
			data[i] = -1;
		fd_list[list_idx].fds = data;
		fd_list[list_idx].len = len;
	} else {
		fd_list[list_idx].fds = NULL;
		fd_list[list_idx].len = 0;
	}

	fd_list[list_idx].count = 0;
	fd_list[list_idx].memseg_list_fd = -1;

	return 0;
}

/* Close every fd tracked for one memseg list and release the fd array
 * (or the single per-list fd in single-file-segments mode).
 */
static int
destroy_list(int list_idx)
{
	const struct internal_config *internal_conf =
			eal_get_internal_configuration();

	/* single-file segments mode does not need fd list */
	if (!internal_conf->single_file_segments) {
		int *fds = fd_list[list_idx].fds;
		int i;
		/* go through each fd and ensure it's closed */
		for (i = 0; i < fd_list[list_idx].len; i++) {
			if (fds[i] >= 0) {
				close(fds[i]);
				fds[i] = -1;
			}
		}
		free(fds);
		fd_list[list_idx].fds = NULL;
		fd_list[list_idx].len = 0;
	} else if (fd_list[list_idx].memseg_list_fd >= 0) {
		close(fd_list[list_idx].memseg_list_fd);
		fd_list[list_idx].count = 0;
		fd_list[list_idx].memseg_list_fd = -1;
	}
	return 0;
}

/* Walk callback: set up fd bookkeeping for one internal memseg list. */
static int
fd_list_create_walk(const struct rte_memseg_list *msl,
		void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	unsigned int len;
	int msl_idx;

	if (msl->external)
		return 0;

	msl_idx = msl - mcfg->memsegs;
	len = msl->memseg_arr.len;

	return alloc_list(msl_idx, len);
}

/* Walk callback: tear down fd bookkeeping for one internal memseg list. */
static int
fd_list_destroy_walk(const struct rte_memseg_list *msl, void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int msl_idx;

	if (msl->external)
		return 0;

	msl_idx = msl - mcfg->memsegs;

	return destroy_list(msl_idx);
}

/* Record the fd backing one segment. Lazily allocates the fd array for the
 * list on first use. Returns 0, -ENOTSUP in single-file-segments mode, or
 * -ENOMEM if the fd array cannot be allocated.
 */
int
eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* single file segments mode doesn't support individual segment fd's */
	if (internal_conf->single_file_segments)
		return -ENOTSUP;

	/* if list is not allocated, allocate it */
	if (fd_list[list_idx].len == 0) {
		int len = mcfg->memsegs[list_idx].memseg_arr.len;

		if (alloc_list(list_idx, len) < 0)
			return -ENOMEM;
	}
	fd_list[list_idx].fds[seg_idx] = fd;
	return 0;
}

/* Record the single fd backing an entire memseg list (single-file-segments
 * mode only); -ENOTSUP otherwise.
 */
int
eal_memalloc_set_seg_list_fd(int list_idx, int fd)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* non-single file segment mode doesn't support segment list fd's */
	if (!internal_conf->single_file_segments)
		return -ENOTSUP;

	fd_list[list_idx].memseg_list_fd = fd;

	return 0;
}

/* Look up the fd backing a segment (or its whole list in single-file-segments
 * mode). Returns the fd, or a negative errno: -ENOTSUP when fd's are not
 * available in this configuration, -ENODEV when no fd has been recorded.
 */
int
eal_memalloc_get_seg_fd(int list_idx, int seg_idx)
{
	int fd;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (internal_conf->in_memory || internal_conf->no_hugetlbfs) {
#ifndef MEMFD_SUPPORTED
		/* in in-memory or no-huge mode, we rely on memfd support */
		return -ENOTSUP;
#endif
		/* memfd supported, but hugetlbfs memfd may not be */
		if (!internal_conf->no_hugetlbfs && !memfd_create_supported)
			return -ENOTSUP;
	}

	if (internal_conf->single_file_segments) {
		fd = fd_list[list_idx].memseg_list_fd;
	} else if (fd_list[list_idx].len == 0) {
		/* list not initialized */
		fd = -1;
	} else {
		fd = fd_list[list_idx].fds[seg_idx];
	}
	if (fd < 0)
		return -ENODEV;
	return fd;
}

/* Runtime probe for hugetlbfs-backed memfd_create(). Returns 1 if supported,
 * 0 if not (also clearing the file-scope memfd_create_supported flag on
 * EINVAL), -1 on unexpected error.
 *
 * NOTE: every path through the loop body returns, so only the first
 * configured hugepage size is actually probed.
 */
static int
test_memfd_create(void)
{
#ifdef MEMFD_SUPPORTED
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();
	unsigned int i;
	for (i = 0; i < internal_conf->num_hugepage_sizes; i++) {
		uint64_t pagesz = internal_conf->hugepage_info[i].hugepage_sz;
		int pagesz_flag = pagesz_flags(pagesz);
		int flags;

		flags = pagesz_flag | RTE_MFD_HUGETLB;
		int fd = memfd_create("test", flags);
		if (fd < 0) {
			/* we failed - let memalloc know this isn't working */
			if (errno == EINVAL) {
				memfd_create_supported = 0;
				return 0; /* not supported */
			}

			/* we got other error - something's wrong */
			return -1; /* error */
		}
		close(fd);
		return 1; /* supported */
	}
#endif
	return 0; /* not supported */
}

/* Compute the offset of a segment within its backing file. Offset is the
 * segment's byte position in single-file-segments mode, 0 otherwise (each
 * segment has its own file). Returns 0 or a negative errno (-ENOTSUP,
 * -ENOENT for an inactive segment, -ENODEV for an uninitialized fd list).
 */
int
eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (internal_conf->in_memory || internal_conf->no_hugetlbfs) {
#ifndef MEMFD_SUPPORTED
		/* in in-memory or no-huge mode, we rely on memfd support */
		return -ENOTSUP;
#endif
		/* memfd supported, but hugetlbfs memfd may not be */
		if (!internal_conf->no_hugetlbfs && !memfd_create_supported)
			return -ENOTSUP;
	}

	if (internal_conf->single_file_segments) {
		size_t pgsz = mcfg->memsegs[list_idx].page_sz;

		/* segment not active? */
		if (fd_list[list_idx].memseg_list_fd < 0)
			return -ENOENT;
		*offset = pgsz * seg_idx;
	} else {
		/* fd_list not initialized? */
		if (fd_list[list_idx].len == 0)
			return -ENODEV;

		/* segment not active? */
		if (fd_list[list_idx].fds[seg_idx] < 0)
			return -ENOENT;
		*offset = 0;
	}
	return 0;
}

/* Release all per-process memalloc state: every tracked fd, and (in a
 * secondary) the shadow memseg page tables.
 */
int
eal_memalloc_cleanup(void)
{
	/* close all remaining fd's - these are per-process, so it's safe */
	if (rte_memseg_list_walk_thread_unsafe(fd_list_destroy_walk, NULL))
		return -1;

	/* destroy the shadow page table if we're a secondary process */
	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
		return 0;

	if (rte_memseg_list_walk_thread_unsafe(secondary_msl_destroy_walk,
			NULL))
		return -1;

	return 0;
}

/* Initialize memalloc: shadow memseg lists in secondaries, memfd probing and
 * configuration sanity checks for in-memory primaries, and fd bookkeeping
 * for every memseg list. Returns 0 on success, -1 on failure.
 */
int
eal_memalloc_init(void)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
		if (rte_memseg_list_walk(secondary_msl_create_walk, NULL) < 0)
			return -1;
	if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
			internal_conf->in_memory) {
		int mfd_res = test_memfd_create();

		if (mfd_res < 0) {
			RTE_LOG(ERR, EAL, "Unable to check if memfd is supported\n");
			return -1;
		}
		if (mfd_res == 1)
			RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n");
		else
			RTE_LOG(INFO, EAL, "Using memfd is not supported, falling back to anonymous hugepages\n");

		/* we only support single-file segments mode with in-memory mode
		 * if we support hugetlbfs with memfd_create. this code will
		 * test if we do.
		 */
		if (internal_conf->single_file_segments &&
				mfd_res != 1) {
			RTE_LOG(ERR, EAL, "Single-file segments mode cannot be used without memfd support\n");
			return -1;
		}
		/* this cannot ever happen but better safe than sorry */
		if (!anonymous_hugepages_supported) {
			RTE_LOG(ERR, EAL, "Using anonymous memory is not supported\n");
			return -1;
		}
		/* safety net, should be impossible to configure */
		if (internal_conf->hugepage_file.unlink_before_mapping &&
				!internal_conf->hugepage_file.unlink_existing) {
			RTE_LOG(ERR, EAL, "Unlinking existing hugepage files is prohibited, cannot unlink them before mapping.\n");
			return -1;
		}
	}

	/* initialize all of the fd lists */
	if (rte_memseg_list_walk(fd_list_create_walk, NULL))
		return -1;
	return 0;
}