/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2017-2018 Intel Corporation
 */

#include <errno.h>
#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/file.h>
#include <unistd.h>
#include <limits.h>
#include <fcntl.h>
#include <signal.h>
#include <setjmp.h>
#ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */
#include <linux/memfd.h>
#define MEMFD_SUPPORTED
#endif
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
#include <numa.h>
#include <numaif.h>
#endif
#include <linux/falloc.h>
#include <linux/mman.h> /* for hugetlb-related mmap flags */

#include <rte_common.h>
#include <rte_log.h>
#include <rte_eal.h>
#include <rte_memory.h>

#include "eal_filesystem.h"
#include "eal_internal_cfg.h"
#include "eal_memalloc.h"
#include "eal_memcfg.h"
#include "eal_private.h"

const int anonymous_hugepages_supported =
#ifdef MAP_HUGE_SHIFT
		1;
#define RTE_MAP_HUGE_SHIFT MAP_HUGE_SHIFT
#else
		0;
#define RTE_MAP_HUGE_SHIFT 26
#endif
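/* note: 26 matches the kernel's MAP_HUGE_SHIFT value, so flags computed with
 * the fallback keep the kernel's encoding even with older headers.
 */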

/*
 * we've already checked memfd support at compile-time, but we also need to
 * check if we can create hugepage files with memfd.
 *
 * also, this is not a constant, because while we may be *compiled* with memfd
 * hugetlbfs support, we might not be *running* on a system that supports memfd
 * and/or memfd with hugetlbfs, so we need to be able to adjust this flag at
 * runtime, and fall back to anonymous memory.
 */
static int memfd_create_supported =
#ifdef MFD_HUGETLB
		1;
#define RTE_MFD_HUGETLB MFD_HUGETLB
#else
		0;
#define RTE_MFD_HUGETLB 4U
#endif
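/* note: 4U matches the kernel's MFD_HUGETLB value (0x0004U), so the flag
 * keeps the kernel's encoding even when built against older headers.
 */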

/*
 * not all kernel versions support fallocate on hugetlbfs, so fall back to
 * ftruncate and disallow deallocation if fallocate is not supported.
 */
static int fallocate_supported = -1; /* unknown */

/*
 * we have two modes - single file segments, and file-per-page mode.
 *
 * for single-file segments, we use memseg_list_fd to store the segment fd,
 * while the fds[] will not be allocated, and len will be set to 0.
 *
 * for file-per-page mode, each page will have its own fd, so 'memseg_list_fd'
 * will be invalid (set to -1), and we'll use 'fds' to keep track of page fd's.
 *
 * we cannot know how many pages a system will have in advance, but we do know
 * that they come in lists, and we know lengths of these lists. so, simply store
 * a malloc'd array of fd's indexed by list and segment index.
 *
 * they will be initialized at startup, and filled as we allocate/deallocate
 * segments.
 */
static struct {
	int *fds; /**< dynamically allocated array of segment fd's */
	int memseg_list_fd; /**< memseg list fd */
	int len; /**< total length of the array */
	int count; /**< number of entries used in the array */
} fd_list[RTE_MAX_MEMSEG_LISTS];
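/* example: in file-per-page mode, the fd backing segment 's' of list 'l' is
 * fd_list[l].fds[s]; in single-file mode it is fd_list[l].memseg_list_fd.
 */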

/** local copy of a memory map, used to synchronize memory hotplug in MP */
static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS];

static sigjmp_buf huge_jmpenv;

static void huge_sigbus_handler(int signo __rte_unused)
{
	siglongjmp(huge_jmpenv, 1);
}

/* Wrap sigsetjmp in its own function to avoid a compilation error: any
 * non-volatile, non-static local variable in the stack frame calling sigsetjmp
 * might be clobbered by a call to longjmp.
 */
static int huge_wrap_sigsetjmp(void)
{
	return sigsetjmp(huge_jmpenv, 1);
}

static struct sigaction huge_action_old;
static int huge_need_recover;

static void
huge_register_sigbus(void)
{
	sigset_t mask;
	struct sigaction action;

	sigemptyset(&mask);
	sigaddset(&mask, SIGBUS);
	action.sa_flags = 0;
	action.sa_mask = mask;
	action.sa_handler = huge_sigbus_handler;

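	/* sigaction() returns 0 on success, so recovery of the old handler is
	 * only flagged if our SIGBUS handler was actually installed.
	 */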
	huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
}

static void
huge_recover_sigbus(void)
{
	if (huge_need_recover) {
		sigaction(SIGBUS, &huge_action_old, NULL);
		huge_need_recover = 0;
	}
}

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
static bool
check_numa(void)
{
	bool ret = true;
	/* Check if kernel supports NUMA. */
	if (numa_available() != 0) {
		EAL_LOG(DEBUG, "NUMA is not supported.");
		ret = false;
	}
	return ret;
}

static void
prepare_numa(int *oldpolicy, struct bitmask *oldmask, int socket_id)
{
	EAL_LOG(DEBUG, "Trying to obtain current memory policy.");
	if (get_mempolicy(oldpolicy, oldmask->maskp,
			oldmask->size + 1, 0, 0) < 0) {
		EAL_LOG(ERR,
			"Failed to get current mempolicy: %s. "
			"Assuming MPOL_DEFAULT.", strerror(errno));
		*oldpolicy = MPOL_DEFAULT;
	}
	EAL_LOG(DEBUG,
		"Setting policy MPOL_PREFERRED for socket %d",
		socket_id);
	numa_set_preferred(socket_id);
}

static void
restore_numa(int *oldpolicy, struct bitmask *oldmask)
{
	EAL_LOG(DEBUG,
		"Restoring previous memory policy: %d", *oldpolicy);
	if (*oldpolicy == MPOL_DEFAULT) {
		numa_set_localalloc();
	} else if (set_mempolicy(*oldpolicy, oldmask->maskp,
			oldmask->size + 1) < 0) {
		EAL_LOG(ERR, "Failed to restore mempolicy: %s",
			strerror(errno));
		numa_set_localalloc();
	}
	numa_free_cpumask(oldmask);
}
#endif

/*
 * uses fstat to report the size of a file on disk
 */
static off_t
get_file_size(int fd)
{
	struct stat st;
	if (fstat(fd, &st) < 0)
		return 0;
	return st.st_size;
}

static int
pagesz_flags(uint64_t page_sz)
{
	/* as per mmap() manpage, all page sizes are log2 of page size
	 * shifted by MAP_HUGE_SHIFT
	 */
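	/* e.g. for 2 MB pages, rte_log2_u64(2 * 1024 * 1024) == 21, so the
	 * result is 21 << MAP_HUGE_SHIFT, the kernel's MAP_HUGE_2MB encoding.
	 */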
	int log2 = rte_log2_u64(page_sz);
	return log2 << RTE_MAP_HUGE_SHIFT;
}

/* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */
static int lock(int fd, int type)
{
	int ret;

	/* flock may be interrupted */
	do {
		ret = flock(fd, type | LOCK_NB);
	} while (ret && errno == EINTR);

	if (ret && errno == EWOULDBLOCK) {
		/* couldn't lock */
		return 0;
	} else if (ret) {
		EAL_LOG(ERR, "%s(): error calling flock(): %s",
			__func__, strerror(errno));
		return -1;
	}
	/* lock was successful */
	return 1;
}

static int
get_seg_memfd(struct hugepage_info *hi __rte_unused,
		unsigned int list_idx __rte_unused,
		unsigned int seg_idx __rte_unused)
{
#ifdef MEMFD_SUPPORTED
	int fd;
	char segname[250]; /* as per manpage, limit is 249 bytes plus null */

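	/* the memfd must be created with a hugetlb flag and the page size
	 * encoded the same way as for mmap() (see pagesz_flags()).
	 */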
	int flags = RTE_MFD_HUGETLB | pagesz_flags(hi->hugepage_sz);
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (internal_conf->single_file_segments) {
		fd = fd_list[list_idx].memseg_list_fd;

		if (fd < 0) {
			snprintf(segname, sizeof(segname), "seg_%i", list_idx);
			fd = memfd_create(segname, flags);
			if (fd < 0) {
				EAL_LOG(DEBUG, "%s(): memfd create failed: %s",
					__func__, strerror(errno));
				return -1;
			}
			fd_list[list_idx].memseg_list_fd = fd;
		}
	} else {
		fd = fd_list[list_idx].fds[seg_idx];

		if (fd < 0) {
			snprintf(segname, sizeof(segname), "seg_%i-%i",
					list_idx, seg_idx);
			fd = memfd_create(segname, flags);
			if (fd < 0) {
				EAL_LOG(DEBUG, "%s(): memfd create failed: %s",
					__func__, strerror(errno));
				return -1;
			}
			fd_list[list_idx].fds[seg_idx] = fd;
		}
	}
	return fd;
#endif
	return -1;
}

static int
get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
		unsigned int list_idx, unsigned int seg_idx,
		bool *dirty)
{
	int fd;
	int *out_fd;
	struct stat st;
	int ret;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (dirty != NULL)
		*dirty = false;

	/* for in-memory mode, we only make it here when we're sure we support
	 * memfd, and this is a special case.
	 */
	if (internal_conf->in_memory)
		return get_seg_memfd(hi, list_idx, seg_idx);

	if (internal_conf->single_file_segments) {
		out_fd = &fd_list[list_idx].memseg_list_fd;
		eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx);
	} else {
		out_fd = &fd_list[list_idx].fds[seg_idx];
		eal_get_hugefile_path(path, buflen, hi->hugedir,
				list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
	}
	fd = *out_fd;
	if (fd >= 0)
		return fd;

	/*
	 * There is no TOCTOU between stat() and unlink()/open()
	 * because the hugepage directory is locked.
	 */
	ret = stat(path, &st);
	if (ret < 0 && errno != ENOENT) {
		EAL_LOG(DEBUG, "%s(): stat() for '%s' failed: %s",
			__func__, path, strerror(errno));
		return -1;
	}
	if (!internal_conf->hugepage_file.unlink_existing && ret == 0 &&
			dirty != NULL)
		*dirty = true;

	/*
	 * The kernel clears a hugepage only when it is mapped
	 * from a particular file for the first time.
	 * If the file already exists, the old content will be mapped.
	 * If the memory manager assumes all mapped pages to be clean,
	 * the file must be removed and created anew.
	 * Otherwise, the primary caller must be notified
	 * that mapped pages will be dirty
	 * (secondary callers receive the segment state from the primary one).
	 * When multiple hugepages are mapped from the same file,
	 * whether they will be dirty depends on the part that is mapped.
	 */
	if (!internal_conf->single_file_segments &&
			internal_conf->hugepage_file.unlink_existing &&
			rte_eal_process_type() == RTE_PROC_PRIMARY &&
			ret == 0) {
		/* coverity[toctou] */
		if (unlink(path) < 0) {
			EAL_LOG(DEBUG, "%s(): could not remove '%s': %s",
				__func__, path, strerror(errno));
			return -1;
		}
	}

	/* coverity[toctou] */
	fd = open(path, O_CREAT | O_RDWR, 0600);
	if (fd < 0) {
		EAL_LOG(ERR, "%s(): open '%s' failed: %s",
			__func__, path, strerror(errno));
		return -1;
	}
	/* take out a read lock */
	if (lock(fd, LOCK_SH) < 0) {
		EAL_LOG(ERR, "%s(): lock '%s' failed: %s",
			__func__, path, strerror(errno));
		close(fd);
		return -1;
	}
	*out_fd = fd;
	return fd;
}

static int
resize_hugefile_in_memory(int fd, uint64_t fa_offset,
		uint64_t page_sz, bool grow)
{
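	/* FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE deallocates the range,
	 * returning the hugepage to the kernel without shrinking the file,
	 * while the default mode (0) allocates it.
	 */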
	int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
			FALLOC_FL_KEEP_SIZE;
	int ret;

	/* grow or shrink the file */
	ret = fallocate(fd, flags, fa_offset, page_sz);

	if (ret < 0) {
		EAL_LOG(DEBUG, "%s(): fallocate() failed: %s",
				__func__,
				strerror(errno));
		return -1;
	}
	return 0;
}

static int
resize_hugefile_in_filesystem(int fd, uint64_t fa_offset, uint64_t page_sz,
		bool grow, bool *dirty)
{
	const struct internal_config *internal_conf =
			eal_get_internal_configuration();
	bool again = false;

	do {
		if (fallocate_supported == 0) {
			/* we cannot deallocate memory if fallocate() is not
			 * supported, and hugepage file is already locked at
			 * creation, so no further synchronization needed.
			 */

			if (!grow) {
				EAL_LOG(DEBUG, "%s(): fallocate not supported, not freeing page back to the system",
					__func__);
				return -1;
			}
			uint64_t new_size = fa_offset + page_sz;
			uint64_t cur_size = get_file_size(fd);

			/* fallocate isn't supported, fall back to ftruncate */
			if (dirty != NULL)
				*dirty = new_size <= cur_size;
			if (new_size > cur_size &&
					ftruncate(fd, new_size) < 0) {
				EAL_LOG(DEBUG, "%s(): ftruncate() failed: %s",
					__func__, strerror(errno));
				return -1;
			}
		} else {
			int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
					FALLOC_FL_KEEP_SIZE;
			int ret;

			/*
			 * technically, it is perfectly safe for both primary
			 * and secondary to grow and shrink the page files:
			 * growing the file repeatedly has no effect because
			 * a page can only be allocated once, while mmap ensures
			 * that secondaries hold on to the page even after the
			 * page itself is removed from the filesystem.
			 *
			 * however, leaving growing/shrinking to the primary
			 * tends to expose bugs in fdlist page count handling,
			 * so leave this here just in case.
			 */
			if (rte_eal_process_type() != RTE_PROC_PRIMARY)
				return 0;

			/* grow or shrink the file */
			ret = fallocate(fd, flags, fa_offset, page_sz);

			if (ret < 0) {
				if (fallocate_supported == -1 &&
						errno == ENOTSUP) {
					EAL_LOG(ERR, "%s(): fallocate() not supported, hugepage deallocation will be disabled",
						__func__);
					again = true;
					fallocate_supported = 0;
				} else {
					EAL_LOG(DEBUG, "%s(): fallocate() failed: %s",
						__func__,
						strerror(errno));
					return -1;
				}
			} else {
				fallocate_supported = 1;
				/*
				 * It is unknown which portions of an existing
				 * hugepage file were allocated previously,
				 * so all pages within the file are considered
				 * dirty, unless the file is a fresh one.
				 */
				if (dirty != NULL)
					*dirty &= !internal_conf->hugepage_file.unlink_existing;
			}
		}
	} while (again);

	return 0;
}

static void
close_hugefile(int fd, char *path, int list_idx)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();
	/*
	 * primary process must unlink the file, but only when not in in-memory
	 * mode (as in that case there is no file to unlink).
	 */
	if (!internal_conf->in_memory &&
			rte_eal_process_type() == RTE_PROC_PRIMARY &&
			unlink(path))
		EAL_LOG(ERR, "%s(): unlinking '%s' failed: %s",
			__func__, path, strerror(errno));

	close(fd);
	fd_list[list_idx].memseg_list_fd = -1;
}

static int
resize_hugefile(int fd, uint64_t fa_offset, uint64_t page_sz, bool grow,
		bool *dirty)
{
	/* in-memory mode is a special case, because we can be sure that
	 * fallocate() is supported.
	 */
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (internal_conf->in_memory) {
		if (dirty != NULL)
			*dirty = false;
		return resize_hugefile_in_memory(fd, fa_offset,
				page_sz, grow);
	}

	return resize_hugefile_in_filesystem(fd, fa_offset, page_sz,
			grow, dirty);
}

static int
alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
		struct hugepage_info *hi, unsigned int list_idx,
		unsigned int seg_idx)
{
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	int cur_socket_id = 0;
#endif
	uint64_t map_offset;
	rte_iova_t iova;
	void *va;
	char path[PATH_MAX];
	int ret = 0;
	int fd;
	bool dirty;
	size_t alloc_sz;
	int flags;
	void *new_addr;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	alloc_sz = hi->hugepage_sz;

	/* these are checked at init, but code analyzers don't know that */
	if (internal_conf->in_memory && !anonymous_hugepages_supported) {
		EAL_LOG(ERR, "Anonymous hugepages not supported, in-memory mode cannot allocate memory");
		return -1;
	}
	if (internal_conf->in_memory && !memfd_create_supported &&
			internal_conf->single_file_segments) {
		EAL_LOG(ERR, "Single-file segments are not supported without memfd support");
		return -1;
	}

	/* in-memory without memfd is a special case */
	int mmap_flags;

	if (internal_conf->in_memory && !memfd_create_supported) {
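		/* without memfd, in-memory mode falls back to anonymous
		 * hugepages: there is no fd, and no file whose old contents
		 * could make the page dirty.
		 */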
		const int in_memory_flags = MAP_HUGETLB | MAP_FIXED |
				MAP_PRIVATE | MAP_ANONYMOUS;
		int pagesz_flag;

		pagesz_flag = pagesz_flags(alloc_sz);
		fd = -1;
		dirty = false;
		mmap_flags = in_memory_flags | pagesz_flag;

		/* single-file segments codepath will never be active
		 * here because in-memory mode is incompatible with the
		 * fallback path, and it's stopped at EAL initialization
		 * stage.
		 */
		map_offset = 0;
	} else {
		/* takes out a read lock on segment or segment list */
		fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx,
				&dirty);
		if (fd < 0) {
			EAL_LOG(ERR, "Couldn't get fd on hugepage file");
			return -1;
		}

		if (internal_conf->single_file_segments) {
			map_offset = seg_idx * alloc_sz;
			ret = resize_hugefile(fd, map_offset, alloc_sz, true,
					&dirty);
			if (ret < 0)
				goto resized;

			fd_list[list_idx].count++;
		} else {
			map_offset = 0;
			if (ftruncate(fd, alloc_sz) < 0) {
				EAL_LOG(DEBUG, "%s(): ftruncate() failed: %s",
					__func__, strerror(errno));
				goto resized;
			}
			if (internal_conf->hugepage_file.unlink_before_mapping &&
					!internal_conf->in_memory) {
				if (unlink(path)) {
					EAL_LOG(DEBUG, "%s(): unlink() failed: %s",
						__func__, strerror(errno));
					goto resized;
				}
			}
		}
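		/* MAP_POPULATE asks the kernel to prefault the page, but
		 * hugetlb limits may still be enforced at fault time - see
		 * the SIGBUS handling below.
		 */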
		mmap_flags = MAP_SHARED | MAP_POPULATE | MAP_FIXED;
	}

	huge_register_sigbus();

	/*
	 * map the segment, and populate page tables, the kernel fills
	 * this segment with zeros if it's a new page.
	 */
	va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, mmap_flags, fd,
			map_offset);

	if (va == MAP_FAILED) {
		EAL_LOG(DEBUG, "%s(): mmap() failed: %s", __func__,
			strerror(errno));
		/* mmap failed, but the previous region might have been
		 * unmapped anyway. try to remap it
		 */
		goto unmapped;
	}
	if (va != addr) {
		EAL_LOG(DEBUG, "%s(): wrong mmap() address", __func__);
		munmap(va, alloc_sz);
		goto resized;
	}

	/* In Linux, hugetlb limitations, like cgroup, are
	 * enforced at fault time instead of mmap(), even
	 * with the option of MAP_POPULATE. The kernel will send
	 * a SIGBUS signal. To avoid being killed, save the stack
	 * environment here; if SIGBUS happens, we can jump
	 * back here.
	 */
	if (huge_wrap_sigsetjmp()) {
		EAL_LOG(DEBUG, "SIGBUS: Cannot mmap more hugepages of size %uMB",
			(unsigned int)(alloc_sz >> 20));
		goto mapped;
	}

	/* we need to trigger a write to the page to enforce a page fault and
	 * ensure that the page is accessible to us, but we can't overwrite the
	 * value that is already there, so read the old value and write it back.
	 * the kernel populates the page with zeroes initially.
	 */
	*(volatile int *)addr = *(volatile int *)addr;

	iova = rte_mem_virt2iova(addr);
	if (iova == RTE_BAD_PHYS_ADDR) {
		EAL_LOG(DEBUG, "%s(): can't get IOVA addr",
			__func__);
		goto mapped;
	}

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	/*
	 * If the kernel has been built without NUMA support, get_mempolicy()
	 * will return an error. If check_numa() returns false, memory
	 * allocation is not NUMA aware and the socket_id should not be
	 * checked.
	 */
	if (check_numa()) {
		ret = get_mempolicy(&cur_socket_id, NULL, 0, addr,
					MPOL_F_NODE | MPOL_F_ADDR);
		if (ret < 0) {
			EAL_LOG(DEBUG, "%s(): get_mempolicy: %s",
				__func__, strerror(errno));
			goto mapped;
		} else if (cur_socket_id != socket_id) {
			EAL_LOG(DEBUG,
				"%s(): allocation happened on wrong socket (wanted %d, got %d)",
				__func__, socket_id, cur_socket_id);
			goto mapped;
		}
	}
#else
	if (rte_socket_count() > 1)
		EAL_LOG(DEBUG, "%s(): not checking hugepage NUMA node.",
				__func__);
#endif

	huge_recover_sigbus();

	ms->addr = addr;
	ms->hugepage_sz = alloc_sz;
	ms->len = alloc_sz;
	ms->nchannel = rte_memory_get_nchannel();
	ms->nrank = rte_memory_get_nrank();
	ms->iova = iova;
	ms->socket_id = socket_id;
	ms->flags = dirty ? RTE_MEMSEG_FLAG_DIRTY : 0;

	return 0;

mapped:
	munmap(addr, alloc_sz);
unmapped:
	huge_recover_sigbus();
	flags = EAL_RESERVE_FORCE_ADDRESS;
	new_addr = eal_get_virtual_area(addr, &alloc_sz, alloc_sz, 0, flags);
	if (new_addr != addr) {
		if (new_addr != NULL)
			munmap(new_addr, alloc_sz);
		/* we're leaving a hole in our virtual address space. if
		 * somebody else maps this hole now, we could accidentally
		 * overwrite it in the future.
		 */
		EAL_LOG(CRIT, "Can't mmap holes in our virtual address space");
	}
	/* roll back the ref count */
	if (internal_conf->single_file_segments)
		fd_list[list_idx].count--;
resized:
	/* some codepaths will return negative fd, so exit early */
	if (fd < 0)
		return -1;

	if (internal_conf->single_file_segments) {
		resize_hugefile(fd, map_offset, alloc_sz, false, NULL);
		/* ignore failure, can't make it any worse */

		/* if refcount is at zero, close the file */
		if (fd_list[list_idx].count == 0)
			close_hugefile(fd, path, list_idx);
	} else {
		/* only remove file if we can take out a write lock */
		if (!internal_conf->hugepage_file.unlink_before_mapping &&
				internal_conf->in_memory == 0 &&
				lock(fd, LOCK_EX) == 1)
			unlink(path);
		close(fd);
		fd_list[list_idx].fds[seg_idx] = -1;
	}
	return -1;
}

static int
free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
		unsigned int list_idx, unsigned int seg_idx)
{
	uint64_t map_offset;
	char path[PATH_MAX];
	int fd, ret = 0;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* erase page data */
	memset(ms->addr, 0, ms->len);

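	/* remap the segment as anonymous PROT_NONE memory so the VA range
	 * stays reserved while the hugepage itself is released.
	 */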
	if (mmap(ms->addr, ms->len, PROT_NONE,
			MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) ==
				MAP_FAILED) {
		EAL_LOG(DEBUG, "couldn't unmap page");
		return -1;
	}

	eal_mem_set_dump(ms->addr, ms->len, false);

	/* if we're using anonymous hugepages, nothing to be done */
	if (internal_conf->in_memory && !memfd_create_supported) {
		memset(ms, 0, sizeof(*ms));
		return 0;
	}

	/* if we are not in single file segments mode, we're going to unmap the
	 * segment and thus drop the lock on original fd, but hugepage dir is
	 * now locked so we can take out another one without races.
	 */
	fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx, NULL);
	if (fd < 0)
		return -1;

	if (internal_conf->single_file_segments) {
		map_offset = seg_idx * ms->len;
		if (resize_hugefile(fd, map_offset, ms->len, false, NULL))
			return -1;

		if (--(fd_list[list_idx].count) == 0)
			close_hugefile(fd, path, list_idx);

		ret = 0;
	} else {
		/* if we're able to take out a write lock, we're the last one
		 * holding onto this page.
		 */
		if (!internal_conf->in_memory &&
				internal_conf->hugepage_file.unlink_existing &&
				!internal_conf->hugepage_file.unlink_before_mapping) {
			ret = lock(fd, LOCK_EX);
			if (ret >= 0) {
				/* no one else is using this page */
				if (ret == 1)
					unlink(path);
			}
		}
		/* closing fd will drop the lock */
		close(fd);
		fd_list[list_idx].fds[seg_idx] = -1;
	}

	memset(ms, 0, sizeof(*ms));

	return ret < 0 ? -1 : 0;
}

struct alloc_walk_param {
	struct hugepage_info *hi;
	struct rte_memseg **ms;
	size_t page_sz;
	unsigned int segs_allocated;
	unsigned int n_segs;
	int socket;
	bool exact;
};
static int
alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct alloc_walk_param *wa = arg;
	struct rte_memseg_list *cur_msl;
	size_t page_sz;
	int cur_idx, start_idx, j, dir_fd = -1;
	unsigned int msl_idx, need, i;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (msl->page_sz != wa->page_sz)
		return 0;
	if (msl->socket_id != wa->socket)
		return 0;

	page_sz = (size_t)msl->page_sz;

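	/* msl points into the mcfg->memsegs array, so pointer arithmetic
	 * recovers the list index.
	 */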
	msl_idx = msl - mcfg->memsegs;
	cur_msl = &mcfg->memsegs[msl_idx];

	need = wa->n_segs;

	/* try finding space in memseg list */
	if (wa->exact) {
		/* if we require exact number of pages in a list, find them */
		cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0,
				need);
		if (cur_idx < 0)
			return 0;
		start_idx = cur_idx;
	} else {
		int cur_len;

		/* we don't require exact number of pages, so we're going to go
		 * for best-effort allocation. that means finding the biggest
		 * unused block, and going with that.
		 */
		cur_idx = rte_fbarray_find_biggest_free(&cur_msl->memseg_arr,
				0);
		if (cur_idx < 0)
			return 0;
		start_idx = cur_idx;
		/* adjust the size to possibly be smaller than original
		 * request, but do not allow it to be bigger.
		 */
		cur_len = rte_fbarray_find_contig_free(&cur_msl->memseg_arr,
				cur_idx);
		need = RTE_MIN(need, (unsigned int)cur_len);
	}

	/* do not allow any page allocations during the time we're allocating,
	 * because file creation and locking operations are not atomic,
	 * and we might be the first or the last ones to use a particular page,
	 * so we need to ensure atomicity of every operation.
	 *
	 * during init, we already hold a write lock, so don't try to take out
	 * another one.
	 */
	if (wa->hi->lock_descriptor == -1 && !internal_conf->in_memory) {
		dir_fd = open(wa->hi->hugedir, O_RDONLY);
		if (dir_fd < 0) {
			EAL_LOG(ERR, "%s(): Cannot open '%s': %s",
				__func__, wa->hi->hugedir, strerror(errno));
			return -1;
		}
		/* blocking writelock */
		if (flock(dir_fd, LOCK_EX)) {
			EAL_LOG(ERR, "%s(): Cannot lock '%s': %s",
				__func__, wa->hi->hugedir, strerror(errno));
			close(dir_fd);
			return -1;
		}
	}

	for (i = 0; i < need; i++, cur_idx++) {
		struct rte_memseg *cur;
		void *map_addr;

		cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx);
		map_addr = RTE_PTR_ADD(cur_msl->base_va,
				cur_idx * page_sz);

		if (alloc_seg(cur, map_addr, wa->socket, wa->hi,
				msl_idx, cur_idx)) {
			EAL_LOG(DEBUG, "attempted to allocate %i segments, but only %i were allocated",
				need, i);

			/* if exact number wasn't requested, stop */
			if (!wa->exact)
				goto out;

			/* clean up */
			for (j = start_idx; j < cur_idx; j++) {
				struct rte_memseg *tmp;
				struct rte_fbarray *arr =
						&cur_msl->memseg_arr;

				tmp = rte_fbarray_get(arr, j);
				rte_fbarray_set_free(arr, j);

				/* free_seg may attempt to create a file, which
				 * may fail.
				 */
				if (free_seg(tmp, wa->hi, msl_idx, j))
					EAL_LOG(DEBUG, "Cannot free page");
			}
			/* clear the list */
			if (wa->ms)
				memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs);

			if (dir_fd >= 0)
				close(dir_fd);
			return -1;
		}
		if (wa->ms)
			wa->ms[i] = cur;

		rte_fbarray_set_used(&cur_msl->memseg_arr, cur_idx);
	}
out:
	wa->segs_allocated = i;
	if (i > 0)
		cur_msl->version++;
	if (dir_fd >= 0)
		close(dir_fd);
	/* if we didn't allocate any segments, move on to the next list */
	return i > 0;
}

struct free_walk_param {
	struct hugepage_info *hi;
	struct rte_memseg *ms;
};
static int
free_seg_walk(const struct rte_memseg_list *msl, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *found_msl;
	struct free_walk_param *wa = arg;
	uintptr_t start_addr, end_addr;
	int msl_idx, seg_idx, ret, dir_fd = -1;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	start_addr = (uintptr_t) msl->base_va;
	end_addr = start_addr + msl->len;

	if ((uintptr_t)wa->ms->addr < start_addr ||
			(uintptr_t)wa->ms->addr >= end_addr)
		return 0;

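	/* derive the list index by pointer arithmetic, and the segment index
	 * from the address offset within the list's VA area.
	 */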
96699a2dd95SBruce Richardson msl_idx = msl - mcfg->memsegs;
96799a2dd95SBruce Richardson seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz;
96899a2dd95SBruce Richardson
96999a2dd95SBruce Richardson /* msl is const */
97099a2dd95SBruce Richardson found_msl = &mcfg->memsegs[msl_idx];
97199a2dd95SBruce Richardson
97299a2dd95SBruce Richardson /* do not allow any page allocations during the time we're freeing,
97399a2dd95SBruce Richardson * because file creation and locking operations are not atomic,
97499a2dd95SBruce Richardson * and we might be the first or the last ones to use a particular page,
97599a2dd95SBruce Richardson * so we need to ensure atomicity of every operation.
97699a2dd95SBruce Richardson *
97799a2dd95SBruce Richardson * during init, we already hold a write lock, so don't try to take out
97899a2dd95SBruce Richardson * another one.
97999a2dd95SBruce Richardson */
98099a2dd95SBruce Richardson if (wa->hi->lock_descriptor == -1 && !internal_conf->in_memory) {
98199a2dd95SBruce Richardson dir_fd = open(wa->hi->hugedir, O_RDONLY);
98299a2dd95SBruce Richardson if (dir_fd < 0) {
983ae67895bSDavid Marchand EAL_LOG(ERR, "%s(): Cannot open '%s': %s",
98499a2dd95SBruce Richardson __func__, wa->hi->hugedir, strerror(errno));
98599a2dd95SBruce Richardson return -1;
98699a2dd95SBruce Richardson }
98799a2dd95SBruce Richardson /* blocking writelock */
98899a2dd95SBruce Richardson if (flock(dir_fd, LOCK_EX)) {
989ae67895bSDavid Marchand EAL_LOG(ERR, "%s(): Cannot lock '%s': %s",
99099a2dd95SBruce Richardson __func__, wa->hi->hugedir, strerror(errno));
99199a2dd95SBruce Richardson close(dir_fd);
99299a2dd95SBruce Richardson return -1;
99399a2dd95SBruce Richardson }
99499a2dd95SBruce Richardson }
99599a2dd95SBruce Richardson
99699a2dd95SBruce Richardson found_msl->version++;
99799a2dd95SBruce Richardson
99899a2dd95SBruce Richardson rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx);
99999a2dd95SBruce Richardson
100099a2dd95SBruce Richardson ret = free_seg(wa->ms, wa->hi, msl_idx, seg_idx);
100199a2dd95SBruce Richardson
100299a2dd95SBruce Richardson if (dir_fd >= 0)
100399a2dd95SBruce Richardson close(dir_fd);
100499a2dd95SBruce Richardson
100599a2dd95SBruce Richardson if (ret < 0)
100699a2dd95SBruce Richardson return -1;
100799a2dd95SBruce Richardson
100899a2dd95SBruce Richardson return 1;
100999a2dd95SBruce Richardson }

int
eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz,
		int socket, bool exact)
{
	int i, ret = -1;
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	bool have_numa = false;
	int oldpolicy;
	struct bitmask *oldmask;
#endif
	struct alloc_walk_param wa;
	struct hugepage_info *hi = NULL;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	memset(&wa, 0, sizeof(wa));

	/* dynamic allocation not supported in legacy mode */
	if (internal_conf->legacy_mem)
		return -1;

	for (i = 0; i < (int) RTE_DIM(internal_conf->hugepage_info); i++) {
		if (page_sz ==
				internal_conf->hugepage_info[i].hugepage_sz) {
			hi = &internal_conf->hugepage_info[i];
			break;
		}
	}
	if (!hi) {
		EAL_LOG(ERR, "%s(): can't find relevant hugepage_info entry",
			__func__);
		return -1;
	}

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	if (check_numa()) {
		oldmask = numa_allocate_nodemask();
		prepare_numa(&oldpolicy, oldmask, socket);
		have_numa = true;
	}
#endif

	wa.exact = exact;
	wa.hi = hi;
	wa.ms = ms;
	wa.n_segs = n_segs;
	wa.page_sz = page_sz;
	wa.socket = socket;
	wa.segs_allocated = 0;

	/* memalloc is locked, so it's safe to use thread-unsafe version */
	ret = rte_memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa);
	if (ret == 0) {
		EAL_LOG(DEBUG, "%s(): couldn't find suitable memseg_list",
			__func__);
		ret = -1;
	} else if (ret > 0) {
		ret = (int)wa.segs_allocated;
	}

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	if (have_numa)
		restore_numa(&oldpolicy, oldmask);
#endif
	return ret;
}
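
/*
 * illustrative usage sketch (hypothetical caller, not part of this file):
 * allocate four 2M pages on socket 0 and require all of them to succeed:
 *
 *	struct rte_memseg *segs[4];
 *	int n = eal_memalloc_alloc_seg_bulk(segs, 4, RTE_PGSIZE_2M, 0, true);
 *
 * with exact == true the call either fills all n_segs entries or fails;
 * with exact == false it may return fewer segments than requested.
 */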

struct rte_memseg *
eal_memalloc_alloc_seg(size_t page_sz, int socket)
{
	struct rte_memseg *ms;
	if (eal_memalloc_alloc_seg_bulk(&ms, 1, page_sz, socket, true) < 0)
		return NULL;
	/* return pointer to newly allocated memseg */
	return ms;
}

int
eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs)
{
	int seg, ret = 0;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* dynamic free not supported in legacy mode */
	if (internal_conf->legacy_mem)
		return -1;

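	/* each segment is attempted independently - a failure marks ret as
	 * negative but does not stop the loop, so the remaining segments
	 * are still freed.
	 */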
	for (seg = 0; seg < n_segs; seg++) {
		struct rte_memseg *cur = ms[seg];
		struct hugepage_info *hi = NULL;
		struct free_walk_param wa;
		int i, walk_res;

		/* if this page is marked as unfreeable, fail */
		if (cur->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) {
			EAL_LOG(DEBUG, "Page is not allowed to be freed");
			ret = -1;
			continue;
		}

		memset(&wa, 0, sizeof(wa));

		for (i = 0; i < (int)RTE_DIM(internal_conf->hugepage_info);
				i++) {
			hi = &internal_conf->hugepage_info[i];
			if (cur->hugepage_sz == hi->hugepage_sz)
				break;
		}
		if (i == (int)RTE_DIM(internal_conf->hugepage_info)) {
			EAL_LOG(ERR, "Can't find relevant hugepage_info entry");
			ret = -1;
			continue;
		}

		wa.ms = cur;
		wa.hi = hi;

		/* memalloc is locked, so it's safe to use thread-unsafe version
		 */
		walk_res = rte_memseg_list_walk_thread_unsafe(free_seg_walk,
				&wa);
		if (walk_res == 1)
			continue;
		if (walk_res == 0)
			EAL_LOG(ERR, "Couldn't find memseg list");
		ret = -1;
	}
	return ret;
}

int
eal_memalloc_free_seg(struct rte_memseg *ms)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* dynamic free not supported in legacy mode */
	if (internal_conf->legacy_mem)
		return -1;

	return eal_memalloc_free_seg_bulk(&ms, 1);
}

static int
sync_chunk(struct rte_memseg_list *primary_msl,
		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
		unsigned int msl_idx, bool used, int start, int end)
{
	struct rte_fbarray *l_arr, *p_arr;
	int i, ret, chunk_len, diff_len;

	l_arr = &local_msl->memseg_arr;
	p_arr = &primary_msl->memseg_arr;

	/* we need to aggregate allocations/deallocations into bigger chunks,
	 * as we don't want to spam the user with per-page callbacks.
	 *
	 * to avoid any potential issues, we also want to trigger
	 * deallocation callbacks *before* we actually deallocate
	 * memory, so that the user application could wrap up its use
	 * before it goes away.
	 */
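	/* e.g. when freeing segments [2..5], the RTE_MEM_EVENT_FREE callback
	 * covering their whole VA range fires first, and only then are the
	 * underlying pages actually released, so users (e.g. drivers holding
	 * DMA maps) can still access the mapping while tearing down.
	 */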

	chunk_len = end - start;

	/* find how many contiguous pages we can map/unmap for this chunk */
	diff_len = used ?
			rte_fbarray_find_contig_free(l_arr, start) :
			rte_fbarray_find_contig_used(l_arr, start);

	/* has to be at least one page */
	if (diff_len < 1)
		return -1;

	diff_len = RTE_MIN(chunk_len, diff_len);

	/* if we are freeing memory, notify the application */
	if (!used) {
		struct rte_memseg *ms;
		void *start_va;
		size_t len, page_sz;

		ms = rte_fbarray_get(l_arr, start);
		start_va = ms->addr;
		page_sz = (size_t)primary_msl->page_sz;
		len = page_sz * diff_len;

		eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE,
				start_va, len);
	}

	for (i = 0; i < diff_len; i++) {
		struct rte_memseg *p_ms, *l_ms;
		int seg_idx = start + i;

		l_ms = rte_fbarray_get(l_arr, seg_idx);
		p_ms = rte_fbarray_get(p_arr, seg_idx);

		if (l_ms == NULL || p_ms == NULL)
			return -1;

		if (used) {
			ret = alloc_seg(l_ms, p_ms->addr,
					p_ms->socket_id, hi,
					msl_idx, seg_idx);
			if (ret < 0)
				return -1;
			rte_fbarray_set_used(l_arr, seg_idx);
		} else {
			ret = free_seg(l_ms, hi, msl_idx, seg_idx);
			rte_fbarray_set_free(l_arr, seg_idx);
			if (ret < 0)
				return -1;
		}
	}

	/* if we just allocated memory, notify the application */
	if (used) {
		struct rte_memseg *ms;
		void *start_va;
		size_t len, page_sz;

		ms = rte_fbarray_get(l_arr, start);
		start_va = ms->addr;
		page_sz = (size_t)primary_msl->page_sz;
		len = page_sz * diff_len;

		eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC,
				start_va, len);
	}

	/* calculate how much we can advance until next chunk */
	diff_len = used ?
			rte_fbarray_find_contig_used(l_arr, start) :
			rte_fbarray_find_contig_free(l_arr, start);
	ret = RTE_MIN(chunk_len, diff_len);

	return ret;
}

static int
sync_status(struct rte_memseg_list *primary_msl,
		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
		unsigned int msl_idx, bool used)
{
	struct rte_fbarray *l_arr, *p_arr;
	int p_idx, l_chunk_len, p_chunk_len, ret;
	int start, end;

	/* this is a little bit tricky, but the basic idea is - walk both lists
	 * and spot any places where there are discrepancies. walking both lists
	 * and noting discrepancies in a single go is a hard problem, so we do
	 * it in two passes - first we spot any places where allocated segments
	 * mismatch (i.e. ensure that everything that's allocated in the primary
	 * is also allocated in the secondary), and then we do it by looking at
	 * free segments instead.
	 *
	 * we also need to aggregate changes into chunks, as we have to call
	 * callbacks per allocation, not per page.
	 */
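	/* e.g. if the primary has segments [0..7] in use but this process
	 * only has [0..3], the "used" pass maps [4..7] locally; pages that
	 * the primary has since freed are unmapped during the "free" pass.
	 */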
	l_arr = &local_msl->memseg_arr;
	p_arr = &primary_msl->memseg_arr;

	if (used)
		p_idx = rte_fbarray_find_next_used(p_arr, 0);
	else
		p_idx = rte_fbarray_find_next_free(p_arr, 0);

	while (p_idx >= 0) {
		int next_chunk_search_idx;

		if (used) {
			p_chunk_len = rte_fbarray_find_contig_used(p_arr,
					p_idx);
			l_chunk_len = rte_fbarray_find_contig_used(l_arr,
					p_idx);
		} else {
			p_chunk_len = rte_fbarray_find_contig_free(p_arr,
					p_idx);
			l_chunk_len = rte_fbarray_find_contig_free(l_arr,
					p_idx);
		}
		/* best case scenario - no differences (or bigger, which will be
		 * fixed during next iteration), look for next chunk
		 */
		if (l_chunk_len >= p_chunk_len) {
			next_chunk_search_idx = p_idx + p_chunk_len;
			goto next_chunk;
		}

		/* if both chunks start at the same point, skip parts we know
		 * are identical, and sync the rest. each call to sync_chunk
		 * will only sync contiguous segments, so we need to call this
		 * until we are sure there are no more differences in this
		 * chunk.
		 */
		start = p_idx + l_chunk_len;
		end = p_idx + p_chunk_len;
		do {
			ret = sync_chunk(primary_msl, local_msl, hi, msl_idx,
					used, start, end);
			start += ret;
		} while (start < end && ret >= 0);
		/* if ret is negative, something went wrong */
		if (ret < 0)
			return -1;

		next_chunk_search_idx = p_idx + p_chunk_len;
next_chunk:
		/* skip to end of this chunk */
		if (used) {
			p_idx = rte_fbarray_find_next_used(p_arr,
					next_chunk_search_idx);
		} else {
			p_idx = rte_fbarray_find_next_free(p_arr,
					next_chunk_search_idx);
		}
	}
	return 0;
}

static int
sync_existing(struct rte_memseg_list *primary_msl,
		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
		unsigned int msl_idx)
{
	int ret, dir_fd;

	/* do not allow any page allocations during the time we're allocating,
	 * because file creation and locking operations are not atomic,
	 * and we might be the first or the last ones to use a particular page,
	 * so we need to ensure atomicity of every operation.
	 */
	dir_fd = open(hi->hugedir, O_RDONLY);
	if (dir_fd < 0) {
		EAL_LOG(ERR, "%s(): Cannot open '%s': %s", __func__,
			hi->hugedir, strerror(errno));
		return -1;
	}
	/* blocking writelock */
	if (flock(dir_fd, LOCK_EX)) {
		EAL_LOG(ERR, "%s(): Cannot lock '%s': %s", __func__,
			hi->hugedir, strerror(errno));
		close(dir_fd);
		return -1;
	}

	/* ensure all allocated space is the same in both lists */
	ret = sync_status(primary_msl, local_msl, hi, msl_idx, true);
	if (ret < 0)
		goto fail;

	/* ensure all unallocated space is the same in both lists */
	ret = sync_status(primary_msl, local_msl, hi, msl_idx, false);
	if (ret < 0)
		goto fail;

	/* update version number */
	local_msl->version = primary_msl->version;

	close(dir_fd);

	return 0;
fail:
	close(dir_fd);
	return -1;
}

static int
sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *primary_msl, *local_msl;
	struct hugepage_info *hi = NULL;
	unsigned int i;
	int msl_idx;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (msl->external)
		return 0;

	msl_idx = msl - mcfg->memsegs;
	primary_msl = &mcfg->memsegs[msl_idx];
	local_msl = &local_memsegs[msl_idx];

	for (i = 0; i < RTE_DIM(internal_conf->hugepage_info); i++) {
		uint64_t cur_sz =
			internal_conf->hugepage_info[i].hugepage_sz;
		uint64_t msl_sz = primary_msl->page_sz;
		if (msl_sz == cur_sz) {
			hi = &internal_conf->hugepage_info[i];
			break;
		}
	}
	if (!hi) {
		EAL_LOG(ERR, "Can't find relevant hugepage_info entry");
		return -1;
	}

	/* if versions don't match, synchronize everything */
	if (local_msl->version != primary_msl->version &&
			sync_existing(primary_msl, local_msl, hi, msl_idx))
		return -1;
	return 0;
}


int
eal_memalloc_sync_with_primary(void)
{
	/* nothing to be done in primary */
	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
		return 0;

	/* memalloc is locked, so it's safe to call thread-unsafe version */
	if (rte_memseg_list_walk_thread_unsafe(sync_walk, NULL))
		return -1;
	return 0;
}
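
/*
 * note: this is expected to run in secondary processes, e.g. when a primary
 * process's allocation or deallocation is propagated over the multiprocess
 * channel, so that the local shadow memseg lists catch up with the primary's
 * view of memory.
 */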

static int
secondary_msl_create_walk(const struct rte_memseg_list *msl,
		void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *primary_msl, *local_msl;
	char name[PATH_MAX];
	int msl_idx, ret;

	if (msl->external)
		return 0;

	msl_idx = msl - mcfg->memsegs;
	primary_msl = &mcfg->memsegs[msl_idx];
	local_msl = &local_memsegs[msl_idx];

	/* create distinct fbarrays for each secondary */
	snprintf(name, RTE_FBARRAY_NAME_LEN, "%s_%i",
		primary_msl->memseg_arr.name, getpid());

	ret = rte_fbarray_init(&local_msl->memseg_arr, name,
		primary_msl->memseg_arr.len,
		primary_msl->memseg_arr.elt_sz);
	if (ret < 0) {
		EAL_LOG(ERR, "Cannot initialize local memory map");
		return -1;
	}
	local_msl->base_va = primary_msl->base_va;
	local_msl->len = primary_msl->len;

	return 0;
}
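
/*
 * each secondary gets its own fbarray file (the name includes its pid) while
 * sharing the primary's VA layout (base_va/len), so a secondary can track
 * page usage locally without clobbering the primary's map.
 */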

static int
secondary_msl_destroy_walk(const struct rte_memseg_list *msl,
		void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *local_msl;
	int msl_idx, ret;

	if (msl->external)
		return 0;

	msl_idx = msl - mcfg->memsegs;
	local_msl = &local_memsegs[msl_idx];

	ret = rte_fbarray_destroy(&local_msl->memseg_arr);
	if (ret < 0) {
		EAL_LOG(ERR, "Cannot destroy local memory map");
		return -1;
	}
	local_msl->base_va = NULL;
	local_msl->len = 0;

	return 0;
}

static int
alloc_list(int list_idx, int len)
{
	int *data;
	int i;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* single-file segments mode does not need fd list */
	if (!internal_conf->single_file_segments) {
		/* ensure we have space to store fd per each possible segment */
		data = malloc(sizeof(int) * len);
		if (data == NULL) {
			EAL_LOG(ERR, "Unable to allocate space for file descriptors");
			return -1;
		}
		/* set all fd's as invalid */
		for (i = 0; i < len; i++)
			data[i] = -1;
		fd_list[list_idx].fds = data;
		fd_list[list_idx].len = len;
	} else {
		fd_list[list_idx].fds = NULL;
		fd_list[list_idx].len = 0;
	}

	fd_list[list_idx].count = 0;
	fd_list[list_idx].memseg_list_fd = -1;

	return 0;
}
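
/*
 * fd_list mirrors the memseg list layout: one entry per memseg list, with
 * (in per-file mode) one fd slot per segment, or (in single-file-segments
 * mode) a single fd for the whole list.
 */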

static int
destroy_list(int list_idx)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* single-file segments mode does not need fd list */
	if (!internal_conf->single_file_segments) {
		int *fds = fd_list[list_idx].fds;
		int i;
		/* go through each fd and ensure it's closed */
		for (i = 0; i < fd_list[list_idx].len; i++) {
			if (fds[i] >= 0) {
				close(fds[i]);
				fds[i] = -1;
			}
		}
		free(fds);
		fd_list[list_idx].fds = NULL;
		fd_list[list_idx].len = 0;
	} else if (fd_list[list_idx].memseg_list_fd >= 0) {
		close(fd_list[list_idx].memseg_list_fd);
		fd_list[list_idx].count = 0;
		fd_list[list_idx].memseg_list_fd = -1;
	}
	return 0;
}

static int
fd_list_create_walk(const struct rte_memseg_list *msl,
		void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	unsigned int len;
	int msl_idx;

	if (msl->external)
		return 0;

	msl_idx = msl - mcfg->memsegs;
	len = msl->memseg_arr.len;

	return alloc_list(msl_idx, len);
}

static int
fd_list_destroy_walk(const struct rte_memseg_list *msl, void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int msl_idx;

	if (msl->external)
		return 0;

	msl_idx = msl - mcfg->memsegs;

	return destroy_list(msl_idx);
}

int
eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* single file segments mode doesn't support individual segment fd's */
	if (internal_conf->single_file_segments)
		return -ENOTSUP;

	/* if list is not allocated, allocate it */
	if (fd_list[list_idx].len == 0) {
		int len = mcfg->memsegs[list_idx].memseg_arr.len;

		if (alloc_list(list_idx, len) < 0)
			return -ENOMEM;
	}
	fd_list[list_idx].fds[seg_idx] = fd;

	return 0;
}

int
eal_memalloc_set_seg_list_fd(int list_idx, int fd)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* non-single file segment mode doesn't support segment list fd's */
	if (!internal_conf->single_file_segments)
		return -ENOTSUP;

	fd_list[list_idx].memseg_list_fd = fd;

	return 0;
}

int
eal_memalloc_get_seg_fd(int list_idx, int seg_idx)
{
	int fd;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (internal_conf->in_memory || internal_conf->no_hugetlbfs) {
#ifndef MEMFD_SUPPORTED
		/* in in-memory or no-huge mode, we rely on memfd support */
		return -ENOTSUP;
#endif
		/* memfd supported, but hugetlbfs memfd may not be */
		if (!internal_conf->no_hugetlbfs && !memfd_create_supported)
			return -ENOTSUP;
	}

	if (internal_conf->single_file_segments) {
		fd = fd_list[list_idx].memseg_list_fd;
	} else if (fd_list[list_idx].len == 0) {
		/* list not initialized */
		fd = -1;
	} else {
		fd = fd_list[list_idx].fds[seg_idx];
	}
	if (fd < 0)
		return -ENODEV;
	return fd;
}

static int
test_memfd_create(void)
{
#ifdef MEMFD_SUPPORTED
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();
	unsigned int i;
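	/* note: every path through the loop body below returns, so only the
	 * first configured hugepage size is ever probed.
	 */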
	for (i = 0; i < internal_conf->num_hugepage_sizes; i++) {
		uint64_t pagesz = internal_conf->hugepage_info[i].hugepage_sz;
		int pagesz_flag = pagesz_flags(pagesz);
		int flags;

		flags = pagesz_flag | RTE_MFD_HUGETLB;
		int fd = memfd_create("test", flags);
		if (fd < 0) {
			/* we failed - let memalloc know this isn't working */
			if (errno == EINVAL) {
				memfd_create_supported = 0;
				return 0; /* not supported */
			}

			/* we got other error - something's wrong */
			return -1; /* error */
		}
		close(fd);
		return 1; /* supported */
	}
#endif
	return 0; /* not supported */
}

int
eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (internal_conf->in_memory || internal_conf->no_hugetlbfs) {
#ifndef MEMFD_SUPPORTED
		/* in in-memory or no-huge mode, we rely on memfd support */
		return -ENOTSUP;
#endif
		/* memfd supported, but hugetlbfs memfd may not be */
		if (!internal_conf->no_hugetlbfs && !memfd_create_supported)
			return -ENOTSUP;
	}

	if (internal_conf->single_file_segments) {
		size_t pgsz = mcfg->memsegs[list_idx].page_sz;

		/* segment not active? */
		if (fd_list[list_idx].memseg_list_fd < 0)
			return -ENOENT;
		*offset = pgsz * seg_idx;
	} else {
		/* fd_list not initialized? */
		if (fd_list[list_idx].len == 0)
			return -ENODEV;

		/* segment not active? */
		if (fd_list[list_idx].fds[seg_idx] < 0)
			return -ENOENT;
		*offset = 0;
	}
	return 0;
}
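
/*
 * illustrative usage sketch (hypothetical, not part of this file) - mapping
 * a segment's backing file given its fd and offset:
 *
 *	size_t offset;
 *	int fd = eal_memalloc_get_seg_fd(list_idx, seg_idx);
 *	if (fd >= 0 && eal_memalloc_get_seg_fd_offset(list_idx, seg_idx,
 *			&offset) == 0)
 *		addr = mmap(NULL, page_sz, PROT_READ, MAP_SHARED, fd, offset);
 *
 * in single-file-segments mode the offset selects the page within the shared
 * per-list file; otherwise each page has its own file and the offset is 0.
 */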

int
eal_memalloc_cleanup(void)
{
	/* close all remaining fd's - these are per-process, so it's safe */
	if (rte_memseg_list_walk_thread_unsafe(fd_list_destroy_walk, NULL))
		return -1;

	/* destroy the shadow page table if we're a secondary process */
	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
		return 0;

	if (rte_memseg_list_walk_thread_unsafe(secondary_msl_destroy_walk,
			NULL))
		return -1;

	return 0;
}

int
eal_memalloc_init(void)
{
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
		/* memory_hotplug_lock is held during initialization, so it's
		 * safe to call thread-unsafe version.
		 */
		if (rte_memseg_list_walk_thread_unsafe(secondary_msl_create_walk, NULL) < 0)
			return -1;
	if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
			internal_conf->in_memory) {
		int mfd_res = test_memfd_create();

		if (mfd_res < 0) {
			EAL_LOG(ERR, "Unable to check if memfd is supported");
			return -1;
		}
		if (mfd_res == 1)
			EAL_LOG(DEBUG, "Using memfd for anonymous memory");
		else
			EAL_LOG(INFO, "Using memfd is not supported, falling back to anonymous hugepages");

		/* we only support single-file segments mode with in-memory
		 * mode if we support hugetlbfs with memfd_create; the check
		 * above tells us whether we do.
		 */
		if (internal_conf->single_file_segments &&
				mfd_res != 1) {
			EAL_LOG(ERR, "Single-file segments mode cannot be used without memfd support");
			return -1;
		}
		/* this cannot ever happen but better safe than sorry */
		if (!anonymous_hugepages_supported) {
			EAL_LOG(ERR, "Using anonymous memory is not supported");
			return -1;
		}
		/* safety net, should be impossible to configure */
		if (internal_conf->hugepage_file.unlink_before_mapping &&
				!internal_conf->hugepage_file.unlink_existing) {
			EAL_LOG(ERR, "Unlinking existing hugepage files is prohibited, cannot unlink them before mapping.");
			return -1;
		}
	}

	/* initialize all of the fd lists */
	if (rte_memseg_list_walk_thread_unsafe(fd_list_create_walk, NULL))
		return -1;
	return 0;
}