xref: /dpdk/lib/eal/linux/eal_memalloc.c (revision 8f4611d893b4eeffb942fffdadc4cde394e4c309)
199a2dd95SBruce Richardson /* SPDX-License-Identifier: BSD-3-Clause
299a2dd95SBruce Richardson  * Copyright(c) 2017-2018 Intel Corporation
399a2dd95SBruce Richardson  */
499a2dd95SBruce Richardson 
599a2dd95SBruce Richardson #include <errno.h>
699a2dd95SBruce Richardson #include <stdbool.h>
799a2dd95SBruce Richardson #include <stdlib.h>
899a2dd95SBruce Richardson #include <stdio.h>
999a2dd95SBruce Richardson #include <stdint.h>
1099a2dd95SBruce Richardson #include <string.h>
1199a2dd95SBruce Richardson #include <sys/mman.h>
1299a2dd95SBruce Richardson #include <sys/stat.h>
1399a2dd95SBruce Richardson #include <sys/file.h>
1499a2dd95SBruce Richardson #include <unistd.h>
1599a2dd95SBruce Richardson #include <limits.h>
1699a2dd95SBruce Richardson #include <fcntl.h>
1799a2dd95SBruce Richardson #include <signal.h>
1899a2dd95SBruce Richardson #include <setjmp.h>
1999a2dd95SBruce Richardson #ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */
2099a2dd95SBruce Richardson #include <linux/memfd.h>
2199a2dd95SBruce Richardson #define MEMFD_SUPPORTED
2299a2dd95SBruce Richardson #endif
2399a2dd95SBruce Richardson #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
2499a2dd95SBruce Richardson #include <numa.h>
2599a2dd95SBruce Richardson #include <numaif.h>
2699a2dd95SBruce Richardson #endif
2799a2dd95SBruce Richardson #include <linux/falloc.h>
2899a2dd95SBruce Richardson #include <linux/mman.h> /* for hugetlb-related mmap flags */
2999a2dd95SBruce Richardson 
3099a2dd95SBruce Richardson #include <rte_common.h>
3199a2dd95SBruce Richardson #include <rte_log.h>
3299a2dd95SBruce Richardson #include <rte_eal.h>
3399a2dd95SBruce Richardson #include <rte_memory.h>
3499a2dd95SBruce Richardson 
3599a2dd95SBruce Richardson #include "eal_filesystem.h"
3699a2dd95SBruce Richardson #include "eal_internal_cfg.h"
3799a2dd95SBruce Richardson #include "eal_memalloc.h"
3899a2dd95SBruce Richardson #include "eal_memcfg.h"
3999a2dd95SBruce Richardson #include "eal_private.h"
4099a2dd95SBruce Richardson 
4199a2dd95SBruce Richardson const int anonymous_hugepages_supported =
4299a2dd95SBruce Richardson #ifdef MAP_HUGE_SHIFT
4399a2dd95SBruce Richardson 		1;
4499a2dd95SBruce Richardson #define RTE_MAP_HUGE_SHIFT MAP_HUGE_SHIFT
4599a2dd95SBruce Richardson #else
4699a2dd95SBruce Richardson 		0;
4799a2dd95SBruce Richardson #define RTE_MAP_HUGE_SHIFT 26
4899a2dd95SBruce Richardson #endif
4999a2dd95SBruce Richardson 
5099a2dd95SBruce Richardson /*
5199a2dd95SBruce Richardson  * we've already checked memfd support at compile-time, but we also need to
5299a2dd95SBruce Richardson  * check if we can create hugepage files with memfd.
5399a2dd95SBruce Richardson  *
5499a2dd95SBruce Richardson  * also, this is not a constant, because while we may be *compiled* with memfd
5599a2dd95SBruce Richardson  * hugetlbfs support, we might not be *running* on a system that supports memfd
5699a2dd95SBruce Richardson  * and/or memfd with hugetlbfs, so we need to be able to adjust this flag at
5799a2dd95SBruce Richardson  * runtime, and fall back to anonymous memory.
5899a2dd95SBruce Richardson  */
5999a2dd95SBruce Richardson static int memfd_create_supported =
6099a2dd95SBruce Richardson #ifdef MFD_HUGETLB
6199a2dd95SBruce Richardson 		1;
6299a2dd95SBruce Richardson #define RTE_MFD_HUGETLB MFD_HUGETLB
6399a2dd95SBruce Richardson #else
6499a2dd95SBruce Richardson 		0;
6599a2dd95SBruce Richardson #define RTE_MFD_HUGETLB 4U
6699a2dd95SBruce Richardson #endif
6799a2dd95SBruce Richardson 
6899a2dd95SBruce Richardson /*
6999a2dd95SBruce Richardson  * not all kernel versions support fallocate on hugetlbfs, so fall back to
7099a2dd95SBruce Richardson  * ftruncate and disallow deallocation if fallocate is not supported.
7199a2dd95SBruce Richardson  */
7299a2dd95SBruce Richardson static int fallocate_supported = -1; /* unknown */
7399a2dd95SBruce Richardson 
7499a2dd95SBruce Richardson /*
7599a2dd95SBruce Richardson  * we have two modes - single file segments, and file-per-page mode.
7699a2dd95SBruce Richardson  *
7799a2dd95SBruce Richardson  * for single-file segments, we use memseg_list_fd to store the segment fd,
7899a2dd95SBruce Richardson  * while the fds[] will not be allocated, and len will be set to 0.
7999a2dd95SBruce Richardson  *
8099a2dd95SBruce Richardson  * for file-per-page mode, each page will have its own fd, so 'memseg_list_fd'
8199a2dd95SBruce Richardson  * will be invalid (set to -1), and we'll use 'fds' to keep track of page fd's.
8299a2dd95SBruce Richardson  *
8399a2dd95SBruce Richardson  * we cannot know how many pages a system will have in advance, but we do know
8499a2dd95SBruce Richardson  * that they come in lists, and we know lengths of these lists. so, simply store
8599a2dd95SBruce Richardson  * a malloc'd array of fd's indexed by list and segment index.
8699a2dd95SBruce Richardson  *
8799a2dd95SBruce Richardson  * they will be initialized at startup, and filled as we allocate/deallocate
8899a2dd95SBruce Richardson  * segments.
8999a2dd95SBruce Richardson  */
9099a2dd95SBruce Richardson static struct {
9199a2dd95SBruce Richardson 	int *fds; /**< dynamically allocated array of segment lock fd's */
9299a2dd95SBruce Richardson 	int memseg_list_fd; /**< memseg list fd */
9399a2dd95SBruce Richardson 	int len; /**< total length of the array */
9499a2dd95SBruce Richardson 	int count; /**< entries used in an array */
9599a2dd95SBruce Richardson } fd_list[RTE_MAX_MEMSEG_LISTS];
9699a2dd95SBruce Richardson 
9799a2dd95SBruce Richardson /** local copy of a memory map, used to synchronize memory hotplug in MP */
9899a2dd95SBruce Richardson static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS];
9999a2dd95SBruce Richardson 
10099a2dd95SBruce Richardson static sigjmp_buf huge_jmpenv;
10199a2dd95SBruce Richardson 
/*
 * SIGBUS handler installed by huge_register_sigbus(): jump back to the
 * sigsetjmp() point (huge_jmpenv) so the caller can fail gracefully
 * instead of crashing when the kernel cannot back a hugepage mapping.
 */
static void huge_sigbus_handler(int signo __rte_unused)
{
	siglongjmp(huge_jmpenv, 1);
}
10699a2dd95SBruce Richardson 
10799a2dd95SBruce Richardson /* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
10899a2dd95SBruce Richardson  * non-static local variable in the stack frame calling sigsetjmp might be
10999a2dd95SBruce Richardson  * clobbered by a call to longjmp.
11099a2dd95SBruce Richardson  */
/*
 * Arm the SIGBUS jump point. Returns 0 when armed normally, and non-zero
 * when control returns here via siglongjmp() from huge_sigbus_handler().
 */
static int huge_wrap_sigsetjmp(void)
{
	return sigsetjmp(huge_jmpenv, 1);
}
11599a2dd95SBruce Richardson 
11699a2dd95SBruce Richardson static struct sigaction huge_action_old;
11799a2dd95SBruce Richardson static int huge_need_recover;
11899a2dd95SBruce Richardson 
1199bffc928SOlivier Matz static void
huge_register_sigbus(void)12099a2dd95SBruce Richardson huge_register_sigbus(void)
12199a2dd95SBruce Richardson {
12299a2dd95SBruce Richardson 	sigset_t mask;
12399a2dd95SBruce Richardson 	struct sigaction action;
12499a2dd95SBruce Richardson 
12599a2dd95SBruce Richardson 	sigemptyset(&mask);
12699a2dd95SBruce Richardson 	sigaddset(&mask, SIGBUS);
12799a2dd95SBruce Richardson 	action.sa_flags = 0;
12899a2dd95SBruce Richardson 	action.sa_mask = mask;
12999a2dd95SBruce Richardson 	action.sa_handler = huge_sigbus_handler;
13099a2dd95SBruce Richardson 
13199a2dd95SBruce Richardson 	huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
13299a2dd95SBruce Richardson }
13399a2dd95SBruce Richardson 
1349bffc928SOlivier Matz static void
huge_recover_sigbus(void)13599a2dd95SBruce Richardson huge_recover_sigbus(void)
13699a2dd95SBruce Richardson {
13799a2dd95SBruce Richardson 	if (huge_need_recover) {
13899a2dd95SBruce Richardson 		sigaction(SIGBUS, &huge_action_old, NULL);
13999a2dd95SBruce Richardson 		huge_need_recover = 0;
14099a2dd95SBruce Richardson 	}
14199a2dd95SBruce Richardson }
14299a2dd95SBruce Richardson 
14399a2dd95SBruce Richardson #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
14499a2dd95SBruce Richardson static bool
check_numa(void)14599a2dd95SBruce Richardson check_numa(void)
14699a2dd95SBruce Richardson {
14799a2dd95SBruce Richardson 	bool ret = true;
14899a2dd95SBruce Richardson 	/* Check if kernel supports NUMA. */
14999a2dd95SBruce Richardson 	if (numa_available() != 0) {
150ae67895bSDavid Marchand 		EAL_LOG(DEBUG, "NUMA is not supported.");
15199a2dd95SBruce Richardson 		ret = false;
15299a2dd95SBruce Richardson 	}
15399a2dd95SBruce Richardson 	return ret;
15499a2dd95SBruce Richardson }
15599a2dd95SBruce Richardson 
/*
 * Save the current NUMA memory policy into *oldpolicy/oldmask, then
 * switch to MPOL_PREFERRED for 'socket_id' so that subsequent page
 * faults allocate from that socket. If the current policy cannot be
 * read, MPOL_DEFAULT is assumed (and later restored as local alloc).
 */
static void
prepare_numa(int *oldpolicy, struct bitmask *oldmask, int socket_id)
{
	EAL_LOG(DEBUG, "Trying to obtain current memory policy.");
	if (get_mempolicy(oldpolicy, oldmask->maskp,
			  oldmask->size + 1, 0, 0) < 0) {
		EAL_LOG(ERR,
			"Failed to get current mempolicy: %s. "
			"Assuming MPOL_DEFAULT.", strerror(errno));
		*oldpolicy = MPOL_DEFAULT;
	}
	EAL_LOG(DEBUG,
		"Setting policy MPOL_PREFERRED for socket %d",
		socket_id);
	numa_set_preferred(socket_id);
}
17299a2dd95SBruce Richardson 
/*
 * Undo prepare_numa(): restore the saved memory policy (falling back to
 * local allocation if it was MPOL_DEFAULT or restoration fails) and
 * free the saved node mask.
 */
static void
restore_numa(int *oldpolicy, struct bitmask *oldmask)
{
	EAL_LOG(DEBUG,
		"Restoring previous memory policy: %d", *oldpolicy);
	if (*oldpolicy == MPOL_DEFAULT) {
		numa_set_localalloc();
	} else if (set_mempolicy(*oldpolicy, oldmask->maskp,
				 oldmask->size + 1) < 0) {
		EAL_LOG(ERR, "Failed to restore mempolicy: %s",
			strerror(errno));
		numa_set_localalloc();
	}
	numa_free_cpumask(oldmask);
}
18899a2dd95SBruce Richardson #endif
18999a2dd95SBruce Richardson 
/*
 * Report the size of an open file on disk via fstat().
 * Returns 0 when the size cannot be determined.
 */
static off_t
get_file_size(int fd)
{
	struct stat st;

	return (fstat(fd, &st) == 0) ? st.st_size : 0;
}
20199a2dd95SBruce Richardson 
/*
 * Encode a page size into the flag format shared by mmap() and
 * memfd_create(): log2(page_sz) shifted by MAP_HUGE_SHIFT (or by the
 * local RTE_MAP_HUGE_SHIFT fallback when the headers lack it).
 */
static int
pagesz_flags(uint64_t page_sz)
{
	/* as per mmap() manpage, all page sizes are log2 of page size
	 * shifted by MAP_HUGE_SHIFT
	 */
	int log2 = rte_log2_u64(page_sz);
	return log2 << RTE_MAP_HUGE_SHIFT;
}
21199a2dd95SBruce Richardson 
21299a2dd95SBruce Richardson /* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */
lock(int fd,int type)21399a2dd95SBruce Richardson static int lock(int fd, int type)
21499a2dd95SBruce Richardson {
21599a2dd95SBruce Richardson 	int ret;
21699a2dd95SBruce Richardson 
21799a2dd95SBruce Richardson 	/* flock may be interrupted */
21899a2dd95SBruce Richardson 	do {
21999a2dd95SBruce Richardson 		ret = flock(fd, type | LOCK_NB);
22099a2dd95SBruce Richardson 	} while (ret && errno == EINTR);
22199a2dd95SBruce Richardson 
22299a2dd95SBruce Richardson 	if (ret && errno == EWOULDBLOCK) {
22399a2dd95SBruce Richardson 		/* couldn't lock */
22499a2dd95SBruce Richardson 		return 0;
22599a2dd95SBruce Richardson 	} else if (ret) {
226ae67895bSDavid Marchand 		EAL_LOG(ERR, "%s(): error calling flock(): %s",
22799a2dd95SBruce Richardson 			__func__, strerror(errno));
22899a2dd95SBruce Richardson 		return -1;
22999a2dd95SBruce Richardson 	}
23099a2dd95SBruce Richardson 	/* lock was successful */
23199a2dd95SBruce Richardson 	return 1;
23299a2dd95SBruce Richardson }
23399a2dd95SBruce Richardson 
23499a2dd95SBruce Richardson static int
get_seg_memfd(struct hugepage_info * hi __rte_unused,unsigned int list_idx __rte_unused,unsigned int seg_idx __rte_unused)23599a2dd95SBruce Richardson get_seg_memfd(struct hugepage_info *hi __rte_unused,
23699a2dd95SBruce Richardson 		unsigned int list_idx __rte_unused,
23799a2dd95SBruce Richardson 		unsigned int seg_idx __rte_unused)
23899a2dd95SBruce Richardson {
23999a2dd95SBruce Richardson #ifdef MEMFD_SUPPORTED
24099a2dd95SBruce Richardson 	int fd;
24199a2dd95SBruce Richardson 	char segname[250]; /* as per manpage, limit is 249 bytes plus null */
24299a2dd95SBruce Richardson 
24399a2dd95SBruce Richardson 	int flags = RTE_MFD_HUGETLB | pagesz_flags(hi->hugepage_sz);
24499a2dd95SBruce Richardson 	const struct internal_config *internal_conf =
24599a2dd95SBruce Richardson 		eal_get_internal_configuration();
24699a2dd95SBruce Richardson 
24799a2dd95SBruce Richardson 	if (internal_conf->single_file_segments) {
24899a2dd95SBruce Richardson 		fd = fd_list[list_idx].memseg_list_fd;
24999a2dd95SBruce Richardson 
25099a2dd95SBruce Richardson 		if (fd < 0) {
25199a2dd95SBruce Richardson 			snprintf(segname, sizeof(segname), "seg_%i", list_idx);
25299a2dd95SBruce Richardson 			fd = memfd_create(segname, flags);
25399a2dd95SBruce Richardson 			if (fd < 0) {
254ae67895bSDavid Marchand 				EAL_LOG(DEBUG, "%s(): memfd create failed: %s",
25599a2dd95SBruce Richardson 					__func__, strerror(errno));
25699a2dd95SBruce Richardson 				return -1;
25799a2dd95SBruce Richardson 			}
25899a2dd95SBruce Richardson 			fd_list[list_idx].memseg_list_fd = fd;
25999a2dd95SBruce Richardson 		}
26099a2dd95SBruce Richardson 	} else {
26199a2dd95SBruce Richardson 		fd = fd_list[list_idx].fds[seg_idx];
26299a2dd95SBruce Richardson 
26399a2dd95SBruce Richardson 		if (fd < 0) {
26499a2dd95SBruce Richardson 			snprintf(segname, sizeof(segname), "seg_%i-%i",
26599a2dd95SBruce Richardson 					list_idx, seg_idx);
26699a2dd95SBruce Richardson 			fd = memfd_create(segname, flags);
26799a2dd95SBruce Richardson 			if (fd < 0) {
268ae67895bSDavid Marchand 				EAL_LOG(DEBUG, "%s(): memfd create failed: %s",
26999a2dd95SBruce Richardson 					__func__, strerror(errno));
27099a2dd95SBruce Richardson 				return -1;
27199a2dd95SBruce Richardson 			}
27299a2dd95SBruce Richardson 			fd_list[list_idx].fds[seg_idx] = fd;
27399a2dd95SBruce Richardson 		}
27499a2dd95SBruce Richardson 	}
27599a2dd95SBruce Richardson 	return fd;
27699a2dd95SBruce Richardson #endif
27799a2dd95SBruce Richardson 	return -1;
27899a2dd95SBruce Richardson }
27999a2dd95SBruce Richardson 
/*
 * Return a file descriptor backing the given segment, opening the
 * hugepage file (and caching the fd in fd_list) if it is not open yet.
 * 'path' receives the hugepage file path. If 'dirty' is non-NULL, it is
 * set to true when a pre-existing file is being reused, i.e. mapped
 * pages may contain old data. Returns the fd on success, -1 on failure.
 */
static int
get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
		unsigned int list_idx, unsigned int seg_idx,
		bool *dirty)
{
	int fd;
	int *out_fd;
	struct stat st;
	int ret;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (dirty != NULL)
		*dirty = false;

	/* for in-memory mode, we only make it here when we're sure we support
	 * memfd, and this is a special case.
	 */
	if (internal_conf->in_memory)
		return get_seg_memfd(hi, list_idx, seg_idx);

	/* pick the cached fd slot and build the file path for this mode */
	if (internal_conf->single_file_segments) {
		out_fd = &fd_list[list_idx].memseg_list_fd;
		eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx);
	} else {
		out_fd = &fd_list[list_idx].fds[seg_idx];
		eal_get_hugefile_path(path, buflen, hi->hugedir,
				list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
	}
	fd = *out_fd;
	if (fd >= 0)
		return fd;

	/*
	 * There is no TOCTOU between stat() and unlink()/open()
	 * because the hugepage directory is locked.
	 */
	ret = stat(path, &st);
	if (ret < 0 && errno != ENOENT) {
		EAL_LOG(DEBUG, "%s(): stat() for '%s' failed: %s",
			__func__, path, strerror(errno));
		return -1;
	}
	/* file exists and is kept: pages mapped from it may hold old data */
	if (!internal_conf->hugepage_file.unlink_existing && ret == 0 &&
			dirty != NULL)
		*dirty = true;

	/*
	 * The kernel clears a hugepage only when it is mapped
	 * from a particular file for the first time.
	 * If the file already exists, the old content will be mapped.
	 * If the memory manager assumes all mapped pages to be clean,
	 * the file must be removed and created anew.
	 * Otherwise, the primary caller must be notified
	 * that mapped pages will be dirty
	 * (secondary callers receive the segment state from the primary one).
	 * When multiple hugepages are mapped from the same file,
	 * whether they will be dirty depends on the part that is mapped.
	 */
	if (!internal_conf->single_file_segments &&
			internal_conf->hugepage_file.unlink_existing &&
			rte_eal_process_type() == RTE_PROC_PRIMARY &&
			ret == 0) {
		/* coverity[toctou] */
		if (unlink(path) < 0) {
			EAL_LOG(DEBUG, "%s(): could not remove '%s': %s",
				__func__, path, strerror(errno));
			return -1;
		}
	}

	/* coverity[toctou] */
	fd = open(path, O_CREAT | O_RDWR, 0600);
	if (fd < 0) {
		EAL_LOG(ERR, "%s(): open '%s' failed: %s",
			__func__, path, strerror(errno));
		return -1;
	}
	/* take out a read lock */
	if (lock(fd, LOCK_SH) < 0) {
		EAL_LOG(ERR, "%s(): lock '%s' failed: %s",
			__func__, path, strerror(errno));
		close(fd);
		return -1;
	}
	*out_fd = fd;
	return fd;
}
36899a2dd95SBruce Richardson 
36999a2dd95SBruce Richardson static int
resize_hugefile_in_memory(int fd,uint64_t fa_offset,uint64_t page_sz,bool grow)37099a2dd95SBruce Richardson resize_hugefile_in_memory(int fd, uint64_t fa_offset,
37199a2dd95SBruce Richardson 		uint64_t page_sz, bool grow)
37299a2dd95SBruce Richardson {
37399a2dd95SBruce Richardson 	int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
37499a2dd95SBruce Richardson 			FALLOC_FL_KEEP_SIZE;
37599a2dd95SBruce Richardson 	int ret;
37699a2dd95SBruce Richardson 
37799a2dd95SBruce Richardson 	/* grow or shrink the file */
37899a2dd95SBruce Richardson 	ret = fallocate(fd, flags, fa_offset, page_sz);
37999a2dd95SBruce Richardson 
38099a2dd95SBruce Richardson 	if (ret < 0) {
381ae67895bSDavid Marchand 		EAL_LOG(DEBUG, "%s(): fallocate() failed: %s",
38299a2dd95SBruce Richardson 				__func__,
38399a2dd95SBruce Richardson 				strerror(errno));
38499a2dd95SBruce Richardson 		return -1;
38599a2dd95SBruce Richardson 	}
38699a2dd95SBruce Richardson 	return 0;
38799a2dd95SBruce Richardson }
38899a2dd95SBruce Richardson 
/*
 * Grow or shrink a hugetlbfs-backed hugepage file by one page at
 * 'fa_offset'. Prefers fallocate(); on first ENOTSUP the global
 * 'fallocate_supported' latch is cleared and the operation is retried
 * via the ftruncate() fallback, which can only grow the file (shrinking
 * is then permanently disallowed). If 'dirty' is non-NULL it is updated
 * to reflect whether the page may contain old data.
 * Returns 0 on success, -1 on failure.
 */
static int
resize_hugefile_in_filesystem(int fd, uint64_t fa_offset, uint64_t page_sz,
		bool grow, bool *dirty)
{
	const struct internal_config *internal_conf =
			eal_get_internal_configuration();
	bool again = false;

	do {
		if (fallocate_supported == 0) {
			/* we cannot deallocate memory if fallocate() is not
			 * supported, and hugepage file is already locked at
			 * creation, so no further synchronization needed.
			 */

			if (!grow) {
				EAL_LOG(DEBUG, "%s(): fallocate not supported, not freeing page back to the system",
					__func__);
				return -1;
			}
			uint64_t new_size = fa_offset + page_sz;
			uint64_t cur_size = get_file_size(fd);

			/* fallocate isn't supported, fall back to ftruncate */
			/* a page inside the current file size was allocated
			 * before and may hold old data
			 */
			if (dirty != NULL)
				*dirty = new_size <= cur_size;
			if (new_size > cur_size &&
					ftruncate(fd, new_size) < 0) {
				EAL_LOG(DEBUG, "%s(): ftruncate() failed: %s",
					__func__, strerror(errno));
				return -1;
			}
		} else {
			int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
					FALLOC_FL_KEEP_SIZE;
			int ret;

			/*
			 * technically, it is perfectly safe for both primary
			 * and secondary to grow and shrink the page files:
			 * growing the file repeatedly has no effect because
			 * a page can only be allocated once, while mmap ensures
			 * that secondaries hold on to the page even after the
			 * page itself is removed from the filesystem.
			 *
			 * however, leaving growing/shrinking to the primary
			 * tends to expose bugs in fdlist page count handling,
			 * so leave this here just in case.
			 */
			if (rte_eal_process_type() != RTE_PROC_PRIMARY)
				return 0;

			/* grow or shrink the file */
			ret = fallocate(fd, flags, fa_offset, page_sz);

			if (ret < 0) {
				if (fallocate_supported == -1 &&
						errno == ENOTSUP) {
					EAL_LOG(ERR, "%s(): fallocate() not supported, hugepage deallocation will be disabled",
						__func__);
					again = true;
					fallocate_supported = 0;
				} else {
					EAL_LOG(DEBUG, "%s(): fallocate() failed: %s",
						__func__,
						strerror(errno));
					return -1;
				}
			} else {
				fallocate_supported = 1;
				/*
				 * It is unknown which portions of an existing
				 * hugepage file were allocated previously,
				 * so all pages within the file are considered
				 * dirty, unless the file is a fresh one.
				 */
				if (dirty != NULL)
					*dirty &= !internal_conf->hugepage_file.unlink_existing;
			}
		}
	} while (again);

	return 0;
}
47399a2dd95SBruce Richardson 
47499a2dd95SBruce Richardson static void
close_hugefile(int fd,char * path,int list_idx)47599a2dd95SBruce Richardson close_hugefile(int fd, char *path, int list_idx)
47699a2dd95SBruce Richardson {
47799a2dd95SBruce Richardson 	const struct internal_config *internal_conf =
47899a2dd95SBruce Richardson 		eal_get_internal_configuration();
47999a2dd95SBruce Richardson 	/*
48099a2dd95SBruce Richardson 	 * primary process must unlink the file, but only when not in in-memory
48199a2dd95SBruce Richardson 	 * mode (as in that case there is no file to unlink).
48299a2dd95SBruce Richardson 	 */
48399a2dd95SBruce Richardson 	if (!internal_conf->in_memory &&
48499a2dd95SBruce Richardson 			rte_eal_process_type() == RTE_PROC_PRIMARY &&
48599a2dd95SBruce Richardson 			unlink(path))
486ae67895bSDavid Marchand 		EAL_LOG(ERR, "%s(): unlinking '%s' failed: %s",
48799a2dd95SBruce Richardson 			__func__, path, strerror(errno));
48899a2dd95SBruce Richardson 
48999a2dd95SBruce Richardson 	close(fd);
49099a2dd95SBruce Richardson 	fd_list[list_idx].memseg_list_fd = -1;
49199a2dd95SBruce Richardson }
49299a2dd95SBruce Richardson 
49399a2dd95SBruce Richardson static int
resize_hugefile(int fd,uint64_t fa_offset,uint64_t page_sz,bool grow,bool * dirty)49432b4771cSDmitry Kozlyuk resize_hugefile(int fd, uint64_t fa_offset, uint64_t page_sz, bool grow,
49532b4771cSDmitry Kozlyuk 		bool *dirty)
49699a2dd95SBruce Richardson {
49799a2dd95SBruce Richardson 	/* in-memory mode is a special case, because we can be sure that
49899a2dd95SBruce Richardson 	 * fallocate() is supported.
49999a2dd95SBruce Richardson 	 */
50099a2dd95SBruce Richardson 	const struct internal_config *internal_conf =
50199a2dd95SBruce Richardson 		eal_get_internal_configuration();
50299a2dd95SBruce Richardson 
50332b4771cSDmitry Kozlyuk 	if (internal_conf->in_memory) {
50432b4771cSDmitry Kozlyuk 		if (dirty != NULL)
50532b4771cSDmitry Kozlyuk 			*dirty = false;
50699a2dd95SBruce Richardson 		return resize_hugefile_in_memory(fd, fa_offset,
50799a2dd95SBruce Richardson 				page_sz, grow);
50832b4771cSDmitry Kozlyuk 	}
50999a2dd95SBruce Richardson 
51099a2dd95SBruce Richardson 	return resize_hugefile_in_filesystem(fd, fa_offset, page_sz,
51132b4771cSDmitry Kozlyuk 			grow, dirty);
51299a2dd95SBruce Richardson }
51399a2dd95SBruce Richardson 
51499a2dd95SBruce Richardson static int
alloc_seg(struct rte_memseg * ms,void * addr,int socket_id,struct hugepage_info * hi,unsigned int list_idx,unsigned int seg_idx)51599a2dd95SBruce Richardson alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
51699a2dd95SBruce Richardson 		struct hugepage_info *hi, unsigned int list_idx,
51799a2dd95SBruce Richardson 		unsigned int seg_idx)
51899a2dd95SBruce Richardson {
51999a2dd95SBruce Richardson #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
52099a2dd95SBruce Richardson 	int cur_socket_id = 0;
52199a2dd95SBruce Richardson #endif
52299a2dd95SBruce Richardson 	uint64_t map_offset;
52399a2dd95SBruce Richardson 	rte_iova_t iova;
52499a2dd95SBruce Richardson 	void *va;
52599a2dd95SBruce Richardson 	char path[PATH_MAX];
52699a2dd95SBruce Richardson 	int ret = 0;
52799a2dd95SBruce Richardson 	int fd;
52832b4771cSDmitry Kozlyuk 	bool dirty;
52999a2dd95SBruce Richardson 	size_t alloc_sz;
53099a2dd95SBruce Richardson 	int flags;
53199a2dd95SBruce Richardson 	void *new_addr;
53299a2dd95SBruce Richardson 	const struct internal_config *internal_conf =
53399a2dd95SBruce Richardson 		eal_get_internal_configuration();
53499a2dd95SBruce Richardson 
53599a2dd95SBruce Richardson 	alloc_sz = hi->hugepage_sz;
53699a2dd95SBruce Richardson 
53799a2dd95SBruce Richardson 	/* these are checked at init, but code analyzers don't know that */
53899a2dd95SBruce Richardson 	if (internal_conf->in_memory && !anonymous_hugepages_supported) {
539ae67895bSDavid Marchand 		EAL_LOG(ERR, "Anonymous hugepages not supported, in-memory mode cannot allocate memory");
54099a2dd95SBruce Richardson 		return -1;
54199a2dd95SBruce Richardson 	}
54299a2dd95SBruce Richardson 	if (internal_conf->in_memory && !memfd_create_supported &&
54399a2dd95SBruce Richardson 			internal_conf->single_file_segments) {
544ae67895bSDavid Marchand 		EAL_LOG(ERR, "Single-file segments are not supported without memfd support");
54599a2dd95SBruce Richardson 		return -1;
54699a2dd95SBruce Richardson 	}
54799a2dd95SBruce Richardson 
54899a2dd95SBruce Richardson 	/* in-memory without memfd is a special case */
54999a2dd95SBruce Richardson 	int mmap_flags;
55099a2dd95SBruce Richardson 
55199a2dd95SBruce Richardson 	if (internal_conf->in_memory && !memfd_create_supported) {
55299a2dd95SBruce Richardson 		const int in_memory_flags = MAP_HUGETLB | MAP_FIXED |
55399a2dd95SBruce Richardson 				MAP_PRIVATE | MAP_ANONYMOUS;
55499a2dd95SBruce Richardson 		int pagesz_flag;
55599a2dd95SBruce Richardson 
55699a2dd95SBruce Richardson 		pagesz_flag = pagesz_flags(alloc_sz);
55799a2dd95SBruce Richardson 		fd = -1;
55832b4771cSDmitry Kozlyuk 		dirty = false;
55999a2dd95SBruce Richardson 		mmap_flags = in_memory_flags | pagesz_flag;
56099a2dd95SBruce Richardson 
56199a2dd95SBruce Richardson 		/* single-file segments codepath will never be active
56299a2dd95SBruce Richardson 		 * here because in-memory mode is incompatible with the
56399a2dd95SBruce Richardson 		 * fallback path, and it's stopped at EAL initialization
56499a2dd95SBruce Richardson 		 * stage.
56599a2dd95SBruce Richardson 		 */
56699a2dd95SBruce Richardson 		map_offset = 0;
56799a2dd95SBruce Richardson 	} else {
56899a2dd95SBruce Richardson 		/* takes out a read lock on segment or segment list */
56932b4771cSDmitry Kozlyuk 		fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx,
57032b4771cSDmitry Kozlyuk 				&dirty);
57199a2dd95SBruce Richardson 		if (fd < 0) {
572ae67895bSDavid Marchand 			EAL_LOG(ERR, "Couldn't get fd on hugepage file");
57399a2dd95SBruce Richardson 			return -1;
57499a2dd95SBruce Richardson 		}
57599a2dd95SBruce Richardson 
57699a2dd95SBruce Richardson 		if (internal_conf->single_file_segments) {
57799a2dd95SBruce Richardson 			map_offset = seg_idx * alloc_sz;
57832b4771cSDmitry Kozlyuk 			ret = resize_hugefile(fd, map_offset, alloc_sz, true,
57932b4771cSDmitry Kozlyuk 					&dirty);
58099a2dd95SBruce Richardson 			if (ret < 0)
58199a2dd95SBruce Richardson 				goto resized;
58299a2dd95SBruce Richardson 
58399a2dd95SBruce Richardson 			fd_list[list_idx].count++;
58499a2dd95SBruce Richardson 		} else {
58599a2dd95SBruce Richardson 			map_offset = 0;
58699a2dd95SBruce Richardson 			if (ftruncate(fd, alloc_sz) < 0) {
587ae67895bSDavid Marchand 				EAL_LOG(DEBUG, "%s(): ftruncate() failed: %s",
58899a2dd95SBruce Richardson 					__func__, strerror(errno));
58999a2dd95SBruce Richardson 				goto resized;
59099a2dd95SBruce Richardson 			}
59152d7d91eSDmitry Kozlyuk 			if (internal_conf->hugepage_file.unlink_before_mapping &&
59299a2dd95SBruce Richardson 					!internal_conf->in_memory) {
59399a2dd95SBruce Richardson 				if (unlink(path)) {
594ae67895bSDavid Marchand 					EAL_LOG(DEBUG, "%s(): unlink() failed: %s",
59599a2dd95SBruce Richardson 						__func__, strerror(errno));
59699a2dd95SBruce Richardson 					goto resized;
59799a2dd95SBruce Richardson 				}
59899a2dd95SBruce Richardson 			}
59999a2dd95SBruce Richardson 		}
60099a2dd95SBruce Richardson 		mmap_flags = MAP_SHARED | MAP_POPULATE | MAP_FIXED;
60199a2dd95SBruce Richardson 	}
60299a2dd95SBruce Richardson 
6039bffc928SOlivier Matz 	huge_register_sigbus();
6049bffc928SOlivier Matz 
60599a2dd95SBruce Richardson 	/*
60699a2dd95SBruce Richardson 	 * map the segment, and populate page tables, the kernel fills
60799a2dd95SBruce Richardson 	 * this segment with zeros if it's a new page.
60899a2dd95SBruce Richardson 	 */
60999a2dd95SBruce Richardson 	va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, mmap_flags, fd,
61099a2dd95SBruce Richardson 			map_offset);
61199a2dd95SBruce Richardson 
61299a2dd95SBruce Richardson 	if (va == MAP_FAILED) {
613ae67895bSDavid Marchand 		EAL_LOG(DEBUG, "%s(): mmap() failed: %s", __func__,
61499a2dd95SBruce Richardson 			strerror(errno));
61599a2dd95SBruce Richardson 		/* mmap failed, but the previous region might have been
61699a2dd95SBruce Richardson 		 * unmapped anyway. try to remap it
61799a2dd95SBruce Richardson 		 */
61899a2dd95SBruce Richardson 		goto unmapped;
61999a2dd95SBruce Richardson 	}
62099a2dd95SBruce Richardson 	if (va != addr) {
621ae67895bSDavid Marchand 		EAL_LOG(DEBUG, "%s(): wrong mmap() address", __func__);
62299a2dd95SBruce Richardson 		munmap(va, alloc_sz);
62399a2dd95SBruce Richardson 		goto resized;
62499a2dd95SBruce Richardson 	}
62599a2dd95SBruce Richardson 
62699a2dd95SBruce Richardson 	/* In linux, hugetlb limitations, like cgroup, are
62799a2dd95SBruce Richardson 	 * enforced at fault time instead of mmap(), even
62899a2dd95SBruce Richardson 	 * with the option of MAP_POPULATE. Kernel will send
62999a2dd95SBruce Richardson 	 * a SIGBUS signal. To avoid to be killed, save stack
63099a2dd95SBruce Richardson 	 * environment here, if SIGBUS happens, we can jump
63199a2dd95SBruce Richardson 	 * back here.
63299a2dd95SBruce Richardson 	 */
63399a2dd95SBruce Richardson 	if (huge_wrap_sigsetjmp()) {
634ae67895bSDavid Marchand 		EAL_LOG(DEBUG, "SIGBUS: Cannot mmap more hugepages of size %uMB",
63599a2dd95SBruce Richardson 			(unsigned int)(alloc_sz >> 20));
63699a2dd95SBruce Richardson 		goto mapped;
63799a2dd95SBruce Richardson 	}
63899a2dd95SBruce Richardson 
63999a2dd95SBruce Richardson 	/* we need to trigger a write to the page to enforce page fault and
64099a2dd95SBruce Richardson 	 * ensure that page is accessible to us, but we can't overwrite value
64199a2dd95SBruce Richardson 	 * that is already there, so read the old value, and write itback.
64299a2dd95SBruce Richardson 	 * kernel populates the page with zeroes initially.
64399a2dd95SBruce Richardson 	 */
64499a2dd95SBruce Richardson 	*(volatile int *)addr = *(volatile int *)addr;
64599a2dd95SBruce Richardson 
64699a2dd95SBruce Richardson 	iova = rte_mem_virt2iova(addr);
64799a2dd95SBruce Richardson 	if (iova == RTE_BAD_PHYS_ADDR) {
648ae67895bSDavid Marchand 		EAL_LOG(DEBUG, "%s(): can't get IOVA addr",
64999a2dd95SBruce Richardson 			__func__);
65099a2dd95SBruce Richardson 		goto mapped;
65199a2dd95SBruce Richardson 	}
65299a2dd95SBruce Richardson 
65399a2dd95SBruce Richardson #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
65499a2dd95SBruce Richardson 	/*
65599a2dd95SBruce Richardson 	 * If the kernel has been built without NUMA support, get_mempolicy()
65699a2dd95SBruce Richardson 	 * will return an error. If check_numa() returns false, memory
65799a2dd95SBruce Richardson 	 * allocation is not NUMA aware and the socket_id should not be
65899a2dd95SBruce Richardson 	 * checked.
65999a2dd95SBruce Richardson 	 */
66099a2dd95SBruce Richardson 	if (check_numa()) {
66199a2dd95SBruce Richardson 		ret = get_mempolicy(&cur_socket_id, NULL, 0, addr,
66299a2dd95SBruce Richardson 					MPOL_F_NODE | MPOL_F_ADDR);
66399a2dd95SBruce Richardson 		if (ret < 0) {
664ae67895bSDavid Marchand 			EAL_LOG(DEBUG, "%s(): get_mempolicy: %s",
66599a2dd95SBruce Richardson 				__func__, strerror(errno));
66699a2dd95SBruce Richardson 			goto mapped;
66799a2dd95SBruce Richardson 		} else if (cur_socket_id != socket_id) {
668ae67895bSDavid Marchand 			EAL_LOG(DEBUG,
669ae67895bSDavid Marchand 					"%s(): allocation happened on wrong socket (wanted %d, got %d)",
67099a2dd95SBruce Richardson 				__func__, socket_id, cur_socket_id);
67199a2dd95SBruce Richardson 			goto mapped;
67299a2dd95SBruce Richardson 		}
67399a2dd95SBruce Richardson 	}
67499a2dd95SBruce Richardson #else
67599a2dd95SBruce Richardson 	if (rte_socket_count() > 1)
676ae67895bSDavid Marchand 		EAL_LOG(DEBUG, "%s(): not checking hugepage NUMA node.",
67799a2dd95SBruce Richardson 				__func__);
67899a2dd95SBruce Richardson #endif
67999a2dd95SBruce Richardson 
6809bffc928SOlivier Matz 	huge_recover_sigbus();
6819bffc928SOlivier Matz 
68299a2dd95SBruce Richardson 	ms->addr = addr;
68399a2dd95SBruce Richardson 	ms->hugepage_sz = alloc_sz;
68499a2dd95SBruce Richardson 	ms->len = alloc_sz;
68599a2dd95SBruce Richardson 	ms->nchannel = rte_memory_get_nchannel();
68699a2dd95SBruce Richardson 	ms->nrank = rte_memory_get_nrank();
68799a2dd95SBruce Richardson 	ms->iova = iova;
68899a2dd95SBruce Richardson 	ms->socket_id = socket_id;
68932b4771cSDmitry Kozlyuk 	ms->flags = dirty ? RTE_MEMSEG_FLAG_DIRTY : 0;
69099a2dd95SBruce Richardson 
69199a2dd95SBruce Richardson 	return 0;
69299a2dd95SBruce Richardson 
69399a2dd95SBruce Richardson mapped:
69499a2dd95SBruce Richardson 	munmap(addr, alloc_sz);
69599a2dd95SBruce Richardson unmapped:
6969bffc928SOlivier Matz 	huge_recover_sigbus();
69799a2dd95SBruce Richardson 	flags = EAL_RESERVE_FORCE_ADDRESS;
69899a2dd95SBruce Richardson 	new_addr = eal_get_virtual_area(addr, &alloc_sz, alloc_sz, 0, flags);
69999a2dd95SBruce Richardson 	if (new_addr != addr) {
70099a2dd95SBruce Richardson 		if (new_addr != NULL)
70199a2dd95SBruce Richardson 			munmap(new_addr, alloc_sz);
70299a2dd95SBruce Richardson 		/* we're leaving a hole in our virtual address space. if
70399a2dd95SBruce Richardson 		 * somebody else maps this hole now, we could accidentally
70499a2dd95SBruce Richardson 		 * override it in the future.
70599a2dd95SBruce Richardson 		 */
706ae67895bSDavid Marchand 		EAL_LOG(CRIT, "Can't mmap holes in our virtual address space");
70799a2dd95SBruce Richardson 	}
70899a2dd95SBruce Richardson 	/* roll back the ref count */
70999a2dd95SBruce Richardson 	if (internal_conf->single_file_segments)
71099a2dd95SBruce Richardson 		fd_list[list_idx].count--;
71199a2dd95SBruce Richardson resized:
71299a2dd95SBruce Richardson 	/* some codepaths will return negative fd, so exit early */
71399a2dd95SBruce Richardson 	if (fd < 0)
71499a2dd95SBruce Richardson 		return -1;
71599a2dd95SBruce Richardson 
71699a2dd95SBruce Richardson 	if (internal_conf->single_file_segments) {
71732b4771cSDmitry Kozlyuk 		resize_hugefile(fd, map_offset, alloc_sz, false, NULL);
71899a2dd95SBruce Richardson 		/* ignore failure, can't make it any worse */
71999a2dd95SBruce Richardson 
72099a2dd95SBruce Richardson 		/* if refcount is at zero, close the file */
72199a2dd95SBruce Richardson 		if (fd_list[list_idx].count == 0)
72299a2dd95SBruce Richardson 			close_hugefile(fd, path, list_idx);
72399a2dd95SBruce Richardson 	} else {
72499a2dd95SBruce Richardson 		/* only remove file if we can take out a write lock */
72552d7d91eSDmitry Kozlyuk 		if (!internal_conf->hugepage_file.unlink_before_mapping &&
72699a2dd95SBruce Richardson 				internal_conf->in_memory == 0 &&
72799a2dd95SBruce Richardson 				lock(fd, LOCK_EX) == 1)
72899a2dd95SBruce Richardson 			unlink(path);
72999a2dd95SBruce Richardson 		close(fd);
73099a2dd95SBruce Richardson 		fd_list[list_idx].fds[seg_idx] = -1;
73199a2dd95SBruce Richardson 	}
73299a2dd95SBruce Richardson 	return -1;
73399a2dd95SBruce Richardson }
73499a2dd95SBruce Richardson 
/*
 * Free a single allocated segment: wipe its contents, replace the mapping
 * with an inaccessible anonymous one (so the VA range stays reserved and
 * cannot be reused by other mappings), then release or shrink the backing
 * hugepage file as appropriate for the current mode.
 *
 * Returns 0 on success, -1 on failure.
 */
static int
free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
		unsigned int list_idx, unsigned int seg_idx)
{
	uint64_t map_offset;
	char path[PATH_MAX];
	int fd, ret = 0;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* erase page data */
	memset(ms->addr, 0, ms->len);

	/* overlay an anonymous PROT_NONE mapping: this drops the hugepage
	 * backing while keeping the virtual address range occupied.
	 */
	if (mmap(ms->addr, ms->len, PROT_NONE,
			MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) ==
				MAP_FAILED) {
		EAL_LOG(DEBUG, "couldn't unmap page");
		return -1;
	}

	eal_mem_set_dump(ms->addr, ms->len, false);

	/* if we're using anonymous hugepages, nothing to be done */
	if (internal_conf->in_memory && !memfd_create_supported) {
		memset(ms, 0, sizeof(*ms));
		return 0;
	}

	/* if we are not in single file segments mode, we're going to unmap the
	 * segment and thus drop the lock on original fd, but hugepage dir is
	 * now locked so we can take out another one without races.
	 */
	fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx, NULL);
	if (fd < 0)
		return -1;

	if (internal_conf->single_file_segments) {
		/* punch a hole in the shared per-list file at this
		 * segment's offset.
		 */
		map_offset = seg_idx * ms->len;
		if (resize_hugefile(fd, map_offset, ms->len, false, NULL))
			return -1;

		/* last reference to the shared file is gone - close it */
		if (--(fd_list[list_idx].count) == 0)
			close_hugefile(fd, path, list_idx);

		ret = 0;
	} else {
		/* if we're able to take out a write lock, we're the last one
		 * holding onto this page.
		 */
		if (!internal_conf->in_memory &&
				internal_conf->hugepage_file.unlink_existing &&
				!internal_conf->hugepage_file.unlink_before_mapping) {
			ret = lock(fd, LOCK_EX);
			if (ret >= 0) {
				/* no one else is using this page */
				if (ret == 1)
					unlink(path);
			}
		}
		/* closing fd will drop the lock */
		close(fd);
		fd_list[list_idx].fds[seg_idx] = -1;
	}

	memset(ms, 0, sizeof(*ms));

	return ret < 0 ? -1 : 0;
}
80399a2dd95SBruce Richardson 
/* State for alloc_seg_walk(): describes one bulk allocation request. */
struct alloc_walk_param {
	struct hugepage_info *hi;	/* hugepage info matching page_sz */
	struct rte_memseg **ms;		/* out: allocated segments (may be NULL) */
	size_t page_sz;			/* page size a memseg list must match */
	unsigned int segs_allocated;	/* out: how many segments were allocated */
	unsigned int n_segs;		/* number of segments requested */
	int socket;			/* NUMA socket a memseg list must match */
	bool exact;			/* if true, fail unless n_segs are allocated */
};
/*
 * Memseg list walk callback for eal_memalloc_alloc_seg_bulk(): attempt to
 * allocate wa->n_segs pages from a memseg list matching the requested page
 * size and socket.
 *
 * Returns 1 if at least one segment was allocated (stops the walk),
 * 0 to continue to the next list, -1 on error.
 */
static int
alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct alloc_walk_param *wa = arg;
	struct rte_memseg_list *cur_msl;
	size_t page_sz;
	int cur_idx, start_idx, j, dir_fd = -1;
	unsigned int msl_idx, need, i;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* only consider lists matching the requested page size and socket */
	if (msl->page_sz != wa->page_sz)
		return 0;
	if (msl->socket_id != wa->socket)
		return 0;

	page_sz = (size_t)msl->page_sz;

	/* index of this list within the shared mem config */
	msl_idx = msl - mcfg->memsegs;
	cur_msl = &mcfg->memsegs[msl_idx];

	need = wa->n_segs;

	/* try finding space in memseg list */
	if (wa->exact) {
		/* if we require exact number of pages in a list, find them */
		cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0,
				need);
		if (cur_idx < 0)
			return 0;
		start_idx = cur_idx;
	} else {
		int cur_len;

		/* we don't require exact number of pages, so we're going to go
		 * for best-effort allocation. that means finding the biggest
		 * unused block, and going with that.
		 */
		cur_idx = rte_fbarray_find_biggest_free(&cur_msl->memseg_arr,
				0);
		if (cur_idx < 0)
			return 0;
		start_idx = cur_idx;
		/* adjust the size to possibly be smaller than original
		 * request, but do not allow it to be bigger.
		 */
		cur_len = rte_fbarray_find_contig_free(&cur_msl->memseg_arr,
				cur_idx);
		need = RTE_MIN(need, (unsigned int)cur_len);
	}

	/* do not allow any page allocations during the time we're allocating,
	 * because file creation and locking operations are not atomic,
	 * and we might be the first or the last ones to use a particular page,
	 * so we need to ensure atomicity of every operation.
	 *
	 * during init, we already hold a write lock, so don't try to take out
	 * another one.
	 */
	if (wa->hi->lock_descriptor == -1 && !internal_conf->in_memory) {
		dir_fd = open(wa->hi->hugedir, O_RDONLY);
		if (dir_fd < 0) {
			EAL_LOG(ERR, "%s(): Cannot open '%s': %s",
				__func__, wa->hi->hugedir, strerror(errno));
			return -1;
		}
		/* blocking writelock */
		if (flock(dir_fd, LOCK_EX)) {
			EAL_LOG(ERR, "%s(): Cannot lock '%s': %s",
				__func__, wa->hi->hugedir, strerror(errno));
			close(dir_fd);
			return -1;
		}
	}

	/* allocate one page at a time; on failure, either stop (best-effort)
	 * or roll back everything allocated so far (exact mode).
	 */
	for (i = 0; i < need; i++, cur_idx++) {
		struct rte_memseg *cur;
		void *map_addr;

		cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx);
		map_addr = RTE_PTR_ADD(cur_msl->base_va,
				cur_idx * page_sz);

		if (alloc_seg(cur, map_addr, wa->socket, wa->hi,
				msl_idx, cur_idx)) {
			EAL_LOG(DEBUG, "attempted to allocate %i segments, but only %i were allocated",
				need, i);

			/* if exact number wasn't requested, stop */
			if (!wa->exact)
				goto out;

			/* clean up */
			for (j = start_idx; j < cur_idx; j++) {
				struct rte_memseg *tmp;
				struct rte_fbarray *arr =
						&cur_msl->memseg_arr;

				tmp = rte_fbarray_get(arr, j);
				rte_fbarray_set_free(arr, j);

				/* free_seg may attempt to create a file, which
				 * may fail.
				 */
				if (free_seg(tmp, wa->hi, msl_idx, j))
					EAL_LOG(DEBUG, "Cannot free page");
			}
			/* clear the list */
			if (wa->ms)
				memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs);

			if (dir_fd >= 0)
				close(dir_fd);
			return -1;
		}
		if (wa->ms)
			wa->ms[i] = cur;

		rte_fbarray_set_used(&cur_msl->memseg_arr, cur_idx);
	}
out:
	wa->segs_allocated = i;
	if (i > 0)
		cur_msl->version++;
	if (dir_fd >= 0)
		close(dir_fd);
	/* if we didn't allocate any segments, move on to the next list */
	return i > 0;
}
94399a2dd95SBruce Richardson 
/* State for free_seg_walk(): identifies the single segment to free. */
struct free_walk_param {
	struct hugepage_info *hi;	/* hugepage info for the segment's page size */
	struct rte_memseg *ms;		/* segment to be freed */
};
/*
 * Memseg list walk callback for eal_memalloc_free_seg_bulk(): locate the
 * list containing wa->ms by address, mark the segment free and release it.
 *
 * Returns 1 if the segment was found and processed (stops the walk),
 * 0 to continue to the next list, -1 on error.
 */
static int
free_seg_walk(const struct rte_memseg_list *msl, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *found_msl;
	struct free_walk_param *wa = arg;
	uintptr_t start_addr, end_addr;
	int msl_idx, seg_idx, ret, dir_fd = -1;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* does this list's VA range contain the segment's address? */
	start_addr = (uintptr_t) msl->base_va;
	end_addr = start_addr + msl->len;

	if ((uintptr_t)wa->ms->addr < start_addr ||
			(uintptr_t)wa->ms->addr >= end_addr)
		return 0;

	msl_idx = msl - mcfg->memsegs;
	seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz;

	/* msl is const */
	found_msl = &mcfg->memsegs[msl_idx];

	/* do not allow any page allocations during the time we're freeing,
	 * because file creation and locking operations are not atomic,
	 * and we might be the first or the last ones to use a particular page,
	 * so we need to ensure atomicity of every operation.
	 *
	 * during init, we already hold a write lock, so don't try to take out
	 * another one.
	 */
	if (wa->hi->lock_descriptor == -1 && !internal_conf->in_memory) {
		dir_fd = open(wa->hi->hugedir, O_RDONLY);
		if (dir_fd < 0) {
			EAL_LOG(ERR, "%s(): Cannot open '%s': %s",
				__func__, wa->hi->hugedir, strerror(errno));
			return -1;
		}
		/* blocking writelock */
		if (flock(dir_fd, LOCK_EX)) {
			EAL_LOG(ERR, "%s(): Cannot lock '%s': %s",
				__func__, wa->hi->hugedir, strerror(errno));
			close(dir_fd);
			return -1;
		}
	}

	/* bump list version so other processes can detect the change */
	found_msl->version++;

	rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx);

	ret = free_seg(wa->ms, wa->hi, msl_idx, seg_idx);

	if (dir_fd >= 0)
		close(dir_fd);

	if (ret < 0)
		return -1;

	return 1;
}
101099a2dd95SBruce Richardson 
101199a2dd95SBruce Richardson int
eal_memalloc_alloc_seg_bulk(struct rte_memseg ** ms,int n_segs,size_t page_sz,int socket,bool exact)101299a2dd95SBruce Richardson eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz,
101399a2dd95SBruce Richardson 		int socket, bool exact)
101499a2dd95SBruce Richardson {
101599a2dd95SBruce Richardson 	int i, ret = -1;
101699a2dd95SBruce Richardson #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
101799a2dd95SBruce Richardson 	bool have_numa = false;
101899a2dd95SBruce Richardson 	int oldpolicy;
101999a2dd95SBruce Richardson 	struct bitmask *oldmask;
102099a2dd95SBruce Richardson #endif
102199a2dd95SBruce Richardson 	struct alloc_walk_param wa;
102299a2dd95SBruce Richardson 	struct hugepage_info *hi = NULL;
102399a2dd95SBruce Richardson 	struct internal_config *internal_conf =
102499a2dd95SBruce Richardson 		eal_get_internal_configuration();
102599a2dd95SBruce Richardson 
102699a2dd95SBruce Richardson 	memset(&wa, 0, sizeof(wa));
102799a2dd95SBruce Richardson 
102899a2dd95SBruce Richardson 	/* dynamic allocation not supported in legacy mode */
102999a2dd95SBruce Richardson 	if (internal_conf->legacy_mem)
103099a2dd95SBruce Richardson 		return -1;
103199a2dd95SBruce Richardson 
103299a2dd95SBruce Richardson 	for (i = 0; i < (int) RTE_DIM(internal_conf->hugepage_info); i++) {
103399a2dd95SBruce Richardson 		if (page_sz ==
103499a2dd95SBruce Richardson 				internal_conf->hugepage_info[i].hugepage_sz) {
103599a2dd95SBruce Richardson 			hi = &internal_conf->hugepage_info[i];
103699a2dd95SBruce Richardson 			break;
103799a2dd95SBruce Richardson 		}
103899a2dd95SBruce Richardson 	}
103999a2dd95SBruce Richardson 	if (!hi) {
1040ae67895bSDavid Marchand 		EAL_LOG(ERR, "%s(): can't find relevant hugepage_info entry",
104199a2dd95SBruce Richardson 			__func__);
104299a2dd95SBruce Richardson 		return -1;
104399a2dd95SBruce Richardson 	}
104499a2dd95SBruce Richardson 
104599a2dd95SBruce Richardson #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
104699a2dd95SBruce Richardson 	if (check_numa()) {
104799a2dd95SBruce Richardson 		oldmask = numa_allocate_nodemask();
104899a2dd95SBruce Richardson 		prepare_numa(&oldpolicy, oldmask, socket);
104999a2dd95SBruce Richardson 		have_numa = true;
105099a2dd95SBruce Richardson 	}
105199a2dd95SBruce Richardson #endif
105299a2dd95SBruce Richardson 
105399a2dd95SBruce Richardson 	wa.exact = exact;
105499a2dd95SBruce Richardson 	wa.hi = hi;
105599a2dd95SBruce Richardson 	wa.ms = ms;
105699a2dd95SBruce Richardson 	wa.n_segs = n_segs;
105799a2dd95SBruce Richardson 	wa.page_sz = page_sz;
105899a2dd95SBruce Richardson 	wa.socket = socket;
105999a2dd95SBruce Richardson 	wa.segs_allocated = 0;
106099a2dd95SBruce Richardson 
106199a2dd95SBruce Richardson 	/* memalloc is locked, so it's safe to use thread-unsafe version */
106299a2dd95SBruce Richardson 	ret = rte_memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa);
106399a2dd95SBruce Richardson 	if (ret == 0) {
1064*8f4611d8SDavid Marchand 		EAL_LOG(DEBUG, "%s(): couldn't find suitable memseg_list",
106599a2dd95SBruce Richardson 			__func__);
106699a2dd95SBruce Richardson 		ret = -1;
106799a2dd95SBruce Richardson 	} else if (ret > 0) {
106899a2dd95SBruce Richardson 		ret = (int)wa.segs_allocated;
106999a2dd95SBruce Richardson 	}
107099a2dd95SBruce Richardson 
107199a2dd95SBruce Richardson #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
107299a2dd95SBruce Richardson 	if (have_numa)
107399a2dd95SBruce Richardson 		restore_numa(&oldpolicy, oldmask);
107499a2dd95SBruce Richardson #endif
107599a2dd95SBruce Richardson 	return ret;
107699a2dd95SBruce Richardson }
107799a2dd95SBruce Richardson 
107899a2dd95SBruce Richardson struct rte_memseg *
eal_memalloc_alloc_seg(size_t page_sz,int socket)107999a2dd95SBruce Richardson eal_memalloc_alloc_seg(size_t page_sz, int socket)
108099a2dd95SBruce Richardson {
108199a2dd95SBruce Richardson 	struct rte_memseg *ms;
108299a2dd95SBruce Richardson 	if (eal_memalloc_alloc_seg_bulk(&ms, 1, page_sz, socket, true) < 0)
108399a2dd95SBruce Richardson 		return NULL;
108499a2dd95SBruce Richardson 	/* return pointer to newly allocated memseg */
108599a2dd95SBruce Richardson 	return ms;
108699a2dd95SBruce Richardson }
108799a2dd95SBruce Richardson 
108899a2dd95SBruce Richardson int
eal_memalloc_free_seg_bulk(struct rte_memseg ** ms,int n_segs)108999a2dd95SBruce Richardson eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs)
109099a2dd95SBruce Richardson {
109199a2dd95SBruce Richardson 	int seg, ret = 0;
109299a2dd95SBruce Richardson 	struct internal_config *internal_conf =
109399a2dd95SBruce Richardson 		eal_get_internal_configuration();
109499a2dd95SBruce Richardson 
109599a2dd95SBruce Richardson 	/* dynamic free not supported in legacy mode */
109699a2dd95SBruce Richardson 	if (internal_conf->legacy_mem)
109799a2dd95SBruce Richardson 		return -1;
109899a2dd95SBruce Richardson 
109999a2dd95SBruce Richardson 	for (seg = 0; seg < n_segs; seg++) {
110099a2dd95SBruce Richardson 		struct rte_memseg *cur = ms[seg];
110199a2dd95SBruce Richardson 		struct hugepage_info *hi = NULL;
110299a2dd95SBruce Richardson 		struct free_walk_param wa;
110399a2dd95SBruce Richardson 		int i, walk_res;
110499a2dd95SBruce Richardson 
110599a2dd95SBruce Richardson 		/* if this page is marked as unfreeable, fail */
110699a2dd95SBruce Richardson 		if (cur->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) {
1107ae67895bSDavid Marchand 			EAL_LOG(DEBUG, "Page is not allowed to be freed");
110899a2dd95SBruce Richardson 			ret = -1;
110999a2dd95SBruce Richardson 			continue;
111099a2dd95SBruce Richardson 		}
111199a2dd95SBruce Richardson 
111299a2dd95SBruce Richardson 		memset(&wa, 0, sizeof(wa));
111399a2dd95SBruce Richardson 
111499a2dd95SBruce Richardson 		for (i = 0; i < (int)RTE_DIM(internal_conf->hugepage_info);
111599a2dd95SBruce Richardson 				i++) {
111699a2dd95SBruce Richardson 			hi = &internal_conf->hugepage_info[i];
111799a2dd95SBruce Richardson 			if (cur->hugepage_sz == hi->hugepage_sz)
111899a2dd95SBruce Richardson 				break;
111999a2dd95SBruce Richardson 		}
112099a2dd95SBruce Richardson 		if (i == (int)RTE_DIM(internal_conf->hugepage_info)) {
1121ae67895bSDavid Marchand 			EAL_LOG(ERR, "Can't find relevant hugepage_info entry");
112299a2dd95SBruce Richardson 			ret = -1;
112399a2dd95SBruce Richardson 			continue;
112499a2dd95SBruce Richardson 		}
112599a2dd95SBruce Richardson 
112699a2dd95SBruce Richardson 		wa.ms = cur;
112799a2dd95SBruce Richardson 		wa.hi = hi;
112899a2dd95SBruce Richardson 
112999a2dd95SBruce Richardson 		/* memalloc is locked, so it's safe to use thread-unsafe version
113099a2dd95SBruce Richardson 		 */
113199a2dd95SBruce Richardson 		walk_res = rte_memseg_list_walk_thread_unsafe(free_seg_walk,
113299a2dd95SBruce Richardson 				&wa);
113399a2dd95SBruce Richardson 		if (walk_res == 1)
113499a2dd95SBruce Richardson 			continue;
113599a2dd95SBruce Richardson 		if (walk_res == 0)
1136ae67895bSDavid Marchand 			EAL_LOG(ERR, "Couldn't find memseg list");
113799a2dd95SBruce Richardson 		ret = -1;
113899a2dd95SBruce Richardson 	}
113999a2dd95SBruce Richardson 	return ret;
114099a2dd95SBruce Richardson }
114199a2dd95SBruce Richardson 
114299a2dd95SBruce Richardson int
eal_memalloc_free_seg(struct rte_memseg * ms)114399a2dd95SBruce Richardson eal_memalloc_free_seg(struct rte_memseg *ms)
114499a2dd95SBruce Richardson {
114599a2dd95SBruce Richardson 	const struct internal_config *internal_conf =
114699a2dd95SBruce Richardson 		eal_get_internal_configuration();
114799a2dd95SBruce Richardson 
114899a2dd95SBruce Richardson 	/* dynamic free not supported in legacy mode */
114999a2dd95SBruce Richardson 	if (internal_conf->legacy_mem)
115099a2dd95SBruce Richardson 		return -1;
115199a2dd95SBruce Richardson 
115299a2dd95SBruce Richardson 	return eal_memalloc_free_seg_bulk(&ms, 1);
115399a2dd95SBruce Richardson }
115499a2dd95SBruce Richardson 
/*
 * Synchronize one contiguous run of segments of a memseg list between the
 * primary process's fbarray (p_arr) and this secondary's shadow copy (l_arr).
 *
 * 'used' selects direction: true means replicate allocations (map pages
 * locally), false means replicate frees (unmap pages locally). The range
 * considered is [start, end); only the first locally-contiguous run inside
 * it is processed per call, so callers invoke this repeatedly.
 *
 * Returns the number of segments that can be skipped before the next
 * differing run (callers advance by this much), or -1 on failure.
 */
static int
sync_chunk(struct rte_memseg_list *primary_msl,
		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
		unsigned int msl_idx, bool used, int start, int end)
{
	struct rte_fbarray *l_arr, *p_arr;
	int i, ret, chunk_len, diff_len;

	l_arr = &local_msl->memseg_arr;
	p_arr = &primary_msl->memseg_arr;

	/* we need to aggregate allocations/deallocations into bigger chunks,
	 * as we don't want to spam the user with per-page callbacks.
	 *
	 * to avoid any potential issues, we also want to trigger
	 * deallocation callbacks *before* we actually deallocate
	 * memory, so that the user application could wrap up its use
	 * before it goes away.
	 */

	chunk_len = end - start;

	/* find how many contiguous pages we can map/unmap for this chunk */
	diff_len = used ?
			rte_fbarray_find_contig_free(l_arr, start) :
			rte_fbarray_find_contig_used(l_arr, start);

	/* has to be at least one page */
	if (diff_len < 1)
		return -1;

	/* never process more than the requested range in one call */
	diff_len = RTE_MIN(chunk_len, diff_len);

	/* if we are freeing memory, notify the application */
	if (!used) {
		struct rte_memseg *ms;
		void *start_va;
		size_t len, page_sz;

		ms = rte_fbarray_get(l_arr, start);
		start_va = ms->addr;
		page_sz = (size_t)primary_msl->page_sz;
		len = page_sz * diff_len;

		eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE,
				start_va, len);
	}

	/* replicate the primary's state page by page */
	for (i = 0; i < diff_len; i++) {
		struct rte_memseg *p_ms, *l_ms;
		int seg_idx = start + i;

		l_ms = rte_fbarray_get(l_arr, seg_idx);
		p_ms = rte_fbarray_get(p_arr, seg_idx);

		if (l_ms == NULL || p_ms == NULL)
			return -1;

		if (used) {
			/* map the page at the same VA the primary has it */
			ret = alloc_seg(l_ms, p_ms->addr,
					p_ms->socket_id, hi,
					msl_idx, seg_idx);
			if (ret < 0)
				return -1;
			rte_fbarray_set_used(l_arr, seg_idx);
		} else {
			/* NOTE(review): local map is marked free before the
			 * free_seg() result is checked - presumably so the
			 * shadow map never claims a page the primary freed.
			 */
			ret = free_seg(l_ms, hi, msl_idx, seg_idx);
			rte_fbarray_set_free(l_arr, seg_idx);
			if (ret < 0)
				return -1;
		}
	}

	/* if we just allocated memory, notify the application */
	if (used) {
		struct rte_memseg *ms;
		void *start_va;
		size_t len, page_sz;

		ms = rte_fbarray_get(l_arr, start);
		start_va = ms->addr;
		page_sz = (size_t)primary_msl->page_sz;
		len = page_sz * diff_len;

		eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC,
				start_va, len);
	}

	/* calculate how much we can advance until next chunk */
	diff_len = used ?
			rte_fbarray_find_contig_used(l_arr, start) :
			rte_fbarray_find_contig_free(l_arr, start);
	ret = RTE_MIN(chunk_len, diff_len);

	return ret;
}
125199a2dd95SBruce Richardson 
/*
 * Walk the primary's fbarray and bring the local (secondary) fbarray in
 * line with it, for one state at a time: 'used' == true syncs allocated
 * segments, 'used' == false syncs free segments. Called twice by
 * sync_existing(), once per state.
 *
 * Returns 0 on success, -1 if any chunk failed to sync.
 */
static int
sync_status(struct rte_memseg_list *primary_msl,
		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
		unsigned int msl_idx, bool used)
{
	struct rte_fbarray *l_arr, *p_arr;
	int p_idx, l_chunk_len, p_chunk_len, ret;
	int start, end;

	/* this is a little bit tricky, but the basic idea is - walk both lists
	 * and spot any places where there are discrepancies. walking both lists
	 * and noting discrepancies in a single go is a hard problem, so we do
	 * it in two passes - first we spot any places where allocated segments
	 * mismatch (i.e. ensure that everything that's allocated in the primary
	 * is also allocated in the secondary), and then we do it by looking at
	 * free segments instead.
	 *
	 * we also need to aggregate changes into chunks, as we have to call
	 * callbacks per allocation, not per page.
	 */
	l_arr = &local_msl->memseg_arr;
	p_arr = &primary_msl->memseg_arr;

	/* find the first chunk of the requested state in the primary */
	if (used)
		p_idx = rte_fbarray_find_next_used(p_arr, 0);
	else
		p_idx = rte_fbarray_find_next_free(p_arr, 0);

	while (p_idx >= 0) {
		int next_chunk_search_idx;

		/* measure the contiguous run at p_idx in both maps */
		if (used) {
			p_chunk_len = rte_fbarray_find_contig_used(p_arr,
					p_idx);
			l_chunk_len = rte_fbarray_find_contig_used(l_arr,
					p_idx);
		} else {
			p_chunk_len = rte_fbarray_find_contig_free(p_arr,
					p_idx);
			l_chunk_len = rte_fbarray_find_contig_free(l_arr,
					p_idx);
		}
		/* best case scenario - no differences (or bigger, which will be
		 * fixed during next iteration), look for next chunk
		 */
		if (l_chunk_len >= p_chunk_len) {
			next_chunk_search_idx = p_idx + p_chunk_len;
			goto next_chunk;
		}

		/* if both chunks start at the same point, skip parts we know
		 * are identical, and sync the rest. each call to sync_chunk
		 * will only sync contiguous segments, so we need to call this
		 * until we are sure there are no more differences in this
		 * chunk.
		 */
		start = p_idx + l_chunk_len;
		end = p_idx + p_chunk_len;
		do {
			ret = sync_chunk(primary_msl, local_msl, hi, msl_idx,
					used, start, end);
			start += ret;
		} while (start < end && ret >= 0);
		/* if ret is negative, something went wrong */
		if (ret < 0)
			return -1;

		next_chunk_search_idx = p_idx + p_chunk_len;
next_chunk:
		/* skip to end of this chunk */
		if (used) {
			p_idx = rte_fbarray_find_next_used(p_arr,
					next_chunk_search_idx);
		} else {
			p_idx = rte_fbarray_find_next_free(p_arr,
					next_chunk_search_idx);
		}
	}
	return 0;
}
133299a2dd95SBruce Richardson 
133399a2dd95SBruce Richardson static int
sync_existing(struct rte_memseg_list * primary_msl,struct rte_memseg_list * local_msl,struct hugepage_info * hi,unsigned int msl_idx)133499a2dd95SBruce Richardson sync_existing(struct rte_memseg_list *primary_msl,
133599a2dd95SBruce Richardson 		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
133699a2dd95SBruce Richardson 		unsigned int msl_idx)
133799a2dd95SBruce Richardson {
133899a2dd95SBruce Richardson 	int ret, dir_fd;
133999a2dd95SBruce Richardson 
134099a2dd95SBruce Richardson 	/* do not allow any page allocations during the time we're allocating,
134199a2dd95SBruce Richardson 	 * because file creation and locking operations are not atomic,
134299a2dd95SBruce Richardson 	 * and we might be the first or the last ones to use a particular page,
134399a2dd95SBruce Richardson 	 * so we need to ensure atomicity of every operation.
134499a2dd95SBruce Richardson 	 */
134599a2dd95SBruce Richardson 	dir_fd = open(hi->hugedir, O_RDONLY);
134699a2dd95SBruce Richardson 	if (dir_fd < 0) {
1347ae67895bSDavid Marchand 		EAL_LOG(ERR, "%s(): Cannot open '%s': %s", __func__,
134899a2dd95SBruce Richardson 			hi->hugedir, strerror(errno));
134999a2dd95SBruce Richardson 		return -1;
135099a2dd95SBruce Richardson 	}
135199a2dd95SBruce Richardson 	/* blocking writelock */
135299a2dd95SBruce Richardson 	if (flock(dir_fd, LOCK_EX)) {
1353ae67895bSDavid Marchand 		EAL_LOG(ERR, "%s(): Cannot lock '%s': %s", __func__,
135499a2dd95SBruce Richardson 			hi->hugedir, strerror(errno));
135599a2dd95SBruce Richardson 		close(dir_fd);
135699a2dd95SBruce Richardson 		return -1;
135799a2dd95SBruce Richardson 	}
135899a2dd95SBruce Richardson 
135999a2dd95SBruce Richardson 	/* ensure all allocated space is the same in both lists */
136099a2dd95SBruce Richardson 	ret = sync_status(primary_msl, local_msl, hi, msl_idx, true);
136199a2dd95SBruce Richardson 	if (ret < 0)
136299a2dd95SBruce Richardson 		goto fail;
136399a2dd95SBruce Richardson 
136499a2dd95SBruce Richardson 	/* ensure all unallocated space is the same in both lists */
136599a2dd95SBruce Richardson 	ret = sync_status(primary_msl, local_msl, hi, msl_idx, false);
136699a2dd95SBruce Richardson 	if (ret < 0)
136799a2dd95SBruce Richardson 		goto fail;
136899a2dd95SBruce Richardson 
136999a2dd95SBruce Richardson 	/* update version number */
137099a2dd95SBruce Richardson 	local_msl->version = primary_msl->version;
137199a2dd95SBruce Richardson 
137299a2dd95SBruce Richardson 	close(dir_fd);
137399a2dd95SBruce Richardson 
137499a2dd95SBruce Richardson 	return 0;
137599a2dd95SBruce Richardson fail:
137699a2dd95SBruce Richardson 	close(dir_fd);
137799a2dd95SBruce Richardson 	return -1;
137899a2dd95SBruce Richardson }
137999a2dd95SBruce Richardson 
138099a2dd95SBruce Richardson static int
sync_walk(const struct rte_memseg_list * msl,void * arg __rte_unused)138199a2dd95SBruce Richardson sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused)
138299a2dd95SBruce Richardson {
138399a2dd95SBruce Richardson 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
138499a2dd95SBruce Richardson 	struct rte_memseg_list *primary_msl, *local_msl;
138599a2dd95SBruce Richardson 	struct hugepage_info *hi = NULL;
138699a2dd95SBruce Richardson 	unsigned int i;
138799a2dd95SBruce Richardson 	int msl_idx;
138899a2dd95SBruce Richardson 	struct internal_config *internal_conf =
138999a2dd95SBruce Richardson 		eal_get_internal_configuration();
139099a2dd95SBruce Richardson 
139199a2dd95SBruce Richardson 	if (msl->external)
139299a2dd95SBruce Richardson 		return 0;
139399a2dd95SBruce Richardson 
139499a2dd95SBruce Richardson 	msl_idx = msl - mcfg->memsegs;
139599a2dd95SBruce Richardson 	primary_msl = &mcfg->memsegs[msl_idx];
139699a2dd95SBruce Richardson 	local_msl = &local_memsegs[msl_idx];
139799a2dd95SBruce Richardson 
139899a2dd95SBruce Richardson 	for (i = 0; i < RTE_DIM(internal_conf->hugepage_info); i++) {
139999a2dd95SBruce Richardson 		uint64_t cur_sz =
140099a2dd95SBruce Richardson 			internal_conf->hugepage_info[i].hugepage_sz;
140199a2dd95SBruce Richardson 		uint64_t msl_sz = primary_msl->page_sz;
140299a2dd95SBruce Richardson 		if (msl_sz == cur_sz) {
140399a2dd95SBruce Richardson 			hi = &internal_conf->hugepage_info[i];
140499a2dd95SBruce Richardson 			break;
140599a2dd95SBruce Richardson 		}
140699a2dd95SBruce Richardson 	}
140799a2dd95SBruce Richardson 	if (!hi) {
1408ae67895bSDavid Marchand 		EAL_LOG(ERR, "Can't find relevant hugepage_info entry");
140999a2dd95SBruce Richardson 		return -1;
141099a2dd95SBruce Richardson 	}
141199a2dd95SBruce Richardson 
141299a2dd95SBruce Richardson 	/* if versions don't match, synchronize everything */
141399a2dd95SBruce Richardson 	if (local_msl->version != primary_msl->version &&
141499a2dd95SBruce Richardson 			sync_existing(primary_msl, local_msl, hi, msl_idx))
141599a2dd95SBruce Richardson 		return -1;
141699a2dd95SBruce Richardson 	return 0;
141799a2dd95SBruce Richardson }
141899a2dd95SBruce Richardson 
141999a2dd95SBruce Richardson 
142099a2dd95SBruce Richardson int
eal_memalloc_sync_with_primary(void)142199a2dd95SBruce Richardson eal_memalloc_sync_with_primary(void)
142299a2dd95SBruce Richardson {
142399a2dd95SBruce Richardson 	/* nothing to be done in primary */
142499a2dd95SBruce Richardson 	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
142599a2dd95SBruce Richardson 		return 0;
142699a2dd95SBruce Richardson 
142799a2dd95SBruce Richardson 	/* memalloc is locked, so it's safe to call thread-unsafe version */
142899a2dd95SBruce Richardson 	if (rte_memseg_list_walk_thread_unsafe(sync_walk, NULL))
142999a2dd95SBruce Richardson 		return -1;
143099a2dd95SBruce Richardson 	return 0;
143199a2dd95SBruce Richardson }
143299a2dd95SBruce Richardson 
143399a2dd95SBruce Richardson static int
secondary_msl_create_walk(const struct rte_memseg_list * msl,void * arg __rte_unused)143499a2dd95SBruce Richardson secondary_msl_create_walk(const struct rte_memseg_list *msl,
143599a2dd95SBruce Richardson 		void *arg __rte_unused)
143699a2dd95SBruce Richardson {
143799a2dd95SBruce Richardson 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
143899a2dd95SBruce Richardson 	struct rte_memseg_list *primary_msl, *local_msl;
143999a2dd95SBruce Richardson 	char name[PATH_MAX];
144099a2dd95SBruce Richardson 	int msl_idx, ret;
144199a2dd95SBruce Richardson 
144299a2dd95SBruce Richardson 	if (msl->external)
144399a2dd95SBruce Richardson 		return 0;
144499a2dd95SBruce Richardson 
144599a2dd95SBruce Richardson 	msl_idx = msl - mcfg->memsegs;
144699a2dd95SBruce Richardson 	primary_msl = &mcfg->memsegs[msl_idx];
144799a2dd95SBruce Richardson 	local_msl = &local_memsegs[msl_idx];
144899a2dd95SBruce Richardson 
144999a2dd95SBruce Richardson 	/* create distinct fbarrays for each secondary */
145099a2dd95SBruce Richardson 	snprintf(name, RTE_FBARRAY_NAME_LEN, "%s_%i",
145199a2dd95SBruce Richardson 		primary_msl->memseg_arr.name, getpid());
145299a2dd95SBruce Richardson 
145399a2dd95SBruce Richardson 	ret = rte_fbarray_init(&local_msl->memseg_arr, name,
145499a2dd95SBruce Richardson 		primary_msl->memseg_arr.len,
145599a2dd95SBruce Richardson 		primary_msl->memseg_arr.elt_sz);
145699a2dd95SBruce Richardson 	if (ret < 0) {
1457ae67895bSDavid Marchand 		EAL_LOG(ERR, "Cannot initialize local memory map");
145899a2dd95SBruce Richardson 		return -1;
145999a2dd95SBruce Richardson 	}
146099a2dd95SBruce Richardson 	local_msl->base_va = primary_msl->base_va;
146199a2dd95SBruce Richardson 	local_msl->len = primary_msl->len;
146299a2dd95SBruce Richardson 
146399a2dd95SBruce Richardson 	return 0;
146499a2dd95SBruce Richardson }
146599a2dd95SBruce Richardson 
146699a2dd95SBruce Richardson static int
secondary_msl_destroy_walk(const struct rte_memseg_list * msl,void * arg __rte_unused)146799a2dd95SBruce Richardson secondary_msl_destroy_walk(const struct rte_memseg_list *msl,
146899a2dd95SBruce Richardson 		void *arg __rte_unused)
146999a2dd95SBruce Richardson {
147099a2dd95SBruce Richardson 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
147199a2dd95SBruce Richardson 	struct rte_memseg_list *local_msl;
147299a2dd95SBruce Richardson 	int msl_idx, ret;
147399a2dd95SBruce Richardson 
147499a2dd95SBruce Richardson 	if (msl->external)
147599a2dd95SBruce Richardson 		return 0;
147699a2dd95SBruce Richardson 
147799a2dd95SBruce Richardson 	msl_idx = msl - mcfg->memsegs;
147899a2dd95SBruce Richardson 	local_msl = &local_memsegs[msl_idx];
147999a2dd95SBruce Richardson 
148099a2dd95SBruce Richardson 	ret = rte_fbarray_destroy(&local_msl->memseg_arr);
148199a2dd95SBruce Richardson 	if (ret < 0) {
1482ae67895bSDavid Marchand 		EAL_LOG(ERR, "Cannot destroy local memory map");
148399a2dd95SBruce Richardson 		return -1;
148499a2dd95SBruce Richardson 	}
148599a2dd95SBruce Richardson 	local_msl->base_va = NULL;
148699a2dd95SBruce Richardson 	local_msl->len = 0;
148799a2dd95SBruce Richardson 
148899a2dd95SBruce Richardson 	return 0;
148999a2dd95SBruce Richardson }
149099a2dd95SBruce Richardson 
149199a2dd95SBruce Richardson static int
alloc_list(int list_idx,int len)149299a2dd95SBruce Richardson alloc_list(int list_idx, int len)
149399a2dd95SBruce Richardson {
149499a2dd95SBruce Richardson 	int *data;
149599a2dd95SBruce Richardson 	int i;
149699a2dd95SBruce Richardson 	const struct internal_config *internal_conf =
149799a2dd95SBruce Richardson 		eal_get_internal_configuration();
149899a2dd95SBruce Richardson 
149999a2dd95SBruce Richardson 	/* single-file segments mode does not need fd list */
150099a2dd95SBruce Richardson 	if (!internal_conf->single_file_segments) {
150199a2dd95SBruce Richardson 		/* ensure we have space to store fd per each possible segment */
150299a2dd95SBruce Richardson 		data = malloc(sizeof(int) * len);
150399a2dd95SBruce Richardson 		if (data == NULL) {
1504ae67895bSDavid Marchand 			EAL_LOG(ERR, "Unable to allocate space for file descriptors");
150599a2dd95SBruce Richardson 			return -1;
150699a2dd95SBruce Richardson 		}
150799a2dd95SBruce Richardson 		/* set all fd's as invalid */
150899a2dd95SBruce Richardson 		for (i = 0; i < len; i++)
150999a2dd95SBruce Richardson 			data[i] = -1;
151099a2dd95SBruce Richardson 		fd_list[list_idx].fds = data;
151199a2dd95SBruce Richardson 		fd_list[list_idx].len = len;
151299a2dd95SBruce Richardson 	} else {
151399a2dd95SBruce Richardson 		fd_list[list_idx].fds = NULL;
151499a2dd95SBruce Richardson 		fd_list[list_idx].len = 0;
151599a2dd95SBruce Richardson 	}
151699a2dd95SBruce Richardson 
151799a2dd95SBruce Richardson 	fd_list[list_idx].count = 0;
151899a2dd95SBruce Richardson 	fd_list[list_idx].memseg_list_fd = -1;
151999a2dd95SBruce Richardson 
152099a2dd95SBruce Richardson 	return 0;
152199a2dd95SBruce Richardson }
152299a2dd95SBruce Richardson 
152399a2dd95SBruce Richardson static int
destroy_list(int list_idx)152499a2dd95SBruce Richardson destroy_list(int list_idx)
152599a2dd95SBruce Richardson {
152699a2dd95SBruce Richardson 	const struct internal_config *internal_conf =
152799a2dd95SBruce Richardson 			eal_get_internal_configuration();
152899a2dd95SBruce Richardson 
152999a2dd95SBruce Richardson 	/* single-file segments mode does not need fd list */
153099a2dd95SBruce Richardson 	if (!internal_conf->single_file_segments) {
153199a2dd95SBruce Richardson 		int *fds = fd_list[list_idx].fds;
153299a2dd95SBruce Richardson 		int i;
153399a2dd95SBruce Richardson 		/* go through each fd and ensure it's closed */
153499a2dd95SBruce Richardson 		for (i = 0; i < fd_list[list_idx].len; i++) {
153599a2dd95SBruce Richardson 			if (fds[i] >= 0) {
153699a2dd95SBruce Richardson 				close(fds[i]);
153799a2dd95SBruce Richardson 				fds[i] = -1;
153899a2dd95SBruce Richardson 			}
153999a2dd95SBruce Richardson 		}
154099a2dd95SBruce Richardson 		free(fds);
154199a2dd95SBruce Richardson 		fd_list[list_idx].fds = NULL;
154299a2dd95SBruce Richardson 		fd_list[list_idx].len = 0;
154399a2dd95SBruce Richardson 	} else if (fd_list[list_idx].memseg_list_fd >= 0) {
154499a2dd95SBruce Richardson 		close(fd_list[list_idx].memseg_list_fd);
154599a2dd95SBruce Richardson 		fd_list[list_idx].count = 0;
154699a2dd95SBruce Richardson 		fd_list[list_idx].memseg_list_fd = -1;
154799a2dd95SBruce Richardson 	}
154899a2dd95SBruce Richardson 	return 0;
154999a2dd95SBruce Richardson }
155099a2dd95SBruce Richardson 
155199a2dd95SBruce Richardson static int
fd_list_create_walk(const struct rte_memseg_list * msl,void * arg __rte_unused)155299a2dd95SBruce Richardson fd_list_create_walk(const struct rte_memseg_list *msl,
155399a2dd95SBruce Richardson 		void *arg __rte_unused)
155499a2dd95SBruce Richardson {
155599a2dd95SBruce Richardson 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
155699a2dd95SBruce Richardson 	unsigned int len;
155799a2dd95SBruce Richardson 	int msl_idx;
155899a2dd95SBruce Richardson 
155999a2dd95SBruce Richardson 	if (msl->external)
156099a2dd95SBruce Richardson 		return 0;
156199a2dd95SBruce Richardson 
156299a2dd95SBruce Richardson 	msl_idx = msl - mcfg->memsegs;
156399a2dd95SBruce Richardson 	len = msl->memseg_arr.len;
156499a2dd95SBruce Richardson 
156599a2dd95SBruce Richardson 	return alloc_list(msl_idx, len);
156699a2dd95SBruce Richardson }
156799a2dd95SBruce Richardson 
156899a2dd95SBruce Richardson static int
fd_list_destroy_walk(const struct rte_memseg_list * msl,void * arg __rte_unused)156999a2dd95SBruce Richardson fd_list_destroy_walk(const struct rte_memseg_list *msl, void *arg __rte_unused)
157099a2dd95SBruce Richardson {
157199a2dd95SBruce Richardson 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
157299a2dd95SBruce Richardson 	int msl_idx;
157399a2dd95SBruce Richardson 
157499a2dd95SBruce Richardson 	if (msl->external)
157599a2dd95SBruce Richardson 		return 0;
157699a2dd95SBruce Richardson 
157799a2dd95SBruce Richardson 	msl_idx = msl - mcfg->memsegs;
157899a2dd95SBruce Richardson 
157999a2dd95SBruce Richardson 	return destroy_list(msl_idx);
158099a2dd95SBruce Richardson }
158199a2dd95SBruce Richardson 
158299a2dd95SBruce Richardson int
eal_memalloc_set_seg_fd(int list_idx,int seg_idx,int fd)158399a2dd95SBruce Richardson eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd)
158499a2dd95SBruce Richardson {
158599a2dd95SBruce Richardson 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
158699a2dd95SBruce Richardson 	const struct internal_config *internal_conf =
158799a2dd95SBruce Richardson 		eal_get_internal_configuration();
158899a2dd95SBruce Richardson 
158999a2dd95SBruce Richardson 	/* single file segments mode doesn't support individual segment fd's */
159099a2dd95SBruce Richardson 	if (internal_conf->single_file_segments)
159199a2dd95SBruce Richardson 		return -ENOTSUP;
159299a2dd95SBruce Richardson 
159399a2dd95SBruce Richardson 	/* if list is not allocated, allocate it */
159499a2dd95SBruce Richardson 	if (fd_list[list_idx].len == 0) {
159599a2dd95SBruce Richardson 		int len = mcfg->memsegs[list_idx].memseg_arr.len;
159699a2dd95SBruce Richardson 
159799a2dd95SBruce Richardson 		if (alloc_list(list_idx, len) < 0)
159899a2dd95SBruce Richardson 			return -ENOMEM;
159999a2dd95SBruce Richardson 	}
160099a2dd95SBruce Richardson 	fd_list[list_idx].fds[seg_idx] = fd;
160199a2dd95SBruce Richardson 
160299a2dd95SBruce Richardson 	return 0;
160399a2dd95SBruce Richardson }
160499a2dd95SBruce Richardson 
160599a2dd95SBruce Richardson int
eal_memalloc_set_seg_list_fd(int list_idx,int fd)160699a2dd95SBruce Richardson eal_memalloc_set_seg_list_fd(int list_idx, int fd)
160799a2dd95SBruce Richardson {
160899a2dd95SBruce Richardson 	const struct internal_config *internal_conf =
160999a2dd95SBruce Richardson 		eal_get_internal_configuration();
161099a2dd95SBruce Richardson 
161199a2dd95SBruce Richardson 	/* non-single file segment mode doesn't support segment list fd's */
161299a2dd95SBruce Richardson 	if (!internal_conf->single_file_segments)
161399a2dd95SBruce Richardson 		return -ENOTSUP;
161499a2dd95SBruce Richardson 
161599a2dd95SBruce Richardson 	fd_list[list_idx].memseg_list_fd = fd;
161699a2dd95SBruce Richardson 
161799a2dd95SBruce Richardson 	return 0;
161899a2dd95SBruce Richardson }
161999a2dd95SBruce Richardson 
/*
 * Look up the fd backing a segment (or the per-list fd in single-file
 * segments mode).
 *
 * Returns the fd (>= 0) on success, or a negative errno-style value:
 *  -ENOTSUP if fd's cannot exist in the current mode,
 *  -ENODEV if no fd has been recorded for this segment.
 *
 * Note: when MEMFD_SUPPORTED is not defined, the first 'if' body returns
 * unconditionally in in-memory/no-huge modes.
 */
int
eal_memalloc_get_seg_fd(int list_idx, int seg_idx)
{
	int fd;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (internal_conf->in_memory || internal_conf->no_hugetlbfs) {
#ifndef MEMFD_SUPPORTED
		/* in in-memory or no-huge mode, we rely on memfd support */
		return -ENOTSUP;
#endif
		/* memfd supported, but hugetlbfs memfd may not be */
		if (!internal_conf->no_hugetlbfs && !memfd_create_supported)
			return -ENOTSUP;
	}

	if (internal_conf->single_file_segments) {
		fd = fd_list[list_idx].memseg_list_fd;
	} else if (fd_list[list_idx].len == 0) {
		/* list not initialized */
		fd = -1;
	} else {
		fd = fd_list[list_idx].fds[seg_idx];
	}
	if (fd < 0)
		return -ENODEV;
	return fd;
}
164999a2dd95SBruce Richardson 
/*
 * Probe whether memfd_create() works with hugetlbfs page-size flags.
 *
 * Returns 1 if supported, 0 if not supported (EINVAL, also recorded in
 * memfd_create_supported, or no MEMFD_SUPPORTED at build time), -1 on an
 * unexpected error.
 *
 * Note: both branches inside the loop return, so only the first configured
 * hugepage size is actually probed.
 */
static int
test_memfd_create(void)
{
#ifdef MEMFD_SUPPORTED
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();
	unsigned int i;
	for (i = 0; i < internal_conf->num_hugepage_sizes; i++) {
		uint64_t pagesz = internal_conf->hugepage_info[i].hugepage_sz;
		int pagesz_flag = pagesz_flags(pagesz);
		int flags;

		flags = pagesz_flag | RTE_MFD_HUGETLB;
		int fd = memfd_create("test", flags);
		if (fd < 0) {
			/* we failed - let memalloc know this isn't working */
			if (errno == EINVAL) {
				memfd_create_supported = 0;
				return 0; /* not supported */
			}

			/* we got other error - something's wrong */
			return -1; /* error */
		}
		close(fd);
		return 1; /* supported */
	}
#endif
	return 0; /* not supported */
}
168099a2dd95SBruce Richardson 
/*
 * Compute the offset into the backing file at which a segment's data lives,
 * storing it in *offset.
 *
 * In single-file segments mode the offset is seg_idx pages into the shared
 * per-list file; otherwise each segment has its own file and the offset is 0.
 *
 * Returns 0 on success, or a negative errno-style value:
 *  -ENOTSUP if fd's cannot exist in the current mode,
 *  -ENODEV if the fd list was never initialized,
 *  -ENOENT if the segment has no fd recorded (not active).
 *
 * Note: when MEMFD_SUPPORTED is not defined, the first 'if' body returns
 * unconditionally in in-memory/no-huge modes.
 */
int
eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (internal_conf->in_memory || internal_conf->no_hugetlbfs) {
#ifndef MEMFD_SUPPORTED
		/* in in-memory or no-huge mode, we rely on memfd support */
		return -ENOTSUP;
#endif
		/* memfd supported, but hugetlbfs memfd may not be */
		if (!internal_conf->no_hugetlbfs && !memfd_create_supported)
			return -ENOTSUP;
	}

	if (internal_conf->single_file_segments) {
		size_t pgsz = mcfg->memsegs[list_idx].page_sz;

		/* segment not active? */
		if (fd_list[list_idx].memseg_list_fd < 0)
			return -ENOENT;
		*offset = pgsz * seg_idx;
	} else {
		/* fd_list not initialized? */
		if (fd_list[list_idx].len == 0)
			return -ENODEV;

		/* segment not active? */
		if (fd_list[list_idx].fds[seg_idx] < 0)
			return -ENOENT;
		*offset = 0;
	}
	return 0;
}
171799a2dd95SBruce Richardson 
171899a2dd95SBruce Richardson int
eal_memalloc_cleanup(void)171999a2dd95SBruce Richardson eal_memalloc_cleanup(void)
172099a2dd95SBruce Richardson {
172199a2dd95SBruce Richardson 	/* close all remaining fd's - these are per-process, so it's safe */
172299a2dd95SBruce Richardson 	if (rte_memseg_list_walk_thread_unsafe(fd_list_destroy_walk, NULL))
172399a2dd95SBruce Richardson 		return -1;
172499a2dd95SBruce Richardson 
172599a2dd95SBruce Richardson 	/* destroy the shadow page table if we're a secondary process */
172699a2dd95SBruce Richardson 	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
172799a2dd95SBruce Richardson 		return 0;
172899a2dd95SBruce Richardson 
172999a2dd95SBruce Richardson 	if (rte_memseg_list_walk_thread_unsafe(secondary_msl_destroy_walk,
173099a2dd95SBruce Richardson 			NULL))
173199a2dd95SBruce Richardson 		return -1;
173299a2dd95SBruce Richardson 
173399a2dd95SBruce Richardson 	return 0;
173499a2dd95SBruce Richardson }
173599a2dd95SBruce Richardson 
173699a2dd95SBruce Richardson int
eal_memalloc_init(void)173799a2dd95SBruce Richardson eal_memalloc_init(void)
173899a2dd95SBruce Richardson {
173999a2dd95SBruce Richardson 	const struct internal_config *internal_conf =
174099a2dd95SBruce Richardson 		eal_get_internal_configuration();
174199a2dd95SBruce Richardson 
174299a2dd95SBruce Richardson 	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
1743f82c02d3SArtemy Kovalyov 		/*  memory_hotplug_lock is held during initialization, so it's
1744f82c02d3SArtemy Kovalyov 		 *  safe to call thread-unsafe version.
1745f82c02d3SArtemy Kovalyov 		 */
1746f82c02d3SArtemy Kovalyov 		if (rte_memseg_list_walk_thread_unsafe(secondary_msl_create_walk, NULL) < 0)
174799a2dd95SBruce Richardson 			return -1;
174899a2dd95SBruce Richardson 	if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
174999a2dd95SBruce Richardson 			internal_conf->in_memory) {
175099a2dd95SBruce Richardson 		int mfd_res = test_memfd_create();
175199a2dd95SBruce Richardson 
175299a2dd95SBruce Richardson 		if (mfd_res < 0) {
1753ae67895bSDavid Marchand 			EAL_LOG(ERR, "Unable to check if memfd is supported");
175499a2dd95SBruce Richardson 			return -1;
175599a2dd95SBruce Richardson 		}
175699a2dd95SBruce Richardson 		if (mfd_res == 1)
1757ae67895bSDavid Marchand 			EAL_LOG(DEBUG, "Using memfd for anonymous memory");
175899a2dd95SBruce Richardson 		else
1759ae67895bSDavid Marchand 			EAL_LOG(INFO, "Using memfd is not supported, falling back to anonymous hugepages");
176099a2dd95SBruce Richardson 
176199a2dd95SBruce Richardson 		/* we only support single-file segments mode with in-memory mode
176299a2dd95SBruce Richardson 		 * if we support hugetlbfs with memfd_create. this code will
176399a2dd95SBruce Richardson 		 * test if we do.
176499a2dd95SBruce Richardson 		 */
176599a2dd95SBruce Richardson 		if (internal_conf->single_file_segments &&
176699a2dd95SBruce Richardson 				mfd_res != 1) {
1767ae67895bSDavid Marchand 			EAL_LOG(ERR, "Single-file segments mode cannot be used without memfd support");
176899a2dd95SBruce Richardson 			return -1;
176999a2dd95SBruce Richardson 		}
177099a2dd95SBruce Richardson 		/* this cannot ever happen but better safe than sorry */
177199a2dd95SBruce Richardson 		if (!anonymous_hugepages_supported) {
1772ae67895bSDavid Marchand 			EAL_LOG(ERR, "Using anonymous memory is not supported");
177399a2dd95SBruce Richardson 			return -1;
177499a2dd95SBruce Richardson 		}
177532b4771cSDmitry Kozlyuk 		/* safety net, should be impossible to configure */
177632b4771cSDmitry Kozlyuk 		if (internal_conf->hugepage_file.unlink_before_mapping &&
177732b4771cSDmitry Kozlyuk 				!internal_conf->hugepage_file.unlink_existing) {
1778ae67895bSDavid Marchand 			EAL_LOG(ERR, "Unlinking existing hugepage files is prohibited, cannot unlink them before mapping.");
177932b4771cSDmitry Kozlyuk 			return -1;
178032b4771cSDmitry Kozlyuk 		}
178199a2dd95SBruce Richardson 	}
178299a2dd95SBruce Richardson 
178399a2dd95SBruce Richardson 	/* initialize all of the fd lists */
1784f82c02d3SArtemy Kovalyov 	if (rte_memseg_list_walk_thread_unsafe(fd_list_create_walk, NULL))
178599a2dd95SBruce Richardson 		return -1;
178699a2dd95SBruce Richardson 	return 0;
178799a2dd95SBruce Richardson }
1788