xref: /dpdk/lib/eal/linux/eal_memalloc.c (revision 32b4771cd8ef377c039d895d034b3b5e86ee34d2)
199a2dd95SBruce Richardson /* SPDX-License-Identifier: BSD-3-Clause
299a2dd95SBruce Richardson  * Copyright(c) 2017-2018 Intel Corporation
399a2dd95SBruce Richardson  */
499a2dd95SBruce Richardson 
599a2dd95SBruce Richardson #include <errno.h>
699a2dd95SBruce Richardson #include <stdarg.h>
799a2dd95SBruce Richardson #include <stdbool.h>
899a2dd95SBruce Richardson #include <stdlib.h>
999a2dd95SBruce Richardson #include <stdio.h>
1099a2dd95SBruce Richardson #include <stdint.h>
1199a2dd95SBruce Richardson #include <inttypes.h>
1299a2dd95SBruce Richardson #include <string.h>
1399a2dd95SBruce Richardson #include <sys/mman.h>
1499a2dd95SBruce Richardson #include <sys/types.h>
1599a2dd95SBruce Richardson #include <sys/stat.h>
1699a2dd95SBruce Richardson #include <sys/queue.h>
1799a2dd95SBruce Richardson #include <sys/file.h>
1899a2dd95SBruce Richardson #include <unistd.h>
1999a2dd95SBruce Richardson #include <limits.h>
2099a2dd95SBruce Richardson #include <fcntl.h>
2199a2dd95SBruce Richardson #include <sys/ioctl.h>
2299a2dd95SBruce Richardson #include <sys/time.h>
2399a2dd95SBruce Richardson #include <signal.h>
2499a2dd95SBruce Richardson #include <setjmp.h>
2599a2dd95SBruce Richardson #ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */
2699a2dd95SBruce Richardson #include <linux/memfd.h>
2799a2dd95SBruce Richardson #define MEMFD_SUPPORTED
2899a2dd95SBruce Richardson #endif
2999a2dd95SBruce Richardson #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
3099a2dd95SBruce Richardson #include <numa.h>
3199a2dd95SBruce Richardson #include <numaif.h>
3299a2dd95SBruce Richardson #endif
3399a2dd95SBruce Richardson #include <linux/falloc.h>
3499a2dd95SBruce Richardson #include <linux/mman.h> /* for hugetlb-related mmap flags */
3599a2dd95SBruce Richardson 
3699a2dd95SBruce Richardson #include <rte_common.h>
3799a2dd95SBruce Richardson #include <rte_log.h>
3899a2dd95SBruce Richardson #include <rte_eal.h>
3999a2dd95SBruce Richardson #include <rte_errno.h>
4099a2dd95SBruce Richardson #include <rte_memory.h>
4199a2dd95SBruce Richardson #include <rte_spinlock.h>
4299a2dd95SBruce Richardson 
4399a2dd95SBruce Richardson #include "eal_filesystem.h"
4499a2dd95SBruce Richardson #include "eal_internal_cfg.h"
4599a2dd95SBruce Richardson #include "eal_memalloc.h"
4699a2dd95SBruce Richardson #include "eal_memcfg.h"
4799a2dd95SBruce Richardson #include "eal_private.h"
4899a2dd95SBruce Richardson 
4999a2dd95SBruce Richardson const int anonymous_hugepages_supported =
5099a2dd95SBruce Richardson #ifdef MAP_HUGE_SHIFT
5199a2dd95SBruce Richardson 		1;
5299a2dd95SBruce Richardson #define RTE_MAP_HUGE_SHIFT MAP_HUGE_SHIFT
5399a2dd95SBruce Richardson #else
5499a2dd95SBruce Richardson 		0;
5599a2dd95SBruce Richardson #define RTE_MAP_HUGE_SHIFT 26
5699a2dd95SBruce Richardson #endif
5799a2dd95SBruce Richardson 
5899a2dd95SBruce Richardson /*
5999a2dd95SBruce Richardson  * we've already checked memfd support at compile-time, but we also need to
6099a2dd95SBruce Richardson  * check if we can create hugepage files with memfd.
6199a2dd95SBruce Richardson  *
6299a2dd95SBruce Richardson  * also, this is not a constant, because while we may be *compiled* with memfd
6399a2dd95SBruce Richardson  * hugetlbfs support, we might not be *running* on a system that supports memfd
6499a2dd95SBruce Richardson  * and/or memfd with hugetlbfs, so we need to be able to adjust this flag at
6599a2dd95SBruce Richardson  * runtime, and fall back to anonymous memory.
6699a2dd95SBruce Richardson  */
6799a2dd95SBruce Richardson static int memfd_create_supported =
6899a2dd95SBruce Richardson #ifdef MFD_HUGETLB
6999a2dd95SBruce Richardson 		1;
7099a2dd95SBruce Richardson #define RTE_MFD_HUGETLB MFD_HUGETLB
7199a2dd95SBruce Richardson #else
7299a2dd95SBruce Richardson 		0;
7399a2dd95SBruce Richardson #define RTE_MFD_HUGETLB 4U
7499a2dd95SBruce Richardson #endif
7599a2dd95SBruce Richardson 
7699a2dd95SBruce Richardson /*
7799a2dd95SBruce Richardson  * not all kernel version support fallocate on hugetlbfs, so fall back to
7899a2dd95SBruce Richardson  * ftruncate and disallow deallocation if fallocate is not supported.
7999a2dd95SBruce Richardson  */
8099a2dd95SBruce Richardson static int fallocate_supported = -1; /* unknown */
8199a2dd95SBruce Richardson 
8299a2dd95SBruce Richardson /*
8399a2dd95SBruce Richardson  * we have two modes - single file segments, and file-per-page mode.
8499a2dd95SBruce Richardson  *
8599a2dd95SBruce Richardson  * for single-file segments, we use memseg_list_fd to store the segment fd,
8699a2dd95SBruce Richardson  * while the fds[] will not be allocated, and len will be set to 0.
8799a2dd95SBruce Richardson  *
8899a2dd95SBruce Richardson  * for file-per-page mode, each page will have its own fd, so 'memseg_list_fd'
8999a2dd95SBruce Richardson  * will be invalid (set to -1), and we'll use 'fds' to keep track of page fd's.
9099a2dd95SBruce Richardson  *
9199a2dd95SBruce Richardson  * we cannot know how many pages a system will have in advance, but we do know
9299a2dd95SBruce Richardson  * that they come in lists, and we know lengths of these lists. so, simply store
9399a2dd95SBruce Richardson  * a malloc'd array of fd's indexed by list and segment index.
9499a2dd95SBruce Richardson  *
9599a2dd95SBruce Richardson  * they will be initialized at startup, and filled as we allocate/deallocate
9699a2dd95SBruce Richardson  * segments.
9799a2dd95SBruce Richardson  */
9899a2dd95SBruce Richardson static struct {
9999a2dd95SBruce Richardson 	int *fds; /**< dynamically allocated array of segment lock fd's */
10099a2dd95SBruce Richardson 	int memseg_list_fd; /**< memseg list fd */
10199a2dd95SBruce Richardson 	int len; /**< total length of the array */
10299a2dd95SBruce Richardson 	int count; /**< entries used in an array */
10399a2dd95SBruce Richardson } fd_list[RTE_MAX_MEMSEG_LISTS];
10499a2dd95SBruce Richardson 
10599a2dd95SBruce Richardson /** local copy of a memory map, used to synchronize memory hotplug in MP */
10699a2dd95SBruce Richardson static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS];
10799a2dd95SBruce Richardson 
10899a2dd95SBruce Richardson static sigjmp_buf huge_jmpenv;
10999a2dd95SBruce Richardson 
1109bffc928SOlivier Matz static void huge_sigbus_handler(int signo __rte_unused)
11199a2dd95SBruce Richardson {
11299a2dd95SBruce Richardson 	siglongjmp(huge_jmpenv, 1);
11399a2dd95SBruce Richardson }
11499a2dd95SBruce Richardson 
/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
 * non-static local variable in the stack frame calling sigsetjmp might be
 * clobbered by a call to longjmp.
 */
static int huge_wrap_sigsetjmp(void)
{
	/* returns 0 when called directly, non-zero when control comes back
	 * here via siglongjmp() from huge_sigbus_handler()
	 */
	return sigsetjmp(huge_jmpenv, 1);
}
12399a2dd95SBruce Richardson 
12499a2dd95SBruce Richardson static struct sigaction huge_action_old;
12599a2dd95SBruce Richardson static int huge_need_recover;
12699a2dd95SBruce Richardson 
1279bffc928SOlivier Matz static void
12899a2dd95SBruce Richardson huge_register_sigbus(void)
12999a2dd95SBruce Richardson {
13099a2dd95SBruce Richardson 	sigset_t mask;
13199a2dd95SBruce Richardson 	struct sigaction action;
13299a2dd95SBruce Richardson 
13399a2dd95SBruce Richardson 	sigemptyset(&mask);
13499a2dd95SBruce Richardson 	sigaddset(&mask, SIGBUS);
13599a2dd95SBruce Richardson 	action.sa_flags = 0;
13699a2dd95SBruce Richardson 	action.sa_mask = mask;
13799a2dd95SBruce Richardson 	action.sa_handler = huge_sigbus_handler;
13899a2dd95SBruce Richardson 
13999a2dd95SBruce Richardson 	huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
14099a2dd95SBruce Richardson }
14199a2dd95SBruce Richardson 
1429bffc928SOlivier Matz static void
14399a2dd95SBruce Richardson huge_recover_sigbus(void)
14499a2dd95SBruce Richardson {
14599a2dd95SBruce Richardson 	if (huge_need_recover) {
14699a2dd95SBruce Richardson 		sigaction(SIGBUS, &huge_action_old, NULL);
14799a2dd95SBruce Richardson 		huge_need_recover = 0;
14899a2dd95SBruce Richardson 	}
14999a2dd95SBruce Richardson }
15099a2dd95SBruce Richardson 
15199a2dd95SBruce Richardson #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
15299a2dd95SBruce Richardson static bool
15399a2dd95SBruce Richardson check_numa(void)
15499a2dd95SBruce Richardson {
15599a2dd95SBruce Richardson 	bool ret = true;
15699a2dd95SBruce Richardson 	/* Check if kernel supports NUMA. */
15799a2dd95SBruce Richardson 	if (numa_available() != 0) {
15899a2dd95SBruce Richardson 		RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n");
15999a2dd95SBruce Richardson 		ret = false;
16099a2dd95SBruce Richardson 	}
16199a2dd95SBruce Richardson 	return ret;
16299a2dd95SBruce Richardson }
16399a2dd95SBruce Richardson 
16499a2dd95SBruce Richardson static void
16599a2dd95SBruce Richardson prepare_numa(int *oldpolicy, struct bitmask *oldmask, int socket_id)
16699a2dd95SBruce Richardson {
16799a2dd95SBruce Richardson 	RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
16899a2dd95SBruce Richardson 	if (get_mempolicy(oldpolicy, oldmask->maskp,
16999a2dd95SBruce Richardson 			  oldmask->size + 1, 0, 0) < 0) {
17099a2dd95SBruce Richardson 		RTE_LOG(ERR, EAL,
17199a2dd95SBruce Richardson 			"Failed to get current mempolicy: %s. "
17299a2dd95SBruce Richardson 			"Assuming MPOL_DEFAULT.\n", strerror(errno));
17399a2dd95SBruce Richardson 		*oldpolicy = MPOL_DEFAULT;
17499a2dd95SBruce Richardson 	}
17599a2dd95SBruce Richardson 	RTE_LOG(DEBUG, EAL,
17699a2dd95SBruce Richardson 		"Setting policy MPOL_PREFERRED for socket %d\n",
17799a2dd95SBruce Richardson 		socket_id);
17899a2dd95SBruce Richardson 	numa_set_preferred(socket_id);
17999a2dd95SBruce Richardson }
18099a2dd95SBruce Richardson 
18199a2dd95SBruce Richardson static void
18299a2dd95SBruce Richardson restore_numa(int *oldpolicy, struct bitmask *oldmask)
18399a2dd95SBruce Richardson {
18499a2dd95SBruce Richardson 	RTE_LOG(DEBUG, EAL,
18599a2dd95SBruce Richardson 		"Restoring previous memory policy: %d\n", *oldpolicy);
18699a2dd95SBruce Richardson 	if (*oldpolicy == MPOL_DEFAULT) {
18799a2dd95SBruce Richardson 		numa_set_localalloc();
18899a2dd95SBruce Richardson 	} else if (set_mempolicy(*oldpolicy, oldmask->maskp,
18999a2dd95SBruce Richardson 				 oldmask->size + 1) < 0) {
19099a2dd95SBruce Richardson 		RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n",
19199a2dd95SBruce Richardson 			strerror(errno));
19299a2dd95SBruce Richardson 		numa_set_localalloc();
19399a2dd95SBruce Richardson 	}
19499a2dd95SBruce Richardson 	numa_free_cpumask(oldmask);
19599a2dd95SBruce Richardson }
19699a2dd95SBruce Richardson #endif
19799a2dd95SBruce Richardson 
/*
 * Report the on-disk size of an open file via fstat().
 * Returns 0 when fstat() fails (size is then treated as unknown).
 */
static off_t
get_file_size(int fd)
{
	struct stat st;

	return fstat(fd, &st) == 0 ? st.st_size : 0;
}
20999a2dd95SBruce Richardson 
21099a2dd95SBruce Richardson static int
21199a2dd95SBruce Richardson pagesz_flags(uint64_t page_sz)
21299a2dd95SBruce Richardson {
21399a2dd95SBruce Richardson 	/* as per mmap() manpage, all page sizes are log2 of page size
21499a2dd95SBruce Richardson 	 * shifted by MAP_HUGE_SHIFT
21599a2dd95SBruce Richardson 	 */
21699a2dd95SBruce Richardson 	int log2 = rte_log2_u64(page_sz);
21799a2dd95SBruce Richardson 	return log2 << RTE_MAP_HUGE_SHIFT;
21899a2dd95SBruce Richardson }
21999a2dd95SBruce Richardson 
22099a2dd95SBruce Richardson /* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */
22199a2dd95SBruce Richardson static int lock(int fd, int type)
22299a2dd95SBruce Richardson {
22399a2dd95SBruce Richardson 	int ret;
22499a2dd95SBruce Richardson 
22599a2dd95SBruce Richardson 	/* flock may be interrupted */
22699a2dd95SBruce Richardson 	do {
22799a2dd95SBruce Richardson 		ret = flock(fd, type | LOCK_NB);
22899a2dd95SBruce Richardson 	} while (ret && errno == EINTR);
22999a2dd95SBruce Richardson 
23099a2dd95SBruce Richardson 	if (ret && errno == EWOULDBLOCK) {
23199a2dd95SBruce Richardson 		/* couldn't lock */
23299a2dd95SBruce Richardson 		return 0;
23399a2dd95SBruce Richardson 	} else if (ret) {
23499a2dd95SBruce Richardson 		RTE_LOG(ERR, EAL, "%s(): error calling flock(): %s\n",
23599a2dd95SBruce Richardson 			__func__, strerror(errno));
23699a2dd95SBruce Richardson 		return -1;
23799a2dd95SBruce Richardson 	}
23899a2dd95SBruce Richardson 	/* lock was successful */
23999a2dd95SBruce Richardson 	return 1;
24099a2dd95SBruce Richardson }
24199a2dd95SBruce Richardson 
24299a2dd95SBruce Richardson static int
24399a2dd95SBruce Richardson get_seg_memfd(struct hugepage_info *hi __rte_unused,
24499a2dd95SBruce Richardson 		unsigned int list_idx __rte_unused,
24599a2dd95SBruce Richardson 		unsigned int seg_idx __rte_unused)
24699a2dd95SBruce Richardson {
24799a2dd95SBruce Richardson #ifdef MEMFD_SUPPORTED
24899a2dd95SBruce Richardson 	int fd;
24999a2dd95SBruce Richardson 	char segname[250]; /* as per manpage, limit is 249 bytes plus null */
25099a2dd95SBruce Richardson 
25199a2dd95SBruce Richardson 	int flags = RTE_MFD_HUGETLB | pagesz_flags(hi->hugepage_sz);
25299a2dd95SBruce Richardson 	const struct internal_config *internal_conf =
25399a2dd95SBruce Richardson 		eal_get_internal_configuration();
25499a2dd95SBruce Richardson 
25599a2dd95SBruce Richardson 	if (internal_conf->single_file_segments) {
25699a2dd95SBruce Richardson 		fd = fd_list[list_idx].memseg_list_fd;
25799a2dd95SBruce Richardson 
25899a2dd95SBruce Richardson 		if (fd < 0) {
25999a2dd95SBruce Richardson 			snprintf(segname, sizeof(segname), "seg_%i", list_idx);
26099a2dd95SBruce Richardson 			fd = memfd_create(segname, flags);
26199a2dd95SBruce Richardson 			if (fd < 0) {
26299a2dd95SBruce Richardson 				RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n",
26399a2dd95SBruce Richardson 					__func__, strerror(errno));
26499a2dd95SBruce Richardson 				return -1;
26599a2dd95SBruce Richardson 			}
26699a2dd95SBruce Richardson 			fd_list[list_idx].memseg_list_fd = fd;
26799a2dd95SBruce Richardson 		}
26899a2dd95SBruce Richardson 	} else {
26999a2dd95SBruce Richardson 		fd = fd_list[list_idx].fds[seg_idx];
27099a2dd95SBruce Richardson 
27199a2dd95SBruce Richardson 		if (fd < 0) {
27299a2dd95SBruce Richardson 			snprintf(segname, sizeof(segname), "seg_%i-%i",
27399a2dd95SBruce Richardson 					list_idx, seg_idx);
27499a2dd95SBruce Richardson 			fd = memfd_create(segname, flags);
27599a2dd95SBruce Richardson 			if (fd < 0) {
27699a2dd95SBruce Richardson 				RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n",
27799a2dd95SBruce Richardson 					__func__, strerror(errno));
27899a2dd95SBruce Richardson 				return -1;
27999a2dd95SBruce Richardson 			}
28099a2dd95SBruce Richardson 			fd_list[list_idx].fds[seg_idx] = fd;
28199a2dd95SBruce Richardson 		}
28299a2dd95SBruce Richardson 	}
28399a2dd95SBruce Richardson 	return fd;
28499a2dd95SBruce Richardson #endif
28599a2dd95SBruce Richardson 	return -1;
28699a2dd95SBruce Richardson }
28799a2dd95SBruce Richardson 
/*
 * Get (or lazily create and cache) the fd backing a segment, writing the
 * hugetlbfs path of the backing file into 'path' (of size 'buflen').
 *
 * In single-file-segments mode, one fd backs the whole memseg list;
 * otherwise each page has its own file. In in-memory mode this delegates
 * to get_seg_memfd() and 'path' is left unused.
 *
 * If 'dirty' is not NULL, '*dirty' is set to true when an already-existing
 * file is reused (its pages may hold old data), false otherwise.
 *
 * Returns the fd (also cached in fd_list) on success, -1 on failure.
 */
static int
get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
		unsigned int list_idx, unsigned int seg_idx,
		bool *dirty)
{
	int fd;
	int *out_fd;
	struct stat st;
	int ret;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (dirty != NULL)
		*dirty = false;

	/* for in-memory mode, we only make it here when we're sure we support
	 * memfd, and this is a special case.
	 */
	if (internal_conf->in_memory)
		return get_seg_memfd(hi, list_idx, seg_idx);

	/* pick the fd cache slot and build the file path for this segment */
	if (internal_conf->single_file_segments) {
		out_fd = &fd_list[list_idx].memseg_list_fd;
		eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx);
	} else {
		out_fd = &fd_list[list_idx].fds[seg_idx];
		eal_get_hugefile_path(path, buflen, hi->hugedir,
				list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
	}
	/* already open? return the cached fd without touching the fs */
	fd = *out_fd;
	if (fd >= 0)
		return fd;

	/*
	 * There is no TOCTOU between stat() and unlink()/open()
	 * because the hugepage directory is locked.
	 */
	ret = stat(path, &st);
	if (ret < 0 && errno != ENOENT) {
		RTE_LOG(DEBUG, EAL, "%s(): stat() for '%s' failed: %s\n",
			__func__, path, strerror(errno));
		return -1;
	}
	/* file exists and will be kept: mapped pages may contain old data */
	if (!internal_conf->hugepage_file.unlink_existing && ret == 0 &&
			dirty != NULL)
		*dirty = true;

	/*
	 * The kernel clears a hugepage only when it is mapped
	 * from a particular file for the first time.
	 * If the file already exists, the old content will be mapped.
	 * If the memory manager assumes all mapped pages to be clean,
	 * the file must be removed and created anew.
	 * Otherwise, the primary caller must be notified
	 * that mapped pages will be dirty
	 * (secondary callers receive the segment state from the primary one).
	 * When multiple hugepages are mapped from the same file,
	 * whether they will be dirty depends on the part that is mapped.
	 */
	if (!internal_conf->single_file_segments &&
			internal_conf->hugepage_file.unlink_existing &&
			rte_eal_process_type() == RTE_PROC_PRIMARY &&
			ret == 0) {
		/* coverity[toctou] */
		if (unlink(path) < 0) {
			RTE_LOG(DEBUG, EAL, "%s(): could not remove '%s': %s\n",
				__func__, path, strerror(errno));
			return -1;
		}
	}

	/* coverity[toctou] */
	fd = open(path, O_CREAT | O_RDWR, 0600);
	if (fd < 0) {
		RTE_LOG(ERR, EAL, "%s(): open '%s' failed: %s\n",
			__func__, path, strerror(errno));
		return -1;
	}
	/* take out a read lock */
	if (lock(fd, LOCK_SH) < 0) {
		RTE_LOG(ERR, EAL, "%s(): lock '%s' failed: %s\n",
			__func__, path, strerror(errno));
		close(fd);
		return -1;
	}
	/* cache the fd so subsequent calls return it without reopening */
	*out_fd = fd;
	return fd;
}
37699a2dd95SBruce Richardson 
37799a2dd95SBruce Richardson static int
37899a2dd95SBruce Richardson resize_hugefile_in_memory(int fd, uint64_t fa_offset,
37999a2dd95SBruce Richardson 		uint64_t page_sz, bool grow)
38099a2dd95SBruce Richardson {
38199a2dd95SBruce Richardson 	int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
38299a2dd95SBruce Richardson 			FALLOC_FL_KEEP_SIZE;
38399a2dd95SBruce Richardson 	int ret;
38499a2dd95SBruce Richardson 
38599a2dd95SBruce Richardson 	/* grow or shrink the file */
38699a2dd95SBruce Richardson 	ret = fallocate(fd, flags, fa_offset, page_sz);
38799a2dd95SBruce Richardson 
38899a2dd95SBruce Richardson 	if (ret < 0) {
38999a2dd95SBruce Richardson 		RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
39099a2dd95SBruce Richardson 				__func__,
39199a2dd95SBruce Richardson 				strerror(errno));
39299a2dd95SBruce Richardson 		return -1;
39399a2dd95SBruce Richardson 	}
39499a2dd95SBruce Richardson 	return 0;
39599a2dd95SBruce Richardson }
39699a2dd95SBruce Richardson 
/*
 * Grow (allocate) or shrink (free) one page-sized region at 'fa_offset'
 * within a filesystem-backed hugepage file.
 *
 * Preferred mechanism is fallocate(); the first ENOTSUP permanently
 * disables it (fallocate_supported = 0), after which growing falls back
 * to ftruncate() and shrinking becomes impossible.
 *
 * If 'dirty' is not NULL, '*dirty' is updated to reflect whether the
 * affected pages may contain old data (see comments below).
 * Returns 0 on success, -1 on failure.
 */
static int
resize_hugefile_in_filesystem(int fd, uint64_t fa_offset, uint64_t page_sz,
		bool grow, bool *dirty)
{
	const struct internal_config *internal_conf =
			eal_get_internal_configuration();
	/* set when fallocate support was just discovered to be missing,
	 * to retry the same request through the ftruncate fallback path
	 */
	bool again = false;

	do {
		if (fallocate_supported == 0) {
			/* we cannot deallocate memory if fallocate() is not
			 * supported, and hugepage file is already locked at
			 * creation, so no further synchronization needed.
			 */

			if (!grow) {
				RTE_LOG(DEBUG, EAL, "%s(): fallocate not supported, not freeing page back to the system\n",
					__func__);
				return -1;
			}
			uint64_t new_size = fa_offset + page_sz;
			uint64_t cur_size = get_file_size(fd);

			/* fallocate isn't supported, fall back to ftruncate */
			/* a page lying within the current file size was
			 * necessarily allocated before, hence it is dirty
			 */
			if (dirty != NULL)
				*dirty = new_size <= cur_size;
			if (new_size > cur_size &&
					ftruncate(fd, new_size) < 0) {
				RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
					__func__, strerror(errno));
				return -1;
			}
		} else {
			int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
					FALLOC_FL_KEEP_SIZE;
			int ret;

			/*
			 * technically, it is perfectly safe for both primary
			 * and secondary to grow and shrink the page files:
			 * growing the file repeatedly has no effect because
			 * a page can only be allocated once, while mmap ensures
			 * that secondaries hold on to the page even after the
			 * page itself is removed from the filesystem.
			 *
			 * however, leaving growing/shrinking to the primary
			 * tends to expose bugs in fdlist page count handling,
			 * so leave this here just in case.
			 */
			if (rte_eal_process_type() != RTE_PROC_PRIMARY)
				return 0;

			/* grow or shrink the file */
			ret = fallocate(fd, flags, fa_offset, page_sz);

			if (ret < 0) {
				if (fallocate_supported == -1 &&
						errno == ENOTSUP) {
					RTE_LOG(ERR, EAL, "%s(): fallocate() not supported, hugepage deallocation will be disabled\n",
						__func__);
					again = true;
					fallocate_supported = 0;
				} else {
					RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
						__func__,
						strerror(errno));
					return -1;
				}
			} else {
				fallocate_supported = 1;
				/*
				 * It is unknown which portions of an existing
				 * hugepage file were allocated previously,
				 * so all pages within the file are considered
				 * dirty, unless the file is a fresh one.
				 */
				if (dirty != NULL)
					*dirty &= !internal_conf->hugepage_file.unlink_existing;
			}
		}
	} while (again);

	return 0;
}
48199a2dd95SBruce Richardson 
48299a2dd95SBruce Richardson static void
48399a2dd95SBruce Richardson close_hugefile(int fd, char *path, int list_idx)
48499a2dd95SBruce Richardson {
48599a2dd95SBruce Richardson 	const struct internal_config *internal_conf =
48699a2dd95SBruce Richardson 		eal_get_internal_configuration();
48799a2dd95SBruce Richardson 	/*
48899a2dd95SBruce Richardson 	 * primary process must unlink the file, but only when not in in-memory
48999a2dd95SBruce Richardson 	 * mode (as in that case there is no file to unlink).
49099a2dd95SBruce Richardson 	 */
49199a2dd95SBruce Richardson 	if (!internal_conf->in_memory &&
49299a2dd95SBruce Richardson 			rte_eal_process_type() == RTE_PROC_PRIMARY &&
49399a2dd95SBruce Richardson 			unlink(path))
49499a2dd95SBruce Richardson 		RTE_LOG(ERR, EAL, "%s(): unlinking '%s' failed: %s\n",
49599a2dd95SBruce Richardson 			__func__, path, strerror(errno));
49699a2dd95SBruce Richardson 
49799a2dd95SBruce Richardson 	close(fd);
49899a2dd95SBruce Richardson 	fd_list[list_idx].memseg_list_fd = -1;
49999a2dd95SBruce Richardson }
50099a2dd95SBruce Richardson 
50199a2dd95SBruce Richardson static int
502*32b4771cSDmitry Kozlyuk resize_hugefile(int fd, uint64_t fa_offset, uint64_t page_sz, bool grow,
503*32b4771cSDmitry Kozlyuk 		bool *dirty)
50499a2dd95SBruce Richardson {
50599a2dd95SBruce Richardson 	/* in-memory mode is a special case, because we can be sure that
50699a2dd95SBruce Richardson 	 * fallocate() is supported.
50799a2dd95SBruce Richardson 	 */
50899a2dd95SBruce Richardson 	const struct internal_config *internal_conf =
50999a2dd95SBruce Richardson 		eal_get_internal_configuration();
51099a2dd95SBruce Richardson 
511*32b4771cSDmitry Kozlyuk 	if (internal_conf->in_memory) {
512*32b4771cSDmitry Kozlyuk 		if (dirty != NULL)
513*32b4771cSDmitry Kozlyuk 			*dirty = false;
51499a2dd95SBruce Richardson 		return resize_hugefile_in_memory(fd, fa_offset,
51599a2dd95SBruce Richardson 				page_sz, grow);
516*32b4771cSDmitry Kozlyuk 	}
51799a2dd95SBruce Richardson 
51899a2dd95SBruce Richardson 	return resize_hugefile_in_filesystem(fd, fa_offset, page_sz,
519*32b4771cSDmitry Kozlyuk 			grow, dirty);
52099a2dd95SBruce Richardson }
52199a2dd95SBruce Richardson 
52299a2dd95SBruce Richardson static int
52399a2dd95SBruce Richardson alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
52499a2dd95SBruce Richardson 		struct hugepage_info *hi, unsigned int list_idx,
52599a2dd95SBruce Richardson 		unsigned int seg_idx)
52699a2dd95SBruce Richardson {
52799a2dd95SBruce Richardson #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
52899a2dd95SBruce Richardson 	int cur_socket_id = 0;
52999a2dd95SBruce Richardson #endif
53099a2dd95SBruce Richardson 	uint64_t map_offset;
53199a2dd95SBruce Richardson 	rte_iova_t iova;
53299a2dd95SBruce Richardson 	void *va;
53399a2dd95SBruce Richardson 	char path[PATH_MAX];
53499a2dd95SBruce Richardson 	int ret = 0;
53599a2dd95SBruce Richardson 	int fd;
536*32b4771cSDmitry Kozlyuk 	bool dirty;
53799a2dd95SBruce Richardson 	size_t alloc_sz;
53899a2dd95SBruce Richardson 	int flags;
53999a2dd95SBruce Richardson 	void *new_addr;
54099a2dd95SBruce Richardson 	const struct internal_config *internal_conf =
54199a2dd95SBruce Richardson 		eal_get_internal_configuration();
54299a2dd95SBruce Richardson 
54399a2dd95SBruce Richardson 	alloc_sz = hi->hugepage_sz;
54499a2dd95SBruce Richardson 
54599a2dd95SBruce Richardson 	/* these are checked at init, but code analyzers don't know that */
54699a2dd95SBruce Richardson 	if (internal_conf->in_memory && !anonymous_hugepages_supported) {
54799a2dd95SBruce Richardson 		RTE_LOG(ERR, EAL, "Anonymous hugepages not supported, in-memory mode cannot allocate memory\n");
54899a2dd95SBruce Richardson 		return -1;
54999a2dd95SBruce Richardson 	}
55099a2dd95SBruce Richardson 	if (internal_conf->in_memory && !memfd_create_supported &&
55199a2dd95SBruce Richardson 			internal_conf->single_file_segments) {
55299a2dd95SBruce Richardson 		RTE_LOG(ERR, EAL, "Single-file segments are not supported without memfd support\n");
55399a2dd95SBruce Richardson 		return -1;
55499a2dd95SBruce Richardson 	}
55599a2dd95SBruce Richardson 
55699a2dd95SBruce Richardson 	/* in-memory without memfd is a special case */
55799a2dd95SBruce Richardson 	int mmap_flags;
55899a2dd95SBruce Richardson 
55999a2dd95SBruce Richardson 	if (internal_conf->in_memory && !memfd_create_supported) {
56099a2dd95SBruce Richardson 		const int in_memory_flags = MAP_HUGETLB | MAP_FIXED |
56199a2dd95SBruce Richardson 				MAP_PRIVATE | MAP_ANONYMOUS;
56299a2dd95SBruce Richardson 		int pagesz_flag;
56399a2dd95SBruce Richardson 
56499a2dd95SBruce Richardson 		pagesz_flag = pagesz_flags(alloc_sz);
56599a2dd95SBruce Richardson 		fd = -1;
566*32b4771cSDmitry Kozlyuk 		dirty = false;
56799a2dd95SBruce Richardson 		mmap_flags = in_memory_flags | pagesz_flag;
56899a2dd95SBruce Richardson 
56999a2dd95SBruce Richardson 		/* single-file segments codepath will never be active
57099a2dd95SBruce Richardson 		 * here because in-memory mode is incompatible with the
57199a2dd95SBruce Richardson 		 * fallback path, and it's stopped at EAL initialization
57299a2dd95SBruce Richardson 		 * stage.
57399a2dd95SBruce Richardson 		 */
57499a2dd95SBruce Richardson 		map_offset = 0;
57599a2dd95SBruce Richardson 	} else {
57699a2dd95SBruce Richardson 		/* takes out a read lock on segment or segment list */
577*32b4771cSDmitry Kozlyuk 		fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx,
578*32b4771cSDmitry Kozlyuk 				&dirty);
57999a2dd95SBruce Richardson 		if (fd < 0) {
58099a2dd95SBruce Richardson 			RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n");
58199a2dd95SBruce Richardson 			return -1;
58299a2dd95SBruce Richardson 		}
58399a2dd95SBruce Richardson 
58499a2dd95SBruce Richardson 		if (internal_conf->single_file_segments) {
58599a2dd95SBruce Richardson 			map_offset = seg_idx * alloc_sz;
586*32b4771cSDmitry Kozlyuk 			ret = resize_hugefile(fd, map_offset, alloc_sz, true,
587*32b4771cSDmitry Kozlyuk 					&dirty);
58899a2dd95SBruce Richardson 			if (ret < 0)
58999a2dd95SBruce Richardson 				goto resized;
59099a2dd95SBruce Richardson 
59199a2dd95SBruce Richardson 			fd_list[list_idx].count++;
59299a2dd95SBruce Richardson 		} else {
59399a2dd95SBruce Richardson 			map_offset = 0;
59499a2dd95SBruce Richardson 			if (ftruncate(fd, alloc_sz) < 0) {
59599a2dd95SBruce Richardson 				RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
59699a2dd95SBruce Richardson 					__func__, strerror(errno));
59799a2dd95SBruce Richardson 				goto resized;
59899a2dd95SBruce Richardson 			}
59952d7d91eSDmitry Kozlyuk 			if (internal_conf->hugepage_file.unlink_before_mapping &&
60099a2dd95SBruce Richardson 					!internal_conf->in_memory) {
60199a2dd95SBruce Richardson 				if (unlink(path)) {
60299a2dd95SBruce Richardson 					RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n",
60399a2dd95SBruce Richardson 						__func__, strerror(errno));
60499a2dd95SBruce Richardson 					goto resized;
60599a2dd95SBruce Richardson 				}
60699a2dd95SBruce Richardson 			}
60799a2dd95SBruce Richardson 		}
60899a2dd95SBruce Richardson 		mmap_flags = MAP_SHARED | MAP_POPULATE | MAP_FIXED;
60999a2dd95SBruce Richardson 	}
61099a2dd95SBruce Richardson 
6119bffc928SOlivier Matz 	huge_register_sigbus();
6129bffc928SOlivier Matz 
61399a2dd95SBruce Richardson 	/*
61499a2dd95SBruce Richardson 	 * map the segment, and populate page tables, the kernel fills
61599a2dd95SBruce Richardson 	 * this segment with zeros if it's a new page.
61699a2dd95SBruce Richardson 	 */
61799a2dd95SBruce Richardson 	va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, mmap_flags, fd,
61899a2dd95SBruce Richardson 			map_offset);
61999a2dd95SBruce Richardson 
62099a2dd95SBruce Richardson 	if (va == MAP_FAILED) {
62199a2dd95SBruce Richardson 		RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
62299a2dd95SBruce Richardson 			strerror(errno));
62399a2dd95SBruce Richardson 		/* mmap failed, but the previous region might have been
62499a2dd95SBruce Richardson 		 * unmapped anyway. try to remap it
62599a2dd95SBruce Richardson 		 */
62699a2dd95SBruce Richardson 		goto unmapped;
62799a2dd95SBruce Richardson 	}
62899a2dd95SBruce Richardson 	if (va != addr) {
62999a2dd95SBruce Richardson 		RTE_LOG(DEBUG, EAL, "%s(): wrong mmap() address\n", __func__);
63099a2dd95SBruce Richardson 		munmap(va, alloc_sz);
63199a2dd95SBruce Richardson 		goto resized;
63299a2dd95SBruce Richardson 	}
63399a2dd95SBruce Richardson 
63499a2dd95SBruce Richardson 	/* In linux, hugetlb limitations, like cgroup, are
63599a2dd95SBruce Richardson 	 * enforced at fault time instead of mmap(), even
63699a2dd95SBruce Richardson 	 * with the option of MAP_POPULATE. Kernel will send
63799a2dd95SBruce Richardson 	 * a SIGBUS signal. To avoid to be killed, save stack
63899a2dd95SBruce Richardson 	 * environment here, if SIGBUS happens, we can jump
63999a2dd95SBruce Richardson 	 * back here.
64099a2dd95SBruce Richardson 	 */
64199a2dd95SBruce Richardson 	if (huge_wrap_sigsetjmp()) {
64299a2dd95SBruce Richardson 		RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more hugepages of size %uMB\n",
64399a2dd95SBruce Richardson 			(unsigned int)(alloc_sz >> 20));
64499a2dd95SBruce Richardson 		goto mapped;
64599a2dd95SBruce Richardson 	}
64699a2dd95SBruce Richardson 
64799a2dd95SBruce Richardson 	/* we need to trigger a write to the page to enforce page fault and
64899a2dd95SBruce Richardson 	 * ensure that page is accessible to us, but we can't overwrite value
64999a2dd95SBruce Richardson 	 * that is already there, so read the old value, and write itback.
65099a2dd95SBruce Richardson 	 * kernel populates the page with zeroes initially.
65199a2dd95SBruce Richardson 	 */
65299a2dd95SBruce Richardson 	*(volatile int *)addr = *(volatile int *)addr;
65399a2dd95SBruce Richardson 
65499a2dd95SBruce Richardson 	iova = rte_mem_virt2iova(addr);
65599a2dd95SBruce Richardson 	if (iova == RTE_BAD_PHYS_ADDR) {
65699a2dd95SBruce Richardson 		RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n",
65799a2dd95SBruce Richardson 			__func__);
65899a2dd95SBruce Richardson 		goto mapped;
65999a2dd95SBruce Richardson 	}
66099a2dd95SBruce Richardson 
66199a2dd95SBruce Richardson #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
66299a2dd95SBruce Richardson 	/*
66399a2dd95SBruce Richardson 	 * If the kernel has been built without NUMA support, get_mempolicy()
66499a2dd95SBruce Richardson 	 * will return an error. If check_numa() returns false, memory
66599a2dd95SBruce Richardson 	 * allocation is not NUMA aware and the socket_id should not be
66699a2dd95SBruce Richardson 	 * checked.
66799a2dd95SBruce Richardson 	 */
66899a2dd95SBruce Richardson 	if (check_numa()) {
66999a2dd95SBruce Richardson 		ret = get_mempolicy(&cur_socket_id, NULL, 0, addr,
67099a2dd95SBruce Richardson 					MPOL_F_NODE | MPOL_F_ADDR);
67199a2dd95SBruce Richardson 		if (ret < 0) {
67299a2dd95SBruce Richardson 			RTE_LOG(DEBUG, EAL, "%s(): get_mempolicy: %s\n",
67399a2dd95SBruce Richardson 				__func__, strerror(errno));
67499a2dd95SBruce Richardson 			goto mapped;
67599a2dd95SBruce Richardson 		} else if (cur_socket_id != socket_id) {
67699a2dd95SBruce Richardson 			RTE_LOG(DEBUG, EAL,
67799a2dd95SBruce Richardson 					"%s(): allocation happened on wrong socket (wanted %d, got %d)\n",
67899a2dd95SBruce Richardson 				__func__, socket_id, cur_socket_id);
67999a2dd95SBruce Richardson 			goto mapped;
68099a2dd95SBruce Richardson 		}
68199a2dd95SBruce Richardson 	}
68299a2dd95SBruce Richardson #else
68399a2dd95SBruce Richardson 	if (rte_socket_count() > 1)
68499a2dd95SBruce Richardson 		RTE_LOG(DEBUG, EAL, "%s(): not checking hugepage NUMA node.\n",
68599a2dd95SBruce Richardson 				__func__);
68699a2dd95SBruce Richardson #endif
68799a2dd95SBruce Richardson 
6889bffc928SOlivier Matz 	huge_recover_sigbus();
6899bffc928SOlivier Matz 
69099a2dd95SBruce Richardson 	ms->addr = addr;
69199a2dd95SBruce Richardson 	ms->hugepage_sz = alloc_sz;
69299a2dd95SBruce Richardson 	ms->len = alloc_sz;
69399a2dd95SBruce Richardson 	ms->nchannel = rte_memory_get_nchannel();
69499a2dd95SBruce Richardson 	ms->nrank = rte_memory_get_nrank();
69599a2dd95SBruce Richardson 	ms->iova = iova;
69699a2dd95SBruce Richardson 	ms->socket_id = socket_id;
697*32b4771cSDmitry Kozlyuk 	ms->flags = dirty ? RTE_MEMSEG_FLAG_DIRTY : 0;
69899a2dd95SBruce Richardson 
69999a2dd95SBruce Richardson 	return 0;
70099a2dd95SBruce Richardson 
70199a2dd95SBruce Richardson mapped:
70299a2dd95SBruce Richardson 	munmap(addr, alloc_sz);
70399a2dd95SBruce Richardson unmapped:
7049bffc928SOlivier Matz 	huge_recover_sigbus();
70599a2dd95SBruce Richardson 	flags = EAL_RESERVE_FORCE_ADDRESS;
70699a2dd95SBruce Richardson 	new_addr = eal_get_virtual_area(addr, &alloc_sz, alloc_sz, 0, flags);
70799a2dd95SBruce Richardson 	if (new_addr != addr) {
70899a2dd95SBruce Richardson 		if (new_addr != NULL)
70999a2dd95SBruce Richardson 			munmap(new_addr, alloc_sz);
71099a2dd95SBruce Richardson 		/* we're leaving a hole in our virtual address space. if
71199a2dd95SBruce Richardson 		 * somebody else maps this hole now, we could accidentally
71299a2dd95SBruce Richardson 		 * override it in the future.
71399a2dd95SBruce Richardson 		 */
71499a2dd95SBruce Richardson 		RTE_LOG(CRIT, EAL, "Can't mmap holes in our virtual address space\n");
71599a2dd95SBruce Richardson 	}
71699a2dd95SBruce Richardson 	/* roll back the ref count */
71799a2dd95SBruce Richardson 	if (internal_conf->single_file_segments)
71899a2dd95SBruce Richardson 		fd_list[list_idx].count--;
71999a2dd95SBruce Richardson resized:
72099a2dd95SBruce Richardson 	/* some codepaths will return negative fd, so exit early */
72199a2dd95SBruce Richardson 	if (fd < 0)
72299a2dd95SBruce Richardson 		return -1;
72399a2dd95SBruce Richardson 
72499a2dd95SBruce Richardson 	if (internal_conf->single_file_segments) {
725*32b4771cSDmitry Kozlyuk 		resize_hugefile(fd, map_offset, alloc_sz, false, NULL);
72699a2dd95SBruce Richardson 		/* ignore failure, can't make it any worse */
72799a2dd95SBruce Richardson 
72899a2dd95SBruce Richardson 		/* if refcount is at zero, close the file */
72999a2dd95SBruce Richardson 		if (fd_list[list_idx].count == 0)
73099a2dd95SBruce Richardson 			close_hugefile(fd, path, list_idx);
73199a2dd95SBruce Richardson 	} else {
73299a2dd95SBruce Richardson 		/* only remove file if we can take out a write lock */
73352d7d91eSDmitry Kozlyuk 		if (!internal_conf->hugepage_file.unlink_before_mapping &&
73499a2dd95SBruce Richardson 				internal_conf->in_memory == 0 &&
73599a2dd95SBruce Richardson 				lock(fd, LOCK_EX) == 1)
73699a2dd95SBruce Richardson 			unlink(path);
73799a2dd95SBruce Richardson 		close(fd);
73899a2dd95SBruce Richardson 		fd_list[list_idx].fds[seg_idx] = -1;
73999a2dd95SBruce Richardson 	}
74099a2dd95SBruce Richardson 	return -1;
74199a2dd95SBruce Richardson }
74299a2dd95SBruce Richardson 
/*
 * Free a single memory segment: wipe its contents, replace the mapping with
 * an inaccessible placeholder (keeping the VA range reserved), and release
 * or shrink the backing hugepage file depending on configuration.
 *
 * @param ms       Segment to free; zeroed out on success.
 * @param hi       Hugepage info describing this segment's page size/dir.
 * @param list_idx Index of the memseg list the segment belongs to.
 * @param seg_idx  Index of the segment within its list.
 * @return 0 on success, -1 on failure.
 */
static int
free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
		unsigned int list_idx, unsigned int seg_idx)
{
	uint64_t map_offset;
	char path[PATH_MAX];
	int fd, ret = 0;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* erase page data */
	memset(ms->addr, 0, ms->len);

	/* remap the range as anonymous PROT_NONE so the hugepage is released
	 * but the virtual address range stays reserved for future allocations
	 */
	if (mmap(ms->addr, ms->len, PROT_NONE,
			MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) ==
				MAP_FAILED) {
		RTE_LOG(DEBUG, EAL, "couldn't unmap page\n");
		return -1;
	}

	/* exclude the (now empty) range from core dumps */
	eal_mem_set_dump(ms->addr, ms->len, false);

	/* if we're using anonymous hugepages, nothing to be done */
	if (internal_conf->in_memory && !memfd_create_supported) {
		memset(ms, 0, sizeof(*ms));
		return 0;
	}

	/* if we are not in single file segments mode, we're going to unmap the
	 * segment and thus drop the lock on original fd, but hugepage dir is
	 * now locked so we can take out another one without races.
	 */
	fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx, NULL);
	if (fd < 0)
		return -1;

	if (internal_conf->single_file_segments) {
		/* punch this segment's region out of the shared file */
		map_offset = seg_idx * ms->len;
		if (resize_hugefile(fd, map_offset, ms->len, false, NULL))
			return -1;

		/* last user of the per-list file closes it */
		if (--(fd_list[list_idx].count) == 0)
			close_hugefile(fd, path, list_idx);

		ret = 0;
	} else {
		/* if we're able to take out a write lock, we're the last one
		 * holding onto this page.
		 */
		if (!internal_conf->in_memory &&
				internal_conf->hugepage_file.unlink_existing &&
				!internal_conf->hugepage_file.unlink_before_mapping) {
			ret = lock(fd, LOCK_EX);
			if (ret >= 0) {
				/* no one else is using this page */
				if (ret == 1)
					unlink(path);
			}
		}
		/* closing fd will drop the lock */
		close(fd);
		fd_list[list_idx].fds[seg_idx] = -1;
	}

	memset(ms, 0, sizeof(*ms));

	/* ret < 0 here means the exclusive lock attempt failed */
	return ret < 0 ? -1 : 0;
}
81199a2dd95SBruce Richardson 
/* state shared between eal_memalloc_alloc_seg_bulk() and alloc_seg_walk() */
struct alloc_walk_param {
	struct hugepage_info *hi;	/* hugepage info for the requested page size */
	struct rte_memseg **ms;		/* out: allocated segments (may be NULL) */
	size_t page_sz;			/* requested page size */
	unsigned int segs_allocated;	/* out: how many segments were allocated */
	unsigned int n_segs;		/* number of segments requested */
	int socket;			/* NUMA socket to allocate on */
	bool exact;			/* require exactly n_segs vs. best effort */
};
/*
 * Memseg list walk callback: attempt to allocate wa->n_segs pages from a
 * single memseg list matching the requested page size and socket.
 *
 * On failure partway through with wa->exact set, all segments allocated so
 * far are rolled back. Returns 1 if at least one segment was allocated
 * (stops the walk), 0 to continue to the next list, -1 on error.
 */
static int
alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct alloc_walk_param *wa = arg;
	struct rte_memseg_list *cur_msl;
	size_t page_sz;
	int cur_idx, start_idx, j, dir_fd = -1;
	unsigned int msl_idx, need, i;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* skip lists that don't match the requested page size/socket */
	if (msl->page_sz != wa->page_sz)
		return 0;
	if (msl->socket_id != wa->socket)
		return 0;

	page_sz = (size_t)msl->page_sz;

	/* get a writable view of this (const) list via its index */
	msl_idx = msl - mcfg->memsegs;
	cur_msl = &mcfg->memsegs[msl_idx];

	need = wa->n_segs;

	/* try finding space in memseg list */
	if (wa->exact) {
		/* if we require exact number of pages in a list, find them */
		cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0,
				need);
		if (cur_idx < 0)
			return 0;
		start_idx = cur_idx;
	} else {
		int cur_len;

		/* we don't require exact number of pages, so we're going to go
		 * for best-effort allocation. that means finding the biggest
		 * unused block, and going with that.
		 */
		cur_idx = rte_fbarray_find_biggest_free(&cur_msl->memseg_arr,
				0);
		if (cur_idx < 0)
			return 0;
		start_idx = cur_idx;
		/* adjust the size to possibly be smaller than original
		 * request, but do not allow it to be bigger.
		 */
		cur_len = rte_fbarray_find_contig_free(&cur_msl->memseg_arr,
				cur_idx);
		need = RTE_MIN(need, (unsigned int)cur_len);
	}

	/* do not allow any page allocations during the time we're allocating,
	 * because file creation and locking operations are not atomic,
	 * and we might be the first or the last ones to use a particular page,
	 * so we need to ensure atomicity of every operation.
	 *
	 * during init, we already hold a write lock, so don't try to take out
	 * another one.
	 */
	if (wa->hi->lock_descriptor == -1 && !internal_conf->in_memory) {
		dir_fd = open(wa->hi->hugedir, O_RDONLY);
		if (dir_fd < 0) {
			RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
				__func__, wa->hi->hugedir, strerror(errno));
			return -1;
		}
		/* blocking writelock */
		if (flock(dir_fd, LOCK_EX)) {
			RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n",
				__func__, wa->hi->hugedir, strerror(errno));
			close(dir_fd);
			return -1;
		}
	}

	/* allocate the pages one by one at consecutive indices */
	for (i = 0; i < need; i++, cur_idx++) {
		struct rte_memseg *cur;
		void *map_addr;

		cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx);
		map_addr = RTE_PTR_ADD(cur_msl->base_va,
				cur_idx * page_sz);

		if (alloc_seg(cur, map_addr, wa->socket, wa->hi,
				msl_idx, cur_idx)) {
			RTE_LOG(DEBUG, EAL, "attempted to allocate %i segments, but only %i were allocated\n",
				need, i);

			/* if exact number wasn't requested, stop */
			if (!wa->exact)
				goto out;

			/* clean up: roll back everything allocated so far */
			for (j = start_idx; j < cur_idx; j++) {
				struct rte_memseg *tmp;
				struct rte_fbarray *arr =
						&cur_msl->memseg_arr;

				tmp = rte_fbarray_get(arr, j);
				rte_fbarray_set_free(arr, j);

				/* free_seg may attempt to create a file, which
				 * may fail.
				 */
				if (free_seg(tmp, wa->hi, msl_idx, j))
					RTE_LOG(DEBUG, EAL, "Cannot free page\n");
			}
			/* clear the list */
			if (wa->ms)
				memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs);

			if (dir_fd >= 0)
				close(dir_fd);
			return -1;
		}
		if (wa->ms)
			wa->ms[i] = cur;

		rte_fbarray_set_used(&cur_msl->memseg_arr, cur_idx);
	}
out:
	wa->segs_allocated = i;
	/* version bump marks the list as changed (used by resync code) */
	if (i > 0)
		cur_msl->version++;
	if (dir_fd >= 0)
		close(dir_fd);
	/* if we didn't allocate any segments, move on to the next list */
	return i > 0;
}
95199a2dd95SBruce Richardson 
/* state shared between eal_memalloc_free_seg_bulk() and free_seg_walk() */
struct free_walk_param {
	struct hugepage_info *hi;	/* hugepage info for the segment's page size */
	struct rte_memseg *ms;		/* segment to free */
};
/*
 * Memseg list walk callback: free wa->ms if it falls within this list's
 * virtual address range.
 *
 * Returns 0 if the segment is not in this list (continue walking),
 * 1 on successful free (stops the walk), -1 on failure.
 */
static int
free_seg_walk(const struct rte_memseg_list *msl, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *found_msl;
	struct free_walk_param *wa = arg;
	uintptr_t start_addr, end_addr;
	int msl_idx, seg_idx, ret, dir_fd = -1;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* does the segment's address fall within this list's VA range? */
	start_addr = (uintptr_t) msl->base_va;
	end_addr = start_addr + msl->len;

	if ((uintptr_t)wa->ms->addr < start_addr ||
			(uintptr_t)wa->ms->addr >= end_addr)
		return 0;

	/* derive list and segment indices from the address */
	msl_idx = msl - mcfg->memsegs;
	seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz;

	/* msl is const */
	found_msl = &mcfg->memsegs[msl_idx];

	/* do not allow any page allocations during the time we're freeing,
	 * because file creation and locking operations are not atomic,
	 * and we might be the first or the last ones to use a particular page,
	 * so we need to ensure atomicity of every operation.
	 *
	 * during init, we already hold a write lock, so don't try to take out
	 * another one.
	 */
	if (wa->hi->lock_descriptor == -1 && !internal_conf->in_memory) {
		dir_fd = open(wa->hi->hugedir, O_RDONLY);
		if (dir_fd < 0) {
			RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
				__func__, wa->hi->hugedir, strerror(errno));
			return -1;
		}
		/* blocking writelock */
		if (flock(dir_fd, LOCK_EX)) {
			RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n",
				__func__, wa->hi->hugedir, strerror(errno));
			close(dir_fd);
			return -1;
		}
	}

	/* version bump marks the list as changed (used by resync code) */
	found_msl->version++;

	rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx);

	ret = free_seg(wa->ms, wa->hi, msl_idx, seg_idx);

	/* closing the dir fd drops the directory lock */
	if (dir_fd >= 0)
		close(dir_fd);

	if (ret < 0)
		return -1;

	return 1;
}
101899a2dd95SBruce Richardson 
101999a2dd95SBruce Richardson int
102099a2dd95SBruce Richardson eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz,
102199a2dd95SBruce Richardson 		int socket, bool exact)
102299a2dd95SBruce Richardson {
102399a2dd95SBruce Richardson 	int i, ret = -1;
102499a2dd95SBruce Richardson #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
102599a2dd95SBruce Richardson 	bool have_numa = false;
102699a2dd95SBruce Richardson 	int oldpolicy;
102799a2dd95SBruce Richardson 	struct bitmask *oldmask;
102899a2dd95SBruce Richardson #endif
102999a2dd95SBruce Richardson 	struct alloc_walk_param wa;
103099a2dd95SBruce Richardson 	struct hugepage_info *hi = NULL;
103199a2dd95SBruce Richardson 	struct internal_config *internal_conf =
103299a2dd95SBruce Richardson 		eal_get_internal_configuration();
103399a2dd95SBruce Richardson 
103499a2dd95SBruce Richardson 	memset(&wa, 0, sizeof(wa));
103599a2dd95SBruce Richardson 
103699a2dd95SBruce Richardson 	/* dynamic allocation not supported in legacy mode */
103799a2dd95SBruce Richardson 	if (internal_conf->legacy_mem)
103899a2dd95SBruce Richardson 		return -1;
103999a2dd95SBruce Richardson 
104099a2dd95SBruce Richardson 	for (i = 0; i < (int) RTE_DIM(internal_conf->hugepage_info); i++) {
104199a2dd95SBruce Richardson 		if (page_sz ==
104299a2dd95SBruce Richardson 				internal_conf->hugepage_info[i].hugepage_sz) {
104399a2dd95SBruce Richardson 			hi = &internal_conf->hugepage_info[i];
104499a2dd95SBruce Richardson 			break;
104599a2dd95SBruce Richardson 		}
104699a2dd95SBruce Richardson 	}
104799a2dd95SBruce Richardson 	if (!hi) {
104899a2dd95SBruce Richardson 		RTE_LOG(ERR, EAL, "%s(): can't find relevant hugepage_info entry\n",
104999a2dd95SBruce Richardson 			__func__);
105099a2dd95SBruce Richardson 		return -1;
105199a2dd95SBruce Richardson 	}
105299a2dd95SBruce Richardson 
105399a2dd95SBruce Richardson #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
105499a2dd95SBruce Richardson 	if (check_numa()) {
105599a2dd95SBruce Richardson 		oldmask = numa_allocate_nodemask();
105699a2dd95SBruce Richardson 		prepare_numa(&oldpolicy, oldmask, socket);
105799a2dd95SBruce Richardson 		have_numa = true;
105899a2dd95SBruce Richardson 	}
105999a2dd95SBruce Richardson #endif
106099a2dd95SBruce Richardson 
106199a2dd95SBruce Richardson 	wa.exact = exact;
106299a2dd95SBruce Richardson 	wa.hi = hi;
106399a2dd95SBruce Richardson 	wa.ms = ms;
106499a2dd95SBruce Richardson 	wa.n_segs = n_segs;
106599a2dd95SBruce Richardson 	wa.page_sz = page_sz;
106699a2dd95SBruce Richardson 	wa.socket = socket;
106799a2dd95SBruce Richardson 	wa.segs_allocated = 0;
106899a2dd95SBruce Richardson 
106999a2dd95SBruce Richardson 	/* memalloc is locked, so it's safe to use thread-unsafe version */
107099a2dd95SBruce Richardson 	ret = rte_memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa);
107199a2dd95SBruce Richardson 	if (ret == 0) {
107299a2dd95SBruce Richardson 		RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n",
107399a2dd95SBruce Richardson 			__func__);
107499a2dd95SBruce Richardson 		ret = -1;
107599a2dd95SBruce Richardson 	} else if (ret > 0) {
107699a2dd95SBruce Richardson 		ret = (int)wa.segs_allocated;
107799a2dd95SBruce Richardson 	}
107899a2dd95SBruce Richardson 
107999a2dd95SBruce Richardson #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
108099a2dd95SBruce Richardson 	if (have_numa)
108199a2dd95SBruce Richardson 		restore_numa(&oldpolicy, oldmask);
108299a2dd95SBruce Richardson #endif
108399a2dd95SBruce Richardson 	return ret;
108499a2dd95SBruce Richardson }
108599a2dd95SBruce Richardson 
108699a2dd95SBruce Richardson struct rte_memseg *
108799a2dd95SBruce Richardson eal_memalloc_alloc_seg(size_t page_sz, int socket)
108899a2dd95SBruce Richardson {
108999a2dd95SBruce Richardson 	struct rte_memseg *ms;
109099a2dd95SBruce Richardson 	if (eal_memalloc_alloc_seg_bulk(&ms, 1, page_sz, socket, true) < 0)
109199a2dd95SBruce Richardson 		return NULL;
109299a2dd95SBruce Richardson 	/* return pointer to newly allocated memseg */
109399a2dd95SBruce Richardson 	return ms;
109499a2dd95SBruce Richardson }
109599a2dd95SBruce Richardson 
109699a2dd95SBruce Richardson int
109799a2dd95SBruce Richardson eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs)
109899a2dd95SBruce Richardson {
109999a2dd95SBruce Richardson 	int seg, ret = 0;
110099a2dd95SBruce Richardson 	struct internal_config *internal_conf =
110199a2dd95SBruce Richardson 		eal_get_internal_configuration();
110299a2dd95SBruce Richardson 
110399a2dd95SBruce Richardson 	/* dynamic free not supported in legacy mode */
110499a2dd95SBruce Richardson 	if (internal_conf->legacy_mem)
110599a2dd95SBruce Richardson 		return -1;
110699a2dd95SBruce Richardson 
110799a2dd95SBruce Richardson 	for (seg = 0; seg < n_segs; seg++) {
110899a2dd95SBruce Richardson 		struct rte_memseg *cur = ms[seg];
110999a2dd95SBruce Richardson 		struct hugepage_info *hi = NULL;
111099a2dd95SBruce Richardson 		struct free_walk_param wa;
111199a2dd95SBruce Richardson 		int i, walk_res;
111299a2dd95SBruce Richardson 
111399a2dd95SBruce Richardson 		/* if this page is marked as unfreeable, fail */
111499a2dd95SBruce Richardson 		if (cur->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) {
111599a2dd95SBruce Richardson 			RTE_LOG(DEBUG, EAL, "Page is not allowed to be freed\n");
111699a2dd95SBruce Richardson 			ret = -1;
111799a2dd95SBruce Richardson 			continue;
111899a2dd95SBruce Richardson 		}
111999a2dd95SBruce Richardson 
112099a2dd95SBruce Richardson 		memset(&wa, 0, sizeof(wa));
112199a2dd95SBruce Richardson 
112299a2dd95SBruce Richardson 		for (i = 0; i < (int)RTE_DIM(internal_conf->hugepage_info);
112399a2dd95SBruce Richardson 				i++) {
112499a2dd95SBruce Richardson 			hi = &internal_conf->hugepage_info[i];
112599a2dd95SBruce Richardson 			if (cur->hugepage_sz == hi->hugepage_sz)
112699a2dd95SBruce Richardson 				break;
112799a2dd95SBruce Richardson 		}
112899a2dd95SBruce Richardson 		if (i == (int)RTE_DIM(internal_conf->hugepage_info)) {
112999a2dd95SBruce Richardson 			RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
113099a2dd95SBruce Richardson 			ret = -1;
113199a2dd95SBruce Richardson 			continue;
113299a2dd95SBruce Richardson 		}
113399a2dd95SBruce Richardson 
113499a2dd95SBruce Richardson 		wa.ms = cur;
113599a2dd95SBruce Richardson 		wa.hi = hi;
113699a2dd95SBruce Richardson 
113799a2dd95SBruce Richardson 		/* memalloc is locked, so it's safe to use thread-unsafe version
113899a2dd95SBruce Richardson 		 */
113999a2dd95SBruce Richardson 		walk_res = rte_memseg_list_walk_thread_unsafe(free_seg_walk,
114099a2dd95SBruce Richardson 				&wa);
114199a2dd95SBruce Richardson 		if (walk_res == 1)
114299a2dd95SBruce Richardson 			continue;
114399a2dd95SBruce Richardson 		if (walk_res == 0)
114499a2dd95SBruce Richardson 			RTE_LOG(ERR, EAL, "Couldn't find memseg list\n");
114599a2dd95SBruce Richardson 		ret = -1;
114699a2dd95SBruce Richardson 	}
114799a2dd95SBruce Richardson 	return ret;
114899a2dd95SBruce Richardson }
114999a2dd95SBruce Richardson 
115099a2dd95SBruce Richardson int
115199a2dd95SBruce Richardson eal_memalloc_free_seg(struct rte_memseg *ms)
115299a2dd95SBruce Richardson {
115399a2dd95SBruce Richardson 	const struct internal_config *internal_conf =
115499a2dd95SBruce Richardson 		eal_get_internal_configuration();
115599a2dd95SBruce Richardson 
115699a2dd95SBruce Richardson 	/* dynamic free not supported in legacy mode */
115799a2dd95SBruce Richardson 	if (internal_conf->legacy_mem)
115899a2dd95SBruce Richardson 		return -1;
115999a2dd95SBruce Richardson 
116099a2dd95SBruce Richardson 	return eal_memalloc_free_seg_bulk(&ms, 1);
116199a2dd95SBruce Richardson }
116299a2dd95SBruce Richardson 
/*
 * Replay one contiguous run of changes from the primary's memseg list into
 * this (secondary) process' local shadow list.
 *
 * @param primary_msl
 *   primary process' memseg list (the source of truth).
 * @param local_msl
 *   this process' local list being brought up to date.
 * @param hi
 *   hugepage info matching this list's page size.
 * @param msl_idx
 *   index of the memseg list within the memory config.
 * @param used
 *   true to replay allocations, false to replay deallocations.
 * @param start
 *   index of the first segment known to differ between the lists.
 * @param end
 *   one past the last segment index that may differ.
 * @return
 *   number of segments from 'start' now known to be in sync (the caller
 *   advances by this much and calls again), or -1 on failure.
 */
static int
sync_chunk(struct rte_memseg_list *primary_msl,
		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
		unsigned int msl_idx, bool used, int start, int end)
{
	struct rte_fbarray *l_arr, *p_arr;
	int i, ret, chunk_len, diff_len;

	l_arr = &local_msl->memseg_arr;
	p_arr = &primary_msl->memseg_arr;

	/* we need to aggregate allocations/deallocations into bigger chunks,
	 * as we don't want to spam the user with per-page callbacks.
	 *
	 * to avoid any potential issues, we also want to trigger
	 * deallocation callbacks *before* we actually deallocate
	 * memory, so that the user application could wrap up its use
	 * before it goes away.
	 */

	chunk_len = end - start;

	/* find how many contiguous pages we can map/unmap for this chunk.
	 * note: we look at the *opposite* state in the local list - pages
	 * that are free locally but used in the primary (or vice versa)
	 * are exactly the ones that need syncing.
	 */
	diff_len = used ?
			rte_fbarray_find_contig_free(l_arr, start) :
			rte_fbarray_find_contig_used(l_arr, start);

	/* has to be at least one page */
	if (diff_len < 1)
		return -1;

	diff_len = RTE_MIN(chunk_len, diff_len);

	/* if we are freeing memory, notify the application */
	if (!used) {
		struct rte_memseg *ms;
		void *start_va;
		size_t len, page_sz;

		ms = rte_fbarray_get(l_arr, start);
		start_va = ms->addr;
		page_sz = (size_t)primary_msl->page_sz;
		len = page_sz * diff_len;

		eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE,
				start_va, len);
	}

	for (i = 0; i < diff_len; i++) {
		struct rte_memseg *p_ms, *l_ms;
		int seg_idx = start + i;

		l_ms = rte_fbarray_get(l_arr, seg_idx);
		p_ms = rte_fbarray_get(p_arr, seg_idx);

		if (l_ms == NULL || p_ms == NULL)
			return -1;

		if (used) {
			/* map the page locally at the primary's address */
			ret = alloc_seg(l_ms, p_ms->addr,
					p_ms->socket_id, hi,
					msl_idx, seg_idx);
			if (ret < 0)
				return -1;
			rte_fbarray_set_used(l_arr, seg_idx);
		} else {
			/* note: the local list is marked free before
			 * free_seg()'s result is checked - the error is
			 * only reported afterwards.
			 */
			ret = free_seg(l_ms, hi, msl_idx, seg_idx);
			rte_fbarray_set_free(l_arr, seg_idx);
			if (ret < 0)
				return -1;
		}
	}

	/* if we just allocated memory, notify the application */
	if (used) {
		struct rte_memseg *ms;
		void *start_va;
		size_t len, page_sz;

		ms = rte_fbarray_get(l_arr, start);
		start_va = ms->addr;
		page_sz = (size_t)primary_msl->page_sz;
		len = page_sz * diff_len;

		eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC,
				start_va, len);
	}

	/* calculate how much we can advance until next chunk - i.e. how many
	 * segments from 'start' now match the requested state locally.
	 */
	diff_len = used ?
			rte_fbarray_find_contig_used(l_arr, start) :
			rte_fbarray_find_contig_free(l_arr, start);
	ret = RTE_MIN(chunk_len, diff_len);

	return ret;
}
125999a2dd95SBruce Richardson 
/*
 * Walk both the primary's and the local copy of one memseg list and resolve
 * every discrepancy in one direction: with used == true, everything
 * allocated in the primary becomes allocated locally; with used == false,
 * the same is done for free space.
 *
 * @param primary_msl
 *   primary process' memseg list (the source of truth).
 * @param local_msl
 *   this process' local list being brought up to date.
 * @param hi
 *   hugepage info matching this list's page size.
 * @param msl_idx
 *   index of the memseg list within the memory config.
 * @param used
 *   which state to reconcile: true for used segments, false for free ones.
 * @return 0 on success, -1 if any chunk failed to synchronize.
 */
static int
sync_status(struct rte_memseg_list *primary_msl,
		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
		unsigned int msl_idx, bool used)
{
	struct rte_fbarray *l_arr, *p_arr;
	int p_idx, l_chunk_len, p_chunk_len, ret;
	int start, end;

	/* this is a little bit tricky, but the basic idea is - walk both lists
	 * and spot any places where there are discrepancies. walking both lists
	 * and noting discrepancies in a single go is a hard problem, so we do
	 * it in two passes - first we spot any places where allocated segments
	 * mismatch (i.e. ensure that everything that's allocated in the primary
	 * is also allocated in the secondary), and then we do it by looking at
	 * free segments instead.
	 *
	 * we also need to aggregate changes into chunks, as we have to call
	 * callbacks per allocation, not per page.
	 */
	l_arr = &local_msl->memseg_arr;
	p_arr = &primary_msl->memseg_arr;

	/* start at the first chunk of interest in the primary's list */
	if (used)
		p_idx = rte_fbarray_find_next_used(p_arr, 0);
	else
		p_idx = rte_fbarray_find_next_free(p_arr, 0);

	while (p_idx >= 0) {
		int next_chunk_search_idx;

		/* measure the chunk at p_idx in both lists */
		if (used) {
			p_chunk_len = rte_fbarray_find_contig_used(p_arr,
					p_idx);
			l_chunk_len = rte_fbarray_find_contig_used(l_arr,
					p_idx);
		} else {
			p_chunk_len = rte_fbarray_find_contig_free(p_arr,
					p_idx);
			l_chunk_len = rte_fbarray_find_contig_free(l_arr,
					p_idx);
		}
		/* best case scenario - no differences (or bigger, which will be
		 * fixed during next iteration), look for next chunk
		 */
		if (l_chunk_len >= p_chunk_len) {
			next_chunk_search_idx = p_idx + p_chunk_len;
			goto next_chunk;
		}

		/* if both chunks start at the same point, skip parts we know
		 * are identical, and sync the rest. each call to sync_chunk
		 * will only sync contiguous segments, so we need to call this
		 * until we are sure there are no more differences in this
		 * chunk.
		 */
		start = p_idx + l_chunk_len;
		end = p_idx + p_chunk_len;
		do {
			ret = sync_chunk(primary_msl, local_msl, hi, msl_idx,
					used, start, end);
			start += ret;
		} while (start < end && ret >= 0);
		/* if ret is negative, something went wrong */
		if (ret < 0)
			return -1;

		next_chunk_search_idx = p_idx + p_chunk_len;
next_chunk:
		/* skip to end of this chunk */
		if (used) {
			p_idx = rte_fbarray_find_next_used(p_arr,
					next_chunk_search_idx);
		} else {
			p_idx = rte_fbarray_find_next_free(p_arr,
					next_chunk_search_idx);
		}
	}
	return 0;
}
134099a2dd95SBruce Richardson 
134199a2dd95SBruce Richardson static int
134299a2dd95SBruce Richardson sync_existing(struct rte_memseg_list *primary_msl,
134399a2dd95SBruce Richardson 		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
134499a2dd95SBruce Richardson 		unsigned int msl_idx)
134599a2dd95SBruce Richardson {
134699a2dd95SBruce Richardson 	int ret, dir_fd;
134799a2dd95SBruce Richardson 
134899a2dd95SBruce Richardson 	/* do not allow any page allocations during the time we're allocating,
134999a2dd95SBruce Richardson 	 * because file creation and locking operations are not atomic,
135099a2dd95SBruce Richardson 	 * and we might be the first or the last ones to use a particular page,
135199a2dd95SBruce Richardson 	 * so we need to ensure atomicity of every operation.
135299a2dd95SBruce Richardson 	 */
135399a2dd95SBruce Richardson 	dir_fd = open(hi->hugedir, O_RDONLY);
135499a2dd95SBruce Richardson 	if (dir_fd < 0) {
135599a2dd95SBruce Richardson 		RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", __func__,
135699a2dd95SBruce Richardson 			hi->hugedir, strerror(errno));
135799a2dd95SBruce Richardson 		return -1;
135899a2dd95SBruce Richardson 	}
135999a2dd95SBruce Richardson 	/* blocking writelock */
136099a2dd95SBruce Richardson 	if (flock(dir_fd, LOCK_EX)) {
136199a2dd95SBruce Richardson 		RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", __func__,
136299a2dd95SBruce Richardson 			hi->hugedir, strerror(errno));
136399a2dd95SBruce Richardson 		close(dir_fd);
136499a2dd95SBruce Richardson 		return -1;
136599a2dd95SBruce Richardson 	}
136699a2dd95SBruce Richardson 
136799a2dd95SBruce Richardson 	/* ensure all allocated space is the same in both lists */
136899a2dd95SBruce Richardson 	ret = sync_status(primary_msl, local_msl, hi, msl_idx, true);
136999a2dd95SBruce Richardson 	if (ret < 0)
137099a2dd95SBruce Richardson 		goto fail;
137199a2dd95SBruce Richardson 
137299a2dd95SBruce Richardson 	/* ensure all unallocated space is the same in both lists */
137399a2dd95SBruce Richardson 	ret = sync_status(primary_msl, local_msl, hi, msl_idx, false);
137499a2dd95SBruce Richardson 	if (ret < 0)
137599a2dd95SBruce Richardson 		goto fail;
137699a2dd95SBruce Richardson 
137799a2dd95SBruce Richardson 	/* update version number */
137899a2dd95SBruce Richardson 	local_msl->version = primary_msl->version;
137999a2dd95SBruce Richardson 
138099a2dd95SBruce Richardson 	close(dir_fd);
138199a2dd95SBruce Richardson 
138299a2dd95SBruce Richardson 	return 0;
138399a2dd95SBruce Richardson fail:
138499a2dd95SBruce Richardson 	close(dir_fd);
138599a2dd95SBruce Richardson 	return -1;
138699a2dd95SBruce Richardson }
138799a2dd95SBruce Richardson 
138899a2dd95SBruce Richardson static int
138999a2dd95SBruce Richardson sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused)
139099a2dd95SBruce Richardson {
139199a2dd95SBruce Richardson 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
139299a2dd95SBruce Richardson 	struct rte_memseg_list *primary_msl, *local_msl;
139399a2dd95SBruce Richardson 	struct hugepage_info *hi = NULL;
139499a2dd95SBruce Richardson 	unsigned int i;
139599a2dd95SBruce Richardson 	int msl_idx;
139699a2dd95SBruce Richardson 	struct internal_config *internal_conf =
139799a2dd95SBruce Richardson 		eal_get_internal_configuration();
139899a2dd95SBruce Richardson 
139999a2dd95SBruce Richardson 	if (msl->external)
140099a2dd95SBruce Richardson 		return 0;
140199a2dd95SBruce Richardson 
140299a2dd95SBruce Richardson 	msl_idx = msl - mcfg->memsegs;
140399a2dd95SBruce Richardson 	primary_msl = &mcfg->memsegs[msl_idx];
140499a2dd95SBruce Richardson 	local_msl = &local_memsegs[msl_idx];
140599a2dd95SBruce Richardson 
140699a2dd95SBruce Richardson 	for (i = 0; i < RTE_DIM(internal_conf->hugepage_info); i++) {
140799a2dd95SBruce Richardson 		uint64_t cur_sz =
140899a2dd95SBruce Richardson 			internal_conf->hugepage_info[i].hugepage_sz;
140999a2dd95SBruce Richardson 		uint64_t msl_sz = primary_msl->page_sz;
141099a2dd95SBruce Richardson 		if (msl_sz == cur_sz) {
141199a2dd95SBruce Richardson 			hi = &internal_conf->hugepage_info[i];
141299a2dd95SBruce Richardson 			break;
141399a2dd95SBruce Richardson 		}
141499a2dd95SBruce Richardson 	}
141599a2dd95SBruce Richardson 	if (!hi) {
141699a2dd95SBruce Richardson 		RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
141799a2dd95SBruce Richardson 		return -1;
141899a2dd95SBruce Richardson 	}
141999a2dd95SBruce Richardson 
142099a2dd95SBruce Richardson 	/* if versions don't match, synchronize everything */
142199a2dd95SBruce Richardson 	if (local_msl->version != primary_msl->version &&
142299a2dd95SBruce Richardson 			sync_existing(primary_msl, local_msl, hi, msl_idx))
142399a2dd95SBruce Richardson 		return -1;
142499a2dd95SBruce Richardson 	return 0;
142599a2dd95SBruce Richardson }
142699a2dd95SBruce Richardson 
142799a2dd95SBruce Richardson 
142899a2dd95SBruce Richardson int
142999a2dd95SBruce Richardson eal_memalloc_sync_with_primary(void)
143099a2dd95SBruce Richardson {
143199a2dd95SBruce Richardson 	/* nothing to be done in primary */
143299a2dd95SBruce Richardson 	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
143399a2dd95SBruce Richardson 		return 0;
143499a2dd95SBruce Richardson 
143599a2dd95SBruce Richardson 	/* memalloc is locked, so it's safe to call thread-unsafe version */
143699a2dd95SBruce Richardson 	if (rte_memseg_list_walk_thread_unsafe(sync_walk, NULL))
143799a2dd95SBruce Richardson 		return -1;
143899a2dd95SBruce Richardson 	return 0;
143999a2dd95SBruce Richardson }
144099a2dd95SBruce Richardson 
144199a2dd95SBruce Richardson static int
144299a2dd95SBruce Richardson secondary_msl_create_walk(const struct rte_memseg_list *msl,
144399a2dd95SBruce Richardson 		void *arg __rte_unused)
144499a2dd95SBruce Richardson {
144599a2dd95SBruce Richardson 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
144699a2dd95SBruce Richardson 	struct rte_memseg_list *primary_msl, *local_msl;
144799a2dd95SBruce Richardson 	char name[PATH_MAX];
144899a2dd95SBruce Richardson 	int msl_idx, ret;
144999a2dd95SBruce Richardson 
145099a2dd95SBruce Richardson 	if (msl->external)
145199a2dd95SBruce Richardson 		return 0;
145299a2dd95SBruce Richardson 
145399a2dd95SBruce Richardson 	msl_idx = msl - mcfg->memsegs;
145499a2dd95SBruce Richardson 	primary_msl = &mcfg->memsegs[msl_idx];
145599a2dd95SBruce Richardson 	local_msl = &local_memsegs[msl_idx];
145699a2dd95SBruce Richardson 
145799a2dd95SBruce Richardson 	/* create distinct fbarrays for each secondary */
145899a2dd95SBruce Richardson 	snprintf(name, RTE_FBARRAY_NAME_LEN, "%s_%i",
145999a2dd95SBruce Richardson 		primary_msl->memseg_arr.name, getpid());
146099a2dd95SBruce Richardson 
146199a2dd95SBruce Richardson 	ret = rte_fbarray_init(&local_msl->memseg_arr, name,
146299a2dd95SBruce Richardson 		primary_msl->memseg_arr.len,
146399a2dd95SBruce Richardson 		primary_msl->memseg_arr.elt_sz);
146499a2dd95SBruce Richardson 	if (ret < 0) {
146599a2dd95SBruce Richardson 		RTE_LOG(ERR, EAL, "Cannot initialize local memory map\n");
146699a2dd95SBruce Richardson 		return -1;
146799a2dd95SBruce Richardson 	}
146899a2dd95SBruce Richardson 	local_msl->base_va = primary_msl->base_va;
146999a2dd95SBruce Richardson 	local_msl->len = primary_msl->len;
147099a2dd95SBruce Richardson 
147199a2dd95SBruce Richardson 	return 0;
147299a2dd95SBruce Richardson }
147399a2dd95SBruce Richardson 
147499a2dd95SBruce Richardson static int
147599a2dd95SBruce Richardson secondary_msl_destroy_walk(const struct rte_memseg_list *msl,
147699a2dd95SBruce Richardson 		void *arg __rte_unused)
147799a2dd95SBruce Richardson {
147899a2dd95SBruce Richardson 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
147999a2dd95SBruce Richardson 	struct rte_memseg_list *local_msl;
148099a2dd95SBruce Richardson 	int msl_idx, ret;
148199a2dd95SBruce Richardson 
148299a2dd95SBruce Richardson 	if (msl->external)
148399a2dd95SBruce Richardson 		return 0;
148499a2dd95SBruce Richardson 
148599a2dd95SBruce Richardson 	msl_idx = msl - mcfg->memsegs;
148699a2dd95SBruce Richardson 	local_msl = &local_memsegs[msl_idx];
148799a2dd95SBruce Richardson 
148899a2dd95SBruce Richardson 	ret = rte_fbarray_destroy(&local_msl->memseg_arr);
148999a2dd95SBruce Richardson 	if (ret < 0) {
149099a2dd95SBruce Richardson 		RTE_LOG(ERR, EAL, "Cannot destroy local memory map\n");
149199a2dd95SBruce Richardson 		return -1;
149299a2dd95SBruce Richardson 	}
149399a2dd95SBruce Richardson 	local_msl->base_va = NULL;
149499a2dd95SBruce Richardson 	local_msl->len = 0;
149599a2dd95SBruce Richardson 
149699a2dd95SBruce Richardson 	return 0;
149799a2dd95SBruce Richardson }
149899a2dd95SBruce Richardson 
149999a2dd95SBruce Richardson static int
150099a2dd95SBruce Richardson alloc_list(int list_idx, int len)
150199a2dd95SBruce Richardson {
150299a2dd95SBruce Richardson 	int *data;
150399a2dd95SBruce Richardson 	int i;
150499a2dd95SBruce Richardson 	const struct internal_config *internal_conf =
150599a2dd95SBruce Richardson 		eal_get_internal_configuration();
150699a2dd95SBruce Richardson 
150799a2dd95SBruce Richardson 	/* single-file segments mode does not need fd list */
150899a2dd95SBruce Richardson 	if (!internal_conf->single_file_segments) {
150999a2dd95SBruce Richardson 		/* ensure we have space to store fd per each possible segment */
151099a2dd95SBruce Richardson 		data = malloc(sizeof(int) * len);
151199a2dd95SBruce Richardson 		if (data == NULL) {
151299a2dd95SBruce Richardson 			RTE_LOG(ERR, EAL, "Unable to allocate space for file descriptors\n");
151399a2dd95SBruce Richardson 			return -1;
151499a2dd95SBruce Richardson 		}
151599a2dd95SBruce Richardson 		/* set all fd's as invalid */
151699a2dd95SBruce Richardson 		for (i = 0; i < len; i++)
151799a2dd95SBruce Richardson 			data[i] = -1;
151899a2dd95SBruce Richardson 		fd_list[list_idx].fds = data;
151999a2dd95SBruce Richardson 		fd_list[list_idx].len = len;
152099a2dd95SBruce Richardson 	} else {
152199a2dd95SBruce Richardson 		fd_list[list_idx].fds = NULL;
152299a2dd95SBruce Richardson 		fd_list[list_idx].len = 0;
152399a2dd95SBruce Richardson 	}
152499a2dd95SBruce Richardson 
152599a2dd95SBruce Richardson 	fd_list[list_idx].count = 0;
152699a2dd95SBruce Richardson 	fd_list[list_idx].memseg_list_fd = -1;
152799a2dd95SBruce Richardson 
152899a2dd95SBruce Richardson 	return 0;
152999a2dd95SBruce Richardson }
153099a2dd95SBruce Richardson 
153199a2dd95SBruce Richardson static int
153299a2dd95SBruce Richardson destroy_list(int list_idx)
153399a2dd95SBruce Richardson {
153499a2dd95SBruce Richardson 	const struct internal_config *internal_conf =
153599a2dd95SBruce Richardson 			eal_get_internal_configuration();
153699a2dd95SBruce Richardson 
153799a2dd95SBruce Richardson 	/* single-file segments mode does not need fd list */
153899a2dd95SBruce Richardson 	if (!internal_conf->single_file_segments) {
153999a2dd95SBruce Richardson 		int *fds = fd_list[list_idx].fds;
154099a2dd95SBruce Richardson 		int i;
154199a2dd95SBruce Richardson 		/* go through each fd and ensure it's closed */
154299a2dd95SBruce Richardson 		for (i = 0; i < fd_list[list_idx].len; i++) {
154399a2dd95SBruce Richardson 			if (fds[i] >= 0) {
154499a2dd95SBruce Richardson 				close(fds[i]);
154599a2dd95SBruce Richardson 				fds[i] = -1;
154699a2dd95SBruce Richardson 			}
154799a2dd95SBruce Richardson 		}
154899a2dd95SBruce Richardson 		free(fds);
154999a2dd95SBruce Richardson 		fd_list[list_idx].fds = NULL;
155099a2dd95SBruce Richardson 		fd_list[list_idx].len = 0;
155199a2dd95SBruce Richardson 	} else if (fd_list[list_idx].memseg_list_fd >= 0) {
155299a2dd95SBruce Richardson 		close(fd_list[list_idx].memseg_list_fd);
155399a2dd95SBruce Richardson 		fd_list[list_idx].count = 0;
155499a2dd95SBruce Richardson 		fd_list[list_idx].memseg_list_fd = -1;
155599a2dd95SBruce Richardson 	}
155699a2dd95SBruce Richardson 	return 0;
155799a2dd95SBruce Richardson }
155899a2dd95SBruce Richardson 
155999a2dd95SBruce Richardson static int
156099a2dd95SBruce Richardson fd_list_create_walk(const struct rte_memseg_list *msl,
156199a2dd95SBruce Richardson 		void *arg __rte_unused)
156299a2dd95SBruce Richardson {
156399a2dd95SBruce Richardson 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
156499a2dd95SBruce Richardson 	unsigned int len;
156599a2dd95SBruce Richardson 	int msl_idx;
156699a2dd95SBruce Richardson 
156799a2dd95SBruce Richardson 	if (msl->external)
156899a2dd95SBruce Richardson 		return 0;
156999a2dd95SBruce Richardson 
157099a2dd95SBruce Richardson 	msl_idx = msl - mcfg->memsegs;
157199a2dd95SBruce Richardson 	len = msl->memseg_arr.len;
157299a2dd95SBruce Richardson 
157399a2dd95SBruce Richardson 	return alloc_list(msl_idx, len);
157499a2dd95SBruce Richardson }
157599a2dd95SBruce Richardson 
157699a2dd95SBruce Richardson static int
157799a2dd95SBruce Richardson fd_list_destroy_walk(const struct rte_memseg_list *msl, void *arg __rte_unused)
157899a2dd95SBruce Richardson {
157999a2dd95SBruce Richardson 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
158099a2dd95SBruce Richardson 	int msl_idx;
158199a2dd95SBruce Richardson 
158299a2dd95SBruce Richardson 	if (msl->external)
158399a2dd95SBruce Richardson 		return 0;
158499a2dd95SBruce Richardson 
158599a2dd95SBruce Richardson 	msl_idx = msl - mcfg->memsegs;
158699a2dd95SBruce Richardson 
158799a2dd95SBruce Richardson 	return destroy_list(msl_idx);
158899a2dd95SBruce Richardson }
158999a2dd95SBruce Richardson 
159099a2dd95SBruce Richardson int
159199a2dd95SBruce Richardson eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd)
159299a2dd95SBruce Richardson {
159399a2dd95SBruce Richardson 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
159499a2dd95SBruce Richardson 	const struct internal_config *internal_conf =
159599a2dd95SBruce Richardson 		eal_get_internal_configuration();
159699a2dd95SBruce Richardson 
159799a2dd95SBruce Richardson 	/* single file segments mode doesn't support individual segment fd's */
159899a2dd95SBruce Richardson 	if (internal_conf->single_file_segments)
159999a2dd95SBruce Richardson 		return -ENOTSUP;
160099a2dd95SBruce Richardson 
160199a2dd95SBruce Richardson 	/* if list is not allocated, allocate it */
160299a2dd95SBruce Richardson 	if (fd_list[list_idx].len == 0) {
160399a2dd95SBruce Richardson 		int len = mcfg->memsegs[list_idx].memseg_arr.len;
160499a2dd95SBruce Richardson 
160599a2dd95SBruce Richardson 		if (alloc_list(list_idx, len) < 0)
160699a2dd95SBruce Richardson 			return -ENOMEM;
160799a2dd95SBruce Richardson 	}
160899a2dd95SBruce Richardson 	fd_list[list_idx].fds[seg_idx] = fd;
160999a2dd95SBruce Richardson 
161099a2dd95SBruce Richardson 	return 0;
161199a2dd95SBruce Richardson }
161299a2dd95SBruce Richardson 
161399a2dd95SBruce Richardson int
161499a2dd95SBruce Richardson eal_memalloc_set_seg_list_fd(int list_idx, int fd)
161599a2dd95SBruce Richardson {
161699a2dd95SBruce Richardson 	const struct internal_config *internal_conf =
161799a2dd95SBruce Richardson 		eal_get_internal_configuration();
161899a2dd95SBruce Richardson 
161999a2dd95SBruce Richardson 	/* non-single file segment mode doesn't support segment list fd's */
162099a2dd95SBruce Richardson 	if (!internal_conf->single_file_segments)
162199a2dd95SBruce Richardson 		return -ENOTSUP;
162299a2dd95SBruce Richardson 
162399a2dd95SBruce Richardson 	fd_list[list_idx].memseg_list_fd = fd;
162499a2dd95SBruce Richardson 
162599a2dd95SBruce Richardson 	return 0;
162699a2dd95SBruce Richardson }
162799a2dd95SBruce Richardson 
162899a2dd95SBruce Richardson int
162999a2dd95SBruce Richardson eal_memalloc_get_seg_fd(int list_idx, int seg_idx)
163099a2dd95SBruce Richardson {
163199a2dd95SBruce Richardson 	int fd;
163299a2dd95SBruce Richardson 	const struct internal_config *internal_conf =
163399a2dd95SBruce Richardson 		eal_get_internal_configuration();
163499a2dd95SBruce Richardson 
163599a2dd95SBruce Richardson 	if (internal_conf->in_memory || internal_conf->no_hugetlbfs) {
163699a2dd95SBruce Richardson #ifndef MEMFD_SUPPORTED
163799a2dd95SBruce Richardson 		/* in in-memory or no-huge mode, we rely on memfd support */
163899a2dd95SBruce Richardson 		return -ENOTSUP;
163999a2dd95SBruce Richardson #endif
164099a2dd95SBruce Richardson 		/* memfd supported, but hugetlbfs memfd may not be */
164199a2dd95SBruce Richardson 		if (!internal_conf->no_hugetlbfs && !memfd_create_supported)
164299a2dd95SBruce Richardson 			return -ENOTSUP;
164399a2dd95SBruce Richardson 	}
164499a2dd95SBruce Richardson 
164599a2dd95SBruce Richardson 	if (internal_conf->single_file_segments) {
164699a2dd95SBruce Richardson 		fd = fd_list[list_idx].memseg_list_fd;
164799a2dd95SBruce Richardson 	} else if (fd_list[list_idx].len == 0) {
164899a2dd95SBruce Richardson 		/* list not initialized */
164999a2dd95SBruce Richardson 		fd = -1;
165099a2dd95SBruce Richardson 	} else {
165199a2dd95SBruce Richardson 		fd = fd_list[list_idx].fds[seg_idx];
165299a2dd95SBruce Richardson 	}
165399a2dd95SBruce Richardson 	if (fd < 0)
165499a2dd95SBruce Richardson 		return -ENODEV;
165599a2dd95SBruce Richardson 	return fd;
165699a2dd95SBruce Richardson }
165799a2dd95SBruce Richardson 
/*
 * Probe whether memfd_create() works with hugetlb page-size flags on this
 * system.
 *
 * @return 1 if supported, 0 if not supported (this also clears the
 *   memfd_create_supported flag), -1 on unexpected error.
 */
static int
test_memfd_create(void)
{
#ifdef MEMFD_SUPPORTED
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();
	unsigned int i;
	/* note: every path through the loop body returns, so effectively only
	 * the first configured hugepage size is probed.
	 */
	for (i = 0; i < internal_conf->num_hugepage_sizes; i++) {
		uint64_t pagesz = internal_conf->hugepage_info[i].hugepage_sz;
		int pagesz_flag = pagesz_flags(pagesz);
		int flags;

		flags = pagesz_flag | RTE_MFD_HUGETLB;
		int fd = memfd_create("test", flags);
		if (fd < 0) {
			/* we failed - let memalloc know this isn't working */
			if (errno == EINVAL) {
				/* presumably the kernel rejected the hugetlb
				 * flags as unrecognized - treat as unsupported
				 */
				memfd_create_supported = 0;
				return 0; /* not supported */
			}

			/* we got other error - something's wrong */
			return -1; /* error */
		}
		close(fd);
		return 1; /* supported */
	}
#endif
	return 0; /* not supported */
}
168899a2dd95SBruce Richardson 
168999a2dd95SBruce Richardson int
169099a2dd95SBruce Richardson eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset)
169199a2dd95SBruce Richardson {
169299a2dd95SBruce Richardson 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
169399a2dd95SBruce Richardson 	const struct internal_config *internal_conf =
169499a2dd95SBruce Richardson 		eal_get_internal_configuration();
169599a2dd95SBruce Richardson 
169699a2dd95SBruce Richardson 	if (internal_conf->in_memory || internal_conf->no_hugetlbfs) {
169799a2dd95SBruce Richardson #ifndef MEMFD_SUPPORTED
169899a2dd95SBruce Richardson 		/* in in-memory or no-huge mode, we rely on memfd support */
169999a2dd95SBruce Richardson 		return -ENOTSUP;
170099a2dd95SBruce Richardson #endif
170199a2dd95SBruce Richardson 		/* memfd supported, but hugetlbfs memfd may not be */
170299a2dd95SBruce Richardson 		if (!internal_conf->no_hugetlbfs && !memfd_create_supported)
170399a2dd95SBruce Richardson 			return -ENOTSUP;
170499a2dd95SBruce Richardson 	}
170599a2dd95SBruce Richardson 
170699a2dd95SBruce Richardson 	if (internal_conf->single_file_segments) {
170799a2dd95SBruce Richardson 		size_t pgsz = mcfg->memsegs[list_idx].page_sz;
170899a2dd95SBruce Richardson 
170999a2dd95SBruce Richardson 		/* segment not active? */
171099a2dd95SBruce Richardson 		if (fd_list[list_idx].memseg_list_fd < 0)
171199a2dd95SBruce Richardson 			return -ENOENT;
171299a2dd95SBruce Richardson 		*offset = pgsz * seg_idx;
171399a2dd95SBruce Richardson 	} else {
171499a2dd95SBruce Richardson 		/* fd_list not initialized? */
171599a2dd95SBruce Richardson 		if (fd_list[list_idx].len == 0)
171699a2dd95SBruce Richardson 			return -ENODEV;
171799a2dd95SBruce Richardson 
171899a2dd95SBruce Richardson 		/* segment not active? */
171999a2dd95SBruce Richardson 		if (fd_list[list_idx].fds[seg_idx] < 0)
172099a2dd95SBruce Richardson 			return -ENOENT;
172199a2dd95SBruce Richardson 		*offset = 0;
172299a2dd95SBruce Richardson 	}
172399a2dd95SBruce Richardson 	return 0;
172499a2dd95SBruce Richardson }
172599a2dd95SBruce Richardson 
172699a2dd95SBruce Richardson int
172799a2dd95SBruce Richardson eal_memalloc_cleanup(void)
172899a2dd95SBruce Richardson {
172999a2dd95SBruce Richardson 	/* close all remaining fd's - these are per-process, so it's safe */
173099a2dd95SBruce Richardson 	if (rte_memseg_list_walk_thread_unsafe(fd_list_destroy_walk, NULL))
173199a2dd95SBruce Richardson 		return -1;
173299a2dd95SBruce Richardson 
173399a2dd95SBruce Richardson 	/* destroy the shadow page table if we're a secondary process */
173499a2dd95SBruce Richardson 	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
173599a2dd95SBruce Richardson 		return 0;
173699a2dd95SBruce Richardson 
173799a2dd95SBruce Richardson 	if (rte_memseg_list_walk_thread_unsafe(secondary_msl_destroy_walk,
173899a2dd95SBruce Richardson 			NULL))
173999a2dd95SBruce Richardson 		return -1;
174099a2dd95SBruce Richardson 
174199a2dd95SBruce Richardson 	return 0;
174299a2dd95SBruce Richardson }
174399a2dd95SBruce Richardson 
174499a2dd95SBruce Richardson int
174599a2dd95SBruce Richardson eal_memalloc_init(void)
174699a2dd95SBruce Richardson {
174799a2dd95SBruce Richardson 	const struct internal_config *internal_conf =
174899a2dd95SBruce Richardson 		eal_get_internal_configuration();
174999a2dd95SBruce Richardson 
175099a2dd95SBruce Richardson 	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
175199a2dd95SBruce Richardson 		if (rte_memseg_list_walk(secondary_msl_create_walk, NULL) < 0)
175299a2dd95SBruce Richardson 			return -1;
175399a2dd95SBruce Richardson 	if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
175499a2dd95SBruce Richardson 			internal_conf->in_memory) {
175599a2dd95SBruce Richardson 		int mfd_res = test_memfd_create();
175699a2dd95SBruce Richardson 
175799a2dd95SBruce Richardson 		if (mfd_res < 0) {
175899a2dd95SBruce Richardson 			RTE_LOG(ERR, EAL, "Unable to check if memfd is supported\n");
175999a2dd95SBruce Richardson 			return -1;
176099a2dd95SBruce Richardson 		}
176199a2dd95SBruce Richardson 		if (mfd_res == 1)
176299a2dd95SBruce Richardson 			RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n");
176399a2dd95SBruce Richardson 		else
176499a2dd95SBruce Richardson 			RTE_LOG(INFO, EAL, "Using memfd is not supported, falling back to anonymous hugepages\n");
176599a2dd95SBruce Richardson 
176699a2dd95SBruce Richardson 		/* we only support single-file segments mode with in-memory mode
176799a2dd95SBruce Richardson 		 * if we support hugetlbfs with memfd_create. this code will
176899a2dd95SBruce Richardson 		 * test if we do.
176999a2dd95SBruce Richardson 		 */
177099a2dd95SBruce Richardson 		if (internal_conf->single_file_segments &&
177199a2dd95SBruce Richardson 				mfd_res != 1) {
177299a2dd95SBruce Richardson 			RTE_LOG(ERR, EAL, "Single-file segments mode cannot be used without memfd support\n");
177399a2dd95SBruce Richardson 			return -1;
177499a2dd95SBruce Richardson 		}
177599a2dd95SBruce Richardson 		/* this cannot ever happen but better safe than sorry */
177699a2dd95SBruce Richardson 		if (!anonymous_hugepages_supported) {
177799a2dd95SBruce Richardson 			RTE_LOG(ERR, EAL, "Using anonymous memory is not supported\n");
177899a2dd95SBruce Richardson 			return -1;
177999a2dd95SBruce Richardson 		}
1780*32b4771cSDmitry Kozlyuk 		/* safety net, should be impossible to configure */
1781*32b4771cSDmitry Kozlyuk 		if (internal_conf->hugepage_file.unlink_before_mapping &&
1782*32b4771cSDmitry Kozlyuk 				!internal_conf->hugepage_file.unlink_existing) {
1783*32b4771cSDmitry Kozlyuk 			RTE_LOG(ERR, EAL, "Unlinking existing hugepage files is prohibited, cannot unlink them before mapping.\n");
1784*32b4771cSDmitry Kozlyuk 			return -1;
1785*32b4771cSDmitry Kozlyuk 		}
178699a2dd95SBruce Richardson 	}
178799a2dd95SBruce Richardson 
178899a2dd95SBruce Richardson 	/* initialize all of the fd lists */
178999a2dd95SBruce Richardson 	if (rte_memseg_list_walk(fd_list_create_walk, NULL))
179099a2dd95SBruce Richardson 		return -1;
179199a2dd95SBruce Richardson 	return 0;
179299a2dd95SBruce Richardson }
1793