xref: /dpdk/lib/eal/linux/eal_memory.c (revision ae67895b507bb6af22263c79ba0d5c374b396485)
199a2dd95SBruce Richardson /* SPDX-License-Identifier: BSD-3-Clause
299a2dd95SBruce Richardson  * Copyright(c) 2010-2014 Intel Corporation.
399a2dd95SBruce Richardson  * Copyright(c) 2013 6WIND S.A.
499a2dd95SBruce Richardson  */
599a2dd95SBruce Richardson 
699a2dd95SBruce Richardson #include <errno.h>
799a2dd95SBruce Richardson #include <fcntl.h>
899a2dd95SBruce Richardson #include <stdbool.h>
999a2dd95SBruce Richardson #include <stdlib.h>
1099a2dd95SBruce Richardson #include <stdio.h>
1199a2dd95SBruce Richardson #include <stdint.h>
1299a2dd95SBruce Richardson #include <inttypes.h>
1399a2dd95SBruce Richardson #include <string.h>
1499a2dd95SBruce Richardson #include <sys/mman.h>
1599a2dd95SBruce Richardson #include <sys/stat.h>
1699a2dd95SBruce Richardson #include <sys/file.h>
1799a2dd95SBruce Richardson #include <sys/resource.h>
1899a2dd95SBruce Richardson #include <unistd.h>
1999a2dd95SBruce Richardson #include <limits.h>
2099a2dd95SBruce Richardson #include <signal.h>
2199a2dd95SBruce Richardson #include <setjmp.h>
2299a2dd95SBruce Richardson #ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */
2399a2dd95SBruce Richardson #define MEMFD_SUPPORTED
2499a2dd95SBruce Richardson #endif
2599a2dd95SBruce Richardson #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
2699a2dd95SBruce Richardson #include <numa.h>
2799a2dd95SBruce Richardson #include <numaif.h>
2899a2dd95SBruce Richardson #endif
2999a2dd95SBruce Richardson 
3099a2dd95SBruce Richardson #include <rte_errno.h>
3199a2dd95SBruce Richardson #include <rte_log.h>
3299a2dd95SBruce Richardson #include <rte_memory.h>
3399a2dd95SBruce Richardson #include <rte_eal.h>
3499a2dd95SBruce Richardson #include <rte_lcore.h>
3599a2dd95SBruce Richardson #include <rte_common.h>
3699a2dd95SBruce Richardson 
3799a2dd95SBruce Richardson #include "eal_private.h"
3899a2dd95SBruce Richardson #include "eal_memalloc.h"
3999a2dd95SBruce Richardson #include "eal_memcfg.h"
4099a2dd95SBruce Richardson #include "eal_internal_cfg.h"
4199a2dd95SBruce Richardson #include "eal_filesystem.h"
4299a2dd95SBruce Richardson #include "eal_hugepages.h"
4399a2dd95SBruce Richardson #include "eal_options.h"
4499a2dd95SBruce Richardson 
4599a2dd95SBruce Richardson #define PFN_MASK_SIZE	8
4699a2dd95SBruce Richardson 
4799a2dd95SBruce Richardson /**
4899a2dd95SBruce Richardson  * @file
4999a2dd95SBruce Richardson  * Huge page mapping under linux
5099a2dd95SBruce Richardson  *
5199a2dd95SBruce Richardson  * To reserve a big contiguous amount of memory, we use the hugepage
5299a2dd95SBruce Richardson  * feature of linux. For that, we need to have hugetlbfs mounted. This
5399a2dd95SBruce Richardson  * code will create many files in this directory (one per page) and
5499a2dd95SBruce Richardson  * map them in virtual memory. For each page, we will retrieve its
5599a2dd95SBruce Richardson  * physical address and remap it in order to have a virtual contiguous
5699a2dd95SBruce Richardson  * zone as well as a physical contiguous zone.
5799a2dd95SBruce Richardson  */
5899a2dd95SBruce Richardson 
5999a2dd95SBruce Richardson static int phys_addrs_available = -1;
6099a2dd95SBruce Richardson 
6199a2dd95SBruce Richardson #define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space"
6299a2dd95SBruce Richardson 
/*
 * Return the base virtual address hint used when reserving memory.
 *
 * The Linux kernel serves mmap() calls from a very high starting address.
 * If a device has IOVA addressing limitations (39 or 40 bits are known
 * cases) and IOVA mode is VA, such addresses may be unreachable for it.
 * Starting at 4GB instead leaves 508GB or 1020GB respectively for mapping
 * the available hugepages, which is likely enough for most systems; a
 * device with addressing limitations should still call
 * rte_mem_check_dma_mask() to ensure all memory is within supported range.
 */
uint64_t eal_get_baseaddr(void)
{
#ifdef RTE_ARCH_LOONGARCH
	return 0x7000000000ULL;
#else
	return 0x100000000ULL;
#endif
}
8699a2dd95SBruce Richardson 
8799a2dd95SBruce Richardson /*
8899a2dd95SBruce Richardson  * Get physical address of any mapped virtual address in the current process.
8999a2dd95SBruce Richardson  */
9099a2dd95SBruce Richardson phys_addr_t
rte_mem_virt2phy(const void * virtaddr)9199a2dd95SBruce Richardson rte_mem_virt2phy(const void *virtaddr)
9299a2dd95SBruce Richardson {
9399a2dd95SBruce Richardson 	int fd, retval;
9499a2dd95SBruce Richardson 	uint64_t page, physaddr;
9599a2dd95SBruce Richardson 	unsigned long virt_pfn;
9699a2dd95SBruce Richardson 	int page_size;
9799a2dd95SBruce Richardson 	off_t offset;
9899a2dd95SBruce Richardson 
9999a2dd95SBruce Richardson 	if (phys_addrs_available == 0)
10099a2dd95SBruce Richardson 		return RTE_BAD_IOVA;
10199a2dd95SBruce Richardson 
10299a2dd95SBruce Richardson 	/* standard page size */
10399a2dd95SBruce Richardson 	page_size = getpagesize();
10499a2dd95SBruce Richardson 
10599a2dd95SBruce Richardson 	fd = open("/proc/self/pagemap", O_RDONLY);
10699a2dd95SBruce Richardson 	if (fd < 0) {
107*ae67895bSDavid Marchand 		EAL_LOG(INFO, "%s(): cannot open /proc/self/pagemap: %s",
10899a2dd95SBruce Richardson 			__func__, strerror(errno));
10999a2dd95SBruce Richardson 		return RTE_BAD_IOVA;
11099a2dd95SBruce Richardson 	}
11199a2dd95SBruce Richardson 
11299a2dd95SBruce Richardson 	virt_pfn = (unsigned long)virtaddr / page_size;
11399a2dd95SBruce Richardson 	offset = sizeof(uint64_t) * virt_pfn;
11499a2dd95SBruce Richardson 	if (lseek(fd, offset, SEEK_SET) == (off_t) -1) {
115*ae67895bSDavid Marchand 		EAL_LOG(INFO, "%s(): seek error in /proc/self/pagemap: %s",
11699a2dd95SBruce Richardson 				__func__, strerror(errno));
11799a2dd95SBruce Richardson 		close(fd);
11899a2dd95SBruce Richardson 		return RTE_BAD_IOVA;
11999a2dd95SBruce Richardson 	}
12099a2dd95SBruce Richardson 
12199a2dd95SBruce Richardson 	retval = read(fd, &page, PFN_MASK_SIZE);
12299a2dd95SBruce Richardson 	close(fd);
12399a2dd95SBruce Richardson 	if (retval < 0) {
124*ae67895bSDavid Marchand 		EAL_LOG(INFO, "%s(): cannot read /proc/self/pagemap: %s",
12599a2dd95SBruce Richardson 				__func__, strerror(errno));
12699a2dd95SBruce Richardson 		return RTE_BAD_IOVA;
12799a2dd95SBruce Richardson 	} else if (retval != PFN_MASK_SIZE) {
128*ae67895bSDavid Marchand 		EAL_LOG(INFO, "%s(): read %d bytes from /proc/self/pagemap "
129*ae67895bSDavid Marchand 				"but expected %d:",
13099a2dd95SBruce Richardson 				__func__, retval, PFN_MASK_SIZE);
13199a2dd95SBruce Richardson 		return RTE_BAD_IOVA;
13299a2dd95SBruce Richardson 	}
13399a2dd95SBruce Richardson 
13499a2dd95SBruce Richardson 	/*
13599a2dd95SBruce Richardson 	 * the pfn (page frame number) are bits 0-54 (see
13699a2dd95SBruce Richardson 	 * pagemap.txt in linux Documentation)
13799a2dd95SBruce Richardson 	 */
13899a2dd95SBruce Richardson 	if ((page & 0x7fffffffffffffULL) == 0)
13999a2dd95SBruce Richardson 		return RTE_BAD_IOVA;
14099a2dd95SBruce Richardson 
14199a2dd95SBruce Richardson 	physaddr = ((page & 0x7fffffffffffffULL) * page_size)
14299a2dd95SBruce Richardson 		+ ((unsigned long)virtaddr % page_size);
14399a2dd95SBruce Richardson 
14499a2dd95SBruce Richardson 	return physaddr;
14599a2dd95SBruce Richardson }
14699a2dd95SBruce Richardson 
14799a2dd95SBruce Richardson rte_iova_t
rte_mem_virt2iova(const void * virtaddr)14899a2dd95SBruce Richardson rte_mem_virt2iova(const void *virtaddr)
14999a2dd95SBruce Richardson {
15099a2dd95SBruce Richardson 	if (rte_eal_iova_mode() == RTE_IOVA_VA)
15199a2dd95SBruce Richardson 		return (uintptr_t)virtaddr;
15299a2dd95SBruce Richardson 	return rte_mem_virt2phy(virtaddr);
15399a2dd95SBruce Richardson }
15499a2dd95SBruce Richardson 
15599a2dd95SBruce Richardson /*
15699a2dd95SBruce Richardson  * For each hugepage in hugepg_tbl, fill the physaddr value. We find
15799a2dd95SBruce Richardson  * it by browsing the /proc/self/pagemap special file.
15899a2dd95SBruce Richardson  */
15999a2dd95SBruce Richardson static int
find_physaddrs(struct hugepage_file * hugepg_tbl,struct hugepage_info * hpi)16099a2dd95SBruce Richardson find_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
16199a2dd95SBruce Richardson {
16299a2dd95SBruce Richardson 	unsigned int i;
16399a2dd95SBruce Richardson 	phys_addr_t addr;
16499a2dd95SBruce Richardson 
16599a2dd95SBruce Richardson 	for (i = 0; i < hpi->num_pages[0]; i++) {
16699a2dd95SBruce Richardson 		addr = rte_mem_virt2phy(hugepg_tbl[i].orig_va);
16799a2dd95SBruce Richardson 		if (addr == RTE_BAD_PHYS_ADDR)
16899a2dd95SBruce Richardson 			return -1;
16999a2dd95SBruce Richardson 		hugepg_tbl[i].physaddr = addr;
17099a2dd95SBruce Richardson 	}
17199a2dd95SBruce Richardson 	return 0;
17299a2dd95SBruce Richardson }
17399a2dd95SBruce Richardson 
17499a2dd95SBruce Richardson /*
17599a2dd95SBruce Richardson  * For each hugepage in hugepg_tbl, fill the physaddr value sequentially.
17699a2dd95SBruce Richardson  */
17799a2dd95SBruce Richardson static int
set_physaddrs(struct hugepage_file * hugepg_tbl,struct hugepage_info * hpi)17899a2dd95SBruce Richardson set_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
17999a2dd95SBruce Richardson {
18099a2dd95SBruce Richardson 	unsigned int i;
18199a2dd95SBruce Richardson 	static phys_addr_t addr;
18299a2dd95SBruce Richardson 
18399a2dd95SBruce Richardson 	for (i = 0; i < hpi->num_pages[0]; i++) {
18499a2dd95SBruce Richardson 		hugepg_tbl[i].physaddr = addr;
18599a2dd95SBruce Richardson 		addr += hugepg_tbl[i].size;
18699a2dd95SBruce Richardson 	}
18799a2dd95SBruce Richardson 	return 0;
18899a2dd95SBruce Richardson }
18999a2dd95SBruce Richardson 
19099a2dd95SBruce Richardson /*
19199a2dd95SBruce Richardson  * Check whether address-space layout randomization is enabled in
19299a2dd95SBruce Richardson  * the kernel. This is important for multi-process as it can prevent
19399a2dd95SBruce Richardson  * two processes mapping data to the same virtual address
19499a2dd95SBruce Richardson  * Returns:
19599a2dd95SBruce Richardson  *    0 - address space randomization disabled
19699a2dd95SBruce Richardson  *    1/2 - address space randomization enabled
19799a2dd95SBruce Richardson  *    negative error code on error
19899a2dd95SBruce Richardson  */
19999a2dd95SBruce Richardson static int
aslr_enabled(void)20099a2dd95SBruce Richardson aslr_enabled(void)
20199a2dd95SBruce Richardson {
20299a2dd95SBruce Richardson 	char c;
20399a2dd95SBruce Richardson 	int retval, fd = open(RANDOMIZE_VA_SPACE_FILE, O_RDONLY);
20499a2dd95SBruce Richardson 	if (fd < 0)
20599a2dd95SBruce Richardson 		return -errno;
20699a2dd95SBruce Richardson 	retval = read(fd, &c, 1);
20799a2dd95SBruce Richardson 	close(fd);
20899a2dd95SBruce Richardson 	if (retval < 0)
20999a2dd95SBruce Richardson 		return -errno;
21099a2dd95SBruce Richardson 	if (retval == 0)
21199a2dd95SBruce Richardson 		return -EIO;
21299a2dd95SBruce Richardson 	switch (c) {
21399a2dd95SBruce Richardson 		case '0' : return 0;
21499a2dd95SBruce Richardson 		case '1' : return 1;
21599a2dd95SBruce Richardson 		case '2' : return 2;
21699a2dd95SBruce Richardson 		default: return -EINVAL;
21799a2dd95SBruce Richardson 	}
21899a2dd95SBruce Richardson }
21999a2dd95SBruce Richardson 
22099a2dd95SBruce Richardson static sigjmp_buf huge_jmpenv;
22199a2dd95SBruce Richardson 
/* SIGBUS handler: jump back to the environment saved before the hugepage
 * was touched, so a mapping fault does not kill the process. */
static void huge_sigbus_handler(int signo __rte_unused)
{
	siglongjmp(huge_jmpenv, 1);
}
22699a2dd95SBruce Richardson 
/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
 * non-static local variable in the stack frame calling sigsetjmp might be
 * clobbered by a call to longjmp.
 * Returns 0 when saving the environment, non-zero when re-entered via
 * siglongjmp() from the SIGBUS handler.
 */
static int huge_wrap_sigsetjmp(void)
{
	return sigsetjmp(huge_jmpenv, 1);
}
23599a2dd95SBruce Richardson 
23699a2dd95SBruce Richardson #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
/* Callback for numa library. */
void numa_error(char *where)
{
	/* report the failing libnuma operation through the EAL logger */
	EAL_LOG(ERR, "%s failed: %s", where, strerror(errno));
}
24299a2dd95SBruce Richardson #endif
24399a2dd95SBruce Richardson 
/*
 * Mmap all hugepages of hugepage table: it first open a file in
 * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
 * virtual address is stored in hugepg_tbl[i].orig_va, else it is stored
 * in hugepg_tbl[i].final_va. The second mapping (when orig is 0) tries to
 * map contiguous physical blocks in contiguous virtual blocks.
 *
 * Returns the number of pages successfully mapped; equals
 * hpi->num_pages[0] on full success.
 */
static unsigned
map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
		  uint64_t *essential_memory __rte_unused)
{
	int fd;
	unsigned i;
	void *virtaddr;
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	int node_id = -1;
	int essential_prev = 0;
	int oldpolicy;
	struct bitmask *oldmask = NULL;
	bool have_numa = true;
	unsigned long maxnode = 0;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* Check if kernel supports NUMA. */
	if (numa_available() != 0) {
		EAL_LOG(DEBUG, "NUMA is not supported.");
		have_numa = false;
	}

	if (have_numa) {
		/* save the current memory policy so it can be restored once
		 * all pages are mapped (see the 'out' label below) */
		EAL_LOG(DEBUG, "Trying to obtain current memory policy.");
		oldmask = numa_allocate_nodemask();
		if (get_mempolicy(&oldpolicy, oldmask->maskp,
				  oldmask->size + 1, 0, 0) < 0) {
			EAL_LOG(ERR,
				"Failed to get current mempolicy: %s. "
				"Assuming MPOL_DEFAULT.", strerror(errno));
			oldpolicy = MPOL_DEFAULT;
		}
		/* maxnode = one past the highest socket with memory requested;
		 * zero means no per-socket requests, NUMA binding is skipped */
		for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
			if (internal_conf->socket_mem[i])
				maxnode = i + 1;
	}
#endif

	for (i = 0; i < hpi->num_pages[0]; i++) {
		struct hugepage_file *hf = &hugepg_tbl[i];
		uint64_t hugepage_sz = hpi->hugepage_sz;

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
		if (maxnode) {
			unsigned int j;

			/* find first socket that still needs memory */
			for (j = 0; j < maxnode; j++)
				if (essential_memory[j])
					break;

			if (j == maxnode) {
				/* all per-socket requests satisfied: spread
				 * remaining pages round-robin over the
				 * sockets that requested memory */
				node_id = (node_id + 1) % maxnode;
				while (!internal_conf->socket_mem[node_id]) {
					node_id++;
					node_id %= maxnode;
				}
				essential_prev = 0;
			} else {
				node_id = j;
				/* remember the previous amount so it can be
				 * restored if this mapping faults (SIGBUS) */
				essential_prev = essential_memory[j];

				if (essential_memory[j] < hugepage_sz)
					essential_memory[j] = 0;
				else
					essential_memory[j] -= hugepage_sz;
			}

			EAL_LOG(DEBUG,
				"Setting policy MPOL_PREFERRED for socket %d",
				node_id);
			numa_set_preferred(node_id);
		}
#endif

		hf->file_id = i;
		hf->size = hugepage_sz;
		eal_get_hugefile_path(hf->filepath, sizeof(hf->filepath),
				hpi->hugedir, hf->file_id);
		hf->filepath[sizeof(hf->filepath) - 1] = '\0';

		/* try to create hugepage file */
		fd = open(hf->filepath, O_CREAT | O_RDWR, 0600);
		if (fd < 0) {
			EAL_LOG(DEBUG, "%s(): open failed: %s", __func__,
					strerror(errno));
			goto out;
		}

		/* map the segment, and populate page tables,
		 * the kernel fills this segment with zeros. we don't care where
		 * this gets mapped - we already have contiguous memory areas
		 * ready for us to map into.
		 */
		virtaddr = mmap(NULL, hugepage_sz, PROT_READ | PROT_WRITE,
				MAP_SHARED | MAP_POPULATE, fd, 0);
		if (virtaddr == MAP_FAILED) {
			EAL_LOG(DEBUG, "%s(): mmap failed: %s", __func__,
					strerror(errno));
			close(fd);
			goto out;
		}

		hf->orig_va = virtaddr;

		/* In linux, hugetlb limitations, like cgroup, are
		 * enforced at fault time instead of mmap(), even
		 * with the option of MAP_POPULATE. Kernel will send
		 * a SIGBUS signal. To avoid to be killed, save stack
		 * environment here, if SIGBUS happens, we can jump
		 * back here.
		 */
		if (huge_wrap_sigsetjmp()) {
			EAL_LOG(DEBUG, "SIGBUS: Cannot mmap more "
				"hugepages of size %u MB",
				(unsigned int)(hugepage_sz / 0x100000));
			munmap(virtaddr, hugepage_sz);
			close(fd);
			unlink(hugepg_tbl[i].filepath);
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
			/* this page never counted against the socket quota */
			if (maxnode)
				essential_memory[node_id] =
					essential_prev;
#endif
			goto out;
		}
		/* write one word to force the fault now, under the setjmp
		 * protection above */
		*(int *)virtaddr = 0;

		/* set shared lock on the file. */
		if (flock(fd, LOCK_SH) < 0) {
			EAL_LOG(DEBUG, "%s(): Locking file failed:%s ",
				__func__, strerror(errno));
			close(fd);
			goto out;
		}

		close(fd);
	}

out:
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	if (maxnode) {
		EAL_LOG(DEBUG,
			"Restoring previous memory policy: %d", oldpolicy);
		if (oldpolicy == MPOL_DEFAULT) {
			numa_set_localalloc();
		} else if (set_mempolicy(oldpolicy, oldmask->maskp,
					 oldmask->size + 1) < 0) {
			EAL_LOG(ERR, "Failed to restore mempolicy: %s",
				strerror(errno));
			numa_set_localalloc();
		}
	}
	if (oldmask != NULL)
		numa_free_cpumask(oldmask);
#endif
	return i;
}
40999a2dd95SBruce Richardson 
/*
 * Parse /proc/self/numa_maps to get the NUMA socket ID for each huge
 * page: each line describes one mapping, starting with its virtual
 * address in hex and carrying an " N<node>=<count>" token for the node
 * backing it.
 * Returns 0 on success (or if numa_maps is unavailable, in which case
 * all pages are assumed to be on socket 0); -1 on parse failure or if
 * not every hugepage could be matched to a line.
 */
static int
find_numasocket(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
{
	int socket_id;
	char *end, *nodestr;
	unsigned i, hp_count = 0;
	uint64_t virt_addr;
	char buf[BUFSIZ];
	char hugedir_str[PATH_MAX];
	FILE *f;

	f = fopen("/proc/self/numa_maps", "r");
	if (f == NULL) {
		EAL_LOG(NOTICE, "NUMA support not available"
			" consider that all memory is in socket_id 0");
		return 0;
	}

	/* prefix that identifies our own hugepage files in the maps */
	snprintf(hugedir_str, sizeof(hugedir_str),
			"%s/%s", hpi->hugedir, eal_get_hugefile_prefix());

	/* parse numa map */
	while (fgets(buf, sizeof(buf), f) != NULL) {

		/* ignore non huge page */
		if (strstr(buf, " huge ") == NULL &&
				strstr(buf, hugedir_str) == NULL)
			continue;

		/* get zone addr (leading hex field of the line) */
		virt_addr = strtoull(buf, &end, 16);
		if (virt_addr == 0 || end == buf) {
			EAL_LOG(ERR, "%s(): error in numa_maps parsing", __func__);
			goto error;
		}

		/* get node id (socket id) */
		nodestr = strstr(buf, " N");
		if (nodestr == NULL) {
			EAL_LOG(ERR, "%s(): error in numa_maps parsing", __func__);
			goto error;
		}
		nodestr += 2;
		end = strstr(nodestr, "=");
		if (end == NULL) {
			EAL_LOG(ERR, "%s(): error in numa_maps parsing", __func__);
			goto error;
		}
		/* terminate the digits in place so strtoul() stops there */
		end[0] = '\0';
		end = NULL;

		socket_id = strtoul(nodestr, &end, 0);
		if ((nodestr[0] == '\0') || (end == NULL) || (*end != '\0')) {
			EAL_LOG(ERR, "%s(): error in numa_maps parsing", __func__);
			goto error;
		}

		/* if we find this page in our mappings, set socket_id */
		for (i = 0; i < hpi->num_pages[0]; i++) {
			void *va = (void *)(unsigned long)virt_addr;
			if (hugepg_tbl[i].orig_va == va) {
				hugepg_tbl[i].socket_id = socket_id;
				hp_count++;
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
				EAL_LOG(DEBUG,
					"Hugepage %s is on socket %d",
					hugepg_tbl[i].filepath, socket_id);
#endif
			}
		}
	}

	/* every mapped hugepage must have been found in numa_maps */
	if (hp_count < hpi->num_pages[0])
		goto error;

	fclose(f);
	return 0;

error:
	fclose(f);
	return -1;
}
49699a2dd95SBruce Richardson 
49799a2dd95SBruce Richardson static int
cmp_physaddr(const void * a,const void * b)49899a2dd95SBruce Richardson cmp_physaddr(const void *a, const void *b)
49999a2dd95SBruce Richardson {
50099a2dd95SBruce Richardson #ifndef RTE_ARCH_PPC_64
50199a2dd95SBruce Richardson 	const struct hugepage_file *p1 = a;
50299a2dd95SBruce Richardson 	const struct hugepage_file *p2 = b;
50399a2dd95SBruce Richardson #else
50499a2dd95SBruce Richardson 	/* PowerPC needs memory sorted in reverse order from x86 */
50599a2dd95SBruce Richardson 	const struct hugepage_file *p1 = b;
50699a2dd95SBruce Richardson 	const struct hugepage_file *p2 = a;
50799a2dd95SBruce Richardson #endif
50899a2dd95SBruce Richardson 	if (p1->physaddr < p2->physaddr)
50999a2dd95SBruce Richardson 		return -1;
51099a2dd95SBruce Richardson 	else if (p1->physaddr > p2->physaddr)
51199a2dd95SBruce Richardson 		return 1;
51299a2dd95SBruce Richardson 	else
51399a2dd95SBruce Richardson 		return 0;
51499a2dd95SBruce Richardson }
51599a2dd95SBruce Richardson 
51699a2dd95SBruce Richardson /*
51799a2dd95SBruce Richardson  * Uses mmap to create a shared memory area for storage of data
51899a2dd95SBruce Richardson  * Used in this file to store the hugepage file map on disk
51999a2dd95SBruce Richardson  */
52099a2dd95SBruce Richardson static void *
create_shared_memory(const char * filename,const size_t mem_size)52199a2dd95SBruce Richardson create_shared_memory(const char *filename, const size_t mem_size)
52299a2dd95SBruce Richardson {
52399a2dd95SBruce Richardson 	void *retval;
52499a2dd95SBruce Richardson 	int fd;
52599a2dd95SBruce Richardson 	const struct internal_config *internal_conf =
52699a2dd95SBruce Richardson 		eal_get_internal_configuration();
52799a2dd95SBruce Richardson 
52899a2dd95SBruce Richardson 	/* if no shared files mode is used, create anonymous memory instead */
52999a2dd95SBruce Richardson 	if (internal_conf->no_shconf) {
53099a2dd95SBruce Richardson 		retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE,
53199a2dd95SBruce Richardson 				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
53299a2dd95SBruce Richardson 		if (retval == MAP_FAILED)
53399a2dd95SBruce Richardson 			return NULL;
53499a2dd95SBruce Richardson 		return retval;
53599a2dd95SBruce Richardson 	}
53699a2dd95SBruce Richardson 
53799a2dd95SBruce Richardson 	fd = open(filename, O_CREAT | O_RDWR, 0600);
53899a2dd95SBruce Richardson 	if (fd < 0)
53999a2dd95SBruce Richardson 		return NULL;
54099a2dd95SBruce Richardson 	if (ftruncate(fd, mem_size) < 0) {
54199a2dd95SBruce Richardson 		close(fd);
54299a2dd95SBruce Richardson 		return NULL;
54399a2dd95SBruce Richardson 	}
54499a2dd95SBruce Richardson 	retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
54599a2dd95SBruce Richardson 	close(fd);
54699a2dd95SBruce Richardson 	if (retval == MAP_FAILED)
54799a2dd95SBruce Richardson 		return NULL;
54899a2dd95SBruce Richardson 	return retval;
54999a2dd95SBruce Richardson }
55099a2dd95SBruce Richardson 
55199a2dd95SBruce Richardson /*
55299a2dd95SBruce Richardson  * this copies *active* hugepages from one hugepage table to another.
55399a2dd95SBruce Richardson  * destination is typically the shared memory.
55499a2dd95SBruce Richardson  */
55599a2dd95SBruce Richardson static int
copy_hugepages_to_shared_mem(struct hugepage_file * dst,int dest_size,const struct hugepage_file * src,int src_size)55699a2dd95SBruce Richardson copy_hugepages_to_shared_mem(struct hugepage_file * dst, int dest_size,
55799a2dd95SBruce Richardson 		const struct hugepage_file * src, int src_size)
55899a2dd95SBruce Richardson {
55999a2dd95SBruce Richardson 	int src_pos, dst_pos = 0;
56099a2dd95SBruce Richardson 
56199a2dd95SBruce Richardson 	for (src_pos = 0; src_pos < src_size; src_pos++) {
56299a2dd95SBruce Richardson 		if (src[src_pos].orig_va != NULL) {
56399a2dd95SBruce Richardson 			/* error on overflow attempt */
56499a2dd95SBruce Richardson 			if (dst_pos == dest_size)
56599a2dd95SBruce Richardson 				return -1;
56699a2dd95SBruce Richardson 			memcpy(&dst[dst_pos], &src[src_pos], sizeof(struct hugepage_file));
56799a2dd95SBruce Richardson 			dst_pos++;
56899a2dd95SBruce Richardson 		}
56999a2dd95SBruce Richardson 	}
57099a2dd95SBruce Richardson 	return 0;
57199a2dd95SBruce Richardson }
57299a2dd95SBruce Richardson 
57399a2dd95SBruce Richardson static int
unlink_hugepage_files(struct hugepage_file * hugepg_tbl,unsigned num_hp_info)57499a2dd95SBruce Richardson unlink_hugepage_files(struct hugepage_file *hugepg_tbl,
57599a2dd95SBruce Richardson 		unsigned num_hp_info)
57699a2dd95SBruce Richardson {
57799a2dd95SBruce Richardson 	unsigned socket, size;
57899a2dd95SBruce Richardson 	int page, nrpages = 0;
57999a2dd95SBruce Richardson 	const struct internal_config *internal_conf =
58099a2dd95SBruce Richardson 		eal_get_internal_configuration();
58199a2dd95SBruce Richardson 
58299a2dd95SBruce Richardson 	/* get total number of hugepages */
58399a2dd95SBruce Richardson 	for (size = 0; size < num_hp_info; size++)
58499a2dd95SBruce Richardson 		for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++)
58599a2dd95SBruce Richardson 			nrpages +=
58699a2dd95SBruce Richardson 			internal_conf->hugepage_info[size].num_pages[socket];
58799a2dd95SBruce Richardson 
58899a2dd95SBruce Richardson 	for (page = 0; page < nrpages; page++) {
58999a2dd95SBruce Richardson 		struct hugepage_file *hp = &hugepg_tbl[page];
59099a2dd95SBruce Richardson 
59199a2dd95SBruce Richardson 		if (hp->orig_va != NULL && unlink(hp->filepath)) {
592*ae67895bSDavid Marchand 			EAL_LOG(WARNING, "%s(): Removing %s failed: %s",
59399a2dd95SBruce Richardson 				__func__, hp->filepath, strerror(errno));
59499a2dd95SBruce Richardson 		}
59599a2dd95SBruce Richardson 	}
59699a2dd95SBruce Richardson 	return 0;
59799a2dd95SBruce Richardson }
59899a2dd95SBruce Richardson 
/*
 * unmaps hugepages that are not going to be used. since we originally allocate
 * ALL hugepages (not just those we need), additional unmapping needs to be done.
 *
 * For each (page size, socket) pair, the first hpi[size].num_pages[socket]
 * matching entries in the table are kept; every matching entry after that is
 * munmap()ed, its backing file unlinked, and its orig_va cleared.
 * Returns 0 on success, -1 if an unlink fails.
 */
static int
unmap_unneeded_hugepages(struct hugepage_file *hugepg_tbl,
		struct hugepage_info *hpi,
		unsigned num_hp_info)
{
	unsigned socket, size;
	int page, nrpages = 0;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* get total number of hugepages */
	for (size = 0; size < num_hp_info; size++)
		for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++)
			nrpages += internal_conf->hugepage_info[size].num_pages[socket];

	for (size = 0; size < num_hp_info; size++) {
		for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) {
			/* how many pages of this size/socket we have kept so far */
			unsigned pages_found = 0;

			/* traverse until we have unmapped all the unused pages */
			for (page = 0; page < nrpages; page++) {
				struct hugepage_file *hp = &hugepg_tbl[page];

				/* find a page that matches the criteria */
				if ((hp->size == hpi[size].hugepage_sz) &&
						(hp->socket_id == (int) socket)) {

					/* if we skipped enough pages, unmap the rest */
					if (pages_found == hpi[size].num_pages[socket]) {
						uint64_t unmap_len;

						unmap_len = hp->size;

						/* get start addr and len of the remaining segment */
						munmap(hp->orig_va,
							(size_t)unmap_len);

						/* mark entry as no longer mapped */
						hp->orig_va = NULL;
						if (unlink(hp->filepath) == -1) {
							EAL_LOG(ERR, "%s(): Removing %s failed: %s",
									__func__, hp->filepath, strerror(errno));
							return -1;
						}
					} else {
						/* lock the page and skip */
						pages_found++;
					}

				} /* match page */
			} /* foreach page */
		} /* foreach socket */
	} /* foreach pagesize */

	return 0;
}
65899a2dd95SBruce Richardson 
/*
 * Remap a run of hugepages [seg_start, seg_end) from their original ad-hoc
 * mappings into VA space preallocated by a matching memseg list.
 *
 * All pages in the run are assumed to share the page size and socket of
 * hugepages[seg_start] (callers split runs on any change). Returns the number
 * of pages actually remapped — which may be less than requested when the
 * chosen memseg list has a smaller contiguous free block — or -1 on failure.
 * Callers retry with the remainder until the whole run is mapped.
 */
static int
remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *msl;
	struct rte_fbarray *arr;
	int cur_page, seg_len;
	unsigned int msl_idx;
	int ms_idx;
	uint64_t page_sz;
	size_t memseg_len;
	int socket_id;
#ifndef RTE_ARCH_64
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();
#endif
	page_sz = hugepages[seg_start].size;
	socket_id = hugepages[seg_start].socket_id;
	seg_len = seg_end - seg_start;

	EAL_LOG(DEBUG, "Attempting to map %" PRIu64 "M on socket %i",
			(seg_len * page_sz) >> 20ULL, socket_id);

	/* find free space in memseg lists */
	for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
		int free_len;
		bool empty;
		msl = &mcfg->memsegs[msl_idx];
		arr = &msl->memseg_arr;

		/* list must match both page size and socket of the run */
		if (msl->page_sz != page_sz)
			continue;
		if (msl->socket_id != socket_id)
			continue;

		/* leave space for a hole if array is not empty */
		empty = arr->count == 0;
		/* find start of the biggest contiguous block and its size */
		ms_idx = rte_fbarray_find_biggest_free(arr, 0);
		if (ms_idx < 0)
			continue;
		/* hole is 1 segment long, so at least two segments long. */
		free_len = rte_fbarray_find_contig_free(arr, ms_idx);
		if (free_len < 2)
			continue;
		/* leave some space between memsegs, they are not IOVA
		 * contiguous, so they shouldn't be VA contiguous either.
		 */
		if (!empty) {
			ms_idx++;
			free_len--;
		}

		/* we might not get all of the space we wanted */
		free_len = RTE_MIN(seg_len, free_len);
		seg_end = seg_start + free_len;
		seg_len = seg_end - seg_start;
		break;
	}
	if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
		EAL_LOG(ERR, "Could not find space for memseg. Please increase RTE_MAX_MEMSEG_PER_LIST "
			"RTE_MAX_MEMSEG_PER_TYPE and/or RTE_MAX_MEM_MB_PER_TYPE in configuration.");
		return -1;
	}

#ifdef RTE_ARCH_PPC_64
	/* for PPC64 we go through the list backwards */
	for (cur_page = seg_end - 1; cur_page >= seg_start;
			cur_page--, ms_idx++) {
#else
	for (cur_page = seg_start; cur_page < seg_end; cur_page++, ms_idx++) {
#endif
		struct hugepage_file *hfile = &hugepages[cur_page];
		struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx);
		void *addr;
		int fd;

		/* fd is intentionally kept open on success: it is stored in
		 * the segment fd table below and holds the shared flock.
		 */
		fd = open(hfile->filepath, O_RDWR);
		if (fd < 0) {
			EAL_LOG(ERR, "Could not open '%s': %s",
					hfile->filepath, strerror(errno));
			return -1;
		}
		/* set shared lock on the file. */
		if (flock(fd, LOCK_SH) < 0) {
			EAL_LOG(DEBUG, "Could not lock '%s': %s",
					hfile->filepath, strerror(errno));
			close(fd);
			return -1;
		}
		memseg_len = (size_t)page_sz;
		addr = RTE_PTR_ADD(msl->base_va, ms_idx * memseg_len);

		/* we know this address is already mmapped by memseg list, so
		 * using MAP_FIXED here is safe
		 */
		addr = mmap(addr, page_sz, PROT_READ | PROT_WRITE,
				MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, 0);
		if (addr == MAP_FAILED) {
			EAL_LOG(ERR, "Couldn't remap '%s': %s",
					hfile->filepath, strerror(errno));
			close(fd);
			return -1;
		}

		/* we have a new address, so unmap previous one */
#ifndef RTE_ARCH_64
		/* in 32-bit legacy mode, we have already unmapped the page */
		if (!internal_conf->legacy_mem)
			munmap(hfile->orig_va, page_sz);
#else
		munmap(hfile->orig_va, page_sz);
#endif

		hfile->orig_va = NULL;
		hfile->final_va = addr;

		/* rewrite physical addresses in IOVA as VA mode */
		if (rte_eal_iova_mode() == RTE_IOVA_VA)
			hfile->physaddr = (uintptr_t)addr;

		/* set up memseg data */
		ms->addr = addr;
		ms->hugepage_sz = page_sz;
		ms->len = memseg_len;
		ms->iova = hfile->physaddr;
		ms->socket_id = hfile->socket_id;
		ms->nchannel = rte_memory_get_nchannel();
		ms->nrank = rte_memory_get_nrank();

		rte_fbarray_set_used(arr, ms_idx);

		/* store segment fd internally */
		if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0)
			EAL_LOG(ERR, "Could not store segment fd: %s",
				rte_strerror(rte_errno));
	}
	EAL_LOG(DEBUG, "Allocated %" PRIu64 "M on socket %i",
			(seg_len * page_sz) >> 20, socket_id);
	return seg_len;
}
80099a2dd95SBruce Richardson 
80199a2dd95SBruce Richardson static uint64_t
80299a2dd95SBruce Richardson get_mem_amount(uint64_t page_sz, uint64_t max_mem)
80399a2dd95SBruce Richardson {
80499a2dd95SBruce Richardson 	uint64_t area_sz, max_pages;
80599a2dd95SBruce Richardson 
80699a2dd95SBruce Richardson 	/* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_MB_PER_LIST */
80799a2dd95SBruce Richardson 	max_pages = RTE_MAX_MEMSEG_PER_LIST;
80899a2dd95SBruce Richardson 	max_mem = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20, max_mem);
80999a2dd95SBruce Richardson 
81099a2dd95SBruce Richardson 	area_sz = RTE_MIN(page_sz * max_pages, max_mem);
81199a2dd95SBruce Richardson 
81299a2dd95SBruce Richardson 	/* make sure the list isn't smaller than the page size */
81399a2dd95SBruce Richardson 	area_sz = RTE_MAX(area_sz, page_sz);
81499a2dd95SBruce Richardson 
81599a2dd95SBruce Richardson 	return RTE_ALIGN(area_sz, page_sz);
81699a2dd95SBruce Richardson }
81799a2dd95SBruce Richardson 
81899a2dd95SBruce Richardson static int
81999a2dd95SBruce Richardson memseg_list_free(struct rte_memseg_list *msl)
82099a2dd95SBruce Richardson {
82199a2dd95SBruce Richardson 	if (rte_fbarray_destroy(&msl->memseg_arr)) {
822*ae67895bSDavid Marchand 		EAL_LOG(ERR, "Cannot destroy memseg list");
82399a2dd95SBruce Richardson 		return -1;
82499a2dd95SBruce Richardson 	}
82599a2dd95SBruce Richardson 	memset(msl, 0, sizeof(*msl));
82699a2dd95SBruce Richardson 	return 0;
82799a2dd95SBruce Richardson }
82899a2dd95SBruce Richardson 
/*
 * Our VA space is not preallocated yet, so preallocate it here. We need to know
 * how many segments there are in order to map all pages into one address space,
 * and leave appropriate holes between segments so that rte_malloc does not
 * concatenate them into one big segment.
 *
 * we also need to unmap original pages to free up address space.
 *
 * For every (page size, socket) pair this counts the PA-contiguous runs in
 * the hugepage table, then reserves one memseg list sized for those pages
 * plus one separator page between runs. Returns 0 on success, -1 on failure.
 */
static int __rte_unused
prealloc_segments(struct hugepage_file *hugepages, int n_pages)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int cur_page, seg_start_page, end_seg, new_memseg;
	unsigned int hpi_idx, socket, i;
	int n_contig_segs, n_segs;
	int msl_idx;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* before we preallocate segments, we need to free up our VA space.
	 * we're not removing files, and we already have information about
	 * PA-contiguousness, so it is safe to unmap everything.
	 */
	for (cur_page = 0; cur_page < n_pages; cur_page++) {
		struct hugepage_file *hpi = &hugepages[cur_page];
		munmap(hpi->orig_va, hpi->size);
		hpi->orig_va = NULL;
	}

	/* we cannot know how many page sizes and sockets we have discovered, so
	 * loop over all of them
	 */
	for (hpi_idx = 0; hpi_idx < internal_conf->num_hugepage_sizes;
			hpi_idx++) {
		uint64_t page_sz =
			internal_conf->hugepage_info[hpi_idx].hugepage_sz;

		for (i = 0; i < rte_socket_count(); i++) {
			struct rte_memseg_list *msl;

			socket = rte_socket_id_by_idx(i);
			/* n_contig_segs counts runs, n_segs counts pages,
			 * seg_start_page == -1 means "not inside a run"
			 */
			n_contig_segs = 0;
			n_segs = 0;
			seg_start_page = -1;

			for (cur_page = 0; cur_page < n_pages; cur_page++) {
				struct hugepage_file *prev, *cur;
				int prev_seg_start_page = -1;

				cur = &hugepages[cur_page];
				prev = cur_page == 0 ? NULL :
						&hugepages[cur_page - 1];

				new_memseg = 0;
				end_seg = 0;

				/* a zero-size, wrong-socket or wrong-size entry
				 * ends the current run; a PA discontinuity
				 * starts a new one
				 */
				if (cur->size == 0)
					end_seg = 1;
				else if (cur->socket_id != (int) socket)
					end_seg = 1;
				else if (cur->size != page_sz)
					end_seg = 1;
				else if (cur_page == 0)
					new_memseg = 1;
#ifdef RTE_ARCH_PPC_64
				/* On PPC64 architecture, the mmap always start
				 * from higher address to lower address. Here,
				 * physical addresses are in descending order.
				 */
				else if ((prev->physaddr - cur->physaddr) !=
						cur->size)
					new_memseg = 1;
#else
				else if ((cur->physaddr - prev->physaddr) !=
						cur->size)
					new_memseg = 1;
#endif
				if (new_memseg) {
					/* if we're already inside a segment,
					 * new segment means end of current one
					 */
					if (seg_start_page != -1) {
						end_seg = 1;
						prev_seg_start_page =
								seg_start_page;
					}
					seg_start_page = cur_page;
				}

				if (end_seg) {
					if (prev_seg_start_page != -1) {
						/* we've found a new segment */
						n_contig_segs++;
						n_segs += cur_page -
							prev_seg_start_page;
					} else if (seg_start_page != -1) {
						/* we didn't find new segment,
						 * but did end current one
						 */
						n_contig_segs++;
						n_segs += cur_page -
								seg_start_page;
						seg_start_page = -1;
						continue;
					} else {
						/* we're skipping this page */
						continue;
					}
				}
				/* segment continues */
			}
			/* check if we missed last segment */
			if (seg_start_page != -1) {
				n_contig_segs++;
				n_segs += cur_page - seg_start_page;
			}

			/* if no segments were found, do not preallocate */
			if (n_segs == 0)
				continue;

			/* we now have total number of pages that we will
			 * allocate for this segment list. add separator pages
			 * to the total count, and preallocate VA space.
			 */
			n_segs += n_contig_segs - 1;

			/* now, preallocate VA space for these segments */

			/* first, find suitable memseg list for this */
			for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS;
					msl_idx++) {
				msl = &mcfg->memsegs[msl_idx];

				if (msl->base_va != NULL)
					continue;
				break;
			}
			if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
				EAL_LOG(ERR, "Not enough space in memseg lists, please increase RTE_MAX_MEMSEG_LISTS");
				return -1;
			}

			/* now, allocate fbarray itself */
			if (eal_memseg_list_init(msl, page_sz, n_segs,
					socket, msl_idx, true) < 0)
				return -1;

			/* finally, allocate VA space */
			if (eal_memseg_list_alloc(msl, 0) < 0) {
				EAL_LOG(ERR, "Cannot preallocate 0x%"PRIx64"kB hugepages",
					page_sz >> 10);
				return -1;
			}
		}
	}
	return 0;
}
98799a2dd95SBruce Richardson 
/*
 * We cannot reallocate memseg lists on the fly because PPC64 stores pages
 * backwards, therefore we have to process the entire memseg first before
 * remapping it into memseg list VA space.
 *
 * Walks the sorted hugepage table, splitting it into runs wherever socket,
 * page size or physical contiguity changes, and hands each run to
 * remap_segment(). Returns 0 on success, -1 on failure.
 */
static int
remap_needed_hugepages(struct hugepage_file *hugepages, int n_pages)
{
	int cur_page, seg_start_page, new_memseg, ret;

	seg_start_page = 0;
	for (cur_page = 0; cur_page < n_pages; cur_page++) {
		struct hugepage_file *prev, *cur;

		new_memseg = 0;

		cur = &hugepages[cur_page];
		prev = cur_page == 0 ? NULL : &hugepages[cur_page - 1];

		/* if size is zero, no more pages left */
		if (cur->size == 0)
			break;

		if (cur_page == 0)
			new_memseg = 1;
		else if (cur->socket_id != prev->socket_id)
			new_memseg = 1;
		else if (cur->size != prev->size)
			new_memseg = 1;
#ifdef RTE_ARCH_PPC_64
		/* On PPC64 architecture, the mmap always start from higher
		 * address to lower address. Here, physical addresses are in
		 * descending order.
		 */
		else if ((prev->physaddr - cur->physaddr) != cur->size)
			new_memseg = 1;
#else
		else if ((cur->physaddr - prev->physaddr) != cur->size)
			new_memseg = 1;
#endif

		if (new_memseg) {
			/* if this isn't the first time, remap segment */
			if (cur_page != 0) {
				int n_remapped = 0;
				int n_needed = cur_page - seg_start_page;
				/* remap_segment() returns the number of pages
				 * it actually mapped, which may be fewer than
				 * requested; keep calling until the run is done
				 */
				while (n_remapped < n_needed) {
					ret = remap_segment(hugepages, seg_start_page,
							cur_page);
					if (ret < 0)
						return -1;
					n_remapped += ret;
					seg_start_page += ret;
				}
			}
			/* remember where we started */
			seg_start_page = cur_page;
		}
		/* continuation of previous memseg */
	}
	/* we were stopped, but we didn't remap the last segment, do it now */
	if (cur_page != 0) {
		int n_remapped = 0;
		int n_needed = cur_page - seg_start_page;
		while (n_remapped < n_needed) {
			ret = remap_segment(hugepages, seg_start_page,
					cur_page);
			if (ret < 0)
				return -1;
			n_remapped += ret;
			seg_start_page += ret;
		}
	}
	return 0;
}
106399a2dd95SBruce Richardson 
106499a2dd95SBruce Richardson static inline size_t
106599a2dd95SBruce Richardson eal_get_hugepage_mem_size(void)
106699a2dd95SBruce Richardson {
106799a2dd95SBruce Richardson 	uint64_t size = 0;
106899a2dd95SBruce Richardson 	unsigned i, j;
106999a2dd95SBruce Richardson 	struct internal_config *internal_conf =
107099a2dd95SBruce Richardson 		eal_get_internal_configuration();
107199a2dd95SBruce Richardson 
107299a2dd95SBruce Richardson 	for (i = 0; i < internal_conf->num_hugepage_sizes; i++) {
107399a2dd95SBruce Richardson 		struct hugepage_info *hpi = &internal_conf->hugepage_info[i];
107499a2dd95SBruce Richardson 		if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0) {
107599a2dd95SBruce Richardson 			for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
107699a2dd95SBruce Richardson 				size += hpi->hugepage_sz * hpi->num_pages[j];
107799a2dd95SBruce Richardson 			}
107899a2dd95SBruce Richardson 		}
107999a2dd95SBruce Richardson 	}
108099a2dd95SBruce Richardson 
108199a2dd95SBruce Richardson 	return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX;
108299a2dd95SBruce Richardson }
108399a2dd95SBruce Richardson 
/* previous SIGBUS disposition saved by huge_register_sigbus(); restored by
 * huge_recover_sigbus() when huge_need_recover is set
 */
static struct sigaction huge_action_old;
static int huge_need_recover;
108699a2dd95SBruce Richardson 
108799a2dd95SBruce Richardson static void
108899a2dd95SBruce Richardson huge_register_sigbus(void)
108999a2dd95SBruce Richardson {
109099a2dd95SBruce Richardson 	sigset_t mask;
109199a2dd95SBruce Richardson 	struct sigaction action;
109299a2dd95SBruce Richardson 
109399a2dd95SBruce Richardson 	sigemptyset(&mask);
109499a2dd95SBruce Richardson 	sigaddset(&mask, SIGBUS);
109599a2dd95SBruce Richardson 	action.sa_flags = 0;
109699a2dd95SBruce Richardson 	action.sa_mask = mask;
109799a2dd95SBruce Richardson 	action.sa_handler = huge_sigbus_handler;
109899a2dd95SBruce Richardson 
109999a2dd95SBruce Richardson 	huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
110099a2dd95SBruce Richardson }
110199a2dd95SBruce Richardson 
110299a2dd95SBruce Richardson static void
110399a2dd95SBruce Richardson huge_recover_sigbus(void)
110499a2dd95SBruce Richardson {
110599a2dd95SBruce Richardson 	if (huge_need_recover) {
110699a2dd95SBruce Richardson 		sigaction(SIGBUS, &huge_action_old, NULL);
110799a2dd95SBruce Richardson 		huge_need_recover = 0;
110899a2dd95SBruce Richardson 	}
110999a2dd95SBruce Richardson }
111099a2dd95SBruce Richardson 
111199a2dd95SBruce Richardson /*
111299a2dd95SBruce Richardson  * Prepare physical memory mapping: fill configuration structure with
111399a2dd95SBruce Richardson  * these infos, return 0 on success.
111499a2dd95SBruce Richardson  *  1. map N huge pages in separate files in hugetlbfs
111599a2dd95SBruce Richardson  *  2. find associated physical addr
111699a2dd95SBruce Richardson  *  3. find associated NUMA socket ID
111799a2dd95SBruce Richardson  *  4. sort all huge pages by physical address
111899a2dd95SBruce Richardson  *  5. remap these N huge pages in the correct order
111999a2dd95SBruce Richardson  *  6. unmap the first mapping
112099a2dd95SBruce Richardson  *  7. fill memsegs in configuration with contiguous zones
112199a2dd95SBruce Richardson  */
112299a2dd95SBruce Richardson static int
112399a2dd95SBruce Richardson eal_legacy_hugepage_init(void)
112499a2dd95SBruce Richardson {
112599a2dd95SBruce Richardson 	struct rte_mem_config *mcfg;
112699a2dd95SBruce Richardson 	struct hugepage_file *hugepage = NULL, *tmp_hp = NULL;
112799a2dd95SBruce Richardson 	struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
112899a2dd95SBruce Richardson 	struct internal_config *internal_conf =
112999a2dd95SBruce Richardson 		eal_get_internal_configuration();
113099a2dd95SBruce Richardson 
113199a2dd95SBruce Richardson 	uint64_t memory[RTE_MAX_NUMA_NODES];
113299a2dd95SBruce Richardson 
113399a2dd95SBruce Richardson 	unsigned hp_offset;
113499a2dd95SBruce Richardson 	int i, j;
113599a2dd95SBruce Richardson 	int nr_hugefiles, nr_hugepages = 0;
113699a2dd95SBruce Richardson 	void *addr;
113799a2dd95SBruce Richardson 
113899a2dd95SBruce Richardson 	memset(used_hp, 0, sizeof(used_hp));
113999a2dd95SBruce Richardson 
114099a2dd95SBruce Richardson 	/* get pointer to global configuration */
114199a2dd95SBruce Richardson 	mcfg = rte_eal_get_configuration()->mem_config;
114299a2dd95SBruce Richardson 
114399a2dd95SBruce Richardson 	/* hugetlbfs can be disabled */
114499a2dd95SBruce Richardson 	if (internal_conf->no_hugetlbfs) {
114599a2dd95SBruce Richardson 		void *prealloc_addr;
114699a2dd95SBruce Richardson 		size_t mem_sz;
114799a2dd95SBruce Richardson 		struct rte_memseg_list *msl;
114899a2dd95SBruce Richardson 		int n_segs, fd, flags;
114999a2dd95SBruce Richardson #ifdef MEMFD_SUPPORTED
115099a2dd95SBruce Richardson 		int memfd;
115199a2dd95SBruce Richardson #endif
115299a2dd95SBruce Richardson 		uint64_t page_sz;
115399a2dd95SBruce Richardson 
115499a2dd95SBruce Richardson 		/* nohuge mode is legacy mode */
115599a2dd95SBruce Richardson 		internal_conf->legacy_mem = 1;
115699a2dd95SBruce Richardson 
115799a2dd95SBruce Richardson 		/* nohuge mode is single-file segments mode */
115899a2dd95SBruce Richardson 		internal_conf->single_file_segments = 1;
115999a2dd95SBruce Richardson 
116099a2dd95SBruce Richardson 		/* create a memseg list */
116199a2dd95SBruce Richardson 		msl = &mcfg->memsegs[0];
116299a2dd95SBruce Richardson 
116399a2dd95SBruce Richardson 		mem_sz = internal_conf->memory;
116499a2dd95SBruce Richardson 		page_sz = RTE_PGSIZE_4K;
116599a2dd95SBruce Richardson 		n_segs = mem_sz / page_sz;
116699a2dd95SBruce Richardson 
116799a2dd95SBruce Richardson 		if (eal_memseg_list_init_named(
116899a2dd95SBruce Richardson 				msl, "nohugemem", page_sz, n_segs, 0, true)) {
116999a2dd95SBruce Richardson 			return -1;
117099a2dd95SBruce Richardson 		}
117199a2dd95SBruce Richardson 
117299a2dd95SBruce Richardson 		/* set up parameters for anonymous mmap */
117399a2dd95SBruce Richardson 		fd = -1;
117499a2dd95SBruce Richardson 		flags = MAP_PRIVATE | MAP_ANONYMOUS;
117599a2dd95SBruce Richardson 
117699a2dd95SBruce Richardson #ifdef MEMFD_SUPPORTED
117799a2dd95SBruce Richardson 		/* create a memfd and store it in the segment fd table */
117899a2dd95SBruce Richardson 		memfd = memfd_create("nohuge", 0);
117999a2dd95SBruce Richardson 		if (memfd < 0) {
1180*ae67895bSDavid Marchand 			EAL_LOG(DEBUG, "Cannot create memfd: %s",
118199a2dd95SBruce Richardson 					strerror(errno));
1182*ae67895bSDavid Marchand 			EAL_LOG(DEBUG, "Falling back to anonymous map");
118399a2dd95SBruce Richardson 		} else {
118499a2dd95SBruce Richardson 			/* we got an fd - now resize it */
118599a2dd95SBruce Richardson 			if (ftruncate(memfd, internal_conf->memory) < 0) {
1186*ae67895bSDavid Marchand 				EAL_LOG(ERR, "Cannot resize memfd: %s",
118799a2dd95SBruce Richardson 						strerror(errno));
1188*ae67895bSDavid Marchand 				EAL_LOG(ERR, "Falling back to anonymous map");
118999a2dd95SBruce Richardson 				close(memfd);
119099a2dd95SBruce Richardson 			} else {
119199a2dd95SBruce Richardson 				/* creating memfd-backed file was successful.
119299a2dd95SBruce Richardson 				 * we want changes to memfd to be visible to
119399a2dd95SBruce Richardson 				 * other processes (such as vhost backend), so
119499a2dd95SBruce Richardson 				 * map it as shared memory.
119599a2dd95SBruce Richardson 				 */
1196*ae67895bSDavid Marchand 				EAL_LOG(DEBUG, "Using memfd for anonymous memory");
119799a2dd95SBruce Richardson 				fd = memfd;
119899a2dd95SBruce Richardson 				flags = MAP_SHARED;
119999a2dd95SBruce Richardson 			}
120099a2dd95SBruce Richardson 		}
120199a2dd95SBruce Richardson #endif
120299a2dd95SBruce Richardson 		/* preallocate address space for the memory, so that it can be
120399a2dd95SBruce Richardson 		 * fit into the DMA mask.
120499a2dd95SBruce Richardson 		 */
120599a2dd95SBruce Richardson 		if (eal_memseg_list_alloc(msl, 0)) {
1206*ae67895bSDavid Marchand 			EAL_LOG(ERR, "Cannot preallocate VA space for hugepage memory");
120799a2dd95SBruce Richardson 			return -1;
120899a2dd95SBruce Richardson 		}
120999a2dd95SBruce Richardson 
121099a2dd95SBruce Richardson 		prealloc_addr = msl->base_va;
121199a2dd95SBruce Richardson 		addr = mmap(prealloc_addr, mem_sz, PROT_READ | PROT_WRITE,
121299a2dd95SBruce Richardson 				flags | MAP_FIXED, fd, 0);
121399a2dd95SBruce Richardson 		if (addr == MAP_FAILED || addr != prealloc_addr) {
1214*ae67895bSDavid Marchand 			EAL_LOG(ERR, "%s: mmap() failed: %s", __func__,
121599a2dd95SBruce Richardson 					strerror(errno));
121699a2dd95SBruce Richardson 			munmap(prealloc_addr, mem_sz);
121799a2dd95SBruce Richardson 			return -1;
121899a2dd95SBruce Richardson 		}
121999a2dd95SBruce Richardson 
122099a2dd95SBruce Richardson 		/* we're in single-file segments mode, so only the segment list
122199a2dd95SBruce Richardson 		 * fd needs to be set up.
122299a2dd95SBruce Richardson 		 */
122399a2dd95SBruce Richardson 		if (fd != -1) {
122499a2dd95SBruce Richardson 			if (eal_memalloc_set_seg_list_fd(0, fd) < 0) {
1225*ae67895bSDavid Marchand 				EAL_LOG(ERR, "Cannot set up segment list fd");
122699a2dd95SBruce Richardson 				/* not a serious error, proceed */
122799a2dd95SBruce Richardson 			}
122899a2dd95SBruce Richardson 		}
122999a2dd95SBruce Richardson 
123099a2dd95SBruce Richardson 		eal_memseg_list_populate(msl, addr, n_segs);
123199a2dd95SBruce Richardson 
123299a2dd95SBruce Richardson 		if (mcfg->dma_maskbits &&
123399a2dd95SBruce Richardson 		    rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) {
1234*ae67895bSDavid Marchand 			EAL_LOG(ERR,
1235*ae67895bSDavid Marchand 				"%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask.",
123699a2dd95SBruce Richardson 				__func__);
123799a2dd95SBruce Richardson 			if (rte_eal_iova_mode() == RTE_IOVA_VA &&
123899a2dd95SBruce Richardson 			    rte_eal_using_phys_addrs())
1239*ae67895bSDavid Marchand 				EAL_LOG(ERR,
1240*ae67895bSDavid Marchand 					"%s(): Please try initializing EAL with --iova-mode=pa parameter.",
124199a2dd95SBruce Richardson 					__func__);
124299a2dd95SBruce Richardson 			goto fail;
124399a2dd95SBruce Richardson 		}
124499a2dd95SBruce Richardson 		return 0;
124599a2dd95SBruce Richardson 	}
124699a2dd95SBruce Richardson 
124799a2dd95SBruce Richardson 	/* calculate total number of hugepages available. at this point we haven't
124899a2dd95SBruce Richardson 	 * yet started sorting them so they all are on socket 0 */
124999a2dd95SBruce Richardson 	for (i = 0; i < (int) internal_conf->num_hugepage_sizes; i++) {
125099a2dd95SBruce Richardson 		/* meanwhile, also initialize used_hp hugepage sizes in used_hp */
125199a2dd95SBruce Richardson 		used_hp[i].hugepage_sz = internal_conf->hugepage_info[i].hugepage_sz;
125299a2dd95SBruce Richardson 
125399a2dd95SBruce Richardson 		nr_hugepages += internal_conf->hugepage_info[i].num_pages[0];
125499a2dd95SBruce Richardson 	}
125599a2dd95SBruce Richardson 
125699a2dd95SBruce Richardson 	/*
125799a2dd95SBruce Richardson 	 * allocate a memory area for hugepage table.
125899a2dd95SBruce Richardson 	 * this isn't shared memory yet. due to the fact that we need some
125999a2dd95SBruce Richardson 	 * processing done on these pages, shared memory will be created
126099a2dd95SBruce Richardson 	 * at a later stage.
126199a2dd95SBruce Richardson 	 */
126299a2dd95SBruce Richardson 	tmp_hp = malloc(nr_hugepages * sizeof(struct hugepage_file));
126399a2dd95SBruce Richardson 	if (tmp_hp == NULL)
126499a2dd95SBruce Richardson 		goto fail;
126599a2dd95SBruce Richardson 
126699a2dd95SBruce Richardson 	memset(tmp_hp, 0, nr_hugepages * sizeof(struct hugepage_file));
126799a2dd95SBruce Richardson 
126899a2dd95SBruce Richardson 	hp_offset = 0; /* where we start the current page size entries */
126999a2dd95SBruce Richardson 
127099a2dd95SBruce Richardson 	huge_register_sigbus();
127199a2dd95SBruce Richardson 
127299a2dd95SBruce Richardson 	/* make a copy of socket_mem, needed for balanced allocation. */
127399a2dd95SBruce Richardson 	for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
127499a2dd95SBruce Richardson 		memory[i] = internal_conf->socket_mem[i];
127599a2dd95SBruce Richardson 
127699a2dd95SBruce Richardson 	/* map all hugepages and sort them */
127799a2dd95SBruce Richardson 	for (i = 0; i < (int)internal_conf->num_hugepage_sizes; i++) {
127899a2dd95SBruce Richardson 		unsigned pages_old, pages_new;
127999a2dd95SBruce Richardson 		struct hugepage_info *hpi;
128099a2dd95SBruce Richardson 
128199a2dd95SBruce Richardson 		/*
128299a2dd95SBruce Richardson 		 * we don't yet mark hugepages as used at this stage, so
128399a2dd95SBruce Richardson 		 * we just map all hugepages available to the system
128499a2dd95SBruce Richardson 		 * all hugepages are still located on socket 0
128599a2dd95SBruce Richardson 		 */
128699a2dd95SBruce Richardson 		hpi = &internal_conf->hugepage_info[i];
128799a2dd95SBruce Richardson 
128899a2dd95SBruce Richardson 		if (hpi->num_pages[0] == 0)
128999a2dd95SBruce Richardson 			continue;
129099a2dd95SBruce Richardson 
129199a2dd95SBruce Richardson 		/* map all hugepages available */
129299a2dd95SBruce Richardson 		pages_old = hpi->num_pages[0];
129399a2dd95SBruce Richardson 		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, memory);
129499a2dd95SBruce Richardson 		if (pages_new < pages_old) {
1295*ae67895bSDavid Marchand 			EAL_LOG(DEBUG,
1296*ae67895bSDavid Marchand 				"%d not %d hugepages of size %u MB allocated",
129799a2dd95SBruce Richardson 				pages_new, pages_old,
129899a2dd95SBruce Richardson 				(unsigned)(hpi->hugepage_sz / 0x100000));
129999a2dd95SBruce Richardson 
130099a2dd95SBruce Richardson 			int pages = pages_old - pages_new;
130199a2dd95SBruce Richardson 
130299a2dd95SBruce Richardson 			nr_hugepages -= pages;
130399a2dd95SBruce Richardson 			hpi->num_pages[0] = pages_new;
130499a2dd95SBruce Richardson 			if (pages_new == 0)
130599a2dd95SBruce Richardson 				continue;
130699a2dd95SBruce Richardson 		}
130799a2dd95SBruce Richardson 
130899a2dd95SBruce Richardson 		if (rte_eal_using_phys_addrs() &&
130999a2dd95SBruce Richardson 				rte_eal_iova_mode() != RTE_IOVA_VA) {
131099a2dd95SBruce Richardson 			/* find physical addresses for each hugepage */
131199a2dd95SBruce Richardson 			if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
1312*ae67895bSDavid Marchand 				EAL_LOG(DEBUG, "Failed to find phys addr "
1313*ae67895bSDavid Marchand 					"for %u MB pages",
131499a2dd95SBruce Richardson 					(unsigned int)(hpi->hugepage_sz / 0x100000));
131599a2dd95SBruce Richardson 				goto fail;
131699a2dd95SBruce Richardson 			}
131799a2dd95SBruce Richardson 		} else {
131899a2dd95SBruce Richardson 			/* set physical addresses for each hugepage */
131999a2dd95SBruce Richardson 			if (set_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
1320*ae67895bSDavid Marchand 				EAL_LOG(DEBUG, "Failed to set phys addr "
1321*ae67895bSDavid Marchand 					"for %u MB pages",
132299a2dd95SBruce Richardson 					(unsigned int)(hpi->hugepage_sz / 0x100000));
132399a2dd95SBruce Richardson 				goto fail;
132499a2dd95SBruce Richardson 			}
132599a2dd95SBruce Richardson 		}
132699a2dd95SBruce Richardson 
132799a2dd95SBruce Richardson 		if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){
1328*ae67895bSDavid Marchand 			EAL_LOG(DEBUG, "Failed to find NUMA socket for %u MB pages",
132999a2dd95SBruce Richardson 					(unsigned)(hpi->hugepage_sz / 0x100000));
133099a2dd95SBruce Richardson 			goto fail;
133199a2dd95SBruce Richardson 		}
133299a2dd95SBruce Richardson 
133399a2dd95SBruce Richardson 		qsort(&tmp_hp[hp_offset], hpi->num_pages[0],
133499a2dd95SBruce Richardson 		      sizeof(struct hugepage_file), cmp_physaddr);
133599a2dd95SBruce Richardson 
133699a2dd95SBruce Richardson 		/* we have processed a num of hugepages of this size, so inc offset */
133799a2dd95SBruce Richardson 		hp_offset += hpi->num_pages[0];
133899a2dd95SBruce Richardson 	}
133999a2dd95SBruce Richardson 
134099a2dd95SBruce Richardson 	huge_recover_sigbus();
134199a2dd95SBruce Richardson 
134299a2dd95SBruce Richardson 	if (internal_conf->memory == 0 && internal_conf->force_sockets == 0)
134399a2dd95SBruce Richardson 		internal_conf->memory = eal_get_hugepage_mem_size();
134499a2dd95SBruce Richardson 
134599a2dd95SBruce Richardson 	nr_hugefiles = nr_hugepages;
134699a2dd95SBruce Richardson 
134799a2dd95SBruce Richardson 
134899a2dd95SBruce Richardson 	/* clean out the numbers of pages */
134999a2dd95SBruce Richardson 	for (i = 0; i < (int) internal_conf->num_hugepage_sizes; i++)
135099a2dd95SBruce Richardson 		for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
135199a2dd95SBruce Richardson 			internal_conf->hugepage_info[i].num_pages[j] = 0;
135299a2dd95SBruce Richardson 
135399a2dd95SBruce Richardson 	/* get hugepages for each socket */
135499a2dd95SBruce Richardson 	for (i = 0; i < nr_hugefiles; i++) {
135599a2dd95SBruce Richardson 		int socket = tmp_hp[i].socket_id;
135699a2dd95SBruce Richardson 
135799a2dd95SBruce Richardson 		/* find a hugepage info with right size and increment num_pages */
135899a2dd95SBruce Richardson 		const int nb_hpsizes = RTE_MIN(MAX_HUGEPAGE_SIZES,
135999a2dd95SBruce Richardson 				(int)internal_conf->num_hugepage_sizes);
136099a2dd95SBruce Richardson 		for (j = 0; j < nb_hpsizes; j++) {
136199a2dd95SBruce Richardson 			if (tmp_hp[i].size ==
136299a2dd95SBruce Richardson 					internal_conf->hugepage_info[j].hugepage_sz) {
136399a2dd95SBruce Richardson 				internal_conf->hugepage_info[j].num_pages[socket]++;
136499a2dd95SBruce Richardson 			}
136599a2dd95SBruce Richardson 		}
136699a2dd95SBruce Richardson 	}
136799a2dd95SBruce Richardson 
136899a2dd95SBruce Richardson 	/* make a copy of socket_mem, needed for number of pages calculation */
136999a2dd95SBruce Richardson 	for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
137099a2dd95SBruce Richardson 		memory[i] = internal_conf->socket_mem[i];
137199a2dd95SBruce Richardson 
137299a2dd95SBruce Richardson 	/* calculate final number of pages */
137399a2dd95SBruce Richardson 	nr_hugepages = eal_dynmem_calc_num_pages_per_socket(memory,
137499a2dd95SBruce Richardson 			internal_conf->hugepage_info, used_hp,
137599a2dd95SBruce Richardson 			internal_conf->num_hugepage_sizes);
137699a2dd95SBruce Richardson 
137799a2dd95SBruce Richardson 	/* error if not enough memory available */
137899a2dd95SBruce Richardson 	if (nr_hugepages < 0)
137999a2dd95SBruce Richardson 		goto fail;
138099a2dd95SBruce Richardson 
138199a2dd95SBruce Richardson 	/* reporting in! */
138299a2dd95SBruce Richardson 	for (i = 0; i < (int) internal_conf->num_hugepage_sizes; i++) {
138399a2dd95SBruce Richardson 		for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
138499a2dd95SBruce Richardson 			if (used_hp[i].num_pages[j] > 0) {
1385*ae67895bSDavid Marchand 				EAL_LOG(DEBUG,
138699a2dd95SBruce Richardson 					"Requesting %u pages of size %uMB"
1387*ae67895bSDavid Marchand 					" from socket %i",
138899a2dd95SBruce Richardson 					used_hp[i].num_pages[j],
138999a2dd95SBruce Richardson 					(unsigned)
139099a2dd95SBruce Richardson 					(used_hp[i].hugepage_sz / 0x100000),
139199a2dd95SBruce Richardson 					j);
139299a2dd95SBruce Richardson 			}
139399a2dd95SBruce Richardson 		}
139499a2dd95SBruce Richardson 	}
139599a2dd95SBruce Richardson 
139699a2dd95SBruce Richardson 	/* create shared memory */
139799a2dd95SBruce Richardson 	hugepage = create_shared_memory(eal_hugepage_data_path(),
139899a2dd95SBruce Richardson 			nr_hugefiles * sizeof(struct hugepage_file));
139999a2dd95SBruce Richardson 
140099a2dd95SBruce Richardson 	if (hugepage == NULL) {
1401*ae67895bSDavid Marchand 		EAL_LOG(ERR, "Failed to create shared memory!");
140299a2dd95SBruce Richardson 		goto fail;
140399a2dd95SBruce Richardson 	}
140499a2dd95SBruce Richardson 	memset(hugepage, 0, nr_hugefiles * sizeof(struct hugepage_file));
140599a2dd95SBruce Richardson 
140699a2dd95SBruce Richardson 	/*
140799a2dd95SBruce Richardson 	 * unmap pages that we won't need (looks at used_hp).
140899a2dd95SBruce Richardson 	 * also, sets final_va to NULL on pages that were unmapped.
140999a2dd95SBruce Richardson 	 */
141099a2dd95SBruce Richardson 	if (unmap_unneeded_hugepages(tmp_hp, used_hp,
141199a2dd95SBruce Richardson 			internal_conf->num_hugepage_sizes) < 0) {
1412*ae67895bSDavid Marchand 		EAL_LOG(ERR, "Unmapping and locking hugepages failed!");
141399a2dd95SBruce Richardson 		goto fail;
141499a2dd95SBruce Richardson 	}
141599a2dd95SBruce Richardson 
141699a2dd95SBruce Richardson 	/*
141799a2dd95SBruce Richardson 	 * copy stuff from malloc'd hugepage* to the actual shared memory.
141899a2dd95SBruce Richardson 	 * this procedure only copies those hugepages that have orig_va
141999a2dd95SBruce Richardson 	 * not NULL. has overflow protection.
142099a2dd95SBruce Richardson 	 */
142199a2dd95SBruce Richardson 	if (copy_hugepages_to_shared_mem(hugepage, nr_hugefiles,
142299a2dd95SBruce Richardson 			tmp_hp, nr_hugefiles) < 0) {
1423*ae67895bSDavid Marchand 		EAL_LOG(ERR, "Copying tables to shared memory failed!");
142499a2dd95SBruce Richardson 		goto fail;
142599a2dd95SBruce Richardson 	}
142699a2dd95SBruce Richardson 
142799a2dd95SBruce Richardson #ifndef RTE_ARCH_64
142899a2dd95SBruce Richardson 	/* for legacy 32-bit mode, we did not preallocate VA space, so do it */
142999a2dd95SBruce Richardson 	if (internal_conf->legacy_mem &&
143099a2dd95SBruce Richardson 			prealloc_segments(hugepage, nr_hugefiles)) {
1431*ae67895bSDavid Marchand 		EAL_LOG(ERR, "Could not preallocate VA space for hugepages");
143299a2dd95SBruce Richardson 		goto fail;
143399a2dd95SBruce Richardson 	}
143499a2dd95SBruce Richardson #endif
143599a2dd95SBruce Richardson 
143699a2dd95SBruce Richardson 	/* remap all pages we do need into memseg list VA space, so that those
143799a2dd95SBruce Richardson 	 * pages become first-class citizens in DPDK memory subsystem
143899a2dd95SBruce Richardson 	 */
143999a2dd95SBruce Richardson 	if (remap_needed_hugepages(hugepage, nr_hugefiles)) {
1440*ae67895bSDavid Marchand 		EAL_LOG(ERR, "Couldn't remap hugepage files into memseg lists");
144199a2dd95SBruce Richardson 		goto fail;
144299a2dd95SBruce Richardson 	}
144399a2dd95SBruce Richardson 
144499a2dd95SBruce Richardson 	/* free the hugepage backing files */
144552d7d91eSDmitry Kozlyuk 	if (internal_conf->hugepage_file.unlink_before_mapping &&
144699a2dd95SBruce Richardson 		unlink_hugepage_files(tmp_hp, internal_conf->num_hugepage_sizes) < 0) {
1447*ae67895bSDavid Marchand 		EAL_LOG(ERR, "Unlinking hugepage files failed!");
144899a2dd95SBruce Richardson 		goto fail;
144999a2dd95SBruce Richardson 	}
145099a2dd95SBruce Richardson 
145199a2dd95SBruce Richardson 	/* free the temporary hugepage table */
145299a2dd95SBruce Richardson 	free(tmp_hp);
145399a2dd95SBruce Richardson 	tmp_hp = NULL;
145499a2dd95SBruce Richardson 
145599a2dd95SBruce Richardson 	munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));
145699a2dd95SBruce Richardson 	hugepage = NULL;
145799a2dd95SBruce Richardson 
145899a2dd95SBruce Richardson 	/* we're not going to allocate more pages, so release VA space for
145999a2dd95SBruce Richardson 	 * unused memseg lists
146099a2dd95SBruce Richardson 	 */
146199a2dd95SBruce Richardson 	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
146299a2dd95SBruce Richardson 		struct rte_memseg_list *msl = &mcfg->memsegs[i];
146399a2dd95SBruce Richardson 		size_t mem_sz;
146499a2dd95SBruce Richardson 
146599a2dd95SBruce Richardson 		/* skip inactive lists */
146699a2dd95SBruce Richardson 		if (msl->base_va == NULL)
146799a2dd95SBruce Richardson 			continue;
146899a2dd95SBruce Richardson 		/* skip lists where there is at least one page allocated */
146999a2dd95SBruce Richardson 		if (msl->memseg_arr.count > 0)
147099a2dd95SBruce Richardson 			continue;
147199a2dd95SBruce Richardson 		/* this is an unused list, deallocate it */
147299a2dd95SBruce Richardson 		mem_sz = msl->len;
147399a2dd95SBruce Richardson 		munmap(msl->base_va, mem_sz);
147499a2dd95SBruce Richardson 		msl->base_va = NULL;
147599a2dd95SBruce Richardson 		msl->heap = 0;
147699a2dd95SBruce Richardson 
147799a2dd95SBruce Richardson 		/* destroy backing fbarray */
147899a2dd95SBruce Richardson 		rte_fbarray_destroy(&msl->memseg_arr);
147999a2dd95SBruce Richardson 	}
148099a2dd95SBruce Richardson 
148199a2dd95SBruce Richardson 	if (mcfg->dma_maskbits &&
148299a2dd95SBruce Richardson 	    rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) {
1483*ae67895bSDavid Marchand 		EAL_LOG(ERR,
1484*ae67895bSDavid Marchand 			"%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask.",
148599a2dd95SBruce Richardson 			__func__);
148699a2dd95SBruce Richardson 		goto fail;
148799a2dd95SBruce Richardson 	}
148899a2dd95SBruce Richardson 
148999a2dd95SBruce Richardson 	return 0;
149099a2dd95SBruce Richardson 
149199a2dd95SBruce Richardson fail:
149299a2dd95SBruce Richardson 	huge_recover_sigbus();
149399a2dd95SBruce Richardson 	free(tmp_hp);
149499a2dd95SBruce Richardson 	if (hugepage != NULL)
149599a2dd95SBruce Richardson 		munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));
149699a2dd95SBruce Richardson 
149799a2dd95SBruce Richardson 	return -1;
149899a2dd95SBruce Richardson }
149999a2dd95SBruce Richardson 
150099a2dd95SBruce Richardson /*
150199a2dd95SBruce Richardson  * uses fstat to report the size of a file on disk
150299a2dd95SBruce Richardson  */
150399a2dd95SBruce Richardson static off_t
150499a2dd95SBruce Richardson getFileSize(int fd)
150599a2dd95SBruce Richardson {
150699a2dd95SBruce Richardson 	struct stat st;
150799a2dd95SBruce Richardson 	if (fstat(fd, &st) < 0)
150899a2dd95SBruce Richardson 		return 0;
150999a2dd95SBruce Richardson 	return st.st_size;
151099a2dd95SBruce Richardson }
151199a2dd95SBruce Richardson 
151299a2dd95SBruce Richardson /*
151399a2dd95SBruce Richardson  * This creates the memory mappings in the secondary process to match that of
151499a2dd95SBruce Richardson  * the server process. It goes through each memory segment in the DPDK runtime
151599a2dd95SBruce Richardson  * configuration and finds the hugepages which form that segment, mapping them
151699a2dd95SBruce Richardson  * in order to form a contiguous block in the virtual memory space
151799a2dd95SBruce Richardson  */
151899a2dd95SBruce Richardson static int
151999a2dd95SBruce Richardson eal_legacy_hugepage_attach(void)
152099a2dd95SBruce Richardson {
152199a2dd95SBruce Richardson 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
152299a2dd95SBruce Richardson 	struct hugepage_file *hp = NULL;
152399a2dd95SBruce Richardson 	unsigned int num_hp = 0;
152499a2dd95SBruce Richardson 	unsigned int i = 0;
152599a2dd95SBruce Richardson 	unsigned int cur_seg;
152699a2dd95SBruce Richardson 	off_t size = 0;
152799a2dd95SBruce Richardson 	int fd, fd_hugepage = -1;
152899a2dd95SBruce Richardson 
152999a2dd95SBruce Richardson 	if (aslr_enabled() > 0) {
1530*ae67895bSDavid Marchand 		EAL_LOG(WARNING, "WARNING: Address Space Layout Randomization "
1531*ae67895bSDavid Marchand 				"(ASLR) is enabled in the kernel.");
1532*ae67895bSDavid Marchand 		EAL_LOG(WARNING, "   This may cause issues with mapping memory "
1533*ae67895bSDavid Marchand 				"into secondary processes");
153499a2dd95SBruce Richardson 	}
153599a2dd95SBruce Richardson 
153699a2dd95SBruce Richardson 	fd_hugepage = open(eal_hugepage_data_path(), O_RDONLY);
153799a2dd95SBruce Richardson 	if (fd_hugepage < 0) {
1538*ae67895bSDavid Marchand 		EAL_LOG(ERR, "Could not open %s",
153999a2dd95SBruce Richardson 				eal_hugepage_data_path());
154099a2dd95SBruce Richardson 		goto error;
154199a2dd95SBruce Richardson 	}
154299a2dd95SBruce Richardson 
154399a2dd95SBruce Richardson 	size = getFileSize(fd_hugepage);
154499a2dd95SBruce Richardson 	hp = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd_hugepage, 0);
154599a2dd95SBruce Richardson 	if (hp == MAP_FAILED) {
1546*ae67895bSDavid Marchand 		EAL_LOG(ERR, "Could not mmap %s",
154799a2dd95SBruce Richardson 				eal_hugepage_data_path());
154899a2dd95SBruce Richardson 		goto error;
154999a2dd95SBruce Richardson 	}
155099a2dd95SBruce Richardson 
155199a2dd95SBruce Richardson 	num_hp = size / sizeof(struct hugepage_file);
1552*ae67895bSDavid Marchand 	EAL_LOG(DEBUG, "Analysing %u files", num_hp);
155399a2dd95SBruce Richardson 
155499a2dd95SBruce Richardson 	/* map all segments into memory to make sure we get the addrs. the
155599a2dd95SBruce Richardson 	 * segments themselves are already in memseg list (which is shared and
155699a2dd95SBruce Richardson 	 * has its VA space already preallocated), so we just need to map
155799a2dd95SBruce Richardson 	 * everything into correct addresses.
155899a2dd95SBruce Richardson 	 */
155999a2dd95SBruce Richardson 	for (i = 0; i < num_hp; i++) {
156099a2dd95SBruce Richardson 		struct hugepage_file *hf = &hp[i];
156199a2dd95SBruce Richardson 		size_t map_sz = hf->size;
156299a2dd95SBruce Richardson 		void *map_addr = hf->final_va;
156399a2dd95SBruce Richardson 		int msl_idx, ms_idx;
156499a2dd95SBruce Richardson 		struct rte_memseg_list *msl;
156599a2dd95SBruce Richardson 		struct rte_memseg *ms;
156699a2dd95SBruce Richardson 
156799a2dd95SBruce Richardson 		/* if size is zero, no more pages left */
156899a2dd95SBruce Richardson 		if (map_sz == 0)
156999a2dd95SBruce Richardson 			break;
157099a2dd95SBruce Richardson 
157199a2dd95SBruce Richardson 		fd = open(hf->filepath, O_RDWR);
157299a2dd95SBruce Richardson 		if (fd < 0) {
1573*ae67895bSDavid Marchand 			EAL_LOG(ERR, "Could not open %s: %s",
157499a2dd95SBruce Richardson 				hf->filepath, strerror(errno));
157599a2dd95SBruce Richardson 			goto error;
157699a2dd95SBruce Richardson 		}
157799a2dd95SBruce Richardson 
157899a2dd95SBruce Richardson 		map_addr = mmap(map_addr, map_sz, PROT_READ | PROT_WRITE,
157999a2dd95SBruce Richardson 				MAP_SHARED | MAP_FIXED, fd, 0);
158099a2dd95SBruce Richardson 		if (map_addr == MAP_FAILED) {
1581*ae67895bSDavid Marchand 			EAL_LOG(ERR, "Could not map %s: %s",
158299a2dd95SBruce Richardson 				hf->filepath, strerror(errno));
158399a2dd95SBruce Richardson 			goto fd_error;
158499a2dd95SBruce Richardson 		}
158599a2dd95SBruce Richardson 
158699a2dd95SBruce Richardson 		/* set shared lock on the file. */
158799a2dd95SBruce Richardson 		if (flock(fd, LOCK_SH) < 0) {
1588*ae67895bSDavid Marchand 			EAL_LOG(DEBUG, "%s(): Locking file failed: %s",
158999a2dd95SBruce Richardson 				__func__, strerror(errno));
159099a2dd95SBruce Richardson 			goto mmap_error;
159199a2dd95SBruce Richardson 		}
159299a2dd95SBruce Richardson 
159399a2dd95SBruce Richardson 		/* find segment data */
159499a2dd95SBruce Richardson 		msl = rte_mem_virt2memseg_list(map_addr);
159599a2dd95SBruce Richardson 		if (msl == NULL) {
1596*ae67895bSDavid Marchand 			EAL_LOG(DEBUG, "%s(): Cannot find memseg list",
159799a2dd95SBruce Richardson 				__func__);
159899a2dd95SBruce Richardson 			goto mmap_error;
159999a2dd95SBruce Richardson 		}
160099a2dd95SBruce Richardson 		ms = rte_mem_virt2memseg(map_addr, msl);
160199a2dd95SBruce Richardson 		if (ms == NULL) {
1602*ae67895bSDavid Marchand 			EAL_LOG(DEBUG, "%s(): Cannot find memseg",
160399a2dd95SBruce Richardson 				__func__);
160499a2dd95SBruce Richardson 			goto mmap_error;
160599a2dd95SBruce Richardson 		}
160699a2dd95SBruce Richardson 
160799a2dd95SBruce Richardson 		msl_idx = msl - mcfg->memsegs;
160899a2dd95SBruce Richardson 		ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
160999a2dd95SBruce Richardson 		if (ms_idx < 0) {
1610*ae67895bSDavid Marchand 			EAL_LOG(DEBUG, "%s(): Cannot find memseg idx",
161199a2dd95SBruce Richardson 				__func__);
161299a2dd95SBruce Richardson 			goto mmap_error;
161399a2dd95SBruce Richardson 		}
161499a2dd95SBruce Richardson 
161599a2dd95SBruce Richardson 		/* store segment fd internally */
161699a2dd95SBruce Richardson 		if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0)
1617*ae67895bSDavid Marchand 			EAL_LOG(ERR, "Could not store segment fd: %s",
161899a2dd95SBruce Richardson 				rte_strerror(rte_errno));
161999a2dd95SBruce Richardson 	}
162099a2dd95SBruce Richardson 	/* unmap the hugepage config file, since we are done using it */
162199a2dd95SBruce Richardson 	munmap(hp, size);
162299a2dd95SBruce Richardson 	close(fd_hugepage);
162399a2dd95SBruce Richardson 	return 0;
162499a2dd95SBruce Richardson 
162599a2dd95SBruce Richardson mmap_error:
162699a2dd95SBruce Richardson 	munmap(hp[i].final_va, hp[i].size);
162799a2dd95SBruce Richardson fd_error:
162899a2dd95SBruce Richardson 	close(fd);
162999a2dd95SBruce Richardson error:
163099a2dd95SBruce Richardson 	/* unwind mmap's done so far */
163199a2dd95SBruce Richardson 	for (cur_seg = 0; cur_seg < i; cur_seg++)
163299a2dd95SBruce Richardson 		munmap(hp[cur_seg].final_va, hp[cur_seg].size);
163399a2dd95SBruce Richardson 
163499a2dd95SBruce Richardson 	if (hp != NULL && hp != MAP_FAILED)
163599a2dd95SBruce Richardson 		munmap(hp, size);
163699a2dd95SBruce Richardson 	if (fd_hugepage >= 0)
163799a2dd95SBruce Richardson 		close(fd_hugepage);
163899a2dd95SBruce Richardson 	return -1;
163999a2dd95SBruce Richardson }
164099a2dd95SBruce Richardson 
164199a2dd95SBruce Richardson static int
164299a2dd95SBruce Richardson eal_hugepage_attach(void)
164399a2dd95SBruce Richardson {
164499a2dd95SBruce Richardson 	if (eal_memalloc_sync_with_primary()) {
1645*ae67895bSDavid Marchand 		EAL_LOG(ERR, "Could not map memory from primary process");
164699a2dd95SBruce Richardson 		if (aslr_enabled() > 0)
1647*ae67895bSDavid Marchand 			EAL_LOG(ERR, "It is recommended to disable ASLR in the kernel and retry running both primary and secondary processes");
164899a2dd95SBruce Richardson 		return -1;
164999a2dd95SBruce Richardson 	}
165099a2dd95SBruce Richardson 	return 0;
165199a2dd95SBruce Richardson }
165299a2dd95SBruce Richardson 
165399a2dd95SBruce Richardson int
165499a2dd95SBruce Richardson rte_eal_hugepage_init(void)
165599a2dd95SBruce Richardson {
165699a2dd95SBruce Richardson 	const struct internal_config *internal_conf =
165799a2dd95SBruce Richardson 		eal_get_internal_configuration();
165899a2dd95SBruce Richardson 
165999a2dd95SBruce Richardson 	return internal_conf->legacy_mem ?
166099a2dd95SBruce Richardson 			eal_legacy_hugepage_init() :
166199a2dd95SBruce Richardson 			eal_dynmem_hugepage_init();
166299a2dd95SBruce Richardson }
166399a2dd95SBruce Richardson 
166499a2dd95SBruce Richardson int
166599a2dd95SBruce Richardson rte_eal_hugepage_attach(void)
166699a2dd95SBruce Richardson {
166799a2dd95SBruce Richardson 	const struct internal_config *internal_conf =
166899a2dd95SBruce Richardson 		eal_get_internal_configuration();
166999a2dd95SBruce Richardson 
167099a2dd95SBruce Richardson 	return internal_conf->legacy_mem ?
167199a2dd95SBruce Richardson 			eal_legacy_hugepage_attach() :
167299a2dd95SBruce Richardson 			eal_hugepage_attach();
167399a2dd95SBruce Richardson }
167499a2dd95SBruce Richardson 
167599a2dd95SBruce Richardson int
167699a2dd95SBruce Richardson rte_eal_using_phys_addrs(void)
167799a2dd95SBruce Richardson {
167899a2dd95SBruce Richardson 	if (phys_addrs_available == -1) {
167999a2dd95SBruce Richardson 		uint64_t tmp = 0;
168099a2dd95SBruce Richardson 
168199a2dd95SBruce Richardson 		if (rte_eal_has_hugepages() != 0 &&
168299a2dd95SBruce Richardson 		    rte_mem_virt2phy(&tmp) != RTE_BAD_PHYS_ADDR)
168399a2dd95SBruce Richardson 			phys_addrs_available = 1;
168499a2dd95SBruce Richardson 		else
168599a2dd95SBruce Richardson 			phys_addrs_available = 0;
168699a2dd95SBruce Richardson 	}
168799a2dd95SBruce Richardson 	return phys_addrs_available;
168899a2dd95SBruce Richardson }
168999a2dd95SBruce Richardson 
169099a2dd95SBruce Richardson static int __rte_unused
169199a2dd95SBruce Richardson memseg_primary_init_32(void)
169299a2dd95SBruce Richardson {
169399a2dd95SBruce Richardson 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
169499a2dd95SBruce Richardson 	int active_sockets, hpi_idx, msl_idx = 0;
169599a2dd95SBruce Richardson 	unsigned int socket_id, i;
169699a2dd95SBruce Richardson 	struct rte_memseg_list *msl;
169799a2dd95SBruce Richardson 	uint64_t extra_mem_per_socket, total_extra_mem, total_requested_mem;
169899a2dd95SBruce Richardson 	uint64_t max_mem;
169999a2dd95SBruce Richardson 	struct internal_config *internal_conf =
170099a2dd95SBruce Richardson 		eal_get_internal_configuration();
170199a2dd95SBruce Richardson 
170299a2dd95SBruce Richardson 	/* no-huge does not need this at all */
170399a2dd95SBruce Richardson 	if (internal_conf->no_hugetlbfs)
170499a2dd95SBruce Richardson 		return 0;
170599a2dd95SBruce Richardson 
170699a2dd95SBruce Richardson 	/* this is a giant hack, but desperate times call for desperate
170799a2dd95SBruce Richardson 	 * measures. in legacy 32-bit mode, we cannot preallocate VA space,
170899a2dd95SBruce Richardson 	 * because having upwards of 2 gigabytes of VA space already mapped will
170999a2dd95SBruce Richardson 	 * interfere with our ability to map and sort hugepages.
171099a2dd95SBruce Richardson 	 *
171199a2dd95SBruce Richardson 	 * therefore, in legacy 32-bit mode, we will be initializing memseg
171299a2dd95SBruce Richardson 	 * lists much later - in eal_memory.c, right after we unmap all the
171399a2dd95SBruce Richardson 	 * unneeded pages. this will not affect secondary processes, as those
171499a2dd95SBruce Richardson 	 * should be able to mmap the space without (too many) problems.
171599a2dd95SBruce Richardson 	 */
171699a2dd95SBruce Richardson 	if (internal_conf->legacy_mem)
171799a2dd95SBruce Richardson 		return 0;
171899a2dd95SBruce Richardson 
171999a2dd95SBruce Richardson 	/* 32-bit mode is a very special case. we cannot know in advance where
172099a2dd95SBruce Richardson 	 * the user will want to allocate their memory, so we have to do some
172199a2dd95SBruce Richardson 	 * heuristics.
172299a2dd95SBruce Richardson 	 */
172399a2dd95SBruce Richardson 	active_sockets = 0;
172499a2dd95SBruce Richardson 	total_requested_mem = 0;
172599a2dd95SBruce Richardson 	if (internal_conf->force_sockets)
172699a2dd95SBruce Richardson 		for (i = 0; i < rte_socket_count(); i++) {
172799a2dd95SBruce Richardson 			uint64_t mem;
172899a2dd95SBruce Richardson 
172999a2dd95SBruce Richardson 			socket_id = rte_socket_id_by_idx(i);
173099a2dd95SBruce Richardson 			mem = internal_conf->socket_mem[socket_id];
173199a2dd95SBruce Richardson 
173299a2dd95SBruce Richardson 			if (mem == 0)
173399a2dd95SBruce Richardson 				continue;
173499a2dd95SBruce Richardson 
173599a2dd95SBruce Richardson 			active_sockets++;
173699a2dd95SBruce Richardson 			total_requested_mem += mem;
173799a2dd95SBruce Richardson 		}
173899a2dd95SBruce Richardson 	else
173999a2dd95SBruce Richardson 		total_requested_mem = internal_conf->memory;
174099a2dd95SBruce Richardson 
174199a2dd95SBruce Richardson 	max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
174299a2dd95SBruce Richardson 	if (total_requested_mem > max_mem) {
1743*ae67895bSDavid Marchand 		EAL_LOG(ERR, "Invalid parameters: 32-bit process can at most use %uM of memory",
174499a2dd95SBruce Richardson 				(unsigned int)(max_mem >> 20));
174599a2dd95SBruce Richardson 		return -1;
174699a2dd95SBruce Richardson 	}
174799a2dd95SBruce Richardson 	total_extra_mem = max_mem - total_requested_mem;
174899a2dd95SBruce Richardson 	extra_mem_per_socket = active_sockets == 0 ? total_extra_mem :
174999a2dd95SBruce Richardson 			total_extra_mem / active_sockets;
175099a2dd95SBruce Richardson 
175199a2dd95SBruce Richardson 	/* the allocation logic is a little bit convoluted, but here's how it
175299a2dd95SBruce Richardson 	 * works, in a nutshell:
175399a2dd95SBruce Richardson 	 *  - if user hasn't specified on which sockets to allocate memory via
175499a2dd95SBruce Richardson 	 *    --socket-mem, we allocate all of our memory on main core socket.
175599a2dd95SBruce Richardson 	 *  - if user has specified sockets to allocate memory on, there may be
175699a2dd95SBruce Richardson 	 *    some "unused" memory left (e.g. if user has specified --socket-mem
175799a2dd95SBruce Richardson 	 *    such that not all memory adds up to 2 gigabytes), so add it to all
175899a2dd95SBruce Richardson 	 *    sockets that are in use equally.
175999a2dd95SBruce Richardson 	 *
176099a2dd95SBruce Richardson 	 * page sizes are sorted by size in descending order, so we can safely
176199a2dd95SBruce Richardson 	 * assume that we dispense with bigger page sizes first.
176299a2dd95SBruce Richardson 	 */
176399a2dd95SBruce Richardson 
176499a2dd95SBruce Richardson 	/* create memseg lists */
176599a2dd95SBruce Richardson 	for (i = 0; i < rte_socket_count(); i++) {
176699a2dd95SBruce Richardson 		int hp_sizes = (int) internal_conf->num_hugepage_sizes;
176799a2dd95SBruce Richardson 		uint64_t max_socket_mem, cur_socket_mem;
176899a2dd95SBruce Richardson 		unsigned int main_lcore_socket;
176999a2dd95SBruce Richardson 		struct rte_config *cfg = rte_eal_get_configuration();
177099a2dd95SBruce Richardson 		bool skip;
177199a2dd95SBruce Richardson 
177299a2dd95SBruce Richardson 		socket_id = rte_socket_id_by_idx(i);
177399a2dd95SBruce Richardson 
177499a2dd95SBruce Richardson #ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
177599a2dd95SBruce Richardson 		/* we can still sort pages by socket in legacy mode */
177699a2dd95SBruce Richardson 		if (!internal_conf->legacy_mem && socket_id > 0)
177799a2dd95SBruce Richardson 			break;
177899a2dd95SBruce Richardson #endif
177999a2dd95SBruce Richardson 
178099a2dd95SBruce Richardson 		/* if we didn't specifically request memory on this socket */
178199a2dd95SBruce Richardson 		skip = active_sockets != 0 &&
178299a2dd95SBruce Richardson 				internal_conf->socket_mem[socket_id] == 0;
178399a2dd95SBruce Richardson 		/* ...or if we didn't specifically request memory on *any*
178499a2dd95SBruce Richardson 		 * socket, and this is not main lcore
178599a2dd95SBruce Richardson 		 */
178699a2dd95SBruce Richardson 		main_lcore_socket = rte_lcore_to_socket_id(cfg->main_lcore);
178799a2dd95SBruce Richardson 		skip |= active_sockets == 0 && socket_id != main_lcore_socket;
178899a2dd95SBruce Richardson 
178999a2dd95SBruce Richardson 		if (skip) {
1790*ae67895bSDavid Marchand 			EAL_LOG(DEBUG, "Will not preallocate memory on socket %u",
179199a2dd95SBruce Richardson 					socket_id);
179299a2dd95SBruce Richardson 			continue;
179399a2dd95SBruce Richardson 		}
179499a2dd95SBruce Richardson 
179599a2dd95SBruce Richardson 		/* max amount of memory on this socket */
179699a2dd95SBruce Richardson 		max_socket_mem = (active_sockets != 0 ?
179799a2dd95SBruce Richardson 					internal_conf->socket_mem[socket_id] :
179899a2dd95SBruce Richardson 					internal_conf->memory) +
179999a2dd95SBruce Richardson 					extra_mem_per_socket;
180099a2dd95SBruce Richardson 		cur_socket_mem = 0;
180199a2dd95SBruce Richardson 
180299a2dd95SBruce Richardson 		for (hpi_idx = 0; hpi_idx < hp_sizes; hpi_idx++) {
180399a2dd95SBruce Richardson 			uint64_t max_pagesz_mem, cur_pagesz_mem = 0;
180499a2dd95SBruce Richardson 			uint64_t hugepage_sz;
180599a2dd95SBruce Richardson 			struct hugepage_info *hpi;
180699a2dd95SBruce Richardson 			int type_msl_idx, max_segs, total_segs = 0;
180799a2dd95SBruce Richardson 
180899a2dd95SBruce Richardson 			hpi = &internal_conf->hugepage_info[hpi_idx];
180999a2dd95SBruce Richardson 			hugepage_sz = hpi->hugepage_sz;
181099a2dd95SBruce Richardson 
181199a2dd95SBruce Richardson 			/* check if pages are actually available */
181299a2dd95SBruce Richardson 			if (hpi->num_pages[socket_id] == 0)
181399a2dd95SBruce Richardson 				continue;
181499a2dd95SBruce Richardson 
181599a2dd95SBruce Richardson 			max_segs = RTE_MAX_MEMSEG_PER_TYPE;
181699a2dd95SBruce Richardson 			max_pagesz_mem = max_socket_mem - cur_socket_mem;
181799a2dd95SBruce Richardson 
181899a2dd95SBruce Richardson 			/* make it multiple of page size */
181999a2dd95SBruce Richardson 			max_pagesz_mem = RTE_ALIGN_FLOOR(max_pagesz_mem,
182099a2dd95SBruce Richardson 					hugepage_sz);
182199a2dd95SBruce Richardson 
1822*ae67895bSDavid Marchand 			EAL_LOG(DEBUG, "Attempting to preallocate "
1823*ae67895bSDavid Marchand 					"%" PRIu64 "M on socket %i",
182499a2dd95SBruce Richardson 					max_pagesz_mem >> 20, socket_id);
182599a2dd95SBruce Richardson 
182699a2dd95SBruce Richardson 			type_msl_idx = 0;
182799a2dd95SBruce Richardson 			while (cur_pagesz_mem < max_pagesz_mem &&
182899a2dd95SBruce Richardson 					total_segs < max_segs) {
182999a2dd95SBruce Richardson 				uint64_t cur_mem;
183099a2dd95SBruce Richardson 				unsigned int n_segs;
183199a2dd95SBruce Richardson 
183299a2dd95SBruce Richardson 				if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
1833*ae67895bSDavid Marchand 					EAL_LOG(ERR,
1834*ae67895bSDavid Marchand 						"No more space in memseg lists, please increase RTE_MAX_MEMSEG_LISTS");
183599a2dd95SBruce Richardson 					return -1;
183699a2dd95SBruce Richardson 				}
183799a2dd95SBruce Richardson 
183899a2dd95SBruce Richardson 				msl = &mcfg->memsegs[msl_idx];
183999a2dd95SBruce Richardson 
184099a2dd95SBruce Richardson 				cur_mem = get_mem_amount(hugepage_sz,
184199a2dd95SBruce Richardson 						max_pagesz_mem);
184299a2dd95SBruce Richardson 				n_segs = cur_mem / hugepage_sz;
184399a2dd95SBruce Richardson 
184499a2dd95SBruce Richardson 				if (eal_memseg_list_init(msl, hugepage_sz,
184599a2dd95SBruce Richardson 						n_segs, socket_id, type_msl_idx,
184699a2dd95SBruce Richardson 						true)) {
184799a2dd95SBruce Richardson 					/* failing to allocate a memseg list is
184899a2dd95SBruce Richardson 					 * a serious error.
184999a2dd95SBruce Richardson 					 */
1850*ae67895bSDavid Marchand 					EAL_LOG(ERR, "Cannot allocate memseg list");
185199a2dd95SBruce Richardson 					return -1;
185299a2dd95SBruce Richardson 				}
185399a2dd95SBruce Richardson 
185499a2dd95SBruce Richardson 				if (eal_memseg_list_alloc(msl, 0)) {
185599a2dd95SBruce Richardson 					/* if we couldn't allocate VA space, we
185699a2dd95SBruce Richardson 					 * can try with smaller page sizes.
185799a2dd95SBruce Richardson 					 */
1858*ae67895bSDavid Marchand 					EAL_LOG(ERR, "Cannot allocate VA space for memseg list, retrying with different page size");
185999a2dd95SBruce Richardson 					/* deallocate memseg list */
186099a2dd95SBruce Richardson 					if (memseg_list_free(msl))
186199a2dd95SBruce Richardson 						return -1;
186299a2dd95SBruce Richardson 					break;
186399a2dd95SBruce Richardson 				}
186499a2dd95SBruce Richardson 
186599a2dd95SBruce Richardson 				total_segs += msl->memseg_arr.len;
186699a2dd95SBruce Richardson 				cur_pagesz_mem = total_segs * hugepage_sz;
186799a2dd95SBruce Richardson 				type_msl_idx++;
186899a2dd95SBruce Richardson 				msl_idx++;
186999a2dd95SBruce Richardson 			}
187099a2dd95SBruce Richardson 			cur_socket_mem += cur_pagesz_mem;
187199a2dd95SBruce Richardson 		}
187299a2dd95SBruce Richardson 		if (cur_socket_mem == 0) {
1873*ae67895bSDavid Marchand 			EAL_LOG(ERR, "Cannot allocate VA space on socket %u",
187499a2dd95SBruce Richardson 				socket_id);
187599a2dd95SBruce Richardson 			return -1;
187699a2dd95SBruce Richardson 		}
187799a2dd95SBruce Richardson 	}
187899a2dd95SBruce Richardson 
187999a2dd95SBruce Richardson 	return 0;
188099a2dd95SBruce Richardson }
188199a2dd95SBruce Richardson 
/*
 * Initialize memseg lists in a (64-bit) primary process: delegate to
 * the common dynamic-memory memseg list initializer.
 */
static int __rte_unused
memseg_primary_init(void)
{
	return eal_dynmem_memseg_lists_init();
}
188799a2dd95SBruce Richardson 
188899a2dd95SBruce Richardson static int
188999a2dd95SBruce Richardson memseg_secondary_init(void)
189099a2dd95SBruce Richardson {
189199a2dd95SBruce Richardson 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
189299a2dd95SBruce Richardson 	int msl_idx = 0;
189399a2dd95SBruce Richardson 	struct rte_memseg_list *msl;
189499a2dd95SBruce Richardson 
189599a2dd95SBruce Richardson 	for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
189699a2dd95SBruce Richardson 
189799a2dd95SBruce Richardson 		msl = &mcfg->memsegs[msl_idx];
189899a2dd95SBruce Richardson 
189990bf3f89SDeepak Khandelwal 		/* skip empty and external memseg lists */
190090bf3f89SDeepak Khandelwal 		if (msl->memseg_arr.len == 0 || msl->external)
190199a2dd95SBruce Richardson 			continue;
190299a2dd95SBruce Richardson 
190399a2dd95SBruce Richardson 		if (rte_fbarray_attach(&msl->memseg_arr)) {
1904*ae67895bSDavid Marchand 			EAL_LOG(ERR, "Cannot attach to primary process memseg lists");
190599a2dd95SBruce Richardson 			return -1;
190699a2dd95SBruce Richardson 		}
190799a2dd95SBruce Richardson 
190899a2dd95SBruce Richardson 		/* preallocate VA space */
190999a2dd95SBruce Richardson 		if (eal_memseg_list_alloc(msl, 0)) {
1910*ae67895bSDavid Marchand 			EAL_LOG(ERR, "Cannot preallocate VA space for hugepage memory");
191199a2dd95SBruce Richardson 			return -1;
191299a2dd95SBruce Richardson 		}
191399a2dd95SBruce Richardson 	}
191499a2dd95SBruce Richardson 
191599a2dd95SBruce Richardson 	return 0;
191699a2dd95SBruce Richardson }
191799a2dd95SBruce Richardson 
/*
 * Per-process memseg initialization entry point.
 *
 * First raises RLIMIT_NOFILE to its hard maximum, since hugepage
 * segments can each hold an open file descriptor; failure to raise the
 * limit is logged but not fatal. Then dispatches to the primary- or
 * secondary-process initializer; 32-bit primaries use the special
 * VA-constrained path.
 *
 * Returns 0 on success, -1 on failure (propagated from the dispatched
 * initializer).
 */
int
rte_eal_memseg_init(void)
{
	/* increase rlimit to maximum */
	struct rlimit lim;

#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();
#endif
	if (getrlimit(RLIMIT_NOFILE, &lim) == 0) {
		/* set limit to maximum */
		lim.rlim_cur = lim.rlim_max;

		if (setrlimit(RLIMIT_NOFILE, &lim) < 0) {
			/* not fatal: the current limit may still suffice */
			EAL_LOG(DEBUG, "Setting maximum number of open files failed: %s",
					strerror(errno));
		} else {
			EAL_LOG(DEBUG, "Setting maximum number of open files to %"
					PRIu64,
					(uint64_t)lim.rlim_cur);
		}
	} else {
		EAL_LOG(ERR, "Cannot get current resource limits");
	}
#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
	/* without libnuma, dynamic mode cannot place pages per socket;
	 * warn when running on what looks like a multi-socket system
	 */
	if (!internal_conf->legacy_mem && rte_socket_count() > 1) {
		EAL_LOG(WARNING, "DPDK is running on a NUMA system, but is compiled without NUMA support.");
		EAL_LOG(WARNING, "This will have adverse consequences for performance and usability.");
		EAL_LOG(WARNING, "Please use --"OPT_LEGACY_MEM" option, or recompile with NUMA support.");
	}
#endif

	return rte_eal_process_type() == RTE_PROC_PRIMARY ?
#ifndef RTE_ARCH_64
			memseg_primary_init_32() :
#else
			memseg_primary_init() :
#endif
			memseg_secondary_init();
}
1959