1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2017-2018 Intel Corporation
3  */
4 
5 #include <errno.h>
6 #include <stdarg.h>
7 #include <stdbool.h>
8 #include <stdlib.h>
9 #include <stdio.h>
10 #include <stdint.h>
11 #include <inttypes.h>
12 #include <string.h>
13 #include <sys/mman.h>
14 #include <sys/types.h>
15 #include <sys/stat.h>
16 #include <sys/queue.h>
17 #include <sys/file.h>
18 #include <unistd.h>
19 #include <limits.h>
20 #include <fcntl.h>
21 #include <sys/ioctl.h>
22 #include <sys/time.h>
23 #include <signal.h>
24 #include <setjmp.h>
25 #ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */
26 #include <linux/memfd.h>
27 #define MEMFD_SUPPORTED
28 #endif
29 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
30 #include <numa.h>
31 #include <numaif.h>
32 #endif
33 #include <linux/falloc.h>
34 #include <linux/mman.h> /* for hugetlb-related mmap flags */
35 
36 #include <rte_common.h>
37 #include <rte_log.h>
38 #include <rte_eal.h>
39 #include <rte_errno.h>
40 #include <rte_memory.h>
41 #include <rte_spinlock.h>
42 
43 #include "eal_filesystem.h"
44 #include "eal_internal_cfg.h"
45 #include "eal_memalloc.h"
46 #include "eal_memcfg.h"
47 #include "eal_private.h"
48 
49 const int anonymous_hugepages_supported =
50 #ifdef MAP_HUGE_SHIFT
51 		1;
52 #define RTE_MAP_HUGE_SHIFT MAP_HUGE_SHIFT
53 #else
54 		0;
55 #define RTE_MAP_HUGE_SHIFT 26
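/* 26 mirrors the kernel's MAP_HUGE_SHIFT value, for headers that lack the define */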
56 #endif
57 
58 /*
59  * we've already checked memfd support at compile-time, but we also need to
60  * check if we can create hugepage files with memfd.
61  *
62  * also, this is not a constant, because while we may be *compiled* with memfd
63  * hugetlbfs support, we might not be *running* on a system that supports memfd
64  * and/or memfd with hugetlbfs, so we need to be able to adjust this flag at
65  * runtime, and fall back to anonymous memory.
66  */
67 static int memfd_create_supported =
68 #ifdef MFD_HUGETLB
69 		1;
70 #define RTE_MFD_HUGETLB MFD_HUGETLB
71 #else
72 		0;
73 #define RTE_MFD_HUGETLB 4U
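/* 4U mirrors the kernel's MFD_HUGETLB flag value, for headers that lack the define */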
74 #endif
75 
76 /*
77  * not all kernel versions support fallocate() on hugetlbfs, so fall back to
78  * ftruncate() and disallow deallocation if fallocate() is not supported.
79  */
80 static int fallocate_supported = -1; /* unknown */
81 
82 /*
83  * we have two modes - single file segments, and file-per-page mode.
84  *
85  * for single-file segments, we use memseg_list_fd to store the segment fd,
86  * while the fds[] will not be allocated, and len will be set to 0.
87  *
88  * for file-per-page mode, each page will have its own fd, so 'memseg_list_fd'
89  * will be invalid (set to -1), and we'll use 'fds' to keep track of page fd's.
90  *
91  * we cannot know how many pages a system will have in advance, but we do know
92  * that they come in lists, and we know lengths of these lists. so, simply store
93  * a malloc'd array of fd's indexed by list and segment index.
94  *
95  * they will be initialized at startup, and filled as we allocate/deallocate
96  * segments.
97  */
98 static struct {
99 	int *fds; /**< dynamically allocated array of per-segment fd's */
100 	int memseg_list_fd; /**< memseg list fd */
101 	int len; /**< total length of the array */
102 	int count; /**< entries used in the array */
103 } fd_list[RTE_MAX_MEMSEG_LISTS];
104 
105 /** local copy of a memory map, used to synchronize memory hotplug in MP */
106 static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS];
107 
108 static sigjmp_buf huge_jmpenv;
109 
110 static void huge_sigbus_handler(int signo __rte_unused)
111 {
112 	siglongjmp(huge_jmpenv, 1);
113 }
114 
115 /* Put sigsetjmp into a wrapper function to avoid a compile error. Any
116  * non-volatile, non-static local variable in the stack frame calling
117  * sigsetjmp might be clobbered by a call to longjmp.
118  */
119 static int huge_wrap_sigsetjmp(void)
120 {
121 	return sigsetjmp(huge_jmpenv, 1);
122 }
123 
124 static struct sigaction huge_action_old;
125 static int huge_need_recover;
126 
127 static void
128 huge_register_sigbus(void)
129 {
130 	sigset_t mask;
131 	struct sigaction action;
132 
133 	sigemptyset(&mask);
134 	sigaddset(&mask, SIGBUS);
135 	action.sa_flags = 0;
136 	action.sa_mask = mask;
137 	action.sa_handler = huge_sigbus_handler;
138 
139 	huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
140 }
141 
142 static void
143 huge_recover_sigbus(void)
144 {
145 	if (huge_need_recover) {
146 		sigaction(SIGBUS, &huge_action_old, NULL);
147 		huge_need_recover = 0;
148 	}
149 }
150 
151 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
152 static bool
153 check_numa(void)
154 {
155 	bool ret = true;
156 	/* Check if kernel supports NUMA. */
157 	if (numa_available() != 0) {
158 		RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n");
159 		ret = false;
160 	}
161 	return ret;
162 }
163 
164 static void
165 prepare_numa(int *oldpolicy, struct bitmask *oldmask, int socket_id)
166 {
167 	RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
168 	if (get_mempolicy(oldpolicy, oldmask->maskp,
169 			  oldmask->size + 1, 0, 0) < 0) {
170 		RTE_LOG(ERR, EAL,
171 			"Failed to get current mempolicy: %s. "
172 			"Assuming MPOL_DEFAULT.\n", strerror(errno));
173 		*oldpolicy = MPOL_DEFAULT;
174 	}
175 	RTE_LOG(DEBUG, EAL,
176 		"Setting policy MPOL_PREFERRED for socket %d\n",
177 		socket_id);
178 	numa_set_preferred(socket_id);
179 }
180 
181 static void
182 restore_numa(int *oldpolicy, struct bitmask *oldmask)
183 {
184 	RTE_LOG(DEBUG, EAL,
185 		"Restoring previous memory policy: %d\n", *oldpolicy);
186 	if (*oldpolicy == MPOL_DEFAULT) {
187 		numa_set_localalloc();
188 	} else if (set_mempolicy(*oldpolicy, oldmask->maskp,
189 				 oldmask->size + 1) < 0) {
190 		RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n",
191 			strerror(errno));
192 		numa_set_localalloc();
193 	}
194 	numa_free_cpumask(oldmask);
195 }
196 #endif
197 
198 /*
199  * uses fstat to report the size of a file on disk
200  */
201 static off_t
202 get_file_size(int fd)
203 {
204 	struct stat st;
205 	if (fstat(fd, &st) < 0)
206 		return 0;
207 	return st.st_size;
208 }
209 
210 static int
211 pagesz_flags(uint64_t page_sz)
212 {
213 	/* as per the mmap() manpage, the huge page size is encoded as its log2
214 	 * shifted left by MAP_HUGE_SHIFT
215 	 */
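	/* e.g. a 2 MB page size gives log2 == 21, i.e. the MAP_HUGE_2MB encoding */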
216 	int log2 = rte_log2_u64(page_sz);
217 	return log2 << RTE_MAP_HUGE_SHIFT;
218 }
219 
220 /* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */
221 static int lock(int fd, int type)
222 {
223 	int ret;
224 
225 	/* flock may be interrupted */
226 	do {
227 		ret = flock(fd, type | LOCK_NB);
228 	} while (ret && errno == EINTR);
229 
230 	if (ret && errno == EWOULDBLOCK) {
231 		/* couldn't lock */
232 		return 0;
233 	} else if (ret) {
234 		RTE_LOG(ERR, EAL, "%s(): error calling flock(): %s\n",
235 			__func__, strerror(errno));
236 		return -1;
237 	}
238 	/* lock was successful */
239 	return 1;
240 }
241 
242 static int
243 get_seg_memfd(struct hugepage_info *hi __rte_unused,
244 		unsigned int list_idx __rte_unused,
245 		unsigned int seg_idx __rte_unused)
246 {
247 #ifdef MEMFD_SUPPORTED
248 	int fd;
249 	char segname[250]; /* as per manpage, limit is 249 bytes plus null */
250 
251 	int flags = RTE_MFD_HUGETLB | pagesz_flags(hi->hugepage_sz);
252 	const struct internal_config *internal_conf =
253 		eal_get_internal_configuration();
254 
255 	if (internal_conf->single_file_segments) {
256 		fd = fd_list[list_idx].memseg_list_fd;
257 
258 		if (fd < 0) {
259 			snprintf(segname, sizeof(segname), "seg_%i", list_idx);
260 			fd = memfd_create(segname, flags);
261 			if (fd < 0) {
262 				RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n",
263 					__func__, strerror(errno));
264 				return -1;
265 			}
266 			fd_list[list_idx].memseg_list_fd = fd;
267 		}
268 	} else {
269 		fd = fd_list[list_idx].fds[seg_idx];
270 
271 		if (fd < 0) {
272 			snprintf(segname, sizeof(segname), "seg_%i-%i",
273 					list_idx, seg_idx);
274 			fd = memfd_create(segname, flags);
275 			if (fd < 0) {
276 				RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n",
277 					__func__, strerror(errno));
278 				return -1;
279 			}
280 			fd_list[list_idx].fds[seg_idx] = fd;
281 		}
282 	}
283 	return fd;
284 #endif
285 	return -1;
286 }
287 
288 static int
289 get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
290 		unsigned int list_idx, unsigned int seg_idx,
291 		bool *dirty)
292 {
293 	int fd;
294 	int *out_fd;
295 	struct stat st;
296 	int ret;
297 	const struct internal_config *internal_conf =
298 		eal_get_internal_configuration();
299 
300 	if (dirty != NULL)
301 		*dirty = false;
302 
303 	/* for in-memory mode, we only make it here when we're sure we support
304 	 * memfd, and this is a special case.
305 	 */
306 	if (internal_conf->in_memory)
307 		return get_seg_memfd(hi, list_idx, seg_idx);
308 
309 	if (internal_conf->single_file_segments) {
310 		out_fd = &fd_list[list_idx].memseg_list_fd;
311 		eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx);
312 	} else {
313 		out_fd = &fd_list[list_idx].fds[seg_idx];
314 		eal_get_hugefile_path(path, buflen, hi->hugedir,
315 				list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
316 	}
317 	fd = *out_fd;
318 	if (fd >= 0)
319 		return fd;
320 
321 	/*
322 	 * There is no TOCTOU between stat() and unlink()/open()
323 	 * because the hugepage directory is locked.
324 	 */
325 	ret = stat(path, &st);
326 	if (ret < 0 && errno != ENOENT) {
327 		RTE_LOG(DEBUG, EAL, "%s(): stat() for '%s' failed: %s\n",
328 			__func__, path, strerror(errno));
329 		return -1;
330 	}
331 	if (!internal_conf->hugepage_file.unlink_existing && ret == 0 &&
332 			dirty != NULL)
333 		*dirty = true;
334 
335 	/*
336 	 * The kernel clears a hugepage only when it is mapped
337 	 * from a particular file for the first time.
338 	 * If the file already exists, the old content will be mapped.
339 	 * If the memory manager assumes all mapped pages to be clean,
340 	 * the file must be removed and created anew.
341 	 * Otherwise, the primary caller must be notified
342 	 * that mapped pages will be dirty
343 	 * (secondary callers receive the segment state from the primary one).
344 	 * When multiple hugepages are mapped from the same file,
345 	 * whether they will be dirty depends on the part that is mapped.
346 	 */
347 	if (!internal_conf->single_file_segments &&
348 			internal_conf->hugepage_file.unlink_existing &&
349 			rte_eal_process_type() == RTE_PROC_PRIMARY &&
350 			ret == 0) {
351 		/* coverity[toctou] */
352 		if (unlink(path) < 0) {
353 			RTE_LOG(DEBUG, EAL, "%s(): could not remove '%s': %s\n",
354 				__func__, path, strerror(errno));
355 			return -1;
356 		}
357 	}
358 
359 	/* coverity[toctou] */
360 	fd = open(path, O_CREAT | O_RDWR, 0600);
361 	if (fd < 0) {
362 		RTE_LOG(ERR, EAL, "%s(): open '%s' failed: %s\n",
363 			__func__, path, strerror(errno));
364 		return -1;
365 	}
366 	/* take out a read lock */
367 	if (lock(fd, LOCK_SH) < 0) {
368 		RTE_LOG(ERR, EAL, "%s(): lock '%s' failed: %s\n",
369 			__func__, path, strerror(errno));
370 		close(fd);
371 		return -1;
372 	}
373 	*out_fd = fd;
374 	return fd;
375 }
376 
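/*
 * Grow or shrink a memfd-backed file: growing reserves space with a plain
 * fallocate(), while shrinking punches a hole (FALLOC_FL_PUNCH_HOLE |
 * FALLOC_FL_KEEP_SIZE) so the pages are released without changing file size.
 */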
377 static int
378 resize_hugefile_in_memory(int fd, uint64_t fa_offset,
379 		uint64_t page_sz, bool grow)
380 {
381 	int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
382 			FALLOC_FL_KEEP_SIZE;
383 	int ret;
384 
385 	/* grow or shrink the file */
386 	ret = fallocate(fd, flags, fa_offset, page_sz);
387 
388 	if (ret < 0) {
389 		RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
390 				__func__,
391 				strerror(errno));
392 		return -1;
393 	}
394 	return 0;
395 }
396 
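/*
 * Grow or shrink a hugetlbfs-backed file. fallocate() support is detected on
 * first use and cached in fallocate_supported; without it, growing falls back
 * to ftruncate() and shrinking (freeing pages back) is refused. Secondary
 * processes leave the actual resizing to the primary.
 */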
397 static int
398 resize_hugefile_in_filesystem(int fd, uint64_t fa_offset, uint64_t page_sz,
399 		bool grow, bool *dirty)
400 {
401 	const struct internal_config *internal_conf =
402 			eal_get_internal_configuration();
403 	bool again = false;
404 
405 	do {
406 		if (fallocate_supported == 0) {
407 			/* we cannot deallocate memory if fallocate() is not
408 			 * supported, and the hugepage file is already locked at
409 			 * creation, so no further synchronization is needed.
410 			 */
411 
412 			if (!grow) {
413 				RTE_LOG(DEBUG, EAL, "%s(): fallocate not supported, not freeing page back to the system\n",
414 					__func__);
415 				return -1;
416 			}
417 			uint64_t new_size = fa_offset + page_sz;
418 			uint64_t cur_size = get_file_size(fd);
419 
420 			/* fallocate isn't supported, fall back to ftruncate */
421 			if (dirty != NULL)
422 				*dirty = new_size <= cur_size;
423 			if (new_size > cur_size &&
424 					ftruncate(fd, new_size) < 0) {
425 				RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
426 					__func__, strerror(errno));
427 				return -1;
428 			}
429 		} else {
430 			int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
431 					FALLOC_FL_KEEP_SIZE;
432 			int ret;
433 
434 			/*
435 			 * technically, it is perfectly safe for both primary
436 			 * and secondary to grow and shrink the page files:
437 			 * growing the file repeatedly has no effect because
438 			 * a page can only be allocated once, while mmap ensures
439 			 * that secondaries hold on to the page even after the
440 			 * page itself is removed from the filesystem.
441 			 *
442 			 * however, letting more than one process grow/shrink the
443 			 * files tends to expose bugs in fd_list page count handling,
444 			 * so restrict this to the primary just in case.
445 			 */
446 			if (rte_eal_process_type() != RTE_PROC_PRIMARY)
447 				return 0;
448 
449 			/* grow or shrink the file */
450 			ret = fallocate(fd, flags, fa_offset, page_sz);
451 
452 			if (ret < 0) {
453 				if (fallocate_supported == -1 &&
454 						errno == ENOTSUP) {
455 					RTE_LOG(ERR, EAL, "%s(): fallocate() not supported, hugepage deallocation will be disabled\n",
456 						__func__);
457 					again = true;
458 					fallocate_supported = 0;
459 				} else {
460 					RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
461 						__func__,
462 						strerror(errno));
463 					return -1;
464 				}
465 			} else {
466 				fallocate_supported = 1;
467 				/*
468 				 * It is unknown which portions of an existing
469 				 * hugepage file were allocated previously,
470 				 * so all pages within the file are considered
471 				 * dirty, unless the file is a fresh one.
472 				 */
473 				if (dirty != NULL)
474 					*dirty &= !internal_conf->hugepage_file.unlink_existing;
475 			}
476 		}
477 	} while (again);
478 
479 	return 0;
480 }
481 
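/*
 * Close a single-file-segments backing file; the primary process also unlinks
 * it, except in in-memory mode where there is no file on disk.
 */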
482 static void
483 close_hugefile(int fd, char *path, int list_idx)
484 {
485 	const struct internal_config *internal_conf =
486 		eal_get_internal_configuration();
487 	/*
488 	 * primary process must unlink the file, but only when not in in-memory
489 	 * mode (as in that case there is no file to unlink).
490 	 */
491 	if (!internal_conf->in_memory &&
492 			rte_eal_process_type() == RTE_PROC_PRIMARY &&
493 			unlink(path))
494 		RTE_LOG(ERR, EAL, "%s(): unlinking '%s' failed: %s\n",
495 			__func__, path, strerror(errno));
496 
497 	close(fd);
498 	fd_list[list_idx].memseg_list_fd = -1;
499 }
500 
501 static int
502 resize_hugefile(int fd, uint64_t fa_offset, uint64_t page_sz, bool grow,
503 		bool *dirty)
504 {
505 	/* in-memory mode is a special case, because we can be sure that
506 	 * fallocate() is supported.
507 	 */
508 	const struct internal_config *internal_conf =
509 		eal_get_internal_configuration();
510 
511 	if (internal_conf->in_memory) {
512 		if (dirty != NULL)
513 			*dirty = false;
514 		return resize_hugefile_in_memory(fd, fa_offset,
515 				page_sz, grow);
516 	}
517 
518 	return resize_hugefile_in_filesystem(fd, fa_offset, page_sz,
519 			grow, dirty);
520 }
521 
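/*
 * Allocate one hugepage at the requested address, backed by a hugetlbfs file,
 * a memfd, or an anonymous hugetlb mapping depending on configuration, and
 * fill in the memseg metadata. Returns 0 on success; on failure, the code
 * attempts to keep the VA range reserved and returns -1.
 */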
522 static int
523 alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
524 		struct hugepage_info *hi, unsigned int list_idx,
525 		unsigned int seg_idx)
526 {
527 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
528 	int cur_socket_id = 0;
529 #endif
530 	uint64_t map_offset;
531 	rte_iova_t iova;
532 	void *va;
533 	char path[PATH_MAX];
534 	int ret = 0;
535 	int fd;
536 	bool dirty;
537 	size_t alloc_sz;
538 	int flags;
539 	void *new_addr;
540 	const struct internal_config *internal_conf =
541 		eal_get_internal_configuration();
542 
543 	alloc_sz = hi->hugepage_sz;
544 
545 	/* these are checked at init, but code analyzers don't know that */
546 	if (internal_conf->in_memory && !anonymous_hugepages_supported) {
547 		RTE_LOG(ERR, EAL, "Anonymous hugepages not supported, in-memory mode cannot allocate memory\n");
548 		return -1;
549 	}
550 	if (internal_conf->in_memory && !memfd_create_supported &&
551 			internal_conf->single_file_segments) {
552 		RTE_LOG(ERR, EAL, "Single-file segments are not supported without memfd support\n");
553 		return -1;
554 	}
555 
556 	/* in-memory without memfd is a special case */
557 	int mmap_flags;
558 
559 	if (internal_conf->in_memory && !memfd_create_supported) {
560 		const int in_memory_flags = MAP_HUGETLB | MAP_FIXED |
561 				MAP_PRIVATE | MAP_ANONYMOUS;
562 		int pagesz_flag;
563 
564 		pagesz_flag = pagesz_flags(alloc_sz);
565 		fd = -1;
566 		dirty = false;
567 		mmap_flags = in_memory_flags | pagesz_flag;
568 
569 		/* single-file segments codepath will never be active
570 		 * here because in-memory mode is incompatible with the
571 		 * fallback path; that combination is rejected at the EAL
572 		 * initialization stage.
573 		 */
574 		map_offset = 0;
575 	} else {
576 		/* takes out a read lock on segment or segment list */
577 		fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx,
578 				&dirty);
579 		if (fd < 0) {
580 			RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n");
581 			return -1;
582 		}
583 
584 		if (internal_conf->single_file_segments) {
585 			map_offset = seg_idx * alloc_sz;
586 			ret = resize_hugefile(fd, map_offset, alloc_sz, true,
587 					&dirty);
588 			if (ret < 0)
589 				goto resized;
590 
591 			fd_list[list_idx].count++;
592 		} else {
593 			map_offset = 0;
594 			if (ftruncate(fd, alloc_sz) < 0) {
595 				RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
596 					__func__, strerror(errno));
597 				goto resized;
598 			}
599 			if (internal_conf->hugepage_file.unlink_before_mapping &&
600 					!internal_conf->in_memory) {
601 				if (unlink(path)) {
602 					RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n",
603 						__func__, strerror(errno));
604 					goto resized;
605 				}
606 			}
607 		}
608 		mmap_flags = MAP_SHARED | MAP_POPULATE | MAP_FIXED;
609 	}
610 
611 	huge_register_sigbus();
612 
613 	/*
614 	 * map the segment and populate the page tables; the kernel fills
615 	 * the segment with zeros if it is a new page.
616 	 */
617 	va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, mmap_flags, fd,
618 			map_offset);
619 
620 	if (va == MAP_FAILED) {
621 		RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
622 			strerror(errno));
623 		/* mmap failed, but the previous region might have been
624 		 * unmapped anyway. try to remap it
625 		 */
626 		goto unmapped;
627 	}
628 	if (va != addr) {
629 		RTE_LOG(DEBUG, EAL, "%s(): wrong mmap() address\n", __func__);
630 		munmap(va, alloc_sz);
631 		goto resized;
632 	}
633 
634 	/* In Linux, hugetlb limitations, such as those imposed by
635 	 * cgroups, are enforced at fault time rather than at mmap()
636 	 * time, even with MAP_POPULATE, and the kernel sends a
637 	 * SIGBUS signal. To avoid being killed, save the stack
638 	 * environment here; if SIGBUS happens, we can jump back
639 	 * here.
640 	 */
641 	if (huge_wrap_sigsetjmp()) {
642 		RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more hugepages of size %uMB\n",
643 			(unsigned int)(alloc_sz >> 20));
644 		goto mapped;
645 	}
646 
647 	/* we need to trigger a write to the page to force a page fault and
648 	 * ensure that the page is accessible to us, but we can't overwrite a
649 	 * value that is already there, so read the old value and write it back.
650 	 * the kernel populates the page with zeroes initially.
651 	 */
652 	*(volatile int *)addr = *(volatile int *)addr;
653 
654 	iova = rte_mem_virt2iova(addr);
655 	if (iova == RTE_BAD_PHYS_ADDR) {
656 		RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n",
657 			__func__);
658 		goto mapped;
659 	}
660 
661 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
662 	/*
663 	 * If the kernel has been built without NUMA support, get_mempolicy()
664 	 * will return an error. If check_numa() returns false, memory
665 	 * allocation is not NUMA aware and the socket_id should not be
666 	 * checked.
667 	 */
668 	if (check_numa()) {
669 		ret = get_mempolicy(&cur_socket_id, NULL, 0, addr,
670 					MPOL_F_NODE | MPOL_F_ADDR);
671 		if (ret < 0) {
672 			RTE_LOG(DEBUG, EAL, "%s(): get_mempolicy: %s\n",
673 				__func__, strerror(errno));
674 			goto mapped;
675 		} else if (cur_socket_id != socket_id) {
676 			RTE_LOG(DEBUG, EAL,
677 					"%s(): allocation happened on wrong socket (wanted %d, got %d)\n",
678 				__func__, socket_id, cur_socket_id);
679 			goto mapped;
680 		}
681 	}
682 #else
683 	if (rte_socket_count() > 1)
684 		RTE_LOG(DEBUG, EAL, "%s(): not checking hugepage NUMA node.\n",
685 				__func__);
686 #endif
687 
688 	huge_recover_sigbus();
689 
690 	ms->addr = addr;
691 	ms->hugepage_sz = alloc_sz;
692 	ms->len = alloc_sz;
693 	ms->nchannel = rte_memory_get_nchannel();
694 	ms->nrank = rte_memory_get_nrank();
695 	ms->iova = iova;
696 	ms->socket_id = socket_id;
697 	ms->flags = dirty ? RTE_MEMSEG_FLAG_DIRTY : 0;
698 
699 	return 0;
700 
701 mapped:
702 	munmap(addr, alloc_sz);
703 unmapped:
704 	huge_recover_sigbus();
705 	flags = EAL_RESERVE_FORCE_ADDRESS;
706 	new_addr = eal_get_virtual_area(addr, &alloc_sz, alloc_sz, 0, flags);
707 	if (new_addr != addr) {
708 		if (new_addr != NULL)
709 			munmap(new_addr, alloc_sz);
710 		/* we're leaving a hole in our virtual address space. if
711 		 * somebody else maps this hole now, we could accidentally
712 		 * overwrite it in the future.
713 		 */
714 		RTE_LOG(CRIT, EAL, "Can't mmap holes in our virtual address space\n");
715 	}
716 	/* roll back the ref count */
717 	if (internal_conf->single_file_segments)
718 		fd_list[list_idx].count--;
719 resized:
720 	/* some codepaths will return negative fd, so exit early */
721 	if (fd < 0)
722 		return -1;
723 
724 	if (internal_conf->single_file_segments) {
725 		resize_hugefile(fd, map_offset, alloc_sz, false, NULL);
726 		/* ignore failure, can't make it any worse */
727 
728 		/* if refcount is at zero, close the file */
729 		if (fd_list[list_idx].count == 0)
730 			close_hugefile(fd, path, list_idx);
731 	} else {
732 		/* only remove file if we can take out a write lock */
733 		if (!internal_conf->hugepage_file.unlink_before_mapping &&
734 				internal_conf->in_memory == 0 &&
735 				lock(fd, LOCK_EX) == 1)
736 			unlink(path);
737 		close(fd);
738 		fd_list[list_idx].fds[seg_idx] = -1;
739 	}
740 	return -1;
741 }
742 
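/*
 * Release one hugepage: zero it, replace the mapping with an anonymous
 * PROT_NONE reservation so the VA range stays held, and shrink or unlink the
 * backing file as configuration allows.
 */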
743 static int
744 free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
745 		unsigned int list_idx, unsigned int seg_idx)
746 {
747 	uint64_t map_offset;
748 	char path[PATH_MAX];
749 	int fd, ret = 0;
750 	const struct internal_config *internal_conf =
751 		eal_get_internal_configuration();
752 
753 	/* erase page data */
754 	memset(ms->addr, 0, ms->len);
755 
756 	if (mmap(ms->addr, ms->len, PROT_NONE,
757 			MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) ==
758 				MAP_FAILED) {
759 		RTE_LOG(DEBUG, EAL, "couldn't unmap page\n");
760 		return -1;
761 	}
762 
763 	eal_mem_set_dump(ms->addr, ms->len, false);
764 
765 	/* if we're using anonymous hugepages, nothing to be done */
766 	if (internal_conf->in_memory && !memfd_create_supported) {
767 		memset(ms, 0, sizeof(*ms));
768 		return 0;
769 	}
770 
771 	/* if we are not in single file segments mode, we're going to unmap the
772 	 * segment and thus drop the lock on original fd, but hugepage dir is
773 	 * now locked so we can take out another one without races.
774 	 */
775 	fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx, NULL);
776 	if (fd < 0)
777 		return -1;
778 
779 	if (internal_conf->single_file_segments) {
780 		map_offset = seg_idx * ms->len;
781 		if (resize_hugefile(fd, map_offset, ms->len, false, NULL))
782 			return -1;
783 
784 		if (--(fd_list[list_idx].count) == 0)
785 			close_hugefile(fd, path, list_idx);
786 
787 		ret = 0;
788 	} else {
789 		/* if we're able to take out a write lock, we're the last one
790 		 * holding onto this page.
791 		 */
792 		if (!internal_conf->in_memory &&
793 				internal_conf->hugepage_file.unlink_existing &&
794 				!internal_conf->hugepage_file.unlink_before_mapping) {
795 			ret = lock(fd, LOCK_EX);
796 			if (ret >= 0) {
797 				/* no one else is using this page */
798 				if (ret == 1)
799 					unlink(path);
800 			}
801 		}
802 		/* closing fd will drop the lock */
803 		close(fd);
804 		fd_list[list_idx].fds[seg_idx] = -1;
805 	}
806 
807 	memset(ms, 0, sizeof(*ms));
808 
809 	return ret < 0 ? -1 : 0;
810 }
811 
812 struct alloc_walk_param {
813 	struct hugepage_info *hi;
814 	struct rte_memseg **ms;
815 	size_t page_sz;
816 	unsigned int segs_allocated;
817 	unsigned int n_segs;
818 	int socket;
819 	bool exact;
820 };
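/*
 * Memseg list walk callback: if the list matches the requested page size and
 * socket, try to allocate the requested pages from it. Returns 1 if any pages
 * were allocated, 0 to keep walking, -1 on unrecoverable failure.
 */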
821 static int
822 alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
823 {
824 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
825 	struct alloc_walk_param *wa = arg;
826 	struct rte_memseg_list *cur_msl;
827 	size_t page_sz;
828 	int cur_idx, start_idx, j, dir_fd = -1;
829 	unsigned int msl_idx, need, i;
830 	const struct internal_config *internal_conf =
831 		eal_get_internal_configuration();
832 
833 	if (msl->page_sz != wa->page_sz)
834 		return 0;
835 	if (msl->socket_id != wa->socket)
836 		return 0;
837 
838 	page_sz = (size_t)msl->page_sz;
839 
840 	msl_idx = msl - mcfg->memsegs;
841 	cur_msl = &mcfg->memsegs[msl_idx];
842 
843 	need = wa->n_segs;
844 
845 	/* try finding space in memseg list */
846 	if (wa->exact) {
847 		/* if we require exact number of pages in a list, find them */
848 		cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0,
849 				need);
850 		if (cur_idx < 0)
851 			return 0;
852 		start_idx = cur_idx;
853 	} else {
854 		int cur_len;
855 
856 		/* we don't require exact number of pages, so we're going to go
857 		 * for best-effort allocation. that means finding the biggest
858 		 * unused block, and going with that.
859 		 */
860 		cur_idx = rte_fbarray_find_biggest_free(&cur_msl->memseg_arr,
861 				0);
862 		if (cur_idx < 0)
863 			return 0;
864 		start_idx = cur_idx;
865 		/* adjust the size to possibly be smaller than original
866 		 * request, but do not allow it to be bigger.
867 		 */
868 		cur_len = rte_fbarray_find_contig_free(&cur_msl->memseg_arr,
869 				cur_idx);
870 		need = RTE_MIN(need, (unsigned int)cur_len);
871 	}
872 
873 	/* do not allow any page allocations during the time we're allocating,
874 	 * because file creation and locking operations are not atomic,
875 	 * and we might be the first or the last ones to use a particular page,
876 	 * so we need to ensure atomicity of every operation.
877 	 *
878 	 * during init, we already hold a write lock, so don't try to take out
879 	 * another one.
880 	 */
881 	if (wa->hi->lock_descriptor == -1 && !internal_conf->in_memory) {
882 		dir_fd = open(wa->hi->hugedir, O_RDONLY);
883 		if (dir_fd < 0) {
884 			RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
885 				__func__, wa->hi->hugedir, strerror(errno));
886 			return -1;
887 		}
888 		/* blocking writelock */
889 		if (flock(dir_fd, LOCK_EX)) {
890 			RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n",
891 				__func__, wa->hi->hugedir, strerror(errno));
892 			close(dir_fd);
893 			return -1;
894 		}
895 	}
896 
897 	for (i = 0; i < need; i++, cur_idx++) {
898 		struct rte_memseg *cur;
899 		void *map_addr;
900 
901 		cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx);
902 		map_addr = RTE_PTR_ADD(cur_msl->base_va,
903 				cur_idx * page_sz);
904 
905 		if (alloc_seg(cur, map_addr, wa->socket, wa->hi,
906 				msl_idx, cur_idx)) {
907 			RTE_LOG(DEBUG, EAL, "attempted to allocate %i segments, but only %i were allocated\n",
908 				need, i);
909 
910 			/* if exact number wasn't requested, stop */
911 			if (!wa->exact)
912 				goto out;
913 
914 			/* clean up */
915 			for (j = start_idx; j < cur_idx; j++) {
916 				struct rte_memseg *tmp;
917 				struct rte_fbarray *arr =
918 						&cur_msl->memseg_arr;
919 
920 				tmp = rte_fbarray_get(arr, j);
921 				rte_fbarray_set_free(arr, j);
922 
923 				/* free_seg may attempt to create a file, which
924 				 * may fail.
925 				 */
926 				if (free_seg(tmp, wa->hi, msl_idx, j))
927 					RTE_LOG(DEBUG, EAL, "Cannot free page\n");
928 			}
929 			/* clear the list */
930 			if (wa->ms)
931 				memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs);
932 
933 			if (dir_fd >= 0)
934 				close(dir_fd);
935 			return -1;
936 		}
937 		if (wa->ms)
938 			wa->ms[i] = cur;
939 
940 		rte_fbarray_set_used(&cur_msl->memseg_arr, cur_idx);
941 	}
942 out:
943 	wa->segs_allocated = i;
944 	if (i > 0)
945 		cur_msl->version++;
946 	if (dir_fd >= 0)
947 		close(dir_fd);
948 	/* if we didn't allocate any segments, move on to the next list */
949 	return i > 0;
950 }
951 
952 struct free_walk_param {
953 	struct hugepage_info *hi;
954 	struct rte_memseg *ms;
955 };
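/*
 * Memseg list walk callback: if the segment belongs to this list, mark it
 * free and release the underlying page. Returns 1 when handled, 0 to keep
 * walking, -1 on error.
 */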
956 static int
957 free_seg_walk(const struct rte_memseg_list *msl, void *arg)
958 {
959 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
960 	struct rte_memseg_list *found_msl;
961 	struct free_walk_param *wa = arg;
962 	uintptr_t start_addr, end_addr;
963 	int msl_idx, seg_idx, ret, dir_fd = -1;
964 	const struct internal_config *internal_conf =
965 		eal_get_internal_configuration();
966 
967 	start_addr = (uintptr_t) msl->base_va;
968 	end_addr = start_addr + msl->len;
969 
970 	if ((uintptr_t)wa->ms->addr < start_addr ||
971 			(uintptr_t)wa->ms->addr >= end_addr)
972 		return 0;
973 
974 	msl_idx = msl - mcfg->memsegs;
975 	seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz;
976 
977 	/* msl is const */
978 	found_msl = &mcfg->memsegs[msl_idx];
979 
980 	/* do not allow any page allocations during the time we're freeing,
981 	 * because file creation and locking operations are not atomic,
982 	 * and we might be the first or the last ones to use a particular page,
983 	 * so we need to ensure atomicity of every operation.
984 	 *
985 	 * during init, we already hold a write lock, so don't try to take out
986 	 * another one.
987 	 */
988 	if (wa->hi->lock_descriptor == -1 && !internal_conf->in_memory) {
989 		dir_fd = open(wa->hi->hugedir, O_RDONLY);
990 		if (dir_fd < 0) {
991 			RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
992 				__func__, wa->hi->hugedir, strerror(errno));
993 			return -1;
994 		}
995 		/* blocking writelock */
996 		if (flock(dir_fd, LOCK_EX)) {
997 			RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n",
998 				__func__, wa->hi->hugedir, strerror(errno));
999 			close(dir_fd);
1000 			return -1;
1001 		}
1002 	}
1003 
1004 	found_msl->version++;
1005 
1006 	rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx);
1007 
1008 	ret = free_seg(wa->ms, wa->hi, msl_idx, seg_idx);
1009 
1010 	if (dir_fd >= 0)
1011 		close(dir_fd);
1012 
1013 	if (ret < 0)
1014 		return -1;
1015 
1016 	return 1;
1017 }
1018 
1019 int
1020 eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz,
1021 		int socket, bool exact)
1022 {
1023 	int i, ret = -1;
1024 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
1025 	bool have_numa = false;
1026 	int oldpolicy;
1027 	struct bitmask *oldmask;
1028 #endif
1029 	struct alloc_walk_param wa;
1030 	struct hugepage_info *hi = NULL;
1031 	struct internal_config *internal_conf =
1032 		eal_get_internal_configuration();
1033 
1034 	memset(&wa, 0, sizeof(wa));
1035 
1036 	/* dynamic allocation not supported in legacy mode */
1037 	if (internal_conf->legacy_mem)
1038 		return -1;
1039 
1040 	for (i = 0; i < (int) RTE_DIM(internal_conf->hugepage_info); i++) {
1041 		if (page_sz ==
1042 				internal_conf->hugepage_info[i].hugepage_sz) {
1043 			hi = &internal_conf->hugepage_info[i];
1044 			break;
1045 		}
1046 	}
1047 	if (!hi) {
1048 		RTE_LOG(ERR, EAL, "%s(): can't find relevant hugepage_info entry\n",
1049 			__func__);
1050 		return -1;
1051 	}
1052 
1053 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
1054 	if (check_numa()) {
1055 		oldmask = numa_allocate_nodemask();
1056 		prepare_numa(&oldpolicy, oldmask, socket);
1057 		have_numa = true;
1058 	}
1059 #endif
1060 
1061 	wa.exact = exact;
1062 	wa.hi = hi;
1063 	wa.ms = ms;
1064 	wa.n_segs = n_segs;
1065 	wa.page_sz = page_sz;
1066 	wa.socket = socket;
1067 	wa.segs_allocated = 0;
1068 
1069 	/* memalloc is locked, so it's safe to use thread-unsafe version */
1070 	ret = rte_memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa);
1071 	if (ret == 0) {
1072 		RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n",
1073 			__func__);
1074 		ret = -1;
1075 	} else if (ret > 0) {
1076 		ret = (int)wa.segs_allocated;
1077 	}
1078 
1079 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
1080 	if (have_numa)
1081 		restore_numa(&oldpolicy, oldmask);
1082 #endif
1083 	return ret;
1084 }
1085 
1086 struct rte_memseg *
1087 eal_memalloc_alloc_seg(size_t page_sz, int socket)
1088 {
1089 	struct rte_memseg *ms;
1090 	if (eal_memalloc_alloc_seg_bulk(&ms, 1, page_sz, socket, true) < 0)
1091 		return NULL;
1092 	/* return pointer to newly allocated memseg */
1093 	return ms;
1094 }
1095 
1096 int
1097 eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs)
1098 {
1099 	int seg, ret = 0;
1100 	struct internal_config *internal_conf =
1101 		eal_get_internal_configuration();
1102 
1103 	/* dynamic free not supported in legacy mode */
1104 	if (internal_conf->legacy_mem)
1105 		return -1;
1106 
1107 	for (seg = 0; seg < n_segs; seg++) {
1108 		struct rte_memseg *cur = ms[seg];
1109 		struct hugepage_info *hi = NULL;
1110 		struct free_walk_param wa;
1111 		int i, walk_res;
1112 
1113 		/* if this page is marked as unfreeable, fail */
1114 		if (cur->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) {
1115 			RTE_LOG(DEBUG, EAL, "Page is not allowed to be freed\n");
1116 			ret = -1;
1117 			continue;
1118 		}
1119 
1120 		memset(&wa, 0, sizeof(wa));
1121 
1122 		for (i = 0; i < (int)RTE_DIM(internal_conf->hugepage_info);
1123 				i++) {
1124 			hi = &internal_conf->hugepage_info[i];
1125 			if (cur->hugepage_sz == hi->hugepage_sz)
1126 				break;
1127 		}
1128 		if (i == (int)RTE_DIM(internal_conf->hugepage_info)) {
1129 			RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
1130 			ret = -1;
1131 			continue;
1132 		}
1133 
1134 		wa.ms = cur;
1135 		wa.hi = hi;
1136 
1137 		/* memalloc is locked, so it's safe to use thread-unsafe version
1138 		 */
1139 		walk_res = rte_memseg_list_walk_thread_unsafe(free_seg_walk,
1140 				&wa);
1141 		if (walk_res == 1)
1142 			continue;
1143 		if (walk_res == 0)
1144 			RTE_LOG(ERR, EAL, "Couldn't find memseg list\n");
1145 		ret = -1;
1146 	}
1147 	return ret;
1148 }
1149 
1150 int
1151 eal_memalloc_free_seg(struct rte_memseg *ms)
1152 {
1153 	const struct internal_config *internal_conf =
1154 		eal_get_internal_configuration();
1155 
1156 	/* dynamic free not supported in legacy mode */
1157 	if (internal_conf->legacy_mem)
1158 		return -1;
1159 
1160 	return eal_memalloc_free_seg_bulk(&ms, 1);
1161 }
1162 
1163 static int
1164 sync_chunk(struct rte_memseg_list *primary_msl,
1165 		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
1166 		unsigned int msl_idx, bool used, int start, int end)
1167 {
1168 	struct rte_fbarray *l_arr, *p_arr;
1169 	int i, ret, chunk_len, diff_len;
1170 
1171 	l_arr = &local_msl->memseg_arr;
1172 	p_arr = &primary_msl->memseg_arr;
1173 
1174 	/* we need to aggregate allocations/deallocations into bigger chunks,
1175 	 * as we don't want to spam the user with per-page callbacks.
1176 	 *
1177 	 * to avoid any potential issues, we also want to trigger
1178 	 * deallocation callbacks *before* we actually deallocate
1179 	 * memory, so that the user application can wrap up its use
1180 	 * of the memory before it goes away.
1181 	 */
1182 
1183 	chunk_len = end - start;
1184 
1185 	/* find how many contiguous pages we can map/unmap for this chunk */
1186 	diff_len = used ?
1187 			rte_fbarray_find_contig_free(l_arr, start) :
1188 			rte_fbarray_find_contig_used(l_arr, start);
1189 
1190 	/* has to be at least one page */
1191 	if (diff_len < 1)
1192 		return -1;
1193 
1194 	diff_len = RTE_MIN(chunk_len, diff_len);
1195 
1196 	/* if we are freeing memory, notify the application */
1197 	if (!used) {
1198 		struct rte_memseg *ms;
1199 		void *start_va;
1200 		size_t len, page_sz;
1201 
1202 		ms = rte_fbarray_get(l_arr, start);
1203 		start_va = ms->addr;
1204 		page_sz = (size_t)primary_msl->page_sz;
1205 		len = page_sz * diff_len;
1206 
1207 		eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE,
1208 				start_va, len);
1209 	}
1210 
1211 	for (i = 0; i < diff_len; i++) {
1212 		struct rte_memseg *p_ms, *l_ms;
1213 		int seg_idx = start + i;
1214 
1215 		l_ms = rte_fbarray_get(l_arr, seg_idx);
1216 		p_ms = rte_fbarray_get(p_arr, seg_idx);
1217 
1218 		if (l_ms == NULL || p_ms == NULL)
1219 			return -1;
1220 
1221 		if (used) {
1222 			ret = alloc_seg(l_ms, p_ms->addr,
1223 					p_ms->socket_id, hi,
1224 					msl_idx, seg_idx);
1225 			if (ret < 0)
1226 				return -1;
1227 			rte_fbarray_set_used(l_arr, seg_idx);
1228 		} else {
1229 			ret = free_seg(l_ms, hi, msl_idx, seg_idx);
1230 			rte_fbarray_set_free(l_arr, seg_idx);
1231 			if (ret < 0)
1232 				return -1;
1233 		}
1234 	}
1235 
1236 	/* if we just allocated memory, notify the application */
1237 	if (used) {
1238 		struct rte_memseg *ms;
1239 		void *start_va;
1240 		size_t len, page_sz;
1241 
1242 		ms = rte_fbarray_get(l_arr, start);
1243 		start_va = ms->addr;
1244 		page_sz = (size_t)primary_msl->page_sz;
1245 		len = page_sz * diff_len;
1246 
1247 		eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC,
1248 				start_va, len);
1249 	}
1250 
1251 	/* calculate how much we can advance until next chunk */
1252 	diff_len = used ?
1253 			rte_fbarray_find_contig_used(l_arr, start) :
1254 			rte_fbarray_find_contig_free(l_arr, start);
1255 	ret = RTE_MIN(chunk_len, diff_len);
1256 
1257 	return ret;
1258 }
1259 
1260 static int
1261 sync_status(struct rte_memseg_list *primary_msl,
1262 		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
1263 		unsigned int msl_idx, bool used)
1264 {
1265 	struct rte_fbarray *l_arr, *p_arr;
1266 	int p_idx, l_chunk_len, p_chunk_len, ret;
1267 	int start, end;
1268 
1269 	/* this is a little bit tricky, but the basic idea is - walk both lists
1270 	 * and spot any places where there are discrepancies. walking both lists
1271 	 * and noting discrepancies in a single go is a hard problem, so we do
1272 	 * it in two passes - first we spot any places where allocated segments
1273 	 * mismatch (i.e. ensure that everything that's allocated in the primary
1274 	 * is also allocated in the secondary), and then we do it by looking at
1275 	 * free segments instead.
1276 	 *
1277 	 * we also need to aggregate changes into chunks, as we have to call
1278 	 * callbacks per allocation, not per page.
1279 	 */
1280 	l_arr = &local_msl->memseg_arr;
1281 	p_arr = &primary_msl->memseg_arr;
1282 
1283 	if (used)
1284 		p_idx = rte_fbarray_find_next_used(p_arr, 0);
1285 	else
1286 		p_idx = rte_fbarray_find_next_free(p_arr, 0);
1287 
1288 	while (p_idx >= 0) {
1289 		int next_chunk_search_idx;
1290 
1291 		if (used) {
1292 			p_chunk_len = rte_fbarray_find_contig_used(p_arr,
1293 					p_idx);
1294 			l_chunk_len = rte_fbarray_find_contig_used(l_arr,
1295 					p_idx);
1296 		} else {
1297 			p_chunk_len = rte_fbarray_find_contig_free(p_arr,
1298 					p_idx);
1299 			l_chunk_len = rte_fbarray_find_contig_free(l_arr,
1300 					p_idx);
1301 		}
1302 		/* best case scenario - no differences (or the local chunk is bigger,
1303 		 * which will be fixed during the next iteration), look for next chunk
1304 		 */
1305 		if (l_chunk_len >= p_chunk_len) {
1306 			next_chunk_search_idx = p_idx + p_chunk_len;
1307 			goto next_chunk;
1308 		}
1309 
1310 		/* if both chunks start at the same point, skip parts we know
1311 		 * are identical, and sync the rest. each call to sync_chunk
1312 		 * will only sync contiguous segments, so we need to call this
1313 		 * until we are sure there are no more differences in this
1314 		 * chunk.
1315 		 */
1316 		start = p_idx + l_chunk_len;
1317 		end = p_idx + p_chunk_len;
1318 		do {
1319 			ret = sync_chunk(primary_msl, local_msl, hi, msl_idx,
1320 					used, start, end);
1321 			start += ret;
1322 		} while (start < end && ret >= 0);
1323 		/* if ret is negative, something went wrong */
1324 		if (ret < 0)
1325 			return -1;
1326 
1327 		next_chunk_search_idx = p_idx + p_chunk_len;
1328 next_chunk:
1329 		/* skip to end of this chunk */
1330 		if (used) {
1331 			p_idx = rte_fbarray_find_next_used(p_arr,
1332 					next_chunk_search_idx);
1333 		} else {
1334 			p_idx = rte_fbarray_find_next_free(p_arr,
1335 					next_chunk_search_idx);
1336 		}
1337 	}
1338 	return 0;
1339 }
1340 
1341 static int
1342 sync_existing(struct rte_memseg_list *primary_msl,
1343 		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
1344 		unsigned int msl_idx)
1345 {
1346 	int ret, dir_fd;
1347 
1348 	/* do not allow any page allocations during the time we're allocating,
1349 	 * because file creation and locking operations are not atomic,
1350 	 * and we might be the first or the last ones to use a particular page,
1351 	 * so we need to ensure atomicity of every operation.
1352 	 */
1353 	dir_fd = open(hi->hugedir, O_RDONLY);
1354 	if (dir_fd < 0) {
1355 		RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", __func__,
1356 			hi->hugedir, strerror(errno));
1357 		return -1;
1358 	}
1359 	/* blocking writelock */
1360 	if (flock(dir_fd, LOCK_EX)) {
1361 		RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", __func__,
1362 			hi->hugedir, strerror(errno));
1363 		close(dir_fd);
1364 		return -1;
1365 	}
1366 
1367 	/* ensure all allocated space is the same in both lists */
1368 	ret = sync_status(primary_msl, local_msl, hi, msl_idx, true);
1369 	if (ret < 0)
1370 		goto fail;
1371 
1372 	/* ensure all unallocated space is the same in both lists */
1373 	ret = sync_status(primary_msl, local_msl, hi, msl_idx, false);
1374 	if (ret < 0)
1375 		goto fail;
1376 
1377 	/* update version number */
1378 	local_msl->version = primary_msl->version;
1379 
1380 	close(dir_fd);
1381 
1382 	return 0;
1383 fail:
1384 	close(dir_fd);
1385 	return -1;
1386 }
1387 
1388 static int
1389 sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused)
1390 {
1391 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1392 	struct rte_memseg_list *primary_msl, *local_msl;
1393 	struct hugepage_info *hi = NULL;
1394 	unsigned int i;
1395 	int msl_idx;
1396 	struct internal_config *internal_conf =
1397 		eal_get_internal_configuration();
1398 
1399 	if (msl->external)
1400 		return 0;
1401 
1402 	msl_idx = msl - mcfg->memsegs;
1403 	primary_msl = &mcfg->memsegs[msl_idx];
1404 	local_msl = &local_memsegs[msl_idx];
1405 
1406 	for (i = 0; i < RTE_DIM(internal_conf->hugepage_info); i++) {
1407 		uint64_t cur_sz =
1408 			internal_conf->hugepage_info[i].hugepage_sz;
1409 		uint64_t msl_sz = primary_msl->page_sz;
1410 		if (msl_sz == cur_sz) {
1411 			hi = &internal_conf->hugepage_info[i];
1412 			break;
1413 		}
1414 	}
1415 	if (!hi) {
1416 		RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
1417 		return -1;
1418 	}
1419 
1420 	/* if versions don't match, synchronize everything */
1421 	if (local_msl->version != primary_msl->version &&
1422 			sync_existing(primary_msl, local_msl, hi, msl_idx))
1423 		return -1;
1424 	return 0;
1425 }
1426 
1427 
1428 int
1429 eal_memalloc_sync_with_primary(void)
1430 {
1431 	/* nothing to be done in primary */
1432 	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
1433 		return 0;
1434 
1435 	/* memalloc is locked, so it's safe to call thread-unsafe version */
1436 	if (rte_memseg_list_walk_thread_unsafe(sync_walk, NULL))
1437 		return -1;
1438 	return 0;
1439 }
1440 
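/*
 * In a secondary process, create a local shadow fbarray for each primary
 * memseg list so that hotplug state can be compared and replayed by the
 * sync code above.
 */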
1441 static int
1442 secondary_msl_create_walk(const struct rte_memseg_list *msl,
1443 		void *arg __rte_unused)
1444 {
1445 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1446 	struct rte_memseg_list *primary_msl, *local_msl;
1447 	char name[PATH_MAX];
1448 	int msl_idx, ret;
1449 
1450 	if (msl->external)
1451 		return 0;
1452 
1453 	msl_idx = msl - mcfg->memsegs;
1454 	primary_msl = &mcfg->memsegs[msl_idx];
1455 	local_msl = &local_memsegs[msl_idx];
1456 
1457 	/* create distinct fbarrays for each secondary */
1458 	snprintf(name, RTE_FBARRAY_NAME_LEN, "%s_%i",
1459 		primary_msl->memseg_arr.name, getpid());
1460 
1461 	ret = rte_fbarray_init(&local_msl->memseg_arr, name,
1462 		primary_msl->memseg_arr.len,
1463 		primary_msl->memseg_arr.elt_sz);
1464 	if (ret < 0) {
1465 		RTE_LOG(ERR, EAL, "Cannot initialize local memory map\n");
1466 		return -1;
1467 	}
1468 	local_msl->base_va = primary_msl->base_va;
1469 	local_msl->len = primary_msl->len;
1470 
1471 	return 0;
1472 }
1473 
1474 static int
1475 secondary_msl_destroy_walk(const struct rte_memseg_list *msl,
1476 		void *arg __rte_unused)
1477 {
1478 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1479 	struct rte_memseg_list *local_msl;
1480 	int msl_idx, ret;
1481 
1482 	if (msl->external)
1483 		return 0;
1484 
1485 	msl_idx = msl - mcfg->memsegs;
1486 	local_msl = &local_memsegs[msl_idx];
1487 
1488 	ret = rte_fbarray_destroy(&local_msl->memseg_arr);
1489 	if (ret < 0) {
1490 		RTE_LOG(ERR, EAL, "Cannot destroy local memory map\n");
1491 		return -1;
1492 	}
1493 	local_msl->base_va = NULL;
1494 	local_msl->len = 0;
1495 
1496 	return 0;
1497 }
1498 
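/*
 * Set up fd bookkeeping for one memseg list: a per-segment fd array in
 * file-per-page mode, or just the list-wide memseg_list_fd when using
 * single-file segments.
 */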
1499 static int
1500 alloc_list(int list_idx, int len)
1501 {
1502 	int *data;
1503 	int i;
1504 	const struct internal_config *internal_conf =
1505 		eal_get_internal_configuration();
1506 
1507 	/* single-file segments mode does not need fd list */
1508 	if (!internal_conf->single_file_segments) {
1509 		/* ensure we have space to store fd per each possible segment */
1510 		data = malloc(sizeof(int) * len);
1511 		if (data == NULL) {
1512 			RTE_LOG(ERR, EAL, "Unable to allocate space for file descriptors\n");
1513 			return -1;
1514 		}
1515 		/* set all fd's as invalid */
1516 		for (i = 0; i < len; i++)
1517 			data[i] = -1;
1518 		fd_list[list_idx].fds = data;
1519 		fd_list[list_idx].len = len;
1520 	} else {
1521 		fd_list[list_idx].fds = NULL;
1522 		fd_list[list_idx].len = 0;
1523 	}
1524 
1525 	fd_list[list_idx].count = 0;
1526 	fd_list[list_idx].memseg_list_fd = -1;
1527 
1528 	return 0;
1529 }
1530 
1531 static int
1532 destroy_list(int list_idx)
1533 {
1534 	const struct internal_config *internal_conf =
1535 			eal_get_internal_configuration();
1536 
1537 	/* single-file segments mode does not need fd list */
1538 	if (!internal_conf->single_file_segments) {
1539 		int *fds = fd_list[list_idx].fds;
1540 		int i;
1541 		/* go through each fd and ensure it's closed */
1542 		for (i = 0; i < fd_list[list_idx].len; i++) {
1543 			if (fds[i] >= 0) {
1544 				close(fds[i]);
1545 				fds[i] = -1;
1546 			}
1547 		}
1548 		free(fds);
1549 		fd_list[list_idx].fds = NULL;
1550 		fd_list[list_idx].len = 0;
1551 	} else if (fd_list[list_idx].memseg_list_fd >= 0) {
1552 		close(fd_list[list_idx].memseg_list_fd);
1553 		fd_list[list_idx].count = 0;
1554 		fd_list[list_idx].memseg_list_fd = -1;
1555 	}
1556 	return 0;
1557 }
1558 
1559 static int
1560 fd_list_create_walk(const struct rte_memseg_list *msl,
1561 		void *arg __rte_unused)
1562 {
1563 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1564 	unsigned int len;
1565 	int msl_idx;
1566 
1567 	if (msl->external)
1568 		return 0;
1569 
1570 	msl_idx = msl - mcfg->memsegs;
1571 	len = msl->memseg_arr.len;
1572 
1573 	return alloc_list(msl_idx, len);
1574 }
1575 
1576 static int
1577 fd_list_destroy_walk(const struct rte_memseg_list *msl, void *arg __rte_unused)
1578 {
1579 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1580 	int msl_idx;
1581 
1582 	if (msl->external)
1583 		return 0;
1584 
1585 	msl_idx = msl - mcfg->memsegs;
1586 
1587 	return destroy_list(msl_idx);
1588 }
1589 
1590 int
1591 eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd)
1592 {
1593 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1594 	const struct internal_config *internal_conf =
1595 		eal_get_internal_configuration();
1596 
1597 	/* single file segments mode doesn't support individual segment fd's */
1598 	if (internal_conf->single_file_segments)
1599 		return -ENOTSUP;
1600 
1601 	/* if list is not allocated, allocate it */
1602 	if (fd_list[list_idx].len == 0) {
1603 		int len = mcfg->memsegs[list_idx].memseg_arr.len;
1604 
1605 		if (alloc_list(list_idx, len) < 0)
1606 			return -ENOMEM;
1607 	}
1608 	fd_list[list_idx].fds[seg_idx] = fd;
1609 
1610 	return 0;
1611 }
1612 
1613 int
1614 eal_memalloc_set_seg_list_fd(int list_idx, int fd)
1615 {
1616 	const struct internal_config *internal_conf =
1617 		eal_get_internal_configuration();
1618 
1619 	/* non-single file segment mode doesn't support segment list fd's */
1620 	if (!internal_conf->single_file_segments)
1621 		return -ENOTSUP;
1622 
1623 	fd_list[list_idx].memseg_list_fd = fd;
1624 
1625 	return 0;
1626 }
1627 
1628 int
1629 eal_memalloc_get_seg_fd(int list_idx, int seg_idx)
1630 {
1631 	int fd;
1632 	const struct internal_config *internal_conf =
1633 		eal_get_internal_configuration();
1634 
1635 	if (internal_conf->in_memory || internal_conf->no_hugetlbfs) {
1636 #ifndef MEMFD_SUPPORTED
1637 		/* in in-memory or no-huge mode, we rely on memfd support */
1638 		return -ENOTSUP;
1639 #endif
1640 		/* memfd supported, but hugetlbfs memfd may not be */
1641 		if (!internal_conf->no_hugetlbfs && !memfd_create_supported)
1642 			return -ENOTSUP;
1643 	}
1644 
1645 	if (internal_conf->single_file_segments) {
1646 		fd = fd_list[list_idx].memseg_list_fd;
1647 	} else if (fd_list[list_idx].len == 0) {
1648 		/* list not initialized */
1649 		fd = -1;
1650 	} else {
1651 		fd = fd_list[list_idx].fds[seg_idx];
1652 	}
1653 	if (fd < 0)
1654 		return -ENODEV;
1655 	return fd;
1656 }
1657 
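/*
 * Probe whether memfd_create() accepts MFD_HUGETLB for the first configured
 * hugepage size. Returns 1 if it does, 0 if not (clearing
 * memfd_create_supported on EINVAL), and -1 on unexpected errors.
 */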
1658 static int
1659 test_memfd_create(void)
1660 {
1661 #ifdef MEMFD_SUPPORTED
1662 	const struct internal_config *internal_conf =
1663 		eal_get_internal_configuration();
1664 	unsigned int i;
1665 	for (i = 0; i < internal_conf->num_hugepage_sizes; i++) {
1666 		uint64_t pagesz = internal_conf->hugepage_info[i].hugepage_sz;
1667 		int pagesz_flag = pagesz_flags(pagesz);
1668 		int flags;
1669 
1670 		flags = pagesz_flag | RTE_MFD_HUGETLB;
1671 		int fd = memfd_create("test", flags);
1672 		if (fd < 0) {
1673 			/* we failed - let memalloc know this isn't working */
1674 			if (errno == EINVAL) {
1675 				memfd_create_supported = 0;
1676 				return 0; /* not supported */
1677 			}
1678 
1679 			/* we got some other error - something's wrong */
1680 			return -1; /* error */
1681 		}
1682 		close(fd);
1683 		return 1; /* supported */
1684 	}
1685 #endif
1686 	return 0; /* not supported */
1687 }
1688 
1689 int
1690 eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset)
1691 {
1692 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1693 	const struct internal_config *internal_conf =
1694 		eal_get_internal_configuration();
1695 
1696 	if (internal_conf->in_memory || internal_conf->no_hugetlbfs) {
1697 #ifndef MEMFD_SUPPORTED
1698 		/* in in-memory or no-huge mode, we rely on memfd support */
1699 		return -ENOTSUP;
1700 #endif
1701 		/* memfd supported, but hugetlbfs memfd may not be */
1702 		if (!internal_conf->no_hugetlbfs && !memfd_create_supported)
1703 			return -ENOTSUP;
1704 	}
1705 
1706 	if (internal_conf->single_file_segments) {
1707 		size_t pgsz = mcfg->memsegs[list_idx].page_sz;
1708 
1709 		/* segment not active? */
1710 		if (fd_list[list_idx].memseg_list_fd < 0)
1711 			return -ENOENT;
1712 		*offset = pgsz * seg_idx;
1713 	} else {
1714 		/* fd_list not initialized? */
1715 		if (fd_list[list_idx].len == 0)
1716 			return -ENODEV;
1717 
1718 		/* segment not active? */
1719 		if (fd_list[list_idx].fds[seg_idx] < 0)
1720 			return -ENOENT;
1721 		*offset = 0;
1722 	}
1723 	return 0;
1724 }
1725 
1726 int
1727 eal_memalloc_cleanup(void)
1728 {
1729 	/* close all remaining fd's - these are per-process, so it's safe */
1730 	if (rte_memseg_list_walk_thread_unsafe(fd_list_destroy_walk, NULL))
1731 		return -1;
1732 
1733 	/* destroy the shadow page table if we're a secondary process */
1734 	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
1735 		return 0;
1736 
1737 	if (rte_memseg_list_walk_thread_unsafe(secondary_msl_destroy_walk,
1738 			NULL))
1739 		return -1;
1740 
1741 	return 0;
1742 }
1743 
1744 int
1745 eal_memalloc_init(void)
1746 {
1747 	const struct internal_config *internal_conf =
1748 		eal_get_internal_configuration();
1749 
1750 	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
1751 		if (rte_memseg_list_walk(secondary_msl_create_walk, NULL) < 0)
1752 			return -1;
1753 	if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
1754 			internal_conf->in_memory) {
1755 		int mfd_res = test_memfd_create();
1756 
1757 		if (mfd_res < 0) {
1758 			RTE_LOG(ERR, EAL, "Unable to check if memfd is supported\n");
1759 			return -1;
1760 		}
1761 		if (mfd_res == 1)
1762 			RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n");
1763 		else
1764 			RTE_LOG(INFO, EAL, "Using memfd is not supported, falling back to anonymous hugepages\n");
1765 
1766 		/* we only support single-file segments mode with in-memory mode
1767 		 * if we support hugetlbfs with memfd_create; the check above
1768 		 * tells us whether we do.
1769 		 */
1770 		if (internal_conf->single_file_segments &&
1771 				mfd_res != 1) {
1772 			RTE_LOG(ERR, EAL, "Single-file segments mode cannot be used without memfd support\n");
1773 			return -1;
1774 		}
1775 		/* this cannot ever happen but better safe than sorry */
1776 		if (!anonymous_hugepages_supported) {
1777 			RTE_LOG(ERR, EAL, "Using anonymous memory is not supported\n");
1778 			return -1;
1779 		}
1780 		/* safety net, should be impossible to configure */
1781 		if (internal_conf->hugepage_file.unlink_before_mapping &&
1782 				!internal_conf->hugepage_file.unlink_existing) {
1783 			RTE_LOG(ERR, EAL, "Unlinking existing hugepage files is prohibited, cannot unlink them before mapping.\n");
1784 			return -1;
1785 		}
1786 	}
1787 
1788 	/* initialize all of the fd lists */
1789 	if (rte_memseg_list_walk(fd_list_create_walk, NULL))
1790 		return -1;
1791 	return 0;
1792 }
1793