xref: /dpdk/lib/eal/linux/eal_memalloc.c (revision 8f4611d893b4eeffb942fffdadc4cde394e4c309)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2017-2018 Intel Corporation
3  */
4 
5 #include <errno.h>
6 #include <stdbool.h>
7 #include <stdlib.h>
8 #include <stdio.h>
9 #include <stdint.h>
10 #include <string.h>
11 #include <sys/mman.h>
12 #include <sys/stat.h>
13 #include <sys/file.h>
14 #include <unistd.h>
15 #include <limits.h>
16 #include <fcntl.h>
17 #include <signal.h>
18 #include <setjmp.h>
19 #ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */
20 #include <linux/memfd.h>
21 #define MEMFD_SUPPORTED
22 #endif
23 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
24 #include <numa.h>
25 #include <numaif.h>
26 #endif
27 #include <linux/falloc.h>
28 #include <linux/mman.h> /* for hugetlb-related mmap flags */
29 
30 #include <rte_common.h>
31 #include <rte_log.h>
32 #include <rte_eal.h>
33 #include <rte_memory.h>
34 
35 #include "eal_filesystem.h"
36 #include "eal_internal_cfg.h"
37 #include "eal_memalloc.h"
38 #include "eal_memcfg.h"
39 #include "eal_private.h"
40 
41 const int anonymous_hugepages_supported =
42 #ifdef MAP_HUGE_SHIFT
43 		1;
44 #define RTE_MAP_HUGE_SHIFT MAP_HUGE_SHIFT
45 #else
46 		0;
47 #define RTE_MAP_HUGE_SHIFT 26
48 #endif
49 
50 /*
51  * we've already checked memfd support at compile-time, but we also need to
52  * check if we can create hugepage files with memfd.
53  *
54  * also, this is not a constant, because while we may be *compiled* with memfd
55  * hugetlbfs support, we might not be *running* on a system that supports memfd
56  * and/or memfd with hugetlbfs, so we need to be able to adjust this flag at
57  * runtime, and fall back to anonymous memory.
58  */
59 static int memfd_create_supported =
60 #ifdef MFD_HUGETLB
61 		1;
62 #define RTE_MFD_HUGETLB MFD_HUGETLB
63 #else
64 		0;
65 #define RTE_MFD_HUGETLB 4U
66 #endif
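
/*
 * Illustrative sketch (not compiled; see test_memfd_create() further down
 * for the probe EAL actually runs, which also encodes the page size): a
 * minimal runtime check for hugetlbfs-backed memfd. The kernel rejects
 * unknown or unsupported flags with EINVAL, which is how the fallback to
 * anonymous memory gets detected.
 */
#if 0
static int
probe_hugetlb_memfd(void)
{
	int fd = memfd_create("probe", RTE_MFD_HUGETLB);

	if (fd < 0)
		/* EINVAL means the kernel doesn't know this flag */
		return errno == EINVAL ? 0 : -1;
	close(fd);
	return 1;
}
#endif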
67 
68 /*
69  * not all kernel versions support fallocate on hugetlbfs, so fall back to
70  * ftruncate and disallow deallocation if fallocate is not supported.
71  */
72 static int fallocate_supported = -1; /* unknown */
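
/*
 * Illustrative sketch (not compiled): the two fallocate() calls used by
 * resize_hugefile_*() below. Mode 0 allocates (grows) the given range,
 * while FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE returns the backing
 * hugepages to the pool without changing the file size. ftruncate() can
 * only grow a file, which is why deallocation is disabled when fallocate()
 * reports ENOTSUP.
 */
#if 0
static int
example_resize(int fd, off_t offset, off_t len, bool grow)
{
	if (grow)
		return fallocate(fd, 0, offset, len);
	return fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			offset, len);
}
#endif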
73 
74 /*
75  * we have two modes - single file segments, and file-per-page mode.
76  *
77  * for single-file segments, we use memseg_list_fd to store the segment fd,
78  * while the fds[] will not be allocated, and len will be set to 0.
79  *
80  * for file-per-page mode, each page will have its own fd, so 'memseg_list_fd'
81  * will be invalid (set to -1), and we'll use 'fds' to keep track of page fd's.
82  *
83  * we cannot know how many pages a system will have in advance, but we do know
84  * that they come in lists, and we know lengths of these lists. so, simply store
85  * a malloc'd array of fd's indexed by list and segment index.
86  *
87  * they will be initialized at startup, and filled as we allocate/deallocate
88  * segments.
89  */
90 static struct {
91 	int *fds; /**< dynamically allocated array of segment lock fd's */
92 	int memseg_list_fd; /**< memseg list fd */
93 	int len; /**< total length of the array */
94 	int count; /**< entries used in the array */
95 } fd_list[RTE_MAX_MEMSEG_LISTS];
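
/*
 * Illustrative sketch (not compiled): how an fd is looked up in each of the
 * two modes described above. 'example_lookup_fd' is a hypothetical helper,
 * not part of EAL.
 */
#if 0
static int
example_lookup_fd(unsigned int list_idx, unsigned int seg_idx,
		bool single_file_segments)
{
	if (single_file_segments)
		/* one fd per memseg list */
		return fd_list[list_idx].memseg_list_fd;
	/* one fd per page, -1 if the page was never allocated */
	return fd_list[list_idx].fds[seg_idx];
}
#endif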
96 
97 /** local copy of a memory map, used to synchronize memory hotplug in MP */
98 static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS];
99 
100 static sigjmp_buf huge_jmpenv;
101 
102 static void huge_sigbus_handler(int signo __rte_unused)
103 {
104 	siglongjmp(huge_jmpenv, 1);
105 }
106 
107 /* Wrap sigsetjmp in a helper function to avoid clobbered-variable issues:
108  * any non-volatile, non-static local variable in the stack frame calling
109  * sigsetjmp might be clobbered by a call to longjmp.
110  */
111 static int huge_wrap_sigsetjmp(void)
112 {
113 	return sigsetjmp(huge_jmpenv, 1);
114 }
115 
116 static struct sigaction huge_action_old;
117 static int huge_need_recover;
118 
119 static void
120 huge_register_sigbus(void)
121 {
122 	sigset_t mask;
123 	struct sigaction action;
124 
125 	sigemptyset(&mask);
126 	sigaddset(&mask, SIGBUS);
127 	action.sa_flags = 0;
128 	action.sa_mask = mask;
129 	action.sa_handler = huge_sigbus_handler;
130 
131 	huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
132 }
133 
134 static void
135 huge_recover_sigbus(void)
136 {
137 	if (huge_need_recover) {
138 		sigaction(SIGBUS, &huge_action_old, NULL);
139 		huge_need_recover = 0;
140 	}
141 }
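
/*
 * Illustrative sketch (not compiled): the SIGBUS recovery pattern the three
 * helpers above implement, as used by alloc_seg() below. Hugepage
 * accounting (e.g. cgroup limits) is enforced at fault time, so touching a
 * freshly mapped page may raise SIGBUS; the handler jumps back to the
 * sigsetjmp() point instead of letting the process die. 'touch_page' is a
 * hypothetical helper, not part of EAL.
 */
#if 0
static int
touch_page(volatile int *page)
{
	huge_register_sigbus();
	if (huge_wrap_sigsetjmp()) {
		/* SIGBUS fired - the page could not be faulted in */
		huge_recover_sigbus();
		return -1;
	}
	*page = *page; /* force the page fault */
	huge_recover_sigbus();
	return 0;
}
#endif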
142 
143 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
144 static bool
145 check_numa(void)
146 {
147 	bool ret = true;
148 	/* Check if kernel supports NUMA. */
149 	if (numa_available() != 0) {
150 		EAL_LOG(DEBUG, "NUMA is not supported.");
151 		ret = false;
152 	}
153 	return ret;
154 }
155 
156 static void
157 prepare_numa(int *oldpolicy, struct bitmask *oldmask, int socket_id)
158 {
159 	EAL_LOG(DEBUG, "Trying to obtain current memory policy.");
160 	if (get_mempolicy(oldpolicy, oldmask->maskp,
161 			  oldmask->size + 1, 0, 0) < 0) {
162 		EAL_LOG(ERR,
163 			"Failed to get current mempolicy: %s. "
164 			"Assuming MPOL_DEFAULT.", strerror(errno));
165 		*oldpolicy = MPOL_DEFAULT;
166 	}
167 	EAL_LOG(DEBUG,
168 		"Setting policy MPOL_PREFERRED for socket %d",
169 		socket_id);
170 	numa_set_preferred(socket_id);
171 }
172 
173 static void
174 restore_numa(int *oldpolicy, struct bitmask *oldmask)
175 {
176 	EAL_LOG(DEBUG,
177 		"Restoring previous memory policy: %d", *oldpolicy);
178 	if (*oldpolicy == MPOL_DEFAULT) {
179 		numa_set_localalloc();
180 	} else if (set_mempolicy(*oldpolicy, oldmask->maskp,
181 				 oldmask->size + 1) < 0) {
182 		EAL_LOG(ERR, "Failed to restore mempolicy: %s",
183 			strerror(errno));
184 		numa_set_localalloc();
185 	}
186 	numa_free_cpumask(oldmask);
187 }
188 #endif
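
/*
 * Illustrative sketch (not compiled, assumes RTE_EAL_NUMA_AWARE_HUGEPAGES):
 * how the helpers above are combined around an allocation -
 * eal_memalloc_alloc_seg_bulk() below follows this sequence.
 * 'allocate_on_socket' is a hypothetical stand-in for the real allocation.
 */
#if 0
static void
example_numa_scoped_alloc(int socket_id)
{
	int oldpolicy;
	struct bitmask *oldmask;

	if (!check_numa())
		return;
	oldmask = numa_allocate_nodemask();
	prepare_numa(&oldpolicy, oldmask, socket_id);

	allocate_on_socket(socket_id); /* hypothetical */

	restore_numa(&oldpolicy, oldmask); /* also frees oldmask */
}
#endif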
189 
190 /*
191  * uses fstat to report the size of a file on disk
192  */
193 static off_t
194 get_file_size(int fd)
195 {
196 	struct stat st;
197 	if (fstat(fd, &st) < 0)
198 		return 0;
199 	return st.st_size;
200 }
201 
202 static int
203 pagesz_flags(uint64_t page_sz)
204 {
205 	/* as per mmap() manpage, all page sizes are log2 of page size
206 	 * shifted by MAP_HUGE_SHIFT
207 	 */
208 	int log2 = rte_log2_u64(page_sz);
209 	return log2 << RTE_MAP_HUGE_SHIFT;
210 }
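
/*
 * Illustrative sketch (not compiled): what pagesz_flags() produces and how
 * it is consumed. For 2 MiB pages, rte_log2_u64() yields 21, so the result
 * equals MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT); the same encoding is accepted
 * by both mmap() and memfd_create().
 */
#if 0
static void *
example_map_anon_2mb(size_t len)
{
	int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB |
			pagesz_flags(RTE_PGSIZE_2M);

	return mmap(NULL, len, PROT_READ | PROT_WRITE, flags, -1, 0);
}
#endif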
211 
212 /* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */
213 static int lock(int fd, int type)
214 {
215 	int ret;
216 
217 	/* flock may be interrupted */
218 	do {
219 		ret = flock(fd, type | LOCK_NB);
220 	} while (ret && errno == EINTR);
221 
222 	if (ret && errno == EWOULDBLOCK) {
223 		/* couldn't lock */
224 		return 0;
225 	} else if (ret) {
226 		EAL_LOG(ERR, "%s(): error calling flock(): %s",
227 			__func__, strerror(errno));
228 		return -1;
229 	}
230 	/* lock was successful */
231 	return 1;
232 }
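
/*
 * Illustrative sketch (not compiled): the locking protocol lock() supports.
 * Every process mapping a page keeps a shared lock on its hugepage file;
 * when freeing, only the last user can take the exclusive lock, and only
 * then may the file be unlinked (this is what free_seg() below does).
 */
#if 0
static void
example_last_user_unlink(int fd, const char *path)
{
	/* returns 1 only if no other process holds a shared lock */
	if (lock(fd, LOCK_EX) == 1)
		unlink(path);
	close(fd); /* closing the fd drops whatever lock we hold */
}
#endif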
233 
234 static int
235 get_seg_memfd(struct hugepage_info *hi __rte_unused,
236 		unsigned int list_idx __rte_unused,
237 		unsigned int seg_idx __rte_unused)
238 {
239 #ifdef MEMFD_SUPPORTED
240 	int fd;
241 	char segname[250]; /* as per manpage, limit is 249 bytes plus null */
242 
243 	int flags = RTE_MFD_HUGETLB | pagesz_flags(hi->hugepage_sz);
244 	const struct internal_config *internal_conf =
245 		eal_get_internal_configuration();
246 
247 	if (internal_conf->single_file_segments) {
248 		fd = fd_list[list_idx].memseg_list_fd;
249 
250 		if (fd < 0) {
251 			snprintf(segname, sizeof(segname), "seg_%i", list_idx);
252 			fd = memfd_create(segname, flags);
253 			if (fd < 0) {
254 				EAL_LOG(DEBUG, "%s(): memfd create failed: %s",
255 					__func__, strerror(errno));
256 				return -1;
257 			}
258 			fd_list[list_idx].memseg_list_fd = fd;
259 		}
260 	} else {
261 		fd = fd_list[list_idx].fds[seg_idx];
262 
263 		if (fd < 0) {
264 			snprintf(segname, sizeof(segname), "seg_%i-%i",
265 					list_idx, seg_idx);
266 			fd = memfd_create(segname, flags);
267 			if (fd < 0) {
268 				EAL_LOG(DEBUG, "%s(): memfd create failed: %s",
269 					__func__, strerror(errno));
270 				return -1;
271 			}
272 			fd_list[list_idx].fds[seg_idx] = fd;
273 		}
274 	}
275 	return fd;
276 #endif
277 	return -1;
278 }
279 
280 static int
281 get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
282 		unsigned int list_idx, unsigned int seg_idx,
283 		bool *dirty)
284 {
285 	int fd;
286 	int *out_fd;
287 	struct stat st;
288 	int ret;
289 	const struct internal_config *internal_conf =
290 		eal_get_internal_configuration();
291 
292 	if (dirty != NULL)
293 		*dirty = false;
294 
295 	/* for in-memory mode, we only make it here when we're sure we support
296 	 * memfd, and this is a special case.
297 	 */
298 	if (internal_conf->in_memory)
299 		return get_seg_memfd(hi, list_idx, seg_idx);
300 
301 	if (internal_conf->single_file_segments) {
302 		out_fd = &fd_list[list_idx].memseg_list_fd;
303 		eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx);
304 	} else {
305 		out_fd = &fd_list[list_idx].fds[seg_idx];
306 		eal_get_hugefile_path(path, buflen, hi->hugedir,
307 				list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
308 	}
309 	fd = *out_fd;
310 	if (fd >= 0)
311 		return fd;
312 
313 	/*
314 	 * There is no TOCTOU between stat() and unlink()/open()
315 	 * because the hugepage directory is locked.
316 	 */
317 	ret = stat(path, &st);
318 	if (ret < 0 && errno != ENOENT) {
319 		EAL_LOG(DEBUG, "%s(): stat() for '%s' failed: %s",
320 			__func__, path, strerror(errno));
321 		return -1;
322 	}
323 	if (!internal_conf->hugepage_file.unlink_existing && ret == 0 &&
324 			dirty != NULL)
325 		*dirty = true;
326 
327 	/*
328 	 * The kernel clears a hugepage only when it is mapped
329 	 * from a particular file for the first time.
330 	 * If the file already exists, the old content will be mapped.
331 	 * If the memory manager assumes all mapped pages to be clean,
332 	 * the file must be removed and created anew.
333 	 * Otherwise, the primary caller must be notified
334 	 * that mapped pages will be dirty
335 	 * (secondary callers receive the segment state from the primary one).
336 	 * When multiple hugepages are mapped from the same file,
337 	 * whether they will be dirty depends on the part that is mapped.
338 	 */
339 	if (!internal_conf->single_file_segments &&
340 			internal_conf->hugepage_file.unlink_existing &&
341 			rte_eal_process_type() == RTE_PROC_PRIMARY &&
342 			ret == 0) {
343 		/* coverity[toctou] */
344 		if (unlink(path) < 0) {
345 			EAL_LOG(DEBUG, "%s(): could not remove '%s': %s",
346 				__func__, path, strerror(errno));
347 			return -1;
348 		}
349 	}
350 
351 	/* coverity[toctou] */
352 	fd = open(path, O_CREAT | O_RDWR, 0600);
353 	if (fd < 0) {
354 		EAL_LOG(ERR, "%s(): open '%s' failed: %s",
355 			__func__, path, strerror(errno));
356 		return -1;
357 	}
358 	/* take out a read lock */
359 	if (lock(fd, LOCK_SH) < 0) {
360 		EAL_LOG(ERR, "%s(): lock '%s' failed: %s",
361 			__func__, path, strerror(errno));
362 		close(fd);
363 		return -1;
364 	}
365 	*out_fd = fd;
366 	return fd;
367 }
368 
369 static int
370 resize_hugefile_in_memory(int fd, uint64_t fa_offset,
371 		uint64_t page_sz, bool grow)
372 {
373 	int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
374 			FALLOC_FL_KEEP_SIZE;
375 	int ret;
376 
377 	/* grow or shrink the file */
378 	ret = fallocate(fd, flags, fa_offset, page_sz);
379 
380 	if (ret < 0) {
381 		EAL_LOG(DEBUG, "%s(): fallocate() failed: %s",
382 				__func__,
383 				strerror(errno));
384 		return -1;
385 	}
386 	return 0;
387 }
388 
389 static int
390 resize_hugefile_in_filesystem(int fd, uint64_t fa_offset, uint64_t page_sz,
391 		bool grow, bool *dirty)
392 {
393 	const struct internal_config *internal_conf =
394 			eal_get_internal_configuration();
395 	bool again = false;
396 
397 	do {
398 		if (fallocate_supported == 0) {
399 			/* we cannot deallocate memory if fallocate() is not
400 			 * supported, and hugepage file is already locked at
401 			 * creation, so no further synchronization needed.
402 			 */
403 
404 			if (!grow) {
405 				EAL_LOG(DEBUG, "%s(): fallocate not supported, not freeing page back to the system",
406 					__func__);
407 				return -1;
408 			}
409 			uint64_t new_size = fa_offset + page_sz;
410 			uint64_t cur_size = get_file_size(fd);
411 
412 			/* fallocate isn't supported, fall back to ftruncate */
413 			if (dirty != NULL)
414 				*dirty = new_size <= cur_size;
415 			if (new_size > cur_size &&
416 					ftruncate(fd, new_size) < 0) {
417 				EAL_LOG(DEBUG, "%s(): ftruncate() failed: %s",
418 					__func__, strerror(errno));
419 				return -1;
420 			}
421 		} else {
422 			int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
423 					FALLOC_FL_KEEP_SIZE;
424 			int ret;
425 
426 			/*
427 			 * technically, it is perfectly safe for both primary
428 			 * and secondary to grow and shrink the page files:
429 			 * growing the file repeatedly has no effect because
430 			 * a page can only be allocated once, while mmap ensures
431 			 * that secondaries hold on to the page even after the
432 			 * page itself is removed from the filesystem.
433 			 *
434 			 * however, leaving growing/shrinking to the primary
435 			 * tends to expose bugs in fdlist page count handling,
436 			 * so leave this here just in case.
437 			 */
438 			if (rte_eal_process_type() != RTE_PROC_PRIMARY)
439 				return 0;
440 
441 			/* grow or shrink the file */
442 			ret = fallocate(fd, flags, fa_offset, page_sz);
443 
444 			if (ret < 0) {
445 				if (fallocate_supported == -1 &&
446 						errno == ENOTSUP) {
447 					EAL_LOG(ERR, "%s(): fallocate() not supported, hugepage deallocation will be disabled",
448 						__func__);
449 					again = true;
450 					fallocate_supported = 0;
451 				} else {
452 					EAL_LOG(DEBUG, "%s(): fallocate() failed: %s",
453 						__func__,
454 						strerror(errno));
455 					return -1;
456 				}
457 			} else {
458 				fallocate_supported = 1;
459 				/*
460 				 * It is unknown which portions of an existing
461 				 * hugepage file were allocated previously,
462 				 * so all pages within the file are considered
463 				 * dirty, unless the file is a fresh one.
464 				 */
465 				if (dirty != NULL)
466 					*dirty &= !internal_conf->hugepage_file.unlink_existing;
467 			}
468 		}
469 	} while (again);
470 
471 	return 0;
472 }
473 
474 static void
475 close_hugefile(int fd, char *path, int list_idx)
476 {
477 	const struct internal_config *internal_conf =
478 		eal_get_internal_configuration();
479 	/*
480 	 * primary process must unlink the file, but only when not in in-memory
481 	 * mode (as in that case there is no file to unlink).
482 	 */
483 	if (!internal_conf->in_memory &&
484 			rte_eal_process_type() == RTE_PROC_PRIMARY &&
485 			unlink(path))
486 		EAL_LOG(ERR, "%s(): unlinking '%s' failed: %s",
487 			__func__, path, strerror(errno));
488 
489 	close(fd);
490 	fd_list[list_idx].memseg_list_fd = -1;
491 }
492 
493 static int
494 resize_hugefile(int fd, uint64_t fa_offset, uint64_t page_sz, bool grow,
495 		bool *dirty)
496 {
497 	/* in-memory mode is a special case, because we can be sure that
498 	 * fallocate() is supported.
499 	 */
500 	const struct internal_config *internal_conf =
501 		eal_get_internal_configuration();
502 
503 	if (internal_conf->in_memory) {
504 		if (dirty != NULL)
505 			*dirty = false;
506 		return resize_hugefile_in_memory(fd, fa_offset,
507 				page_sz, grow);
508 	}
509 
510 	return resize_hugefile_in_filesystem(fd, fa_offset, page_sz,
511 			grow, dirty);
512 }
513 
514 static int
515 alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
516 		struct hugepage_info *hi, unsigned int list_idx,
517 		unsigned int seg_idx)
518 {
519 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
520 	int cur_socket_id = 0;
521 #endif
522 	uint64_t map_offset;
523 	rte_iova_t iova;
524 	void *va;
525 	char path[PATH_MAX];
526 	int ret = 0;
527 	int fd;
528 	bool dirty;
529 	size_t alloc_sz;
530 	int flags;
531 	void *new_addr;
532 	const struct internal_config *internal_conf =
533 		eal_get_internal_configuration();
534 
535 	alloc_sz = hi->hugepage_sz;
536 
537 	/* these are checked at init, but code analyzers don't know that */
538 	if (internal_conf->in_memory && !anonymous_hugepages_supported) {
539 		EAL_LOG(ERR, "Anonymous hugepages not supported, in-memory mode cannot allocate memory");
540 		return -1;
541 	}
542 	if (internal_conf->in_memory && !memfd_create_supported &&
543 			internal_conf->single_file_segments) {
544 		EAL_LOG(ERR, "Single-file segments are not supported without memfd support");
545 		return -1;
546 	}
547 
548 	/* in-memory without memfd is a special case */
549 	int mmap_flags;
550 
551 	if (internal_conf->in_memory && !memfd_create_supported) {
552 		const int in_memory_flags = MAP_HUGETLB | MAP_FIXED |
553 				MAP_PRIVATE | MAP_ANONYMOUS;
554 		int pagesz_flag;
555 
556 		pagesz_flag = pagesz_flags(alloc_sz);
557 		fd = -1;
558 		dirty = false;
559 		mmap_flags = in_memory_flags | pagesz_flag;
560 
561 		/* single-file segments codepath will never be active
562 		 * here because in-memory mode is incompatible with the
563 		 * fallback path, and it's stopped at EAL initialization
564 		 * stage.
565 		 */
566 		map_offset = 0;
567 	} else {
568 		/* takes out a read lock on segment or segment list */
569 		fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx,
570 				&dirty);
571 		if (fd < 0) {
572 			EAL_LOG(ERR, "Couldn't get fd on hugepage file");
573 			return -1;
574 		}
575 
576 		if (internal_conf->single_file_segments) {
577 			map_offset = seg_idx * alloc_sz;
578 			ret = resize_hugefile(fd, map_offset, alloc_sz, true,
579 					&dirty);
580 			if (ret < 0)
581 				goto resized;
582 
583 			fd_list[list_idx].count++;
584 		} else {
585 			map_offset = 0;
586 			if (ftruncate(fd, alloc_sz) < 0) {
587 				EAL_LOG(DEBUG, "%s(): ftruncate() failed: %s",
588 					__func__, strerror(errno));
589 				goto resized;
590 			}
591 			if (internal_conf->hugepage_file.unlink_before_mapping &&
592 					!internal_conf->in_memory) {
593 				if (unlink(path)) {
594 					EAL_LOG(DEBUG, "%s(): unlink() failed: %s",
595 						__func__, strerror(errno));
596 					goto resized;
597 				}
598 			}
599 		}
600 		mmap_flags = MAP_SHARED | MAP_POPULATE | MAP_FIXED;
601 	}
602 
603 	huge_register_sigbus();
604 
605 	/*
606 	 * map the segment and populate page tables; the kernel fills
607 	 * the segment with zeros if it's a new page.
608 	 */
609 	va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, mmap_flags, fd,
610 			map_offset);
611 
612 	if (va == MAP_FAILED) {
613 		EAL_LOG(DEBUG, "%s(): mmap() failed: %s", __func__,
614 			strerror(errno));
615 		/* mmap failed, but the previous region might have been
616 		 * unmapped anyway. try to remap it
617 		 */
618 		goto unmapped;
619 	}
620 	if (va != addr) {
621 		EAL_LOG(DEBUG, "%s(): wrong mmap() address", __func__);
622 		munmap(va, alloc_sz);
623 		goto resized;
624 	}
625 
626 	/* In Linux, hugetlb limitations (such as cgroup limits) are
627 	 * enforced at fault time rather than at mmap() time, even
628 	 * with MAP_POPULATE. The kernel delivers a SIGBUS signal
629 	 * in that case. To avoid being killed, save the stack
630 	 * environment here; if SIGBUS happens, we can jump
631 	 * back to this point.
632 	 */
633 	if (huge_wrap_sigsetjmp()) {
634 		EAL_LOG(DEBUG, "SIGBUS: Cannot mmap more hugepages of size %uMB",
635 			(unsigned int)(alloc_sz >> 20));
636 		goto mapped;
637 	}
638 
639 	/* we need to trigger a write to the page to enforce the page fault
640 	 * and ensure the page is accessible to us, but we can't overwrite
641 	 * the value that is already there, so read the old value and write
642 	 * it back. the kernel populates the page with zeroes initially.
643 	 */
644 	*(volatile int *)addr = *(volatile int *)addr;
645 
646 	iova = rte_mem_virt2iova(addr);
647 	if (iova == RTE_BAD_PHYS_ADDR) {
648 		EAL_LOG(DEBUG, "%s(): can't get IOVA addr",
649 			__func__);
650 		goto mapped;
651 	}
652 
653 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
654 	/*
655 	 * If the kernel has been built without NUMA support, get_mempolicy()
656 	 * will return an error. If check_numa() returns false, memory
657 	 * allocation is not NUMA aware and the socket_id should not be
658 	 * checked.
659 	 */
660 	if (check_numa()) {
661 		ret = get_mempolicy(&cur_socket_id, NULL, 0, addr,
662 					MPOL_F_NODE | MPOL_F_ADDR);
663 		if (ret < 0) {
664 			EAL_LOG(DEBUG, "%s(): get_mempolicy: %s",
665 				__func__, strerror(errno));
666 			goto mapped;
667 		} else if (cur_socket_id != socket_id) {
668 			EAL_LOG(DEBUG,
669 					"%s(): allocation happened on wrong socket (wanted %d, got %d)",
670 				__func__, socket_id, cur_socket_id);
671 			goto mapped;
672 		}
673 	}
674 #else
675 	if (rte_socket_count() > 1)
676 		EAL_LOG(DEBUG, "%s(): not checking hugepage NUMA node.",
677 				__func__);
678 #endif
679 
680 	huge_recover_sigbus();
681 
682 	ms->addr = addr;
683 	ms->hugepage_sz = alloc_sz;
684 	ms->len = alloc_sz;
685 	ms->nchannel = rte_memory_get_nchannel();
686 	ms->nrank = rte_memory_get_nrank();
687 	ms->iova = iova;
688 	ms->socket_id = socket_id;
689 	ms->flags = dirty ? RTE_MEMSEG_FLAG_DIRTY : 0;
690 
691 	return 0;
692 
693 mapped:
694 	munmap(addr, alloc_sz);
695 unmapped:
696 	huge_recover_sigbus();
697 	flags = EAL_RESERVE_FORCE_ADDRESS;
698 	new_addr = eal_get_virtual_area(addr, &alloc_sz, alloc_sz, 0, flags);
699 	if (new_addr != addr) {
700 		if (new_addr != NULL)
701 			munmap(new_addr, alloc_sz);
702 		/* we're leaving a hole in our virtual address space. if
703 		 * somebody else maps this hole now, we could accidentally
704 		 * override it in the future.
705 		 */
706 		EAL_LOG(CRIT, "Can't mmap holes in our virtual address space");
707 	}
708 	/* roll back the ref count */
709 	if (internal_conf->single_file_segments)
710 		fd_list[list_idx].count--;
711 resized:
712 	/* some codepaths will return negative fd, so exit early */
713 	if (fd < 0)
714 		return -1;
715 
716 	if (internal_conf->single_file_segments) {
717 		resize_hugefile(fd, map_offset, alloc_sz, false, NULL);
718 		/* ignore failure, can't make it any worse */
719 
720 		/* if refcount is at zero, close the file */
721 		if (fd_list[list_idx].count == 0)
722 			close_hugefile(fd, path, list_idx);
723 	} else {
724 		/* only remove file if we can take out a write lock */
725 		if (!internal_conf->hugepage_file.unlink_before_mapping &&
726 				internal_conf->in_memory == 0 &&
727 				lock(fd, LOCK_EX) == 1)
728 			unlink(path);
729 		close(fd);
730 		fd_list[list_idx].fds[seg_idx] = -1;
731 	}
732 	return -1;
733 }
734 
735 static int
736 free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
737 		unsigned int list_idx, unsigned int seg_idx)
738 {
739 	uint64_t map_offset;
740 	char path[PATH_MAX];
741 	int fd, ret = 0;
742 	const struct internal_config *internal_conf =
743 		eal_get_internal_configuration();
744 
745 	/* erase page data */
746 	memset(ms->addr, 0, ms->len);
747 
748 	if (mmap(ms->addr, ms->len, PROT_NONE,
749 			MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) ==
750 				MAP_FAILED) {
751 		EAL_LOG(DEBUG, "couldn't unmap page");
752 		return -1;
753 	}
754 
755 	eal_mem_set_dump(ms->addr, ms->len, false);
756 
757 	/* if we're using anonymous hugepages, nothing to be done */
758 	if (internal_conf->in_memory && !memfd_create_supported) {
759 		memset(ms, 0, sizeof(*ms));
760 		return 0;
761 	}
762 
763 	/* if we are not in single file segments mode, we're going to unmap the
764 	 * segment and thus drop the lock on original fd, but hugepage dir is
765 	 * now locked so we can take out another one without races.
766 	 */
767 	fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx, NULL);
768 	if (fd < 0)
769 		return -1;
770 
771 	if (internal_conf->single_file_segments) {
772 		map_offset = seg_idx * ms->len;
773 		if (resize_hugefile(fd, map_offset, ms->len, false, NULL))
774 			return -1;
775 
776 		if (--(fd_list[list_idx].count) == 0)
777 			close_hugefile(fd, path, list_idx);
778 
779 		ret = 0;
780 	} else {
781 		/* if we're able to take out a write lock, we're the last one
782 		 * holding onto this page.
783 		 */
784 		if (!internal_conf->in_memory &&
785 				internal_conf->hugepage_file.unlink_existing &&
786 				!internal_conf->hugepage_file.unlink_before_mapping) {
787 			ret = lock(fd, LOCK_EX);
788 			if (ret >= 0) {
789 				/* no one else is using this page */
790 				if (ret == 1)
791 					unlink(path);
792 			}
793 		}
794 		/* closing fd will drop the lock */
795 		close(fd);
796 		fd_list[list_idx].fds[seg_idx] = -1;
797 	}
798 
799 	memset(ms, 0, sizeof(*ms));
800 
801 	return ret < 0 ? -1 : 0;
802 }
803 
804 struct alloc_walk_param {
805 	struct hugepage_info *hi;
806 	struct rte_memseg **ms;
807 	size_t page_sz;
808 	unsigned int segs_allocated;
809 	unsigned int n_segs;
810 	int socket;
811 	bool exact;
812 };
813 static int
814 alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
815 {
816 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
817 	struct alloc_walk_param *wa = arg;
818 	struct rte_memseg_list *cur_msl;
819 	size_t page_sz;
820 	int cur_idx, start_idx, j, dir_fd = -1;
821 	unsigned int msl_idx, need, i;
822 	const struct internal_config *internal_conf =
823 		eal_get_internal_configuration();
824 
825 	if (msl->page_sz != wa->page_sz)
826 		return 0;
827 	if (msl->socket_id != wa->socket)
828 		return 0;
829 
830 	page_sz = (size_t)msl->page_sz;
831 
832 	msl_idx = msl - mcfg->memsegs;
833 	cur_msl = &mcfg->memsegs[msl_idx];
834 
835 	need = wa->n_segs;
836 
837 	/* try finding space in memseg list */
838 	if (wa->exact) {
839 		/* if we require exact number of pages in a list, find them */
840 		cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0,
841 				need);
842 		if (cur_idx < 0)
843 			return 0;
844 		start_idx = cur_idx;
845 	} else {
846 		int cur_len;
847 
848 		/* we don't require exact number of pages, so we're going to go
849 		 * for best-effort allocation. that means finding the biggest
850 		 * unused block, and going with that.
851 		 */
852 		cur_idx = rte_fbarray_find_biggest_free(&cur_msl->memseg_arr,
853 				0);
854 		if (cur_idx < 0)
855 			return 0;
856 		start_idx = cur_idx;
857 		/* adjust the size to possibly be smaller than original
858 		 * request, but do not allow it to be bigger.
859 		 */
860 		cur_len = rte_fbarray_find_contig_free(&cur_msl->memseg_arr,
861 				cur_idx);
862 		need = RTE_MIN(need, (unsigned int)cur_len);
863 	}
864 
865 	/* do not allow any page allocations during the time we're allocating,
866 	 * because file creation and locking operations are not atomic,
867 	 * and we might be the first or the last ones to use a particular page,
868 	 * so we need to ensure atomicity of every operation.
869 	 *
870 	 * during init, we already hold a write lock, so don't try to take out
871 	 * another one.
872 	 */
873 	if (wa->hi->lock_descriptor == -1 && !internal_conf->in_memory) {
874 		dir_fd = open(wa->hi->hugedir, O_RDONLY);
875 		if (dir_fd < 0) {
876 			EAL_LOG(ERR, "%s(): Cannot open '%s': %s",
877 				__func__, wa->hi->hugedir, strerror(errno));
878 			return -1;
879 		}
880 		/* blocking writelock */
881 		if (flock(dir_fd, LOCK_EX)) {
882 			EAL_LOG(ERR, "%s(): Cannot lock '%s': %s",
883 				__func__, wa->hi->hugedir, strerror(errno));
884 			close(dir_fd);
885 			return -1;
886 		}
887 	}
888 
889 	for (i = 0; i < need; i++, cur_idx++) {
890 		struct rte_memseg *cur;
891 		void *map_addr;
892 
893 		cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx);
894 		map_addr = RTE_PTR_ADD(cur_msl->base_va,
895 				cur_idx * page_sz);
896 
897 		if (alloc_seg(cur, map_addr, wa->socket, wa->hi,
898 				msl_idx, cur_idx)) {
899 			EAL_LOG(DEBUG, "attempted to allocate %i segments, but only %i were allocated",
900 				need, i);
901 
902 			/* if exact number wasn't requested, stop */
903 			if (!wa->exact)
904 				goto out;
905 
906 			/* clean up */
907 			for (j = start_idx; j < cur_idx; j++) {
908 				struct rte_memseg *tmp;
909 				struct rte_fbarray *arr =
910 						&cur_msl->memseg_arr;
911 
912 				tmp = rte_fbarray_get(arr, j);
913 				rte_fbarray_set_free(arr, j);
914 
915 				/* free_seg may attempt to create a file, which
916 				 * may fail.
917 				 */
918 				if (free_seg(tmp, wa->hi, msl_idx, j))
919 					EAL_LOG(DEBUG, "Cannot free page");
920 			}
921 			/* clear the list */
922 			if (wa->ms)
923 				memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs);
924 
925 			if (dir_fd >= 0)
926 				close(dir_fd);
927 			return -1;
928 		}
929 		if (wa->ms)
930 			wa->ms[i] = cur;
931 
932 		rte_fbarray_set_used(&cur_msl->memseg_arr, cur_idx);
933 	}
934 out:
935 	wa->segs_allocated = i;
936 	if (i > 0)
937 		cur_msl->version++;
938 	if (dir_fd >= 0)
939 		close(dir_fd);
940 	/* if we didn't allocate any segments, move on to the next list */
941 	return i > 0;
942 }
943 
944 struct free_walk_param {
945 	struct hugepage_info *hi;
946 	struct rte_memseg *ms;
947 };
948 static int
949 free_seg_walk(const struct rte_memseg_list *msl, void *arg)
950 {
951 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
952 	struct rte_memseg_list *found_msl;
953 	struct free_walk_param *wa = arg;
954 	uintptr_t start_addr, end_addr;
955 	int msl_idx, seg_idx, ret, dir_fd = -1;
956 	const struct internal_config *internal_conf =
957 		eal_get_internal_configuration();
958 
959 	start_addr = (uintptr_t) msl->base_va;
960 	end_addr = start_addr + msl->len;
961 
962 	if ((uintptr_t)wa->ms->addr < start_addr ||
963 			(uintptr_t)wa->ms->addr >= end_addr)
964 		return 0;
965 
966 	msl_idx = msl - mcfg->memsegs;
967 	seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz;
968 
969 	/* msl is const */
970 	found_msl = &mcfg->memsegs[msl_idx];
971 
972 	/* do not allow any page allocations during the time we're freeing,
973 	 * because file creation and locking operations are not atomic,
974 	 * and we might be the first or the last ones to use a particular page,
975 	 * so we need to ensure atomicity of every operation.
976 	 *
977 	 * during init, we already hold a write lock, so don't try to take out
978 	 * another one.
979 	 */
980 	if (wa->hi->lock_descriptor == -1 && !internal_conf->in_memory) {
981 		dir_fd = open(wa->hi->hugedir, O_RDONLY);
982 		if (dir_fd < 0) {
983 			EAL_LOG(ERR, "%s(): Cannot open '%s': %s",
984 				__func__, wa->hi->hugedir, strerror(errno));
985 			return -1;
986 		}
987 		/* blocking writelock */
988 		if (flock(dir_fd, LOCK_EX)) {
989 			EAL_LOG(ERR, "%s(): Cannot lock '%s': %s",
990 				__func__, wa->hi->hugedir, strerror(errno));
991 			close(dir_fd);
992 			return -1;
993 		}
994 	}
995 
996 	found_msl->version++;
997 
998 	rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx);
999 
1000 	ret = free_seg(wa->ms, wa->hi, msl_idx, seg_idx);
1001 
1002 	if (dir_fd >= 0)
1003 		close(dir_fd);
1004 
1005 	if (ret < 0)
1006 		return -1;
1007 
1008 	return 1;
1009 }
1010 
1011 int
1012 eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz,
1013 		int socket, bool exact)
1014 {
1015 	int i, ret = -1;
1016 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
1017 	bool have_numa = false;
1018 	int oldpolicy;
1019 	struct bitmask *oldmask;
1020 #endif
1021 	struct alloc_walk_param wa;
1022 	struct hugepage_info *hi = NULL;
1023 	struct internal_config *internal_conf =
1024 		eal_get_internal_configuration();
1025 
1026 	memset(&wa, 0, sizeof(wa));
1027 
1028 	/* dynamic allocation not supported in legacy mode */
1029 	if (internal_conf->legacy_mem)
1030 		return -1;
1031 
1032 	for (i = 0; i < (int) RTE_DIM(internal_conf->hugepage_info); i++) {
1033 		if (page_sz ==
1034 				internal_conf->hugepage_info[i].hugepage_sz) {
1035 			hi = &internal_conf->hugepage_info[i];
1036 			break;
1037 		}
1038 	}
1039 	if (!hi) {
1040 		EAL_LOG(ERR, "%s(): can't find relevant hugepage_info entry",
1041 			__func__);
1042 		return -1;
1043 	}
1044 
1045 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
1046 	if (check_numa()) {
1047 		oldmask = numa_allocate_nodemask();
1048 		prepare_numa(&oldpolicy, oldmask, socket);
1049 		have_numa = true;
1050 	}
1051 #endif
1052 
1053 	wa.exact = exact;
1054 	wa.hi = hi;
1055 	wa.ms = ms;
1056 	wa.n_segs = n_segs;
1057 	wa.page_sz = page_sz;
1058 	wa.socket = socket;
1059 	wa.segs_allocated = 0;
1060 
1061 	/* memalloc is locked, so it's safe to use thread-unsafe version */
1062 	ret = rte_memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa);
1063 	if (ret == 0) {
1064 		EAL_LOG(DEBUG, "%s(): couldn't find suitable memseg_list",
1065 			__func__);
1066 		ret = -1;
1067 	} else if (ret > 0) {
1068 		ret = (int)wa.segs_allocated;
1069 	}
1070 
1071 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
1072 	if (have_numa)
1073 		restore_numa(&oldpolicy, oldmask);
1074 #endif
1075 	return ret;
1076 }
1077 
1078 struct rte_memseg *
1079 eal_memalloc_alloc_seg(size_t page_sz, int socket)
1080 {
1081 	struct rte_memseg *ms;
1082 	if (eal_memalloc_alloc_seg_bulk(&ms, 1, page_sz, socket, true) < 0)
1083 		return NULL;
1084 	/* return pointer to newly allocated memseg */
1085 	return ms;
1086 }
1087 
1088 int
1089 eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs)
1090 {
1091 	int seg, ret = 0;
1092 	struct internal_config *internal_conf =
1093 		eal_get_internal_configuration();
1094 
1095 	/* dynamic free not supported in legacy mode */
1096 	if (internal_conf->legacy_mem)
1097 		return -1;
1098 
1099 	for (seg = 0; seg < n_segs; seg++) {
1100 		struct rte_memseg *cur = ms[seg];
1101 		struct hugepage_info *hi = NULL;
1102 		struct free_walk_param wa;
1103 		int i, walk_res;
1104 
1105 		/* if this page is marked as unfreeable, fail */
1106 		if (cur->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) {
1107 			EAL_LOG(DEBUG, "Page is not allowed to be freed");
1108 			ret = -1;
1109 			continue;
1110 		}
1111 
1112 		memset(&wa, 0, sizeof(wa));
1113 
1114 		for (i = 0; i < (int)RTE_DIM(internal_conf->hugepage_info);
1115 				i++) {
1116 			hi = &internal_conf->hugepage_info[i];
1117 			if (cur->hugepage_sz == hi->hugepage_sz)
1118 				break;
1119 		}
1120 		if (i == (int)RTE_DIM(internal_conf->hugepage_info)) {
1121 			EAL_LOG(ERR, "Can't find relevant hugepage_info entry");
1122 			ret = -1;
1123 			continue;
1124 		}
1125 
1126 		wa.ms = cur;
1127 		wa.hi = hi;
1128 
1129 		/* memalloc is locked, so it's safe to use thread-unsafe version
1130 		 */
1131 		walk_res = rte_memseg_list_walk_thread_unsafe(free_seg_walk,
1132 				&wa);
1133 		if (walk_res == 1)
1134 			continue;
1135 		if (walk_res == 0)
1136 			EAL_LOG(ERR, "Couldn't find memseg list");
1137 		ret = -1;
1138 	}
1139 	return ret;
1140 }
1141 
1142 int
1143 eal_memalloc_free_seg(struct rte_memseg *ms)
1144 {
1145 	const struct internal_config *internal_conf =
1146 		eal_get_internal_configuration();
1147 
1148 	/* dynamic free not supported in legacy mode */
1149 	if (internal_conf->legacy_mem)
1150 		return -1;
1151 
1152 	return eal_memalloc_free_seg_bulk(&ms, 1);
1153 }
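
/*
 * Illustrative sketch (not compiled): the call sequence these EAL-internal
 * entry points give the rest of EAL - allocate a single page of a given
 * size on a socket, then give it back. These are not public DPDK APIs.
 */
#if 0
static int
example_alloc_and_free(size_t page_sz, int socket)
{
	struct rte_memseg *ms = eal_memalloc_alloc_seg(page_sz, socket);

	if (ms == NULL)
		return -1;
	return eal_memalloc_free_seg(ms);
}
#endif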
1154 
1155 static int
1156 sync_chunk(struct rte_memseg_list *primary_msl,
1157 		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
1158 		unsigned int msl_idx, bool used, int start, int end)
1159 {
1160 	struct rte_fbarray *l_arr, *p_arr;
1161 	int i, ret, chunk_len, diff_len;
1162 
1163 	l_arr = &local_msl->memseg_arr;
1164 	p_arr = &primary_msl->memseg_arr;
1165 
1166 	/* we need to aggregate allocations/deallocations into bigger chunks,
1167 	 * as we don't want to spam the user with per-page callbacks.
1168 	 *
1169 	 * to avoid any potential issues, we also want to trigger
1170 	 * deallocation callbacks *before* we actually deallocate
1171 	 * memory, so that the user application could wrap up its use
1172 	 * before it goes away.
1173 	 */
1174 
1175 	chunk_len = end - start;
1176 
1177 	/* find how many contiguous pages we can map/unmap for this chunk */
1178 	diff_len = used ?
1179 			rte_fbarray_find_contig_free(l_arr, start) :
1180 			rte_fbarray_find_contig_used(l_arr, start);
1181 
1182 	/* has to be at least one page */
1183 	if (diff_len < 1)
1184 		return -1;
1185 
1186 	diff_len = RTE_MIN(chunk_len, diff_len);
1187 
1188 	/* if we are freeing memory, notify the application */
1189 	if (!used) {
1190 		struct rte_memseg *ms;
1191 		void *start_va;
1192 		size_t len, page_sz;
1193 
1194 		ms = rte_fbarray_get(l_arr, start);
1195 		start_va = ms->addr;
1196 		page_sz = (size_t)primary_msl->page_sz;
1197 		len = page_sz * diff_len;
1198 
1199 		eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE,
1200 				start_va, len);
1201 	}
1202 
1203 	for (i = 0; i < diff_len; i++) {
1204 		struct rte_memseg *p_ms, *l_ms;
1205 		int seg_idx = start + i;
1206 
1207 		l_ms = rte_fbarray_get(l_arr, seg_idx);
1208 		p_ms = rte_fbarray_get(p_arr, seg_idx);
1209 
1210 		if (l_ms == NULL || p_ms == NULL)
1211 			return -1;
1212 
1213 		if (used) {
1214 			ret = alloc_seg(l_ms, p_ms->addr,
1215 					p_ms->socket_id, hi,
1216 					msl_idx, seg_idx);
1217 			if (ret < 0)
1218 				return -1;
1219 			rte_fbarray_set_used(l_arr, seg_idx);
1220 		} else {
1221 			ret = free_seg(l_ms, hi, msl_idx, seg_idx);
1222 			rte_fbarray_set_free(l_arr, seg_idx);
1223 			if (ret < 0)
1224 				return -1;
1225 		}
1226 	}
1227 
1228 	/* if we just allocated memory, notify the application */
1229 	if (used) {
1230 		struct rte_memseg *ms;
1231 		void *start_va;
1232 		size_t len, page_sz;
1233 
1234 		ms = rte_fbarray_get(l_arr, start);
1235 		start_va = ms->addr;
1236 		page_sz = (size_t)primary_msl->page_sz;
1237 		len = page_sz * diff_len;
1238 
1239 		eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC,
1240 				start_va, len);
1241 	}
1242 
1243 	/* calculate how much we can advance until next chunk */
1244 	diff_len = used ?
1245 			rte_fbarray_find_contig_used(l_arr, start) :
1246 			rte_fbarray_find_contig_free(l_arr, start);
1247 	ret = RTE_MIN(chunk_len, diff_len);
1248 
1249 	return ret;
1250 }
1251 
1252 static int
1253 sync_status(struct rte_memseg_list *primary_msl,
1254 		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
1255 		unsigned int msl_idx, bool used)
1256 {
1257 	struct rte_fbarray *l_arr, *p_arr;
1258 	int p_idx, l_chunk_len, p_chunk_len, ret;
1259 	int start, end;
1260 
1261 	/* this is a little bit tricky, but the basic idea is - walk both lists
1262 	 * and spot any places where there are discrepancies. walking both lists
1263 	 * and noting discrepancies in a single go is a hard problem, so we do
1264 	 * it in two passes - first we spot any places where allocated segments
1265 	 * mismatch (i.e. ensure that everything that's allocated in the primary
1266 	 * is also allocated in the secondary), and then we do it by looking at
1267 	 * free segments instead.
1268 	 *
1269 	 * we also need to aggregate changes into chunks, as we have to call
1270 	 * callbacks per allocation, not per page.
1271 	 */
1272 	l_arr = &local_msl->memseg_arr;
1273 	p_arr = &primary_msl->memseg_arr;
1274 
1275 	if (used)
1276 		p_idx = rte_fbarray_find_next_used(p_arr, 0);
1277 	else
1278 		p_idx = rte_fbarray_find_next_free(p_arr, 0);
1279 
1280 	while (p_idx >= 0) {
1281 		int next_chunk_search_idx;
1282 
1283 		if (used) {
1284 			p_chunk_len = rte_fbarray_find_contig_used(p_arr,
1285 					p_idx);
1286 			l_chunk_len = rte_fbarray_find_contig_used(l_arr,
1287 					p_idx);
1288 		} else {
1289 			p_chunk_len = rte_fbarray_find_contig_free(p_arr,
1290 					p_idx);
1291 			l_chunk_len = rte_fbarray_find_contig_free(l_arr,
1292 					p_idx);
1293 		}
1294 		/* best case scenario - no differences (or bigger, which will be
1295 		 * fixed during next iteration), look for next chunk
1296 		 */
1297 		if (l_chunk_len >= p_chunk_len) {
1298 			next_chunk_search_idx = p_idx + p_chunk_len;
1299 			goto next_chunk;
1300 		}
1301 
1302 		/* if both chunks start at the same point, skip parts we know
1303 		 * are identical, and sync the rest. each call to sync_chunk
1304 		 * will only sync contiguous segments, so we need to call this
1305 		 * until we are sure there are no more differences in this
1306 		 * chunk.
1307 		 */
1308 		start = p_idx + l_chunk_len;
1309 		end = p_idx + p_chunk_len;
1310 		do {
1311 			ret = sync_chunk(primary_msl, local_msl, hi, msl_idx,
1312 					used, start, end);
1313 			start += ret;
1314 		} while (start < end && ret >= 0);
1315 		/* if ret is negative, something went wrong */
1316 		if (ret < 0)
1317 			return -1;
1318 
1319 		next_chunk_search_idx = p_idx + p_chunk_len;
1320 next_chunk:
1321 		/* skip to end of this chunk */
1322 		if (used) {
1323 			p_idx = rte_fbarray_find_next_used(p_arr,
1324 					next_chunk_search_idx);
1325 		} else {
1326 			p_idx = rte_fbarray_find_next_free(p_arr,
1327 					next_chunk_search_idx);
1328 		}
1329 	}
1330 	return 0;
1331 }
1332 
1333 static int
1334 sync_existing(struct rte_memseg_list *primary_msl,
1335 		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
1336 		unsigned int msl_idx)
1337 {
1338 	int ret, dir_fd;
1339 
1340 	/* do not allow any page allocations during the time we're allocating,
1341 	 * because file creation and locking operations are not atomic,
1342 	 * and we might be the first or the last ones to use a particular page,
1343 	 * so we need to ensure atomicity of every operation.
1344 	 */
1345 	dir_fd = open(hi->hugedir, O_RDONLY);
1346 	if (dir_fd < 0) {
1347 		EAL_LOG(ERR, "%s(): Cannot open '%s': %s", __func__,
1348 			hi->hugedir, strerror(errno));
1349 		return -1;
1350 	}
1351 	/* blocking writelock */
1352 	if (flock(dir_fd, LOCK_EX)) {
1353 		EAL_LOG(ERR, "%s(): Cannot lock '%s': %s", __func__,
1354 			hi->hugedir, strerror(errno));
1355 		close(dir_fd);
1356 		return -1;
1357 	}
1358 
1359 	/* ensure all allocated space is the same in both lists */
1360 	ret = sync_status(primary_msl, local_msl, hi, msl_idx, true);
1361 	if (ret < 0)
1362 		goto fail;
1363 
1364 	/* ensure all unallocated space is the same in both lists */
1365 	ret = sync_status(primary_msl, local_msl, hi, msl_idx, false);
1366 	if (ret < 0)
1367 		goto fail;
1368 
1369 	/* update version number */
1370 	local_msl->version = primary_msl->version;
1371 
1372 	close(dir_fd);
1373 
1374 	return 0;
1375 fail:
1376 	close(dir_fd);
1377 	return -1;
1378 }
1379 
1380 static int
1381 sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused)
1382 {
1383 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1384 	struct rte_memseg_list *primary_msl, *local_msl;
1385 	struct hugepage_info *hi = NULL;
1386 	unsigned int i;
1387 	int msl_idx;
1388 	struct internal_config *internal_conf =
1389 		eal_get_internal_configuration();
1390 
1391 	if (msl->external)
1392 		return 0;
1393 
1394 	msl_idx = msl - mcfg->memsegs;
1395 	primary_msl = &mcfg->memsegs[msl_idx];
1396 	local_msl = &local_memsegs[msl_idx];
1397 
1398 	for (i = 0; i < RTE_DIM(internal_conf->hugepage_info); i++) {
1399 		uint64_t cur_sz =
1400 			internal_conf->hugepage_info[i].hugepage_sz;
1401 		uint64_t msl_sz = primary_msl->page_sz;
1402 		if (msl_sz == cur_sz) {
1403 			hi = &internal_conf->hugepage_info[i];
1404 			break;
1405 		}
1406 	}
1407 	if (!hi) {
1408 		EAL_LOG(ERR, "Can't find relevant hugepage_info entry");
1409 		return -1;
1410 	}
1411 
1412 	/* if versions don't match, synchronize everything */
1413 	if (local_msl->version != primary_msl->version &&
1414 			sync_existing(primary_msl, local_msl, hi, msl_idx))
1415 		return -1;
1416 	return 0;
1417 }
1418 
1419 
1420 int
1421 eal_memalloc_sync_with_primary(void)
1422 {
1423 	/* nothing to be done in primary */
1424 	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
1425 		return 0;
1426 
1427 	/* memalloc is locked, so it's safe to call thread-unsafe version */
1428 	if (rte_memseg_list_walk_thread_unsafe(sync_walk, NULL))
1429 		return -1;
1430 	return 0;
1431 }
1432 
1433 static int
1434 secondary_msl_create_walk(const struct rte_memseg_list *msl,
1435 		void *arg __rte_unused)
1436 {
1437 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1438 	struct rte_memseg_list *primary_msl, *local_msl;
1439 	char name[PATH_MAX];
1440 	int msl_idx, ret;
1441 
1442 	if (msl->external)
1443 		return 0;
1444 
1445 	msl_idx = msl - mcfg->memsegs;
1446 	primary_msl = &mcfg->memsegs[msl_idx];
1447 	local_msl = &local_memsegs[msl_idx];
1448 
1449 	/* create distinct fbarrays for each secondary */
1450 	snprintf(name, RTE_FBARRAY_NAME_LEN, "%s_%i",
1451 		primary_msl->memseg_arr.name, getpid());
1452 
1453 	ret = rte_fbarray_init(&local_msl->memseg_arr, name,
1454 		primary_msl->memseg_arr.len,
1455 		primary_msl->memseg_arr.elt_sz);
1456 	if (ret < 0) {
1457 		EAL_LOG(ERR, "Cannot initialize local memory map");
1458 		return -1;
1459 	}
1460 	local_msl->base_va = primary_msl->base_va;
1461 	local_msl->len = primary_msl->len;
1462 
1463 	return 0;
1464 }
1465 
1466 static int
1467 secondary_msl_destroy_walk(const struct rte_memseg_list *msl,
1468 		void *arg __rte_unused)
1469 {
1470 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1471 	struct rte_memseg_list *local_msl;
1472 	int msl_idx, ret;
1473 
1474 	if (msl->external)
1475 		return 0;
1476 
1477 	msl_idx = msl - mcfg->memsegs;
1478 	local_msl = &local_memsegs[msl_idx];
1479 
1480 	ret = rte_fbarray_destroy(&local_msl->memseg_arr);
1481 	if (ret < 0) {
1482 		EAL_LOG(ERR, "Cannot destroy local memory map");
1483 		return -1;
1484 	}
1485 	local_msl->base_va = NULL;
1486 	local_msl->len = 0;
1487 
1488 	return 0;
1489 }
1490 
1491 static int
1492 alloc_list(int list_idx, int len)
1493 {
1494 	int *data;
1495 	int i;
1496 	const struct internal_config *internal_conf =
1497 		eal_get_internal_configuration();
1498 
1499 	/* single-file segments mode does not need fd list */
1500 	if (!internal_conf->single_file_segments) {
1501 		/* ensure we have space to store fd per each possible segment */
1502 		data = malloc(sizeof(int) * len);
1503 		if (data == NULL) {
1504 			EAL_LOG(ERR, "Unable to allocate space for file descriptors");
1505 			return -1;
1506 		}
1507 		/* set all fd's as invalid */
1508 		for (i = 0; i < len; i++)
1509 			data[i] = -1;
1510 		fd_list[list_idx].fds = data;
1511 		fd_list[list_idx].len = len;
1512 	} else {
1513 		fd_list[list_idx].fds = NULL;
1514 		fd_list[list_idx].len = 0;
1515 	}
1516 
1517 	fd_list[list_idx].count = 0;
1518 	fd_list[list_idx].memseg_list_fd = -1;
1519 
1520 	return 0;
1521 }
1522 
1523 static int
1524 destroy_list(int list_idx)
1525 {
1526 	const struct internal_config *internal_conf =
1527 			eal_get_internal_configuration();
1528 
1529 	/* single-file segments mode does not need fd list */
1530 	if (!internal_conf->single_file_segments) {
1531 		int *fds = fd_list[list_idx].fds;
1532 		int i;
1533 		/* go through each fd and ensure it's closed */
1534 		for (i = 0; i < fd_list[list_idx].len; i++) {
1535 			if (fds[i] >= 0) {
1536 				close(fds[i]);
1537 				fds[i] = -1;
1538 			}
1539 		}
1540 		free(fds);
1541 		fd_list[list_idx].fds = NULL;
1542 		fd_list[list_idx].len = 0;
1543 	} else if (fd_list[list_idx].memseg_list_fd >= 0) {
1544 		close(fd_list[list_idx].memseg_list_fd);
1545 		fd_list[list_idx].count = 0;
1546 		fd_list[list_idx].memseg_list_fd = -1;
1547 	}
1548 	return 0;
1549 }
1550 
1551 static int
1552 fd_list_create_walk(const struct rte_memseg_list *msl,
1553 		void *arg __rte_unused)
1554 {
1555 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1556 	unsigned int len;
1557 	int msl_idx;
1558 
1559 	if (msl->external)
1560 		return 0;
1561 
1562 	msl_idx = msl - mcfg->memsegs;
1563 	len = msl->memseg_arr.len;
1564 
1565 	return alloc_list(msl_idx, len);
1566 }
1567 
1568 static int
1569 fd_list_destroy_walk(const struct rte_memseg_list *msl, void *arg __rte_unused)
1570 {
1571 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1572 	int msl_idx;
1573 
1574 	if (msl->external)
1575 		return 0;
1576 
1577 	msl_idx = msl - mcfg->memsegs;
1578 
1579 	return destroy_list(msl_idx);
1580 }
1581 
1582 int
1583 eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd)
1584 {
1585 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1586 	const struct internal_config *internal_conf =
1587 		eal_get_internal_configuration();
1588 
1589 	/* single file segments mode doesn't support individual segment fd's */
1590 	if (internal_conf->single_file_segments)
1591 		return -ENOTSUP;
1592 
1593 	/* if list is not allocated, allocate it */
1594 	if (fd_list[list_idx].len == 0) {
1595 		int len = mcfg->memsegs[list_idx].memseg_arr.len;
1596 
1597 		if (alloc_list(list_idx, len) < 0)
1598 			return -ENOMEM;
1599 	}
1600 	fd_list[list_idx].fds[seg_idx] = fd;
1601 
1602 	return 0;
1603 }
1604 
1605 int
1606 eal_memalloc_set_seg_list_fd(int list_idx, int fd)
1607 {
1608 	const struct internal_config *internal_conf =
1609 		eal_get_internal_configuration();
1610 
1611 	/* non-single file segment mode doesn't support segment list fd's */
1612 	if (!internal_conf->single_file_segments)
1613 		return -ENOTSUP;
1614 
1615 	fd_list[list_idx].memseg_list_fd = fd;
1616 
1617 	return 0;
1618 }
1619 
1620 int
1621 eal_memalloc_get_seg_fd(int list_idx, int seg_idx)
1622 {
1623 	int fd;
1624 	const struct internal_config *internal_conf =
1625 		eal_get_internal_configuration();
1626 
1627 	if (internal_conf->in_memory || internal_conf->no_hugetlbfs) {
1628 #ifndef MEMFD_SUPPORTED
1629 		/* in in-memory or no-huge mode, we rely on memfd support */
1630 		return -ENOTSUP;
1631 #endif
1632 		/* memfd supported, but hugetlbfs memfd may not be */
1633 		if (!internal_conf->no_hugetlbfs && !memfd_create_supported)
1634 			return -ENOTSUP;
1635 	}
1636 
1637 	if (internal_conf->single_file_segments) {
1638 		fd = fd_list[list_idx].memseg_list_fd;
1639 	} else if (fd_list[list_idx].len == 0) {
1640 		/* list not initialized */
1641 		fd = -1;
1642 	} else {
1643 		fd = fd_list[list_idx].fds[seg_idx];
1644 	}
1645 	if (fd < 0)
1646 		return -ENODEV;
1647 	return fd;
1648 }
1649 
1650 static int
1651 test_memfd_create(void)
1652 {
1653 #ifdef MEMFD_SUPPORTED
1654 	const struct internal_config *internal_conf =
1655 		eal_get_internal_configuration();
1656 	unsigned int i;
1657 	for (i = 0; i < internal_conf->num_hugepage_sizes; i++) {
1658 		uint64_t pagesz = internal_conf->hugepage_info[i].hugepage_sz;
1659 		int pagesz_flag = pagesz_flags(pagesz);
1660 		int flags;
1661 
1662 		flags = pagesz_flag | RTE_MFD_HUGETLB;
1663 		int fd = memfd_create("test", flags);
1664 		if (fd < 0) {
1665 			/* we failed - let memalloc know this isn't working */
1666 			if (errno == EINVAL) {
1667 				memfd_create_supported = 0;
1668 				return 0; /* not supported */
1669 			}
1670 
1671 			/* we got other error - something's wrong */
1672 			return -1; /* error */
1673 		}
1674 		close(fd);
1675 		return 1; /* supported */
1676 	}
1677 #endif
1678 	return 0; /* not supported */
1679 }
1680 
1681 int
1682 eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset)
1683 {
1684 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1685 	const struct internal_config *internal_conf =
1686 		eal_get_internal_configuration();
1687 
1688 	if (internal_conf->in_memory || internal_conf->no_hugetlbfs) {
1689 #ifndef MEMFD_SUPPORTED
1690 		/* in in-memory or no-huge mode, we rely on memfd support */
1691 		return -ENOTSUP;
1692 #endif
1693 		/* memfd supported, but hugetlbfs memfd may not be */
1694 		if (!internal_conf->no_hugetlbfs && !memfd_create_supported)
1695 			return -ENOTSUP;
1696 	}
1697 
1698 	if (internal_conf->single_file_segments) {
1699 		size_t pgsz = mcfg->memsegs[list_idx].page_sz;
1700 
1701 		/* segment not active? */
1702 		if (fd_list[list_idx].memseg_list_fd < 0)
1703 			return -ENOENT;
1704 		*offset = pgsz * seg_idx;
1705 	} else {
1706 		/* fd_list not initialized? */
1707 		if (fd_list[list_idx].len == 0)
1708 			return -ENODEV;
1709 
1710 		/* segment not active? */
1711 		if (fd_list[list_idx].fds[seg_idx] < 0)
1712 			return -ENOENT;
1713 		*offset = 0;
1714 	}
1715 	return 0;
1716 }
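
/*
 * Illustrative sketch (not compiled): combining the two getters above. In
 * single-file-segments mode one fd covers the whole list and the offset
 * selects the page; in file-per-page mode each page has its own fd and the
 * offset is always 0. Negative return values are -errno style codes.
 */
#if 0
static int
example_get_fd_and_offset(int list_idx, int seg_idx, size_t *offset)
{
	int fd = eal_memalloc_get_seg_fd(list_idx, seg_idx);
	int ret;

	if (fd < 0)
		return fd; /* -ENOTSUP, -ENODEV, ... */
	ret = eal_memalloc_get_seg_fd_offset(list_idx, seg_idx, offset);
	if (ret < 0)
		return ret; /* -ENOENT, -ENODEV, ... */
	return fd;
}
#endif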
1717 
1718 int
1719 eal_memalloc_cleanup(void)
1720 {
1721 	/* close all remaining fd's - these are per-process, so it's safe */
1722 	if (rte_memseg_list_walk_thread_unsafe(fd_list_destroy_walk, NULL))
1723 		return -1;
1724 
1725 	/* destroy the shadow page table if we're a secondary process */
1726 	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
1727 		return 0;
1728 
1729 	if (rte_memseg_list_walk_thread_unsafe(secondary_msl_destroy_walk,
1730 			NULL))
1731 		return -1;
1732 
1733 	return 0;
1734 }
1735 
1736 int
1737 eal_memalloc_init(void)
1738 {
1739 	const struct internal_config *internal_conf =
1740 		eal_get_internal_configuration();
1741 
1742 	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
1743 		/*  memory_hotplug_lock is held during initialization, so it's
1744 		 *  safe to call thread-unsafe version.
1745 		 */
1746 		if (rte_memseg_list_walk_thread_unsafe(secondary_msl_create_walk, NULL) < 0)
1747 			return -1;
1748 	if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
1749 			internal_conf->in_memory) {
1750 		int mfd_res = test_memfd_create();
1751 
1752 		if (mfd_res < 0) {
1753 			EAL_LOG(ERR, "Unable to check if memfd is supported");
1754 			return -1;
1755 		}
1756 		if (mfd_res == 1)
1757 			EAL_LOG(DEBUG, "Using memfd for anonymous memory");
1758 		else
1759 			EAL_LOG(INFO, "Using memfd is not supported, falling back to anonymous hugepages");
1760 
1761 		/* we only support single-file segments mode with in-memory mode
1762 		 * if we support hugetlbfs with memfd_create. this code will
1763 		 * test if we do.
1764 		 */
1765 		if (internal_conf->single_file_segments &&
1766 				mfd_res != 1) {
1767 			EAL_LOG(ERR, "Single-file segments mode cannot be used without memfd support");
1768 			return -1;
1769 		}
1770 		/* this cannot ever happen but better safe than sorry */
1771 		if (!anonymous_hugepages_supported) {
1772 			EAL_LOG(ERR, "Using anonymous memory is not supported");
1773 			return -1;
1774 		}
1775 		/* safety net, should be impossible to configure */
1776 		if (internal_conf->hugepage_file.unlink_before_mapping &&
1777 				!internal_conf->hugepage_file.unlink_existing) {
1778 			EAL_LOG(ERR, "Unlinking existing hugepage files is prohibited, cannot unlink them before mapping.");
1779 			return -1;
1780 		}
1781 	}
1782 
1783 	/* initialize all of the fd lists */
1784 	if (rte_memseg_list_walk_thread_unsafe(fd_list_create_walk, NULL))
1785 		return -1;
1786 	return 0;
1787 }
1788