xref: /dpdk/lib/eal/common/eal_common_memory.c (revision 30a1de105a5f40d77b344a891c4a68f79e815c43)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2014 Intel Corporation
3  */
4 
5 #include <errno.h>
6 #include <stdio.h>
7 #include <stdint.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <inttypes.h>
11 
12 #include <rte_fbarray.h>
13 #include <rte_memory.h>
14 #include <rte_eal.h>
15 #include <rte_eal_memconfig.h>
16 #include <rte_eal_paging.h>
17 #include <rte_errno.h>
18 #include <rte_log.h>
19 #ifndef RTE_EXEC_ENV_WINDOWS
20 #include <rte_telemetry.h>
21 #endif
22 
23 #include "eal_memalloc.h"
24 #include "eal_private.h"
25 #include "eal_internal_cfg.h"
26 #include "eal_memcfg.h"
27 #include "eal_options.h"
28 #include "malloc_heap.h"
29 
30 /*
31  * Try to reserve a virtual area of *size bytes. On success, return a
32  * pointer to the reserved area and keep *size unmodified. Otherwise, if
33  * shrinking is allowed, retry with a smaller area, decreasing *size by
34  * page_sz until it reaches 0, and return NULL in that case. Note: unless
35  * alignment is skipped, the returned address is a multiple of page_sz.
36  */
37 
38 #define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i"
39 
40 static void *next_baseaddr;
41 static uint64_t system_page_sz;
42 
43 #define MAX_MMAP_WITH_DEFINED_ADDR_TRIES 5
44 void *
45 eal_get_virtual_area(void *requested_addr, size_t *size,
46 	size_t page_sz, int flags, int reserve_flags)
47 {
48 	bool addr_is_hint, allow_shrink, unmap, no_align;
49 	uint64_t map_sz;
50 	void *mapped_addr, *aligned_addr;
51 	uint8_t try = 0;
52 	struct internal_config *internal_conf =
53 		eal_get_internal_configuration();
54 
55 	if (system_page_sz == 0)
56 		system_page_sz = rte_mem_page_size();
57 
58 	RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size);
59 
60 	addr_is_hint = (flags & EAL_VIRTUAL_AREA_ADDR_IS_HINT) > 0;
61 	allow_shrink = (flags & EAL_VIRTUAL_AREA_ALLOW_SHRINK) > 0;
62 	unmap = (flags & EAL_VIRTUAL_AREA_UNMAP) > 0;
63 
64 	if (next_baseaddr == NULL && internal_conf->base_virtaddr != 0 &&
65 			rte_eal_process_type() == RTE_PROC_PRIMARY)
66 		next_baseaddr = (void *) internal_conf->base_virtaddr;
67 
68 #ifdef RTE_ARCH_64
69 	if (next_baseaddr == NULL && internal_conf->base_virtaddr == 0 &&
70 			rte_eal_process_type() == RTE_PROC_PRIMARY)
71 		next_baseaddr = (void *) eal_get_baseaddr();
72 #endif
73 	if (requested_addr == NULL && next_baseaddr != NULL) {
74 		requested_addr = next_baseaddr;
75 		requested_addr = RTE_PTR_ALIGN(requested_addr, page_sz);
76 		addr_is_hint = true;
77 	}
78 
79 	/* we don't need alignment of resulting pointer in the following cases:
80 	 *
81  * 1. page size is equal to the system page size
82 	 * 2. we have a requested address, and it is page-aligned, and we will
83 	 *    be discarding the address if we get a different one.
84 	 *
85 	 * for all other cases, alignment is potentially necessary.
86 	 */
87 	no_align = (requested_addr != NULL &&
88 		requested_addr == RTE_PTR_ALIGN(requested_addr, page_sz) &&
89 		!addr_is_hint) ||
90 		page_sz == system_page_sz;
91 
92 	do {
93 		map_sz = no_align ? *size : *size + page_sz;
94 		if (map_sz > SIZE_MAX) {
95 			RTE_LOG(ERR, EAL, "Map size too big\n");
96 			rte_errno = E2BIG;
97 			return NULL;
98 		}
99 
100 		mapped_addr = eal_mem_reserve(
101 			requested_addr, (size_t)map_sz, reserve_flags);
102 		if ((mapped_addr == NULL) && allow_shrink)
103 			*size -= page_sz;
104 
105 		if ((mapped_addr != NULL) && addr_is_hint &&
106 				(mapped_addr != requested_addr)) {
107 			try++;
108 			next_baseaddr = RTE_PTR_ADD(next_baseaddr, page_sz);
109 			if (try <= MAX_MMAP_WITH_DEFINED_ADDR_TRIES) {
110 				/* hint was not used. Try with another offset */
111 				eal_mem_free(mapped_addr, map_sz);
112 				mapped_addr = NULL;
113 				requested_addr = next_baseaddr;
114 			}
115 		}
116 	} while ((allow_shrink || addr_is_hint) &&
117 		(mapped_addr == NULL) && (*size > 0));
118 
119 	/* align resulting address - if map failed, we will ignore the value
120 	 * anyway, so no need to add additional checks.
121 	 */
122 	aligned_addr = no_align ? mapped_addr :
123 			RTE_PTR_ALIGN(mapped_addr, page_sz);
124 
125 	if (*size == 0) {
126 		RTE_LOG(ERR, EAL, "Cannot get a virtual area of any size: %s\n",
127 			rte_strerror(rte_errno));
128 		return NULL;
129 	} else if (mapped_addr == NULL) {
130 		RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n",
131 			rte_strerror(rte_errno));
132 		return NULL;
133 	} else if (requested_addr != NULL && !addr_is_hint &&
134 			aligned_addr != requested_addr) {
135 		RTE_LOG(ERR, EAL, "Cannot get a virtual area at requested address: %p (got %p)\n",
136 			requested_addr, aligned_addr);
137 		eal_mem_free(mapped_addr, map_sz);
138 		rte_errno = EADDRNOTAVAIL;
139 		return NULL;
140 	} else if (requested_addr != NULL && addr_is_hint &&
141 			aligned_addr != requested_addr) {
142 		/*
143 		 * demote this warning to debug if we did not explicitly request
144 		 * a base virtual address.
145 		 */
146 		if (internal_conf->base_virtaddr != 0) {
147 			RTE_LOG(WARNING, EAL, "WARNING! Base virtual address hint (%p != %p) not respected!\n",
148 				requested_addr, aligned_addr);
149 			RTE_LOG(WARNING, EAL, "   This may cause issues with mapping memory into secondary processes\n");
150 		} else {
151 			RTE_LOG(DEBUG, EAL, "WARNING! Base virtual address hint (%p != %p) not respected!\n",
152 				requested_addr, aligned_addr);
153 			RTE_LOG(DEBUG, EAL, "   This may cause issues with mapping memory into secondary processes\n");
154 		}
155 	} else if (next_baseaddr != NULL) {
156 		next_baseaddr = RTE_PTR_ADD(aligned_addr, *size);
157 	}
158 
159 	RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n",
160 		aligned_addr, *size);
161 
162 	if (unmap) {
163 		eal_mem_free(mapped_addr, map_sz);
164 	} else if (!no_align) {
165 		void *map_end, *aligned_end;
166 		size_t before_len, after_len;
167 
168 		/* when we reserve space with alignment, we add alignment to
169 		 * mapping size. On 32-bit, if 1GB alignment was requested, this
170 		 * would waste 1GB of address space, which is a luxury we cannot
171 		 * afford. So, if alignment was performed, check whether any unneeded
172 		 * address space can be unmapped.
173 		 */
174 
175 		map_end = RTE_PTR_ADD(mapped_addr, (size_t)map_sz);
176 		aligned_end = RTE_PTR_ADD(aligned_addr, *size);
177 
178 		/* unmap space before aligned mmap address */
179 		before_len = RTE_PTR_DIFF(aligned_addr, mapped_addr);
180 		if (before_len > 0)
181 			eal_mem_free(mapped_addr, before_len);
182 
183 		/* unmap space after aligned end mmap address */
184 		after_len = RTE_PTR_DIFF(map_end, aligned_end);
185 		if (after_len > 0)
186 			eal_mem_free(aligned_end, after_len);
187 	}
188 
189 	if (!unmap) {
190 		/* Exclude these pages from a core dump. */
191 		eal_mem_set_dump(aligned_addr, *size, false);
192 	}
193 
194 	return aligned_addr;
195 }
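
/*
 * Usage sketch for the helper above (illustrative only; the flag and
 * page-size constants come from eal_private.h and rte_memory.h):
 *
 *	size_t sz = RTE_PGSIZE_1G;
 *	void *va = eal_get_virtual_area(NULL, &sz, RTE_PGSIZE_2M,
 *			EAL_VIRTUAL_AREA_ALLOW_SHRINK, 0);
 *	if (va != NULL)
 *		RTE_LOG(DEBUG, EAL, "reserved %zu bytes at %p\n", sz, va);
 *
 * When alignment is needed, the helper reserves sz + page_sz bytes, picks a
 * page_sz-aligned start inside that window, and unmaps the unused head and
 * tail, so the caller keeps only the aligned sz-byte area.
 */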
196 
197 int
198 eal_memseg_list_init_named(struct rte_memseg_list *msl, const char *name,
199 		uint64_t page_sz, int n_segs, int socket_id, bool heap)
200 {
201 	if (rte_fbarray_init(&msl->memseg_arr, name, n_segs,
202 			sizeof(struct rte_memseg))) {
203 		RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n",
204 			rte_strerror(rte_errno));
205 		return -1;
206 	}
207 
208 	msl->page_sz = page_sz;
209 	msl->socket_id = socket_id;
210 	msl->base_va = NULL;
211 	msl->heap = heap;
212 
213 	RTE_LOG(DEBUG, EAL,
214 		"Memseg list allocated at socket %i, page size 0x%"PRIx64"kB\n",
215 		socket_id, page_sz >> 10);
216 
217 	return 0;
218 }
219 
220 int
221 eal_memseg_list_init(struct rte_memseg_list *msl, uint64_t page_sz,
222 		int n_segs, int socket_id, int type_msl_idx, bool heap)
223 {
224 	char name[RTE_FBARRAY_NAME_LEN];
225 
226 	snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id,
227 		 type_msl_idx);
228 
229 	return eal_memseg_list_init_named(
230 		msl, name, page_sz, n_segs, socket_id, heap);
231 }
232 
233 int
234 eal_memseg_list_alloc(struct rte_memseg_list *msl, int reserve_flags)
235 {
236 	size_t page_sz, mem_sz;
237 	void *addr;
238 
239 	page_sz = msl->page_sz;
240 	mem_sz = page_sz * msl->memseg_arr.len;
241 
242 	addr = eal_get_virtual_area(
243 		msl->base_va, &mem_sz, page_sz, 0, reserve_flags);
244 	if (addr == NULL) {
245 #ifndef RTE_EXEC_ENV_WINDOWS
246 		/* The hint would be misleading on Windows, because address
247 		 * is by default system-selected (base VA = 0).
248 		 * However, this function is called from many places,
249 		 * including common code, so don't duplicate the message.
250 		 */
251 		if (rte_errno == EADDRNOTAVAIL)
252 			RTE_LOG(ERR, EAL, "Cannot reserve %llu bytes at [%p] - "
253 				"please use '--" OPT_BASE_VIRTADDR "' option\n",
254 				(unsigned long long)mem_sz, msl->base_va);
255 #endif
256 		return -1;
257 	}
258 	msl->base_va = addr;
259 	msl->len = mem_sz;
260 
261 	RTE_LOG(DEBUG, EAL, "VA reserved for memseg list at %p, size %zx\n",
262 			addr, mem_sz);
263 
264 	return 0;
265 }
266 
267 void
268 eal_memseg_list_populate(struct rte_memseg_list *msl, void *addr, int n_segs)
269 {
270 	size_t page_sz = msl->page_sz;
271 	int i;
272 
273 	for (i = 0; i < n_segs; i++) {
274 		struct rte_fbarray *arr = &msl->memseg_arr;
275 		struct rte_memseg *ms = rte_fbarray_get(arr, i);
276 
277 		if (rte_eal_iova_mode() == RTE_IOVA_VA)
278 			ms->iova = (uintptr_t)addr;
279 		else
280 			ms->iova = RTE_BAD_IOVA;
281 		ms->addr = addr;
282 		ms->hugepage_sz = page_sz;
283 		ms->socket_id = 0;
284 		ms->len = page_sz;
285 
286 		rte_fbarray_set_used(arr, i);
287 
288 		addr = RTE_PTR_ADD(addr, page_sz);
289 	}
290 }
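
/*
 * Note on eal_memseg_list_populate(): in IOVA-as-VA mode each page's IO
 * address is simply its virtual address, so ms->iova is set to the VA;
 * otherwise the IOVA is not known here and is left as RTE_BAD_IOVA, and
 * callers may later fill in real physical addresses where available.
 */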
291 
292 static struct rte_memseg *
293 virt2memseg(const void *addr, const struct rte_memseg_list *msl)
294 {
295 	const struct rte_fbarray *arr;
296 	void *start, *end;
297 	int ms_idx;
298 
299 	if (msl == NULL)
300 		return NULL;
301 
302 	/* a memseg list was specified, check if it's the right one */
303 	start = msl->base_va;
304 	end = RTE_PTR_ADD(start, msl->len);
305 
306 	if (addr < start || addr >= end)
307 		return NULL;
308 
309 	/* now, calculate index */
310 	arr = &msl->memseg_arr;
311 	ms_idx = RTE_PTR_DIFF(addr, msl->base_va) / msl->page_sz;
312 	return rte_fbarray_get(arr, ms_idx);
313 }
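
/*
 * The index math above relies on each memseg list covering one VA-contiguous
 * window of equally sized pages: for example, with 2 MB pages an address
 * 10 MB past base_va lands in segment index 5.
 */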
314 
315 static struct rte_memseg_list *
316 virt2memseg_list(const void *addr)
317 {
318 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
319 	struct rte_memseg_list *msl;
320 	int msl_idx;
321 
322 	for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
323 		void *start, *end;
324 		msl = &mcfg->memsegs[msl_idx];
325 
326 		start = msl->base_va;
327 		end = RTE_PTR_ADD(start, msl->len);
328 		if (addr >= start && addr < end)
329 			break;
330 	}
331 	/* if we didn't find our memseg list */
332 	if (msl_idx == RTE_MAX_MEMSEG_LISTS)
333 		return NULL;
334 	return msl;
335 }
336 
337 struct rte_memseg_list *
338 rte_mem_virt2memseg_list(const void *addr)
339 {
340 	return virt2memseg_list(addr);
341 }
342 
343 struct virtiova {
344 	rte_iova_t iova;
345 	void *virt;
346 };
347 static int
348 find_virt(const struct rte_memseg_list *msl __rte_unused,
349 		const struct rte_memseg *ms, void *arg)
350 {
351 	struct virtiova *vi = arg;
352 	if (vi->iova >= ms->iova && vi->iova < (ms->iova + ms->len)) {
353 		size_t offset = vi->iova - ms->iova;
354 		vi->virt = RTE_PTR_ADD(ms->addr, offset);
355 		/* stop the walk */
356 		return 1;
357 	}
358 	return 0;
359 }
360 static int
361 find_virt_legacy(const struct rte_memseg_list *msl __rte_unused,
362 		const struct rte_memseg *ms, size_t len, void *arg)
363 {
364 	struct virtiova *vi = arg;
365 	if (vi->iova >= ms->iova && vi->iova < (ms->iova + len)) {
366 		size_t offset = vi->iova - ms->iova;
367 		vi->virt = RTE_PTR_ADD(ms->addr, offset);
368 		/* stop the walk */
369 		return 1;
370 	}
371 	return 0;
372 }
373 
374 void *
375 rte_mem_iova2virt(rte_iova_t iova)
376 {
377 	struct virtiova vi;
378 	const struct internal_config *internal_conf =
379 		eal_get_internal_configuration();
380 
381 	memset(&vi, 0, sizeof(vi));
382 
383 	vi.iova = iova;
384 	/* for legacy mem, we can get away with scanning VA-contiguous segments,
385 	 * as we know they are PA-contiguous as well
386 	 */
387 	if (internal_conf->legacy_mem)
388 		rte_memseg_contig_walk(find_virt_legacy, &vi);
389 	else
390 		rte_memseg_walk(find_virt, &vi);
391 
392 	return vi.virt;
393 }
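
/*
 * Minimal example of the reverse lookup above (assuming 'buf' lies in
 * DPDK-managed memory, so rte_mem_virt2iova() can produce its IOVA):
 *
 *	rte_iova_t io = rte_mem_virt2iova(buf);
 *	void *va = rte_mem_iova2virt(io);
 *	// va == buf for addresses inside registered memsegs
 */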
394 
395 struct rte_memseg *
396 rte_mem_virt2memseg(const void *addr, const struct rte_memseg_list *msl)
397 {
398 	return virt2memseg(addr, msl != NULL ? msl :
399 			rte_mem_virt2memseg_list(addr));
400 }
401 
402 static int
403 physmem_size(const struct rte_memseg_list *msl, void *arg)
404 {
405 	uint64_t *total_len = arg;
406 
407 	if (msl->external)
408 		return 0;
409 
410 	*total_len += msl->memseg_arr.count * msl->page_sz;
411 
412 	return 0;
413 }
414 
415 /* get the total size of memory */
416 uint64_t
417 rte_eal_get_physmem_size(void)
418 {
419 	uint64_t total_len = 0;
420 
421 	rte_memseg_list_walk(physmem_size, &total_len);
422 
423 	return total_len;
424 }
425 
426 static int
427 dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
428 		void *arg)
429 {
430 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
431 	int msl_idx, ms_idx, fd;
432 	FILE *f = arg;
433 
434 	msl_idx = msl - mcfg->memsegs;
435 	if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS)
436 		return -1;
437 
438 	ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
439 	if (ms_idx < 0)
440 		return -1;
441 
442 	fd = eal_memalloc_get_seg_fd(msl_idx, ms_idx);
443 	fprintf(f, "Segment %i-%i: IOVA:0x%"PRIx64", len:%zu, "
444 			"virt:%p, socket_id:%"PRId32", "
445 			"hugepage_sz:%"PRIu64", nchannel:%"PRIx32", "
446 			"nrank:%"PRIx32" fd:%i\n",
447 			msl_idx, ms_idx,
448 			ms->iova,
449 			ms->len,
450 			ms->addr,
451 			ms->socket_id,
452 			ms->hugepage_sz,
453 			ms->nchannel,
454 			ms->nrank,
455 			fd);
456 
457 	return 0;
458 }
459 
460 /*
461  * Defined here because they are declared in rte_memory.h, but the actual
462  * implementation is in eal_common_memalloc.c, like all other memalloc internals.
463  */
464 int
465 rte_mem_event_callback_register(const char *name, rte_mem_event_callback_t clb,
466 		void *arg)
467 {
468 	const struct internal_config *internal_conf =
469 		eal_get_internal_configuration();
470 
471 	/* FreeBSD boots with legacy mem enabled by default */
472 	if (internal_conf->legacy_mem) {
473 		RTE_LOG(DEBUG, EAL, "Registering mem event callbacks not supported\n");
474 		rte_errno = ENOTSUP;
475 		return -1;
476 	}
477 	return eal_memalloc_mem_event_callback_register(name, clb, arg);
478 }
479 
480 int
481 rte_mem_event_callback_unregister(const char *name, void *arg)
482 {
483 	const struct internal_config *internal_conf =
484 		eal_get_internal_configuration();
485 
486 	/* FreeBSD boots with legacy mem enabled by default */
487 	if (internal_conf->legacy_mem) {
488 		RTE_LOG(DEBUG, EAL, "Unregistering mem event callbacks not supported\n");
489 		rte_errno = ENOTSUP;
490 		return -1;
491 	}
492 	return eal_memalloc_mem_event_callback_unregister(name, arg);
493 }
494 
495 int
496 rte_mem_alloc_validator_register(const char *name,
497 		rte_mem_alloc_validator_t clb, int socket_id, size_t limit)
498 {
499 	const struct internal_config *internal_conf =
500 		eal_get_internal_configuration();
501 
502 	/* FreeBSD boots with legacy mem enabled by default */
503 	if (internal_conf->legacy_mem) {
504 		RTE_LOG(DEBUG, EAL, "Registering mem alloc validators not supported\n");
505 		rte_errno = ENOTSUP;
506 		return -1;
507 	}
508 	return eal_memalloc_mem_alloc_validator_register(name, clb, socket_id,
509 			limit);
510 }
511 
512 int
513 rte_mem_alloc_validator_unregister(const char *name, int socket_id)
514 {
515 	const struct internal_config *internal_conf =
516 		eal_get_internal_configuration();
517 
518 	/* FreeBSD boots with legacy mem enabled by default */
519 	if (internal_conf->legacy_mem) {
520 		RTE_LOG(DEBUG, EAL, "Unregistering mem alloc validators not supported\n");
521 		rte_errno = ENOTSUP;
522 		return -1;
523 	}
524 	return eal_memalloc_mem_alloc_validator_unregister(name, socket_id);
525 }
526 
527 /* Dump the physical memory layout on console */
528 void
529 rte_dump_physmem_layout(FILE *f)
530 {
531 	rte_memseg_walk(dump_memseg, f);
532 }
533 
534 static int
535 check_iova(const struct rte_memseg_list *msl __rte_unused,
536 		const struct rte_memseg *ms, void *arg)
537 {
538 	uint64_t *mask = arg;
539 	rte_iova_t iova;
540 
541 	/* highest address within the segment */
542 	iova = (ms->iova + ms->len) - 1;
543 	if (!(iova & *mask))
544 		return 0;
545 
546 	RTE_LOG(DEBUG, EAL, "memseg iova %"PRIx64", len %zx, out of range\n",
547 			    ms->iova, ms->len);
548 
549 	RTE_LOG(DEBUG, EAL, "\tusing dma mask %"PRIx64"\n", *mask);
550 	return 1;
551 }
552 
553 #define MAX_DMA_MASK_BITS 63
554 
555 /* check memseg iovas are within the required range based on dma mask */
556 static int
557 check_dma_mask(uint8_t maskbits, bool thread_unsafe)
558 {
559 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
560 	uint64_t mask;
561 	int ret;
562 
563 	/* Sanity check: we only support mask widths that fit in a 64-bit
564 	 * variable; any larger value is almost certainly wrong. */
565 	if (maskbits > MAX_DMA_MASK_BITS) {
566 		RTE_LOG(ERR, EAL, "wrong dma mask size %u (Max: %u)\n",
567 				   maskbits, MAX_DMA_MASK_BITS);
568 		return -1;
569 	}
570 
571 	/* create dma mask */
572 	mask = ~((1ULL << maskbits) - 1);
573 
574 	if (thread_unsafe)
575 		ret = rte_memseg_walk_thread_unsafe(check_iova, &mask);
576 	else
577 		ret = rte_memseg_walk(check_iova, &mask);
578 
579 	if (ret)
580 		/*
581 		 * The DMA mask precludes hugepage usage.
582 		 * This device cannot be used and we do not need to keep
583 		 * the DMA mask.
584 		 */
585 		return 1;
586 
587 	/*
588 	 * Keep the more restrictive maskbits value so that future dynamic
589 	 * memory allocations can be checked against it.
590 	 */
591 	mcfg->dma_maskbits = mcfg->dma_maskbits == 0 ? maskbits :
592 			     RTE_MIN(mcfg->dma_maskbits, maskbits);
593 
594 	return 0;
595 }
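
/*
 * Worked example for the mask above: with maskbits == 48,
 * mask = ~((1ULL << 48) - 1) = 0xFFFF000000000000. A segment whose last IOVA
 * has any of those upper 16 bits set is unreachable by a 48-bit device, so
 * check_iova() flags it and the walk returns non-zero.
 */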
596 
597 int
598 rte_mem_check_dma_mask(uint8_t maskbits)
599 {
600 	return check_dma_mask(maskbits, false);
601 }
602 
603 int
604 rte_mem_check_dma_mask_thread_unsafe(uint8_t maskbits)
605 {
606 	return check_dma_mask(maskbits, true);
607 }
608 
609 /*
610  * Set dma mask to use when memory initialization is done.
611  *
612  * This function should ONLY be used by code executed before the memory
613  * initialization. PMDs should use rte_mem_check_dma_mask if the device has
614  * addressing limitations.
615  */
616 void
617 rte_mem_set_dma_mask(uint8_t maskbits)
618 {
619 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
620 
621 	mcfg->dma_maskbits = mcfg->dma_maskbits == 0 ? maskbits :
622 			     RTE_MIN(mcfg->dma_maskbits, maskbits);
623 }
624 
625 /* return the number of memory channels */
626 unsigned rte_memory_get_nchannel(void)
627 {
628 	return rte_eal_get_configuration()->mem_config->nchannel;
629 }
630 
631 /* return the number of memory ranks */
632 unsigned rte_memory_get_nrank(void)
633 {
634 	return rte_eal_get_configuration()->mem_config->nrank;
635 }
636 
637 static int
638 rte_eal_memdevice_init(void)
639 {
640 	struct rte_config *config;
641 	const struct internal_config *internal_conf;
642 
643 	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
644 		return 0;
645 
646 	internal_conf = eal_get_internal_configuration();
647 	config = rte_eal_get_configuration();
648 	config->mem_config->nchannel = internal_conf->force_nchannel;
649 	config->mem_config->nrank = internal_conf->force_nrank;
650 
651 	return 0;
652 }
653 
654 /* Lock the page in physical memory and prevent it from being swapped. */
655 int
656 rte_mem_lock_page(const void *virt)
657 {
658 	uintptr_t virtual = (uintptr_t)virt;
659 	size_t page_size = rte_mem_page_size();
660 	uintptr_t aligned = RTE_PTR_ALIGN_FLOOR(virtual, page_size);
661 	return rte_mem_lock((void *)aligned, page_size);
662 }
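
/*
 * Note: only the single system page containing 'virt' is locked; callers
 * needing a larger buffer pinned must lock every page, or call
 * rte_mem_lock() on the whole page-aligned range themselves.
 */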
663 
664 int
665 rte_memseg_contig_walk_thread_unsafe(rte_memseg_contig_walk_t func, void *arg)
666 {
667 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
668 	int i, ms_idx, ret = 0;
669 
670 	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
671 		struct rte_memseg_list *msl = &mcfg->memsegs[i];
672 		const struct rte_memseg *ms;
673 		struct rte_fbarray *arr;
674 
675 		if (msl->memseg_arr.count == 0)
676 			continue;
677 
678 		arr = &msl->memseg_arr;
679 
680 		ms_idx = rte_fbarray_find_next_used(arr, 0);
681 		while (ms_idx >= 0) {
682 			int n_segs;
683 			size_t len;
684 
685 			ms = rte_fbarray_get(arr, ms_idx);
686 
687 			/* find how many more segments there are, starting with
688 			 * this one.
689 			 */
690 			n_segs = rte_fbarray_find_contig_used(arr, ms_idx);
691 			len = n_segs * msl->page_sz;
692 
693 			ret = func(msl, ms, len, arg);
694 			if (ret)
695 				return ret;
696 			ms_idx = rte_fbarray_find_next_used(arr,
697 					ms_idx + n_segs);
698 		}
699 	}
700 	return 0;
701 }
702 
703 int
704 rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg)
705 {
706 	int ret = 0;
707 
708 	/* do not allow allocations/frees/init while we iterate */
709 	rte_mcfg_mem_read_lock();
710 	ret = rte_memseg_contig_walk_thread_unsafe(func, arg);
711 	rte_mcfg_mem_read_unlock();
712 
713 	return ret;
714 }
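
/*
 * Usage sketch for the contiguous walk (illustrative): a callback that sums
 * the length of every VA-contiguous chunk via the opaque argument.
 *
 *	static int
 *	sum_contig(const struct rte_memseg_list *msl __rte_unused,
 *			const struct rte_memseg *ms __rte_unused,
 *			size_t len, void *arg)
 *	{
 *		*(size_t *)arg += len;
 *		return 0;	// non-zero would stop the walk
 *	}
 *
 *	size_t total = 0;
 *	rte_memseg_contig_walk(sum_contig, &total);
 */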
715 
716 int
717 rte_memseg_walk_thread_unsafe(rte_memseg_walk_t func, void *arg)
718 {
719 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
720 	int i, ms_idx, ret = 0;
721 
722 	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
723 		struct rte_memseg_list *msl = &mcfg->memsegs[i];
724 		const struct rte_memseg *ms;
725 		struct rte_fbarray *arr;
726 
727 		if (msl->memseg_arr.count == 0)
728 			continue;
729 
730 		arr = &msl->memseg_arr;
731 
732 		ms_idx = rte_fbarray_find_next_used(arr, 0);
733 		while (ms_idx >= 0) {
734 			ms = rte_fbarray_get(arr, ms_idx);
735 			ret = func(msl, ms, arg);
736 			if (ret)
737 				return ret;
738 			ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1);
739 		}
740 	}
741 	return 0;
742 }
743 
744 int
745 rte_memseg_walk(rte_memseg_walk_t func, void *arg)
746 {
747 	int ret = 0;
748 
749 	/* do not allow allocations/frees/init while we iterate */
750 	rte_mcfg_mem_read_lock();
751 	ret = rte_memseg_walk_thread_unsafe(func, arg);
752 	rte_mcfg_mem_read_unlock();
753 
754 	return ret;
755 }
756 
757 int
758 rte_memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg)
759 {
760 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
761 	int i, ret = 0;
762 
763 	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
764 		struct rte_memseg_list *msl = &mcfg->memsegs[i];
765 
766 		if (msl->base_va == NULL)
767 			continue;
768 
769 		ret = func(msl, arg);
770 		if (ret)
771 			return ret;
772 	}
773 	return 0;
774 }
775 
776 int
777 rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg)
778 {
779 	int ret = 0;
780 
781 	/* do not allow allocations/frees/init while we iterate */
782 	rte_mcfg_mem_read_lock();
783 	ret = rte_memseg_list_walk_thread_unsafe(func, arg);
784 	rte_mcfg_mem_read_unlock();
785 
786 	return ret;
787 }
788 
789 int
790 rte_memseg_get_fd_thread_unsafe(const struct rte_memseg *ms)
791 {
792 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
793 	struct rte_memseg_list *msl;
794 	struct rte_fbarray *arr;
795 	int msl_idx, seg_idx, ret;
796 
797 	if (ms == NULL) {
798 		rte_errno = EINVAL;
799 		return -1;
800 	}
801 
802 	msl = rte_mem_virt2memseg_list(ms->addr);
803 	if (msl == NULL) {
804 		rte_errno = EINVAL;
805 		return -1;
806 	}
807 	arr = &msl->memseg_arr;
808 
809 	msl_idx = msl - mcfg->memsegs;
810 	seg_idx = rte_fbarray_find_idx(arr, ms);
811 
812 	if (!rte_fbarray_is_used(arr, seg_idx)) {
813 		rte_errno = ENOENT;
814 		return -1;
815 	}
816 
817 	/* segment fd API is not supported for external segments */
818 	if (msl->external) {
819 		rte_errno = ENOTSUP;
820 		return -1;
821 	}
822 
823 	ret = eal_memalloc_get_seg_fd(msl_idx, seg_idx);
824 	if (ret < 0) {
825 		rte_errno = -ret;
826 		ret = -1;
827 	}
828 	return ret;
829 }
830 
831 int
832 rte_memseg_get_fd(const struct rte_memseg *ms)
833 {
834 	int ret;
835 
836 	rte_mcfg_mem_read_lock();
837 	ret = rte_memseg_get_fd_thread_unsafe(ms);
838 	rte_mcfg_mem_read_unlock();
839 
840 	return ret;
841 }
842 
843 int
844 rte_memseg_get_fd_offset_thread_unsafe(const struct rte_memseg *ms,
845 		size_t *offset)
846 {
847 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
848 	struct rte_memseg_list *msl;
849 	struct rte_fbarray *arr;
850 	int msl_idx, seg_idx, ret;
851 
852 	if (ms == NULL || offset == NULL) {
853 		rte_errno = EINVAL;
854 		return -1;
855 	}
856 
857 	msl = rte_mem_virt2memseg_list(ms->addr);
858 	if (msl == NULL) {
859 		rte_errno = EINVAL;
860 		return -1;
861 	}
862 	arr = &msl->memseg_arr;
863 
864 	msl_idx = msl - mcfg->memsegs;
865 	seg_idx = rte_fbarray_find_idx(arr, ms);
866 
867 	if (!rte_fbarray_is_used(arr, seg_idx)) {
868 		rte_errno = ENOENT;
869 		return -1;
870 	}
871 
872 	/* segment fd API is not supported for external segments */
873 	if (msl->external) {
874 		rte_errno = ENOTSUP;
875 		return -1;
876 	}
877 
878 	ret = eal_memalloc_get_seg_fd_offset(msl_idx, seg_idx, offset);
879 	if (ret < 0) {
880 		rte_errno = -ret;
881 		ret = -1;
882 	}
883 	return ret;
884 }
885 
886 int
887 rte_memseg_get_fd_offset(const struct rte_memseg *ms, size_t *offset)
888 {
889 	int ret;
890 
891 	rte_mcfg_mem_read_lock();
892 	ret = rte_memseg_get_fd_offset_thread_unsafe(ms, offset);
893 	rte_mcfg_mem_read_unlock();
894 
895 	return ret;
896 }
897 
898 int
899 rte_extmem_register(void *va_addr, size_t len, rte_iova_t iova_addrs[],
900 		unsigned int n_pages, size_t page_sz)
901 {
902 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
903 	unsigned int socket_id, n;
904 	int ret = 0;
905 
906 	if (va_addr == NULL || page_sz == 0 || len == 0 ||
907 			!rte_is_power_of_2(page_sz) ||
908 			RTE_ALIGN(len, page_sz) != len ||
909 			((len / page_sz) != n_pages && iova_addrs != NULL) ||
910 			!rte_is_aligned(va_addr, page_sz)) {
911 		rte_errno = EINVAL;
912 		return -1;
913 	}
914 	rte_mcfg_mem_write_lock();
915 
916 	/* make sure the segment doesn't already exist */
917 	if (malloc_heap_find_external_seg(va_addr, len) != NULL) {
918 		rte_errno = EEXIST;
919 		ret = -1;
920 		goto unlock;
921 	}
922 
923 	/* get next available socket ID */
924 	socket_id = mcfg->next_socket_id;
925 	if (socket_id > INT32_MAX) {
926 		RTE_LOG(ERR, EAL, "Cannot assign new socket IDs\n");
927 		rte_errno = ENOSPC;
928 		ret = -1;
929 		goto unlock;
930 	}
931 
932 	/* we can create a new memseg */
933 	n = len / page_sz;
934 	if (malloc_heap_create_external_seg(va_addr, iova_addrs, n,
935 			page_sz, "extmem", socket_id) == NULL) {
936 		ret = -1;
937 		goto unlock;
938 	}
939 
940 	/* memseg list successfully created - increment next socket ID */
941 	mcfg->next_socket_id++;
942 unlock:
943 	rte_mcfg_mem_write_unlock();
944 	return ret;
945 }
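
/*
 * Registration sketch (illustrative; 'buf' is a hypothetical page-aligned,
 * externally allocated area and 'iovas' its per-page IO addresses, or NULL
 * if they are unknown):
 *
 *	if (rte_extmem_register(buf, len, iovas, len / page_sz, page_sz) < 0)
 *		RTE_LOG(ERR, EAL, "extmem: %s\n", rte_strerror(rte_errno));
 *
 * After registration the area is visible to DPDK (rte_mem_virt2memseg()
 * resolves addresses inside it) and can typically be DMA-mapped with
 * rte_dev_dma_map(); it is not managed by the rte_malloc allocator.
 */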
946 
947 int
948 rte_extmem_unregister(void *va_addr, size_t len)
949 {
950 	struct rte_memseg_list *msl;
951 	int ret = 0;
952 
953 	if (va_addr == NULL || len == 0) {
954 		rte_errno = EINVAL;
955 		return -1;
956 	}
957 	rte_mcfg_mem_write_lock();
958 
959 	/* find our segment */
960 	msl = malloc_heap_find_external_seg(va_addr, len);
961 	if (msl == NULL) {
962 		rte_errno = ENOENT;
963 		ret = -1;
964 		goto unlock;
965 	}
966 
967 	ret = malloc_heap_destroy_external_seg(msl);
968 unlock:
969 	rte_mcfg_mem_write_unlock();
970 	return ret;
971 }
972 
973 static int
974 sync_memory(void *va_addr, size_t len, bool attach)
975 {
976 	struct rte_memseg_list *msl;
977 	int ret = 0;
978 
979 	if (va_addr == NULL || len == 0) {
980 		rte_errno = EINVAL;
981 		return -1;
982 	}
983 	rte_mcfg_mem_write_lock();
984 
985 	/* find our segment */
986 	msl = malloc_heap_find_external_seg(va_addr, len);
987 	if (msl == NULL) {
988 		rte_errno = ENOENT;
989 		ret = -1;
990 		goto unlock;
991 	}
992 	if (attach)
993 		ret = rte_fbarray_attach(&msl->memseg_arr);
994 	else
995 		ret = rte_fbarray_detach(&msl->memseg_arr);
996 
997 unlock:
998 	rte_mcfg_mem_write_unlock();
999 	return ret;
1000 }
1001 
1002 int
1003 rte_extmem_attach(void *va_addr, size_t len)
1004 {
1005 	return sync_memory(va_addr, len, true);
1006 }
1007 
1008 int
1009 rte_extmem_detach(void *va_addr, size_t len)
1010 {
1011 	return sync_memory(va_addr, len, false);
1012 }
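
/*
 * The attach/detach pair above is intended for secondary processes: after
 * the primary registers an external area, a secondary calls
 * rte_extmem_attach() to map the shared segment metadata, and
 * rte_extmem_detach() when done. Only the metadata is synchronized; the
 * memory itself is expected to already be mapped at the same virtual
 * address in both processes.
 */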
1013 
1014 /* detach all EAL memory */
1015 int
1016 rte_eal_memory_detach(void)
1017 {
1018 	const struct internal_config *internal_conf =
1019 		eal_get_internal_configuration();
1020 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1021 	size_t page_sz = rte_mem_page_size();
1022 	unsigned int i;
1023 
1024 	if (internal_conf->in_memory == 1)
1025 		return 0;
1026 
1027 	rte_rwlock_write_lock(&mcfg->memory_hotplug_lock);
1028 
1029 	/* detach internal memory subsystem data first */
1030 	if (eal_memalloc_cleanup())
1031 		RTE_LOG(ERR, EAL, "Could not release memory subsystem data\n");
1032 
1033 	for (i = 0; i < RTE_DIM(mcfg->memsegs); i++) {
1034 		struct rte_memseg_list *msl = &mcfg->memsegs[i];
1035 
1036 		/* skip uninitialized segments */
1037 		if (msl->base_va == NULL)
1038 			continue;
1039 		/*
1040 		 * external segments are supposed to be detached at this point,
1041 		 * but if they aren't, we can't really do anything about it,
1042 		 * because if we skip them here, they'll become invalid after
1043 		 * we unmap the memconfig anyway. However, if this is externally
1044 		 * referenced memory, we have no business unmapping it.
1045 		 */
1046 		if (!msl->external)
1047 			if (rte_mem_unmap(msl->base_va, msl->len) != 0)
1048 				RTE_LOG(ERR, EAL, "Could not unmap memory: %s\n",
1049 						rte_strerror(rte_errno));
1050 
1051 		/*
1052 		 * we are detaching the fbarray rather than destroying because
1053 		 * other processes might still reference this fbarray, and we
1054 		 * have no way of knowing if they still do.
1055 		 */
1056 		if (rte_fbarray_detach(&msl->memseg_arr))
1057 			RTE_LOG(ERR, EAL, "Could not detach fbarray: %s\n",
1058 					rte_strerror(rte_errno));
1059 	}
1060 	rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock);
1061 
1062 	/*
1063 	 * we've detached the memseg lists, so we can unmap the shared mem
1064 	 * config - we can't zero it out because it might still be referenced
1065 	 * by other processes.
1066 	 */
1067 	if (internal_conf->no_shconf == 0 && mcfg->mem_cfg_addr != 0) {
1068 		if (rte_mem_unmap(mcfg, RTE_ALIGN(sizeof(*mcfg), page_sz)) != 0)
1069 			RTE_LOG(ERR, EAL, "Could not unmap shared memory config: %s\n",
1070 					rte_strerror(rte_errno));
1071 	}
1072 	rte_eal_get_configuration()->mem_config = NULL;
1073 
1074 	return 0;
1075 }
1076 
1077 /* init memory subsystem */
1078 int
1079 rte_eal_memory_init(void)
1080 {
1081 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1082 	const struct internal_config *internal_conf =
1083 		eal_get_internal_configuration();
1084 
1085 	int retval;
1086 	RTE_LOG(DEBUG, EAL, "Setting up physically contiguous memory...\n");
1087 
1088 	if (!mcfg)
1089 		return -1;
1090 
1091 	/* lock mem hotplug here, to prevent races while we init */
1092 	rte_mcfg_mem_read_lock();
1093 
1094 	if (rte_eal_memseg_init() < 0)
1095 		goto fail;
1096 
1097 	if (eal_memalloc_init() < 0)
1098 		goto fail;
1099 
1100 	retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
1101 			rte_eal_hugepage_init() :
1102 			rte_eal_hugepage_attach();
1103 	if (retval < 0)
1104 		goto fail;
1105 
1106 	if (internal_conf->no_shconf == 0 && rte_eal_memdevice_init() < 0)
1107 		goto fail;
1108 
1109 	return 0;
1110 fail:
1111 	rte_mcfg_mem_read_unlock();
1112 	return -1;
1113 }
1114 
1115 #ifndef RTE_EXEC_ENV_WINDOWS
1116 #define EAL_MEMZONE_LIST_REQ	"/eal/memzone_list"
1117 #define EAL_MEMZONE_INFO_REQ	"/eal/memzone_info"
1118 #define EAL_HEAP_LIST_REQ	"/eal/heap_list"
1119 #define EAL_HEAP_INFO_REQ	"/eal/heap_info"
1120 #define ADDR_STR		15
1121 
1122 /* Telemetry callback handler to return heap stats for requested heap id. */
1123 static int
1124 handle_eal_heap_info_request(const char *cmd __rte_unused, const char *params,
1125 			     struct rte_tel_data *d)
1126 {
1127 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1128 	struct rte_malloc_socket_stats sock_stats;
1129 	struct malloc_heap *heap;
1130 	unsigned int heap_id;
1131 
1132 	if (params == NULL || strlen(params) == 0)
1133 		return -1;
1134 
1135 	heap_id = (unsigned int)strtoul(params, NULL, 10);
1136 	heap_id = (unsigned int)strtoul(params, NULL, 10);
	if (heap_id >= RTE_MAX_HEAPS)
		return -1;
1137 	/* Get the heap stats of user provided heap id */
1138 	heap = &mcfg->malloc_heaps[heap_id];
1139 	malloc_heap_get_stats(heap, &sock_stats);
1140 
1141 	rte_tel_data_start_dict(d);
1142 	rte_tel_data_add_dict_int(d, "Heap_id", heap_id);
1143 	rte_tel_data_add_dict_string(d, "Name", heap->name);
1144 	rte_tel_data_add_dict_u64(d, "Heap_size",
1145 				  sock_stats.heap_totalsz_bytes);
1146 	rte_tel_data_add_dict_u64(d, "Free_size", sock_stats.heap_freesz_bytes);
1147 	rte_tel_data_add_dict_u64(d, "Alloc_size",
1148 				  sock_stats.heap_allocsz_bytes);
1149 	rte_tel_data_add_dict_u64(d, "Greatest_free_size",
1150 				  sock_stats.greatest_free_size);
1151 	rte_tel_data_add_dict_u64(d, "Alloc_count", sock_stats.alloc_count);
1152 	rte_tel_data_add_dict_u64(d, "Free_count", sock_stats.free_count);
1153 
1154 	return 0;
1155 }
1156 
1157 /* Telemetry callback handler to list the heap ids setup. */
1158 static int
1159 handle_eal_heap_list_request(const char *cmd __rte_unused,
1160 				const char *params __rte_unused,
1161 				struct rte_tel_data *d)
1162 {
1163 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1164 	struct rte_malloc_socket_stats sock_stats;
1165 	unsigned int heap_id;
1166 
1167 	rte_tel_data_start_array(d, RTE_TEL_INT_VAL);
1168 	/* Iterate through all initialised heaps */
1169 	for (heap_id = 0; heap_id < RTE_MAX_HEAPS; heap_id++) {
1170 		struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id];
1171 
1172 		malloc_heap_get_stats(heap, &sock_stats);
1173 		if (sock_stats.heap_totalsz_bytes != 0)
1174 			rte_tel_data_add_array_int(d, heap_id);
1175 	}
1176 
1177 	return 0;
1178 }
1179 
1180 /* Telemetry callback handler to return memzone info for requested index. */
1181 static int
1182 handle_eal_memzone_info_request(const char *cmd __rte_unused,
1183 				const char *params, struct rte_tel_data *d)
1184 {
1185 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1186 	struct rte_memseg_list *msl = NULL;
1187 	int ms_idx, ms_count = 0;
1188 	void *cur_addr, *mz_end;
1189 	struct rte_memzone *mz;
1190 	struct rte_memseg *ms;
1191 	char addr[ADDR_STR];
1192 	unsigned int mz_idx;
1193 	size_t page_sz;
1194 
1195 	if (params == NULL || strlen(params) == 0)
1196 		return -1;
1197 
1198 	mz_idx = strtoul(params, NULL, 10);
1199 
1200 	/* Get the memzone handle using index */
1201 	mz = rte_fbarray_get(&mcfg->memzones, mz_idx);
1202 
1203 	rte_tel_data_start_dict(d);
1204 	rte_tel_data_add_dict_int(d, "Zone", mz_idx);
1205 	rte_tel_data_add_dict_string(d, "Name", mz->name);
1206 	rte_tel_data_add_dict_int(d, "Length", mz->len);
1207 	snprintf(addr, ADDR_STR, "%p", mz->addr);
1208 	rte_tel_data_add_dict_string(d, "Address", addr);
1209 	rte_tel_data_add_dict_int(d, "Socket", mz->socket_id);
1210 	rte_tel_data_add_dict_int(d, "Flags", mz->flags);
1211 
1212 	/* go through each page occupied by this memzone */
1213 	msl = rte_mem_virt2memseg_list(mz->addr);
1214 	if (!msl) {
1215 		RTE_LOG(DEBUG, EAL, "Skipping bad memzone\n");
1216 		return -1;
1217 	}
1218 	page_sz = (size_t)mz->hugepage_sz;
1219 	cur_addr = RTE_PTR_ALIGN_FLOOR(mz->addr, page_sz);
1220 	mz_end = RTE_PTR_ADD(cur_addr, mz->len);
1221 
1222 	ms_idx = RTE_PTR_DIFF(mz->addr, msl->base_va) / page_sz;
1223 	ms = rte_fbarray_get(&msl->memseg_arr, ms_idx);
1224 
1225 	rte_tel_data_add_dict_int(d, "Hugepage_size", page_sz);
1226 	snprintf(addr, ADDR_STR, "%p", ms->addr);
1227 	rte_tel_data_add_dict_string(d, "Hugepage_base", addr);
1228 
1229 	do {
1230 		/* advance VA to next page */
1231 		cur_addr = RTE_PTR_ADD(cur_addr, page_sz);
1232 
1233 		/* memzones occupy contiguous segments */
1234 		++ms;
1235 		ms_count++;
1236 	} while (cur_addr < mz_end);
1237 
1238 	rte_tel_data_add_dict_int(d, "Hugepage_used", ms_count);
1239 
1240 	return 0;
1241 }
1242 
1243 static void
1244 memzone_list_cb(const struct rte_memzone *mz __rte_unused,
1245 		 void *arg __rte_unused)
1246 {
1247 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1248 	struct rte_tel_data *d = arg;
1249 	int mz_idx;
1250 
1251 	mz_idx = rte_fbarray_find_idx(&mcfg->memzones, mz);
1252 	rte_tel_data_add_array_int(d, mz_idx);
1253 }
1254 
1255 
1256 /* Telemetry callback handler to list the memzones reserved. */
1257 static int
1258 handle_eal_memzone_list_request(const char *cmd __rte_unused,
1259 				const char *params __rte_unused,
1260 				struct rte_tel_data *d)
1261 {
1262 	rte_tel_data_start_array(d, RTE_TEL_INT_VAL);
1263 	rte_memzone_walk(memzone_list_cb, d);
1264 
1265 	return 0;
1266 }
1267 
1268 RTE_INIT(memory_telemetry)
1269 {
1270 	rte_telemetry_register_cmd(
1271 			EAL_MEMZONE_LIST_REQ, handle_eal_memzone_list_request,
1272 			"List of reserved memzone indexes. Takes no parameters");
1273 	rte_telemetry_register_cmd(
1274 			EAL_MEMZONE_INFO_REQ, handle_eal_memzone_info_request,
1275 			"Returns memzone info. Parameters: int mz_id");
1276 	rte_telemetry_register_cmd(
1277 			EAL_HEAP_LIST_REQ, handle_eal_heap_list_request,
1278 			"List of initialized heap indexes. Takes no parameters");
1279 	rte_telemetry_register_cmd(
1280 			EAL_HEAP_INFO_REQ, handle_eal_heap_info_request,
1281 			"Returns malloc heap stats. Parameters: int heap_id");
1282 }
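
/*
 * These endpoints can be queried over the telemetry socket, e.g. with the
 * usertools/dpdk-telemetry.py client (parameters follow the command after a
 * comma):
 *
 *	--> /eal/heap_list
 *	--> /eal/heap_info,0
 *	--> /eal/memzone_list
 *	--> /eal/memzone_info,0
 */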
1283 #endif
1284