xref: /dpdk/lib/eal/common/eal_common_memory.c (revision 17bb60044bae68c0f062755527ad8febe9f448d1)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2014 Intel Corporation
3  */
4 
5 #include <ctype.h>
6 #include <errno.h>
7 #include <stdio.h>
8 #include <stdint.h>
9 #include <stdlib.h>
10 #include <string.h>
11 #include <inttypes.h>
12 
13 #include <rte_fbarray.h>
14 #include <rte_memory.h>
15 #include <rte_eal.h>
16 #include <rte_eal_memconfig.h>
17 #include <rte_eal_paging.h>
18 #include <rte_errno.h>
19 #include <rte_log.h>
20 #ifndef RTE_EXEC_ENV_WINDOWS
21 #include <rte_telemetry.h>
22 #endif
23 
24 #include "eal_memalloc.h"
25 #include "eal_private.h"
26 #include "eal_internal_cfg.h"
27 #include "eal_memcfg.h"
28 #include "eal_options.h"
29 #include "malloc_elem.h"
30 #include "malloc_heap.h"
31 
32 /*
33  * Try to reserve *size bytes of virtual address space. If it is successful,
34  * return the pointer to the reserved area and keep *size unmodified. Else,
35  * if shrinking is allowed, retry with a smaller zone: decrease *size by
36  * page_sz until it reaches 0. In this case, return NULL. Note: this function
37  * returns an address which is a multiple of the requested page size.
38  */
39 
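/* fbarray name for a memseg list: page size in kB, socket ID, list index */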
40 #define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i"
41 
42 static void *next_baseaddr;
43 static uint64_t system_page_sz;
44 
45 #define MAX_MMAP_WITH_DEFINED_ADDR_TRIES 5
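/*
 * Flags in 'flags' (see their use below): EAL_VIRTUAL_AREA_ADDR_IS_HINT makes
 * requested_addr a hint rather than a requirement, EAL_VIRTUAL_AREA_ALLOW_SHRINK
 * lets *size be reduced on failure, and EAL_VIRTUAL_AREA_UNMAP releases the
 * reservation before returning (only the resulting address is of interest then).
 * 'reserve_flags' is passed through to eal_mem_reserve().
 */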
46 void *
47 eal_get_virtual_area(void *requested_addr, size_t *size,
48 	size_t page_sz, int flags, int reserve_flags)
49 {
50 	bool addr_is_hint, allow_shrink, unmap, no_align;
51 	uint64_t map_sz;
52 	void *mapped_addr, *aligned_addr;
53 	uint8_t try = 0;
54 	struct internal_config *internal_conf =
55 		eal_get_internal_configuration();
56 
57 	if (system_page_sz == 0)
58 		system_page_sz = rte_mem_page_size();
59 
60 	EAL_LOG(DEBUG, "Asking for a virtual area of 0x%zx bytes", *size);
61 
62 	addr_is_hint = (flags & EAL_VIRTUAL_AREA_ADDR_IS_HINT) > 0;
63 	allow_shrink = (flags & EAL_VIRTUAL_AREA_ALLOW_SHRINK) > 0;
64 	unmap = (flags & EAL_VIRTUAL_AREA_UNMAP) > 0;
65 
66 	if (next_baseaddr == NULL && internal_conf->base_virtaddr != 0 &&
67 			rte_eal_process_type() == RTE_PROC_PRIMARY)
68 		next_baseaddr = (void *) internal_conf->base_virtaddr;
69 
70 #ifdef RTE_ARCH_64
71 	if (next_baseaddr == NULL && internal_conf->base_virtaddr == 0 &&
72 			rte_eal_process_type() == RTE_PROC_PRIMARY)
73 		next_baseaddr = (void *) eal_get_baseaddr();
74 #endif
75 	if (requested_addr == NULL && next_baseaddr != NULL) {
76 		requested_addr = next_baseaddr;
77 		requested_addr = RTE_PTR_ALIGN(requested_addr, page_sz);
78 		addr_is_hint = true;
79 	}
80 
81 	/* we don't need alignment of resulting pointer in the following cases:
82 	 *
83 	 * 1. page size is equal to system page size
84 	 * 2. we have a requested address, and it is page-aligned, and we will
85 	 *    be discarding the address if we get a different one.
86 	 *
87 	 * for all other cases, alignment is potentially necessary.
88 	 */
89 	no_align = (requested_addr != NULL &&
90 		requested_addr == RTE_PTR_ALIGN(requested_addr, page_sz) &&
91 		!addr_is_hint) ||
92 		page_sz == system_page_sz;
93 
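	/*
	 * Reserve *size bytes (plus page_sz of alignment slack unless no_align).
	 * On failure, shrink the request by one page if allowed; if a hint was
	 * given but a different address came back, release the mapping and retry
	 * with a bumped hint, up to MAX_MMAP_WITH_DEFINED_ADDR_TRIES times.
	 */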
94 	do {
95 		map_sz = no_align ? *size : *size + page_sz;
96 		if (map_sz > SIZE_MAX) {
97 			EAL_LOG(ERR, "Map size too big");
98 			rte_errno = E2BIG;
99 			return NULL;
100 		}
101 
102 		mapped_addr = eal_mem_reserve(
103 			requested_addr, (size_t)map_sz, reserve_flags);
104 		if ((mapped_addr == NULL) && allow_shrink)
105 			*size -= page_sz;
106 
107 		if ((mapped_addr != NULL) && addr_is_hint &&
108 				(mapped_addr != requested_addr)) {
109 			try++;
110 			next_baseaddr = RTE_PTR_ADD(next_baseaddr, page_sz);
111 			if (try <= MAX_MMAP_WITH_DEFINED_ADDR_TRIES) {
112 				/* hint was not used. Try with another offset */
113 				eal_mem_free(mapped_addr, map_sz);
114 				mapped_addr = NULL;
115 				requested_addr = next_baseaddr;
116 			}
117 		}
118 	} while ((allow_shrink || addr_is_hint) &&
119 		(mapped_addr == NULL) && (*size > 0));
120 
121 	/* align resulting address - if map failed, we will ignore the value
122 	 * anyway, so no need to add additional checks.
123 	 */
124 	aligned_addr = no_align ? mapped_addr :
125 			RTE_PTR_ALIGN(mapped_addr, page_sz);
126 
127 	if (*size == 0) {
128 		EAL_LOG(ERR, "Cannot get a virtual area of any size: %s",
129 			rte_strerror(rte_errno));
130 		return NULL;
131 	} else if (mapped_addr == NULL) {
132 		EAL_LOG(ERR, "Cannot get a virtual area: %s",
133 			rte_strerror(rte_errno));
134 		return NULL;
135 	} else if (requested_addr != NULL && !addr_is_hint &&
136 			aligned_addr != requested_addr) {
137 		EAL_LOG(ERR, "Cannot get a virtual area at requested address: %p (got %p)",
138 			requested_addr, aligned_addr);
139 		eal_mem_free(mapped_addr, map_sz);
140 		rte_errno = EADDRNOTAVAIL;
141 		return NULL;
142 	} else if (requested_addr != NULL && addr_is_hint &&
143 			aligned_addr != requested_addr) {
144 		/*
145 		 * demote this warning to debug if we did not explicitly request
146 		 * a base virtual address.
147 		 */
148 		if (internal_conf->base_virtaddr != 0) {
149 			EAL_LOG(WARNING, "WARNING! Base virtual address hint (%p != %p) not respected!",
150 				requested_addr, aligned_addr);
151 			EAL_LOG(WARNING, "   This may cause issues with mapping memory into secondary processes");
152 		} else {
153 			EAL_LOG(DEBUG, "WARNING! Base virtual address hint (%p != %p) not respected!",
154 				requested_addr, aligned_addr);
155 			EAL_LOG(DEBUG, "   This may cause issues with mapping memory into secondary processes");
156 		}
157 	} else if (next_baseaddr != NULL) {
158 		next_baseaddr = RTE_PTR_ADD(aligned_addr, *size);
159 	}
160 
161 	EAL_LOG(DEBUG, "Virtual area found at %p (size = 0x%zx)",
162 		aligned_addr, *size);
163 
164 	if (unmap) {
165 		eal_mem_free(mapped_addr, map_sz);
166 	} else if (!no_align) {
167 		void *map_end, *aligned_end;
168 		size_t before_len, after_len;
169 
170 		/* when we reserve space with alignment, we add alignment to
171 		 * mapping size. On 32-bit, if 1GB alignment was requested, this
172 		 * would waste 1GB of address space, which is a luxury we cannot
173 		 * afford. so, if alignment was performed, check if any unneeded
174 		 * address space can be unmapped back.
175 		 */
176 
177 		map_end = RTE_PTR_ADD(mapped_addr, (size_t)map_sz);
178 		aligned_end = RTE_PTR_ADD(aligned_addr, *size);
179 
180 		/* unmap space before aligned mmap address */
181 		before_len = RTE_PTR_DIFF(aligned_addr, mapped_addr);
182 		if (before_len > 0)
183 			eal_mem_free(mapped_addr, before_len);
184 
185 		/* unmap space after aligned end mmap address */
186 		after_len = RTE_PTR_DIFF(map_end, aligned_end);
187 		if (after_len > 0)
188 			eal_mem_free(aligned_end, after_len);
189 	}
190 
191 	if (!unmap) {
192 		/* Exclude these pages from a core dump. */
193 		eal_mem_set_dump(aligned_addr, *size, false);
194 	}
195 
196 	return aligned_addr;
197 }
198 
199 int
200 eal_memseg_list_init_named(struct rte_memseg_list *msl, const char *name,
201 		uint64_t page_sz, int n_segs, int socket_id, bool heap)
202 {
203 	if (rte_fbarray_init(&msl->memseg_arr, name, n_segs,
204 			sizeof(struct rte_memseg))) {
205 		EAL_LOG(ERR, "Cannot allocate memseg list: %s",
206 			rte_strerror(rte_errno));
207 		return -1;
208 	}
209 
210 	msl->page_sz = page_sz;
211 	msl->socket_id = socket_id;
212 	msl->base_va = NULL;
213 	msl->heap = heap;
214 
215 	EAL_LOG(DEBUG,
216 		"Memseg list allocated at socket %i, page size 0x%"PRIx64"kB",
217 		socket_id, page_sz >> 10);
218 
219 	return 0;
220 }
221 
222 int
223 eal_memseg_list_init(struct rte_memseg_list *msl, uint64_t page_sz,
224 		int n_segs, int socket_id, int type_msl_idx, bool heap)
225 {
226 	char name[RTE_FBARRAY_NAME_LEN];
227 
228 	snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id,
229 		 type_msl_idx);
230 
231 	return eal_memseg_list_init_named(
232 		msl, name, page_sz, n_segs, socket_id, heap);
233 }
234 
235 int
236 eal_memseg_list_alloc(struct rte_memseg_list *msl, int reserve_flags)
237 {
238 	size_t page_sz, mem_sz;
239 	void *addr;
240 
241 	page_sz = msl->page_sz;
242 	mem_sz = page_sz * msl->memseg_arr.len;
243 
244 	addr = eal_get_virtual_area(
245 		msl->base_va, &mem_sz, page_sz, 0, reserve_flags);
246 	if (addr == NULL) {
247 #ifndef RTE_EXEC_ENV_WINDOWS
248 		/* The hint would be misleading on Windows, because address
249 		 * is by default system-selected (base VA = 0).
250 		 * However, this function is called from many places,
251 		 * including common code, so don't duplicate the message.
252 		 */
253 		if (rte_errno == EADDRNOTAVAIL)
254 			EAL_LOG(ERR, "Cannot reserve %llu bytes at [%p] - "
255 				"please use '--" OPT_BASE_VIRTADDR "' option",
256 				(unsigned long long)mem_sz, msl->base_va);
257 #endif
258 		return -1;
259 	}
260 	msl->base_va = addr;
261 	msl->len = mem_sz;
262 
263 	EAL_LOG(DEBUG, "VA reserved for memseg list at %p, size %zx",
264 			addr, mem_sz);
265 
266 	return 0;
267 }
268 
269 void
270 eal_memseg_list_populate(struct rte_memseg_list *msl, void *addr, int n_segs)
271 {
272 	size_t page_sz = msl->page_sz;
273 	int i;
274 
275 	for (i = 0; i < n_segs; i++) {
276 		struct rte_fbarray *arr = &msl->memseg_arr;
277 		struct rte_memseg *ms = rte_fbarray_get(arr, i);
278 
279 		if (rte_eal_iova_mode() == RTE_IOVA_VA)
280 			ms->iova = (uintptr_t)addr;
281 		else
282 			ms->iova = RTE_BAD_IOVA;
283 		ms->addr = addr;
284 		ms->hugepage_sz = page_sz;
285 		ms->socket_id = 0;
286 		ms->len = page_sz;
287 
288 		rte_fbarray_set_used(arr, i);
289 
290 		addr = RTE_PTR_ADD(addr, page_sz);
291 	}
292 }
293 
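/*
 * Translate a virtual address to its memseg within the given list, or return
 * NULL if the address does not fall inside the list's VA range.
 */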
294 static struct rte_memseg *
295 virt2memseg(const void *addr, const struct rte_memseg_list *msl)
296 {
297 	const struct rte_fbarray *arr;
298 	void *start, *end;
299 	int ms_idx;
300 
301 	if (msl == NULL)
302 		return NULL;
303 
304 	/* a memseg list was specified, check if it's the right one */
305 	start = msl->base_va;
306 	end = RTE_PTR_ADD(start, msl->len);
307 
308 	if (addr < start || addr >= end)
309 		return NULL;
310 
311 	/* now, calculate index */
312 	arr = &msl->memseg_arr;
313 	ms_idx = RTE_PTR_DIFF(addr, msl->base_va) / msl->page_sz;
314 	return rte_fbarray_get(arr, ms_idx);
315 }
316 
317 static struct rte_memseg_list *
318 virt2memseg_list(const void *addr)
319 {
320 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
321 	struct rte_memseg_list *msl;
322 	int msl_idx;
323 
324 	for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
325 		void *start, *end;
326 		msl = &mcfg->memsegs[msl_idx];
327 
328 		start = msl->base_va;
329 		end = RTE_PTR_ADD(start, msl->len);
330 		if (addr >= start && addr < end)
331 			break;
332 	}
333 	/* if we didn't find our memseg list */
334 	if (msl_idx == RTE_MAX_MEMSEG_LISTS)
335 		return NULL;
336 	return msl;
337 }
338 
339 struct rte_memseg_list *
340 rte_mem_virt2memseg_list(const void *addr)
341 {
342 	return virt2memseg_list(addr);
343 }
344 
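/* argument for the IOVA-to-virtual lookup walks below */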
345 struct virtiova {
346 	rte_iova_t iova;
347 	void *virt;
348 };
349 static int
350 find_virt(const struct rte_memseg_list *msl __rte_unused,
351 		const struct rte_memseg *ms, void *arg)
352 {
353 	struct virtiova *vi = arg;
354 	if (vi->iova >= ms->iova && vi->iova < (ms->iova + ms->len)) {
355 		size_t offset = vi->iova - ms->iova;
356 		vi->virt = RTE_PTR_ADD(ms->addr, offset);
357 		/* stop the walk */
358 		return 1;
359 	}
360 	return 0;
361 }
362 static int
363 find_virt_legacy(const struct rte_memseg_list *msl __rte_unused,
364 		const struct rte_memseg *ms, size_t len, void *arg)
365 {
366 	struct virtiova *vi = arg;
367 	if (vi->iova >= ms->iova && vi->iova < (ms->iova + len)) {
368 		size_t offset = vi->iova - ms->iova;
369 		vi->virt = RTE_PTR_ADD(ms->addr, offset);
370 		/* stop the walk */
371 		return 1;
372 	}
373 	return 0;
374 }
375 
376 void *
377 rte_mem_iova2virt(rte_iova_t iova)
378 {
379 	struct virtiova vi;
380 	const struct internal_config *internal_conf =
381 		eal_get_internal_configuration();
382 
383 	memset(&vi, 0, sizeof(vi));
384 
385 	vi.iova = iova;
386 	/* for legacy mem, we can get away with scanning VA-contiguous segments,
387 	 * as we know they are PA-contiguous as well
388 	 */
389 	if (internal_conf->legacy_mem)
390 		rte_memseg_contig_walk(find_virt_legacy, &vi);
391 	else
392 		rte_memseg_walk(find_virt, &vi);
393 
394 	return vi.virt;
395 }
396 
397 struct rte_memseg *
398 rte_mem_virt2memseg(const void *addr, const struct rte_memseg_list *msl)
399 {
400 	return virt2memseg(addr, msl != NULL ? msl :
401 			rte_mem_virt2memseg_list(addr));
402 }
403 
404 static int
405 physmem_size(const struct rte_memseg_list *msl, void *arg)
406 {
407 	uint64_t *total_len = arg;
408 
409 	if (msl->external)
410 		return 0;
411 
412 	*total_len += msl->memseg_arr.count * msl->page_sz;
413 
414 	return 0;
415 }
416 
417 /* get the total size of memory */
418 uint64_t
419 rte_eal_get_physmem_size(void)
420 {
421 	uint64_t total_len = 0;
422 
423 	rte_memseg_list_walk(physmem_size, &total_len);
424 
425 	return total_len;
426 }
427 
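/* memseg walk callback: print one segment's properties to the given file */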
428 static int
429 dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
430 		void *arg)
431 {
432 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
433 	int msl_idx, ms_idx, fd;
434 	FILE *f = arg;
435 
436 	msl_idx = msl - mcfg->memsegs;
437 	if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS)
438 		return -1;
439 
440 	ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
441 	if (ms_idx < 0)
442 		return -1;
443 
444 	fd = eal_memalloc_get_seg_fd(msl_idx, ms_idx);
445 	fprintf(f, "Segment %i-%i: IOVA:0x%"PRIx64", len:%zu, "
446 			"virt:%p, socket_id:%"PRId32", "
447 			"hugepage_sz:%"PRIu64", nchannel:%"PRIx32", "
448 			"nrank:%"PRIx32" fd:%i\n",
449 			msl_idx, ms_idx,
450 			ms->iova,
451 			ms->len,
452 			ms->addr,
453 			ms->socket_id,
454 			ms->hugepage_sz,
455 			ms->nchannel,
456 			ms->nrank,
457 			fd);
458 
459 	return 0;
460 }
461 
462 /*
463  * Defined here because it is declared in rte_memory.h, but the actual
464  * implementation is in eal_common_memalloc.c, like all other memalloc internals.
465  */
466 int
467 rte_mem_event_callback_register(const char *name, rte_mem_event_callback_t clb,
468 		void *arg)
469 {
470 	const struct internal_config *internal_conf =
471 		eal_get_internal_configuration();
472 
473 	/* FreeBSD boots with legacy mem enabled by default */
474 	if (internal_conf->legacy_mem) {
475 		EAL_LOG(DEBUG, "Registering mem event callbacks not supported");
476 		rte_errno = ENOTSUP;
477 		return -1;
478 	}
479 	return eal_memalloc_mem_event_callback_register(name, clb, arg);
480 }
481 
482 int
483 rte_mem_event_callback_unregister(const char *name, void *arg)
484 {
485 	const struct internal_config *internal_conf =
486 		eal_get_internal_configuration();
487 
488 	/* FreeBSD boots with legacy mem enabled by default */
489 	if (internal_conf->legacy_mem) {
490 		EAL_LOG(DEBUG, "Unregistering mem event callbacks not supported");
491 		rte_errno = ENOTSUP;
492 		return -1;
493 	}
494 	return eal_memalloc_mem_event_callback_unregister(name, arg);
495 }
496 
497 int
498 rte_mem_alloc_validator_register(const char *name,
499 		rte_mem_alloc_validator_t clb, int socket_id, size_t limit)
500 {
501 	const struct internal_config *internal_conf =
502 		eal_get_internal_configuration();
503 
504 	/* FreeBSD boots with legacy mem enabled by default */
505 	if (internal_conf->legacy_mem) {
506 		EAL_LOG(DEBUG, "Registering mem alloc validators not supported");
507 		rte_errno = ENOTSUP;
508 		return -1;
509 	}
510 	return eal_memalloc_mem_alloc_validator_register(name, clb, socket_id,
511 			limit);
512 }
513 
514 int
515 rte_mem_alloc_validator_unregister(const char *name, int socket_id)
516 {
517 	const struct internal_config *internal_conf =
518 		eal_get_internal_configuration();
519 
520 	/* FreeBSD boots with legacy mem enabled by default */
521 	if (internal_conf->legacy_mem) {
522 		EAL_LOG(DEBUG, "Unregistering mem alloc validators not supported");
523 		rte_errno = ENOTSUP;
524 		return -1;
525 	}
526 	return eal_memalloc_mem_alloc_validator_unregister(name, socket_id);
527 }
528 
529 /* Dump the physical memory layout on console */
530 void
531 rte_dump_physmem_layout(FILE *f)
532 {
533 	rte_memseg_walk(dump_memseg, f);
534 	fprintf(f, "Total Memory Segments size = %"PRIu64"M\n",
535 		rte_eal_get_physmem_size() / (1024 * 1024));
536 }
537 
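/*
 * memseg walk callback: return 1 (stopping the walk) if the segment's highest
 * IOVA has any bits set above the DMA mask width, i.e. the device cannot
 * address it.
 */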
538 static int
539 check_iova(const struct rte_memseg_list *msl __rte_unused,
540 		const struct rte_memseg *ms, void *arg)
541 {
542 	uint64_t *mask = arg;
543 	rte_iova_t iova;
544 
545 	/* higher address within segment */
546 	iova = (ms->iova + ms->len) - 1;
547 	if (!(iova & *mask))
548 		return 0;
549 
550 	EAL_LOG(DEBUG, "memseg iova %"PRIx64", len %zx, out of range",
551 			    ms->iova, ms->len);
552 
553 	EAL_LOG(DEBUG, "\tusing dma mask %"PRIx64, *mask);
554 	return 1;
555 }
556 
557 #define MAX_DMA_MASK_BITS 63
558 
559 /* check memseg iovas are within the required range based on dma mask */
560 static int
561 check_dma_mask(uint8_t maskbits, bool thread_unsafe)
562 {
563 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
564 	uint64_t mask;
565 	int ret;
566 
567 	/* Sanity check. We only check that the width can be managed with a
568 	 * 64-bit variable. Any higher value is likely wrong anyway. */
569 	if (maskbits > MAX_DMA_MASK_BITS) {
570 		EAL_LOG(ERR, "wrong dma mask size %u (Max: %u)",
571 				   maskbits, MAX_DMA_MASK_BITS);
572 		return -1;
573 	}
574 
575 	/* create dma mask */
576 	mask = ~((1ULL << maskbits) - 1);
577 
578 	if (thread_unsafe)
579 		ret = rte_memseg_walk_thread_unsafe(check_iova, &mask);
580 	else
581 		ret = rte_memseg_walk(check_iova, &mask);
582 
583 	if (ret)
584 		/*
585 		 * Dma mask precludes hugepage usage.
586 		 * This device can not be used and we do not need to keep
587 		 * the dma mask.
588 		 */
589 		return 1;
590 
591 	/*
592 	 * we need to keep the more restricted maskbit for checking
593 	 * potential dynamic memory allocation in the future.
594 	 */
595 	mcfg->dma_maskbits = mcfg->dma_maskbits == 0 ? maskbits :
596 			     RTE_MIN(mcfg->dma_maskbits, maskbits);
597 
598 	return 0;
599 }
600 
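/*
 * Illustrative usage sketch (not from this file): a driver whose device can
 * only address 40 bits may check at probe time whether all hugepage IOVAs fit:
 *
 *     if (rte_mem_check_dma_mask(40) != 0)
 *         return -1;   // memory out of the device's DMA range, or bad mask
 */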
601 int
602 rte_mem_check_dma_mask(uint8_t maskbits)
603 {
604 	return check_dma_mask(maskbits, false);
605 }
606 
607 int
608 rte_mem_check_dma_mask_thread_unsafe(uint8_t maskbits)
609 {
610 	return check_dma_mask(maskbits, true);
611 }
612 
613 /*
614  * Set dma mask to use when memory initialization is done.
615  *
616  * This function should ONLY be used by code executed before the memory
617  * initialization. PMDs should use rte_mem_check_dma_mask if addressing
618  * limitations by the device.
619  */
620 void
621 rte_mem_set_dma_mask(uint8_t maskbits)
622 {
623 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
624 
625 	mcfg->dma_maskbits = mcfg->dma_maskbits == 0 ? maskbits :
626 			     RTE_MIN(mcfg->dma_maskbits, maskbits);
627 }
628 
629 /* return the number of memory channels */
630 unsigned rte_memory_get_nchannel(void)
631 {
632 	return rte_eal_get_configuration()->mem_config->nchannel;
633 }
634 
635 /* return the number of memory rank */
636 unsigned rte_memory_get_nrank(void)
637 {
638 	return rte_eal_get_configuration()->mem_config->nrank;
639 }
640 
641 static int
642 rte_eal_memdevice_init(void)
643 {
644 	struct rte_config *config;
645 	const struct internal_config *internal_conf;
646 
647 	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
648 		return 0;
649 
650 	internal_conf = eal_get_internal_configuration();
651 	config = rte_eal_get_configuration();
652 	config->mem_config->nchannel = internal_conf->force_nchannel;
653 	config->mem_config->nrank = internal_conf->force_nrank;
654 
655 	return 0;
656 }
657 
658 /* Lock page in physical memory and prevent from swapping. */
659 int
660 rte_mem_lock_page(const void *virt)
661 {
662 	uintptr_t virtual = (uintptr_t)virt;
663 	size_t page_size = rte_mem_page_size();
664 	uintptr_t aligned = RTE_PTR_ALIGN_FLOOR(virtual, page_size);
665 	return rte_mem_lock((void *)aligned, page_size);
666 }
667 
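/*
 * Walk all VA-contiguous runs of used memsegs. A non-zero return value from
 * the callback stops the walk and is propagated back to the caller.
 */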
668 int
669 rte_memseg_contig_walk_thread_unsafe(rte_memseg_contig_walk_t func, void *arg)
670 {
671 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
672 	int i, ms_idx, ret = 0;
673 
674 	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
675 		struct rte_memseg_list *msl = &mcfg->memsegs[i];
676 		const struct rte_memseg *ms;
677 		struct rte_fbarray *arr;
678 
679 		if (msl->memseg_arr.count == 0)
680 			continue;
681 
682 		arr = &msl->memseg_arr;
683 
684 		ms_idx = rte_fbarray_find_next_used(arr, 0);
685 		while (ms_idx >= 0) {
686 			int n_segs;
687 			size_t len;
688 
689 			ms = rte_fbarray_get(arr, ms_idx);
690 
691 			/* find how many more segments there are, starting with
692 			 * this one.
693 			 */
694 			n_segs = rte_fbarray_find_contig_used(arr, ms_idx);
695 			len = n_segs * msl->page_sz;
696 
697 			ret = func(msl, ms, len, arg);
698 			if (ret)
699 				return ret;
700 			ms_idx = rte_fbarray_find_next_used(arr,
701 					ms_idx + n_segs);
702 		}
703 	}
704 	return 0;
705 }
706 
707 int
708 rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg)
709 {
710 	int ret = 0;
711 
712 	/* do not allow allocations/frees/init while we iterate */
713 	rte_mcfg_mem_read_lock();
714 	ret = rte_memseg_contig_walk_thread_unsafe(func, arg);
715 	rte_mcfg_mem_read_unlock();
716 
717 	return ret;
718 }
719 
720 int
721 rte_memseg_walk_thread_unsafe(rte_memseg_walk_t func, void *arg)
722 {
723 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
724 	int i, ms_idx, ret = 0;
725 
726 	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
727 		struct rte_memseg_list *msl = &mcfg->memsegs[i];
728 		const struct rte_memseg *ms;
729 		struct rte_fbarray *arr;
730 
731 		if (msl->memseg_arr.count == 0)
732 			continue;
733 
734 		arr = &msl->memseg_arr;
735 
736 		ms_idx = rte_fbarray_find_next_used(arr, 0);
737 		while (ms_idx >= 0) {
738 			ms = rte_fbarray_get(arr, ms_idx);
739 			ret = func(msl, ms, arg);
740 			if (ret)
741 				return ret;
742 			ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1);
743 		}
744 	}
745 	return 0;
746 }
747 
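/*
 * Illustrative usage sketch (hypothetical callback, not part of this file):
 * count all used memsegs.
 *
 *     static int count_cb(const struct rte_memseg_list *msl __rte_unused,
 *             const struct rte_memseg *ms __rte_unused, void *arg)
 *     {
 *         (*(unsigned int *)arg)++;
 *         return 0;   // keep walking
 *     }
 *
 *     unsigned int n = 0;
 *     rte_memseg_walk(count_cb, &n);
 */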
748 int
749 rte_memseg_walk(rte_memseg_walk_t func, void *arg)
750 {
751 	int ret = 0;
752 
753 	/* do not allow allocations/frees/init while we iterate */
754 	rte_mcfg_mem_read_lock();
755 	ret = rte_memseg_walk_thread_unsafe(func, arg);
756 	rte_mcfg_mem_read_unlock();
757 
758 	return ret;
759 }
760 
761 int
762 rte_memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg)
763 {
764 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
765 	int i, ret = 0;
766 
767 	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
768 		struct rte_memseg_list *msl = &mcfg->memsegs[i];
769 
770 		if (msl->base_va == NULL)
771 			continue;
772 
773 		ret = func(msl, arg);
774 		if (ret)
775 			return ret;
776 	}
777 	return 0;
778 }
779 
780 int
781 rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg)
782 {
783 	int ret = 0;
784 
785 	/* do not allow allocations/frees/init while we iterate */
786 	rte_mcfg_mem_read_lock();
787 	ret = rte_memseg_list_walk_thread_unsafe(func, arg);
788 	rte_mcfg_mem_read_unlock();
789 
790 	return ret;
791 }
792 
793 int
794 rte_memseg_get_fd_thread_unsafe(const struct rte_memseg *ms)
795 {
796 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
797 	struct rte_memseg_list *msl;
798 	struct rte_fbarray *arr;
799 	int msl_idx, seg_idx, ret;
800 
801 	if (ms == NULL) {
802 		rte_errno = EINVAL;
803 		return -1;
804 	}
805 
806 	msl = rte_mem_virt2memseg_list(ms->addr);
807 	if (msl == NULL) {
808 		rte_errno = EINVAL;
809 		return -1;
810 	}
811 	arr = &msl->memseg_arr;
812 
813 	msl_idx = msl - mcfg->memsegs;
814 	seg_idx = rte_fbarray_find_idx(arr, ms);
815 
816 	if (!rte_fbarray_is_used(arr, seg_idx)) {
817 		rte_errno = ENOENT;
818 		return -1;
819 	}
820 
821 	/* segment fd API is not supported for external segments */
822 	if (msl->external) {
823 		rte_errno = ENOTSUP;
824 		return -1;
825 	}
826 
827 	ret = eal_memalloc_get_seg_fd(msl_idx, seg_idx);
828 	if (ret < 0) {
829 		rte_errno = -ret;
830 		ret = -1;
831 	}
832 	return ret;
833 }
834 
835 int
836 rte_memseg_get_fd(const struct rte_memseg *ms)
837 {
838 	int ret;
839 
840 	rte_mcfg_mem_read_lock();
841 	ret = rte_memseg_get_fd_thread_unsafe(ms);
842 	rte_mcfg_mem_read_unlock();
843 
844 	return ret;
845 }
846 
847 int
848 rte_memseg_get_fd_offset_thread_unsafe(const struct rte_memseg *ms,
849 		size_t *offset)
850 {
851 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
852 	struct rte_memseg_list *msl;
853 	struct rte_fbarray *arr;
854 	int msl_idx, seg_idx, ret;
855 
856 	if (ms == NULL || offset == NULL) {
857 		rte_errno = EINVAL;
858 		return -1;
859 	}
860 
861 	msl = rte_mem_virt2memseg_list(ms->addr);
862 	if (msl == NULL) {
863 		rte_errno = EINVAL;
864 		return -1;
865 	}
866 	arr = &msl->memseg_arr;
867 
868 	msl_idx = msl - mcfg->memsegs;
869 	seg_idx = rte_fbarray_find_idx(arr, ms);
870 
871 	if (!rte_fbarray_is_used(arr, seg_idx)) {
872 		rte_errno = ENOENT;
873 		return -1;
874 	}
875 
876 	/* segment fd API is not supported for external segments */
877 	if (msl->external) {
878 		rte_errno = ENOTSUP;
879 		return -1;
880 	}
881 
882 	ret = eal_memalloc_get_seg_fd_offset(msl_idx, seg_idx, offset);
883 	if (ret < 0) {
884 		rte_errno = -ret;
885 		ret = -1;
886 	}
887 	return ret;
888 }
889 
890 int
891 rte_memseg_get_fd_offset(const struct rte_memseg *ms, size_t *offset)
892 {
893 	int ret;
894 
895 	rte_mcfg_mem_read_lock();
896 	ret = rte_memseg_get_fd_offset_thread_unsafe(ms, offset);
897 	rte_mcfg_mem_read_unlock();
898 
899 	return ret;
900 }
901 
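/*
 * Illustrative usage sketch (hypothetical values, not from this file): register
 * an externally allocated, page-aligned buffer with EAL so that other processes
 * can later attach to it with rte_extmem_attach():
 *
 *     size_t page = 2 << 20, len = 16 * page;     // sixteen 2 MB pages
 *     void *va = aligned_alloc(page, len);        // externally managed memory
 *     if (va == NULL || rte_extmem_register(va, len, NULL, 0, page) != 0)
 *         return -1;                              // rte_errno holds the cause
 */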
902 int
903 rte_extmem_register(void *va_addr, size_t len, rte_iova_t iova_addrs[],
904 		unsigned int n_pages, size_t page_sz)
905 {
906 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
907 	unsigned int socket_id, n;
908 	int ret = 0;
909 
910 	if (va_addr == NULL || page_sz == 0 || len == 0 ||
911 			!rte_is_power_of_2(page_sz) ||
912 			RTE_ALIGN(len, page_sz) != len ||
913 			((len / page_sz) != n_pages && iova_addrs != NULL) ||
914 			!rte_is_aligned(va_addr, page_sz)) {
915 		rte_errno = EINVAL;
916 		return -1;
917 	}
918 	rte_mcfg_mem_write_lock();
919 
920 	/* make sure the segment doesn't already exist */
921 	if (malloc_heap_find_external_seg(va_addr, len) != NULL) {
922 		rte_errno = EEXIST;
923 		ret = -1;
924 		goto unlock;
925 	}
926 
927 	/* get next available socket ID */
928 	socket_id = mcfg->next_socket_id;
929 	if (socket_id > INT32_MAX) {
930 		EAL_LOG(ERR, "Cannot assign new socket IDs");
931 		rte_errno = ENOSPC;
932 		ret = -1;
933 		goto unlock;
934 	}
935 
936 	/* we can create a new memseg */
937 	n = len / page_sz;
938 	if (malloc_heap_create_external_seg(va_addr, iova_addrs, n,
939 			page_sz, "extmem", socket_id) == NULL) {
940 		ret = -1;
941 		goto unlock;
942 	}
943 
944 	/* memseg list successfully created - increment next socket ID */
945 	mcfg->next_socket_id++;
946 unlock:
947 	rte_mcfg_mem_write_unlock();
948 	return ret;
949 }
950 
951 int
952 rte_extmem_unregister(void *va_addr, size_t len)
953 {
954 	struct rte_memseg_list *msl;
955 	int ret = 0;
956 
957 	if (va_addr == NULL || len == 0) {
958 		rte_errno = EINVAL;
959 		return -1;
960 	}
961 	rte_mcfg_mem_write_lock();
962 
963 	/* find our segment */
964 	msl = malloc_heap_find_external_seg(va_addr, len);
965 	if (msl == NULL) {
966 		rte_errno = ENOENT;
967 		ret = -1;
968 		goto unlock;
969 	}
970 
971 	ret = malloc_heap_destroy_external_seg(msl);
972 unlock:
973 	rte_mcfg_mem_write_unlock();
974 	return ret;
975 }
976 
977 static int
978 sync_memory(void *va_addr, size_t len, bool attach)
979 {
980 	struct rte_memseg_list *msl;
981 	int ret = 0;
982 
983 	if (va_addr == NULL || len == 0) {
984 		rte_errno = EINVAL;
985 		return -1;
986 	}
987 	rte_mcfg_mem_write_lock();
988 
989 	/* find our segment */
990 	msl = malloc_heap_find_external_seg(va_addr, len);
991 	if (msl == NULL) {
992 		rte_errno = ENOENT;
993 		ret = -1;
994 		goto unlock;
995 	}
996 	if (attach)
997 		ret = rte_fbarray_attach(&msl->memseg_arr);
998 	else
999 		ret = rte_fbarray_detach(&msl->memseg_arr);
1000 
1001 unlock:
1002 	rte_mcfg_mem_write_unlock();
1003 	return ret;
1004 }
1005 
1006 int
1007 rte_extmem_attach(void *va_addr, size_t len)
1008 {
1009 	return sync_memory(va_addr, len, true);
1010 }
1011 
1012 int
1013 rte_extmem_detach(void *va_addr, size_t len)
1014 {
1015 	return sync_memory(va_addr, len, false);
1016 }
1017 
1018 /* detach all EAL memory */
1019 int
1020 rte_eal_memory_detach(void)
1021 {
1022 	const struct internal_config *internal_conf =
1023 		eal_get_internal_configuration();
1024 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1025 	size_t page_sz = rte_mem_page_size();
1026 	unsigned int i;
1027 
1028 	if (internal_conf->in_memory == 1)
1029 		return 0;
1030 
1031 	rte_rwlock_write_lock(&mcfg->memory_hotplug_lock);
1032 
1033 	/* detach internal memory subsystem data first */
1034 	if (eal_memalloc_cleanup())
1035 		EAL_LOG(ERR, "Could not release memory subsystem data");
1036 
1037 	for (i = 0; i < RTE_DIM(mcfg->memsegs); i++) {
1038 		struct rte_memseg_list *msl = &mcfg->memsegs[i];
1039 
1040 		/* skip uninitialized segments */
1041 		if (msl->base_va == NULL)
1042 			continue;
1043 		/*
1044 		 * external segments are supposed to be detached at this point,
1045 		 * but if they aren't, we can't really do anything about it,
1046 		 * because if we skip them here, they'll become invalid after
1047 		 * we unmap the memconfig anyway. however, if this is externally
1048 		 * referenced memory, we have no business unmapping it.
1049 		 */
1050 		if (!msl->external)
1051 			if (rte_mem_unmap(msl->base_va, msl->len) != 0)
1052 				EAL_LOG(ERR, "Could not unmap memory: %s",
1053 						rte_strerror(rte_errno));
1054 
1055 		/*
1056 		 * we are detaching the fbarray rather than destroying because
1057 		 * other processes might still reference this fbarray, and we
1058 		 * have no way of knowing if they still do.
1059 		 */
1060 		if (rte_fbarray_detach(&msl->memseg_arr))
1061 			EAL_LOG(ERR, "Could not detach fbarray: %s",
1062 					rte_strerror(rte_errno));
1063 	}
1064 	rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock);
1065 
1066 	/*
1067 	 * we've detached the memseg lists, so we can unmap the shared mem
1068 	 * config - we can't zero it out because it might still be referenced
1069 	 * by other processes.
1070 	 */
1071 	if (internal_conf->no_shconf == 0 && mcfg->mem_cfg_addr != 0) {
1072 		if (rte_mem_unmap(mcfg, RTE_ALIGN(sizeof(*mcfg), page_sz)) != 0)
1073 			EAL_LOG(ERR, "Could not unmap shared memory config: %s",
1074 					rte_strerror(rte_errno));
1075 	}
1076 	rte_eal_get_configuration()->mem_config = NULL;
1077 
1078 	return 0;
1079 }
1080 
1081 /* init memory subsystem */
1082 int
1083 rte_eal_memory_init(void)
1084 {
1085 	const struct internal_config *internal_conf =
1086 		eal_get_internal_configuration();
1087 	int retval;
1088 
1089 	EAL_LOG(DEBUG, "Setting up physically contiguous memory...");
1090 
1091 	if (rte_eal_memseg_init() < 0)
1092 		goto fail;
1093 
1094 	if (eal_memalloc_init() < 0)
1095 		goto fail;
1096 
1097 	retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
1098 			rte_eal_hugepage_init() :
1099 			rte_eal_hugepage_attach();
1100 	if (retval < 0)
1101 		goto fail;
1102 
1103 	if (internal_conf->no_shconf == 0 && rte_eal_memdevice_init() < 0)
1104 		goto fail;
1105 
1106 	return 0;
1107 fail:
1108 	return -1;
1109 }
1110 
1111 #ifndef RTE_EXEC_ENV_WINDOWS
1112 #define EAL_MEMZONE_LIST_REQ		"/eal/memzone_list"
1113 #define EAL_MEMZONE_INFO_REQ		"/eal/memzone_info"
1114 #define EAL_HEAP_LIST_REQ		"/eal/heap_list"
1115 #define EAL_HEAP_INFO_REQ		"/eal/heap_info"
1116 #define EAL_MEMSEG_LISTS_REQ		"/eal/memseg_lists"
1117 #define EAL_MEMSEG_LIST_INFO_REQ	"/eal/memseg_list_info"
1118 #define EAL_MEMSEG_INFO_REQ		"/eal/memseg_info"
1119 #define EAL_ELEMENT_LIST_REQ		"/eal/mem_element_list"
1120 #define EAL_ELEMENT_INFO_REQ		"/eal/mem_element_info"
1121 #define ADDR_STR			15
1122 
1123 
1124 /* Telemetry callback handler to return heap stats for requested heap id. */
1125 static int
1126 handle_eal_heap_info_request(const char *cmd __rte_unused, const char *params,
1127 			     struct rte_tel_data *d)
1128 {
1129 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1130 	struct rte_malloc_socket_stats sock_stats;
1131 	struct malloc_heap *heap;
1132 	unsigned int heap_id;
1133 
1134 	if (params == NULL || strlen(params) == 0)
1135 		return -1;
1136 
1137 	heap_id = (unsigned int)strtoul(params, NULL, 10);
1138 
1139 	/* Get the heap stats of user provided heap id */
1140 	heap = &mcfg->malloc_heaps[heap_id];
1141 	malloc_heap_get_stats(heap, &sock_stats);
1142 
1143 	rte_tel_data_start_dict(d);
1144 	rte_tel_data_add_dict_uint(d, "Heap_id", heap_id);
1145 	rte_tel_data_add_dict_string(d, "Name", heap->name);
1146 	rte_tel_data_add_dict_uint(d, "Heap_size",
1147 				   sock_stats.heap_totalsz_bytes);
1148 	rte_tel_data_add_dict_uint(d, "Free_size",
1149 				   sock_stats.heap_freesz_bytes);
1150 	rte_tel_data_add_dict_uint(d, "Alloc_size",
1151 				   sock_stats.heap_allocsz_bytes);
1152 	rte_tel_data_add_dict_uint(d, "Greatest_free_size",
1153 				   sock_stats.greatest_free_size);
1154 	rte_tel_data_add_dict_uint(d, "Alloc_count", sock_stats.alloc_count);
1155 	rte_tel_data_add_dict_uint(d, "Free_count", sock_stats.free_count);
1156 
1157 	return 0;
1158 }
1159 
1160 /* Telemetry callback handler to list the heap ids setup. */
1161 static int
1162 handle_eal_heap_list_request(const char *cmd __rte_unused,
1163 				const char *params __rte_unused,
1164 				struct rte_tel_data *d)
1165 {
1166 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1167 	struct rte_malloc_socket_stats sock_stats;
1168 	unsigned int heap_id;
1169 
1170 	rte_tel_data_start_array(d, RTE_TEL_INT_VAL);
1171 	/* Iterate through all initialised heaps */
1172 	for (heap_id = 0; heap_id < RTE_MAX_HEAPS; heap_id++) {
1173 		struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id];
1174 
1175 		malloc_heap_get_stats(heap, &sock_stats);
1176 		if (sock_stats.heap_totalsz_bytes != 0)
1177 			rte_tel_data_add_array_int(d, heap_id);
1178 	}
1179 
1180 	return 0;
1181 }
1182 
1183 /* Telemetry callback handler to return memzone info for requested index. */
1184 static int
1185 handle_eal_memzone_info_request(const char *cmd __rte_unused,
1186 				const char *params, struct rte_tel_data *d)
1187 {
1188 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1189 	struct rte_memseg_list *msl = NULL;
1190 	int ms_idx, ms_count = 0;
1191 	void *cur_addr, *mz_end;
1192 	struct rte_memzone *mz;
1193 	struct rte_memseg *ms;
1194 	char addr[ADDR_STR];
1195 	unsigned int mz_idx;
1196 	size_t page_sz;
1197 
1198 	if (params == NULL || strlen(params) == 0)
1199 		return -1;
1200 
1201 	mz_idx = strtoul(params, NULL, 10);
1202 
1203 	/* Get the memzone handle using index */
1204 	mz = rte_fbarray_get(&mcfg->memzones, mz_idx);
1205 
1206 	rte_tel_data_start_dict(d);
1207 	rte_tel_data_add_dict_uint(d, "Zone", mz_idx);
1208 	rte_tel_data_add_dict_string(d, "Name", mz->name);
1209 	rte_tel_data_add_dict_uint(d, "Length", mz->len);
1210 	snprintf(addr, ADDR_STR, "%p", mz->addr);
1211 	rte_tel_data_add_dict_string(d, "Address", addr);
1212 	rte_tel_data_add_dict_int(d, "Socket", mz->socket_id);
1213 	rte_tel_data_add_dict_uint(d, "Flags", mz->flags);
1214 
1215 	/* go through each page occupied by this memzone */
1216 	msl = rte_mem_virt2memseg_list(mz->addr);
1217 	if (!msl) {
1218 		EAL_LOG(DEBUG, "Skipping bad memzone");
1219 		return -1;
1220 	}
1221 	page_sz = (size_t)mz->hugepage_sz;
1222 	cur_addr = RTE_PTR_ALIGN_FLOOR(mz->addr, page_sz);
1223 	mz_end = RTE_PTR_ADD(cur_addr, mz->len);
1224 
1225 	ms_idx = RTE_PTR_DIFF(mz->addr, msl->base_va) / page_sz;
1226 	ms = rte_fbarray_get(&msl->memseg_arr, ms_idx);
1227 
1228 	rte_tel_data_add_dict_uint(d, "Hugepage_size", page_sz);
1229 	snprintf(addr, ADDR_STR, "%p", ms->addr);
1230 	rte_tel_data_add_dict_string(d, "Hugepage_base", addr);
1231 
1232 	do {
1233 		/* advance VA to next page */
1234 		cur_addr = RTE_PTR_ADD(cur_addr, page_sz);
1235 
1236 		/* memzones occupy contiguous segments */
1237 		++ms;
1238 		ms_count++;
1239 	} while (cur_addr < mz_end);
1240 
1241 	rte_tel_data_add_dict_int(d, "Hugepage_used", ms_count);
1242 
1243 	return 0;
1244 }
1245 
1246 static void
1247 memzone_list_cb(const struct rte_memzone *mz __rte_unused,
1248 		 void *arg __rte_unused)
1249 {
1250 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1251 	struct rte_tel_data *d = arg;
1252 	int mz_idx;
1253 
1254 	mz_idx = rte_fbarray_find_idx(&mcfg->memzones, mz);
1255 	rte_tel_data_add_array_int(d, mz_idx);
1256 }
1257 
1258 
1259 /* Telemetry callback handler to list the memzones reserved. */
1260 static int
1261 handle_eal_memzone_list_request(const char *cmd __rte_unused,
1262 				const char *params __rte_unused,
1263 				struct rte_tel_data *d)
1264 {
1265 	rte_tel_data_start_array(d, RTE_TEL_INT_VAL);
1266 	rte_memzone_walk(memzone_list_cb, d);
1267 
1268 	return 0;
1269 }
1270 
1271 /* n_vals is the number of params to be parsed. */
1272 static int
1273 parse_params(const char *params, uint32_t *vals, size_t n_vals)
1274 {
1275 	char dlim[2] = ",";
1276 	char *params_args;
1277 	size_t count = 0;
1278 	char *token;
1279 
1280 	if (vals == NULL || params == NULL || strlen(params) == 0)
1281 		return -1;
1282 
1283 	/* strtok() expects a char *, but params is a const char *; passing it
1284 	 * directly would raise a compiler warning, so work on a duplicate.
1285 	 */
1286 	params_args = strdup(params);
1287 	if (params_args == NULL)
1288 		return -1;
1289 
1290 	token = strtok(params_args, dlim);
1291 	while (token && isdigit(*token) && count < n_vals) {
1292 		vals[count++] = strtoul(token, NULL, 10);
1293 		token = strtok(NULL, dlim);
1294 	}
1295 
1296 	free(params_args);
1297 
1298 	if (count < n_vals)
1299 		return -1;
1300 
1301 	return 0;
1302 }
1303 
1304 static int
1305 handle_eal_memseg_lists_request(const char *cmd __rte_unused,
1306 				const char *params __rte_unused,
1307 				struct rte_tel_data *d)
1308 {
1309 	struct rte_mem_config *mcfg;
1310 	int i;
1311 
1312 	rte_tel_data_start_array(d, RTE_TEL_INT_VAL);
1313 
1314 	rte_mcfg_mem_read_lock();
1315 	mcfg = rte_eal_get_configuration()->mem_config;
1316 
1317 	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
1318 		struct rte_memseg_list *msl = &mcfg->memsegs[i];
1319 		if (msl->memseg_arr.count == 0)
1320 			continue;
1321 
1322 		rte_tel_data_add_array_int(d, i);
1323 	}
1324 	rte_mcfg_mem_read_unlock();
1325 
1326 	return 0;
1327 }
1328 
1329 static int
1330 handle_eal_memseg_list_info_request(const char *cmd __rte_unused,
1331 				    const char *params, struct rte_tel_data *d)
1332 {
1333 	struct rte_mem_config *mcfg;
1334 	struct rte_memseg_list *msl;
1335 	struct rte_fbarray *arr;
1336 	uint32_t ms_list_idx;
1337 	int ms_idx;
1338 	/* size of an array == num params to be parsed. */
1339 	uint32_t vals[1] = {0};
1340 
1341 	if (parse_params(params, vals, RTE_DIM(vals)) < 0)
1342 		return -1;
1343 
1344 	ms_list_idx = vals[0];
1345 	if (ms_list_idx >= RTE_MAX_MEMSEG_LISTS)
1346 		return -1;
1347 
1348 	rte_tel_data_start_array(d, RTE_TEL_INT_VAL);
1349 
1350 	rte_mcfg_mem_read_lock();
1351 	mcfg = rte_eal_get_configuration()->mem_config;
1352 	msl = &mcfg->memsegs[ms_list_idx];
1353 	if (msl->memseg_arr.count == 0)
1354 		goto done;
1355 
1356 	arr = &msl->memseg_arr;
1357 
1358 	ms_idx = rte_fbarray_find_next_used(arr, 0);
1359 	while (ms_idx >= 0) {
1360 		rte_tel_data_add_array_int(d, ms_idx);
1361 		ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1);
1362 	}
1363 
1364 done:
1365 	rte_mcfg_mem_read_unlock();
1366 
1367 	return 0;
1368 }
1369 
1370 static int
1371 handle_eal_memseg_info_request(const char *cmd __rte_unused,
1372 			       const char *params, struct rte_tel_data *d)
1373 {
1374 	struct rte_mem_config *mcfg;
1375 	uint64_t ms_start_addr, ms_end_addr, ms_size, hugepage_size, ms_iova;
1376 	struct rte_memseg_list *msl;
1377 	const struct rte_memseg *ms;
1378 	struct rte_fbarray *arr;
1379 	char addr[ADDR_STR];
1380 	uint32_t ms_list_idx = 0;
1381 	uint32_t ms_idx = 0;
1382 	int32_t ms_socket_id;
1383 	uint32_t ms_flags;
1384 	/* size of an array == num params to be parsed. */
1385 	uint32_t vals[2] = {0};
1386 
1387 	if (parse_params(params, vals, RTE_DIM(vals)) < 0)
1388 		return -1;
1389 
1390 	ms_list_idx = vals[0];
1391 	if (ms_list_idx >= RTE_MAX_MEMSEG_LISTS)
1392 		return -1;
1393 
1394 	ms_idx = vals[1];
1395 
1396 	rte_mcfg_mem_read_lock();
1397 
1398 	mcfg = rte_eal_get_configuration()->mem_config;
1399 	msl = &mcfg->memsegs[ms_list_idx];
1400 	if (msl->memseg_arr.count == 0) {
1401 		rte_mcfg_mem_read_unlock();
1402 		return -1;
1403 	}
1404 
1405 	arr = &msl->memseg_arr;
1406 	ms = rte_fbarray_get(arr, ms_idx);
1407 	if (ms == NULL) {
1408 		rte_mcfg_mem_read_unlock();
1409 		EAL_LOG(DEBUG, "Error fetching requested memseg.");
1410 		return -1;
1411 	}
1412 
1413 	ms_iova = ms->iova;
1414 	ms_start_addr = ms->addr_64;
1415 	ms_end_addr = (uint64_t)RTE_PTR_ADD(ms_start_addr, ms->len);
1416 	ms_size = ms->len;
1417 	hugepage_size = ms->hugepage_sz;
1418 	ms_socket_id = ms->socket_id;
1419 	ms_flags = ms->flags;
1420 
1421 	rte_mcfg_mem_read_unlock();
1422 
1423 	rte_tel_data_start_dict(d);
1424 	rte_tel_data_add_dict_int(d, "Memseg_list_index", ms_list_idx);
1425 	rte_tel_data_add_dict_int(d, "Memseg_index", ms_idx);
1426 	if (ms_iova == RTE_BAD_IOVA)
1427 		snprintf(addr, ADDR_STR, "Bad IOVA");
1428 	else
1429 		snprintf(addr, ADDR_STR, "0x%"PRIx64, ms_iova);
1430 
1431 	rte_tel_data_add_dict_string(d, "IOVA_addr", addr);
1432 	snprintf(addr, ADDR_STR, "0x%"PRIx64, ms_start_addr);
1433 	rte_tel_data_add_dict_string(d, "Start_addr", addr);
1434 	snprintf(addr, ADDR_STR, "0x%"PRIx64, ms_end_addr);
1435 	rte_tel_data_add_dict_string(d, "End_addr", addr);
1436 	rte_tel_data_add_dict_uint(d, "Size", ms_size);
1437 	rte_tel_data_add_dict_uint(d, "Hugepage_size", hugepage_size);
1438 	rte_tel_data_add_dict_int(d, "Socket_id", ms_socket_id);
1439 	rte_tel_data_add_dict_int(d, "flags", ms_flags);
1440 
1441 	return 0;
1442 }
1443 
1444 static int
1445 handle_eal_element_list_request(const char *cmd __rte_unused,
1446 				const char *params, struct rte_tel_data *d)
1447 {
1448 	struct rte_mem_config *mcfg;
1449 	struct rte_memseg_list *msl;
1450 	const struct rte_memseg *ms;
1451 	struct malloc_elem *elem;
1452 	struct malloc_heap *heap;
1453 	uint64_t ms_start_addr, ms_end_addr;
1454 	uint64_t elem_start_addr, elem_end_addr;
1455 	uint32_t ms_list_idx = 0;
1456 	uint32_t heap_id = 0;
1457 	uint32_t ms_idx = 0;
1458 	int elem_count = 0;
1459 	/* size of an array == num params to be parsed. */
1460 	uint32_t vals[3] = {0};
1461 
1462 	if (parse_params(params, vals, RTE_DIM(vals)) < 0)
1463 		return -1;
1464 
1465 	heap_id = vals[0];
1466 	if (heap_id >= RTE_MAX_HEAPS)
1467 		return -1;
1468 
1469 	ms_list_idx = vals[1];
1470 	if (ms_list_idx >= RTE_MAX_MEMSEG_LISTS)
1471 		return -1;
1472 
1473 	ms_idx = vals[2];
1474 
1475 	rte_mcfg_mem_read_lock();
1476 
1477 	mcfg = rte_eal_get_configuration()->mem_config;
1478 	msl = &mcfg->memsegs[ms_list_idx];
1479 	ms = rte_fbarray_get(&msl->memseg_arr, ms_idx);
1480 	if (ms == NULL) {
1481 		rte_mcfg_mem_read_unlock();
1482 		EAL_LOG(DEBUG, "Error fetching requested memseg.");
1483 		return -1;
1484 	}
1485 
1486 	ms_start_addr = ms->addr_64;
1487 	ms_end_addr = (uint64_t)RTE_PTR_ADD(ms_start_addr, ms->len);
1488 	rte_mcfg_mem_read_unlock();
1489 
1490 	rte_tel_data_start_dict(d);
1491 
1492 	heap = &mcfg->malloc_heaps[heap_id];
1493 	rte_spinlock_lock(&heap->lock);
1494 
1495 	elem = heap->first;
1496 	while (elem) {
1497 		elem_start_addr = (uint64_t)elem;
1498 		elem_end_addr =
1499 			(uint64_t)RTE_PTR_ADD(elem_start_addr, elem->size);
1500 
1501 		if ((uint64_t)elem_start_addr >= ms_start_addr &&
1502 		    (uint64_t)elem_end_addr <= ms_end_addr)
1503 			elem_count++;
1504 		elem = elem->next;
1505 	}
1506 
1507 	rte_spinlock_unlock(&heap->lock);
1508 
1509 	rte_tel_data_add_dict_int(d, "Element_count", elem_count);
1510 
1511 	return 0;
1512 }
1513 
1514 static int
1515 handle_eal_element_info_request(const char *cmd __rte_unused,
1516 				const char *params, struct rte_tel_data *d)
1517 {
1518 	struct rte_mem_config *mcfg;
1519 	struct rte_memseg_list *msl;
1520 	const struct rte_memseg *ms;
1521 	struct malloc_elem *elem;
1522 	struct malloc_heap *heap;
1523 	struct rte_tel_data *c;
1524 	uint64_t ms_start_addr, ms_end_addr;
1525 	uint64_t elem_start_addr, elem_end_addr;
1526 	uint32_t ms_list_idx = 0;
1527 	uint32_t heap_id = 0;
1528 	uint32_t ms_idx = 0;
1529 	uint32_t start_elem = 0, end_elem = 0;
1530 	uint32_t count = 0, elem_count = 0;
1531 	char str[ADDR_STR];
1532 	/* size of an array == num params to be parsed. */
1533 	uint32_t vals[5] = {0};
1534 
1535 	if (parse_params(params, vals, RTE_DIM(vals)) < 0)
1536 		return -1;
1537 
1538 	heap_id = vals[0];
1539 	if (heap_id >= RTE_MAX_HEAPS)
1540 		return -1;
1541 
1542 	ms_list_idx = vals[1];
1543 	if (ms_list_idx >= RTE_MAX_MEMSEG_LISTS)
1544 		return -1;
1545 
1546 	ms_idx = vals[2];
1547 	start_elem = vals[3];
1548 	end_elem = vals[4];
1549 
1550 	if (end_elem < start_elem)
1551 		return -1;
1552 
1553 	rte_mcfg_mem_read_lock();
1554 
1555 	mcfg = rte_eal_get_configuration()->mem_config;
1556 	msl = &mcfg->memsegs[ms_list_idx];
1557 	ms = rte_fbarray_get(&msl->memseg_arr, ms_idx);
1558 	if (ms == NULL) {
1559 		rte_mcfg_mem_read_unlock();
1560 		EAL_LOG(DEBUG, "Error fetching requested memseg.");
1561 		return -1;
1562 	}
1563 
1564 	ms_start_addr = ms->addr_64;
1565 	ms_end_addr = (uint64_t)RTE_PTR_ADD(ms_start_addr, ms->len);
1566 
1567 	rte_mcfg_mem_read_unlock();
1568 
1569 	rte_tel_data_start_dict(d);
1570 
1571 	heap = &mcfg->malloc_heaps[heap_id];
1572 	rte_spinlock_lock(&heap->lock);
1573 
1574 	elem = heap->first;
1575 	while (elem) {
1576 		elem_start_addr = (uint64_t)elem;
1577 		elem_end_addr =
1578 			(uint64_t)RTE_PTR_ADD(elem_start_addr, elem->size);
1579 
1580 		if (elem_start_addr < ms_start_addr ||
1581 				elem_end_addr > ms_end_addr) {
1582 			elem = elem->next;
1583 			continue;
1584 		}
1585 
1586 		if (count < start_elem) {
1587 			elem = elem->next;
1588 			count++;
1589 			continue;
1590 		}
1591 
1592 		c = rte_tel_data_alloc();
1593 		if (c == NULL)
1594 			break;
1595 
1596 		rte_tel_data_start_dict(c);
1597 		rte_tel_data_add_dict_int(c, "msl_id", ms_list_idx);
1598 		rte_tel_data_add_dict_int(c, "ms_id", ms_idx);
1599 		snprintf(str, ADDR_STR, "0x%"PRIx64, ms_start_addr);
1600 		rte_tel_data_add_dict_string(c, "memseg_start_addr", str);
1601 		snprintf(str, ADDR_STR, "0x%"PRIx64, ms_end_addr);
1602 		rte_tel_data_add_dict_string(c, "memseg_end_addr", str);
1603 		snprintf(str, ADDR_STR, "0x%"PRIx64, elem_start_addr);
1604 		rte_tel_data_add_dict_string(c, "element_start_addr", str);
1605 		snprintf(str, ADDR_STR, "0x%"PRIx64, elem_end_addr);
1606 		rte_tel_data_add_dict_string(c, "element_end_addr", str);
1607 		rte_tel_data_add_dict_int(c, "element_size", elem->size);
1608 		snprintf(str, ADDR_STR, "%s", elem->state == 0 ? "Free" :
1609 			 elem->state == 1 ? "Busy" : elem->state == 2 ?
1610 			 "Pad" : "Error");
1611 		rte_tel_data_add_dict_string(c, "element_state", str);
1612 
1613 		snprintf(str, ADDR_STR, "%s_%u", "element", count);
1614 		if (rte_tel_data_add_dict_container(d, str, c, 0) != 0) {
1615 			rte_tel_data_free(c);
1616 			break;
1617 		}
1618 
1619 		elem_count++;
1620 		count++;
1621 		if (count > end_elem)
1622 			break;
1623 
1624 		elem = elem->next;
1625 	}
1626 
1627 	rte_spinlock_unlock(&heap->lock);
1628 
1629 	rte_tel_data_add_dict_int(d, "Element_count", elem_count);
1630 
1631 	return 0;
1632 }
1633 
1634 RTE_INIT(memory_telemetry)
1635 {
1636 	rte_telemetry_register_cmd(
1637 			EAL_MEMZONE_LIST_REQ, handle_eal_memzone_list_request,
1638 			"Returns the list of reserved memzone indexes. Takes no parameters");
1639 	rte_telemetry_register_cmd(
1640 			EAL_MEMZONE_INFO_REQ, handle_eal_memzone_info_request,
1641 			"Returns memzone info. Parameters: int mz_id");
1642 	rte_telemetry_register_cmd(
1643 			EAL_HEAP_LIST_REQ, handle_eal_heap_list_request,
1644 			"Returns the list of initialised heap indexes. Takes no parameters");
1645 	rte_telemetry_register_cmd(
1646 			EAL_HEAP_INFO_REQ, handle_eal_heap_info_request,
1647 			"Returns malloc heap stats. Parameters: int heap_id");
1648 	rte_telemetry_register_cmd(
1649 			EAL_MEMSEG_LISTS_REQ,
1650 			handle_eal_memseg_lists_request,
1651 			"Returns array of memseg list IDs. Takes no parameters");
1652 	rte_telemetry_register_cmd(
1653 			EAL_MEMSEG_LIST_INFO_REQ,
1654 			handle_eal_memseg_list_info_request,
1655 			"Returns memseg list info. Parameters: int memseg_list_id");
1656 	rte_telemetry_register_cmd(
1657 			EAL_MEMSEG_INFO_REQ, handle_eal_memseg_info_request,
1658 			"Returns memseg info. Parameters: int memseg_list_id, int memseg_id");
1659 	rte_telemetry_register_cmd(EAL_ELEMENT_LIST_REQ,
1660 			handle_eal_element_list_request,
1661 			"Returns array of heap element IDs. Parameters: int heap_id, int memseg_list_id, int memseg_id");
1662 	rte_telemetry_register_cmd(EAL_ELEMENT_INFO_REQ,
1663 			handle_eal_element_info_request,
1664 			"Returns element info. Parameters: int heap_id, int memseg_list_id, int memseg_id, int start_elem_id, int end_elem_id");
1665 }
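/*
 * Illustrative query (not from this file): these endpoints can be exercised
 * with the dpdk-telemetry.py usertool, e.g. "/eal/heap_info,0" returns the
 * stats of heap 0 and "/eal/memseg_info,0,2" the info of memseg 2 in list 0.
 */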
1666 #endif
1667