1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2014 Intel Corporation
3  */
4 
5 #include <ctype.h>
6 #include <errno.h>
7 #include <stdio.h>
8 #include <stdint.h>
9 #include <stdlib.h>
10 #include <string.h>
11 #include <inttypes.h>
12 
13 #include <rte_fbarray.h>
14 #include <rte_memory.h>
15 #include <rte_eal.h>
16 #include <rte_eal_memconfig.h>
17 #include <rte_eal_paging.h>
18 #include <rte_errno.h>
19 #include <rte_log.h>
20 #ifndef RTE_EXEC_ENV_WINDOWS
21 #include <rte_telemetry.h>
22 #endif
23 
24 #include "eal_memalloc.h"
25 #include "eal_private.h"
26 #include "eal_internal_cfg.h"
27 #include "eal_memcfg.h"
28 #include "eal_options.h"
29 #include "malloc_elem.h"
30 #include "malloc_heap.h"
31 
32 /*
33  * Try to reserve a virtual area of *size bytes for eal_get_virtual_area()
34  * below. If successful, return a pointer to the reserved area and keep
35  * *size unmodified. Otherwise, if shrinking is allowed, retry with a smaller
36  * area: decrease *size by page_sz until it reaches 0 and return NULL in that
37  * case. Note: the returned address is aligned to the requested page size.
38  */
39 
40 #define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i"
41 
42 static void *next_baseaddr;
43 static uint64_t system_page_sz;
44 
45 #define MAX_MMAP_WITH_DEFINED_ADDR_TRIES 5
46 void *
47 eal_get_virtual_area(void *requested_addr, size_t *size,
48 	size_t page_sz, int flags, int reserve_flags)
49 {
50 	bool addr_is_hint, allow_shrink, unmap, no_align;
51 	uint64_t map_sz;
52 	void *mapped_addr, *aligned_addr;
53 	uint8_t try = 0;
54 	struct internal_config *internal_conf =
55 		eal_get_internal_configuration();
56 
57 	if (system_page_sz == 0)
58 		system_page_sz = rte_mem_page_size();
59 
60 	EAL_LOG(DEBUG, "Asking for a virtual area of 0x%zx bytes", *size);
61 
62 	addr_is_hint = (flags & EAL_VIRTUAL_AREA_ADDR_IS_HINT) > 0;
63 	allow_shrink = (flags & EAL_VIRTUAL_AREA_ALLOW_SHRINK) > 0;
64 	unmap = (flags & EAL_VIRTUAL_AREA_UNMAP) > 0;
65 
66 	if (next_baseaddr == NULL && internal_conf->base_virtaddr != 0 &&
67 			rte_eal_process_type() == RTE_PROC_PRIMARY)
68 		next_baseaddr = (void *) internal_conf->base_virtaddr;
69 
70 #ifdef RTE_ARCH_64
71 	if (next_baseaddr == NULL && internal_conf->base_virtaddr == 0 &&
72 			rte_eal_process_type() == RTE_PROC_PRIMARY)
73 		next_baseaddr = (void *) eal_get_baseaddr();
74 #endif
75 	if (requested_addr == NULL && next_baseaddr != NULL) {
76 		requested_addr = next_baseaddr;
77 		requested_addr = RTE_PTR_ALIGN(requested_addr, page_sz);
78 		addr_is_hint = true;
79 	}
80 
81 	/* we don't need alignment of resulting pointer in the following cases:
82 	 *
83 	 * 1. page size is equal to the system page size
84 	 * 2. we have a requested address, and it is page-aligned, and we will
85 	 *    be discarding the address if we get a different one.
86 	 *
87 	 * for all other cases, alignment is potentially necessary.
88 	 */
89 	no_align = (requested_addr != NULL &&
90 		requested_addr == RTE_PTR_ALIGN(requested_addr, page_sz) &&
91 		!addr_is_hint) ||
92 		page_sz == system_page_sz;
93 
94 	do {
95 		map_sz = no_align ? *size : *size + page_sz;
96 		if (map_sz > SIZE_MAX) {
97 			EAL_LOG(ERR, "Map size too big");
98 			rte_errno = E2BIG;
99 			return NULL;
100 		}
101 
102 		mapped_addr = eal_mem_reserve(
103 			requested_addr, (size_t)map_sz, reserve_flags);
104 		if ((mapped_addr == NULL) && allow_shrink)
105 			*size -= page_sz;
106 
107 		if ((mapped_addr != NULL) && addr_is_hint &&
108 				(mapped_addr != requested_addr)) {
109 			try++;
110 			next_baseaddr = RTE_PTR_ADD(next_baseaddr, page_sz);
111 			if (try <= MAX_MMAP_WITH_DEFINED_ADDR_TRIES) {
112 				/* hint was not used. Try with another offset */
113 				eal_mem_free(mapped_addr, map_sz);
114 				mapped_addr = NULL;
115 				requested_addr = next_baseaddr;
116 			}
117 		}
118 	} while ((allow_shrink || addr_is_hint) &&
119 		(mapped_addr == NULL) && (*size > 0));
120 
121 	/* align resulting address - if map failed, we will ignore the value
122 	 * anyway, so no need to add additional checks.
123 	 */
124 	aligned_addr = no_align ? mapped_addr :
125 			RTE_PTR_ALIGN(mapped_addr, page_sz);
126 
127 	if (*size == 0) {
128 		EAL_LOG(ERR, "Cannot get a virtual area of any size: %s",
129 			rte_strerror(rte_errno));
130 		return NULL;
131 	} else if (mapped_addr == NULL) {
132 		EAL_LOG(ERR, "Cannot get a virtual area: %s",
133 			rte_strerror(rte_errno));
134 		return NULL;
135 	} else if (requested_addr != NULL && !addr_is_hint &&
136 			aligned_addr != requested_addr) {
137 		EAL_LOG(ERR, "Cannot get a virtual area at requested address: %p (got %p)",
138 			requested_addr, aligned_addr);
139 		eal_mem_free(mapped_addr, map_sz);
140 		rte_errno = EADDRNOTAVAIL;
141 		return NULL;
142 	} else if (requested_addr != NULL && addr_is_hint &&
143 			aligned_addr != requested_addr) {
144 		/*
145 		 * demote this warning to debug if we did not explicitly request
146 		 * a base virtual address.
147 		 */
148 		if (internal_conf->base_virtaddr != 0) {
149 			EAL_LOG(WARNING, "WARNING! Base virtual address hint (%p != %p) not respected!",
150 				requested_addr, aligned_addr);
151 			EAL_LOG(WARNING, "   This may cause issues with mapping memory into secondary processes");
152 		} else {
153 			EAL_LOG(DEBUG, "WARNING! Base virtual address hint (%p != %p) not respected!",
154 				requested_addr, aligned_addr);
155 			EAL_LOG(DEBUG, "   This may cause issues with mapping memory into secondary processes");
156 		}
157 	} else if (next_baseaddr != NULL) {
158 		next_baseaddr = RTE_PTR_ADD(aligned_addr, *size);
159 	}
160 
161 	EAL_LOG(DEBUG, "Virtual area found at %p (size = 0x%zx)",
162 		aligned_addr, *size);
163 
164 	if (unmap) {
165 		eal_mem_free(mapped_addr, map_sz);
166 	} else if (!no_align) {
167 		void *map_end, *aligned_end;
168 		size_t before_len, after_len;
169 
170 		/* when we reserve space with alignment, we add the alignment to
171 		 * the mapping size. On 32-bit, if 1GB alignment was requested,
172 		 * this would waste 1GB of address space, which is a luxury we
173 		 * cannot afford - so, if alignment was performed, unmap any
174 		 * unneeded address space before and after the aligned area.
175 		 */
176 
177 		map_end = RTE_PTR_ADD(mapped_addr, (size_t)map_sz);
178 		aligned_end = RTE_PTR_ADD(aligned_addr, *size);
179 
180 		/* unmap space before aligned mmap address */
181 		before_len = RTE_PTR_DIFF(aligned_addr, mapped_addr);
182 		if (before_len > 0)
183 			eal_mem_free(mapped_addr, before_len);
184 
185 		/* unmap space after aligned end mmap address */
186 		after_len = RTE_PTR_DIFF(map_end, aligned_end);
187 		if (after_len > 0)
188 			eal_mem_free(aligned_end, after_len);
189 	}
190 
191 	if (!unmap) {
192 		/* Exclude these pages from a core dump. */
193 		eal_mem_set_dump(aligned_addr, *size, false);
194 	}
195 
196 	return aligned_addr;
197 }
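
/*
 * Illustrative usage sketch (not part of the EAL code): an internal caller
 * reserving hugepage-aligned VA space, allowing the area to shrink if the
 * full size cannot be reserved. Sizes and flags below are hypothetical.
 *
 *	size_t reserve_sz = 2 * RTE_PGSIZE_1G;
 *	void *va = eal_get_virtual_area(NULL, &reserve_sz, RTE_PGSIZE_1G,
 *			EAL_VIRTUAL_AREA_ALLOW_SHRINK, 0);
 *	if (va == NULL)
 *		return -1;	// rte_errno holds the failure reason
 *	// on success, reserve_sz holds the size actually reserved
 */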
198 
199 int
200 eal_memseg_list_init_named(struct rte_memseg_list *msl, const char *name,
201 		uint64_t page_sz, int n_segs, int socket_id, bool heap)
202 {
203 	if (rte_fbarray_init(&msl->memseg_arr, name, n_segs,
204 			sizeof(struct rte_memseg))) {
205 		EAL_LOG(ERR, "Cannot allocate memseg list: %s",
206 			rte_strerror(rte_errno));
207 		return -1;
208 	}
209 
210 	msl->page_sz = page_sz;
211 	msl->socket_id = socket_id;
212 	msl->base_va = NULL;
213 	msl->heap = heap;
214 
215 	EAL_LOG(DEBUG,
216 		"Memseg list allocated at socket %i, page size 0x%"PRIx64"kB",
217 		socket_id, page_sz >> 10);
218 
219 	return 0;
220 }
221 
222 int
223 eal_memseg_list_init(struct rte_memseg_list *msl, uint64_t page_sz,
224 		int n_segs, int socket_id, int type_msl_idx, bool heap)
225 {
226 	char name[RTE_FBARRAY_NAME_LEN];
227 
228 	snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id,
229 		 type_msl_idx);
230 
231 	return eal_memseg_list_init_named(
232 		msl, name, page_sz, n_segs, socket_id, heap);
233 }
234 
235 int
236 eal_memseg_list_alloc(struct rte_memseg_list *msl, int reserve_flags)
237 {
238 	size_t page_sz, mem_sz;
239 	void *addr;
240 
241 	page_sz = msl->page_sz;
242 	mem_sz = page_sz * msl->memseg_arr.len;
243 
244 	addr = eal_get_virtual_area(
245 		msl->base_va, &mem_sz, page_sz, 0, reserve_flags);
246 	if (addr == NULL) {
247 #ifndef RTE_EXEC_ENV_WINDOWS
248 		/* The hint would be misleading on Windows, because address
249 		 * is by default system-selected (base VA = 0).
250 		 * However, this function is called from many places,
251 		 * including common code, so don't duplicate the message.
252 		 */
253 		if (rte_errno == EADDRNOTAVAIL)
254 			EAL_LOG(ERR, "Cannot reserve %llu bytes at [%p] - "
255 				"please use '--" OPT_BASE_VIRTADDR "' option",
256 				(unsigned long long)mem_sz, msl->base_va);
257 #endif
258 		return -1;
259 	}
260 	msl->base_va = addr;
261 	msl->len = mem_sz;
262 
263 	EAL_LOG(DEBUG, "VA reserved for memseg list at %p, size %zx",
264 			addr, mem_sz);
265 
266 	return 0;
267 }
268 
269 void
270 eal_memseg_list_populate(struct rte_memseg_list *msl, void *addr, int n_segs)
271 {
272 	size_t page_sz = msl->page_sz;
273 	int i;
274 
275 	for (i = 0; i < n_segs; i++) {
276 		struct rte_fbarray *arr = &msl->memseg_arr;
277 		struct rte_memseg *ms = rte_fbarray_get(arr, i);
278 
279 		if (rte_eal_iova_mode() == RTE_IOVA_VA)
280 			ms->iova = (uintptr_t)addr;
281 		else
282 			ms->iova = RTE_BAD_IOVA;
283 		ms->addr = addr;
284 		ms->hugepage_sz = page_sz;
285 		ms->socket_id = 0;
286 		ms->len = page_sz;
287 
288 		rte_fbarray_set_used(arr, i);
289 
290 		addr = RTE_PTR_ADD(addr, page_sz);
291 	}
292 }
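
/*
 * Illustrative sketch (not part of the EAL code) of the typical sequence in
 * which the per-OS EAL layers use the helpers above: initialize the
 * fbarray-backed list, reserve VA space for it, then mark its segments as
 * used. All values below are hypothetical.
 *
 *	struct rte_memseg_list *msl;	// a free slot from the mem config
 *	int n_segs = 128, socket_id = 0, msl_idx = 0;
 *
 *	if (eal_memseg_list_init(msl, RTE_PGSIZE_2M, n_segs, socket_id,
 *			msl_idx, true) < 0)
 *		return -1;
 *	if (eal_memseg_list_alloc(msl, 0) < 0)
 *		return -1;
 *	eal_memseg_list_populate(msl, msl->base_va, n_segs);
 */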
293 
294 static struct rte_memseg *
295 virt2memseg(const void *addr, const struct rte_memseg_list *msl)
296 {
297 	const struct rte_fbarray *arr;
298 	void *start, *end;
299 	int ms_idx;
300 
301 	if (msl == NULL)
302 		return NULL;
303 
304 	/* a memseg list was specified, check if it's the right one */
305 	start = msl->base_va;
306 	end = RTE_PTR_ADD(start, msl->len);
307 
308 	if (addr < start || addr >= end)
309 		return NULL;
310 
311 	/* now, calculate index */
312 	arr = &msl->memseg_arr;
313 	ms_idx = RTE_PTR_DIFF(addr, msl->base_va) / msl->page_sz;
314 	return rte_fbarray_get(arr, ms_idx);
315 }
316 
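/* Find the memseg list containing the given virtual address, if any. */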
317 static struct rte_memseg_list *
318 virt2memseg_list(const void *addr)
319 {
320 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
321 	struct rte_memseg_list *msl;
322 	int msl_idx;
323 
324 	for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
325 		void *start, *end;
326 		msl = &mcfg->memsegs[msl_idx];
327 
328 		start = msl->base_va;
329 		end = RTE_PTR_ADD(start, msl->len);
330 		if (addr >= start && addr < end)
331 			break;
332 	}
333 	/* if we didn't find our memseg list */
334 	if (msl_idx == RTE_MAX_MEMSEG_LISTS)
335 		return NULL;
336 	return msl;
337 }
338 
339 struct rte_memseg_list *
340 rte_mem_virt2memseg_list(const void *addr)
341 {
342 	return virt2memseg_list(addr);
343 }
344 
345 struct virtiova {
346 	rte_iova_t iova;
347 	void *virt;
348 };
349 static int
350 find_virt(const struct rte_memseg_list *msl __rte_unused,
351 		const struct rte_memseg *ms, void *arg)
352 {
353 	struct virtiova *vi = arg;
354 	if (vi->iova >= ms->iova && vi->iova < (ms->iova + ms->len)) {
355 		size_t offset = vi->iova - ms->iova;
356 		vi->virt = RTE_PTR_ADD(ms->addr, offset);
357 		/* stop the walk */
358 		return 1;
359 	}
360 	return 0;
361 }
362 static int
363 find_virt_legacy(const struct rte_memseg_list *msl __rte_unused,
364 		const struct rte_memseg *ms, size_t len, void *arg)
365 {
366 	struct virtiova *vi = arg;
367 	if (vi->iova >= ms->iova && vi->iova < (ms->iova + len)) {
368 		size_t offset = vi->iova - ms->iova;
369 		vi->virt = RTE_PTR_ADD(ms->addr, offset);
370 		/* stop the walk */
371 		return 1;
372 	}
373 	return 0;
374 }
375 
376 void *
377 rte_mem_iova2virt(rte_iova_t iova)
378 {
379 	struct virtiova vi;
380 	const struct internal_config *internal_conf =
381 		eal_get_internal_configuration();
382 
383 	memset(&vi, 0, sizeof(vi));
384 
385 	vi.iova = iova;
386 	/* for legacy mem, we can get away with scanning VA-contiguous segments,
387 	 * as we know they are PA-contiguous as well
388 	 */
389 	if (internal_conf->legacy_mem)
390 		rte_memseg_contig_walk(find_virt_legacy, &vi);
391 	else
392 		rte_memseg_walk(find_virt, &vi);
393 
394 	return vi.virt;
395 }
396 
397 struct rte_memseg *
398 rte_mem_virt2memseg(const void *addr, const struct rte_memseg_list *msl)
399 {
400 	return virt2memseg(addr, msl != NULL ? msl :
401 			rte_mem_virt2memseg_list(addr));
402 }
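
/*
 * Illustrative sketch (not part of the EAL code): looking up the memseg
 * backing a pointer into DPDK-managed memory, e.g. to learn its IOVA or
 * hugepage size. The rte_malloc() call is only a hypothetical source of
 * the pointer.
 *
 *	void *p = rte_malloc(NULL, 4096, 0);
 *	const struct rte_memseg *ms = rte_mem_virt2memseg(p, NULL);
 *	if (ms != NULL)
 *		printf("iova 0x%" PRIx64 ", page size %" PRIu64 "\n",
 *				ms->iova, ms->hugepage_sz);
 */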
403 
404 static int
405 physmem_size(const struct rte_memseg_list *msl, void *arg)
406 {
407 	uint64_t *total_len = arg;
408 
409 	if (msl->external)
410 		return 0;
411 
412 	*total_len += msl->memseg_arr.count * msl->page_sz;
413 
414 	return 0;
415 }
416 
417 /* get the total size of memory */
418 uint64_t
419 rte_eal_get_physmem_size(void)
420 {
421 	uint64_t total_len = 0;
422 
423 	rte_memseg_list_walk(physmem_size, &total_len);
424 
425 	return total_len;
426 }
427 
428 static int
429 dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
430 		void *arg)
431 {
432 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
433 	int msl_idx, ms_idx, fd;
434 	FILE *f = arg;
435 
436 	msl_idx = msl - mcfg->memsegs;
437 	if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS)
438 		return -1;
439 
440 	ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
441 	if (ms_idx < 0)
442 		return -1;
443 
444 	fd = eal_memalloc_get_seg_fd(msl_idx, ms_idx);
445 	fprintf(f, "Segment %i-%i: IOVA:0x%"PRIx64", len:%zu, "
446 			"virt:%p, socket_id:%"PRId32", "
447 			"hugepage_sz:%"PRIu64", nchannel:%"PRIx32", "
448 			"nrank:%"PRIx32" fd:%i\n",
449 			msl_idx, ms_idx,
450 			ms->iova,
451 			ms->len,
452 			ms->addr,
453 			ms->socket_id,
454 			ms->hugepage_sz,
455 			ms->nchannel,
456 			ms->nrank,
457 			fd);
458 
459 	return 0;
460 }
461 
462 /*
463  * Defined here because they are declared in rte_memory.h, but the actual implementation
464  * is in eal_common_memalloc.c, like all other memalloc internals.
465  */
466 int
467 rte_mem_event_callback_register(const char *name, rte_mem_event_callback_t clb,
468 		void *arg)
469 {
470 	const struct internal_config *internal_conf =
471 		eal_get_internal_configuration();
472 
473 	/* FreeBSD boots with legacy mem enabled by default */
474 	if (internal_conf->legacy_mem) {
475 		EAL_LOG(DEBUG, "Registering mem event callbacks not supported");
476 		rte_errno = ENOTSUP;
477 		return -1;
478 	}
479 	return eal_memalloc_mem_event_callback_register(name, clb, arg);
480 }
481 
482 int
483 rte_mem_event_callback_unregister(const char *name, void *arg)
484 {
485 	const struct internal_config *internal_conf =
486 		eal_get_internal_configuration();
487 
488 	/* FreeBSD boots with legacy mem enabled by default */
489 	if (internal_conf->legacy_mem) {
490 		EAL_LOG(DEBUG, "Unregistering mem event callbacks not supported");
491 		rte_errno = ENOTSUP;
492 		return -1;
493 	}
494 	return eal_memalloc_mem_event_callback_unregister(name, arg);
495 }
496 
497 int
498 rte_mem_alloc_validator_register(const char *name,
499 		rte_mem_alloc_validator_t clb, int socket_id, size_t limit)
500 {
501 	const struct internal_config *internal_conf =
502 		eal_get_internal_configuration();
503 
504 	/* FreeBSD boots with legacy mem enabled by default */
505 	if (internal_conf->legacy_mem) {
506 		EAL_LOG(DEBUG, "Registering mem alloc validators not supported");
507 		rte_errno = ENOTSUP;
508 		return -1;
509 	}
510 	return eal_memalloc_mem_alloc_validator_register(name, clb, socket_id,
511 			limit);
512 }
513 
514 int
515 rte_mem_alloc_validator_unregister(const char *name, int socket_id)
516 {
517 	const struct internal_config *internal_conf =
518 		eal_get_internal_configuration();
519 
520 	/* FreeBSD boots with legacy mem enabled by default */
521 	if (internal_conf->legacy_mem) {
522 		EAL_LOG(DEBUG, "Unregistering mem alloc validators not supported");
523 		rte_errno = ENOTSUP;
524 		return -1;
525 	}
526 	return eal_memalloc_mem_alloc_validator_unregister(name, socket_id);
527 }
528 
529 /* Dump the physical memory layout on console */
530 void
531 rte_dump_physmem_layout(FILE *f)
532 {
533 	rte_memseg_walk(dump_memseg, f);
534 }
535 
536 static int
537 check_iova(const struct rte_memseg_list *msl __rte_unused,
538 		const struct rte_memseg *ms, void *arg)
539 {
540 	uint64_t *mask = arg;
541 	rte_iova_t iova;
542 
543 	/* highest address within segment */
544 	iova = (ms->iova + ms->len) - 1;
545 	if (!(iova & *mask))
546 		return 0;
547 
548 	EAL_LOG(DEBUG, "memseg iova %"PRIx64", len %zx, out of range",
549 			    ms->iova, ms->len);
550 
551 	EAL_LOG(DEBUG, "\tusing dma mask %"PRIx64, *mask);
552 	return 1;
553 }
554 
555 #define MAX_DMA_MASK_BITS 63
556 
557 /* check memseg iovas are within the required range based on dma mask */
558 static int
559 check_dma_mask(uint8_t maskbits, bool thread_unsafe)
560 {
561 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
562 	uint64_t mask;
563 	int ret;
564 
565 	/* Sanity check. We only allow mask widths that can be handled with a
566 	 * 64-bit variable; any higher value is almost certainly wrong. */
567 	if (maskbits > MAX_DMA_MASK_BITS) {
568 		EAL_LOG(ERR, "wrong dma mask size %u (Max: %u)",
569 				   maskbits, MAX_DMA_MASK_BITS);
570 		return -1;
571 	}
572 
573 	/* create dma mask */
574 	mask = ~((1ULL << maskbits) - 1);
575 
576 	if (thread_unsafe)
577 		ret = rte_memseg_walk_thread_unsafe(check_iova, &mask);
578 	else
579 		ret = rte_memseg_walk(check_iova, &mask);
580 
581 	if (ret)
582 		/*
583 		 * DMA mask precludes hugepage usage.
584 		 * This device cannot be used and we do not need to keep
585 		 * the DMA mask.
586 		 */
587 		return 1;
588 
589 	/*
590 	 * we need to keep the most restrictive mask width for checking
591 	 * potential dynamic memory allocations in the future.
592 	 */
593 	mcfg->dma_maskbits = mcfg->dma_maskbits == 0 ? maskbits :
594 			     RTE_MIN(mcfg->dma_maskbits, maskbits);
595 
596 	return 0;
597 }
598 
599 int
600 rte_mem_check_dma_mask(uint8_t maskbits)
601 {
602 	return check_dma_mask(maskbits, false);
603 }
604 
605 int
606 rte_mem_check_dma_mask_thread_unsafe(uint8_t maskbits)
607 {
608 	return check_dma_mask(maskbits, true);
609 }
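
/*
 * Illustrative sketch (not part of the EAL code): a driver whose device can
 * only address 40 bits might verify that all hugepage IOVAs fit before
 * continuing; the 40-bit width is a hypothetical example.
 *
 *	if (rte_mem_check_dma_mask(40) != 0) {
 *		// some memory lies outside the device's addressable range
 *		return -ENODEV;
 *	}
 */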
610 
611 /*
612  * Set the DMA mask to be used once memory initialization is done.
613  *
614  * This function should ONLY be used by code executed before memory
615  * initialization. PMDs should use rte_mem_check_dma_mask if the device
616  * has addressing limitations.
617  */
618 void
619 rte_mem_set_dma_mask(uint8_t maskbits)
620 {
621 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
622 
623 	mcfg->dma_maskbits = mcfg->dma_maskbits == 0 ? maskbits :
624 			     RTE_MIN(mcfg->dma_maskbits, maskbits);
625 }
626 
627 /* return the number of memory channels */
628 unsigned rte_memory_get_nchannel(void)
629 {
630 	return rte_eal_get_configuration()->mem_config->nchannel;
631 }
632 
633 /* return the number of memory ranks */
634 unsigned rte_memory_get_nrank(void)
635 {
636 	return rte_eal_get_configuration()->mem_config->nrank;
637 }
638 
639 static int
640 rte_eal_memdevice_init(void)
641 {
642 	struct rte_config *config;
643 	const struct internal_config *internal_conf;
644 
645 	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
646 		return 0;
647 
648 	internal_conf = eal_get_internal_configuration();
649 	config = rte_eal_get_configuration();
650 	config->mem_config->nchannel = internal_conf->force_nchannel;
651 	config->mem_config->nrank = internal_conf->force_nrank;
652 
653 	return 0;
654 }
655 
656 /* Lock the page in physical memory and prevent it from being swapped. */
657 int
658 rte_mem_lock_page(const void *virt)
659 {
660 	uintptr_t virtual = (uintptr_t)virt;
661 	size_t page_size = rte_mem_page_size();
662 	uintptr_t aligned = RTE_PTR_ALIGN_FLOOR(virtual, page_size);
663 	return rte_mem_lock((void *)aligned, page_size);
664 }
665 
666 int
667 rte_memseg_contig_walk_thread_unsafe(rte_memseg_contig_walk_t func, void *arg)
668 {
669 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
670 	int i, ms_idx, ret = 0;
671 
672 	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
673 		struct rte_memseg_list *msl = &mcfg->memsegs[i];
674 		const struct rte_memseg *ms;
675 		struct rte_fbarray *arr;
676 
677 		if (msl->memseg_arr.count == 0)
678 			continue;
679 
680 		arr = &msl->memseg_arr;
681 
682 		ms_idx = rte_fbarray_find_next_used(arr, 0);
683 		while (ms_idx >= 0) {
684 			int n_segs;
685 			size_t len;
686 
687 			ms = rte_fbarray_get(arr, ms_idx);
688 
689 			/* find how many more segments there are, starting with
690 			 * this one.
691 			 */
692 			n_segs = rte_fbarray_find_contig_used(arr, ms_idx);
693 			len = n_segs * msl->page_sz;
694 
695 			ret = func(msl, ms, len, arg);
696 			if (ret)
697 				return ret;
698 			ms_idx = rte_fbarray_find_next_used(arr,
699 					ms_idx + n_segs);
700 		}
701 	}
702 	return 0;
703 }
704 
705 int
706 rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg)
707 {
708 	int ret = 0;
709 
710 	/* do not allow allocations/frees/init while we iterate */
711 	rte_mcfg_mem_read_lock();
712 	ret = rte_memseg_contig_walk_thread_unsafe(func, arg);
713 	rte_mcfg_mem_read_unlock();
714 
715 	return ret;
716 }
717 
718 int
719 rte_memseg_walk_thread_unsafe(rte_memseg_walk_t func, void *arg)
720 {
721 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
722 	int i, ms_idx, ret = 0;
723 
724 	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
725 		struct rte_memseg_list *msl = &mcfg->memsegs[i];
726 		const struct rte_memseg *ms;
727 		struct rte_fbarray *arr;
728 
729 		if (msl->memseg_arr.count == 0)
730 			continue;
731 
732 		arr = &msl->memseg_arr;
733 
734 		ms_idx = rte_fbarray_find_next_used(arr, 0);
735 		while (ms_idx >= 0) {
736 			ms = rte_fbarray_get(arr, ms_idx);
737 			ret = func(msl, ms, arg);
738 			if (ret)
739 				return ret;
740 			ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1);
741 		}
742 	}
743 	return 0;
744 }
745 
746 int
747 rte_memseg_walk(rte_memseg_walk_t func, void *arg)
748 {
749 	int ret = 0;
750 
751 	/* do not allow allocations/frees/init while we iterate */
752 	rte_mcfg_mem_read_lock();
753 	ret = rte_memseg_walk_thread_unsafe(func, arg);
754 	rte_mcfg_mem_read_unlock();
755 
756 	return ret;
757 }
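
/*
 * Illustrative sketch (not part of the EAL code): a walk callback that sums
 * the length of all segments. Returning 0 continues the walk; a non-zero
 * return value stops it. Names are hypothetical.
 *
 *	static int
 *	sum_len_cb(const struct rte_memseg_list *msl __rte_unused,
 *			const struct rte_memseg *ms, void *arg)
 *	{
 *		size_t *total = arg;
 *
 *		*total += ms->len;
 *		return 0;
 *	}
 *
 *	size_t total = 0;
 *	rte_memseg_walk(sum_len_cb, &total);
 */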
758 
759 int
760 rte_memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg)
761 {
762 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
763 	int i, ret = 0;
764 
765 	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
766 		struct rte_memseg_list *msl = &mcfg->memsegs[i];
767 
768 		if (msl->base_va == NULL)
769 			continue;
770 
771 		ret = func(msl, arg);
772 		if (ret)
773 			return ret;
774 	}
775 	return 0;
776 }
777 
778 int
779 rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg)
780 {
781 	int ret = 0;
782 
783 	/* do not allow allocations/frees/init while we iterate */
784 	rte_mcfg_mem_read_lock();
785 	ret = rte_memseg_list_walk_thread_unsafe(func, arg);
786 	rte_mcfg_mem_read_unlock();
787 
788 	return ret;
789 }
790 
791 int
792 rte_memseg_get_fd_thread_unsafe(const struct rte_memseg *ms)
793 {
794 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
795 	struct rte_memseg_list *msl;
796 	struct rte_fbarray *arr;
797 	int msl_idx, seg_idx, ret;
798 
799 	if (ms == NULL) {
800 		rte_errno = EINVAL;
801 		return -1;
802 	}
803 
804 	msl = rte_mem_virt2memseg_list(ms->addr);
805 	if (msl == NULL) {
806 		rte_errno = EINVAL;
807 		return -1;
808 	}
809 	arr = &msl->memseg_arr;
810 
811 	msl_idx = msl - mcfg->memsegs;
812 	seg_idx = rte_fbarray_find_idx(arr, ms);
813 
814 	if (!rte_fbarray_is_used(arr, seg_idx)) {
815 		rte_errno = ENOENT;
816 		return -1;
817 	}
818 
819 	/* segment fd API is not supported for external segments */
820 	if (msl->external) {
821 		rte_errno = ENOTSUP;
822 		return -1;
823 	}
824 
825 	ret = eal_memalloc_get_seg_fd(msl_idx, seg_idx);
826 	if (ret < 0) {
827 		rte_errno = -ret;
828 		ret = -1;
829 	}
830 	return ret;
831 }
832 
833 int
834 rte_memseg_get_fd(const struct rte_memseg *ms)
835 {
836 	int ret;
837 
838 	rte_mcfg_mem_read_lock();
839 	ret = rte_memseg_get_fd_thread_unsafe(ms);
840 	rte_mcfg_mem_read_unlock();
841 
842 	return ret;
843 }
844 
845 int
846 rte_memseg_get_fd_offset_thread_unsafe(const struct rte_memseg *ms,
847 		size_t *offset)
848 {
849 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
850 	struct rte_memseg_list *msl;
851 	struct rte_fbarray *arr;
852 	int msl_idx, seg_idx, ret;
853 
854 	if (ms == NULL || offset == NULL) {
855 		rte_errno = EINVAL;
856 		return -1;
857 	}
858 
859 	msl = rte_mem_virt2memseg_list(ms->addr);
860 	if (msl == NULL) {
861 		rte_errno = EINVAL;
862 		return -1;
863 	}
864 	arr = &msl->memseg_arr;
865 
866 	msl_idx = msl - mcfg->memsegs;
867 	seg_idx = rte_fbarray_find_idx(arr, ms);
868 
869 	if (!rte_fbarray_is_used(arr, seg_idx)) {
870 		rte_errno = ENOENT;
871 		return -1;
872 	}
873 
874 	/* segment fd API is not supported for external segments */
875 	if (msl->external) {
876 		rte_errno = ENOTSUP;
877 		return -1;
878 	}
879 
880 	ret = eal_memalloc_get_seg_fd_offset(msl_idx, seg_idx, offset);
881 	if (ret < 0) {
882 		rte_errno = -ret;
883 		ret = -1;
884 	}
885 	return ret;
886 }
887 
888 int
889 rte_memseg_get_fd_offset(const struct rte_memseg *ms, size_t *offset)
890 {
891 	int ret;
892 
893 	rte_mcfg_mem_read_lock();
894 	ret = rte_memseg_get_fd_offset_thread_unsafe(ms, offset);
895 	rte_mcfg_mem_read_unlock();
896 
897 	return ret;
898 }
899 
900 int
901 rte_extmem_register(void *va_addr, size_t len, rte_iova_t iova_addrs[],
902 		unsigned int n_pages, size_t page_sz)
903 {
904 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
905 	unsigned int socket_id, n;
906 	int ret = 0;
907 
908 	if (va_addr == NULL || page_sz == 0 || len == 0 ||
909 			!rte_is_power_of_2(page_sz) ||
910 			RTE_ALIGN(len, page_sz) != len ||
911 			((len / page_sz) != n_pages && iova_addrs != NULL) ||
912 			!rte_is_aligned(va_addr, page_sz)) {
913 		rte_errno = EINVAL;
914 		return -1;
915 	}
916 	rte_mcfg_mem_write_lock();
917 
918 	/* make sure the segment doesn't already exist */
919 	if (malloc_heap_find_external_seg(va_addr, len) != NULL) {
920 		rte_errno = EEXIST;
921 		ret = -1;
922 		goto unlock;
923 	}
924 
925 	/* get next available socket ID */
926 	socket_id = mcfg->next_socket_id;
927 	if (socket_id > INT32_MAX) {
928 		EAL_LOG(ERR, "Cannot assign new socket IDs");
929 		rte_errno = ENOSPC;
930 		ret = -1;
931 		goto unlock;
932 	}
933 
934 	/* we can create a new memseg list */
935 	n = len / page_sz;
936 	if (malloc_heap_create_external_seg(va_addr, iova_addrs, n,
937 			page_sz, "extmem", socket_id) == NULL) {
938 		ret = -1;
939 		goto unlock;
940 	}
941 
942 	/* memseg list successfully created - increment next socket ID */
943 	mcfg->next_socket_id++;
944 unlock:
945 	rte_mcfg_mem_write_unlock();
946 	return ret;
947 }
948 
949 int
950 rte_extmem_unregister(void *va_addr, size_t len)
951 {
952 	struct rte_memseg_list *msl;
953 	int ret = 0;
954 
955 	if (va_addr == NULL || len == 0) {
956 		rte_errno = EINVAL;
957 		return -1;
958 	}
959 	rte_mcfg_mem_write_lock();
960 
961 	/* find our segment */
962 	msl = malloc_heap_find_external_seg(va_addr, len);
963 	if (msl == NULL) {
964 		rte_errno = ENOENT;
965 		ret = -1;
966 		goto unlock;
967 	}
968 
969 	ret = malloc_heap_destroy_external_seg(msl);
970 unlock:
971 	rte_mcfg_mem_write_unlock();
972 	return ret;
973 }
974 
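/* Attach to or detach from the fbarray backing an external memory segment. */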
975 static int
976 sync_memory(void *va_addr, size_t len, bool attach)
977 {
978 	struct rte_memseg_list *msl;
979 	int ret = 0;
980 
981 	if (va_addr == NULL || len == 0) {
982 		rte_errno = EINVAL;
983 		return -1;
984 	}
985 	rte_mcfg_mem_write_lock();
986 
987 	/* find our segment */
988 	msl = malloc_heap_find_external_seg(va_addr, len);
989 	if (msl == NULL) {
990 		rte_errno = ENOENT;
991 		ret = -1;
992 		goto unlock;
993 	}
994 	if (attach)
995 		ret = rte_fbarray_attach(&msl->memseg_arr);
996 	else
997 		ret = rte_fbarray_detach(&msl->memseg_arr);
998 
999 unlock:
1000 	rte_mcfg_mem_write_unlock();
1001 	return ret;
1002 }
1003 
1004 int
1005 rte_extmem_attach(void *va_addr, size_t len)
1006 {
1007 	return sync_memory(va_addr, len, true);
1008 }
1009 
1010 int
1011 rte_extmem_detach(void *va_addr, size_t len)
1012 {
1013 	return sync_memory(va_addr, len, false);
1014 }
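
/*
 * Illustrative sketch (not part of the EAL code) of the expected lifecycle
 * for externally allocated memory: the primary process registers it (and
 * eventually unregisters it), while secondary processes attach to and detach
 * from it once the area is mapped at the same virtual addresses. The buffer,
 * its length and the page size below are hypothetical.
 *
 *	// primary process
 *	rte_extmem_register(buf, buf_len, NULL, 0, RTE_PGSIZE_4K);
 *	...
 *	rte_extmem_unregister(buf, buf_len);
 *
 *	// secondary process
 *	rte_extmem_attach(buf, buf_len);
 *	...
 *	rte_extmem_detach(buf, buf_len);
 */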
1015 
1016 /* detach all EAL memory */
1017 int
1018 rte_eal_memory_detach(void)
1019 {
1020 	const struct internal_config *internal_conf =
1021 		eal_get_internal_configuration();
1022 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1023 	size_t page_sz = rte_mem_page_size();
1024 	unsigned int i;
1025 
1026 	if (internal_conf->in_memory == 1)
1027 		return 0;
1028 
1029 	rte_rwlock_write_lock(&mcfg->memory_hotplug_lock);
1030 
1031 	/* detach internal memory subsystem data first */
1032 	if (eal_memalloc_cleanup())
1033 		EAL_LOG(ERR, "Could not release memory subsystem data");
1034 
1035 	for (i = 0; i < RTE_DIM(mcfg->memsegs); i++) {
1036 		struct rte_memseg_list *msl = &mcfg->memsegs[i];
1037 
1038 		/* skip uninitialized segments */
1039 		if (msl->base_va == NULL)
1040 			continue;
1041 		/*
1042 		 * external segments are supposed to be detached at this point,
1043 		 * but if they aren't, we can't really do anything about it,
1044 		 * because if we skip them here, they'll become invalid after
1045 		 * we unmap the memconfig anyway. however, if this is externally
1046 		 * referenced memory, we have no business unmapping it.
1047 		 */
1048 		if (!msl->external)
1049 			if (rte_mem_unmap(msl->base_va, msl->len) != 0)
1050 				EAL_LOG(ERR, "Could not unmap memory: %s",
1051 						rte_strerror(rte_errno));
1052 
1053 		/*
1054 		 * we are detaching the fbarray rather than destroying because
1055 		 * other processes might still reference this fbarray, and we
1056 		 * have no way of knowing if they still do.
1057 		 */
1058 		if (rte_fbarray_detach(&msl->memseg_arr))
1059 			EAL_LOG(ERR, "Could not detach fbarray: %s",
1060 					rte_strerror(rte_errno));
1061 	}
1062 	rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock);
1063 
1064 	/*
1065 	 * we've detached the memseg lists, so we can unmap the shared mem
1066 	 * config - we can't zero it out because it might still be referenced
1067 	 * by other processes.
1068 	 */
1069 	if (internal_conf->no_shconf == 0 && mcfg->mem_cfg_addr != 0) {
1070 		if (rte_mem_unmap(mcfg, RTE_ALIGN(sizeof(*mcfg), page_sz)) != 0)
1071 			EAL_LOG(ERR, "Could not unmap shared memory config: %s",
1072 					rte_strerror(rte_errno));
1073 	}
1074 	rte_eal_get_configuration()->mem_config = NULL;
1075 
1076 	return 0;
1077 }
1078 
1079 /* init memory subsystem */
1080 int
1081 rte_eal_memory_init(void)
1082 {
1083 	const struct internal_config *internal_conf =
1084 		eal_get_internal_configuration();
1085 	int retval;
1086 
1087 	EAL_LOG(DEBUG, "Setting up physically contiguous memory...");
1088 
1089 	if (rte_eal_memseg_init() < 0)
1090 		goto fail;
1091 
1092 	if (eal_memalloc_init() < 0)
1093 		goto fail;
1094 
1095 	retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
1096 			rte_eal_hugepage_init() :
1097 			rte_eal_hugepage_attach();
1098 	if (retval < 0)
1099 		goto fail;
1100 
1101 	if (internal_conf->no_shconf == 0 && rte_eal_memdevice_init() < 0)
1102 		goto fail;
1103 
1104 	return 0;
1105 fail:
1106 	return -1;
1107 }
1108 
1109 #ifndef RTE_EXEC_ENV_WINDOWS
1110 #define EAL_MEMZONE_LIST_REQ		"/eal/memzone_list"
1111 #define EAL_MEMZONE_INFO_REQ		"/eal/memzone_info"
1112 #define EAL_HEAP_LIST_REQ		"/eal/heap_list"
1113 #define EAL_HEAP_INFO_REQ		"/eal/heap_info"
1114 #define EAL_MEMSEG_LISTS_REQ		"/eal/memseg_lists"
1115 #define EAL_MEMSEG_LIST_INFO_REQ	"/eal/memseg_list_info"
1116 #define EAL_MEMSEG_INFO_REQ		"/eal/memseg_info"
1117 #define EAL_ELEMENT_LIST_REQ		"/eal/mem_element_list"
1118 #define EAL_ELEMENT_INFO_REQ		"/eal/mem_element_info"
1119 #define ADDR_STR			15
1120 
1121 
1122 /* Telemetry callback handler to return heap stats for requested heap id. */
1123 static int
1124 handle_eal_heap_info_request(const char *cmd __rte_unused, const char *params,
1125 			     struct rte_tel_data *d)
1126 {
1127 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1128 	struct rte_malloc_socket_stats sock_stats;
1129 	struct malloc_heap *heap;
1130 	unsigned int heap_id;
1131 
1132 	if (params == NULL || strlen(params) == 0)
1133 		return -1;
1134 
1135 	heap_id = (unsigned int)strtoul(params, NULL, 10);
1136 
1137 	/* Get the heap stats of user provided heap id */
1138 	heap = &mcfg->malloc_heaps[heap_id];
1139 	malloc_heap_get_stats(heap, &sock_stats);
1140 
1141 	rte_tel_data_start_dict(d);
1142 	rte_tel_data_add_dict_uint(d, "Heap_id", heap_id);
1143 	rte_tel_data_add_dict_string(d, "Name", heap->name);
1144 	rte_tel_data_add_dict_uint(d, "Heap_size",
1145 				   sock_stats.heap_totalsz_bytes);
1146 	rte_tel_data_add_dict_uint(d, "Free_size",
1147 				   sock_stats.heap_freesz_bytes);
1148 	rte_tel_data_add_dict_uint(d, "Alloc_size",
1149 				   sock_stats.heap_allocsz_bytes);
1150 	rte_tel_data_add_dict_uint(d, "Greatest_free_size",
1151 				   sock_stats.greatest_free_size);
1152 	rte_tel_data_add_dict_uint(d, "Alloc_count", sock_stats.alloc_count);
1153 	rte_tel_data_add_dict_uint(d, "Free_count", sock_stats.free_count);
1154 
1155 	return 0;
1156 }
1157 
1158 /* Telemetry callback handler to list the initialised heap IDs. */
1159 static int
1160 handle_eal_heap_list_request(const char *cmd __rte_unused,
1161 				const char *params __rte_unused,
1162 				struct rte_tel_data *d)
1163 {
1164 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1165 	struct rte_malloc_socket_stats sock_stats;
1166 	unsigned int heap_id;
1167 
1168 	rte_tel_data_start_array(d, RTE_TEL_INT_VAL);
1169 	/* Iterate through all initialised heaps */
1170 	for (heap_id = 0; heap_id < RTE_MAX_HEAPS; heap_id++) {
1171 		struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id];
1172 
1173 		malloc_heap_get_stats(heap, &sock_stats);
1174 		if (sock_stats.heap_totalsz_bytes != 0)
1175 			rte_tel_data_add_array_int(d, heap_id);
1176 	}
1177 
1178 	return 0;
1179 }
1180 
1181 /* Telemetry callback handler to return memzone info for requested index. */
1182 static int
1183 handle_eal_memzone_info_request(const char *cmd __rte_unused,
1184 				const char *params, struct rte_tel_data *d)
1185 {
1186 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1187 	struct rte_memseg_list *msl = NULL;
1188 	int ms_idx, ms_count = 0;
1189 	void *cur_addr, *mz_end;
1190 	struct rte_memzone *mz;
1191 	struct rte_memseg *ms;
1192 	char addr[ADDR_STR];
1193 	unsigned int mz_idx;
1194 	size_t page_sz;
1195 
1196 	if (params == NULL || strlen(params) == 0)
1197 		return -1;
1198 
1199 	mz_idx = strtoul(params, NULL, 10);
1200 
1201 	/* Get the memzone handle using index */
1202 	mz = rte_fbarray_get(&mcfg->memzones, mz_idx);
1203 
1204 	rte_tel_data_start_dict(d);
1205 	rte_tel_data_add_dict_uint(d, "Zone", mz_idx);
1206 	rte_tel_data_add_dict_string(d, "Name", mz->name);
1207 	rte_tel_data_add_dict_uint(d, "Length", mz->len);
1208 	snprintf(addr, ADDR_STR, "%p", mz->addr);
1209 	rte_tel_data_add_dict_string(d, "Address", addr);
1210 	rte_tel_data_add_dict_int(d, "Socket", mz->socket_id);
1211 	rte_tel_data_add_dict_uint(d, "Flags", mz->flags);
1212 
1213 	/* go through each page occupied by this memzone */
1214 	msl = rte_mem_virt2memseg_list(mz->addr);
1215 	if (!msl) {
1216 		EAL_LOG(DEBUG, "Skipping bad memzone");
1217 		return -1;
1218 	}
1219 	page_sz = (size_t)mz->hugepage_sz;
1220 	cur_addr = RTE_PTR_ALIGN_FLOOR(mz->addr, page_sz);
1221 	mz_end = RTE_PTR_ADD(cur_addr, mz->len);
1222 
1223 	ms_idx = RTE_PTR_DIFF(mz->addr, msl->base_va) / page_sz;
1224 	ms = rte_fbarray_get(&msl->memseg_arr, ms_idx);
1225 
1226 	rte_tel_data_add_dict_uint(d, "Hugepage_size", page_sz);
1227 	snprintf(addr, ADDR_STR, "%p", ms->addr);
1228 	rte_tel_data_add_dict_string(d, "Hugepage_base", addr);
1229 
1230 	do {
1231 		/* advance VA to next page */
1232 		cur_addr = RTE_PTR_ADD(cur_addr, page_sz);
1233 
1234 		/* memzones occupy contiguous segments */
1235 		++ms;
1236 		ms_count++;
1237 	} while (cur_addr < mz_end);
1238 
1239 	rte_tel_data_add_dict_int(d, "Hugepage_used", ms_count);
1240 
1241 	return 0;
1242 }
1243 
1244 static void
1245 memzone_list_cb(const struct rte_memzone *mz __rte_unused,
1246 		 void *arg __rte_unused)
1247 {
1248 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1249 	struct rte_tel_data *d = arg;
1250 	int mz_idx;
1251 
1252 	mz_idx = rte_fbarray_find_idx(&mcfg->memzones, mz);
1253 	rte_tel_data_add_array_int(d, mz_idx);
1254 }
1255 
1256 
1257 /* Telemetry callback handler to list the memzones reserved. */
1258 static int
1259 handle_eal_memzone_list_request(const char *cmd __rte_unused,
1260 				const char *params __rte_unused,
1261 				struct rte_tel_data *d)
1262 {
1263 	rte_tel_data_start_array(d, RTE_TEL_INT_VAL);
1264 	rte_memzone_walk(memzone_list_cb, d);
1265 
1266 	return 0;
1267 }
1268 
1269 /* n_vals is the number of params to be parsed. */
1270 static int
1271 parse_params(const char *params, uint32_t *vals, size_t n_vals)
1272 {
1273 	char dlim[2] = ",";
1274 	char *params_args;
1275 	size_t count = 0;
1276 	char *token;
1277 
1278 	if (vals == NULL || params == NULL || strlen(params) == 0)
1279 		return -1;
1280 
1281 	/* strtok() modifies its input and expects char *, but params is
1282 	 * const char *, so operate on a writable copy instead.
1283 	 */
1284 	params_args = strdup(params);
1285 	if (params_args == NULL)
1286 		return -1;
1287 
1288 	token = strtok(params_args, dlim);
1289 	while (token && isdigit(*token) && count < n_vals) {
1290 		vals[count++] = strtoul(token, NULL, 10);
1291 		token = strtok(NULL, dlim);
1292 	}
1293 
1294 	free(params_args);
1295 
1296 	if (count < n_vals)
1297 		return -1;
1298 
1299 	return 0;
1300 }
1301 
1302 static int
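/* Telemetry callback handler to list IDs of memseg lists that contain segments. */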
1303 handle_eal_memseg_lists_request(const char *cmd __rte_unused,
1304 				const char *params __rte_unused,
1305 				struct rte_tel_data *d)
1306 {
1307 	struct rte_mem_config *mcfg;
1308 	int i;
1309 
1310 	rte_tel_data_start_array(d, RTE_TEL_INT_VAL);
1311 
1312 	rte_mcfg_mem_read_lock();
1313 	mcfg = rte_eal_get_configuration()->mem_config;
1314 
1315 	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
1316 		struct rte_memseg_list *msl = &mcfg->memsegs[i];
1317 		if (msl->memseg_arr.count == 0)
1318 			continue;
1319 
1320 		rte_tel_data_add_array_int(d, i);
1321 	}
1322 	rte_mcfg_mem_read_unlock();
1323 
1324 	return 0;
1325 }
1326 
1327 static int
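/* Telemetry callback handler to list used memseg indexes of a given memseg list. */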
1328 handle_eal_memseg_list_info_request(const char *cmd __rte_unused,
1329 				    const char *params, struct rte_tel_data *d)
1330 {
1331 	struct rte_mem_config *mcfg;
1332 	struct rte_memseg_list *msl;
1333 	struct rte_fbarray *arr;
1334 	uint32_t ms_list_idx;
1335 	int ms_idx;
1336 	/* size of an array == num params to be parsed. */
1337 	uint32_t vals[1] = {0};
1338 
1339 	if (parse_params(params, vals, RTE_DIM(vals)) < 0)
1340 		return -1;
1341 
1342 	ms_list_idx = vals[0];
1343 	if (ms_list_idx >= RTE_MAX_MEMSEG_LISTS)
1344 		return -1;
1345 
1346 	rte_tel_data_start_array(d, RTE_TEL_INT_VAL);
1347 
1348 	rte_mcfg_mem_read_lock();
1349 	mcfg = rte_eal_get_configuration()->mem_config;
1350 	msl = &mcfg->memsegs[ms_list_idx];
1351 	if (msl->memseg_arr.count == 0)
1352 		goto done;
1353 
1354 	arr = &msl->memseg_arr;
1355 
1356 	ms_idx = rte_fbarray_find_next_used(arr, 0);
1357 	while (ms_idx >= 0) {
1358 		rte_tel_data_add_array_int(d, ms_idx);
1359 		ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1);
1360 	}
1361 
1362 done:
1363 	rte_mcfg_mem_read_unlock();
1364 
1365 	return 0;
1366 }
1367 
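/* Telemetry callback handler to return info for a given memseg list and memseg index. */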
1368 static int
1369 handle_eal_memseg_info_request(const char *cmd __rte_unused,
1370 			       const char *params, struct rte_tel_data *d)
1371 {
1372 	struct rte_mem_config *mcfg;
1373 	uint64_t ms_start_addr, ms_end_addr, ms_size, hugepage_size, ms_iova;
1374 	struct rte_memseg_list *msl;
1375 	const struct rte_memseg *ms;
1376 	struct rte_fbarray *arr;
1377 	char addr[ADDR_STR];
1378 	uint32_t ms_list_idx = 0;
1379 	uint32_t ms_idx = 0;
1380 	int32_t ms_socket_id;
1381 	uint32_t ms_flags;
1382 	/* size of an array == num params to be parsed. */
1383 	uint32_t vals[2] = {0};
1384 
1385 	if (parse_params(params, vals, RTE_DIM(vals)) < 0)
1386 		return -1;
1387 
1388 	ms_list_idx = vals[0];
1389 	if (ms_list_idx >= RTE_MAX_MEMSEG_LISTS)
1390 		return -1;
1391 
1392 	ms_idx = vals[1];
1393 
1394 	rte_mcfg_mem_read_lock();
1395 
1396 	mcfg = rte_eal_get_configuration()->mem_config;
1397 	msl = &mcfg->memsegs[ms_list_idx];
1398 	if (msl->memseg_arr.count == 0) {
1399 		rte_mcfg_mem_read_unlock();
1400 		return -1;
1401 	}
1402 
1403 	arr = &msl->memseg_arr;
1404 	ms = rte_fbarray_get(arr, ms_idx);
1405 	if (ms == NULL) {
1406 		rte_mcfg_mem_read_unlock();
1407 		EAL_LOG(DEBUG, "Error fetching requested memseg.");
1408 		return -1;
1409 	}
1410 
1411 	ms_iova = ms->iova;
1412 	ms_start_addr = ms->addr_64;
1413 	ms_end_addr = (uint64_t)RTE_PTR_ADD(ms_start_addr, ms->len);
1414 	ms_size = ms->len;
1415 	hugepage_size = ms->hugepage_sz;
1416 	ms_socket_id = ms->socket_id;
1417 	ms_flags = ms->flags;
1418 
1419 	rte_mcfg_mem_read_unlock();
1420 
1421 	rte_tel_data_start_dict(d);
1422 	rte_tel_data_add_dict_int(d, "Memseg_list_index", ms_list_idx);
1423 	rte_tel_data_add_dict_int(d, "Memseg_index", ms_idx);
1424 	if (ms_iova == RTE_BAD_IOVA)
1425 		snprintf(addr, ADDR_STR, "Bad IOVA");
1426 	else
1427 		snprintf(addr, ADDR_STR, "0x%"PRIx64, ms_iova);
1428 
1429 	rte_tel_data_add_dict_string(d, "IOVA_addr", addr);
1430 	snprintf(addr, ADDR_STR, "0x%"PRIx64, ms_start_addr);
1431 	rte_tel_data_add_dict_string(d, "Start_addr", addr);
1432 	snprintf(addr, ADDR_STR, "0x%"PRIx64, ms_end_addr);
1433 	rte_tel_data_add_dict_string(d, "End_addr", addr);
1434 	rte_tel_data_add_dict_uint(d, "Size", ms_size);
1435 	rte_tel_data_add_dict_uint(d, "Hugepage_size", hugepage_size);
1436 	rte_tel_data_add_dict_int(d, "Socket_id", ms_socket_id);
1437 	rte_tel_data_add_dict_int(d, "flags", ms_flags);
1438 
1439 	return 0;
1440 }
1441 
1442 static int
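/* Telemetry callback handler to count heap elements that lie within a given memseg. */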
1443 handle_eal_element_list_request(const char *cmd __rte_unused,
1444 				const char *params, struct rte_tel_data *d)
1445 {
1446 	struct rte_mem_config *mcfg;
1447 	struct rte_memseg_list *msl;
1448 	const struct rte_memseg *ms;
1449 	struct malloc_elem *elem;
1450 	struct malloc_heap *heap;
1451 	uint64_t ms_start_addr, ms_end_addr;
1452 	uint64_t elem_start_addr, elem_end_addr;
1453 	uint32_t ms_list_idx = 0;
1454 	uint32_t heap_id = 0;
1455 	uint32_t ms_idx = 0;
1456 	int elem_count = 0;
1457 	/* size of an array == num params to be parsed. */
1458 	uint32_t vals[3] = {0};
1459 
1460 	if (parse_params(params, vals, RTE_DIM(vals)) < 0)
1461 		return -1;
1462 
1463 	heap_id = vals[0];
1464 	if (heap_id >= RTE_MAX_HEAPS)
1465 		return -1;
1466 
1467 	ms_list_idx = vals[1];
1468 	if (ms_list_idx >= RTE_MAX_MEMSEG_LISTS)
1469 		return -1;
1470 
1471 	ms_idx = vals[2];
1472 
1473 	rte_mcfg_mem_read_lock();
1474 
1475 	mcfg = rte_eal_get_configuration()->mem_config;
1476 	msl = &mcfg->memsegs[ms_list_idx];
1477 	ms = rte_fbarray_get(&msl->memseg_arr, ms_idx);
1478 	if (ms == NULL) {
1479 		rte_mcfg_mem_read_unlock();
1480 		EAL_LOG(DEBUG, "Error fetching requested memseg.");
1481 		return -1;
1482 	}
1483 
1484 	ms_start_addr = ms->addr_64;
1485 	ms_end_addr = (uint64_t)RTE_PTR_ADD(ms_start_addr, ms->len);
1486 	rte_mcfg_mem_read_unlock();
1487 
1488 	rte_tel_data_start_dict(d);
1489 
1490 	heap = &mcfg->malloc_heaps[heap_id];
1491 	rte_spinlock_lock(&heap->lock);
1492 
1493 	elem = heap->first;
1494 	while (elem) {
1495 		elem_start_addr = (uint64_t)elem;
1496 		elem_end_addr =
1497 			(uint64_t)RTE_PTR_ADD(elem_start_addr, elem->size);
1498 
1499 		if ((uint64_t)elem_start_addr >= ms_start_addr &&
1500 		    (uint64_t)elem_end_addr <= ms_end_addr)
1501 			elem_count++;
1502 		elem = elem->next;
1503 	}
1504 
1505 	rte_spinlock_unlock(&heap->lock);
1506 
1507 	rte_tel_data_add_dict_int(d, "Element_count", elem_count);
1508 
1509 	return 0;
1510 }
1511 
1512 static int
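/* Telemetry callback handler to return info on a range of heap elements within a given memseg. */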
1513 handle_eal_element_info_request(const char *cmd __rte_unused,
1514 				const char *params, struct rte_tel_data *d)
1515 {
1516 	struct rte_mem_config *mcfg;
1517 	struct rte_memseg_list *msl;
1518 	const struct rte_memseg *ms;
1519 	struct malloc_elem *elem;
1520 	struct malloc_heap *heap;
1521 	struct rte_tel_data *c;
1522 	uint64_t ms_start_addr, ms_end_addr;
1523 	uint64_t elem_start_addr, elem_end_addr;
1524 	uint32_t ms_list_idx = 0;
1525 	uint32_t heap_id = 0;
1526 	uint32_t ms_idx = 0;
1527 	uint32_t start_elem = 0, end_elem = 0;
1528 	uint32_t count = 0, elem_count = 0;
1529 	char str[ADDR_STR];
1530 	/* size of an array == num params to be parsed. */
1531 	uint32_t vals[5] = {0};
1532 
1533 	if (parse_params(params, vals, RTE_DIM(vals)) < 0)
1534 		return -1;
1535 
1536 	heap_id = vals[0];
1537 	if (heap_id >= RTE_MAX_HEAPS)
1538 		return -1;
1539 
1540 	ms_list_idx = vals[1];
1541 	if (ms_list_idx >= RTE_MAX_MEMSEG_LISTS)
1542 		return -1;
1543 
1544 	ms_idx = vals[2];
1545 	start_elem = vals[3];
1546 	end_elem = vals[4];
1547 
1548 	if (end_elem < start_elem)
1549 		return -1;
1550 
1551 	rte_mcfg_mem_read_lock();
1552 
1553 	mcfg = rte_eal_get_configuration()->mem_config;
1554 	msl = &mcfg->memsegs[ms_list_idx];
1555 	ms = rte_fbarray_get(&msl->memseg_arr, ms_idx);
1556 	if (ms == NULL) {
1557 		rte_mcfg_mem_read_unlock();
1558 		EAL_LOG(DEBUG, "Error fetching requested memseg.");
1559 		return -1;
1560 	}
1561 
1562 	ms_start_addr = ms->addr_64;
1563 	ms_end_addr = (uint64_t)RTE_PTR_ADD(ms_start_addr, ms->len);
1564 
1565 	rte_mcfg_mem_read_unlock();
1566 
1567 	rte_tel_data_start_dict(d);
1568 
1569 	heap = &mcfg->malloc_heaps[heap_id];
1570 	rte_spinlock_lock(&heap->lock);
1571 
1572 	elem = heap->first;
1573 	while (elem) {
1574 		elem_start_addr = (uint64_t)elem;
1575 		elem_end_addr =
1576 			(uint64_t)RTE_PTR_ADD(elem_start_addr, elem->size);
1577 
1578 		if (elem_start_addr < ms_start_addr ||
1579 				elem_end_addr > ms_end_addr) {
1580 			elem = elem->next;
1581 			continue;
1582 		}
1583 
1584 		if (count < start_elem) {
1585 			elem = elem->next;
1586 			count++;
1587 			continue;
1588 		}
1589 
1590 		c = rte_tel_data_alloc();
1591 		if (c == NULL)
1592 			break;
1593 
1594 		rte_tel_data_start_dict(c);
1595 		rte_tel_data_add_dict_int(c, "msl_id", ms_list_idx);
1596 		rte_tel_data_add_dict_int(c, "ms_id", ms_idx);
1597 		snprintf(str, ADDR_STR, "0x%"PRIx64, ms_start_addr);
1598 		rte_tel_data_add_dict_string(c, "memseg_start_addr", str);
1599 		snprintf(str, ADDR_STR, "0x%"PRIx64, ms_end_addr);
1600 		rte_tel_data_add_dict_string(c, "memseg_end_addr", str);
1601 		snprintf(str, ADDR_STR, "0x%"PRIx64, elem_start_addr);
1602 		rte_tel_data_add_dict_string(c, "element_start_addr", str);
1603 		snprintf(str, ADDR_STR, "0x%"PRIx64, elem_end_addr);
1604 		rte_tel_data_add_dict_string(c, "element_end_addr", str);
1605 		rte_tel_data_add_dict_int(c, "element_size", elem->size);
1606 		snprintf(str, ADDR_STR, "%s", elem->state == 0 ? "Free" :
1607 			 elem->state == 1 ? "Busy" : elem->state == 2 ?
1608 			 "Pad" : "Error");
1609 		rte_tel_data_add_dict_string(c, "element_state", str);
1610 
1611 		snprintf(str, ADDR_STR, "%s_%u", "element", count);
1612 		if (rte_tel_data_add_dict_container(d, str, c, 0) != 0) {
1613 			rte_tel_data_free(c);
1614 			break;
1615 		}
1616 
1617 		elem_count++;
1618 		count++;
1619 		if (count > end_elem)
1620 			break;
1621 
1622 		elem = elem->next;
1623 	}
1624 
1625 	rte_spinlock_unlock(&heap->lock);
1626 
1627 	rte_tel_data_add_dict_int(d, "Element_count", elem_count);
1628 
1629 	return 0;
1630 }
1631 
1632 RTE_INIT(memory_telemetry)
1633 {
1634 	rte_telemetry_register_cmd(
1635 			EAL_MEMZONE_LIST_REQ, handle_eal_memzone_list_request,
1636 			"List of reserved memzone indexes. Takes no parameters");
1637 	rte_telemetry_register_cmd(
1638 			EAL_MEMZONE_INFO_REQ, handle_eal_memzone_info_request,
1639 			"Returns memzone info. Parameters: int mz_id");
1640 	rte_telemetry_register_cmd(
1641 			EAL_HEAP_LIST_REQ, handle_eal_heap_list_request,
1642 			"List of initialised heap indexes. Takes no parameters");
1643 	rte_telemetry_register_cmd(
1644 			EAL_HEAP_INFO_REQ, handle_eal_heap_info_request,
1645 			"Returns malloc heap stats. Parameters: int heap_id");
1646 	rte_telemetry_register_cmd(
1647 			EAL_MEMSEG_LISTS_REQ,
1648 			handle_eal_memseg_lists_request,
1649 			"Returns array of memseg list IDs. Takes no parameters");
1650 	rte_telemetry_register_cmd(
1651 			EAL_MEMSEG_LIST_INFO_REQ,
1652 			handle_eal_memseg_list_info_request,
1653 			"Returns memseg list info. Parameters: int memseg_list_id");
1654 	rte_telemetry_register_cmd(
1655 			EAL_MEMSEG_INFO_REQ, handle_eal_memseg_info_request,
1656 			"Returns memseg info. Parameters: int memseg_list_id, int memseg_id");
1657 	rte_telemetry_register_cmd(EAL_ELEMENT_LIST_REQ,
1658 			handle_eal_element_list_request,
1659 			"Returns array of heap element IDs. Parameters: int heap_id, int memseg_list_id, int memseg_id");
1660 	rte_telemetry_register_cmd(EAL_ELEMENT_INFO_REQ,
1661 			handle_eal_element_info_request,
1662 			"Returns element info. Parameters: int heap_id, int memseg_list_id, int memseg_id, int start_elem_id, int end_elem_id");
1663 }
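
/*
 * Illustrative sketch (not part of the EAL code): the endpoints registered
 * above can be queried at runtime over the telemetry socket, for example
 * with usertools/dpdk-telemetry.py. The indexes below are hypothetical.
 *
 *	--> /eal/heap_list
 *	--> /eal/heap_info,0
 *	--> /eal/memseg_lists
 *	--> /eal/memseg_list_info,0
 *	--> /eal/memseg_info,0,0
 *	--> /eal/mem_element_list,0,0,0
 *	--> /eal/mem_element_info,0,0,0,0,5
 */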
1664 #endif
1665