xref: /dpdk/lib/eal/common/eal_common_memory.c (revision daa02b5cddbb8e11b31d41e2bf7bb1ae64dcae2f)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2014 Intel Corporation
3  */
4 
5 #include <fcntl.h>
6 #include <errno.h>
7 #include <stdio.h>
8 #include <stdint.h>
9 #include <stdlib.h>
10 #include <stdarg.h>
11 #include <string.h>
12 #include <unistd.h>
13 #include <inttypes.h>
14 #include <sys/queue.h>
15 
16 #include <rte_fbarray.h>
17 #include <rte_memory.h>
18 #include <rte_eal.h>
19 #include <rte_eal_memconfig.h>
20 #include <rte_eal_paging.h>
21 #include <rte_errno.h>
22 #include <rte_log.h>
23 
24 #include "eal_memalloc.h"
25 #include "eal_private.h"
26 #include "eal_internal_cfg.h"
27 #include "eal_memcfg.h"
28 #include "eal_options.h"
29 #include "malloc_heap.h"
30 
31 /*
32  * eal_get_virtual_area(): try to reserve *size bytes of virtual address
33  * space (optionally at requested_addr), keeping *size unmodified on
34  * success. If shrinking is allowed and the reservation fails, retry with
35  * a smaller area, decreasing *size by page_sz until it reaches 0, in
36  * which case return NULL. The returned address is aligned to page_sz.
37  */
38 
39 #define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i"
40 
41 static void *next_baseaddr;
42 static uint64_t system_page_sz;
43 
44 #define MAX_MMAP_WITH_DEFINED_ADDR_TRIES 5
45 void *
46 eal_get_virtual_area(void *requested_addr, size_t *size,
47 	size_t page_sz, int flags, int reserve_flags)
48 {
49 	bool addr_is_hint, allow_shrink, unmap, no_align;
50 	uint64_t map_sz;
51 	void *mapped_addr, *aligned_addr;
52 	uint8_t try = 0;
53 	struct internal_config *internal_conf =
54 		eal_get_internal_configuration();
55 
56 	if (system_page_sz == 0)
57 		system_page_sz = rte_mem_page_size();
58 
59 	RTE_LOG(DEBUG, EAL, "Asking for a virtual area of 0x%zx bytes\n", *size);
60 
61 	addr_is_hint = (flags & EAL_VIRTUAL_AREA_ADDR_IS_HINT) > 0;
62 	allow_shrink = (flags & EAL_VIRTUAL_AREA_ALLOW_SHRINK) > 0;
63 	unmap = (flags & EAL_VIRTUAL_AREA_UNMAP) > 0;
64 
65 	if (next_baseaddr == NULL && internal_conf->base_virtaddr != 0 &&
66 			rte_eal_process_type() == RTE_PROC_PRIMARY)
67 		next_baseaddr = (void *) internal_conf->base_virtaddr;
68 
69 #ifdef RTE_ARCH_64
70 	if (next_baseaddr == NULL && internal_conf->base_virtaddr == 0 &&
71 			rte_eal_process_type() == RTE_PROC_PRIMARY)
72 		next_baseaddr = (void *) eal_get_baseaddr();
73 #endif
74 	if (requested_addr == NULL && next_baseaddr != NULL) {
75 		requested_addr = next_baseaddr;
76 		requested_addr = RTE_PTR_ALIGN(requested_addr, page_sz);
77 		addr_is_hint = true;
78 	}
79 
80 	/* we don't need alignment of resulting pointer in the following cases:
81 	 *
82 	 * 1. page size is equal to the system page size
83 	 * 2. we have a requested address, and it is page-aligned, and we will
84 	 *    be discarding the address if we get a different one.
85 	 *
86 	 * for all other cases, alignment is potentially necessary.
87 	 */
88 	no_align = (requested_addr != NULL &&
89 		requested_addr == RTE_PTR_ALIGN(requested_addr, page_sz) &&
90 		!addr_is_hint) ||
91 		page_sz == system_page_sz;
92 
93 	do {
94 		map_sz = no_align ? *size : *size + page_sz;
95 		if (map_sz > SIZE_MAX) {
96 			RTE_LOG(ERR, EAL, "Map size too big\n");
97 			rte_errno = E2BIG;
98 			return NULL;
99 		}
100 
101 		mapped_addr = eal_mem_reserve(
102 			requested_addr, (size_t)map_sz, reserve_flags);
103 		if ((mapped_addr == NULL) && allow_shrink)
104 			*size -= page_sz;
105 
106 		if ((mapped_addr != NULL) && addr_is_hint &&
107 				(mapped_addr != requested_addr)) {
108 			try++;
109 			next_baseaddr = RTE_PTR_ADD(next_baseaddr, page_sz);
110 			if (try <= MAX_MMAP_WITH_DEFINED_ADDR_TRIES) {
111 				/* hint was not used. Try with another offset */
112 				eal_mem_free(mapped_addr, map_sz);
113 				mapped_addr = NULL;
114 				requested_addr = next_baseaddr;
115 			}
116 		}
117 	} while ((allow_shrink || addr_is_hint) &&
118 		(mapped_addr == NULL) && (*size > 0));
119 
120 	/* align resulting address - if map failed, we will ignore the value
121 	 * anyway, so no need to add additional checks.
122 	 */
123 	aligned_addr = no_align ? mapped_addr :
124 			RTE_PTR_ALIGN(mapped_addr, page_sz);
125 
126 	if (*size == 0) {
127 		RTE_LOG(ERR, EAL, "Cannot get a virtual area of any size: %s\n",
128 			rte_strerror(rte_errno));
129 		return NULL;
130 	} else if (mapped_addr == NULL) {
131 		RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n",
132 			rte_strerror(rte_errno));
133 		return NULL;
134 	} else if (requested_addr != NULL && !addr_is_hint &&
135 			aligned_addr != requested_addr) {
136 		RTE_LOG(ERR, EAL, "Cannot get a virtual area at requested address: %p (got %p)\n",
137 			requested_addr, aligned_addr);
138 		eal_mem_free(mapped_addr, map_sz);
139 		rte_errno = EADDRNOTAVAIL;
140 		return NULL;
141 	} else if (requested_addr != NULL && addr_is_hint &&
142 			aligned_addr != requested_addr) {
143 		RTE_LOG(WARNING, EAL, "WARNING! Base virtual address hint (%p != %p) not respected!\n",
144 			requested_addr, aligned_addr);
145 		RTE_LOG(WARNING, EAL, "   This may cause issues with mapping memory into secondary processes\n");
146 	} else if (next_baseaddr != NULL) {
147 		next_baseaddr = RTE_PTR_ADD(aligned_addr, *size);
148 	}
149 
150 	RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n",
151 		aligned_addr, *size);
152 
153 	if (unmap) {
154 		eal_mem_free(mapped_addr, map_sz);
155 	} else if (!no_align) {
156 		void *map_end, *aligned_end;
157 		size_t before_len, after_len;
158 
159 		/* When we reserve space with alignment, we add the alignment to
160 		 * the mapping size. On 32-bit, if 1GB alignment was requested,
161 		 * this would waste 1GB of address space, which is a luxury we
162 		 * cannot afford. So, if alignment was performed, check whether
163 		 * any unneeded address space can be unmapped.
164 		 */
165 
166 		map_end = RTE_PTR_ADD(mapped_addr, (size_t)map_sz);
167 		aligned_end = RTE_PTR_ADD(aligned_addr, *size);
168 
169 		/* unmap space before aligned mmap address */
170 		before_len = RTE_PTR_DIFF(aligned_addr, mapped_addr);
171 		if (before_len > 0)
172 			eal_mem_free(mapped_addr, before_len);
173 
174 		/* unmap space after aligned end mmap address */
175 		after_len = RTE_PTR_DIFF(map_end, aligned_end);
176 		if (after_len > 0)
177 			eal_mem_free(aligned_end, after_len);
178 	}
179 
180 	if (!unmap) {
181 		/* Exclude these pages from a core dump. */
182 		eal_mem_set_dump(aligned_addr, *size, false);
183 	}
184 
185 	return aligned_addr;
186 }
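
/*
 * Usage sketch (illustrative only, not compiled as part of this file): how
 * an EAL-internal caller might reserve page-aligned virtual address space
 * with eal_get_virtual_area(), allowing the reservation to shrink if the
 * full size cannot be obtained. The sizes below are made-up example values.
 *
 *	size_t sz = RTE_PGSIZE_1G;
 *	void *va;
 *
 *	va = eal_get_virtual_area(NULL, &sz, RTE_PGSIZE_2M,
 *			EAL_VIRTUAL_AREA_ALLOW_SHRINK, 0);
 *	if (va == NULL)
 *		return -1;
 *	(on success, 'sz' holds the size actually reserved, possibly smaller)
 */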
187 
188 int
189 eal_memseg_list_init_named(struct rte_memseg_list *msl, const char *name,
190 		uint64_t page_sz, int n_segs, int socket_id, bool heap)
191 {
192 	if (rte_fbarray_init(&msl->memseg_arr, name, n_segs,
193 			sizeof(struct rte_memseg))) {
194 		RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n",
195 			rte_strerror(rte_errno));
196 		return -1;
197 	}
198 
199 	msl->page_sz = page_sz;
200 	msl->socket_id = socket_id;
201 	msl->base_va = NULL;
202 	msl->heap = heap;
203 
204 	RTE_LOG(DEBUG, EAL,
205 		"Memseg list allocated at socket %i, page size 0x%"PRIx64"kB\n",
206 		socket_id, page_sz >> 10);
207 
208 	return 0;
209 }
210 
211 int
212 eal_memseg_list_init(struct rte_memseg_list *msl, uint64_t page_sz,
213 		int n_segs, int socket_id, int type_msl_idx, bool heap)
214 {
215 	char name[RTE_FBARRAY_NAME_LEN];
216 
217 	snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id,
218 		 type_msl_idx);
219 
220 	return eal_memseg_list_init_named(
221 		msl, name, page_sz, n_segs, socket_id, heap);
222 }
223 
224 int
225 eal_memseg_list_alloc(struct rte_memseg_list *msl, int reserve_flags)
226 {
227 	size_t page_sz, mem_sz;
228 	void *addr;
229 
230 	page_sz = msl->page_sz;
231 	mem_sz = page_sz * msl->memseg_arr.len;
232 
233 	addr = eal_get_virtual_area(
234 		msl->base_va, &mem_sz, page_sz, 0, reserve_flags);
235 	if (addr == NULL) {
236 #ifndef RTE_EXEC_ENV_WINDOWS
237 		/* The hint would be misleading on Windows, because address
238 		 * is by default system-selected (base VA = 0).
239 		 * However, this function is called from many places,
240 		 * including common code, so don't duplicate the message.
241 		 */
242 		if (rte_errno == EADDRNOTAVAIL)
243 			RTE_LOG(ERR, EAL, "Cannot reserve %llu bytes at [%p] - "
244 				"please use '--" OPT_BASE_VIRTADDR "' option\n",
245 				(unsigned long long)mem_sz, msl->base_va);
246 #endif
247 		return -1;
248 	}
249 	msl->base_va = addr;
250 	msl->len = mem_sz;
251 
252 	RTE_LOG(DEBUG, EAL, "VA reserved for memseg list at %p, size %zx\n",
253 			addr, mem_sz);
254 
255 	return 0;
256 }
257 
258 void
259 eal_memseg_list_populate(struct rte_memseg_list *msl, void *addr, int n_segs)
260 {
261 	size_t page_sz = msl->page_sz;
262 	int i;
263 
264 	for (i = 0; i < n_segs; i++) {
265 		struct rte_fbarray *arr = &msl->memseg_arr;
266 		struct rte_memseg *ms = rte_fbarray_get(arr, i);
267 
268 		if (rte_eal_iova_mode() == RTE_IOVA_VA)
269 			ms->iova = (uintptr_t)addr;
270 		else
271 			ms->iova = RTE_BAD_IOVA;
272 		ms->addr = addr;
273 		ms->hugepage_sz = page_sz;
274 		ms->socket_id = 0;
275 		ms->len = page_sz;
276 
277 		rte_fbarray_set_used(arr, i);
278 
279 		addr = RTE_PTR_ADD(addr, page_sz);
280 	}
281 }
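
/*
 * Usage sketch (illustrative only): the order in which the helpers above are
 * typically used by EAL memory initialization code. A list is initialized
 * (its fbarray created), virtual address space is reserved for it, and, for
 * pre-mapped memory, its segments are populated. The list index, page size
 * and segment count below are example values.
 *
 *	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
 *	struct rte_memseg_list *msl = &mcfg->memsegs[0];
 *	int n_segs = 128;
 *
 *	if (eal_memseg_list_init(msl, RTE_PGSIZE_2M, n_segs, 0, 0, true) < 0)
 *		return -1;
 *	if (eal_memseg_list_alloc(msl, 0) < 0)
 *		return -1;
 *	eal_memseg_list_populate(msl, msl->base_va, n_segs);
 */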
282 
283 static struct rte_memseg *
284 virt2memseg(const void *addr, const struct rte_memseg_list *msl)
285 {
286 	const struct rte_fbarray *arr;
287 	void *start, *end;
288 	int ms_idx;
289 
290 	if (msl == NULL)
291 		return NULL;
292 
293 	/* a memseg list was specified, check if it's the right one */
294 	start = msl->base_va;
295 	end = RTE_PTR_ADD(start, msl->len);
296 
297 	if (addr < start || addr >= end)
298 		return NULL;
299 
300 	/* now, calculate index */
301 	arr = &msl->memseg_arr;
302 	ms_idx = RTE_PTR_DIFF(addr, msl->base_va) / msl->page_sz;
303 	return rte_fbarray_get(arr, ms_idx);
304 }
305 
306 static struct rte_memseg_list *
307 virt2memseg_list(const void *addr)
308 {
309 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
310 	struct rte_memseg_list *msl;
311 	int msl_idx;
312 
313 	for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
314 		void *start, *end;
315 		msl = &mcfg->memsegs[msl_idx];
316 
317 		start = msl->base_va;
318 		end = RTE_PTR_ADD(start, msl->len);
319 		if (addr >= start && addr < end)
320 			break;
321 	}
322 	/* if we didn't find our memseg list */
323 	if (msl_idx == RTE_MAX_MEMSEG_LISTS)
324 		return NULL;
325 	return msl;
326 }
327 
328 struct rte_memseg_list *
329 rte_mem_virt2memseg_list(const void *addr)
330 {
331 	return virt2memseg_list(addr);
332 }
333 
334 struct virtiova {
335 	rte_iova_t iova;
336 	void *virt;
337 };
338 static int
339 find_virt(const struct rte_memseg_list *msl __rte_unused,
340 		const struct rte_memseg *ms, void *arg)
341 {
342 	struct virtiova *vi = arg;
343 	if (vi->iova >= ms->iova && vi->iova < (ms->iova + ms->len)) {
344 		size_t offset = vi->iova - ms->iova;
345 		vi->virt = RTE_PTR_ADD(ms->addr, offset);
346 		/* stop the walk */
347 		return 1;
348 	}
349 	return 0;
350 }
351 static int
352 find_virt_legacy(const struct rte_memseg_list *msl __rte_unused,
353 		const struct rte_memseg *ms, size_t len, void *arg)
354 {
355 	struct virtiova *vi = arg;
356 	if (vi->iova >= ms->iova && vi->iova < (ms->iova + len)) {
357 		size_t offset = vi->iova - ms->iova;
358 		vi->virt = RTE_PTR_ADD(ms->addr, offset);
359 		/* stop the walk */
360 		return 1;
361 	}
362 	return 0;
363 }
364 
365 void *
366 rte_mem_iova2virt(rte_iova_t iova)
367 {
368 	struct virtiova vi;
369 	const struct internal_config *internal_conf =
370 		eal_get_internal_configuration();
371 
372 	memset(&vi, 0, sizeof(vi));
373 
374 	vi.iova = iova;
375 	/* for legacy mem, we can get away with scanning VA-contiguous segments,
376 	 * as we know they are PA-contiguous as well
377 	 */
378 	if (internal_conf->legacy_mem)
379 		rte_memseg_contig_walk(find_virt_legacy, &vi);
380 	else
381 		rte_memseg_walk(find_virt, &vi);
382 
383 	return vi.virt;
384 }
385 
386 struct rte_memseg *
387 rte_mem_virt2memseg(const void *addr, const struct rte_memseg_list *msl)
388 {
389 	return virt2memseg(addr, msl != NULL ? msl :
390 			rte_mem_virt2memseg_list(addr));
391 }
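
/*
 * Usage sketch (illustrative only): translating a virtual address known to
 * belong to DPDK-managed memory into its IOVA via the owning memseg. The
 * helper name buf_iova() is hypothetical.
 *
 *	static rte_iova_t
 *	buf_iova(const void *buf)
 *	{
 *		const struct rte_memseg *ms = rte_mem_virt2memseg(buf, NULL);
 *
 *		if (ms == NULL || ms->iova == RTE_BAD_IOVA)
 *			return RTE_BAD_IOVA;
 *		return ms->iova + RTE_PTR_DIFF(buf, ms->addr);
 *	}
 */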
392 
393 static int
394 physmem_size(const struct rte_memseg_list *msl, void *arg)
395 {
396 	uint64_t *total_len = arg;
397 
398 	if (msl->external)
399 		return 0;
400 
401 	*total_len += msl->memseg_arr.count * msl->page_sz;
402 
403 	return 0;
404 }
405 
406 /* get the total size of memory */
407 uint64_t
408 rte_eal_get_physmem_size(void)
409 {
410 	uint64_t total_len = 0;
411 
412 	rte_memseg_list_walk(physmem_size, &total_len);
413 
414 	return total_len;
415 }
416 
417 static int
418 dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
419 		void *arg)
420 {
421 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
422 	int msl_idx, ms_idx, fd;
423 	FILE *f = arg;
424 
425 	msl_idx = msl - mcfg->memsegs;
426 	if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS)
427 		return -1;
428 
429 	ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
430 	if (ms_idx < 0)
431 		return -1;
432 
433 	fd = eal_memalloc_get_seg_fd(msl_idx, ms_idx);
434 	fprintf(f, "Segment %i-%i: IOVA:0x%"PRIx64", len:%zu, "
435 			"virt:%p, socket_id:%"PRId32", "
436 			"hugepage_sz:%"PRIu64", nchannel:%"PRIx32", "
437 			"nrank:%"PRIx32" fd:%i\n",
438 			msl_idx, ms_idx,
439 			ms->iova,
440 			ms->len,
441 			ms->addr,
442 			ms->socket_id,
443 			ms->hugepage_sz,
444 			ms->nchannel,
445 			ms->nrank,
446 			fd);
447 
448 	return 0;
449 }
450 
451 /*
452  * Defined here because these functions are declared in rte_memory.h, but the actual
453  * implementation is in eal_common_memalloc.c, like all other memalloc internals.
454  */
455 int
456 rte_mem_event_callback_register(const char *name, rte_mem_event_callback_t clb,
457 		void *arg)
458 {
459 	const struct internal_config *internal_conf =
460 		eal_get_internal_configuration();
461 
462 	/* FreeBSD boots with legacy mem enabled by default */
463 	if (internal_conf->legacy_mem) {
464 		RTE_LOG(DEBUG, EAL, "Registering mem event callbacks not supported\n");
465 		rte_errno = ENOTSUP;
466 		return -1;
467 	}
468 	return eal_memalloc_mem_event_callback_register(name, clb, arg);
469 }
470 
471 int
472 rte_mem_event_callback_unregister(const char *name, void *arg)
473 {
474 	const struct internal_config *internal_conf =
475 		eal_get_internal_configuration();
476 
477 	/* FreeBSD boots with legacy mem enabled by default */
478 	if (internal_conf->legacy_mem) {
479 		RTE_LOG(DEBUG, EAL, "Unregistering mem event callbacks not supported\n");
480 		rte_errno = ENOTSUP;
481 		return -1;
482 	}
483 	return eal_memalloc_mem_event_callback_unregister(name, arg);
484 }
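
/*
 * Usage sketch (hypothetical callback, illustration only): registering for
 * allocation/free notifications. Not available in legacy memory mode (see
 * the checks above). The callback and its registration name are assumptions.
 *
 *	static void
 *	mem_event_cb(enum rte_mem_event type, const void *addr, size_t len,
 *			void *arg __rte_unused)
 *	{
 *		RTE_LOG(DEBUG, EAL, "%s of %zu bytes at %p\n",
 *			type == RTE_MEM_EVENT_ALLOC ? "alloc" : "free",
 *			len, addr);
 *	}
 *
 *	if (rte_mem_event_callback_register("example-cb", mem_event_cb,
 *			NULL) < 0)
 *		return -1;
 */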
485 
486 int
487 rte_mem_alloc_validator_register(const char *name,
488 		rte_mem_alloc_validator_t clb, int socket_id, size_t limit)
489 {
490 	const struct internal_config *internal_conf =
491 		eal_get_internal_configuration();
492 
493 	/* FreeBSD boots with legacy mem enabled by default */
494 	if (internal_conf->legacy_mem) {
495 		RTE_LOG(DEBUG, EAL, "Registering mem alloc validators not supported\n");
496 		rte_errno = ENOTSUP;
497 		return -1;
498 	}
499 	return eal_memalloc_mem_alloc_validator_register(name, clb, socket_id,
500 			limit);
501 }
502 
503 int
504 rte_mem_alloc_validator_unregister(const char *name, int socket_id)
505 {
506 	const struct internal_config *internal_conf =
507 		eal_get_internal_configuration();
508 
509 	/* FreeBSD boots with legacy mem enabled by default */
510 	if (internal_conf->legacy_mem) {
511 		RTE_LOG(DEBUG, EAL, "Unregistering mem alloc validators not supported\n");
512 		rte_errno = ENOTSUP;
513 		return -1;
514 	}
515 	return eal_memalloc_mem_alloc_validator_unregister(name, socket_id);
516 }
517 
518 /* Dump the physical memory layout on console */
519 void
520 rte_dump_physmem_layout(FILE *f)
521 {
522 	rte_memseg_walk(dump_memseg, f);
523 }
524 
525 static int
526 check_iova(const struct rte_memseg_list *msl __rte_unused,
527 		const struct rte_memseg *ms, void *arg)
528 {
529 	uint64_t *mask = arg;
530 	rte_iova_t iova;
531 
532 	/* highest address within segment */
533 	iova = (ms->iova + ms->len) - 1;
534 	if (!(iova & *mask))
535 		return 0;
536 
537 	RTE_LOG(DEBUG, EAL, "memseg iova %"PRIx64", len %zx, out of range\n",
538 			    ms->iova, ms->len);
539 
540 	RTE_LOG(DEBUG, EAL, "\tusing dma mask %"PRIx64"\n", *mask);
541 	return 1;
542 }
543 
544 #define MAX_DMA_MASK_BITS 63
545 
546 /* check memseg iovas are within the required range based on dma mask */
547 static int
548 check_dma_mask(uint8_t maskbits, bool thread_unsafe)
549 {
550 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
551 	uint64_t mask;
552 	int ret;
553 
554 	/* Sanity check. We only allow mask widths that can be handled with
555 	 * 64-bit variables; any higher value is almost certainly wrong. */
556 	if (maskbits > MAX_DMA_MASK_BITS) {
557 		RTE_LOG(ERR, EAL, "wrong dma mask size %u (Max: %u)\n",
558 				   maskbits, MAX_DMA_MASK_BITS);
559 		return -1;
560 	}
561 
562 	/* create dma mask */
563 	mask = ~((1ULL << maskbits) - 1);
564 
565 	if (thread_unsafe)
566 		ret = rte_memseg_walk_thread_unsafe(check_iova, &mask);
567 	else
568 		ret = rte_memseg_walk(check_iova, &mask);
569 
570 	if (ret)
571 		/*
572 		 * DMA mask precludes hugepage usage.
573 		 * This device cannot be used and we do not need to keep
574 		 * the DMA mask.
575 		 */
576 		return 1;
577 
578 	/*
579 	 * We need to keep the most restrictive maskbits for checking
580 	 * potential dynamic memory allocations in the future.
581 	 */
582 	mcfg->dma_maskbits = mcfg->dma_maskbits == 0 ? maskbits :
583 			     RTE_MIN(mcfg->dma_maskbits, maskbits);
584 
585 	return 0;
586 }
587 
588 int
589 rte_mem_check_dma_mask(uint8_t maskbits)
590 {
591 	return check_dma_mask(maskbits, false);
592 }
593 
594 int
595 rte_mem_check_dma_mask_thread_unsafe(uint8_t maskbits)
596 {
597 	return check_dma_mask(maskbits, true);
598 }
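
/*
 * Usage sketch (illustrative only): a driver whose device can address, say,
 * only 40 bits of IOVA space would verify at probe time that all memory
 * currently fits that mask. The 40-bit width is an example value, not a
 * real device limit.
 *
 *	if (rte_mem_check_dma_mask(40) != 0) {
 *		RTE_LOG(ERR, EAL,
 *			"Device cannot address the allocated memory\n");
 *		return -1;
 *	}
 */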
599 
600 /*
601  * Set the DMA mask to use once memory initialization is done.
602  *
603  * This function should ONLY be used by code executed before memory
604  * initialization. PMDs should use rte_mem_check_dma_mask if the device
605  * has addressing limitations.
606  */
607 void
608 rte_mem_set_dma_mask(uint8_t maskbits)
609 {
610 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
611 
612 	mcfg->dma_maskbits = mcfg->dma_maskbits == 0 ? maskbits :
613 			     RTE_MIN(mcfg->dma_maskbits, maskbits);
614 }
615 
616 /* return the number of memory channels */
617 unsigned rte_memory_get_nchannel(void)
618 {
619 	return rte_eal_get_configuration()->mem_config->nchannel;
620 }
621 
622 /* return the number of memory ranks */
623 unsigned rte_memory_get_nrank(void)
624 {
625 	return rte_eal_get_configuration()->mem_config->nrank;
626 }
627 
628 static int
629 rte_eal_memdevice_init(void)
630 {
631 	struct rte_config *config;
632 	const struct internal_config *internal_conf;
633 
634 	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
635 		return 0;
636 
637 	internal_conf = eal_get_internal_configuration();
638 	config = rte_eal_get_configuration();
639 	config->mem_config->nchannel = internal_conf->force_nchannel;
640 	config->mem_config->nrank = internal_conf->force_nrank;
641 
642 	return 0;
643 }
644 
645 /* Lock page in physical memory and prevent from swapping. */
646 int
647 rte_mem_lock_page(const void *virt)
648 {
649 	uintptr_t virtual = (uintptr_t)virt;
650 	size_t page_size = rte_mem_page_size();
651 	uintptr_t aligned = RTE_PTR_ALIGN_FLOOR(virtual, page_size);
652 	return rte_mem_lock((void *)aligned, page_size);
653 }
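
/*
 * Usage sketch (illustrative only): pinning the page that holds an arbitrary
 * object so it cannot be swapped out. 'obj' is an assumption - any pointer
 * into pageable memory.
 *
 *	if (rte_mem_lock_page(obj) < 0)
 *		RTE_LOG(WARNING, EAL, "Could not lock page at %p\n", obj);
 */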
654 
655 int
656 rte_memseg_contig_walk_thread_unsafe(rte_memseg_contig_walk_t func, void *arg)
657 {
658 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
659 	int i, ms_idx, ret = 0;
660 
661 	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
662 		struct rte_memseg_list *msl = &mcfg->memsegs[i];
663 		const struct rte_memseg *ms;
664 		struct rte_fbarray *arr;
665 
666 		if (msl->memseg_arr.count == 0)
667 			continue;
668 
669 		arr = &msl->memseg_arr;
670 
671 		ms_idx = rte_fbarray_find_next_used(arr, 0);
672 		while (ms_idx >= 0) {
673 			int n_segs;
674 			size_t len;
675 
676 			ms = rte_fbarray_get(arr, ms_idx);
677 
678 			/* find how many more segments there are, starting with
679 			 * this one.
680 			 */
681 			n_segs = rte_fbarray_find_contig_used(arr, ms_idx);
682 			len = n_segs * msl->page_sz;
683 
684 			ret = func(msl, ms, len, arg);
685 			if (ret)
686 				return ret;
687 			ms_idx = rte_fbarray_find_next_used(arr,
688 					ms_idx + n_segs);
689 		}
690 	}
691 	return 0;
692 }
693 
694 int
695 rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg)
696 {
697 	int ret = 0;
698 
699 	/* do not allow allocations/frees/init while we iterate */
700 	rte_mcfg_mem_read_lock();
701 	ret = rte_memseg_contig_walk_thread_unsafe(func, arg);
702 	rte_mcfg_mem_read_unlock();
703 
704 	return ret;
705 }
706 
707 int
708 rte_memseg_walk_thread_unsafe(rte_memseg_walk_t func, void *arg)
709 {
710 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
711 	int i, ms_idx, ret = 0;
712 
713 	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
714 		struct rte_memseg_list *msl = &mcfg->memsegs[i];
715 		const struct rte_memseg *ms;
716 		struct rte_fbarray *arr;
717 
718 		if (msl->memseg_arr.count == 0)
719 			continue;
720 
721 		arr = &msl->memseg_arr;
722 
723 		ms_idx = rte_fbarray_find_next_used(arr, 0);
724 		while (ms_idx >= 0) {
725 			ms = rte_fbarray_get(arr, ms_idx);
726 			ret = func(msl, ms, arg);
727 			if (ret)
728 				return ret;
729 			ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1);
730 		}
731 	}
732 	return 0;
733 }
734 
735 int
736 rte_memseg_walk(rte_memseg_walk_t func, void *arg)
737 {
738 	int ret = 0;
739 
740 	/* do not allow allocations/frees/init while we iterate */
741 	rte_mcfg_mem_read_lock();
742 	ret = rte_memseg_walk_thread_unsafe(func, arg);
743 	rte_mcfg_mem_read_unlock();
744 
745 	return ret;
746 }
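
/*
 * Usage sketch (illustrative only): a walk callback summing the length of
 * all segments on one NUMA socket. Returning 0 continues the walk, a
 * positive value stops it early, and a negative value stops it and is
 * propagated back to the rte_memseg_walk() caller. The struct and function
 * names are made up.
 *
 *	struct socket_mem {
 *		int socket_id;
 *		size_t len;
 *	};
 *
 *	static int
 *	sum_socket_mem(const struct rte_memseg_list *msl __rte_unused,
 *			const struct rte_memseg *ms, void *arg)
 *	{
 *		struct socket_mem *sm = arg;
 *
 *		if (ms->socket_id == sm->socket_id)
 *			sm->len += ms->len;
 *		return 0;
 *	}
 *
 *	struct socket_mem sm = { .socket_id = 0, .len = 0 };
 *
 *	rte_memseg_walk(sum_socket_mem, &sm);
 */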
747 
748 int
749 rte_memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg)
750 {
751 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
752 	int i, ret = 0;
753 
754 	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
755 		struct rte_memseg_list *msl = &mcfg->memsegs[i];
756 
757 		if (msl->base_va == NULL)
758 			continue;
759 
760 		ret = func(msl, arg);
761 		if (ret)
762 			return ret;
763 	}
764 	return 0;
765 }
766 
767 int
768 rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg)
769 {
770 	int ret = 0;
771 
772 	/* do not allow allocations/frees/init while we iterate */
773 	rte_mcfg_mem_read_lock();
774 	ret = rte_memseg_list_walk_thread_unsafe(func, arg);
775 	rte_mcfg_mem_read_unlock();
776 
777 	return ret;
778 }
779 
780 int
781 rte_memseg_get_fd_thread_unsafe(const struct rte_memseg *ms)
782 {
783 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
784 	struct rte_memseg_list *msl;
785 	struct rte_fbarray *arr;
786 	int msl_idx, seg_idx, ret;
787 
788 	if (ms == NULL) {
789 		rte_errno = EINVAL;
790 		return -1;
791 	}
792 
793 	msl = rte_mem_virt2memseg_list(ms->addr);
794 	if (msl == NULL) {
795 		rte_errno = EINVAL;
796 		return -1;
797 	}
798 	arr = &msl->memseg_arr;
799 
800 	msl_idx = msl - mcfg->memsegs;
801 	seg_idx = rte_fbarray_find_idx(arr, ms);
802 
803 	if (!rte_fbarray_is_used(arr, seg_idx)) {
804 		rte_errno = ENOENT;
805 		return -1;
806 	}
807 
808 	/* segment fd API is not supported for external segments */
809 	if (msl->external) {
810 		rte_errno = ENOTSUP;
811 		return -1;
812 	}
813 
814 	ret = eal_memalloc_get_seg_fd(msl_idx, seg_idx);
815 	if (ret < 0) {
816 		rte_errno = -ret;
817 		ret = -1;
818 	}
819 	return ret;
820 }
821 
822 int
823 rte_memseg_get_fd(const struct rte_memseg *ms)
824 {
825 	int ret;
826 
827 	rte_mcfg_mem_read_lock();
828 	ret = rte_memseg_get_fd_thread_unsafe(ms);
829 	rte_mcfg_mem_read_unlock();
830 
831 	return ret;
832 }
833 
834 int
835 rte_memseg_get_fd_offset_thread_unsafe(const struct rte_memseg *ms,
836 		size_t *offset)
837 {
838 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
839 	struct rte_memseg_list *msl;
840 	struct rte_fbarray *arr;
841 	int msl_idx, seg_idx, ret;
842 
843 	if (ms == NULL || offset == NULL) {
844 		rte_errno = EINVAL;
845 		return -1;
846 	}
847 
848 	msl = rte_mem_virt2memseg_list(ms->addr);
849 	if (msl == NULL) {
850 		rte_errno = EINVAL;
851 		return -1;
852 	}
853 	arr = &msl->memseg_arr;
854 
855 	msl_idx = msl - mcfg->memsegs;
856 	seg_idx = rte_fbarray_find_idx(arr, ms);
857 
858 	if (!rte_fbarray_is_used(arr, seg_idx)) {
859 		rte_errno = ENOENT;
860 		return -1;
861 	}
862 
863 	/* segment fd API is not supported for external segments */
864 	if (msl->external) {
865 		rte_errno = ENOTSUP;
866 		return -1;
867 	}
868 
869 	ret = eal_memalloc_get_seg_fd_offset(msl_idx, seg_idx, offset);
870 	if (ret < 0) {
871 		rte_errno = -ret;
872 		ret = -1;
873 	}
874 	return ret;
875 }
876 
877 int
878 rte_memseg_get_fd_offset(const struct rte_memseg *ms, size_t *offset)
879 {
880 	int ret;
881 
882 	rte_mcfg_mem_read_lock();
883 	ret = rte_memseg_get_fd_offset_thread_unsafe(ms, offset);
884 	rte_mcfg_mem_read_unlock();
885 
886 	return ret;
887 }
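
/*
 * Usage sketch (illustrative only): retrieving the file descriptor and
 * offset backing a segment, e.g. to share the mapping with another process.
 * 'ms' is assumed to be a valid memseg, for instance one returned by
 * rte_mem_virt2memseg().
 *
 *	size_t offset;
 *	int fd = rte_memseg_get_fd(ms);
 *
 *	if (fd < 0 || rte_memseg_get_fd_offset(ms, &offset) < 0)
 *		return -1;
 *	(the segment contents start 'offset' bytes into 'fd')
 */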
888 
889 int
890 rte_extmem_register(void *va_addr, size_t len, rte_iova_t iova_addrs[],
891 		unsigned int n_pages, size_t page_sz)
892 {
893 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
894 	unsigned int socket_id, n;
895 	int ret = 0;
896 
897 	if (va_addr == NULL || page_sz == 0 || len == 0 ||
898 			!rte_is_power_of_2(page_sz) ||
899 			RTE_ALIGN(len, page_sz) != len ||
900 			((len / page_sz) != n_pages && iova_addrs != NULL) ||
901 			!rte_is_aligned(va_addr, page_sz)) {
902 		rte_errno = EINVAL;
903 		return -1;
904 	}
905 	rte_mcfg_mem_write_lock();
906 
907 	/* make sure the segment doesn't already exist */
908 	if (malloc_heap_find_external_seg(va_addr, len) != NULL) {
909 		rte_errno = EEXIST;
910 		ret = -1;
911 		goto unlock;
912 	}
913 
914 	/* get next available socket ID */
915 	socket_id = mcfg->next_socket_id;
916 	if (socket_id > INT32_MAX) {
917 		RTE_LOG(ERR, EAL, "Cannot assign new socket IDs\n");
918 		rte_errno = ENOSPC;
919 		ret = -1;
920 		goto unlock;
921 	}
922 
923 	/* we can create a new memseg */
924 	n = len / page_sz;
925 	if (malloc_heap_create_external_seg(va_addr, iova_addrs, n,
926 			page_sz, "extmem", socket_id) == NULL) {
927 		ret = -1;
928 		goto unlock;
929 	}
930 
931 	/* memseg list successfully created - increment next socket ID */
932 	mcfg->next_socket_id++;
933 unlock:
934 	rte_mcfg_mem_write_unlock();
935 	return ret;
936 }
937 
938 int
939 rte_extmem_unregister(void *va_addr, size_t len)
940 {
941 	struct rte_memseg_list *msl;
942 	int ret = 0;
943 
944 	if (va_addr == NULL || len == 0) {
945 		rte_errno = EINVAL;
946 		return -1;
947 	}
948 	rte_mcfg_mem_write_lock();
949 
950 	/* find our segment */
951 	msl = malloc_heap_find_external_seg(va_addr, len);
952 	if (msl == NULL) {
953 		rte_errno = ENOENT;
954 		ret = -1;
955 		goto unlock;
956 	}
957 
958 	ret = malloc_heap_destroy_external_seg(msl);
959 unlock:
960 	rte_mcfg_mem_write_unlock();
961 	return ret;
962 }
963 
964 static int
965 sync_memory(void *va_addr, size_t len, bool attach)
966 {
967 	struct rte_memseg_list *msl;
968 	int ret = 0;
969 
970 	if (va_addr == NULL || len == 0) {
971 		rte_errno = EINVAL;
972 		return -1;
973 	}
974 	rte_mcfg_mem_write_lock();
975 
976 	/* find our segment */
977 	msl = malloc_heap_find_external_seg(va_addr, len);
978 	if (msl == NULL) {
979 		rte_errno = ENOENT;
980 		ret = -1;
981 		goto unlock;
982 	}
983 	if (attach)
984 		ret = rte_fbarray_attach(&msl->memseg_arr);
985 	else
986 		ret = rte_fbarray_detach(&msl->memseg_arr);
987 
988 unlock:
989 	rte_mcfg_mem_write_unlock();
990 	return ret;
991 }
992 
993 int
994 rte_extmem_attach(void *va_addr, size_t len)
995 {
996 	return sync_memory(va_addr, len, true);
997 }
998 
999 int
1000 rte_extmem_detach(void *va_addr, size_t len)
1001 {
1002 	return sync_memory(va_addr, len, false);
1003 }
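
/*
 * Usage sketch (illustrative only): lifecycle of externally allocated
 * memory. The primary process registers the area (optionally with a
 * per-page IOVA table), secondary processes attach to it before use and
 * detach when done, and the primary finally unregisters it. 'ext_va' and
 * 'ext_len' are assumptions - a page-aligned buffer obtained elsewhere.
 *
 *	primary:
 *		if (rte_extmem_register(ext_va, ext_len, NULL, 0,
 *				RTE_PGSIZE_4K) < 0)
 *			return -1;
 *
 *	each secondary, before use:
 *		if (rte_extmem_attach(ext_va, ext_len) < 0)
 *			return -1;
 *
 *	teardown: rte_extmem_detach() in secondaries, then
 *	rte_extmem_unregister(ext_va, ext_len) in the primary.
 */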
1004 
1005 /* detach all EAL memory */
1006 int
1007 rte_eal_memory_detach(void)
1008 {
1009 	const struct internal_config *internal_conf =
1010 		eal_get_internal_configuration();
1011 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1012 	size_t page_sz = rte_mem_page_size();
1013 	unsigned int i;
1014 
1015 	if (internal_conf->in_memory == 1)
1016 		return 0;
1017 
1018 	rte_rwlock_write_lock(&mcfg->memory_hotplug_lock);
1019 
1020 	/* detach internal memory subsystem data first */
1021 	if (eal_memalloc_cleanup())
1022 		RTE_LOG(ERR, EAL, "Could not release memory subsystem data\n");
1023 
1024 	for (i = 0; i < RTE_DIM(mcfg->memsegs); i++) {
1025 		struct rte_memseg_list *msl = &mcfg->memsegs[i];
1026 
1027 		/* skip uninitialized segments */
1028 		if (msl->base_va == NULL)
1029 			continue;
1030 		/*
1031 		 * External segments are supposed to be detached at this point,
1032 		 * but if they aren't, we can't really do anything about it,
1033 		 * because if we skip them here, they'll become invalid after
1034 		 * we unmap the memconfig anyway. However, if this is externally
1035 		 * referenced memory, we have no business unmapping it.
1036 		 */
1037 		if (!msl->external)
1038 			if (rte_mem_unmap(msl->base_va, msl->len) != 0)
1039 				RTE_LOG(ERR, EAL, "Could not unmap memory: %s\n",
1040 						rte_strerror(rte_errno));
1041 
1042 		/*
1043 		 * we are detaching the fbarray rather than destroying because
1044 		 * other processes might still reference this fbarray, and we
1045 		 * have no way of knowing if they still do.
1046 		 */
1047 		if (rte_fbarray_detach(&msl->memseg_arr))
1048 			RTE_LOG(ERR, EAL, "Could not detach fbarray: %s\n",
1049 					rte_strerror(rte_errno));
1050 	}
1051 	rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock);
1052 
1053 	/*
1054 	 * we've detached the memseg lists, so we can unmap the shared mem
1055 	 * config - we can't zero it out because it might still be referenced
1056 	 * by other processes.
1057 	 */
1058 	if (internal_conf->no_shconf == 0 && mcfg->mem_cfg_addr != 0) {
1059 		if (rte_mem_unmap(mcfg, RTE_ALIGN(sizeof(*mcfg), page_sz)) != 0)
1060 			RTE_LOG(ERR, EAL, "Could not unmap shared memory config: %s\n",
1061 					rte_strerror(rte_errno));
1062 	}
1063 	rte_eal_get_configuration()->mem_config = NULL;
1064 
1065 	return 0;
1066 }
1067 
1068 /* init memory subsystem */
1069 int
1070 rte_eal_memory_init(void)
1071 {
1072 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1073 	const struct internal_config *internal_conf =
1074 		eal_get_internal_configuration();
1075 
1076 	int retval;
1077 	RTE_LOG(DEBUG, EAL, "Setting up physically contiguous memory...\n");
1078 
1079 	if (!mcfg)
1080 		return -1;
1081 
1082 	/* lock mem hotplug here, to prevent races while we init */
1083 	rte_mcfg_mem_read_lock();
1084 
1085 	if (rte_eal_memseg_init() < 0)
1086 		goto fail;
1087 
1088 	if (eal_memalloc_init() < 0)
1089 		goto fail;
1090 
1091 	retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
1092 			rte_eal_hugepage_init() :
1093 			rte_eal_hugepage_attach();
1094 	if (retval < 0)
1095 		goto fail;
1096 
1097 	if (internal_conf->no_shconf == 0 && rte_eal_memdevice_init() < 0)
1098 		goto fail;
1099 
1100 	return 0;
1101 fail:
1102 	rte_mcfg_mem_read_unlock();
1103 	return -1;
1104 }
1105