/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation.
 * Copyright(c) 2013 6WIND S.A.
 */

#include <inttypes.h>
#include <stdlib.h>
#include <string.h>

#include <rte_log.h>
#include <rte_string_fns.h>

#include "eal_internal_cfg.h"
#include "eal_memalloc.h"
#include "eal_memcfg.h"
#include "eal_private.h"

/** @file Functions common to EALs that support dynamic memory allocation. */

int
eal_dynmem_memseg_lists_init(void)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct memtype {
		uint64_t page_sz;
		int socket_id;
	} *memtypes = NULL;
	int i, hpi_idx, msl_idx, ret = -1; /* fail unless told to succeed */
	struct rte_memseg_list *msl;
	uint64_t max_mem, max_mem_per_type;
	unsigned int max_seglists_per_type;
	unsigned int n_memtypes, cur_type;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* no-huge does not need this at all */
	if (internal_conf->no_hugetlbfs)
		return 0;

	/*
	 * figuring out the amount of memory we're going to have is a long and
	 * very involved process. the basic element we're operating with is a
	 * memory type, defined as a combination of NUMA node ID and page size
	 * (so that e.g. 2 sockets with 2 page sizes yield 4 memory types in
	 * total).
	 *
	 * deciding the amount of memory going towards each memory type is a
	 * balancing act between maximum segments per type, maximum memory per
	 * type, and the number of detected NUMA nodes. the goal is to make
	 * sure each memory type gets at least one memseg list.
	 *
	 * the total amount of memory is limited by the RTE_MAX_MEM_MB value.
	 *
	 * the total amount of memory per type is limited by either
	 * RTE_MAX_MEM_MB_PER_TYPE, or by RTE_MAX_MEM_MB divided by the number
	 * of detected NUMA nodes. additionally, the maximum number of segments
	 * per type is also limited by RTE_MAX_MEMSEG_PER_TYPE. this is because
	 * for smaller page sizes it can take hundreds of thousands of segments
	 * to reach the above-specified per-type memory limits.
	 *
	 * additionally, each type may have multiple memseg lists associated
	 * with it, each limited by either RTE_MAX_MEM_MB_PER_LIST for bigger
	 * page sizes, or RTE_MAX_MEMSEG_PER_LIST segments for smaller ones.
	 *
	 * the number of memseg lists per type is decided based on the above
	 * limits, and also takes the number of detected NUMA nodes into
	 * account, to make sure that we don't run out of memseg lists before
	 * we populate all NUMA nodes with memory.
	 *
	 * we do this in three stages. first, we collect the number of memory
	 * types. then, we figure out memory constraints and populate the list
	 * of would-be memseg lists. then, we go ahead and allocate the memseg
	 * lists.
	 */

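	/*
	 * illustrative example (hypothetical numbers, not taken from any
	 * particular build): with 2 NUMA nodes and two hugepage sizes (2M and
	 * 1G) there are 4 memory types. each type is then allowed at most
	 * min(RTE_MAX_MEM_MB_PER_TYPE, RTE_MAX_MEM_MB / 4) megabytes of
	 * memory, and at most RTE_MAX_MEMSEG_LISTS / 4 memseg lists.
	 */
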
	/* create space for mem types */
	n_memtypes = internal_conf->num_hugepage_sizes * rte_socket_count();
	memtypes = calloc(n_memtypes, sizeof(*memtypes));
	if (memtypes == NULL) {
		EAL_LOG(ERR, "Cannot allocate space for memory types");
		return -1;
	}

	/* populate mem types */
	cur_type = 0;
	for (hpi_idx = 0; hpi_idx < (int) internal_conf->num_hugepage_sizes;
			hpi_idx++) {
		struct hugepage_info *hpi;
		uint64_t hugepage_sz;

		hpi = &internal_conf->hugepage_info[hpi_idx];
		hugepage_sz = hpi->hugepage_sz;

		for (i = 0; i < (int) rte_socket_count(); i++, cur_type++) {
			int socket_id = rte_socket_id_by_idx(i);

#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
			/* in legacy mode we can still sort pages by socket,
			 * so keep all sockets; otherwise, without NUMA-aware
			 * hugepage support, only the first socket is used.
			 */
			if (!internal_conf->legacy_mem && socket_id > 0)
				break;
#endif
			memtypes[cur_type].page_sz = hugepage_sz;
			memtypes[cur_type].socket_id = socket_id;

			EAL_LOG(DEBUG, "Detected memory type: "
				"socket_id:%u hugepage_sz:%" PRIu64,
				socket_id, hugepage_sz);
		}
	}
	/* number of memtypes could have been lower due to no NUMA support */
	n_memtypes = cur_type;

	/* set up limits for types */
	max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
	max_mem_per_type = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20,
			max_mem / n_memtypes);
	/*
	 * limit maximum number of segment lists per type to ensure there's
	 * space for memseg lists for all NUMA nodes with all page sizes
	 */
	max_seglists_per_type = RTE_MAX_MEMSEG_LISTS / n_memtypes;

	if (max_seglists_per_type == 0) {
		EAL_LOG(ERR, "Cannot accommodate all memory types, please increase RTE_MAX_MEMSEG_LISTS");
		goto out;
	}

	/* go through all mem types and create segment lists */
	msl_idx = 0;
	for (cur_type = 0; cur_type < n_memtypes; cur_type++) {
		unsigned int cur_seglist, n_seglists, n_segs;
		unsigned int max_segs_per_type, max_segs_per_list;
		struct memtype *type = &memtypes[cur_type];
		uint64_t max_mem_per_list, pagesz;
		int socket_id;

		pagesz = type->page_sz;
		socket_id = type->socket_id;

		/*
		 * we need to create segment lists for this type. we must take
		 * into account the following things:
		 *
		 * 1. total amount of memory we can use for this memory type
		 * 2. total amount of memory per memseg list allowed
		 * 3. number of segments needed to fit the amount of memory
		 * 4. number of segments allowed per type
		 * 5. number of segments allowed per memseg list
		 * 6. number of memseg lists we are allowed to take up
		 */

		/* calculate how many segments we will need in total */
		max_segs_per_type = max_mem_per_type / pagesz;
		/* limit number of segments to maximum allowed per type */
		max_segs_per_type = RTE_MIN(max_segs_per_type,
				(unsigned int)RTE_MAX_MEMSEG_PER_TYPE);
		/* limit number of segments to maximum allowed per list */
		max_segs_per_list = RTE_MIN(max_segs_per_type,
				(unsigned int)RTE_MAX_MEMSEG_PER_LIST);

		/* calculate how much memory we can have per segment list */
		max_mem_per_list = RTE_MIN(max_segs_per_list * pagesz,
				(uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20);

		/* calculate how many segments each segment list will have */
		n_segs = RTE_MIN(max_segs_per_list, max_mem_per_list / pagesz);

		/* calculate how many segment lists we can have */
		n_seglists = RTE_MIN(max_segs_per_type / n_segs,
				max_mem_per_type / max_mem_per_list);

		/* limit number of segment lists according to our maximum */
		n_seglists = RTE_MIN(n_seglists, max_seglists_per_type);

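		/*
		 * worked example with hypothetical limits (the real values
		 * come from build-time configuration): pagesz = 2M,
		 * max_mem_per_type = 64G, RTE_MAX_MEMSEG_PER_TYPE = 32768,
		 * RTE_MAX_MEMSEG_PER_LIST = 8192, RTE_MAX_MEM_MB_PER_LIST =
		 * 32768 (32G):
		 *
		 *   max_segs_per_type = min(64G / 2M, 32768) = 32768
		 *   max_segs_per_list = min(32768, 8192)     = 8192
		 *   max_mem_per_list  = min(8192 * 2M, 32G)  = 16G
		 *   n_segs            = min(8192, 16G / 2M)  = 8192
		 *   n_seglists        = min(32768 / 8192, 64G / 16G) = 4
		 */
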
		EAL_LOG(DEBUG, "Creating %i segment lists: "
				"n_segs:%i socket_id:%i hugepage_sz:%" PRIu64,
			n_seglists, n_segs, socket_id, pagesz);

		/* create all segment lists */
		for (cur_seglist = 0; cur_seglist < n_seglists; cur_seglist++) {
			if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
				EAL_LOG(ERR,
					"No more space in memseg lists, please increase RTE_MAX_MEMSEG_LISTS");
				goto out;
			}
			msl = &mcfg->memsegs[msl_idx++];

			if (eal_memseg_list_init(msl, pagesz, n_segs,
					socket_id, cur_seglist, true))
				goto out;

			if (eal_memseg_list_alloc(msl, 0)) {
				EAL_LOG(ERR, "Cannot allocate VA space for memseg list");
				goto out;
			}
		}
	}
	/* we're successful */
	ret = 0;
out:
	free(memtypes);
	return ret;
}

static int __rte_unused
hugepage_count_walk(const struct rte_memseg_list *msl, void *arg)
{
	struct hugepage_info *hpi = arg;

	if (msl->page_sz != hpi->hugepage_sz)
		return 0;

	hpi->num_pages[msl->socket_id] += msl->memseg_arr.len;
	return 0;
}

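/*
 * Registered below as the "socket-limit" allocation validator; it
 * unconditionally returns -1, so any allocation that would push a socket
 * past its configured --socket-limit is refused.
 */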
static int
limits_callback(int socket_id, size_t cur_limit, size_t new_len)
{
	RTE_SET_USED(socket_id);
	RTE_SET_USED(cur_limit);
	RTE_SET_USED(new_len);
	return -1;
}

int
eal_dynmem_hugepage_init(void)
{
	struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
	uint64_t memory[RTE_MAX_NUMA_NODES];
	int hp_sz_idx, socket_id;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	memset(used_hp, 0, sizeof(used_hp));

	for (hp_sz_idx = 0;
			hp_sz_idx < (int) internal_conf->num_hugepage_sizes;
			hp_sz_idx++) {
#ifndef RTE_ARCH_64
		struct hugepage_info dummy;
		unsigned int i;
#endif
		/* also initialize the hugepage sizes in used_hp */
		struct hugepage_info *hpi;
		hpi = &internal_conf->hugepage_info[hp_sz_idx];
		used_hp[hp_sz_idx].hugepage_sz = hpi->hugepage_sz;

#ifndef RTE_ARCH_64
		/* for 32-bit, limit number of pages on socket to whatever we've
		 * preallocated, as we cannot allocate more.
		 */
		memset(&dummy, 0, sizeof(dummy));
		dummy.hugepage_sz = hpi->hugepage_sz;
		/* memory_hotplug_lock is held during initialization, so it's
		 * safe to call thread-unsafe version.
		 */
		if (rte_memseg_list_walk_thread_unsafe(hugepage_count_walk, &dummy) < 0)
			return -1;

		for (i = 0; i < RTE_DIM(dummy.num_pages); i++) {
			hpi->num_pages[i] = RTE_MIN(hpi->num_pages[i],
					dummy.num_pages[i]);
		}
#endif
	}

	/* make a copy of socket_mem, needed for balanced allocation. */
	for (hp_sz_idx = 0; hp_sz_idx < RTE_MAX_NUMA_NODES; hp_sz_idx++)
		memory[hp_sz_idx] = internal_conf->socket_mem[hp_sz_idx];

	/* calculate final number of pages */
	if (eal_dynmem_calc_num_pages_per_socket(memory,
			internal_conf->hugepage_info, used_hp,
			internal_conf->num_hugepage_sizes) < 0)
		return -1;

	for (hp_sz_idx = 0;
			hp_sz_idx < (int)internal_conf->num_hugepage_sizes;
			hp_sz_idx++) {
		for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES;
				socket_id++) {
			struct rte_memseg **pages;
			struct hugepage_info *hpi = &used_hp[hp_sz_idx];
			unsigned int num_pages = hpi->num_pages[socket_id];
			unsigned int num_pages_alloc;

			if (num_pages == 0)
				continue;

			EAL_LOG(DEBUG,
				"Allocating %u pages of size %" PRIu64 "M "
				"on socket %i",
				num_pages, hpi->hugepage_sz >> 20, socket_id);

			/* we may not be able to allocate all pages in one go,
			 * because we break up our memory map into multiple
			 * memseg lists. therefore, try allocating multiple
			 * times and see if we can get the desired number of
			 * pages from multiple allocations.
			 */

			num_pages_alloc = 0;
			do {
				int i, cur_pages, needed;

				needed = num_pages - num_pages_alloc;

				pages = malloc(sizeof(*pages) * needed);
				if (pages == NULL) {
					EAL_LOG(ERR, "Failed to malloc pages");
					return -1;
				}

				/* do not request exact number of pages */
				cur_pages = eal_memalloc_alloc_seg_bulk(pages,
						needed, hpi->hugepage_sz,
						socket_id, false);
				if (cur_pages <= 0) {
					free(pages);
					return -1;
				}

				/* mark preallocated pages as unfreeable */
				for (i = 0; i < cur_pages; i++) {
					struct rte_memseg *ms = pages[i];
					ms->flags |=
						RTE_MEMSEG_FLAG_DO_NOT_FREE;
				}
				free(pages);

				num_pages_alloc += cur_pages;
			} while (num_pages_alloc != num_pages);
		}
	}

	/* if socket limits were specified, set them */
	if (internal_conf->force_socket_limits) {
		unsigned int i;
		for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
			uint64_t limit = internal_conf->socket_limit[i];
			if (limit == 0)
				continue;
			if (rte_mem_alloc_validator_register("socket-limit",
					limits_callback, i, limit))
				EAL_LOG(ERR, "Failed to register socket limits validator callback");
		}
	}
	return 0;
}

__rte_unused /* function is unused on 32-bit builds */
static inline uint64_t
get_socket_mem_size(int socket)
{
	uint64_t size = 0;
	unsigned int i;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	for (i = 0; i < internal_conf->num_hugepage_sizes; i++) {
		struct hugepage_info *hpi = &internal_conf->hugepage_info[i];
		size += hpi->hugepage_sz * hpi->num_pages[socket];
	}

	return size;
}

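/*
 * Decide how many pages of each size to take from each socket in order to
 * satisfy the per-socket requests in memory[] (in bytes), filling in
 * hp_used. Returns the total number of pages selected, or -1 if the
 * requested memory cannot be satisfied.
 */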
int
eal_dynmem_calc_num_pages_per_socket(
	uint64_t *memory, struct hugepage_info *hp_info,
	struct hugepage_info *hp_used, unsigned int num_hp_info)
{
	unsigned int socket, j, i = 0;
	unsigned int requested, available;
	int total_num_pages = 0;
	uint64_t remaining_mem, cur_mem;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();
	uint64_t total_mem = internal_conf->memory;

	if (num_hp_info == 0)
		return -1;

	/* if specific memory amounts per socket weren't requested */
	if (internal_conf->force_sockets == 0) {
		size_t total_size;
#ifdef RTE_ARCH_64
		int cpu_per_socket[RTE_MAX_NUMA_NODES];
		size_t default_size;
		unsigned int lcore_id;

		/* Compute number of cores per socket */
		memset(cpu_per_socket, 0, sizeof(cpu_per_socket));
		RTE_LCORE_FOREACH(lcore_id) {
			cpu_per_socket[rte_lcore_to_socket_id(lcore_id)]++;
		}

		/*
		 * Automatically spread requested memory amongst detected
		 * sockets according to number of cores from CPU mask present
		 * on each socket.
		 */
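		/*
		 * illustrative example (hypothetical numbers): with 8 lcores,
		 * 6 on socket 0 and 2 on socket 1, a 4G request is split into
		 * 3G for socket 0 and 1G for socket 1 here, each clamped to
		 * what the socket actually has; the loop further below then
		 * hands any remainder to whichever sockets still have free
		 * hugepage memory.
		 */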
		total_size = internal_conf->memory;
		for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;
				socket++) {

			/* Set memory amount per socket */
			default_size = internal_conf->memory *
				cpu_per_socket[socket] / rte_lcore_count();

			/* Limit to maximum available memory on socket */
			default_size = RTE_MIN(
				default_size, get_socket_mem_size(socket));

			/* Update sizes */
			memory[socket] = default_size;
			total_size -= default_size;
		}

		/*
		 * If some memory is remaining, try to allocate it by getting
		 * all available memory from sockets, one after the other.
		 */
		for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;
				socket++) {
			/* take whatever is available */
			default_size = RTE_MIN(
				get_socket_mem_size(socket) - memory[socket],
				total_size);

			/* Update sizes */
			memory[socket] += default_size;
			total_size -= default_size;
		}
#else
		/* in 32-bit mode, allocate all of the memory only on main
		 * lcore socket
		 */
		total_size = internal_conf->memory;
		for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;
				socket++) {
			struct rte_config *cfg = rte_eal_get_configuration();
			unsigned int main_lcore_socket;

			main_lcore_socket =
				rte_lcore_to_socket_id(cfg->main_lcore);

			if (main_lcore_socket != socket)
				continue;

			/* Update sizes */
			memory[socket] = total_size;
			break;
		}
#endif
	}

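	/*
	 * Fill each socket's request from the available hugepage sizes, one
	 * size at a time; if the remaining sizes cannot cover what is left,
	 * round up by one extra page of the current size. Any shortfall left
	 * after this is reported as an error below.
	 */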
	for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0;
			socket++) {
		/* loop below is skipped if no memory was requested on this socket */
		for (i = 0; i < num_hp_info && memory[socket] != 0; i++) {
			rte_strscpy(hp_used[i].hugedir, hp_info[i].hugedir,
				sizeof(hp_used[i].hugedir));
			hp_used[i].num_pages[socket] = RTE_MIN(
					memory[socket] / hp_info[i].hugepage_sz,
					hp_info[i].num_pages[socket]);

			cur_mem = hp_used[i].num_pages[socket] *
					hp_used[i].hugepage_sz;

			memory[socket] -= cur_mem;
			total_mem -= cur_mem;

			total_num_pages += hp_used[i].num_pages[socket];

			/* check if we have met all memory requests */
			if (memory[socket] == 0)
				break;

			/* If we have used up all the pages at this size,
			 * move on to the next size.
			 */
			if (hp_used[i].num_pages[socket] ==
					hp_info[i].num_pages[socket])
				continue;
			/* At this point we know that there are more pages
			 * available that are bigger than the memory we want,
			 * so let's see if we can get enough from other page
			 * sizes.
			 */
			remaining_mem = 0;
			for (j = i+1; j < num_hp_info; j++)
				remaining_mem += hp_info[j].hugepage_sz *
				hp_info[j].num_pages[socket];

			/* Is there enough other memory?
			 * If not, allocate another page and quit.
			 */
			if (remaining_mem < memory[socket]) {
				cur_mem = RTE_MIN(
					memory[socket], hp_info[i].hugepage_sz);
				memory[socket] -= cur_mem;
				total_mem -= cur_mem;
				hp_used[i].num_pages[socket]++;
				total_num_pages++;
				break; /* we are done with this socket */
			}
		}

		/* if we didn't satisfy all memory requirements per socket */
		if (memory[socket] > 0 &&
				internal_conf->socket_mem[socket] != 0) {
			/* to prevent icc errors */
			requested = (unsigned int)(
				internal_conf->socket_mem[socket] / 0x100000);
			available = requested -
				((unsigned int)(memory[socket] / 0x100000));
			EAL_LOG(ERR, "Not enough memory available on "
				"socket %u! Requested: %uMB, available: %uMB",
				socket, requested, available);
			return -1;
		}
	}

	/* if we didn't satisfy total memory requirements */
	if (total_mem > 0) {
		requested = (unsigned int)(internal_conf->memory / 0x100000);
		available = requested - (unsigned int)(total_mem / 0x100000);
		EAL_LOG(ERR, "Not enough memory available! "
			"Requested: %uMB, available: %uMB",
			requested, available);
		return -1;
	}
	return total_num_pages;
}