/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation.
 * Copyright(c) 2013 6WIND S.A.
 */

#include <inttypes.h>
#include <stdlib.h>
#include <string.h>

#include <rte_log.h>
#include <rte_string_fns.h>

#include "eal_internal_cfg.h"
#include "eal_memalloc.h"
#include "eal_memcfg.h"
#include "eal_private.h"

/** @file Functions common to EALs that support dynamic memory allocation. */

int
eal_dynmem_memseg_lists_init(void)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct memtype {
		uint64_t page_sz;
		int socket_id;
	} *memtypes = NULL;
	int i, hpi_idx, msl_idx, ret = -1; /* fail unless told to succeed */
	struct rte_memseg_list *msl;
	uint64_t max_mem, max_mem_per_type;
	unsigned int max_seglists_per_type;
	unsigned int n_memtypes, cur_type;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* no-huge does not need this at all */
	if (internal_conf->no_hugetlbfs)
		return 0;

	/*
	 * figuring out amount of memory we're going to have is a long and very
	 * involved process. the basic element we're operating with is a memory
	 * type, defined as a combination of NUMA node ID and page size (so that
	 * e.g. 2 sockets with 2 page sizes yield 4 memory types in total).
	 *
	 * deciding amount of memory going towards each memory type is a
	 * balancing act between maximum segments per type, maximum memory per
	 * type, and number of detected NUMA nodes. the goal is to make sure
	 * each memory type gets at least one memseg list.
	 *
	 * the total amount of memory is limited by RTE_MAX_MEM_MB value.
	 *
	 * the total amount of memory per type is limited by either
	 * RTE_MAX_MEM_MB_PER_TYPE, or by RTE_MAX_MEM_MB divided by the number
	 * of detected NUMA nodes. additionally, maximum number of segments per
	 * type is also limited by RTE_MAX_MEMSEG_PER_TYPE. this is because for
	 * smaller page sizes, it can take hundreds of thousands of segments to
	 * reach the above specified per-type memory limits.
	 *
	 * additionally, each type may have multiple memseg lists associated
	 * with it, each limited by either RTE_MAX_MEM_MB_PER_LIST for bigger
	 * page sizes, or RTE_MAX_MEMSEG_PER_LIST segments for smaller ones.
	 *
	 * the number of memseg lists per type is decided based on the above
	 * limits, and also taking number of detected NUMA nodes, to make sure
	 * that we don't run out of memseg lists before we populate all NUMA
	 * nodes with memory.
	 *
	 * we do this in three stages. first, we collect the number of types.
	 * then, we figure out memory constraints and populate the list of
	 * would-be memseg lists. then, we go ahead and allocate the memseg
	 * lists.
	 */
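
	/*
	 * Illustrative example (not normative; it assumes the common 64-bit
	 * build-time defaults RTE_MAX_MEM_MB=524288,
	 * RTE_MAX_MEM_MB_PER_TYPE=65536 and RTE_MAX_MEMSEG_LISTS=128):
	 * a system with 2 NUMA nodes and 2 hugepage sizes has 4 memory
	 * types, so each type is capped at min(65536 MB, 524288 MB / 4) =
	 * 65536 MB of memory and at most 128 / 4 = 32 memseg lists.
	 */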

	/* create space for mem types */
	n_memtypes = internal_conf->num_hugepage_sizes * rte_socket_count();
	memtypes = calloc(n_memtypes, sizeof(*memtypes));
	if (memtypes == NULL) {
		EAL_LOG(ERR, "Cannot allocate space for memory types");
		return -1;
	}

	/* populate mem types */
	cur_type = 0;
	for (hpi_idx = 0; hpi_idx < (int) internal_conf->num_hugepage_sizes;
			hpi_idx++) {
		struct hugepage_info *hpi;
		uint64_t hugepage_sz;

		hpi = &internal_conf->hugepage_info[hpi_idx];
		hugepage_sz = hpi->hugepage_sz;

		for (i = 0; i < (int) rte_socket_count(); i++, cur_type++) {
			int socket_id = rte_socket_id_by_idx(i);

#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
			/* we can still sort pages by socket in legacy mode */
			if (!internal_conf->legacy_mem && socket_id > 0)
				break;
#endif
			memtypes[cur_type].page_sz = hugepage_sz;
			memtypes[cur_type].socket_id = socket_id;

			EAL_LOG(DEBUG, "Detected memory type: "
				"socket_id:%u hugepage_sz:%" PRIu64,
				socket_id, hugepage_sz);
		}
	}
	/* number of memtypes could have been lower due to no NUMA support */
	n_memtypes = cur_type;

	/* set up limits for types */
	max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
	max_mem_per_type = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20,
			max_mem / n_memtypes);
	/*
	 * limit maximum number of segment lists per type to ensure there's
	 * space for memseg lists for all NUMA nodes with all page sizes
	 */
	max_seglists_per_type = RTE_MAX_MEMSEG_LISTS / n_memtypes;

	if (max_seglists_per_type == 0) {
		EAL_LOG(ERR, "Cannot accommodate all memory types, please increase RTE_MAX_MEMSEG_LISTS");
		goto out;
	}

	/* go through all mem types and create segment lists */
	msl_idx = 0;
	for (cur_type = 0; cur_type < n_memtypes; cur_type++) {
		unsigned int cur_seglist, n_seglists, n_segs;
		unsigned int max_segs_per_type, max_segs_per_list;
		struct memtype *type = &memtypes[cur_type];
		uint64_t max_mem_per_list, pagesz;
		int socket_id;

		pagesz = type->page_sz;
		socket_id = type->socket_id;

		/*
		 * we need to create segment lists for this type. we must take
		 * into account the following things:
		 *
		 * 1. total amount of memory we can use for this memory type
		 * 2. total amount of memory per memseg list allowed
		 * 3. number of segments needed to fit the amount of memory
		 * 4. number of segments allowed per type
		 * 5. number of segments allowed per memseg list
		 * 6. number of memseg lists we are allowed to take up
		 */
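
		/*
		 * Worked example of the calculation below (illustrative only;
		 * assumes the common 64-bit defaults
		 * RTE_MAX_MEMSEG_PER_TYPE=32768, RTE_MAX_MEMSEG_PER_LIST=8192
		 * and RTE_MAX_MEM_MB_PER_LIST=32768): with 2 MiB pages and a
		 * 64 GiB per-type limit, max_segs_per_type = 32768,
		 * max_segs_per_list = 8192, max_mem_per_list =
		 * min(8192 * 2 MiB, 32 GiB) = 16 GiB, n_segs = 8192 and
		 * n_seglists = min(32768 / 8192, 64 GiB / 16 GiB) = 4,
		 * i.e. four lists of 8192 two-megabyte segments.
		 */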

		/* calculate how many segments we will need in total */
		max_segs_per_type = max_mem_per_type / pagesz;
		/* limit number of segments to maximum allowed per type */
		max_segs_per_type = RTE_MIN(max_segs_per_type,
				(unsigned int)RTE_MAX_MEMSEG_PER_TYPE);
		/* limit number of segments to maximum allowed per list */
		max_segs_per_list = RTE_MIN(max_segs_per_type,
				(unsigned int)RTE_MAX_MEMSEG_PER_LIST);

		/* calculate how much memory we can have per segment list */
		max_mem_per_list = RTE_MIN(max_segs_per_list * pagesz,
				(uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20);

		/* calculate how many segments each segment list will have */
		n_segs = RTE_MIN(max_segs_per_list, max_mem_per_list / pagesz);

		/* calculate how many segment lists we can have */
		n_seglists = RTE_MIN(max_segs_per_type / n_segs,
				max_mem_per_type / max_mem_per_list);

		/* limit number of segment lists according to our maximum */
		n_seglists = RTE_MIN(n_seglists, max_seglists_per_type);

		EAL_LOG(DEBUG, "Creating %i segment lists: "
				"n_segs:%i socket_id:%i hugepage_sz:%" PRIu64,
			n_seglists, n_segs, socket_id, pagesz);

		/* create all segment lists */
		for (cur_seglist = 0; cur_seglist < n_seglists; cur_seglist++) {
			if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
				EAL_LOG(ERR,
					"No more space in memseg lists, please increase RTE_MAX_MEMSEG_LISTS");
				goto out;
			}
			msl = &mcfg->memsegs[msl_idx++];

			if (eal_memseg_list_init(msl, pagesz, n_segs,
					socket_id, cur_seglist, true))
				goto out;

			if (eal_memseg_list_alloc(msl, 0)) {
				EAL_LOG(ERR, "Cannot allocate VA space for memseg list");
				goto out;
			}
		}
	}
	/* we're successful */
	ret = 0;
out:
	free(memtypes);
	return ret;
}

static int __rte_unused
hugepage_count_walk(const struct rte_memseg_list *msl, void *arg)
{
	struct hugepage_info *hpi = arg;

	if (msl->page_sz != hpi->hugepage_sz)
		return 0;

	hpi->num_pages[msl->socket_id] += msl->memseg_arr.len;
	return 0;
}
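
/*
 * Validator callback registered for each socket by eal_dynmem_hugepage_init()
 * below when per-socket limits are enforced (force_socket_limits). Returning
 * -1 unconditionally rejects any allocation that would take a socket past its
 * configured limit.
 */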
static int
limits_callback(int socket_id, size_t cur_limit, size_t new_len)
{
	RTE_SET_USED(socket_id);
	RTE_SET_USED(cur_limit);
	RTE_SET_USED(new_len);
	return -1;
}

int
eal_dynmem_hugepage_init(void)
{
	struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
	uint64_t memory[RTE_MAX_NUMA_NODES];
	int hp_sz_idx, socket_id;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	memset(used_hp, 0, sizeof(used_hp));

	for (hp_sz_idx = 0;
			hp_sz_idx < (int) internal_conf->num_hugepage_sizes;
			hp_sz_idx++) {
#ifndef RTE_ARCH_64
		struct hugepage_info dummy;
		unsigned int i;
#endif
		/* also initialize hugepage sizes in used_hp */
		struct hugepage_info *hpi;
		hpi = &internal_conf->hugepage_info[hp_sz_idx];
		used_hp[hp_sz_idx].hugepage_sz = hpi->hugepage_sz;

#ifndef RTE_ARCH_64
		/* for 32-bit, limit number of pages on socket to whatever we've
		 * preallocated, as we cannot allocate more.
		 */
		memset(&dummy, 0, sizeof(dummy));
		dummy.hugepage_sz = hpi->hugepage_sz;
		/* memory_hotplug_lock is held during initialization, so it's
		 * safe to call thread-unsafe version.
		 */
		if (rte_memseg_list_walk_thread_unsafe(hugepage_count_walk, &dummy) < 0)
			return -1;

		for (i = 0; i < RTE_DIM(dummy.num_pages); i++) {
			hpi->num_pages[i] = RTE_MIN(hpi->num_pages[i],
					dummy.num_pages[i]);
		}
#endif
	}

	/* make a copy of socket_mem, needed for balanced allocation. */
	for (hp_sz_idx = 0; hp_sz_idx < RTE_MAX_NUMA_NODES; hp_sz_idx++)
		memory[hp_sz_idx] = internal_conf->socket_mem[hp_sz_idx];

	/* calculate final number of pages */
	if (eal_dynmem_calc_num_pages_per_socket(memory,
			internal_conf->hugepage_info, used_hp,
			internal_conf->num_hugepage_sizes) < 0)
		return -1;

	for (hp_sz_idx = 0;
			hp_sz_idx < (int)internal_conf->num_hugepage_sizes;
			hp_sz_idx++) {
		for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES;
				socket_id++) {
			struct rte_memseg **pages;
			struct hugepage_info *hpi = &used_hp[hp_sz_idx];
			unsigned int num_pages = hpi->num_pages[socket_id];
			unsigned int num_pages_alloc;

			if (num_pages == 0)
				continue;

			EAL_LOG(DEBUG,
				"Allocating %u pages of size %" PRIu64 "M "
				"on socket %i",
				num_pages, hpi->hugepage_sz >> 20, socket_id);

			/* we may not be able to allocate all pages in one go,
			 * because we break up our memory map into multiple
			 * memseg lists. therefore, try allocating multiple
			 * times and see if we can get the desired number of
			 * pages from multiple allocations.
			 */

			num_pages_alloc = 0;
			do {
				int i, cur_pages, needed;

				needed = num_pages - num_pages_alloc;

				pages = malloc(sizeof(*pages) * needed);
				if (pages == NULL) {
					EAL_LOG(ERR, "Failed to malloc pages");
					return -1;
				}

				/* do not request exact number of pages */
				cur_pages = eal_memalloc_alloc_seg_bulk(pages,
						needed, hpi->hugepage_sz,
						socket_id, false);
				if (cur_pages <= 0) {
					free(pages);
					return -1;
				}

				/* mark preallocated pages as unfreeable */
				for (i = 0; i < cur_pages; i++) {
					struct rte_memseg *ms = pages[i];
					ms->flags |=
						RTE_MEMSEG_FLAG_DO_NOT_FREE;
				}
				free(pages);

				num_pages_alloc += cur_pages;
			} while (num_pages_alloc != num_pages);
		}
	}

	/* if socket limits were specified, set them */
	if (internal_conf->force_socket_limits) {
		unsigned int i;
		for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
			uint64_t limit = internal_conf->socket_limit[i];
			if (limit == 0)
				continue;
			if (rte_mem_alloc_validator_register("socket-limit",
					limits_callback, i, limit))
				EAL_LOG(ERR, "Failed to register socket limits validator callback");
		}
	}
	return 0;
}

__rte_unused /* function is unused on 32-bit builds */
static inline uint64_t
get_socket_mem_size(int socket)
{
	uint64_t size = 0;
	unsigned int i;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	for (i = 0; i < internal_conf->num_hugepage_sizes; i++) {
		struct hugepage_info *hpi = &internal_conf->hugepage_info[i];
		size += hpi->hugepage_sz * hpi->num_pages[socket];
	}

	return size;
}

int
eal_dynmem_calc_num_pages_per_socket(
	uint64_t *memory, struct hugepage_info *hp_info,
	struct hugepage_info *hp_used, unsigned int num_hp_info)
{
	unsigned int socket, j, i = 0;
	unsigned int requested, available;
	int total_num_pages = 0;
	uint64_t remaining_mem, cur_mem;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();
	uint64_t total_mem = internal_conf->memory;

	if (num_hp_info == 0)
		return -1;

	/* if specific memory amounts per socket weren't requested */
	if (internal_conf->force_sockets == 0) {
		size_t total_size;
#ifdef RTE_ARCH_64
		int cpu_per_socket[RTE_MAX_NUMA_NODES];
		size_t default_size;
		unsigned int lcore_id;

		/* Compute number of cores per socket */
		memset(cpu_per_socket, 0, sizeof(cpu_per_socket));
		RTE_LCORE_FOREACH(lcore_id) {
			cpu_per_socket[rte_lcore_to_socket_id(lcore_id)]++;
		}

		/*
		 * Automatically spread requested memory amongst detected
		 * sockets according to number of cores from CPU mask present
		 * on each socket.
		 */
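		/*
		 * Illustrative example: with -m 4096 and three lcores on
		 * socket 0 plus one lcore on socket 1, socket 0 is initially
		 * assigned 3072 MB and socket 1 gets 1024 MB, each amount then
		 * capped to what is actually available on that socket.
		 */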
		total_size = internal_conf->memory;
		for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;
				socket++) {

			/* Set memory amount per socket */
			default_size = internal_conf->memory *
				cpu_per_socket[socket] / rte_lcore_count();

			/* Limit to maximum available memory on socket */
			default_size = RTE_MIN(
				default_size, get_socket_mem_size(socket));

			/* Update sizes */
			memory[socket] = default_size;
			total_size -= default_size;
		}

		/*
		 * If some memory is remaining, try to allocate it by getting
		 * all available memory from sockets, one after the other.
		 */
		for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;
				socket++) {
			/* take whatever is available */
			default_size = RTE_MIN(
				get_socket_mem_size(socket) - memory[socket],
				total_size);

			/* Update sizes */
			memory[socket] += default_size;
			total_size -= default_size;
		}
#else
		/* in 32-bit mode, allocate all of the memory only on main
		 * lcore socket
		 */
		total_size = internal_conf->memory;
		for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;
				socket++) {
			struct rte_config *cfg = rte_eal_get_configuration();
			unsigned int main_lcore_socket;

			main_lcore_socket =
				rte_lcore_to_socket_id(cfg->main_lcore);

			if (main_lcore_socket != socket)
				continue;

			/* Update sizes */
			memory[socket] = total_size;
			break;
		}
#endif
	}

	for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0;
			socket++) {
		/* skip if the memory on this specific socket wasn't requested */
		for (i = 0; i < num_hp_info && memory[socket] != 0; i++) {
			rte_strscpy(hp_used[i].hugedir, hp_info[i].hugedir,
				sizeof(hp_used[i].hugedir));
			hp_used[i].num_pages[socket] = RTE_MIN(
					memory[socket] / hp_info[i].hugepage_sz,
					hp_info[i].num_pages[socket]);

			cur_mem = hp_used[i].num_pages[socket] *
					hp_used[i].hugepage_sz;

			memory[socket] -= cur_mem;
			total_mem -= cur_mem;

			total_num_pages += hp_used[i].num_pages[socket];

			/* check if we have met all memory requests */
			if (memory[socket] == 0)
				break;

			/* Check if we have any more pages left at this size,
			 * if so, move on to next size.
			 */
			if (hp_used[i].num_pages[socket] ==
					hp_info[i].num_pages[socket])
				continue;
			/* At this point we know that there are more pages
			 * available that are bigger than the memory we want,
			 * so let's see if we can get enough from other page
			 * sizes.
			 */
			remaining_mem = 0;
			for (j = i+1; j < num_hp_info; j++)
				remaining_mem += hp_info[j].hugepage_sz *
					hp_info[j].num_pages[socket];

			/* Is there enough other memory?
			 * If not, allocate another page and quit.
			 */
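			/*
			 * Illustrative example: if 952 MB of the request is
			 * still unmet, more 1 GiB pages remain, and the
			 * smaller page sizes can only cover 200 MB, one extra
			 * 1 GiB page is taken and this socket is done.
			 */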
			if (remaining_mem < memory[socket]) {
				cur_mem = RTE_MIN(
					memory[socket], hp_info[i].hugepage_sz);
				memory[socket] -= cur_mem;
				total_mem -= cur_mem;
				hp_used[i].num_pages[socket]++;
				total_num_pages++;
				break; /* we are done with this socket */
			}
		}

		/* if we didn't satisfy all memory requirements per socket */
		if (memory[socket] > 0 &&
				internal_conf->socket_mem[socket] != 0) {
			/* to prevent icc errors */
			requested = (unsigned int)(
				internal_conf->socket_mem[socket] / 0x100000);
			available = requested -
				((unsigned int)(memory[socket] / 0x100000));
			EAL_LOG(ERR, "Not enough memory available on "
				"socket %u! Requested: %uMB, available: %uMB",
				socket, requested, available);
			return -1;
		}
	}

	/* if we didn't satisfy total memory requirements */
	if (total_mem > 0) {
		requested = (unsigned int)(internal_conf->memory / 0x100000);
		available = requested - (unsigned int)(total_mem / 0x100000);
		EAL_LOG(ERR, "Not enough memory available! "
			"Requested: %uMB, available: %uMB",
			requested, available);
		return -1;
	}
	return total_num_pages;
}