/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation.
 * Copyright(c) 2013 6WIND S.A.
 */

#include <inttypes.h>
#include <string.h>

#include <rte_log.h>
#include <rte_string_fns.h>

#include "eal_internal_cfg.h"
#include "eal_memalloc.h"
#include "eal_memcfg.h"
#include "eal_private.h"

/** @file Functions common to EALs that support dynamic memory allocation. */

int
eal_dynmem_memseg_lists_init(void)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct memtype {
		uint64_t page_sz;
		int socket_id;
	} *memtypes = NULL;
	int i, hpi_idx, msl_idx, ret = -1; /* fail unless told to succeed */
	struct rte_memseg_list *msl;
	uint64_t max_mem, max_mem_per_type;
	unsigned int max_seglists_per_type;
	unsigned int n_memtypes, cur_type;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* no-huge does not need this at all */
	if (internal_conf->no_hugetlbfs)
		return 0;

	/*
	 * figuring out the amount of memory we're going to have is a long and
	 * very involved process. the basic element we're operating with is a
	 * memory type, defined as a combination of NUMA node ID and page size
	 * (so that e.g. 2 sockets with 2 page sizes yield 4 memory types in
	 * total).
	 *
	 * deciding the amount of memory going towards each memory type is a
	 * balancing act between maximum segments per type, maximum memory per
	 * type, and number of detected NUMA nodes. the goal is to make sure
	 * each memory type gets at least one memseg list.
	 *
	 * the total amount of memory is limited by RTE_MAX_MEM_MB value.
	 *
	 * the total amount of memory per type is limited by either
	 * RTE_MAX_MEM_MB_PER_TYPE, or by RTE_MAX_MEM_MB divided by the number
	 * of detected NUMA nodes. additionally, the maximum number of segments
	 * per type is also limited by RTE_MAX_MEMSEG_PER_TYPE. this is because
	 * for smaller page sizes, it can take hundreds of thousands of
	 * segments to reach the above specified per-type memory limits.
	 *
	 * additionally, each type may have multiple memseg lists associated
	 * with it, each limited by either RTE_MAX_MEM_MB_PER_LIST for bigger
	 * page sizes, or RTE_MAX_MEMSEG_PER_LIST segments for smaller ones.
	 *
	 * the number of memseg lists per type is decided based on the above
	 * limits, and also takes the number of detected NUMA nodes into
	 * account, to make sure that we don't run out of memseg lists before
	 * we populate all NUMA nodes with memory.
	 *
	 * we do this in three stages. first, we collect the number of types.
	 * then, we figure out memory constraints and populate the list of
	 * would-be memseg lists. then, we go ahead and allocate the memseg
	 * lists.
	 */
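
	/*
	 * For illustration (the figures below are examples, not the actual
	 * configured limits): 2 NUMA nodes with two page sizes (2M and 1G)
	 * give 4 memory types. If RTE_MAX_MEMSEG_LISTS were 128, each type
	 * could occupy at most 128 / 4 = 32 memseg lists, and each type's
	 * memory would be capped at the smaller of RTE_MAX_MEM_MB_PER_TYPE
	 * and RTE_MAX_MEM_MB / 4.
	 */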

	/* create space for mem types */
	n_memtypes = internal_conf->num_hugepage_sizes * rte_socket_count();
	memtypes = calloc(n_memtypes, sizeof(*memtypes));
	if (memtypes == NULL) {
		RTE_LOG(ERR, EAL, "Cannot allocate space for memory types\n");
		return -1;
	}

	/* populate mem types */
	cur_type = 0;
	for (hpi_idx = 0; hpi_idx < (int) internal_conf->num_hugepage_sizes;
			hpi_idx++) {
		struct hugepage_info *hpi;
		uint64_t hugepage_sz;

		hpi = &internal_conf->hugepage_info[hpi_idx];
		hugepage_sz = hpi->hugepage_sz;

		for (i = 0; i < (int) rte_socket_count(); i++, cur_type++) {
			int socket_id = rte_socket_id_by_idx(i);

#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
			/* we can still sort pages by socket in legacy mode */
			if (!internal_conf->legacy_mem && socket_id > 0)
				break;
#endif
			memtypes[cur_type].page_sz = hugepage_sz;
			memtypes[cur_type].socket_id = socket_id;

			RTE_LOG(DEBUG, EAL, "Detected memory type: "
				"socket_id:%u hugepage_sz:%" PRIu64 "\n",
				socket_id, hugepage_sz);
		}
	}
	/* number of memtypes could have been lower due to no NUMA support */
	n_memtypes = cur_type;

	/* set up limits for types */
	max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
	max_mem_per_type = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20,
			max_mem / n_memtypes);
	/*
	 * limit maximum number of segment lists per type to ensure there's
	 * space for memseg lists for all NUMA nodes with all page sizes
	 */
	max_seglists_per_type = RTE_MAX_MEMSEG_LISTS / n_memtypes;

	if (max_seglists_per_type == 0) {
		RTE_LOG(ERR, EAL, "Cannot accommodate all memory types, please increase %s\n",
			RTE_STR(RTE_MAX_MEMSEG_LISTS));
		goto out;
	}

	/* go through all mem types and create segment lists */
	msl_idx = 0;
	for (cur_type = 0; cur_type < n_memtypes; cur_type++) {
		unsigned int cur_seglist, n_seglists, n_segs;
		unsigned int max_segs_per_type, max_segs_per_list;
		struct memtype *type = &memtypes[cur_type];
		uint64_t max_mem_per_list, pagesz;
		int socket_id;

		pagesz = type->page_sz;
		socket_id = type->socket_id;

		/*
		 * we need to create segment lists for this type. we must take
		 * into account the following things:
		 *
		 * 1. total amount of memory we can use for this memory type
		 * 2. total amount of memory per memseg list allowed
		 * 3. number of segments needed to fit the amount of memory
		 * 4. number of segments allowed per type
		 * 5. number of segments allowed per memseg list
		 * 6. number of memseg lists we are allowed to take up
		 */
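
		/*
		 * Worked example with assumed (not authoritative) limits: for
		 * 2M pages, a 64G per-type memory cap, a 32K-segment per-type
		 * cap and an 8K-segment / 32G per-list cap, a list may hold
		 * min(32768, 8192) = 8192 segments, i.e. 16G of memory, so
		 * this type gets min(32768 / 8192, 64G / 16G) = 4 memseg
		 * lists (further clamped by max_seglists_per_type).
		 */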

		/* calculate how many segments we will need in total */
		max_segs_per_type = max_mem_per_type / pagesz;
		/* limit number of segments to maximum allowed per type */
		max_segs_per_type = RTE_MIN(max_segs_per_type,
				(unsigned int)RTE_MAX_MEMSEG_PER_TYPE);
		/* limit number of segments to maximum allowed per list */
		max_segs_per_list = RTE_MIN(max_segs_per_type,
				(unsigned int)RTE_MAX_MEMSEG_PER_LIST);

		/* calculate how much memory we can have per segment list */
		max_mem_per_list = RTE_MIN(max_segs_per_list * pagesz,
				(uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20);

		/* calculate how many segments each segment list will have */
		n_segs = RTE_MIN(max_segs_per_list, max_mem_per_list / pagesz);

		/* calculate how many segment lists we can have */
		n_seglists = RTE_MIN(max_segs_per_type / n_segs,
				max_mem_per_type / max_mem_per_list);

		/* limit number of segment lists according to our maximum */
		n_seglists = RTE_MIN(n_seglists, max_seglists_per_type);

		RTE_LOG(DEBUG, EAL, "Creating %i segment lists: "
				"n_segs:%i socket_id:%i hugepage_sz:%" PRIu64 "\n",
			n_seglists, n_segs, socket_id, pagesz);

		/* create all segment lists */
		for (cur_seglist = 0; cur_seglist < n_seglists; cur_seglist++) {
			if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
				RTE_LOG(ERR, EAL,
					"No more space in memseg lists, please increase %s\n",
					RTE_STR(RTE_MAX_MEMSEG_LISTS));
				goto out;
			}
			msl = &mcfg->memsegs[msl_idx++];

			if (eal_memseg_list_init(msl, pagesz, n_segs,
					socket_id, cur_seglist, true))
				goto out;

			if (eal_memseg_list_alloc(msl, 0)) {
				RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
				goto out;
			}
		}
	}
	/* we're successful */
	ret = 0;
out:
	free(memtypes);
	return ret;
}

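/*
 * Callback for rte_memseg_list_walk(): for each memseg list whose page size
 * matches the hugepage_info passed via 'arg', add that list's segment
 * capacity to the per-socket page count. Only used on 32-bit builds below
 * (to cap page counts to what was preallocated), hence __rte_unused.
 */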
static int __rte_unused
hugepage_count_walk(const struct rte_memseg_list *msl, void *arg)
{
	struct hugepage_info *hpi = arg;

	if (msl->page_sz != hpi->hugepage_sz)
		return 0;

	hpi->num_pages[msl->socket_id] += msl->memseg_arr.len;
	return 0;
}

/*
 * Allocation validator for per-socket memory limits: always returns -1,
 * which rejects any allocation attempt that would push a socket past its
 * registered limit.
 */
static int
limits_callback(int socket_id, size_t cur_limit, size_t new_len)
{
	RTE_SET_USED(socket_id);
	RTE_SET_USED(cur_limit);
	RTE_SET_USED(new_len);
	return -1;
}

int
eal_dynmem_hugepage_init(void)
{
	struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
	uint64_t memory[RTE_MAX_NUMA_NODES];
	int hp_sz_idx, socket_id;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	memset(used_hp, 0, sizeof(used_hp));

	for (hp_sz_idx = 0;
			hp_sz_idx < (int) internal_conf->num_hugepage_sizes;
			hp_sz_idx++) {
#ifndef RTE_ARCH_64
		struct hugepage_info dummy;
		unsigned int i;
#endif
		/* also initialize hugepage sizes in used_hp */
		struct hugepage_info *hpi;
		hpi = &internal_conf->hugepage_info[hp_sz_idx];
		used_hp[hp_sz_idx].hugepage_sz = hpi->hugepage_sz;

#ifndef RTE_ARCH_64
		/* for 32-bit, limit number of pages on socket to whatever we've
		 * preallocated, as we cannot allocate more.
		 */
		memset(&dummy, 0, sizeof(dummy));
		dummy.hugepage_sz = hpi->hugepage_sz;
		if (rte_memseg_list_walk(hugepage_count_walk, &dummy) < 0)
			return -1;

		for (i = 0; i < RTE_DIM(dummy.num_pages); i++) {
			hpi->num_pages[i] = RTE_MIN(hpi->num_pages[i],
					dummy.num_pages[i]);
		}
#endif
	}

	/* make a copy of socket_mem, needed for balanced allocation. */
	for (hp_sz_idx = 0; hp_sz_idx < RTE_MAX_NUMA_NODES; hp_sz_idx++)
		memory[hp_sz_idx] = internal_conf->socket_mem[hp_sz_idx];

	/* calculate final number of pages */
	if (eal_dynmem_calc_num_pages_per_socket(memory,
			internal_conf->hugepage_info, used_hp,
			internal_conf->num_hugepage_sizes) < 0)
		return -1;

	for (hp_sz_idx = 0;
			hp_sz_idx < (int)internal_conf->num_hugepage_sizes;
			hp_sz_idx++) {
		for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES;
				socket_id++) {
			struct rte_memseg **pages;
			struct hugepage_info *hpi = &used_hp[hp_sz_idx];
			unsigned int num_pages = hpi->num_pages[socket_id];
			unsigned int num_pages_alloc;

			if (num_pages == 0)
				continue;

			RTE_LOG(DEBUG, EAL,
				"Allocating %u pages of size %" PRIu64 "M "
				"on socket %i\n",
				num_pages, hpi->hugepage_sz >> 20, socket_id);

			/* we may not be able to allocate all pages in one go,
			 * because we break up our memory map into multiple
			 * memseg lists. therefore, try allocating multiple
			 * times and see if we can get the desired number of
			 * pages from multiple allocations.
			 */

			num_pages_alloc = 0;
			do {
				int i, cur_pages, needed;

				needed = num_pages - num_pages_alloc;

				pages = malloc(sizeof(*pages) * needed);
				if (pages == NULL) {
					RTE_LOG(ERR, EAL, "Failed to malloc pages\n");
					return -1;
				}

				/* do not request exact number of pages */
				cur_pages = eal_memalloc_alloc_seg_bulk(pages,
						needed, hpi->hugepage_sz,
						socket_id, false);
				if (cur_pages <= 0) {
					free(pages);
					return -1;
				}

				/* mark preallocated pages as unfreeable */
				for (i = 0; i < cur_pages; i++) {
					struct rte_memseg *ms = pages[i];
					ms->flags |=
						RTE_MEMSEG_FLAG_DO_NOT_FREE;
				}
				free(pages);

				num_pages_alloc += cur_pages;
			} while (num_pages_alloc != num_pages);
		}
	}

	/* if socket limits were specified, set them */
	if (internal_conf->force_socket_limits) {
		unsigned int i;
		for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
			uint64_t limit = internal_conf->socket_limit[i];
			if (limit == 0)
				continue;
			if (rte_mem_alloc_validator_register("socket-limit",
					limits_callback, i, limit))
				RTE_LOG(ERR, EAL, "Failed to register socket limits validator callback\n");
		}
	}
	return 0;
}

__rte_unused /* function is unused on 32-bit builds */
static inline uint64_t
get_socket_mem_size(int socket)
{
	uint64_t size = 0;
	unsigned int i;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	for (i = 0; i < internal_conf->num_hugepage_sizes; i++) {
		struct hugepage_info *hpi = &internal_conf->hugepage_info[i];
		size += hpi->hugepage_sz * hpi->num_pages[socket];
	}

	return size;
}

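/*
 * Work out how many pages of each hugepage size to reserve on each socket in
 * order to satisfy the requested memory amounts, writing the result into
 * hp_used. Returns the total number of pages needed, or -1 if the requests
 * cannot be met.
 */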
int
eal_dynmem_calc_num_pages_per_socket(
	uint64_t *memory, struct hugepage_info *hp_info,
	struct hugepage_info *hp_used, unsigned int num_hp_info)
{
	unsigned int socket, j, i = 0;
	unsigned int requested, available;
	int total_num_pages = 0;
	uint64_t remaining_mem, cur_mem;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();
	uint64_t total_mem = internal_conf->memory;

	if (num_hp_info == 0)
		return -1;

	/* if specific memory amounts per socket weren't requested */
	if (internal_conf->force_sockets == 0) {
		size_t total_size;
#ifdef RTE_ARCH_64
		int cpu_per_socket[RTE_MAX_NUMA_NODES];
		size_t default_size;
		unsigned int lcore_id;

		/* Compute number of cores per socket */
		memset(cpu_per_socket, 0, sizeof(cpu_per_socket));
		RTE_LCORE_FOREACH(lcore_id) {
			cpu_per_socket[rte_lcore_to_socket_id(lcore_id)]++;
		}

		/*
		 * Automatically spread requested memory amongst detected
		 * sockets according to number of cores from CPU mask present
		 * on each socket.
		 */
		total_size = internal_conf->memory;
		for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;
				socket++) {

			/* Set memory amount per socket */
			default_size = internal_conf->memory *
				cpu_per_socket[socket] / rte_lcore_count();

			/* Limit to maximum available memory on socket */
			default_size = RTE_MIN(
				default_size, get_socket_mem_size(socket));

			/* Update sizes */
			memory[socket] = default_size;
			total_size -= default_size;
		}

		/*
		 * If some memory is remaining, try to allocate it by getting
		 * all available memory from sockets, one after the other.
		 */
		for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;
				socket++) {
			/* take whatever is available */
			default_size = RTE_MIN(
				get_socket_mem_size(socket) - memory[socket],
				total_size);

			/* Update sizes */
			memory[socket] += default_size;
			total_size -= default_size;
		}
#else
		/* in 32-bit mode, allocate all of the memory only on the
		 * main lcore's socket
		 */
		total_size = internal_conf->memory;
		for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;
				socket++) {
			struct rte_config *cfg = rte_eal_get_configuration();
			unsigned int main_lcore_socket;

			main_lcore_socket =
				rte_lcore_to_socket_id(cfg->main_lcore);

			if (main_lcore_socket != socket)
				continue;

			/* Update sizes */
			memory[socket] = total_size;
			break;
		}
#endif
	}
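
	/*
	 * Illustrative example of the spread above (64-bit, no --socket-mem):
	 * with "-m 1024" and 8 lcores, 6 of them on socket 0 and 2 on socket
	 * 1, socket 0 is assigned 1024 * 6 / 8 = 768M and socket 1 gets
	 * 1024 * 2 / 8 = 256M, each capped by the hugepage memory actually
	 * available on that socket.
	 */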

	for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0;
			socket++) {
		/* skip this socket if no memory was requested on it */
		for (i = 0; i < num_hp_info && memory[socket] != 0; i++) {
			rte_strscpy(hp_used[i].hugedir, hp_info[i].hugedir,
				sizeof(hp_used[i].hugedir));
			hp_used[i].num_pages[socket] = RTE_MIN(
					memory[socket] / hp_info[i].hugepage_sz,
					hp_info[i].num_pages[socket]);

			cur_mem = hp_used[i].num_pages[socket] *
					hp_used[i].hugepage_sz;

			memory[socket] -= cur_mem;
			total_mem -= cur_mem;

			total_num_pages += hp_used[i].num_pages[socket];

			/* check if we have met all memory requests */
			if (memory[socket] == 0)
				break;

			/* Check if we have used up all the pages available at
			 * this size; if so, move on to the next size.
			 */
			if (hp_used[i].num_pages[socket] ==
					hp_info[i].num_pages[socket])
				continue;
			/* At this point we know that there are more pages of
			 * this size available, but each one is bigger than the
			 * memory we still want, so let's see if we can get
			 * enough from other page sizes.
			 */
			remaining_mem = 0;
			for (j = i+1; j < num_hp_info; j++)
				remaining_mem += hp_info[j].hugepage_sz *
						hp_info[j].num_pages[socket];

			/* Is there enough other memory?
			 * If not, allocate another page and quit.
			 */
			if (remaining_mem < memory[socket]) {
				cur_mem = RTE_MIN(
					memory[socket], hp_info[i].hugepage_sz);
				memory[socket] -= cur_mem;
				total_mem -= cur_mem;
				hp_used[i].num_pages[socket]++;
				total_num_pages++;
				break; /* we are done with this socket */
			}
		}

		/* if we didn't satisfy all memory requirements per socket */
		if (memory[socket] > 0 &&
				internal_conf->socket_mem[socket] != 0) {
			/* to prevent icc errors */
			requested = (unsigned int)(
				internal_conf->socket_mem[socket] / 0x100000);
			available = requested -
				((unsigned int)(memory[socket] / 0x100000));
			RTE_LOG(ERR, EAL, "Not enough memory available on "
				"socket %u! Requested: %uMB, available: %uMB\n",
				socket, requested, available);
			return -1;
		}
	}

	/* if we didn't satisfy total memory requirements */
	if (total_mem > 0) {
		requested = (unsigned int)(internal_conf->memory / 0x100000);
		available = requested - (unsigned int)(total_mem / 0x100000);
		RTE_LOG(ERR, EAL, "Not enough memory available! "
			"Requested: %uMB, available: %uMB\n",
			requested, available);
		return -1;
	}
	return total_num_pages;
}