/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation.
 * Copyright(c) 2013 6WIND S.A.
 */

#include <inttypes.h>
#include <string.h>

#include <rte_log.h>
#include <rte_string_fns.h>

#include "eal_internal_cfg.h"
#include "eal_memalloc.h"
#include "eal_memcfg.h"
#include "eal_private.h"

/** @file Functions common to EALs that support dynamic memory allocation. */

int
eal_dynmem_memseg_lists_init(void)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct memtype {
		uint64_t page_sz;
		int socket_id;
	} *memtypes = NULL;
	int i, hpi_idx, msl_idx, ret = -1; /* fail unless told to succeed */
	struct rte_memseg_list *msl;
	uint64_t max_mem, max_mem_per_type;
	unsigned int max_seglists_per_type;
	unsigned int n_memtypes, cur_type;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* no-huge does not need this at all */
	if (internal_conf->no_hugetlbfs)
		return 0;

	/*
	 * figuring out the amount of memory we're going to have is a long and
	 * very involved process. the basic element we're operating with is a
	 * memory type, defined as a combination of NUMA node ID and page size
	 * (so that e.g. 2 sockets with 2 page sizes yield 4 memory types in
	 * total).
	 *
	 * deciding the amount of memory going towards each memory type is a
	 * balancing act between maximum segments per type, maximum memory per
	 * type, and number of detected NUMA nodes. the goal is to make sure
	 * each memory type gets at least one memseg list.
	 *
	 * the total amount of memory is limited by RTE_MAX_MEM_MB value.
	 *
	 * the total amount of memory per type is limited by either
	 * RTE_MAX_MEM_MB_PER_TYPE, or by RTE_MAX_MEM_MB divided by the number
	 * of detected NUMA nodes. additionally, maximum number of segments per
	 * type is also limited by RTE_MAX_MEMSEG_PER_TYPE. this is because for
	 * smaller page sizes, it can take hundreds of thousands of segments to
	 * reach the above specified per-type memory limits.
	 *
	 * additionally, each type may have multiple memseg lists associated
	 * with it, each limited by either RTE_MAX_MEM_MB_PER_LIST for bigger
	 * page sizes, or RTE_MAX_MEMSEG_PER_LIST segments for smaller ones.
	 *
	 * the number of memseg lists per type is decided based on the above
	 * limits, and also takes the number of detected NUMA nodes into
	 * account, to make sure that we don't run out of memseg lists before
	 * we populate all NUMA nodes with memory.
	 *
	 * we do this in three stages. first, we collect the number of types.
	 * then, we figure out memory constraints and populate the list of
	 * would-be memseg lists. then, we go ahead and allocate the memseg
	 * lists.
	 */
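	/*
	 * Illustration with hypothetical numbers (not necessarily the
	 * build-time defaults): 2 NUMA nodes with 2 detected page sizes give
	 * 4 memory types; if RTE_MAX_MEMSEG_LISTS were 16, each type could
	 * own at most 16 / 4 = 4 memseg lists, and each type's memory would
	 * be capped at the smaller of RTE_MAX_MEM_MB_PER_TYPE and
	 * RTE_MAX_MEM_MB / 4.
	 */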

	/* create space for mem types */
	n_memtypes = internal_conf->num_hugepage_sizes * rte_socket_count();
	memtypes = calloc(n_memtypes, sizeof(*memtypes));
	if (memtypes == NULL) {
		RTE_LOG(ERR, EAL, "Cannot allocate space for memory types\n");
		return -1;
	}

	/* populate mem types */
	cur_type = 0;
	for (hpi_idx = 0; hpi_idx < (int) internal_conf->num_hugepage_sizes;
			hpi_idx++) {
		struct hugepage_info *hpi;
		uint64_t hugepage_sz;

		hpi = &internal_conf->hugepage_info[hpi_idx];
		hugepage_sz = hpi->hugepage_sz;

		for (i = 0; i < (int) rte_socket_count(); i++, cur_type++) {
			int socket_id = rte_socket_id_by_idx(i);

#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
			/* we can still sort pages by socket in legacy mode */
			if (!internal_conf->legacy_mem && socket_id > 0)
				break;
#endif
			memtypes[cur_type].page_sz = hugepage_sz;
			memtypes[cur_type].socket_id = socket_id;

			RTE_LOG(DEBUG, EAL, "Detected memory type: "
				"socket_id:%u hugepage_sz:%" PRIu64 "\n",
				socket_id, hugepage_sz);
		}
	}
	/* number of memtypes could have been lower due to no NUMA support */
	n_memtypes = cur_type;

	/* set up limits for types */
	max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
	max_mem_per_type = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20,
			max_mem / n_memtypes);
	/*
	 * limit maximum number of segment lists per type to ensure there's
	 * space for memseg lists for all NUMA nodes with all page sizes
	 */
	max_seglists_per_type = RTE_MAX_MEMSEG_LISTS / n_memtypes;

	if (max_seglists_per_type == 0) {
		RTE_LOG(ERR, EAL, "Cannot accommodate all memory types, please increase %s\n",
			RTE_STR(RTE_MAX_MEMSEG_LISTS));
		goto out;
	}

	/* go through all mem types and create segment lists */
	msl_idx = 0;
	for (cur_type = 0; cur_type < n_memtypes; cur_type++) {
		unsigned int cur_seglist, n_seglists, n_segs;
		unsigned int max_segs_per_type, max_segs_per_list;
		struct memtype *type = &memtypes[cur_type];
		uint64_t max_mem_per_list, pagesz;
		int socket_id;

		pagesz = type->page_sz;
		socket_id = type->socket_id;

		/*
		 * we need to create segment lists for this type. we must take
		 * into account the following things:
		 *
		 * 1. total amount of memory we can use for this memory type
		 * 2. total amount of memory per memseg list allowed
		 * 3. number of segments needed to fit the amount of memory
		 * 4. number of segments allowed per type
		 * 5. number of segments allowed per memseg list
		 * 6. number of memseg lists we are allowed to take up
		 */
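		/*
		 * Worked example, assuming hypothetical limits rather than
		 * the build-time defaults: for 2M pages, a 64G per-type cap,
		 * a 32768 per-type segment cap, an 8192 per-list segment cap
		 * and a 32G per-list cap, the steps below give 8192 segments
		 * per list (16G of VA per list) and 64G / 16G = 4 lists for
		 * this type, further clamped by max_seglists_per_type.
		 */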

		/* calculate how many segments we will need in total */
		max_segs_per_type = max_mem_per_type / pagesz;
		/* limit number of segments to maximum allowed per type */
		max_segs_per_type = RTE_MIN(max_segs_per_type,
				(unsigned int)RTE_MAX_MEMSEG_PER_TYPE);
		/* limit number of segments to maximum allowed per list */
		max_segs_per_list = RTE_MIN(max_segs_per_type,
				(unsigned int)RTE_MAX_MEMSEG_PER_LIST);

		/* calculate how much memory we can have per segment list */
		max_mem_per_list = RTE_MIN(max_segs_per_list * pagesz,
				(uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20);

		/* calculate how many segments each segment list will have */
		n_segs = RTE_MIN(max_segs_per_list, max_mem_per_list / pagesz);

		/* calculate how many segment lists we can have */
		n_seglists = RTE_MIN(max_segs_per_type / n_segs,
				max_mem_per_type / max_mem_per_list);

		/* limit number of segment lists according to our maximum */
		n_seglists = RTE_MIN(n_seglists, max_seglists_per_type);

		RTE_LOG(DEBUG, EAL, "Creating %i segment lists: "
				"n_segs:%i socket_id:%i hugepage_sz:%" PRIu64 "\n",
			n_seglists, n_segs, socket_id, pagesz);

		/* create all segment lists */
		for (cur_seglist = 0; cur_seglist < n_seglists; cur_seglist++) {
			if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
				RTE_LOG(ERR, EAL,
					"No more space in memseg lists, please increase %s\n",
					RTE_STR(RTE_MAX_MEMSEG_LISTS));
				goto out;
			}
			msl = &mcfg->memsegs[msl_idx++];

			if (eal_memseg_list_init(msl, pagesz, n_segs,
					socket_id, cur_seglist, true))
				goto out;

			if (eal_memseg_list_alloc(msl, 0)) {
				RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
				goto out;
			}
		}
	}
	/* we're successful */
	ret = 0;
out:
	free(memtypes);
	return ret;
}

static int __rte_unused
hugepage_count_walk(const struct rte_memseg_list *msl, void *arg)
{
	struct hugepage_info *hpi = arg;

	if (msl->page_sz != hpi->hugepage_sz)
		return 0;

	hpi->num_pages[msl->socket_id] += msl->memseg_arr.len;
	return 0;
}

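/*
 * Validator registered below for --socket-limit: the memory subsystem invokes
 * it only when an allocation would take a socket past the registered limit,
 * so unconditionally returning -1 here is what enforces that limit.
 */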
static int
limits_callback(int socket_id, size_t cur_limit, size_t new_len)
{
	RTE_SET_USED(socket_id);
	RTE_SET_USED(cur_limit);
	RTE_SET_USED(new_len);
	return -1;
}

int
eal_dynmem_hugepage_init(void)
{
	struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
	uint64_t memory[RTE_MAX_NUMA_NODES];
	int hp_sz_idx, socket_id;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	memset(used_hp, 0, sizeof(used_hp));

	for (hp_sz_idx = 0;
			hp_sz_idx < (int) internal_conf->num_hugepage_sizes;
			hp_sz_idx++) {
#ifndef RTE_ARCH_64
		struct hugepage_info dummy;
		unsigned int i;
#endif
		/* also initialize hugepage sizes in used_hp */
		struct hugepage_info *hpi;
		hpi = &internal_conf->hugepage_info[hp_sz_idx];
		used_hp[hp_sz_idx].hugepage_sz = hpi->hugepage_sz;

#ifndef RTE_ARCH_64
		/* for 32-bit, limit number of pages on socket to whatever we've
		 * preallocated, as we cannot allocate more.
		 */
		memset(&dummy, 0, sizeof(dummy));
		dummy.hugepage_sz = hpi->hugepage_sz;
		if (rte_memseg_list_walk(hugepage_count_walk, &dummy) < 0)
			return -1;

		for (i = 0; i < RTE_DIM(dummy.num_pages); i++) {
			hpi->num_pages[i] = RTE_MIN(hpi->num_pages[i],
					dummy.num_pages[i]);
		}
#endif
	}

	/* make a copy of socket_mem, needed for balanced allocation. */
	for (hp_sz_idx = 0; hp_sz_idx < RTE_MAX_NUMA_NODES; hp_sz_idx++)
		memory[hp_sz_idx] = internal_conf->socket_mem[hp_sz_idx];

	/* calculate final number of pages */
	if (eal_dynmem_calc_num_pages_per_socket(memory,
			internal_conf->hugepage_info, used_hp,
			internal_conf->num_hugepage_sizes) < 0)
		return -1;

	for (hp_sz_idx = 0;
			hp_sz_idx < (int)internal_conf->num_hugepage_sizes;
			hp_sz_idx++) {
		for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES;
				socket_id++) {
			struct rte_memseg **pages;
			struct hugepage_info *hpi = &used_hp[hp_sz_idx];
			unsigned int num_pages = hpi->num_pages[socket_id];
			unsigned int num_pages_alloc;

			if (num_pages == 0)
				continue;

			RTE_LOG(DEBUG, EAL,
				"Allocating %u pages of size %" PRIu64 "M "
				"on socket %i\n",
				num_pages, hpi->hugepage_sz >> 20, socket_id);

			/* we may not be able to allocate all pages in one go,
			 * because we break up our memory map into multiple
			 * memseg lists. therefore, try allocating multiple
			 * times and see if we can get the desired number of
			 * pages from multiple allocations.
			 */

			num_pages_alloc = 0;
			do {
				int i, cur_pages, needed;

				needed = num_pages - num_pages_alloc;

				pages = malloc(sizeof(*pages) * needed);
				if (pages == NULL) {
					RTE_LOG(ERR, EAL, "Cannot allocate space for page pointers\n");
					return -1;
				}

				/* do not request exact number of pages */
				cur_pages = eal_memalloc_alloc_seg_bulk(pages,
						needed, hpi->hugepage_sz,
						socket_id, false);
				if (cur_pages <= 0) {
					free(pages);
					return -1;
				}

				/* mark preallocated pages as unfreeable */
				for (i = 0; i < cur_pages; i++) {
					struct rte_memseg *ms = pages[i];
					ms->flags |=
						RTE_MEMSEG_FLAG_DO_NOT_FREE;
				}
				free(pages);

				num_pages_alloc += cur_pages;
			} while (num_pages_alloc != num_pages);
		}
	}

	/* if socket limits were specified, set them */
	if (internal_conf->force_socket_limits) {
		unsigned int i;
		for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
			uint64_t limit = internal_conf->socket_limit[i];
			if (limit == 0)
				continue;
			if (rte_mem_alloc_validator_register("socket-limit",
					limits_callback, i, limit))
				RTE_LOG(ERR, EAL, "Failed to register socket limits validator callback\n");
		}
	}
	return 0;
}

__rte_unused /* function is unused on 32-bit builds */
static inline uint64_t
get_socket_mem_size(int socket)
{
	uint64_t size = 0;
	unsigned int i;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	for (i = 0; i < internal_conf->num_hugepage_sizes; i++) {
		struct hugepage_info *hpi = &internal_conf->hugepage_info[i];
		size += hpi->hugepage_sz * hpi->num_pages[socket];
	}

	return size;
}

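/*
 * Distribute the requested amounts of memory over the available hugepage
 * sizes: on success, hp_used holds, per size and per socket, the number of
 * pages to allocate, and the total page count is returned. Returns -1 when
 * the request cannot be satisfied.
 */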
int
eal_dynmem_calc_num_pages_per_socket(
	uint64_t *memory, struct hugepage_info *hp_info,
	struct hugepage_info *hp_used, unsigned int num_hp_info)
{
	unsigned int socket, j, i = 0;
	unsigned int requested, available;
	int total_num_pages = 0;
	uint64_t remaining_mem, cur_mem;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();
	uint64_t total_mem = internal_conf->memory;

	if (num_hp_info == 0)
		return -1;

	/* if specific memory amounts per socket weren't requested */
	if (internal_conf->force_sockets == 0) {
		size_t total_size;
#ifdef RTE_ARCH_64
		int cpu_per_socket[RTE_MAX_NUMA_NODES];
		size_t default_size;
		unsigned int lcore_id;

		/* Compute number of cores per socket */
		memset(cpu_per_socket, 0, sizeof(cpu_per_socket));
		RTE_LCORE_FOREACH(lcore_id) {
			cpu_per_socket[rte_lcore_to_socket_id(lcore_id)]++;
		}

		/*
		 * Automatically spread requested memory amongst detected
		 * sockets according to number of cores from CPU mask present
		 * on each socket.
		 */
		total_size = internal_conf->memory;
		for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;
				socket++) {

			/* Set memory amount per socket */
			default_size = internal_conf->memory *
				cpu_per_socket[socket] / rte_lcore_count();

			/* Limit to maximum available memory on socket */
			default_size = RTE_MIN(
				default_size, get_socket_mem_size(socket));

			/* Update sizes */
			memory[socket] = default_size;
			total_size -= default_size;
		}

		/*
		 * If some memory is remaining, try to allocate it by getting
		 * all available memory from sockets, one after the other.
		 */
		for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;
				socket++) {
			/* take whatever is available */
			default_size = RTE_MIN(
				get_socket_mem_size(socket) - memory[socket],
				total_size);

			/* Update sizes */
			memory[socket] += default_size;
			total_size -= default_size;
		}
#else
		/* in 32-bit mode, allocate all of the memory only on main
		 * lcore socket
		 */
		total_size = internal_conf->memory;
		for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;
				socket++) {
			struct rte_config *cfg = rte_eal_get_configuration();
			unsigned int main_lcore_socket;

			main_lcore_socket =
				rte_lcore_to_socket_id(cfg->main_lcore);

			if (main_lcore_socket != socket)
				continue;

			/* Update sizes */
			memory[socket] = total_size;
			break;
		}
#endif
	}

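	/*
	 * Satisfy each socket's request from the available page sizes in
	 * hp_info[] order; if the sizes that follow cannot cover what is
	 * left, round up by one extra page of the current size.
	 */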
	for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0;
			socket++) {
		/* skip if memory on this specific socket wasn't requested */
		for (i = 0; i < num_hp_info && memory[socket] != 0; i++) {
			rte_strscpy(hp_used[i].hugedir, hp_info[i].hugedir,
				sizeof(hp_used[i].hugedir));
			hp_used[i].num_pages[socket] = RTE_MIN(
					memory[socket] / hp_info[i].hugepage_sz,
					hp_info[i].num_pages[socket]);

			cur_mem = hp_used[i].num_pages[socket] *
					hp_used[i].hugepage_sz;

			memory[socket] -= cur_mem;
			total_mem -= cur_mem;

			total_num_pages += hp_used[i].num_pages[socket];

			/* check if we have met all memory requests */
			if (memory[socket] == 0)
				break;

			/* If we have used up all pages of this size,
			 * move on to the next size.
			 */
			if (hp_used[i].num_pages[socket] ==
					hp_info[i].num_pages[socket])
				continue;
			/* At this point we know that there are still pages of
			 * this size available, but each one is bigger than the
			 * memory we still want, so let's see if we can get
			 * enough from other page sizes.
			 */
			remaining_mem = 0;
			for (j = i + 1; j < num_hp_info; j++)
				remaining_mem += hp_info[j].hugepage_sz *
					hp_info[j].num_pages[socket];

			/* Is there enough other memory?
			 * If not, allocate another page and quit.
			 */
			if (remaining_mem < memory[socket]) {
				cur_mem = RTE_MIN(
					memory[socket], hp_info[i].hugepage_sz);
				memory[socket] -= cur_mem;
				total_mem -= cur_mem;
				hp_used[i].num_pages[socket]++;
				total_num_pages++;
				break; /* we are done with this socket */
			}
		}

		/* if we didn't satisfy all memory requirements per socket */
		if (memory[socket] > 0 &&
				internal_conf->socket_mem[socket] != 0) {
			/* to prevent icc errors */
			requested = (unsigned int)(
				internal_conf->socket_mem[socket] / 0x100000);
			available = requested -
				((unsigned int)(memory[socket] / 0x100000));
			RTE_LOG(ERR, EAL, "Not enough memory available on "
				"socket %u! Requested: %uMB, available: %uMB\n",
				socket, requested, available);
			return -1;
		}
	}

	/* if we didn't satisfy total memory requirements */
	if (total_mem > 0) {
		requested = (unsigned int)(internal_conf->memory / 0x100000);
		available = requested - (unsigned int)(total_mem / 0x100000);
		RTE_LOG(ERR, EAL, "Not enough memory available! "
			"Requested: %uMB, available: %uMB\n",
			requested, available);
		return -1;
	}
	return total_num_pages;
}