1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2017 Intel Corporation. 3 * All rights reserved. 4 */ 5 6 #include "spdk/stdinc.h" 7 8 #include "env_internal.h" 9 10 #include "spdk/version.h" 11 #include "spdk/env_dpdk.h" 12 #include "spdk/log.h" 13 14 #include <rte_config.h> 15 #include <rte_eal.h> 16 #include <rte_errno.h> 17 #include <rte_vfio.h> 18 19 #define SPDK_ENV_DPDK_DEFAULT_NAME "spdk" 20 #define SPDK_ENV_DPDK_DEFAULT_SHM_ID -1 21 #define SPDK_ENV_DPDK_DEFAULT_MEM_SIZE -1 22 #define SPDK_ENV_DPDK_DEFAULT_MAIN_CORE -1 23 #define SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL -1 24 #define SPDK_ENV_DPDK_DEFAULT_CORE_MASK "0x1" 25 #define SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR 0x200000000000 26 27 #if RTE_VERSION < RTE_VERSION_NUM(20, 11, 0, 0) 28 #define DPDK_ALLOW_PARAM "--pci-whitelist" 29 #define DPDK_BLOCK_PARAM "--pci-blacklist" 30 #define DPDK_MAIN_CORE_PARAM "--master-lcore" 31 #else 32 #define DPDK_ALLOW_PARAM "--allow" 33 #define DPDK_BLOCK_PARAM "--block" 34 #define DPDK_MAIN_CORE_PARAM "--main-lcore" 35 #endif 36 37 static char **g_eal_cmdline; 38 static int g_eal_cmdline_argcount; 39 static bool g_external_init = true; 40 41 static char * 42 _sprintf_alloc(const char *format, ...) 43 { 44 va_list args; 45 va_list args_copy; 46 char *buf; 47 size_t bufsize; 48 int rc; 49 50 va_start(args, format); 51 52 /* Try with a small buffer first. */ 53 bufsize = 32; 54 55 /* Limit maximum buffer size to something reasonable so we don't loop forever. */ 56 while (bufsize <= 1024 * 1024) { 57 buf = malloc(bufsize); 58 if (buf == NULL) { 59 va_end(args); 60 return NULL; 61 } 62 63 va_copy(args_copy, args); 64 rc = vsnprintf(buf, bufsize, format, args_copy); 65 va_end(args_copy); 66 67 /* 68 * If vsnprintf() returned a count within our current buffer size, we are done. 69 * The count does not include the \0 terminator, so rc == bufsize is not OK. 70 */ 71 if (rc >= 0 && (size_t)rc < bufsize) { 72 va_end(args); 73 return buf; 74 } 75 76 /* 77 * vsnprintf() should return the required space, but some libc versions do not 78 * implement this correctly, so just double the buffer size and try again. 79 * 80 * We don't need the data in buf, so rather than realloc(), use free() and malloc() 81 * again to avoid a copy. 82 */ 83 free(buf); 84 bufsize *= 2; 85 } 86 87 va_end(args); 88 return NULL; 89 } 90 91 void 92 spdk_env_opts_init(struct spdk_env_opts *opts) 93 { 94 if (!opts) { 95 return; 96 } 97 98 memset(opts, 0, sizeof(*opts)); 99 100 opts->name = SPDK_ENV_DPDK_DEFAULT_NAME; 101 opts->core_mask = SPDK_ENV_DPDK_DEFAULT_CORE_MASK; 102 opts->shm_id = SPDK_ENV_DPDK_DEFAULT_SHM_ID; 103 opts->mem_size = SPDK_ENV_DPDK_DEFAULT_MEM_SIZE; 104 opts->main_core = SPDK_ENV_DPDK_DEFAULT_MAIN_CORE; 105 opts->mem_channel = SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL; 106 opts->base_virtaddr = SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR; 107 } 108 109 static void 110 free_args(char **args, int argcount) 111 { 112 int i; 113 114 if (args == NULL) { 115 return; 116 } 117 118 for (i = 0; i < argcount; i++) { 119 free(args[i]); 120 } 121 122 if (argcount) { 123 free(args); 124 } 125 } 126 127 static char ** 128 push_arg(char *args[], int *argcount, char *arg) 129 { 130 char **tmp; 131 132 if (arg == NULL) { 133 SPDK_ERRLOG("%s: NULL arg supplied\n", __func__); 134 free_args(args, *argcount); 135 return NULL; 136 } 137 138 tmp = realloc(args, sizeof(char *) * (*argcount + 1)); 139 if (tmp == NULL) { 140 free(arg); 141 free_args(args, *argcount); 142 return NULL; 143 } 144 145 tmp[*argcount] = arg; 146 (*argcount)++; 147 148 return tmp; 149 } 150 151 #if defined(__linux__) && defined(__x86_64__) 152 153 /* TODO: Can likely get this value from rlimits in the future */ 154 #define SPDK_IOMMU_VA_REQUIRED_WIDTH 48 155 #define VTD_CAP_MGAW_SHIFT 16 156 #define VTD_CAP_MGAW_MASK (0x3F << VTD_CAP_MGAW_SHIFT) 157 #define RD_AMD_CAP_VASIZE_SHIFT 15 158 #define RD_AMD_CAP_VASIZE_MASK (0x7F << RD_AMD_CAP_VASIZE_SHIFT) 159 160 static int 161 get_iommu_width(void) 162 { 163 int width = 0; 164 glob_t glob_results = {}; 165 166 /* Break * and / into separate strings to appease check_format.sh comment style check. */ 167 glob("/sys/devices/virtual/iommu/dmar*" "/intel-iommu/cap", 0, NULL, &glob_results); 168 glob("/sys/class/iommu/ivhd*" "/amd-iommu/cap", GLOB_APPEND, NULL, &glob_results); 169 170 for (size_t i = 0; i < glob_results.gl_pathc; i++) { 171 const char *filename = glob_results.gl_pathv[0]; 172 FILE *file = fopen(filename, "r"); 173 uint64_t cap_reg = 0; 174 175 if (file == NULL) { 176 continue; 177 } 178 179 if (fscanf(file, "%" PRIx64, &cap_reg) == 1) { 180 if (strstr(filename, "intel-iommu") != NULL) { 181 /* We have an Intel IOMMU */ 182 int mgaw = ((cap_reg & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1; 183 184 if (width == 0 || (mgaw > 0 && mgaw < width)) { 185 width = mgaw; 186 } 187 } else if (strstr(filename, "amd-iommu") != NULL) { 188 /* We have an AMD IOMMU */ 189 int mgaw = ((cap_reg & RD_AMD_CAP_VASIZE_MASK) >> RD_AMD_CAP_VASIZE_SHIFT) + 1; 190 191 if (width == 0 || (mgaw > 0 && mgaw < width)) { 192 width = mgaw; 193 } 194 } 195 } 196 197 fclose(file); 198 } 199 200 globfree(&glob_results); 201 return width; 202 } 203 204 #endif 205 206 static int 207 build_eal_cmdline(const struct spdk_env_opts *opts) 208 { 209 int argcount = 0; 210 char **args; 211 212 args = NULL; 213 214 /* set the program name */ 215 args = push_arg(args, &argcount, _sprintf_alloc("%s", opts->name)); 216 if (args == NULL) { 217 return -1; 218 } 219 220 /* disable shared configuration files when in single process mode. This allows for cleaner shutdown */ 221 if (opts->shm_id < 0) { 222 args = push_arg(args, &argcount, _sprintf_alloc("%s", "--no-shconf")); 223 if (args == NULL) { 224 return -1; 225 } 226 } 227 228 /* Either lcore_map or core_mask must be set. If both, or none specified, fail */ 229 if ((opts->core_mask == NULL) == (opts->lcore_map == NULL)) { 230 if (opts->core_mask && opts->lcore_map) { 231 fprintf(stderr, 232 "Both, lcore map and core mask are provided, while only one can be set\n"); 233 } else { 234 fprintf(stderr, "Core mask or lcore map must be specified\n"); 235 } 236 free_args(args, argcount); 237 return -1; 238 } 239 240 if (opts->lcore_map) { 241 /* If lcore list is set, generate --lcores parameter */ 242 args = push_arg(args, &argcount, _sprintf_alloc("--lcores=%s", opts->lcore_map)); 243 } else if (opts->core_mask[0] == '-') { 244 /* 245 * Set the coremask: 246 * 247 * - if it starts with '-', we presume it's literal EAL arguments such 248 * as --lcores. 249 * 250 * - if it starts with '[', we presume it's a core list to use with the 251 * -l option. 252 * 253 * - otherwise, it's a CPU mask of the form "0xff.." as expected by the 254 * -c option. 255 */ 256 args = push_arg(args, &argcount, _sprintf_alloc("%s", opts->core_mask)); 257 } else if (opts->core_mask[0] == '[') { 258 char *l_arg = _sprintf_alloc("-l %s", opts->core_mask + 1); 259 260 if (l_arg != NULL) { 261 int len = strlen(l_arg); 262 263 if (l_arg[len - 1] == ']') { 264 l_arg[len - 1] = '\0'; 265 } 266 } 267 args = push_arg(args, &argcount, l_arg); 268 } else { 269 args = push_arg(args, &argcount, _sprintf_alloc("-c %s", opts->core_mask)); 270 } 271 272 if (args == NULL) { 273 return -1; 274 } 275 276 /* set the memory channel number */ 277 if (opts->mem_channel > 0) { 278 args = push_arg(args, &argcount, _sprintf_alloc("-n %d", opts->mem_channel)); 279 if (args == NULL) { 280 return -1; 281 } 282 } 283 284 /* set the memory size */ 285 if (opts->mem_size >= 0) { 286 args = push_arg(args, &argcount, _sprintf_alloc("-m %d", opts->mem_size)); 287 if (args == NULL) { 288 return -1; 289 } 290 } 291 292 /* set the main core */ 293 if (opts->main_core > 0) { 294 args = push_arg(args, &argcount, _sprintf_alloc("%s=%d", 295 DPDK_MAIN_CORE_PARAM, opts->main_core)); 296 if (args == NULL) { 297 return -1; 298 } 299 } 300 301 /* set no pci if enabled */ 302 if (opts->no_pci) { 303 args = push_arg(args, &argcount, _sprintf_alloc("--no-pci")); 304 if (args == NULL) { 305 return -1; 306 } 307 } 308 309 if (opts->env_context && strstr(opts->env_context, "--no-huge") != NULL) { 310 if (opts->hugepage_single_segments || opts->unlink_hugepage || opts->hugedir) { 311 fprintf(stderr, "--no-huge invalid with other hugepage options\n"); 312 free_args(args, argcount); 313 return -1; 314 } 315 } else { 316 /* create just one hugetlbfs file */ 317 if (opts->hugepage_single_segments) { 318 args = push_arg(args, &argcount, _sprintf_alloc("--single-file-segments")); 319 if (args == NULL) { 320 return -1; 321 } 322 } 323 324 /* unlink hugepages after initialization */ 325 /* Note: Automatically unlink hugepage when shm_id < 0, since it means we're not using 326 * multi-process so we don't need the hugepage links anymore. But we need to make sure 327 * we don't specify --huge-unlink implicitly if --single-file-segments was specified since 328 * DPDK doesn't support that. 329 */ 330 if (opts->unlink_hugepage || 331 (opts->shm_id < 0 && !opts->hugepage_single_segments)) { 332 args = push_arg(args, &argcount, _sprintf_alloc("--huge-unlink")); 333 if (args == NULL) { 334 return -1; 335 } 336 } 337 338 /* use a specific hugetlbfs mount */ 339 if (opts->hugedir) { 340 args = push_arg(args, &argcount, _sprintf_alloc("--huge-dir=%s", opts->hugedir)); 341 if (args == NULL) { 342 return -1; 343 } 344 } 345 } 346 347 if (opts->num_pci_addr) { 348 size_t i; 349 char bdf[32]; 350 struct spdk_pci_addr *pci_addr = 351 opts->pci_blocked ? opts->pci_blocked : opts->pci_allowed; 352 353 for (i = 0; i < opts->num_pci_addr; i++) { 354 spdk_pci_addr_fmt(bdf, 32, &pci_addr[i]); 355 args = push_arg(args, &argcount, _sprintf_alloc("%s=%s", 356 (opts->pci_blocked ? DPDK_BLOCK_PARAM : DPDK_ALLOW_PARAM), 357 bdf)); 358 if (args == NULL) { 359 return -1; 360 } 361 } 362 } 363 364 /* Lower default EAL loglevel to RTE_LOG_NOTICE - normal, but significant messages. 365 * This can be overridden by specifying the same option in opts->env_context 366 */ 367 args = push_arg(args, &argcount, strdup("--log-level=lib.eal:6")); 368 if (args == NULL) { 369 return -1; 370 } 371 372 /* Lower default CRYPTO loglevel to RTE_LOG_ERR to avoid a ton of init msgs. 373 * This can be overridden by specifying the same option in opts->env_context 374 */ 375 args = push_arg(args, &argcount, strdup("--log-level=lib.cryptodev:5")); 376 if (args == NULL) { 377 return -1; 378 } 379 380 /* `user1` log type is used by rte_vhost, which prints an INFO log for each received 381 * vhost user message. We don't want that. The same log type is also used by a couple 382 * of other DPDK libs, but none of which we make use right now. If necessary, this can 383 * be overridden via opts->env_context. 384 */ 385 args = push_arg(args, &argcount, strdup("--log-level=user1:6")); 386 if (args == NULL) { 387 return -1; 388 } 389 390 if (opts->env_context) { 391 char *ptr = strdup(opts->env_context); 392 char *tok = strtok(ptr, " \t"); 393 394 /* DPDK expects each argument as a separate string in the argv 395 * array, so we need to tokenize here in case the caller 396 * passed multiple arguments in the env_context string. 397 */ 398 while (tok != NULL) { 399 args = push_arg(args, &argcount, strdup(tok)); 400 tok = strtok(NULL, " \t"); 401 } 402 403 free(ptr); 404 } 405 406 #ifdef __linux__ 407 408 if (opts->iova_mode) { 409 args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=%s", opts->iova_mode)); 410 if (args == NULL) { 411 return -1; 412 } 413 } else { 414 /* When using vfio with enable_unsafe_noiommu_mode=Y, we need iova-mode=pa, 415 * but DPDK guesses it should be iova-mode=va. Add a check and force 416 * iova-mode=pa here. */ 417 if (rte_vfio_noiommu_is_enabled()) { 418 args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); 419 if (args == NULL) { 420 return -1; 421 } 422 } 423 424 #if defined(__x86_64__) 425 /* DPDK by default guesses that it should be using iova-mode=va so that it can 426 * support running as an unprivileged user. However, some systems (especially 427 * virtual machines) don't have an IOMMU capable of handling the full virtual 428 * address space and DPDK doesn't currently catch that. Add a check in SPDK 429 * and force iova-mode=pa here. */ 430 if (get_iommu_width() < SPDK_IOMMU_VA_REQUIRED_WIDTH) { 431 args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); 432 if (args == NULL) { 433 return -1; 434 } 435 } 436 #elif defined(__PPC64__) 437 /* On Linux + PowerPC, DPDK doesn't support VA mode at all. Unfortunately, it doesn't correctly 438 * auto-detect at the moment, so we'll just force it here. */ 439 args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); 440 if (args == NULL) { 441 return -1; 442 } 443 #endif 444 } 445 446 447 /* Set the base virtual address - it must be an address that is not in the 448 * ASAN shadow region, otherwise ASAN-enabled builds will ignore the 449 * mmap hint. 450 * 451 * Ref: https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm 452 */ 453 args = push_arg(args, &argcount, _sprintf_alloc("--base-virtaddr=0x%" PRIx64, opts->base_virtaddr)); 454 if (args == NULL) { 455 return -1; 456 } 457 458 /* --match-allocation prevents DPDK from merging or splitting system memory allocations under the hood. 459 * This is critical for RDMA when attempting to use an rte_mempool based buffer pool. If DPDK merges two 460 * physically or IOVA contiguous memory regions, then when we go to allocate a buffer pool, it can split 461 * the memory for a buffer over two allocations meaning the buffer will be split over a memory region. 462 */ 463 if (!opts->env_context || strstr(opts->env_context, "--legacy-mem") == NULL) { 464 args = push_arg(args, &argcount, _sprintf_alloc("%s", "--match-allocations")); 465 if (args == NULL) { 466 return -1; 467 } 468 } 469 470 if (opts->shm_id < 0) { 471 args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk_pid%d", 472 getpid())); 473 if (args == NULL) { 474 return -1; 475 } 476 } else { 477 args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk%d", 478 opts->shm_id)); 479 if (args == NULL) { 480 return -1; 481 } 482 483 /* set the process type */ 484 args = push_arg(args, &argcount, _sprintf_alloc("--proc-type=auto")); 485 if (args == NULL) { 486 return -1; 487 } 488 } 489 490 /* --vfio-vf-token used for VF initialized by vfio_pci driver. */ 491 if (opts->vf_token) { 492 args = push_arg(args, &argcount, _sprintf_alloc("--vfio-vf-token=%s", 493 opts->vf_token)); 494 if (args == NULL) { 495 return -1; 496 } 497 } 498 #endif 499 500 g_eal_cmdline = args; 501 g_eal_cmdline_argcount = argcount; 502 return argcount; 503 } 504 505 int 506 spdk_env_dpdk_post_init(bool legacy_mem) 507 { 508 int rc; 509 510 rc = pci_env_init(); 511 if (rc < 0) { 512 SPDK_ERRLOG("pci_env_init() failed\n"); 513 return rc; 514 } 515 516 rc = mem_map_init(legacy_mem); 517 if (rc < 0) { 518 SPDK_ERRLOG("Failed to allocate mem_map\n"); 519 return rc; 520 } 521 522 rc = vtophys_init(); 523 if (rc < 0) { 524 SPDK_ERRLOG("Failed to initialize vtophys\n"); 525 return rc; 526 } 527 528 return 0; 529 } 530 531 void 532 spdk_env_dpdk_post_fini(void) 533 { 534 pci_env_fini(); 535 536 free_args(g_eal_cmdline, g_eal_cmdline_argcount); 537 g_eal_cmdline = NULL; 538 g_eal_cmdline_argcount = 0; 539 } 540 541 int 542 spdk_env_init(const struct spdk_env_opts *opts) 543 { 544 char **dpdk_args = NULL; 545 char *args_print = NULL, *args_tmp = NULL; 546 int i, rc; 547 int orig_optind; 548 bool legacy_mem; 549 550 /* If SPDK env has been initialized before, then only pci env requires 551 * reinitialization. 552 */ 553 if (g_external_init == false) { 554 if (opts != NULL) { 555 fprintf(stderr, "Invalid arguments to reinitialize SPDK env\n"); 556 return -EINVAL; 557 } 558 559 printf("Starting %s / %s reinitialization...\n", SPDK_VERSION_STRING, rte_version()); 560 pci_env_reinit(); 561 562 return 0; 563 } 564 565 if (opts == NULL) { 566 fprintf(stderr, "NULL arguments to initialize DPDK\n"); 567 return -EINVAL; 568 } 569 570 rc = build_eal_cmdline(opts); 571 if (rc < 0) { 572 SPDK_ERRLOG("Invalid arguments to initialize DPDK\n"); 573 return -EINVAL; 574 } 575 576 SPDK_PRINTF("Starting %s / %s initialization...\n", SPDK_VERSION_STRING, rte_version()); 577 578 args_print = _sprintf_alloc("[ DPDK EAL parameters: "); 579 if (args_print == NULL) { 580 return -ENOMEM; 581 } 582 for (i = 0; i < g_eal_cmdline_argcount; i++) { 583 args_tmp = args_print; 584 args_print = _sprintf_alloc("%s%s ", args_tmp, g_eal_cmdline[i]); 585 if (args_print == NULL) { 586 free(args_tmp); 587 return -ENOMEM; 588 } 589 free(args_tmp); 590 } 591 SPDK_PRINTF("%s]\n", args_print); 592 free(args_print); 593 594 /* DPDK rearranges the array we pass to it, so make a copy 595 * before passing so we can still free the individual strings 596 * correctly. 597 */ 598 dpdk_args = calloc(g_eal_cmdline_argcount, sizeof(char *)); 599 if (dpdk_args == NULL) { 600 SPDK_ERRLOG("Failed to allocate dpdk_args\n"); 601 return -ENOMEM; 602 } 603 memcpy(dpdk_args, g_eal_cmdline, sizeof(char *) * g_eal_cmdline_argcount); 604 605 fflush(stdout); 606 orig_optind = optind; 607 optind = 1; 608 rc = rte_eal_init(g_eal_cmdline_argcount, dpdk_args); 609 optind = orig_optind; 610 611 free(dpdk_args); 612 613 if (rc < 0) { 614 if (rte_errno == EALREADY) { 615 SPDK_ERRLOG("DPDK already initialized\n"); 616 } else { 617 SPDK_ERRLOG("Failed to initialize DPDK\n"); 618 } 619 return -rte_errno; 620 } 621 622 legacy_mem = false; 623 if (opts->env_context && strstr(opts->env_context, "--legacy-mem") != NULL) { 624 legacy_mem = true; 625 } 626 627 rc = spdk_env_dpdk_post_init(legacy_mem); 628 if (rc == 0) { 629 g_external_init = false; 630 } 631 632 return rc; 633 } 634 635 /* We use priority 101 which is the highest priority level available 636 * to applications (the toolchains reserve 1 to 100 for internal usage). 637 * This ensures this destructor runs last, after any other destructors 638 * that might still need the environment up and running. 639 */ 640 __attribute__((destructor(101))) static void 641 dpdk_cleanup(void) 642 { 643 /* Only call rte_eal_cleanup if the SPDK env library called rte_eal_init. */ 644 if (!g_external_init) { 645 rte_eal_cleanup(); 646 } 647 } 648 649 void 650 spdk_env_fini(void) 651 { 652 spdk_env_dpdk_post_fini(); 653 } 654 655 bool 656 spdk_env_dpdk_external_init(void) 657 { 658 return g_external_init; 659 } 660