1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2017 Intel Corporation. 3 * All rights reserved. 4 */ 5 6 #include "spdk/stdinc.h" 7 8 #include "env_internal.h" 9 10 #include "spdk/version.h" 11 #include "spdk/env_dpdk.h" 12 #include "spdk/log.h" 13 14 #include <rte_config.h> 15 #include <rte_eal.h> 16 #include <rte_errno.h> 17 #include <rte_vfio.h> 18 19 #define SPDK_ENV_DPDK_DEFAULT_NAME "spdk" 20 #define SPDK_ENV_DPDK_DEFAULT_SHM_ID -1 21 #define SPDK_ENV_DPDK_DEFAULT_MEM_SIZE -1 22 #define SPDK_ENV_DPDK_DEFAULT_MAIN_CORE -1 23 #define SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL -1 24 #define SPDK_ENV_DPDK_DEFAULT_CORE_MASK "0x1" 25 #define SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR 0x200000000000 26 27 #define DPDK_ALLOW_PARAM "--allow" 28 #define DPDK_BLOCK_PARAM "--block" 29 #define DPDK_MAIN_CORE_PARAM "--main-lcore" 30 31 static char **g_eal_cmdline; 32 static int g_eal_cmdline_argcount; 33 static bool g_external_init = true; 34 35 static char * 36 _sprintf_alloc(const char *format, ...) 37 { 38 va_list args; 39 va_list args_copy; 40 char *buf; 41 size_t bufsize; 42 int rc; 43 44 va_start(args, format); 45 46 /* Try with a small buffer first. */ 47 bufsize = 32; 48 49 /* Limit maximum buffer size to something reasonable so we don't loop forever. */ 50 while (bufsize <= 1024 * 1024) { 51 buf = malloc(bufsize); 52 if (buf == NULL) { 53 va_end(args); 54 return NULL; 55 } 56 57 va_copy(args_copy, args); 58 rc = vsnprintf(buf, bufsize, format, args_copy); 59 va_end(args_copy); 60 61 /* 62 * If vsnprintf() returned a count within our current buffer size, we are done. 63 * The count does not include the \0 terminator, so rc == bufsize is not OK. 64 */ 65 if (rc >= 0 && (size_t)rc < bufsize) { 66 va_end(args); 67 return buf; 68 } 69 70 /* 71 * vsnprintf() should return the required space, but some libc versions do not 72 * implement this correctly, so just double the buffer size and try again. 73 * 74 * We don't need the data in buf, so rather than realloc(), use free() and malloc() 75 * again to avoid a copy. 76 */ 77 free(buf); 78 bufsize *= 2; 79 } 80 81 va_end(args); 82 return NULL; 83 } 84 85 void 86 spdk_env_opts_init(struct spdk_env_opts *opts) 87 { 88 if (!opts) { 89 return; 90 } 91 92 memset(opts, 0, sizeof(*opts)); 93 94 opts->name = SPDK_ENV_DPDK_DEFAULT_NAME; 95 opts->core_mask = SPDK_ENV_DPDK_DEFAULT_CORE_MASK; 96 opts->shm_id = SPDK_ENV_DPDK_DEFAULT_SHM_ID; 97 opts->mem_size = SPDK_ENV_DPDK_DEFAULT_MEM_SIZE; 98 opts->main_core = SPDK_ENV_DPDK_DEFAULT_MAIN_CORE; 99 opts->mem_channel = SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL; 100 opts->base_virtaddr = SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR; 101 } 102 103 static void 104 free_args(char **args, int argcount) 105 { 106 int i; 107 108 if (args == NULL) { 109 return; 110 } 111 112 for (i = 0; i < argcount; i++) { 113 free(args[i]); 114 } 115 116 if (argcount) { 117 free(args); 118 } 119 } 120 121 static char ** 122 push_arg(char *args[], int *argcount, char *arg) 123 { 124 char **tmp; 125 126 if (arg == NULL) { 127 SPDK_ERRLOG("%s: NULL arg supplied\n", __func__); 128 free_args(args, *argcount); 129 return NULL; 130 } 131 132 tmp = realloc(args, sizeof(char *) * (*argcount + 1)); 133 if (tmp == NULL) { 134 free(arg); 135 free_args(args, *argcount); 136 return NULL; 137 } 138 139 tmp[*argcount] = arg; 140 (*argcount)++; 141 142 return tmp; 143 } 144 145 #if defined(__linux__) && defined(__x86_64__) 146 147 /* TODO: Can likely get this value from rlimits in the future */ 148 #define SPDK_IOMMU_VA_REQUIRED_WIDTH 48 149 #define VTD_CAP_MGAW_SHIFT 16 150 #define VTD_CAP_MGAW_MASK (0x3F << VTD_CAP_MGAW_SHIFT) 151 #define RD_AMD_CAP_VASIZE_SHIFT 15 152 #define RD_AMD_CAP_VASIZE_MASK (0x7F << RD_AMD_CAP_VASIZE_SHIFT) 153 154 static int 155 get_iommu_width(void) 156 { 157 int width = 0; 158 glob_t glob_results = {}; 159 160 /* Break * and / into separate strings to appease check_format.sh comment style check. */ 161 glob("/sys/devices/virtual/iommu/dmar*" "/intel-iommu/cap", 0, NULL, &glob_results); 162 glob("/sys/class/iommu/ivhd*" "/amd-iommu/cap", GLOB_APPEND, NULL, &glob_results); 163 164 for (size_t i = 0; i < glob_results.gl_pathc; i++) { 165 const char *filename = glob_results.gl_pathv[0]; 166 FILE *file = fopen(filename, "r"); 167 uint64_t cap_reg = 0; 168 169 if (file == NULL) { 170 continue; 171 } 172 173 if (fscanf(file, "%" PRIx64, &cap_reg) == 1) { 174 if (strstr(filename, "intel-iommu") != NULL) { 175 /* We have an Intel IOMMU */ 176 int mgaw = ((cap_reg & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1; 177 178 if (width == 0 || (mgaw > 0 && mgaw < width)) { 179 width = mgaw; 180 } 181 } else if (strstr(filename, "amd-iommu") != NULL) { 182 /* We have an AMD IOMMU */ 183 int mgaw = ((cap_reg & RD_AMD_CAP_VASIZE_MASK) >> RD_AMD_CAP_VASIZE_SHIFT) + 1; 184 185 if (width == 0 || (mgaw > 0 && mgaw < width)) { 186 width = mgaw; 187 } 188 } 189 } 190 191 fclose(file); 192 } 193 194 globfree(&glob_results); 195 return width; 196 } 197 198 #endif 199 200 static int 201 build_eal_cmdline(const struct spdk_env_opts *opts) 202 { 203 int argcount = 0; 204 char **args; 205 206 args = NULL; 207 208 /* set the program name */ 209 args = push_arg(args, &argcount, _sprintf_alloc("%s", opts->name)); 210 if (args == NULL) { 211 return -1; 212 } 213 214 /* disable shared configuration files when in single process mode. This allows for cleaner shutdown */ 215 if (opts->shm_id < 0) { 216 args = push_arg(args, &argcount, _sprintf_alloc("%s", "--no-shconf")); 217 if (args == NULL) { 218 return -1; 219 } 220 } 221 222 /* Either lcore_map or core_mask must be set. If both, or none specified, fail */ 223 if ((opts->core_mask == NULL) == (opts->lcore_map == NULL)) { 224 if (opts->core_mask && opts->lcore_map) { 225 fprintf(stderr, 226 "Both, lcore map and core mask are provided, while only one can be set\n"); 227 } else { 228 fprintf(stderr, "Core mask or lcore map must be specified\n"); 229 } 230 free_args(args, argcount); 231 return -1; 232 } 233 234 if (opts->lcore_map) { 235 /* If lcore list is set, generate --lcores parameter */ 236 args = push_arg(args, &argcount, _sprintf_alloc("--lcores=%s", opts->lcore_map)); 237 } else if (opts->core_mask[0] == '-') { 238 /* 239 * Set the coremask: 240 * 241 * - if it starts with '-', we presume it's literal EAL arguments such 242 * as --lcores. 243 * 244 * - if it starts with '[', we presume it's a core list to use with the 245 * -l option. 246 * 247 * - otherwise, it's a CPU mask of the form "0xff.." as expected by the 248 * -c option. 249 */ 250 args = push_arg(args, &argcount, _sprintf_alloc("%s", opts->core_mask)); 251 } else if (opts->core_mask[0] == '[') { 252 char *l_arg = _sprintf_alloc("-l %s", opts->core_mask + 1); 253 254 if (l_arg != NULL) { 255 int len = strlen(l_arg); 256 257 if (l_arg[len - 1] == ']') { 258 l_arg[len - 1] = '\0'; 259 } 260 } 261 args = push_arg(args, &argcount, l_arg); 262 } else { 263 args = push_arg(args, &argcount, _sprintf_alloc("-c %s", opts->core_mask)); 264 } 265 266 if (args == NULL) { 267 return -1; 268 } 269 270 /* set the memory channel number */ 271 if (opts->mem_channel > 0) { 272 args = push_arg(args, &argcount, _sprintf_alloc("-n %d", opts->mem_channel)); 273 if (args == NULL) { 274 return -1; 275 } 276 } 277 278 /* set the memory size */ 279 if (opts->mem_size >= 0) { 280 args = push_arg(args, &argcount, _sprintf_alloc("-m %d", opts->mem_size)); 281 if (args == NULL) { 282 return -1; 283 } 284 } 285 286 /* set the main core */ 287 if (opts->main_core > 0) { 288 args = push_arg(args, &argcount, _sprintf_alloc("%s=%d", 289 DPDK_MAIN_CORE_PARAM, opts->main_core)); 290 if (args == NULL) { 291 return -1; 292 } 293 } 294 295 /* set no pci if enabled */ 296 if (opts->no_pci) { 297 args = push_arg(args, &argcount, _sprintf_alloc("--no-pci")); 298 if (args == NULL) { 299 return -1; 300 } 301 } 302 303 if (opts->env_context && strstr(opts->env_context, "--no-huge") != NULL) { 304 if (opts->hugepage_single_segments || opts->unlink_hugepage || opts->hugedir) { 305 fprintf(stderr, "--no-huge invalid with other hugepage options\n"); 306 free_args(args, argcount); 307 return -1; 308 } 309 } else { 310 /* create just one hugetlbfs file */ 311 if (opts->hugepage_single_segments) { 312 args = push_arg(args, &argcount, _sprintf_alloc("--single-file-segments")); 313 if (args == NULL) { 314 return -1; 315 } 316 } 317 318 /* unlink hugepages after initialization */ 319 /* Note: Automatically unlink hugepage when shm_id < 0, since it means we're not using 320 * multi-process so we don't need the hugepage links anymore. But we need to make sure 321 * we don't specify --huge-unlink implicitly if --single-file-segments was specified since 322 * DPDK doesn't support that. 323 */ 324 if (opts->unlink_hugepage || 325 (opts->shm_id < 0 && !opts->hugepage_single_segments)) { 326 args = push_arg(args, &argcount, _sprintf_alloc("--huge-unlink")); 327 if (args == NULL) { 328 return -1; 329 } 330 } 331 332 /* use a specific hugetlbfs mount */ 333 if (opts->hugedir) { 334 args = push_arg(args, &argcount, _sprintf_alloc("--huge-dir=%s", opts->hugedir)); 335 if (args == NULL) { 336 return -1; 337 } 338 } 339 } 340 341 if (opts->num_pci_addr) { 342 size_t i; 343 char bdf[32]; 344 struct spdk_pci_addr *pci_addr = 345 opts->pci_blocked ? opts->pci_blocked : opts->pci_allowed; 346 347 for (i = 0; i < opts->num_pci_addr; i++) { 348 spdk_pci_addr_fmt(bdf, 32, &pci_addr[i]); 349 args = push_arg(args, &argcount, _sprintf_alloc("%s=%s", 350 (opts->pci_blocked ? DPDK_BLOCK_PARAM : DPDK_ALLOW_PARAM), 351 bdf)); 352 if (args == NULL) { 353 return -1; 354 } 355 } 356 } 357 358 /* Disable DPDK telemetry information by default, can be modified with env_context. 359 * Prevents creation of dpdk_telemetry socket and additional pthread for it. 360 */ 361 args = push_arg(args, &argcount, _sprintf_alloc("--no-telemetry")); 362 if (args == NULL) { 363 return -1; 364 } 365 366 /* Lower default EAL loglevel to RTE_LOG_NOTICE - normal, but significant messages. 367 * This can be overridden by specifying the same option in opts->env_context 368 */ 369 args = push_arg(args, &argcount, strdup("--log-level=lib.eal:6")); 370 if (args == NULL) { 371 return -1; 372 } 373 374 /* Lower default CRYPTO loglevel to RTE_LOG_ERR to avoid a ton of init msgs. 375 * This can be overridden by specifying the same option in opts->env_context 376 */ 377 args = push_arg(args, &argcount, strdup("--log-level=lib.cryptodev:5")); 378 if (args == NULL) { 379 return -1; 380 } 381 382 /* `user1` log type is used by rte_vhost, which prints an INFO log for each received 383 * vhost user message. We don't want that. The same log type is also used by a couple 384 * of other DPDK libs, but none of which we make use right now. If necessary, this can 385 * be overridden via opts->env_context. 386 */ 387 args = push_arg(args, &argcount, strdup("--log-level=user1:6")); 388 if (args == NULL) { 389 return -1; 390 } 391 392 if (opts->env_context) { 393 char *ptr = strdup(opts->env_context); 394 char *tok = strtok(ptr, " \t"); 395 396 /* DPDK expects each argument as a separate string in the argv 397 * array, so we need to tokenize here in case the caller 398 * passed multiple arguments in the env_context string. 399 */ 400 while (tok != NULL) { 401 args = push_arg(args, &argcount, strdup(tok)); 402 tok = strtok(NULL, " \t"); 403 } 404 405 free(ptr); 406 } 407 408 #ifdef __linux__ 409 410 if (opts->iova_mode) { 411 args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=%s", opts->iova_mode)); 412 if (args == NULL) { 413 return -1; 414 } 415 } else { 416 /* When using vfio with enable_unsafe_noiommu_mode=Y, we need iova-mode=pa, 417 * but DPDK guesses it should be iova-mode=va. Add a check and force 418 * iova-mode=pa here. */ 419 if (rte_vfio_noiommu_is_enabled()) { 420 args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); 421 if (args == NULL) { 422 return -1; 423 } 424 } 425 426 #if defined(__x86_64__) 427 /* DPDK by default guesses that it should be using iova-mode=va so that it can 428 * support running as an unprivileged user. However, some systems (especially 429 * virtual machines) don't have an IOMMU capable of handling the full virtual 430 * address space and DPDK doesn't currently catch that. Add a check in SPDK 431 * and force iova-mode=pa here. */ 432 if (get_iommu_width() < SPDK_IOMMU_VA_REQUIRED_WIDTH) { 433 args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); 434 if (args == NULL) { 435 return -1; 436 } 437 } 438 #elif defined(__PPC64__) 439 /* On Linux + PowerPC, DPDK doesn't support VA mode at all. Unfortunately, it doesn't correctly 440 * auto-detect at the moment, so we'll just force it here. */ 441 args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); 442 if (args == NULL) { 443 return -1; 444 } 445 #endif 446 } 447 448 449 /* Set the base virtual address - it must be an address that is not in the 450 * ASAN shadow region, otherwise ASAN-enabled builds will ignore the 451 * mmap hint. 452 * 453 * Ref: https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm 454 */ 455 args = push_arg(args, &argcount, _sprintf_alloc("--base-virtaddr=0x%" PRIx64, opts->base_virtaddr)); 456 if (args == NULL) { 457 return -1; 458 } 459 460 /* --match-allocation prevents DPDK from merging or splitting system memory allocations under the hood. 461 * This is critical for RDMA when attempting to use an rte_mempool based buffer pool. If DPDK merges two 462 * physically or IOVA contiguous memory regions, then when we go to allocate a buffer pool, it can split 463 * the memory for a buffer over two allocations meaning the buffer will be split over a memory region. 464 */ 465 if (!opts->env_context || strstr(opts->env_context, "--legacy-mem") == NULL) { 466 args = push_arg(args, &argcount, _sprintf_alloc("%s", "--match-allocations")); 467 if (args == NULL) { 468 return -1; 469 } 470 } 471 472 if (opts->shm_id < 0) { 473 args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk_pid%d", 474 getpid())); 475 if (args == NULL) { 476 return -1; 477 } 478 } else { 479 args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk%d", 480 opts->shm_id)); 481 if (args == NULL) { 482 return -1; 483 } 484 485 /* set the process type */ 486 args = push_arg(args, &argcount, _sprintf_alloc("--proc-type=auto")); 487 if (args == NULL) { 488 return -1; 489 } 490 } 491 492 /* --vfio-vf-token used for VF initialized by vfio_pci driver. */ 493 if (opts->vf_token) { 494 args = push_arg(args, &argcount, _sprintf_alloc("--vfio-vf-token=%s", 495 opts->vf_token)); 496 if (args == NULL) { 497 return -1; 498 } 499 } 500 #endif 501 502 g_eal_cmdline = args; 503 g_eal_cmdline_argcount = argcount; 504 return argcount; 505 } 506 507 int 508 spdk_env_dpdk_post_init(bool legacy_mem) 509 { 510 int rc; 511 512 rc = pci_env_init(); 513 if (rc < 0) { 514 SPDK_ERRLOG("pci_env_init() failed\n"); 515 return rc; 516 } 517 518 rc = mem_map_init(legacy_mem); 519 if (rc < 0) { 520 SPDK_ERRLOG("Failed to allocate mem_map\n"); 521 return rc; 522 } 523 524 rc = vtophys_init(); 525 if (rc < 0) { 526 SPDK_ERRLOG("Failed to initialize vtophys\n"); 527 return rc; 528 } 529 530 return 0; 531 } 532 533 void 534 spdk_env_dpdk_post_fini(void) 535 { 536 pci_env_fini(); 537 538 free_args(g_eal_cmdline, g_eal_cmdline_argcount); 539 g_eal_cmdline = NULL; 540 g_eal_cmdline_argcount = 0; 541 } 542 543 int 544 spdk_env_init(const struct spdk_env_opts *opts) 545 { 546 char **dpdk_args = NULL; 547 char *args_print = NULL, *args_tmp = NULL; 548 int i, rc; 549 int orig_optind; 550 bool legacy_mem; 551 552 /* If SPDK env has been initialized before, then only pci env requires 553 * reinitialization. 554 */ 555 if (g_external_init == false) { 556 if (opts != NULL) { 557 fprintf(stderr, "Invalid arguments to reinitialize SPDK env\n"); 558 return -EINVAL; 559 } 560 561 printf("Starting %s / %s reinitialization...\n", SPDK_VERSION_STRING, rte_version()); 562 pci_env_reinit(); 563 564 return 0; 565 } 566 567 if (opts == NULL) { 568 fprintf(stderr, "NULL arguments to initialize DPDK\n"); 569 return -EINVAL; 570 } 571 572 rc = build_eal_cmdline(opts); 573 if (rc < 0) { 574 SPDK_ERRLOG("Invalid arguments to initialize DPDK\n"); 575 return -EINVAL; 576 } 577 578 SPDK_PRINTF("Starting %s / %s initialization...\n", SPDK_VERSION_STRING, rte_version()); 579 580 args_print = _sprintf_alloc("[ DPDK EAL parameters: "); 581 if (args_print == NULL) { 582 return -ENOMEM; 583 } 584 for (i = 0; i < g_eal_cmdline_argcount; i++) { 585 args_tmp = args_print; 586 args_print = _sprintf_alloc("%s%s ", args_tmp, g_eal_cmdline[i]); 587 if (args_print == NULL) { 588 free(args_tmp); 589 return -ENOMEM; 590 } 591 free(args_tmp); 592 } 593 SPDK_PRINTF("%s]\n", args_print); 594 free(args_print); 595 596 /* DPDK rearranges the array we pass to it, so make a copy 597 * before passing so we can still free the individual strings 598 * correctly. 599 */ 600 dpdk_args = calloc(g_eal_cmdline_argcount, sizeof(char *)); 601 if (dpdk_args == NULL) { 602 SPDK_ERRLOG("Failed to allocate dpdk_args\n"); 603 return -ENOMEM; 604 } 605 memcpy(dpdk_args, g_eal_cmdline, sizeof(char *) * g_eal_cmdline_argcount); 606 607 fflush(stdout); 608 orig_optind = optind; 609 optind = 1; 610 rc = rte_eal_init(g_eal_cmdline_argcount, dpdk_args); 611 optind = orig_optind; 612 613 free(dpdk_args); 614 615 if (rc < 0) { 616 if (rte_errno == EALREADY) { 617 SPDK_ERRLOG("DPDK already initialized\n"); 618 } else { 619 SPDK_ERRLOG("Failed to initialize DPDK\n"); 620 } 621 return -rte_errno; 622 } 623 624 legacy_mem = false; 625 if (opts->env_context && strstr(opts->env_context, "--legacy-mem") != NULL) { 626 legacy_mem = true; 627 } 628 629 rc = spdk_env_dpdk_post_init(legacy_mem); 630 if (rc == 0) { 631 g_external_init = false; 632 } 633 634 return rc; 635 } 636 637 /* We use priority 101 which is the highest priority level available 638 * to applications (the toolchains reserve 1 to 100 for internal usage). 639 * This ensures this destructor runs last, after any other destructors 640 * that might still need the environment up and running. 641 */ 642 __attribute__((destructor(101))) static void 643 dpdk_cleanup(void) 644 { 645 /* Only call rte_eal_cleanup if the SPDK env library called rte_eal_init. */ 646 if (!g_external_init) { 647 rte_eal_cleanup(); 648 } 649 } 650 651 void 652 spdk_env_fini(void) 653 { 654 spdk_env_dpdk_post_fini(); 655 } 656 657 bool 658 spdk_env_dpdk_external_init(void) 659 { 660 return g_external_init; 661 } 662