1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2017 Intel Corporation. 3 * All rights reserved. 4 */ 5 6 #include "spdk/stdinc.h" 7 8 #include "env_internal.h" 9 10 #include "spdk/version.h" 11 #include "spdk/env_dpdk.h" 12 #include "spdk/log.h" 13 14 #include <rte_config.h> 15 #include <rte_eal.h> 16 #include <rte_errno.h> 17 #include <rte_vfio.h> 18 19 #define SPDK_ENV_DPDK_DEFAULT_NAME "spdk" 20 #define SPDK_ENV_DPDK_DEFAULT_SHM_ID -1 21 #define SPDK_ENV_DPDK_DEFAULT_MEM_SIZE -1 22 #define SPDK_ENV_DPDK_DEFAULT_MAIN_CORE -1 23 #define SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL -1 24 #define SPDK_ENV_DPDK_DEFAULT_CORE_MASK "0x1" 25 #define SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR 0x200000000000 26 27 #if RTE_VERSION < RTE_VERSION_NUM(20, 11, 0, 0) 28 #define DPDK_ALLOW_PARAM "--pci-whitelist" 29 #define DPDK_BLOCK_PARAM "--pci-blacklist" 30 #define DPDK_MAIN_CORE_PARAM "--master-lcore" 31 #else 32 #define DPDK_ALLOW_PARAM "--allow" 33 #define DPDK_BLOCK_PARAM "--block" 34 #define DPDK_MAIN_CORE_PARAM "--main-lcore" 35 #endif 36 37 static char **g_eal_cmdline; 38 static int g_eal_cmdline_argcount; 39 static bool g_external_init = true; 40 41 static char * 42 _sprintf_alloc(const char *format, ...) 43 { 44 va_list args; 45 va_list args_copy; 46 char *buf; 47 size_t bufsize; 48 int rc; 49 50 va_start(args, format); 51 52 /* Try with a small buffer first. */ 53 bufsize = 32; 54 55 /* Limit maximum buffer size to something reasonable so we don't loop forever. */ 56 while (bufsize <= 1024 * 1024) { 57 buf = malloc(bufsize); 58 if (buf == NULL) { 59 va_end(args); 60 return NULL; 61 } 62 63 va_copy(args_copy, args); 64 rc = vsnprintf(buf, bufsize, format, args_copy); 65 va_end(args_copy); 66 67 /* 68 * If vsnprintf() returned a count within our current buffer size, we are done. 69 * The count does not include the \0 terminator, so rc == bufsize is not OK. 70 */ 71 if (rc >= 0 && (size_t)rc < bufsize) { 72 va_end(args); 73 return buf; 74 } 75 76 /* 77 * vsnprintf() should return the required space, but some libc versions do not 78 * implement this correctly, so just double the buffer size and try again. 79 * 80 * We don't need the data in buf, so rather than realloc(), use free() and malloc() 81 * again to avoid a copy. 82 */ 83 free(buf); 84 bufsize *= 2; 85 } 86 87 va_end(args); 88 return NULL; 89 } 90 91 void 92 spdk_env_opts_init(struct spdk_env_opts *opts) 93 { 94 if (!opts) { 95 return; 96 } 97 98 memset(opts, 0, sizeof(*opts)); 99 100 opts->name = SPDK_ENV_DPDK_DEFAULT_NAME; 101 opts->core_mask = SPDK_ENV_DPDK_DEFAULT_CORE_MASK; 102 opts->shm_id = SPDK_ENV_DPDK_DEFAULT_SHM_ID; 103 opts->mem_size = SPDK_ENV_DPDK_DEFAULT_MEM_SIZE; 104 opts->main_core = SPDK_ENV_DPDK_DEFAULT_MAIN_CORE; 105 opts->mem_channel = SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL; 106 opts->base_virtaddr = SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR; 107 } 108 109 static void 110 free_args(char **args, int argcount) 111 { 112 int i; 113 114 if (args == NULL) { 115 return; 116 } 117 118 for (i = 0; i < argcount; i++) { 119 free(args[i]); 120 } 121 122 if (argcount) { 123 free(args); 124 } 125 } 126 127 static char ** 128 push_arg(char *args[], int *argcount, char *arg) 129 { 130 char **tmp; 131 132 if (arg == NULL) { 133 SPDK_ERRLOG("%s: NULL arg supplied\n", __func__); 134 free_args(args, *argcount); 135 return NULL; 136 } 137 138 tmp = realloc(args, sizeof(char *) * (*argcount + 1)); 139 if (tmp == NULL) { 140 free(arg); 141 free_args(args, *argcount); 142 return NULL; 143 } 144 145 tmp[*argcount] = arg; 146 (*argcount)++; 147 148 return tmp; 149 } 150 151 #if defined(__linux__) && defined(__x86_64__) 152 153 /* TODO: Can likely get this value from rlimits in the future */ 154 #define SPDK_IOMMU_VA_REQUIRED_WIDTH 48 155 #define VTD_CAP_MGAW_SHIFT 16 156 #define VTD_CAP_MGAW_MASK (0x3F << VTD_CAP_MGAW_SHIFT) 157 #define RD_AMD_CAP_VASIZE_SHIFT 15 158 #define RD_AMD_CAP_VASIZE_MASK (0x7F << RD_AMD_CAP_VASIZE_SHIFT) 159 160 static int 161 get_amd_iommu_width(void) 162 { 163 FILE *file; 164 char buf[64]; 165 char *end; 166 long long int amd_cap; 167 168 file = fopen("/sys/class/iommu/ivhd2/amd-iommu/cap", "r"); 169 if (file == NULL) { 170 return 0; 171 } 172 173 if (fgets(buf, sizeof(buf), file) == NULL) { 174 fclose(file); 175 return 0; 176 } 177 178 amd_cap = strtoll(buf, &end, 16); 179 if (amd_cap == LLONG_MIN || amd_cap == LLONG_MAX) { 180 fclose(file); 181 return 0; 182 } 183 184 fclose(file); 185 return (amd_cap & RD_AMD_CAP_VASIZE_MASK) >> RD_AMD_CAP_VASIZE_SHIFT; 186 } 187 188 static int 189 get_iommu_width(void) 190 { 191 DIR *dir; 192 FILE *file; 193 struct dirent *entry; 194 char mgaw_path[64]; 195 char buf[64]; 196 char *end; 197 long long int val; 198 int width, tmp; 199 struct stat s; 200 201 if (stat("/sys/class/iommu/ivhd2/amd-iommu", &s) == 0) { 202 return get_amd_iommu_width(); 203 } 204 205 dir = opendir("/sys/devices/virtual/iommu/"); 206 if (dir == NULL) { 207 return -EINVAL; 208 } 209 210 width = 0; 211 212 while ((entry = readdir(dir)) != NULL) { 213 /* Find directories named "dmar0", "dmar1", etc */ 214 if (strncmp(entry->d_name, "dmar", sizeof("dmar") - 1) != 0) { 215 continue; 216 } 217 218 tmp = snprintf(mgaw_path, sizeof(mgaw_path), "/sys/devices/virtual/iommu/%s/intel-iommu/cap", 219 entry->d_name); 220 if ((unsigned)tmp >= sizeof(mgaw_path)) { 221 continue; 222 } 223 224 file = fopen(mgaw_path, "r"); 225 if (file == NULL) { 226 continue; 227 } 228 229 if (fgets(buf, sizeof(buf), file) == NULL) { 230 fclose(file); 231 continue; 232 } 233 234 val = strtoll(buf, &end, 16); 235 if (val == LLONG_MIN || val == LLONG_MAX) { 236 fclose(file); 237 continue; 238 } 239 240 tmp = ((val & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1; 241 if (width == 0 || tmp < width) { 242 width = tmp; 243 } 244 245 fclose(file); 246 } 247 248 closedir(dir); 249 250 return width; 251 } 252 253 #endif 254 255 static int 256 build_eal_cmdline(const struct spdk_env_opts *opts) 257 { 258 int argcount = 0; 259 char **args; 260 261 args = NULL; 262 263 /* set the program name */ 264 args = push_arg(args, &argcount, _sprintf_alloc("%s", opts->name)); 265 if (args == NULL) { 266 return -1; 267 } 268 269 /* disable shared configuration files when in single process mode. This allows for cleaner shutdown */ 270 if (opts->shm_id < 0) { 271 args = push_arg(args, &argcount, _sprintf_alloc("%s", "--no-shconf")); 272 if (args == NULL) { 273 return -1; 274 } 275 } 276 277 /* 278 * Set the coremask: 279 * 280 * - if it starts with '-', we presume it's literal EAL arguments such 281 * as --lcores. 282 * 283 * - if it starts with '[', we presume it's a core list to use with the 284 * -l option. 285 * 286 * - otherwise, it's a CPU mask of the form "0xff.." as expected by the 287 * -c option. 288 */ 289 if (opts->core_mask[0] == '-') { 290 args = push_arg(args, &argcount, _sprintf_alloc("%s", opts->core_mask)); 291 } else if (opts->core_mask[0] == '[') { 292 char *l_arg = _sprintf_alloc("-l %s", opts->core_mask + 1); 293 294 if (l_arg != NULL) { 295 int len = strlen(l_arg); 296 297 if (l_arg[len - 1] == ']') { 298 l_arg[len - 1] = '\0'; 299 } 300 } 301 args = push_arg(args, &argcount, l_arg); 302 } else { 303 args = push_arg(args, &argcount, _sprintf_alloc("-c %s", opts->core_mask)); 304 } 305 306 if (args == NULL) { 307 return -1; 308 } 309 310 /* set the memory channel number */ 311 if (opts->mem_channel > 0) { 312 args = push_arg(args, &argcount, _sprintf_alloc("-n %d", opts->mem_channel)); 313 if (args == NULL) { 314 return -1; 315 } 316 } 317 318 /* set the memory size */ 319 if (opts->mem_size >= 0) { 320 args = push_arg(args, &argcount, _sprintf_alloc("-m %d", opts->mem_size)); 321 if (args == NULL) { 322 return -1; 323 } 324 } 325 326 /* set the main core */ 327 if (opts->main_core > 0) { 328 args = push_arg(args, &argcount, _sprintf_alloc("%s=%d", 329 DPDK_MAIN_CORE_PARAM, opts->main_core)); 330 if (args == NULL) { 331 return -1; 332 } 333 } 334 335 /* set no pci if enabled */ 336 if (opts->no_pci) { 337 args = push_arg(args, &argcount, _sprintf_alloc("--no-pci")); 338 if (args == NULL) { 339 return -1; 340 } 341 } 342 343 /* create just one hugetlbfs file */ 344 if (opts->hugepage_single_segments) { 345 args = push_arg(args, &argcount, _sprintf_alloc("--single-file-segments")); 346 if (args == NULL) { 347 return -1; 348 } 349 } 350 351 /* unlink hugepages after initialization */ 352 /* Note: Automatically unlink hugepage when shm_id < 0, since it means we're not using 353 * multi-process so we don't need the hugepage links anymore. But we need to make sure 354 * we don't specify --huge-unlink implicitly if --single-file-segments was specified since 355 * DPDK doesn't support that. 356 */ 357 if (opts->unlink_hugepage || 358 (opts->shm_id < 0 && !opts->hugepage_single_segments)) { 359 args = push_arg(args, &argcount, _sprintf_alloc("--huge-unlink")); 360 if (args == NULL) { 361 return -1; 362 } 363 } 364 365 /* use a specific hugetlbfs mount */ 366 if (opts->hugedir) { 367 args = push_arg(args, &argcount, _sprintf_alloc("--huge-dir=%s", opts->hugedir)); 368 if (args == NULL) { 369 return -1; 370 } 371 } 372 373 if (opts->num_pci_addr) { 374 size_t i; 375 char bdf[32]; 376 struct spdk_pci_addr *pci_addr = 377 opts->pci_blocked ? opts->pci_blocked : opts->pci_allowed; 378 379 for (i = 0; i < opts->num_pci_addr; i++) { 380 spdk_pci_addr_fmt(bdf, 32, &pci_addr[i]); 381 args = push_arg(args, &argcount, _sprintf_alloc("%s=%s", 382 (opts->pci_blocked ? DPDK_BLOCK_PARAM : DPDK_ALLOW_PARAM), 383 bdf)); 384 if (args == NULL) { 385 return -1; 386 } 387 } 388 } 389 390 /* Lower default EAL loglevel to RTE_LOG_NOTICE - normal, but significant messages. 391 * This can be overridden by specifying the same option in opts->env_context 392 */ 393 args = push_arg(args, &argcount, strdup("--log-level=lib.eal:6")); 394 if (args == NULL) { 395 return -1; 396 } 397 398 /* Lower default CRYPTO loglevel to RTE_LOG_ERR to avoid a ton of init msgs. 399 * This can be overridden by specifying the same option in opts->env_context 400 */ 401 args = push_arg(args, &argcount, strdup("--log-level=lib.cryptodev:5")); 402 if (args == NULL) { 403 return -1; 404 } 405 406 /* `user1` log type is used by rte_vhost, which prints an INFO log for each received 407 * vhost user message. We don't want that. The same log type is also used by a couple 408 * of other DPDK libs, but none of which we make use right now. If necessary, this can 409 * be overridden via opts->env_context. 410 */ 411 args = push_arg(args, &argcount, strdup("--log-level=user1:6")); 412 if (args == NULL) { 413 return -1; 414 } 415 416 if (opts->env_context) { 417 char *ptr = strdup(opts->env_context); 418 char *tok = strtok(ptr, " \t"); 419 420 /* DPDK expects each argument as a separate string in the argv 421 * array, so we need to tokenize here in case the caller 422 * passed multiple arguments in the env_context string. 423 */ 424 while (tok != NULL) { 425 args = push_arg(args, &argcount, strdup(tok)); 426 tok = strtok(NULL, " \t"); 427 } 428 429 free(ptr); 430 } 431 432 #ifdef __linux__ 433 434 if (opts->iova_mode) { 435 args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=%s", opts->iova_mode)); 436 if (args == NULL) { 437 return -1; 438 } 439 } else { 440 /* When using vfio with enable_unsafe_noiommu_mode=Y, we need iova-mode=pa, 441 * but DPDK guesses it should be iova-mode=va. Add a check and force 442 * iova-mode=pa here. */ 443 if (rte_vfio_noiommu_is_enabled()) { 444 args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); 445 if (args == NULL) { 446 return -1; 447 } 448 } 449 450 #if defined(__x86_64__) 451 /* DPDK by default guesses that it should be using iova-mode=va so that it can 452 * support running as an unprivileged user. However, some systems (especially 453 * virtual machines) don't have an IOMMU capable of handling the full virtual 454 * address space and DPDK doesn't currently catch that. Add a check in SPDK 455 * and force iova-mode=pa here. */ 456 if (get_iommu_width() < SPDK_IOMMU_VA_REQUIRED_WIDTH) { 457 args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); 458 if (args == NULL) { 459 return -1; 460 } 461 } 462 #elif defined(__PPC64__) 463 /* On Linux + PowerPC, DPDK doesn't support VA mode at all. Unfortunately, it doesn't correctly 464 * auto-detect at the moment, so we'll just force it here. */ 465 args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); 466 if (args == NULL) { 467 return -1; 468 } 469 #endif 470 } 471 472 473 /* Set the base virtual address - it must be an address that is not in the 474 * ASAN shadow region, otherwise ASAN-enabled builds will ignore the 475 * mmap hint. 476 * 477 * Ref: https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm 478 */ 479 args = push_arg(args, &argcount, _sprintf_alloc("--base-virtaddr=0x%" PRIx64, opts->base_virtaddr)); 480 if (args == NULL) { 481 return -1; 482 } 483 484 /* --match-allocation prevents DPDK from merging or splitting system memory allocations under the hood. 485 * This is critical for RDMA when attempting to use an rte_mempool based buffer pool. If DPDK merges two 486 * physically or IOVA contiguous memory regions, then when we go to allocate a buffer pool, it can split 487 * the memory for a buffer over two allocations meaning the buffer will be split over a memory region. 488 */ 489 if (!opts->env_context || strstr(opts->env_context, "--legacy-mem") == NULL) { 490 args = push_arg(args, &argcount, _sprintf_alloc("%s", "--match-allocations")); 491 if (args == NULL) { 492 return -1; 493 } 494 } 495 496 if (opts->shm_id < 0) { 497 args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk_pid%d", 498 getpid())); 499 if (args == NULL) { 500 return -1; 501 } 502 } else { 503 args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk%d", 504 opts->shm_id)); 505 if (args == NULL) { 506 return -1; 507 } 508 509 /* set the process type */ 510 args = push_arg(args, &argcount, _sprintf_alloc("--proc-type=auto")); 511 if (args == NULL) { 512 return -1; 513 } 514 } 515 #endif 516 517 g_eal_cmdline = args; 518 g_eal_cmdline_argcount = argcount; 519 return argcount; 520 } 521 522 int 523 spdk_env_dpdk_post_init(bool legacy_mem) 524 { 525 int rc; 526 527 rc = pci_env_init(); 528 if (rc < 0) { 529 SPDK_ERRLOG("pci_env_init() failed\n"); 530 return rc; 531 } 532 533 rc = mem_map_init(legacy_mem); 534 if (rc < 0) { 535 SPDK_ERRLOG("Failed to allocate mem_map\n"); 536 return rc; 537 } 538 539 rc = vtophys_init(); 540 if (rc < 0) { 541 SPDK_ERRLOG("Failed to initialize vtophys\n"); 542 return rc; 543 } 544 545 return 0; 546 } 547 548 void 549 spdk_env_dpdk_post_fini(void) 550 { 551 pci_env_fini(); 552 553 free_args(g_eal_cmdline, g_eal_cmdline_argcount); 554 g_eal_cmdline = NULL; 555 g_eal_cmdline_argcount = 0; 556 } 557 558 int 559 spdk_env_init(const struct spdk_env_opts *opts) 560 { 561 char **dpdk_args = NULL; 562 int i, rc; 563 int orig_optind; 564 bool legacy_mem; 565 566 /* If SPDK env has been initialized before, then only pci env requires 567 * reinitialization. 568 */ 569 if (g_external_init == false) { 570 if (opts != NULL) { 571 fprintf(stderr, "Invalid arguments to reinitialize SPDK env\n"); 572 return -EINVAL; 573 } 574 575 printf("Starting %s / %s reinitialization...\n", SPDK_VERSION_STRING, rte_version()); 576 pci_env_reinit(); 577 578 return 0; 579 } 580 581 if (opts == NULL) { 582 fprintf(stderr, "NULL arguments to initialize DPDK\n"); 583 return -EINVAL; 584 } 585 586 rc = build_eal_cmdline(opts); 587 if (rc < 0) { 588 SPDK_ERRLOG("Invalid arguments to initialize DPDK\n"); 589 return -EINVAL; 590 } 591 592 SPDK_PRINTF("Starting %s / %s initialization...\n", SPDK_VERSION_STRING, rte_version()); 593 SPDK_PRINTF("[ DPDK EAL parameters: "); 594 for (i = 0; i < g_eal_cmdline_argcount; i++) { 595 SPDK_PRINTF("%s ", g_eal_cmdline[i]); 596 } 597 SPDK_PRINTF("]\n"); 598 599 /* DPDK rearranges the array we pass to it, so make a copy 600 * before passing so we can still free the individual strings 601 * correctly. 602 */ 603 dpdk_args = calloc(g_eal_cmdline_argcount, sizeof(char *)); 604 if (dpdk_args == NULL) { 605 SPDK_ERRLOG("Failed to allocate dpdk_args\n"); 606 return -ENOMEM; 607 } 608 memcpy(dpdk_args, g_eal_cmdline, sizeof(char *) * g_eal_cmdline_argcount); 609 610 fflush(stdout); 611 orig_optind = optind; 612 optind = 1; 613 rc = rte_eal_init(g_eal_cmdline_argcount, dpdk_args); 614 optind = orig_optind; 615 616 free(dpdk_args); 617 618 if (rc < 0) { 619 if (rte_errno == EALREADY) { 620 SPDK_ERRLOG("DPDK already initialized\n"); 621 } else { 622 SPDK_ERRLOG("Failed to initialize DPDK\n"); 623 } 624 return -rte_errno; 625 } 626 627 legacy_mem = false; 628 if (opts->env_context && strstr(opts->env_context, "--legacy-mem") != NULL) { 629 legacy_mem = true; 630 } 631 632 rc = spdk_env_dpdk_post_init(legacy_mem); 633 if (rc == 0) { 634 g_external_init = false; 635 } 636 637 return rc; 638 } 639 640 /* We use priority 101 which is the highest priority level available 641 * to applications (the toolchains reserve 1 to 100 for internal usage). 642 * This ensures this destructor runs last, after any other destructors 643 * that might still need the environment up and running. 644 */ 645 __attribute__((destructor(101))) static void 646 dpdk_cleanup(void) 647 { 648 /* Only call rte_eal_cleanup if the SPDK env library called rte_eal_init. */ 649 if (!g_external_init) { 650 rte_eal_cleanup(); 651 } 652 } 653 654 void 655 spdk_env_fini(void) 656 { 657 spdk_env_dpdk_post_fini(); 658 } 659 660 bool 661 spdk_env_dpdk_external_init(void) 662 { 663 return g_external_init; 664 } 665