1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include "spdk/stdinc.h" 35 36 #include "env_internal.h" 37 38 #include "spdk/version.h" 39 #include "spdk/env_dpdk.h" 40 #include "spdk/log.h" 41 42 #include <rte_config.h> 43 #include <rte_eal.h> 44 #include <rte_errno.h> 45 #include <rte_vfio.h> 46 47 #define SPDK_ENV_DPDK_DEFAULT_NAME "spdk" 48 #define SPDK_ENV_DPDK_DEFAULT_SHM_ID -1 49 #define SPDK_ENV_DPDK_DEFAULT_MEM_SIZE -1 50 #define SPDK_ENV_DPDK_DEFAULT_MAIN_CORE -1 51 #define SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL -1 52 #define SPDK_ENV_DPDK_DEFAULT_CORE_MASK "0x1" 53 #define SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR 0x200000000000 54 55 #if RTE_VERSION < RTE_VERSION_NUM(20, 11, 0, 0) 56 #define DPDK_ALLOW_PARAM "--pci-whitelist" 57 #define DPDK_BLOCK_PARAM "--pci-blacklist" 58 #define DPDK_MAIN_CORE_PARAM "--master-lcore" 59 #else 60 #define DPDK_ALLOW_PARAM "--allow" 61 #define DPDK_BLOCK_PARAM "--block" 62 #define DPDK_MAIN_CORE_PARAM "--main-lcore" 63 #endif 64 65 static char **g_eal_cmdline; 66 static int g_eal_cmdline_argcount; 67 static bool g_external_init = true; 68 69 static char * 70 _sprintf_alloc(const char *format, ...) 71 { 72 va_list args; 73 va_list args_copy; 74 char *buf; 75 size_t bufsize; 76 int rc; 77 78 va_start(args, format); 79 80 /* Try with a small buffer first. */ 81 bufsize = 32; 82 83 /* Limit maximum buffer size to something reasonable so we don't loop forever. */ 84 while (bufsize <= 1024 * 1024) { 85 buf = malloc(bufsize); 86 if (buf == NULL) { 87 va_end(args); 88 return NULL; 89 } 90 91 va_copy(args_copy, args); 92 rc = vsnprintf(buf, bufsize, format, args_copy); 93 va_end(args_copy); 94 95 /* 96 * If vsnprintf() returned a count within our current buffer size, we are done. 97 * The count does not include the \0 terminator, so rc == bufsize is not OK. 98 */ 99 if (rc >= 0 && (size_t)rc < bufsize) { 100 va_end(args); 101 return buf; 102 } 103 104 /* 105 * vsnprintf() should return the required space, but some libc versions do not 106 * implement this correctly, so just double the buffer size and try again. 107 * 108 * We don't need the data in buf, so rather than realloc(), use free() and malloc() 109 * again to avoid a copy. 110 */ 111 free(buf); 112 bufsize *= 2; 113 } 114 115 va_end(args); 116 return NULL; 117 } 118 119 void 120 spdk_env_opts_init(struct spdk_env_opts *opts) 121 { 122 if (!opts) { 123 return; 124 } 125 126 memset(opts, 0, sizeof(*opts)); 127 128 opts->name = SPDK_ENV_DPDK_DEFAULT_NAME; 129 opts->core_mask = SPDK_ENV_DPDK_DEFAULT_CORE_MASK; 130 opts->shm_id = SPDK_ENV_DPDK_DEFAULT_SHM_ID; 131 opts->mem_size = SPDK_ENV_DPDK_DEFAULT_MEM_SIZE; 132 opts->main_core = SPDK_ENV_DPDK_DEFAULT_MAIN_CORE; 133 opts->mem_channel = SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL; 134 opts->base_virtaddr = SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR; 135 } 136 137 static void 138 free_args(char **args, int argcount) 139 { 140 int i; 141 142 if (args == NULL) { 143 return; 144 } 145 146 for (i = 0; i < argcount; i++) { 147 free(args[i]); 148 } 149 150 if (argcount) { 151 free(args); 152 } 153 } 154 155 static char ** 156 push_arg(char *args[], int *argcount, char *arg) 157 { 158 char **tmp; 159 160 if (arg == NULL) { 161 SPDK_ERRLOG("%s: NULL arg supplied\n", __func__); 162 free_args(args, *argcount); 163 return NULL; 164 } 165 166 tmp = realloc(args, sizeof(char *) * (*argcount + 1)); 167 if (tmp == NULL) { 168 free(arg); 169 free_args(args, *argcount); 170 return NULL; 171 } 172 173 tmp[*argcount] = arg; 174 (*argcount)++; 175 176 return tmp; 177 } 178 179 #if defined(__linux__) && defined(__x86_64__) 180 181 /* TODO: Can likely get this value from rlimits in the future */ 182 #define SPDK_IOMMU_VA_REQUIRED_WIDTH 48 183 #define VTD_CAP_MGAW_SHIFT 16 184 #define VTD_CAP_MGAW_MASK (0x3F << VTD_CAP_MGAW_SHIFT) 185 186 static int 187 get_iommu_width(void) 188 { 189 DIR *dir; 190 FILE *file; 191 struct dirent *entry; 192 char mgaw_path[64]; 193 char buf[64]; 194 char *end; 195 long long int val; 196 int width, tmp; 197 198 dir = opendir("/sys/devices/virtual/iommu/"); 199 if (dir == NULL) { 200 return -EINVAL; 201 } 202 203 width = 0; 204 205 while ((entry = readdir(dir)) != NULL) { 206 /* Find directories named "dmar0", "dmar1", etc */ 207 if (strncmp(entry->d_name, "dmar", sizeof("dmar") - 1) != 0) { 208 continue; 209 } 210 211 tmp = snprintf(mgaw_path, sizeof(mgaw_path), "/sys/devices/virtual/iommu/%s/intel-iommu/cap", 212 entry->d_name); 213 if ((unsigned)tmp >= sizeof(mgaw_path)) { 214 continue; 215 } 216 217 file = fopen(mgaw_path, "r"); 218 if (file == NULL) { 219 continue; 220 } 221 222 if (fgets(buf, sizeof(buf), file) == NULL) { 223 fclose(file); 224 continue; 225 } 226 227 val = strtoll(buf, &end, 16); 228 if (val == LLONG_MIN || val == LLONG_MAX) { 229 fclose(file); 230 continue; 231 } 232 233 tmp = ((val & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1; 234 if (width == 0 || tmp < width) { 235 width = tmp; 236 } 237 238 fclose(file); 239 } 240 241 closedir(dir); 242 243 return width; 244 } 245 246 #endif 247 248 static int 249 build_eal_cmdline(const struct spdk_env_opts *opts) 250 { 251 int argcount = 0; 252 char **args; 253 254 args = NULL; 255 256 /* set the program name */ 257 args = push_arg(args, &argcount, _sprintf_alloc("%s", opts->name)); 258 if (args == NULL) { 259 return -1; 260 } 261 262 /* disable shared configuration files when in single process mode. This allows for cleaner shutdown */ 263 if (opts->shm_id < 0) { 264 args = push_arg(args, &argcount, _sprintf_alloc("%s", "--no-shconf")); 265 if (args == NULL) { 266 return -1; 267 } 268 } 269 270 /* 271 * Set the coremask: 272 * 273 * - if it starts with '-', we presume it's literal EAL arguments such 274 * as --lcores. 275 * 276 * - if it starts with '[', we presume it's a core list to use with the 277 * -l option. 278 * 279 * - otherwise, it's a CPU mask of the form "0xff.." as expected by the 280 * -c option. 281 */ 282 if (opts->core_mask[0] == '-') { 283 args = push_arg(args, &argcount, _sprintf_alloc("%s", opts->core_mask)); 284 } else if (opts->core_mask[0] == '[') { 285 char *l_arg = _sprintf_alloc("-l %s", opts->core_mask + 1); 286 287 if (l_arg != NULL) { 288 int len = strlen(l_arg); 289 290 if (l_arg[len - 1] == ']') { 291 l_arg[len - 1] = '\0'; 292 } 293 } 294 args = push_arg(args, &argcount, l_arg); 295 } else { 296 args = push_arg(args, &argcount, _sprintf_alloc("-c %s", opts->core_mask)); 297 } 298 299 if (args == NULL) { 300 return -1; 301 } 302 303 /* set the memory channel number */ 304 if (opts->mem_channel > 0) { 305 args = push_arg(args, &argcount, _sprintf_alloc("-n %d", opts->mem_channel)); 306 if (args == NULL) { 307 return -1; 308 } 309 } 310 311 /* set the memory size */ 312 if (opts->mem_size >= 0) { 313 args = push_arg(args, &argcount, _sprintf_alloc("-m %d", opts->mem_size)); 314 if (args == NULL) { 315 return -1; 316 } 317 } 318 319 /* set the main core */ 320 if (opts->main_core > 0) { 321 args = push_arg(args, &argcount, _sprintf_alloc("%s=%d", 322 DPDK_MAIN_CORE_PARAM, opts->main_core)); 323 if (args == NULL) { 324 return -1; 325 } 326 } 327 328 /* set no pci if enabled */ 329 if (opts->no_pci) { 330 args = push_arg(args, &argcount, _sprintf_alloc("--no-pci")); 331 if (args == NULL) { 332 return -1; 333 } 334 } 335 336 /* create just one hugetlbfs file */ 337 if (opts->hugepage_single_segments) { 338 args = push_arg(args, &argcount, _sprintf_alloc("--single-file-segments")); 339 if (args == NULL) { 340 return -1; 341 } 342 } 343 344 /* unlink hugepages after initialization */ 345 /* Note: Automatically unlink hugepage when shm_id < 0, since it means we're not using 346 * multi-process so we don't need the hugepage links anymore. But we need to make sure 347 * we don't specify --huge-unlink implicitly if --single-file-segments was specified since 348 * DPDK doesn't support that. 349 */ 350 if (opts->unlink_hugepage || 351 (opts->shm_id < 0 && !opts->hugepage_single_segments)) { 352 args = push_arg(args, &argcount, _sprintf_alloc("--huge-unlink")); 353 if (args == NULL) { 354 return -1; 355 } 356 } 357 358 /* use a specific hugetlbfs mount */ 359 if (opts->hugedir) { 360 args = push_arg(args, &argcount, _sprintf_alloc("--huge-dir=%s", opts->hugedir)); 361 if (args == NULL) { 362 return -1; 363 } 364 } 365 366 if (opts->num_pci_addr) { 367 size_t i; 368 char bdf[32]; 369 struct spdk_pci_addr *pci_addr = 370 opts->pci_blocked ? opts->pci_blocked : opts->pci_allowed; 371 372 for (i = 0; i < opts->num_pci_addr; i++) { 373 spdk_pci_addr_fmt(bdf, 32, &pci_addr[i]); 374 args = push_arg(args, &argcount, _sprintf_alloc("%s=%s", 375 (opts->pci_blocked ? DPDK_BLOCK_PARAM : DPDK_ALLOW_PARAM), 376 bdf)); 377 if (args == NULL) { 378 return -1; 379 } 380 } 381 } 382 383 /* Lower default EAL loglevel to RTE_LOG_NOTICE - normal, but significant messages. 384 * This can be overridden by specifying the same option in opts->env_context 385 */ 386 args = push_arg(args, &argcount, strdup("--log-level=lib.eal:6")); 387 if (args == NULL) { 388 return -1; 389 } 390 391 /* Lower default CRYPTO loglevel to RTE_LOG_ERR to avoid a ton of init msgs. 392 * This can be overridden by specifying the same option in opts->env_context 393 */ 394 args = push_arg(args, &argcount, strdup("--log-level=lib.cryptodev:5")); 395 if (args == NULL) { 396 return -1; 397 } 398 399 /* `user1` log type is used by rte_vhost, which prints an INFO log for each received 400 * vhost user message. We don't want that. The same log type is also used by a couple 401 * of other DPDK libs, but none of which we make use right now. If necessary, this can 402 * be overridden via opts->env_context. 403 */ 404 args = push_arg(args, &argcount, strdup("--log-level=user1:6")); 405 if (args == NULL) { 406 return -1; 407 } 408 409 if (opts->env_context) { 410 char *ptr = strdup(opts->env_context); 411 char *tok = strtok(ptr, " \t"); 412 413 /* DPDK expects each argument as a separate string in the argv 414 * array, so we need to tokenize here in case the caller 415 * passed multiple arguments in the env_context string. 416 */ 417 while (tok != NULL) { 418 args = push_arg(args, &argcount, strdup(tok)); 419 tok = strtok(NULL, " \t"); 420 } 421 422 free(ptr); 423 } 424 425 #ifdef __linux__ 426 427 if (opts->iova_mode) { 428 args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=%s", opts->iova_mode)); 429 if (args == NULL) { 430 return -1; 431 } 432 } else { 433 /* When using vfio with enable_unsafe_noiommu_mode=Y, we need iova-mode=pa, 434 * but DPDK guesses it should be iova-mode=va. Add a check and force 435 * iova-mode=pa here. */ 436 if (rte_vfio_noiommu_is_enabled()) { 437 args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); 438 if (args == NULL) { 439 return -1; 440 } 441 } 442 443 #if defined(__x86_64__) 444 /* DPDK by default guesses that it should be using iova-mode=va so that it can 445 * support running as an unprivileged user. However, some systems (especially 446 * virtual machines) don't have an IOMMU capable of handling the full virtual 447 * address space and DPDK doesn't currently catch that. Add a check in SPDK 448 * and force iova-mode=pa here. */ 449 if (get_iommu_width() < SPDK_IOMMU_VA_REQUIRED_WIDTH) { 450 args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); 451 if (args == NULL) { 452 return -1; 453 } 454 } 455 #elif defined(__PPC64__) 456 /* On Linux + PowerPC, DPDK doesn't support VA mode at all. Unfortunately, it doesn't correctly 457 * auto-detect at the moment, so we'll just force it here. */ 458 args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); 459 if (args == NULL) { 460 return -1; 461 } 462 #endif 463 } 464 465 466 /* Set the base virtual address - it must be an address that is not in the 467 * ASAN shadow region, otherwise ASAN-enabled builds will ignore the 468 * mmap hint. 469 * 470 * Ref: https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm 471 */ 472 args = push_arg(args, &argcount, _sprintf_alloc("--base-virtaddr=0x%" PRIx64, opts->base_virtaddr)); 473 if (args == NULL) { 474 return -1; 475 } 476 477 /* --match-allocation prevents DPDK from merging or splitting system memory allocations under the hood. 478 * This is critical for RDMA when attempting to use an rte_mempool based buffer pool. If DPDK merges two 479 * physically or IOVA contiguous memory regions, then when we go to allocate a buffer pool, it can split 480 * the memory for a buffer over two allocations meaning the buffer will be split over a memory region. 481 */ 482 if (!opts->env_context || strstr(opts->env_context, "--legacy-mem") == NULL) { 483 args = push_arg(args, &argcount, _sprintf_alloc("%s", "--match-allocations")); 484 if (args == NULL) { 485 return -1; 486 } 487 } 488 489 if (opts->shm_id < 0) { 490 args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk_pid%d", 491 getpid())); 492 if (args == NULL) { 493 return -1; 494 } 495 } else { 496 args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk%d", 497 opts->shm_id)); 498 if (args == NULL) { 499 return -1; 500 } 501 502 /* set the process type */ 503 args = push_arg(args, &argcount, _sprintf_alloc("--proc-type=auto")); 504 if (args == NULL) { 505 return -1; 506 } 507 } 508 #endif 509 510 g_eal_cmdline = args; 511 g_eal_cmdline_argcount = argcount; 512 return argcount; 513 } 514 515 int 516 spdk_env_dpdk_post_init(bool legacy_mem) 517 { 518 int rc; 519 520 pci_env_init(); 521 522 rc = mem_map_init(legacy_mem); 523 if (rc < 0) { 524 SPDK_ERRLOG("Failed to allocate mem_map\n"); 525 return rc; 526 } 527 528 rc = vtophys_init(); 529 if (rc < 0) { 530 SPDK_ERRLOG("Failed to initialize vtophys\n"); 531 return rc; 532 } 533 534 return 0; 535 } 536 537 void 538 spdk_env_dpdk_post_fini(void) 539 { 540 pci_env_fini(); 541 542 free_args(g_eal_cmdline, g_eal_cmdline_argcount); 543 g_eal_cmdline = NULL; 544 g_eal_cmdline_argcount = 0; 545 } 546 547 int 548 spdk_env_init(const struct spdk_env_opts *opts) 549 { 550 char **dpdk_args = NULL; 551 int i, rc; 552 int orig_optind; 553 bool legacy_mem; 554 555 /* If SPDK env has been initialized before, then only pci env requires 556 * reinitialization. 557 */ 558 if (g_external_init == false) { 559 if (opts != NULL) { 560 fprintf(stderr, "Invalid arguments to reinitialize SPDK env\n"); 561 return -EINVAL; 562 } 563 564 printf("Starting %s / %s reinitialization...\n", SPDK_VERSION_STRING, rte_version()); 565 pci_env_reinit(); 566 567 return 0; 568 } 569 570 if (opts == NULL) { 571 fprintf(stderr, "NULL arguments to initialize DPDK\n"); 572 return -EINVAL; 573 } 574 575 rc = build_eal_cmdline(opts); 576 if (rc < 0) { 577 SPDK_ERRLOG("Invalid arguments to initialize DPDK\n"); 578 return -EINVAL; 579 } 580 581 SPDK_PRINTF("Starting %s / %s initialization...\n", SPDK_VERSION_STRING, rte_version()); 582 SPDK_PRINTF("[ DPDK EAL parameters: "); 583 for (i = 0; i < g_eal_cmdline_argcount; i++) { 584 SPDK_PRINTF("%s ", g_eal_cmdline[i]); 585 } 586 SPDK_PRINTF("]\n"); 587 588 /* DPDK rearranges the array we pass to it, so make a copy 589 * before passing so we can still free the individual strings 590 * correctly. 591 */ 592 dpdk_args = calloc(g_eal_cmdline_argcount, sizeof(char *)); 593 if (dpdk_args == NULL) { 594 SPDK_ERRLOG("Failed to allocate dpdk_args\n"); 595 return -ENOMEM; 596 } 597 memcpy(dpdk_args, g_eal_cmdline, sizeof(char *) * g_eal_cmdline_argcount); 598 599 fflush(stdout); 600 orig_optind = optind; 601 optind = 1; 602 rc = rte_eal_init(g_eal_cmdline_argcount, dpdk_args); 603 optind = orig_optind; 604 605 free(dpdk_args); 606 607 if (rc < 0) { 608 if (rte_errno == EALREADY) { 609 SPDK_ERRLOG("DPDK already initialized\n"); 610 } else { 611 SPDK_ERRLOG("Failed to initialize DPDK\n"); 612 } 613 return -rte_errno; 614 } 615 616 legacy_mem = false; 617 if (opts->env_context && strstr(opts->env_context, "--legacy-mem") != NULL) { 618 legacy_mem = true; 619 } 620 621 rc = spdk_env_dpdk_post_init(legacy_mem); 622 if (rc == 0) { 623 g_external_init = false; 624 } 625 626 return rc; 627 } 628 629 /* We use priority 101 which is the highest priority level available 630 * to applications (the toolchains reserve 1 to 100 for internal usage). 631 * This ensures this destructor runs last, after any other destructors 632 * that might still need the environment up and running. 633 */ 634 __attribute__((destructor(101))) static void 635 dpdk_cleanup(void) 636 { 637 /* Only call rte_eal_cleanup if the SPDK env library called rte_eal_init. */ 638 if (!g_external_init) { 639 rte_eal_cleanup(); 640 } 641 } 642 643 void 644 spdk_env_fini(void) 645 { 646 spdk_env_dpdk_post_fini(); 647 } 648 649 bool 650 spdk_env_dpdk_external_init(void) 651 { 652 return g_external_init; 653 } 654