1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include "spdk/stdinc.h" 35 36 #include "env_internal.h" 37 38 #include "spdk/version.h" 39 #include "spdk/env_dpdk.h" 40 41 #include <rte_config.h> 42 #include <rte_eal.h> 43 #include <rte_errno.h> 44 #include <rte_vfio.h> 45 46 #define SPDK_ENV_DPDK_DEFAULT_NAME "spdk" 47 #define SPDK_ENV_DPDK_DEFAULT_SHM_ID -1 48 #define SPDK_ENV_DPDK_DEFAULT_MEM_SIZE -1 49 #define SPDK_ENV_DPDK_DEFAULT_MASTER_CORE -1 50 #define SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL -1 51 #define SPDK_ENV_DPDK_DEFAULT_CORE_MASK "0x1" 52 #define SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR 0x200000000000 53 54 static char **g_eal_cmdline; 55 static int g_eal_cmdline_argcount; 56 static bool g_external_init = true; 57 58 static char * 59 _sprintf_alloc(const char *format, ...) 60 { 61 va_list args; 62 va_list args_copy; 63 char *buf; 64 size_t bufsize; 65 int rc; 66 67 va_start(args, format); 68 69 /* Try with a small buffer first. */ 70 bufsize = 32; 71 72 /* Limit maximum buffer size to something reasonable so we don't loop forever. */ 73 while (bufsize <= 1024 * 1024) { 74 buf = malloc(bufsize); 75 if (buf == NULL) { 76 va_end(args); 77 return NULL; 78 } 79 80 va_copy(args_copy, args); 81 rc = vsnprintf(buf, bufsize, format, args_copy); 82 va_end(args_copy); 83 84 /* 85 * If vsnprintf() returned a count within our current buffer size, we are done. 86 * The count does not include the \0 terminator, so rc == bufsize is not OK. 87 */ 88 if (rc >= 0 && (size_t)rc < bufsize) { 89 va_end(args); 90 return buf; 91 } 92 93 /* 94 * vsnprintf() should return the required space, but some libc versions do not 95 * implement this correctly, so just double the buffer size and try again. 96 * 97 * We don't need the data in buf, so rather than realloc(), use free() and malloc() 98 * again to avoid a copy. 99 */ 100 free(buf); 101 bufsize *= 2; 102 } 103 104 va_end(args); 105 return NULL; 106 } 107 108 static void 109 env_unlink_shared_files(void) 110 { 111 /* Starting with DPDK 18.05, there are more files with unpredictable paths 112 * and filenames. The --no-shconf option prevents from creating them, but 113 * only for DPDK 18.08+. For DPDK 18.05 we just leave them be. 114 */ 115 #if RTE_VERSION < RTE_VERSION_NUM(18, 05, 0, 0) 116 char buffer[PATH_MAX]; 117 118 snprintf(buffer, PATH_MAX, "/var/run/.spdk_pid%d_hugepage_info", getpid()); 119 if (unlink(buffer)) { 120 fprintf(stderr, "Unable to unlink shared memory file: %s. Error code: %d\n", buffer, errno); 121 } 122 #endif 123 } 124 125 void 126 spdk_env_opts_init(struct spdk_env_opts *opts) 127 { 128 if (!opts) { 129 return; 130 } 131 132 memset(opts, 0, sizeof(*opts)); 133 134 opts->name = SPDK_ENV_DPDK_DEFAULT_NAME; 135 opts->core_mask = SPDK_ENV_DPDK_DEFAULT_CORE_MASK; 136 opts->shm_id = SPDK_ENV_DPDK_DEFAULT_SHM_ID; 137 opts->mem_size = SPDK_ENV_DPDK_DEFAULT_MEM_SIZE; 138 opts->master_core = SPDK_ENV_DPDK_DEFAULT_MASTER_CORE; 139 opts->mem_channel = SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL; 140 opts->base_virtaddr = SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR; 141 } 142 143 static void 144 free_args(char **args, int argcount) 145 { 146 int i; 147 148 for (i = 0; i < argcount; i++) { 149 free(args[i]); 150 } 151 152 if (argcount) { 153 free(args); 154 } 155 } 156 157 static char ** 158 push_arg(char *args[], int *argcount, char *arg) 159 { 160 char **tmp; 161 162 if (arg == NULL) { 163 fprintf(stderr, "%s: NULL arg supplied\n", __func__); 164 free_args(args, *argcount); 165 return NULL; 166 } 167 168 tmp = realloc(args, sizeof(char *) * (*argcount + 1)); 169 if (tmp == NULL) { 170 free(arg); 171 free_args(args, *argcount); 172 return NULL; 173 } 174 175 tmp[*argcount] = arg; 176 (*argcount)++; 177 178 return tmp; 179 } 180 181 #if defined(__linux__) && defined(__x86_64__) 182 183 /* TODO: Can likely get this value from rlimits in the future */ 184 #define SPDK_IOMMU_VA_REQUIRED_WIDTH 48 185 #define VTD_CAP_MGAW_SHIFT 16 186 #define VTD_CAP_MGAW_MASK (0x3F << VTD_CAP_MGAW_SHIFT) 187 188 static int 189 get_iommu_width(void) 190 { 191 DIR *dir; 192 FILE *file; 193 struct dirent *entry; 194 char mgaw_path[64]; 195 char buf[64]; 196 char *end; 197 long long int val; 198 int width, tmp; 199 200 dir = opendir("/sys/devices/virtual/iommu/"); 201 if (dir == NULL) { 202 return -EINVAL; 203 } 204 205 width = 0; 206 207 while ((entry = readdir(dir)) != NULL) { 208 /* Find directories named "dmar0", "dmar1", etc */ 209 if (strncmp(entry->d_name, "dmar", sizeof("dmar") - 1) != 0) { 210 continue; 211 } 212 213 tmp = snprintf(mgaw_path, sizeof(mgaw_path), "/sys/devices/virtual/iommu/%s/intel-iommu/cap", 214 entry->d_name); 215 if ((unsigned)tmp >= sizeof(mgaw_path)) { 216 continue; 217 } 218 219 file = fopen(mgaw_path, "r"); 220 if (file == NULL) { 221 continue; 222 } 223 224 if (fgets(buf, sizeof(buf), file) == NULL) { 225 fclose(file); 226 continue; 227 } 228 229 val = strtoll(buf, &end, 16); 230 if (val == LLONG_MIN || val == LLONG_MAX) { 231 fclose(file); 232 continue; 233 } 234 235 tmp = ((val & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1; 236 if (width == 0 || tmp < width) { 237 width = tmp; 238 } 239 240 fclose(file); 241 } 242 243 closedir(dir); 244 245 return width; 246 } 247 248 #endif 249 250 static int 251 build_eal_cmdline(const struct spdk_env_opts *opts) 252 { 253 int argcount = 0; 254 char **args; 255 256 args = NULL; 257 258 /* set the program name */ 259 args = push_arg(args, &argcount, _sprintf_alloc("%s", opts->name)); 260 if (args == NULL) { 261 return -1; 262 } 263 264 /* disable shared configuration files when in single process mode. This allows for cleaner shutdown */ 265 if (opts->shm_id < 0) { 266 args = push_arg(args, &argcount, _sprintf_alloc("%s", "--no-shconf")); 267 if (args == NULL) { 268 return -1; 269 } 270 } 271 272 /* set the coremask */ 273 /* NOTE: If coremask starts with '[' and ends with ']' it is a core list 274 */ 275 if (opts->core_mask[0] == '[') { 276 char *l_arg = _sprintf_alloc("-l %s", opts->core_mask + 1); 277 278 if (l_arg != NULL) { 279 int len = strlen(l_arg); 280 281 if (l_arg[len - 1] == ']') { 282 l_arg[len - 1] = '\0'; 283 } 284 } 285 args = push_arg(args, &argcount, l_arg); 286 } else { 287 args = push_arg(args, &argcount, _sprintf_alloc("-c %s", opts->core_mask)); 288 } 289 290 if (args == NULL) { 291 return -1; 292 } 293 294 /* set the memory channel number */ 295 if (opts->mem_channel > 0) { 296 args = push_arg(args, &argcount, _sprintf_alloc("-n %d", opts->mem_channel)); 297 if (args == NULL) { 298 return -1; 299 } 300 } 301 302 /* set the memory size */ 303 if (opts->mem_size >= 0) { 304 args = push_arg(args, &argcount, _sprintf_alloc("-m %d", opts->mem_size)); 305 if (args == NULL) { 306 return -1; 307 } 308 } 309 310 /* set the master core */ 311 if (opts->master_core > 0) { 312 args = push_arg(args, &argcount, _sprintf_alloc("--master-lcore=%d", 313 opts->master_core)); 314 if (args == NULL) { 315 return -1; 316 } 317 } 318 319 /* set no pci if enabled */ 320 if (opts->no_pci) { 321 args = push_arg(args, &argcount, _sprintf_alloc("--no-pci")); 322 if (args == NULL) { 323 return -1; 324 } 325 } 326 327 /* create just one hugetlbfs file */ 328 if (opts->hugepage_single_segments) { 329 args = push_arg(args, &argcount, _sprintf_alloc("--single-file-segments")); 330 if (args == NULL) { 331 return -1; 332 } 333 } 334 335 /* unlink hugepages after initialization */ 336 if (opts->unlink_hugepage) { 337 args = push_arg(args, &argcount, _sprintf_alloc("--huge-unlink")); 338 if (args == NULL) { 339 return -1; 340 } 341 } 342 343 /* use a specific hugetlbfs mount */ 344 if (opts->hugedir) { 345 args = push_arg(args, &argcount, _sprintf_alloc("--huge-dir=%s", opts->hugedir)); 346 if (args == NULL) { 347 return -1; 348 } 349 } 350 351 #if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0) && RTE_VERSION < RTE_VERSION_NUM(18, 5, 1, 0) 352 /* Dynamic memory management is buggy in DPDK 18.05.0. Don't use it. */ 353 if (!opts->env_context || strstr(opts->env_context, "--legacy-mem") == NULL) { 354 args = push_arg(args, &argcount, _sprintf_alloc("--legacy-mem")); 355 if (args == NULL) { 356 return -1; 357 } 358 } 359 #endif 360 361 if (opts->num_pci_addr) { 362 size_t i; 363 char bdf[32]; 364 struct spdk_pci_addr *pci_addr = 365 opts->pci_blacklist ? opts->pci_blacklist : opts->pci_whitelist; 366 367 for (i = 0; i < opts->num_pci_addr; i++) { 368 spdk_pci_addr_fmt(bdf, 32, &pci_addr[i]); 369 args = push_arg(args, &argcount, _sprintf_alloc("%s=%s", 370 (opts->pci_blacklist ? "--pci-blacklist" : "--pci-whitelist"), 371 bdf)); 372 if (args == NULL) { 373 return -1; 374 } 375 } 376 } 377 378 /* The following log-level options are not understood by older DPDKs */ 379 #if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0) 380 /* Lower default EAL loglevel to RTE_LOG_NOTICE - normal, but significant messages. 381 * This can be overridden by specifying the same option in opts->env_context 382 */ 383 args = push_arg(args, &argcount, strdup("--log-level=lib.eal:6")); 384 if (args == NULL) { 385 return -1; 386 } 387 388 /* Lower default CRYPTO loglevel to RTE_LOG_ERR to avoid a ton of init msgs. 389 * This can be overridden by specifying the same option in opts->env_context 390 */ 391 args = push_arg(args, &argcount, strdup("--log-level=lib.cryptodev:5")); 392 if (args == NULL) { 393 return -1; 394 } 395 396 /* `user1` log type is used by rte_vhost, which prints an INFO log for each received 397 * vhost user message. We don't want that. The same log type is also used by a couple 398 * of other DPDK libs, but none of which we make use right now. If necessary, this can 399 * be overridden via opts->env_context. 400 */ 401 args = push_arg(args, &argcount, strdup("--log-level=user1:6")); 402 if (args == NULL) { 403 return -1; 404 } 405 #endif 406 407 if (opts->env_context) { 408 args = push_arg(args, &argcount, strdup(opts->env_context)); 409 if (args == NULL) { 410 return -1; 411 } 412 } 413 414 #ifdef __linux__ 415 416 if (opts->iova_mode) { 417 args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=%s", opts->iova_mode)); 418 if (args == NULL) { 419 return -1; 420 } 421 } else { 422 /* When using vfio with enable_unsafe_noiommu_mode=Y, we need iova-mode=pa, 423 * but DPDK guesses it should be iova-mode=va. Add a check and force 424 * iova-mode=pa here. */ 425 if (rte_vfio_noiommu_is_enabled()) { 426 args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); 427 if (args == NULL) { 428 return -1; 429 } 430 } 431 432 #if defined(__x86_64__) 433 /* DPDK by default guesses that it should be using iova-mode=va so that it can 434 * support running as an unprivileged user. However, some systems (especially 435 * virtual machines) don't have an IOMMU capable of handling the full virtual 436 * address space and DPDK doesn't currently catch that. Add a check in SPDK 437 * and force iova-mode=pa here. */ 438 if (get_iommu_width() < SPDK_IOMMU_VA_REQUIRED_WIDTH) { 439 args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); 440 if (args == NULL) { 441 return -1; 442 } 443 } 444 #elif defined(__PPC64__) 445 /* On Linux + PowerPC, DPDK doesn't support VA mode at all. Unfortunately, it doesn't correctly 446 * auto-detect at the moment, so we'll just force it here. */ 447 args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); 448 if (args == NULL) { 449 return -1; 450 } 451 #endif 452 } 453 454 455 /* Set the base virtual address - it must be an address that is not in the 456 * ASAN shadow region, otherwise ASAN-enabled builds will ignore the 457 * mmap hint. 458 * 459 * Ref: https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm 460 */ 461 args = push_arg(args, &argcount, _sprintf_alloc("--base-virtaddr=0x%" PRIx64, opts->base_virtaddr)); 462 if (args == NULL) { 463 return -1; 464 } 465 466 /* --match-allocation prevents DPDK from merging or splitting system memory allocations under the hood. 467 * This is critical for RDMA when attempting to use an rte_mempool based buffer pool. If DPDK merges two 468 * physically or IOVA contiguous memory regions, then when we go to allocate a buffer pool, it can split 469 * the memory for a buffer over two allocations meaning the buffer will be split over a memory region. 470 */ 471 #if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0) 472 if (!opts->env_context || strstr(opts->env_context, "--legacy-mem") == NULL) { 473 args = push_arg(args, &argcount, _sprintf_alloc("%s", "--match-allocations")); 474 if (args == NULL) { 475 return -1; 476 } 477 } 478 #endif 479 480 if (opts->shm_id < 0) { 481 args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk_pid%d", 482 getpid())); 483 if (args == NULL) { 484 return -1; 485 } 486 } else { 487 args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk%d", 488 opts->shm_id)); 489 if (args == NULL) { 490 return -1; 491 } 492 493 /* set the process type */ 494 args = push_arg(args, &argcount, _sprintf_alloc("--proc-type=auto")); 495 if (args == NULL) { 496 return -1; 497 } 498 } 499 #endif 500 501 g_eal_cmdline = args; 502 g_eal_cmdline_argcount = argcount; 503 return argcount; 504 } 505 506 int 507 spdk_env_dpdk_post_init(bool legacy_mem) 508 { 509 int rc; 510 511 pci_env_init(); 512 513 rc = mem_map_init(legacy_mem); 514 if (rc < 0) { 515 fprintf(stderr, "Failed to allocate mem_map\n"); 516 return rc; 517 } 518 519 rc = vtophys_init(); 520 if (rc < 0) { 521 fprintf(stderr, "Failed to initialize vtophys\n"); 522 return rc; 523 } 524 525 return 0; 526 } 527 528 void 529 spdk_env_dpdk_post_fini(void) 530 { 531 pci_env_fini(); 532 533 free_args(g_eal_cmdline, g_eal_cmdline_argcount); 534 } 535 536 int 537 spdk_env_init(const struct spdk_env_opts *opts) 538 { 539 char **dpdk_args = NULL; 540 int i, rc; 541 int orig_optind; 542 bool legacy_mem; 543 544 g_external_init = false; 545 546 rc = build_eal_cmdline(opts); 547 if (rc < 0) { 548 fprintf(stderr, "Invalid arguments to initialize DPDK\n"); 549 return -EINVAL; 550 } 551 552 printf("Starting %s / %s initialization...\n", SPDK_VERSION_STRING, rte_version()); 553 printf("[ DPDK EAL parameters: "); 554 for (i = 0; i < g_eal_cmdline_argcount; i++) { 555 printf("%s ", g_eal_cmdline[i]); 556 } 557 printf("]\n"); 558 559 /* DPDK rearranges the array we pass to it, so make a copy 560 * before passing so we can still free the individual strings 561 * correctly. 562 */ 563 dpdk_args = calloc(g_eal_cmdline_argcount, sizeof(char *)); 564 if (dpdk_args == NULL) { 565 fprintf(stderr, "Failed to allocate dpdk_args\n"); 566 return -ENOMEM; 567 } 568 memcpy(dpdk_args, g_eal_cmdline, sizeof(char *) * g_eal_cmdline_argcount); 569 570 fflush(stdout); 571 orig_optind = optind; 572 optind = 1; 573 rc = rte_eal_init(g_eal_cmdline_argcount, dpdk_args); 574 optind = orig_optind; 575 576 free(dpdk_args); 577 578 if (rc < 0) { 579 if (rte_errno == EALREADY) { 580 fprintf(stderr, "DPDK already initialized\n"); 581 } else { 582 fprintf(stderr, "Failed to initialize DPDK\n"); 583 } 584 return -rte_errno; 585 } 586 587 if (opts->shm_id < 0 && !opts->hugepage_single_segments) { 588 /* 589 * Unlink hugepage and config info files after init. This will ensure they get 590 * deleted on app exit, even if the app crashes and does not exit normally. 591 * Only do this when not in multi-process mode, since for multi-process other 592 * apps will need to open these files. These files are not created for 593 * "single file segments". 594 */ 595 env_unlink_shared_files(); 596 } 597 598 legacy_mem = false; 599 if (opts->env_context && strstr(opts->env_context, "--legacy-mem") != NULL) { 600 legacy_mem = true; 601 } 602 603 return spdk_env_dpdk_post_init(legacy_mem); 604 } 605 606 void 607 spdk_env_fini(void) 608 { 609 spdk_env_dpdk_post_fini(); 610 } 611 612 bool 613 spdk_env_dpdk_external_init(void) 614 { 615 return g_external_init; 616 } 617