1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include "spdk/stdinc.h" 35 36 #include "env_internal.h" 37 38 #include "spdk/version.h" 39 #include "spdk/env_dpdk.h" 40 #include "spdk/log.h" 41 42 #include <rte_config.h> 43 #include <rte_eal.h> 44 #include <rte_errno.h> 45 #include <rte_vfio.h> 46 47 #define SPDK_ENV_DPDK_DEFAULT_NAME "spdk" 48 #define SPDK_ENV_DPDK_DEFAULT_SHM_ID -1 49 #define SPDK_ENV_DPDK_DEFAULT_MEM_SIZE -1 50 #define SPDK_ENV_DPDK_DEFAULT_MASTER_CORE -1 51 #define SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL -1 52 #define SPDK_ENV_DPDK_DEFAULT_CORE_MASK "0x1" 53 #define SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR 0x200000000000 54 55 static char **g_eal_cmdline; 56 static int g_eal_cmdline_argcount; 57 static bool g_external_init = true; 58 59 static char * 60 _sprintf_alloc(const char *format, ...) 61 { 62 va_list args; 63 va_list args_copy; 64 char *buf; 65 size_t bufsize; 66 int rc; 67 68 va_start(args, format); 69 70 /* Try with a small buffer first. */ 71 bufsize = 32; 72 73 /* Limit maximum buffer size to something reasonable so we don't loop forever. */ 74 while (bufsize <= 1024 * 1024) { 75 buf = malloc(bufsize); 76 if (buf == NULL) { 77 va_end(args); 78 return NULL; 79 } 80 81 va_copy(args_copy, args); 82 rc = vsnprintf(buf, bufsize, format, args_copy); 83 va_end(args_copy); 84 85 /* 86 * If vsnprintf() returned a count within our current buffer size, we are done. 87 * The count does not include the \0 terminator, so rc == bufsize is not OK. 88 */ 89 if (rc >= 0 && (size_t)rc < bufsize) { 90 va_end(args); 91 return buf; 92 } 93 94 /* 95 * vsnprintf() should return the required space, but some libc versions do not 96 * implement this correctly, so just double the buffer size and try again. 97 * 98 * We don't need the data in buf, so rather than realloc(), use free() and malloc() 99 * again to avoid a copy. 100 */ 101 free(buf); 102 bufsize *= 2; 103 } 104 105 va_end(args); 106 return NULL; 107 } 108 109 void 110 spdk_env_opts_init(struct spdk_env_opts *opts) 111 { 112 if (!opts) { 113 return; 114 } 115 116 memset(opts, 0, sizeof(*opts)); 117 118 opts->name = SPDK_ENV_DPDK_DEFAULT_NAME; 119 opts->core_mask = SPDK_ENV_DPDK_DEFAULT_CORE_MASK; 120 opts->shm_id = SPDK_ENV_DPDK_DEFAULT_SHM_ID; 121 opts->mem_size = SPDK_ENV_DPDK_DEFAULT_MEM_SIZE; 122 opts->master_core = SPDK_ENV_DPDK_DEFAULT_MASTER_CORE; 123 opts->mem_channel = SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL; 124 opts->base_virtaddr = SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR; 125 } 126 127 static void 128 free_args(char **args, int argcount) 129 { 130 int i; 131 132 if (args == NULL) { 133 return; 134 } 135 136 for (i = 0; i < argcount; i++) { 137 free(args[i]); 138 } 139 140 if (argcount) { 141 free(args); 142 } 143 } 144 145 static char ** 146 push_arg(char *args[], int *argcount, char *arg) 147 { 148 char **tmp; 149 150 if (arg == NULL) { 151 SPDK_ERRLOG("%s: NULL arg supplied\n", __func__); 152 free_args(args, *argcount); 153 return NULL; 154 } 155 156 tmp = realloc(args, sizeof(char *) * (*argcount + 1)); 157 if (tmp == NULL) { 158 free(arg); 159 free_args(args, *argcount); 160 return NULL; 161 } 162 163 tmp[*argcount] = arg; 164 (*argcount)++; 165 166 return tmp; 167 } 168 169 #if defined(__linux__) && defined(__x86_64__) 170 171 /* TODO: Can likely get this value from rlimits in the future */ 172 #define SPDK_IOMMU_VA_REQUIRED_WIDTH 48 173 #define VTD_CAP_MGAW_SHIFT 16 174 #define VTD_CAP_MGAW_MASK (0x3F << VTD_CAP_MGAW_SHIFT) 175 176 static int 177 get_iommu_width(void) 178 { 179 DIR *dir; 180 FILE *file; 181 struct dirent *entry; 182 char mgaw_path[64]; 183 char buf[64]; 184 char *end; 185 long long int val; 186 int width, tmp; 187 188 dir = opendir("/sys/devices/virtual/iommu/"); 189 if (dir == NULL) { 190 return -EINVAL; 191 } 192 193 width = 0; 194 195 while ((entry = readdir(dir)) != NULL) { 196 /* Find directories named "dmar0", "dmar1", etc */ 197 if (strncmp(entry->d_name, "dmar", sizeof("dmar") - 1) != 0) { 198 continue; 199 } 200 201 tmp = snprintf(mgaw_path, sizeof(mgaw_path), "/sys/devices/virtual/iommu/%s/intel-iommu/cap", 202 entry->d_name); 203 if ((unsigned)tmp >= sizeof(mgaw_path)) { 204 continue; 205 } 206 207 file = fopen(mgaw_path, "r"); 208 if (file == NULL) { 209 continue; 210 } 211 212 if (fgets(buf, sizeof(buf), file) == NULL) { 213 fclose(file); 214 continue; 215 } 216 217 val = strtoll(buf, &end, 16); 218 if (val == LLONG_MIN || val == LLONG_MAX) { 219 fclose(file); 220 continue; 221 } 222 223 tmp = ((val & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1; 224 if (width == 0 || tmp < width) { 225 width = tmp; 226 } 227 228 fclose(file); 229 } 230 231 closedir(dir); 232 233 return width; 234 } 235 236 #endif 237 238 static int 239 build_eal_cmdline(const struct spdk_env_opts *opts) 240 { 241 int argcount = 0; 242 char **args; 243 244 args = NULL; 245 246 /* set the program name */ 247 args = push_arg(args, &argcount, _sprintf_alloc("%s", opts->name)); 248 if (args == NULL) { 249 return -1; 250 } 251 252 /* disable shared configuration files when in single process mode. This allows for cleaner shutdown */ 253 if (opts->shm_id < 0) { 254 args = push_arg(args, &argcount, _sprintf_alloc("%s", "--no-shconf")); 255 if (args == NULL) { 256 return -1; 257 } 258 } 259 260 /* set the coremask */ 261 /* NOTE: If coremask starts with '[' and ends with ']' it is a core list 262 */ 263 if (opts->core_mask[0] == '[') { 264 char *l_arg = _sprintf_alloc("-l %s", opts->core_mask + 1); 265 266 if (l_arg != NULL) { 267 int len = strlen(l_arg); 268 269 if (l_arg[len - 1] == ']') { 270 l_arg[len - 1] = '\0'; 271 } 272 } 273 args = push_arg(args, &argcount, l_arg); 274 } else { 275 args = push_arg(args, &argcount, _sprintf_alloc("-c %s", opts->core_mask)); 276 } 277 278 if (args == NULL) { 279 return -1; 280 } 281 282 /* set the memory channel number */ 283 if (opts->mem_channel > 0) { 284 args = push_arg(args, &argcount, _sprintf_alloc("-n %d", opts->mem_channel)); 285 if (args == NULL) { 286 return -1; 287 } 288 } 289 290 /* set the memory size */ 291 if (opts->mem_size >= 0) { 292 args = push_arg(args, &argcount, _sprintf_alloc("-m %d", opts->mem_size)); 293 if (args == NULL) { 294 return -1; 295 } 296 } 297 298 /* set the master core */ 299 if (opts->master_core > 0) { 300 args = push_arg(args, &argcount, _sprintf_alloc("--master-lcore=%d", 301 opts->master_core)); 302 if (args == NULL) { 303 return -1; 304 } 305 } 306 307 /* set no pci if enabled */ 308 if (opts->no_pci) { 309 args = push_arg(args, &argcount, _sprintf_alloc("--no-pci")); 310 if (args == NULL) { 311 return -1; 312 } 313 } 314 315 /* create just one hugetlbfs file */ 316 if (opts->hugepage_single_segments) { 317 args = push_arg(args, &argcount, _sprintf_alloc("--single-file-segments")); 318 if (args == NULL) { 319 return -1; 320 } 321 } 322 323 /* unlink hugepages after initialization */ 324 if (opts->unlink_hugepage) { 325 args = push_arg(args, &argcount, _sprintf_alloc("--huge-unlink")); 326 if (args == NULL) { 327 return -1; 328 } 329 } 330 331 /* use a specific hugetlbfs mount */ 332 if (opts->hugedir) { 333 args = push_arg(args, &argcount, _sprintf_alloc("--huge-dir=%s", opts->hugedir)); 334 if (args == NULL) { 335 return -1; 336 } 337 } 338 339 if (opts->num_pci_addr) { 340 size_t i; 341 char bdf[32]; 342 struct spdk_pci_addr *pci_addr = 343 opts->pci_blacklist ? opts->pci_blacklist : opts->pci_whitelist; 344 345 for (i = 0; i < opts->num_pci_addr; i++) { 346 spdk_pci_addr_fmt(bdf, 32, &pci_addr[i]); 347 args = push_arg(args, &argcount, _sprintf_alloc("%s=%s", 348 (opts->pci_blacklist ? "--pci-blacklist" : "--pci-whitelist"), 349 bdf)); 350 if (args == NULL) { 351 return -1; 352 } 353 } 354 } 355 356 /* Lower default EAL loglevel to RTE_LOG_NOTICE - normal, but significant messages. 357 * This can be overridden by specifying the same option in opts->env_context 358 */ 359 args = push_arg(args, &argcount, strdup("--log-level=lib.eal:6")); 360 if (args == NULL) { 361 return -1; 362 } 363 364 /* Lower default CRYPTO loglevel to RTE_LOG_ERR to avoid a ton of init msgs. 365 * This can be overridden by specifying the same option in opts->env_context 366 */ 367 args = push_arg(args, &argcount, strdup("--log-level=lib.cryptodev:5")); 368 if (args == NULL) { 369 return -1; 370 } 371 372 /* `user1` log type is used by rte_vhost, which prints an INFO log for each received 373 * vhost user message. We don't want that. The same log type is also used by a couple 374 * of other DPDK libs, but none of which we make use right now. If necessary, this can 375 * be overridden via opts->env_context. 376 */ 377 args = push_arg(args, &argcount, strdup("--log-level=user1:6")); 378 if (args == NULL) { 379 return -1; 380 } 381 382 if (opts->env_context) { 383 args = push_arg(args, &argcount, strdup(opts->env_context)); 384 if (args == NULL) { 385 return -1; 386 } 387 } 388 389 #ifdef __linux__ 390 391 if (opts->iova_mode) { 392 args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=%s", opts->iova_mode)); 393 if (args == NULL) { 394 return -1; 395 } 396 } else { 397 /* When using vfio with enable_unsafe_noiommu_mode=Y, we need iova-mode=pa, 398 * but DPDK guesses it should be iova-mode=va. Add a check and force 399 * iova-mode=pa here. */ 400 if (rte_vfio_noiommu_is_enabled()) { 401 args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); 402 if (args == NULL) { 403 return -1; 404 } 405 } 406 407 #if defined(__x86_64__) 408 /* DPDK by default guesses that it should be using iova-mode=va so that it can 409 * support running as an unprivileged user. However, some systems (especially 410 * virtual machines) don't have an IOMMU capable of handling the full virtual 411 * address space and DPDK doesn't currently catch that. Add a check in SPDK 412 * and force iova-mode=pa here. */ 413 if (get_iommu_width() < SPDK_IOMMU_VA_REQUIRED_WIDTH) { 414 args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); 415 if (args == NULL) { 416 return -1; 417 } 418 } 419 #elif defined(__PPC64__) 420 /* On Linux + PowerPC, DPDK doesn't support VA mode at all. Unfortunately, it doesn't correctly 421 * auto-detect at the moment, so we'll just force it here. */ 422 args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); 423 if (args == NULL) { 424 return -1; 425 } 426 #endif 427 } 428 429 430 /* Set the base virtual address - it must be an address that is not in the 431 * ASAN shadow region, otherwise ASAN-enabled builds will ignore the 432 * mmap hint. 433 * 434 * Ref: https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm 435 */ 436 args = push_arg(args, &argcount, _sprintf_alloc("--base-virtaddr=0x%" PRIx64, opts->base_virtaddr)); 437 if (args == NULL) { 438 return -1; 439 } 440 441 /* --match-allocation prevents DPDK from merging or splitting system memory allocations under the hood. 442 * This is critical for RDMA when attempting to use an rte_mempool based buffer pool. If DPDK merges two 443 * physically or IOVA contiguous memory regions, then when we go to allocate a buffer pool, it can split 444 * the memory for a buffer over two allocations meaning the buffer will be split over a memory region. 445 */ 446 #if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0) 447 if (!opts->env_context || strstr(opts->env_context, "--legacy-mem") == NULL) { 448 args = push_arg(args, &argcount, _sprintf_alloc("%s", "--match-allocations")); 449 if (args == NULL) { 450 return -1; 451 } 452 } 453 #endif 454 455 if (opts->shm_id < 0) { 456 args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk_pid%d", 457 getpid())); 458 if (args == NULL) { 459 return -1; 460 } 461 } else { 462 args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk%d", 463 opts->shm_id)); 464 if (args == NULL) { 465 return -1; 466 } 467 468 /* set the process type */ 469 args = push_arg(args, &argcount, _sprintf_alloc("--proc-type=auto")); 470 if (args == NULL) { 471 return -1; 472 } 473 } 474 #endif 475 476 g_eal_cmdline = args; 477 g_eal_cmdline_argcount = argcount; 478 return argcount; 479 } 480 481 int 482 spdk_env_dpdk_post_init(bool legacy_mem) 483 { 484 int rc; 485 486 pci_env_init(); 487 488 rc = mem_map_init(legacy_mem); 489 if (rc < 0) { 490 SPDK_ERRLOG("Failed to allocate mem_map\n"); 491 return rc; 492 } 493 494 rc = vtophys_init(); 495 if (rc < 0) { 496 SPDK_ERRLOG("Failed to initialize vtophys\n"); 497 return rc; 498 } 499 500 return 0; 501 } 502 503 void 504 spdk_env_dpdk_post_fini(void) 505 { 506 pci_env_fini(); 507 508 free_args(g_eal_cmdline, g_eal_cmdline_argcount); 509 g_eal_cmdline = NULL; 510 g_eal_cmdline_argcount = 0; 511 } 512 513 int 514 spdk_env_init(const struct spdk_env_opts *opts) 515 { 516 char **dpdk_args = NULL; 517 int i, rc; 518 int orig_optind; 519 bool legacy_mem; 520 521 /* If SPDK env has been initialized before, then only pci env requires 522 * reinitialization. 523 */ 524 if (g_external_init == false) { 525 if (opts != NULL) { 526 fprintf(stderr, "Invalid arguments to reinitialize SPDK env\n"); 527 return -EINVAL; 528 } 529 530 printf("Starting %s / %s reinitialization...\n", SPDK_VERSION_STRING, rte_version()); 531 pci_env_reinit(); 532 533 return 0; 534 } 535 536 if (opts == NULL) { 537 fprintf(stderr, "NULL arguments to initialize DPDK\n"); 538 return -EINVAL; 539 } 540 541 rc = build_eal_cmdline(opts); 542 if (rc < 0) { 543 SPDK_ERRLOG("Invalid arguments to initialize DPDK\n"); 544 return -EINVAL; 545 } 546 547 SPDK_PRINTF("Starting %s / %s initialization...\n", SPDK_VERSION_STRING, rte_version()); 548 SPDK_PRINTF("[ DPDK EAL parameters: "); 549 for (i = 0; i < g_eal_cmdline_argcount; i++) { 550 SPDK_PRINTF("%s ", g_eal_cmdline[i]); 551 } 552 SPDK_PRINTF("]\n"); 553 554 /* DPDK rearranges the array we pass to it, so make a copy 555 * before passing so we can still free the individual strings 556 * correctly. 557 */ 558 dpdk_args = calloc(g_eal_cmdline_argcount, sizeof(char *)); 559 if (dpdk_args == NULL) { 560 SPDK_ERRLOG("Failed to allocate dpdk_args\n"); 561 return -ENOMEM; 562 } 563 memcpy(dpdk_args, g_eal_cmdline, sizeof(char *) * g_eal_cmdline_argcount); 564 565 fflush(stdout); 566 orig_optind = optind; 567 optind = 1; 568 rc = rte_eal_init(g_eal_cmdline_argcount, dpdk_args); 569 optind = orig_optind; 570 571 free(dpdk_args); 572 573 if (rc < 0) { 574 if (rte_errno == EALREADY) { 575 SPDK_ERRLOG("DPDK already initialized\n"); 576 } else { 577 SPDK_ERRLOG("Failed to initialize DPDK\n"); 578 } 579 return -rte_errno; 580 } 581 582 legacy_mem = false; 583 if (opts->env_context && strstr(opts->env_context, "--legacy-mem") != NULL) { 584 legacy_mem = true; 585 } 586 587 rc = spdk_env_dpdk_post_init(legacy_mem); 588 if (rc == 0) { 589 g_external_init = false; 590 } 591 592 return rc; 593 } 594 595 void 596 spdk_env_fini(void) 597 { 598 spdk_env_dpdk_post_fini(); 599 } 600 601 bool 602 spdk_env_dpdk_external_init(void) 603 { 604 return g_external_init; 605 } 606