1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include "spdk/stdinc.h" 35 36 #include "env_internal.h" 37 38 #include "spdk/version.h" 39 #include "spdk/env_dpdk.h" 40 41 #include <rte_config.h> 42 #include <rte_eal.h> 43 #include <rte_errno.h> 44 #include <rte_vfio.h> 45 46 #define SPDK_ENV_DPDK_DEFAULT_NAME "spdk" 47 #define SPDK_ENV_DPDK_DEFAULT_SHM_ID -1 48 #define SPDK_ENV_DPDK_DEFAULT_MEM_SIZE -1 49 #define SPDK_ENV_DPDK_DEFAULT_MASTER_CORE -1 50 #define SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL -1 51 #define SPDK_ENV_DPDK_DEFAULT_CORE_MASK "0x1" 52 #define SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR 0x200000000000 53 54 static char **g_eal_cmdline; 55 static int g_eal_cmdline_argcount; 56 static bool g_external_init = true; 57 58 static char * 59 _sprintf_alloc(const char *format, ...) 60 { 61 va_list args; 62 va_list args_copy; 63 char *buf; 64 size_t bufsize; 65 int rc; 66 67 va_start(args, format); 68 69 /* Try with a small buffer first. */ 70 bufsize = 32; 71 72 /* Limit maximum buffer size to something reasonable so we don't loop forever. */ 73 while (bufsize <= 1024 * 1024) { 74 buf = malloc(bufsize); 75 if (buf == NULL) { 76 va_end(args); 77 return NULL; 78 } 79 80 va_copy(args_copy, args); 81 rc = vsnprintf(buf, bufsize, format, args_copy); 82 va_end(args_copy); 83 84 /* 85 * If vsnprintf() returned a count within our current buffer size, we are done. 86 * The count does not include the \0 terminator, so rc == bufsize is not OK. 87 */ 88 if (rc >= 0 && (size_t)rc < bufsize) { 89 va_end(args); 90 return buf; 91 } 92 93 /* 94 * vsnprintf() should return the required space, but some libc versions do not 95 * implement this correctly, so just double the buffer size and try again. 96 * 97 * We don't need the data in buf, so rather than realloc(), use free() and malloc() 98 * again to avoid a copy. 99 */ 100 free(buf); 101 bufsize *= 2; 102 } 103 104 va_end(args); 105 return NULL; 106 } 107 108 void 109 spdk_env_opts_init(struct spdk_env_opts *opts) 110 { 111 if (!opts) { 112 return; 113 } 114 115 memset(opts, 0, sizeof(*opts)); 116 117 opts->name = SPDK_ENV_DPDK_DEFAULT_NAME; 118 opts->core_mask = SPDK_ENV_DPDK_DEFAULT_CORE_MASK; 119 opts->shm_id = SPDK_ENV_DPDK_DEFAULT_SHM_ID; 120 opts->mem_size = SPDK_ENV_DPDK_DEFAULT_MEM_SIZE; 121 opts->master_core = SPDK_ENV_DPDK_DEFAULT_MASTER_CORE; 122 opts->mem_channel = SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL; 123 opts->base_virtaddr = SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR; 124 } 125 126 static void 127 free_args(char **args, int argcount) 128 { 129 int i; 130 131 if (args == NULL) { 132 return; 133 } 134 135 for (i = 0; i < argcount; i++) { 136 free(args[i]); 137 } 138 139 if (argcount) { 140 free(args); 141 } 142 } 143 144 static char ** 145 push_arg(char *args[], int *argcount, char *arg) 146 { 147 char **tmp; 148 149 if (arg == NULL) { 150 fprintf(stderr, "%s: NULL arg supplied\n", __func__); 151 free_args(args, *argcount); 152 return NULL; 153 } 154 155 tmp = realloc(args, sizeof(char *) * (*argcount + 1)); 156 if (tmp == NULL) { 157 free(arg); 158 free_args(args, *argcount); 159 return NULL; 160 } 161 162 tmp[*argcount] = arg; 163 (*argcount)++; 164 165 return tmp; 166 } 167 168 #if defined(__linux__) && defined(__x86_64__) 169 170 /* TODO: Can likely get this value from rlimits in the future */ 171 #define SPDK_IOMMU_VA_REQUIRED_WIDTH 48 172 #define VTD_CAP_MGAW_SHIFT 16 173 #define VTD_CAP_MGAW_MASK (0x3F << VTD_CAP_MGAW_SHIFT) 174 175 static int 176 get_iommu_width(void) 177 { 178 DIR *dir; 179 FILE *file; 180 struct dirent *entry; 181 char mgaw_path[64]; 182 char buf[64]; 183 char *end; 184 long long int val; 185 int width, tmp; 186 187 dir = opendir("/sys/devices/virtual/iommu/"); 188 if (dir == NULL) { 189 return -EINVAL; 190 } 191 192 width = 0; 193 194 while ((entry = readdir(dir)) != NULL) { 195 /* Find directories named "dmar0", "dmar1", etc */ 196 if (strncmp(entry->d_name, "dmar", sizeof("dmar") - 1) != 0) { 197 continue; 198 } 199 200 tmp = snprintf(mgaw_path, sizeof(mgaw_path), "/sys/devices/virtual/iommu/%s/intel-iommu/cap", 201 entry->d_name); 202 if ((unsigned)tmp >= sizeof(mgaw_path)) { 203 continue; 204 } 205 206 file = fopen(mgaw_path, "r"); 207 if (file == NULL) { 208 continue; 209 } 210 211 if (fgets(buf, sizeof(buf), file) == NULL) { 212 fclose(file); 213 continue; 214 } 215 216 val = strtoll(buf, &end, 16); 217 if (val == LLONG_MIN || val == LLONG_MAX) { 218 fclose(file); 219 continue; 220 } 221 222 tmp = ((val & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1; 223 if (width == 0 || tmp < width) { 224 width = tmp; 225 } 226 227 fclose(file); 228 } 229 230 closedir(dir); 231 232 return width; 233 } 234 235 #endif 236 237 static int 238 build_eal_cmdline(const struct spdk_env_opts *opts) 239 { 240 int argcount = 0; 241 char **args; 242 243 args = NULL; 244 245 /* set the program name */ 246 args = push_arg(args, &argcount, _sprintf_alloc("%s", opts->name)); 247 if (args == NULL) { 248 return -1; 249 } 250 251 /* disable shared configuration files when in single process mode. This allows for cleaner shutdown */ 252 if (opts->shm_id < 0) { 253 args = push_arg(args, &argcount, _sprintf_alloc("%s", "--no-shconf")); 254 if (args == NULL) { 255 return -1; 256 } 257 } 258 259 /* set the coremask */ 260 /* NOTE: If coremask starts with '[' and ends with ']' it is a core list 261 */ 262 if (opts->core_mask[0] == '[') { 263 char *l_arg = _sprintf_alloc("-l %s", opts->core_mask + 1); 264 265 if (l_arg != NULL) { 266 int len = strlen(l_arg); 267 268 if (l_arg[len - 1] == ']') { 269 l_arg[len - 1] = '\0'; 270 } 271 } 272 args = push_arg(args, &argcount, l_arg); 273 } else { 274 args = push_arg(args, &argcount, _sprintf_alloc("-c %s", opts->core_mask)); 275 } 276 277 if (args == NULL) { 278 return -1; 279 } 280 281 /* set the memory channel number */ 282 if (opts->mem_channel > 0) { 283 args = push_arg(args, &argcount, _sprintf_alloc("-n %d", opts->mem_channel)); 284 if (args == NULL) { 285 return -1; 286 } 287 } 288 289 /* set the memory size */ 290 if (opts->mem_size >= 0) { 291 args = push_arg(args, &argcount, _sprintf_alloc("-m %d", opts->mem_size)); 292 if (args == NULL) { 293 return -1; 294 } 295 } 296 297 /* set the master core */ 298 if (opts->master_core > 0) { 299 args = push_arg(args, &argcount, _sprintf_alloc("--master-lcore=%d", 300 opts->master_core)); 301 if (args == NULL) { 302 return -1; 303 } 304 } 305 306 /* set no pci if enabled */ 307 if (opts->no_pci) { 308 args = push_arg(args, &argcount, _sprintf_alloc("--no-pci")); 309 if (args == NULL) { 310 return -1; 311 } 312 } 313 314 /* create just one hugetlbfs file */ 315 if (opts->hugepage_single_segments) { 316 args = push_arg(args, &argcount, _sprintf_alloc("--single-file-segments")); 317 if (args == NULL) { 318 return -1; 319 } 320 } 321 322 /* unlink hugepages after initialization */ 323 if (opts->unlink_hugepage) { 324 args = push_arg(args, &argcount, _sprintf_alloc("--huge-unlink")); 325 if (args == NULL) { 326 return -1; 327 } 328 } 329 330 /* use a specific hugetlbfs mount */ 331 if (opts->hugedir) { 332 args = push_arg(args, &argcount, _sprintf_alloc("--huge-dir=%s", opts->hugedir)); 333 if (args == NULL) { 334 return -1; 335 } 336 } 337 338 if (opts->num_pci_addr) { 339 size_t i; 340 char bdf[32]; 341 struct spdk_pci_addr *pci_addr = 342 opts->pci_blacklist ? opts->pci_blacklist : opts->pci_whitelist; 343 344 for (i = 0; i < opts->num_pci_addr; i++) { 345 spdk_pci_addr_fmt(bdf, 32, &pci_addr[i]); 346 args = push_arg(args, &argcount, _sprintf_alloc("%s=%s", 347 (opts->pci_blacklist ? "--pci-blacklist" : "--pci-whitelist"), 348 bdf)); 349 if (args == NULL) { 350 return -1; 351 } 352 } 353 } 354 355 /* Lower default EAL loglevel to RTE_LOG_NOTICE - normal, but significant messages. 356 * This can be overridden by specifying the same option in opts->env_context 357 */ 358 args = push_arg(args, &argcount, strdup("--log-level=lib.eal:6")); 359 if (args == NULL) { 360 return -1; 361 } 362 363 /* Lower default CRYPTO loglevel to RTE_LOG_ERR to avoid a ton of init msgs. 364 * This can be overridden by specifying the same option in opts->env_context 365 */ 366 args = push_arg(args, &argcount, strdup("--log-level=lib.cryptodev:5")); 367 if (args == NULL) { 368 return -1; 369 } 370 371 /* `user1` log type is used by rte_vhost, which prints an INFO log for each received 372 * vhost user message. We don't want that. The same log type is also used by a couple 373 * of other DPDK libs, but none of which we make use right now. If necessary, this can 374 * be overridden via opts->env_context. 375 */ 376 args = push_arg(args, &argcount, strdup("--log-level=user1:6")); 377 if (args == NULL) { 378 return -1; 379 } 380 381 if (opts->env_context) { 382 args = push_arg(args, &argcount, strdup(opts->env_context)); 383 if (args == NULL) { 384 return -1; 385 } 386 } 387 388 #ifdef __linux__ 389 390 if (opts->iova_mode) { 391 args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=%s", opts->iova_mode)); 392 if (args == NULL) { 393 return -1; 394 } 395 } else { 396 /* When using vfio with enable_unsafe_noiommu_mode=Y, we need iova-mode=pa, 397 * but DPDK guesses it should be iova-mode=va. Add a check and force 398 * iova-mode=pa here. */ 399 if (rte_vfio_noiommu_is_enabled()) { 400 args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); 401 if (args == NULL) { 402 return -1; 403 } 404 } 405 406 #if defined(__x86_64__) 407 /* DPDK by default guesses that it should be using iova-mode=va so that it can 408 * support running as an unprivileged user. However, some systems (especially 409 * virtual machines) don't have an IOMMU capable of handling the full virtual 410 * address space and DPDK doesn't currently catch that. Add a check in SPDK 411 * and force iova-mode=pa here. */ 412 if (get_iommu_width() < SPDK_IOMMU_VA_REQUIRED_WIDTH) { 413 args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); 414 if (args == NULL) { 415 return -1; 416 } 417 } 418 #elif defined(__PPC64__) 419 /* On Linux + PowerPC, DPDK doesn't support VA mode at all. Unfortunately, it doesn't correctly 420 * auto-detect at the moment, so we'll just force it here. */ 421 args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); 422 if (args == NULL) { 423 return -1; 424 } 425 #endif 426 } 427 428 429 /* Set the base virtual address - it must be an address that is not in the 430 * ASAN shadow region, otherwise ASAN-enabled builds will ignore the 431 * mmap hint. 432 * 433 * Ref: https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm 434 */ 435 args = push_arg(args, &argcount, _sprintf_alloc("--base-virtaddr=0x%" PRIx64, opts->base_virtaddr)); 436 if (args == NULL) { 437 return -1; 438 } 439 440 /* --match-allocation prevents DPDK from merging or splitting system memory allocations under the hood. 441 * This is critical for RDMA when attempting to use an rte_mempool based buffer pool. If DPDK merges two 442 * physically or IOVA contiguous memory regions, then when we go to allocate a buffer pool, it can split 443 * the memory for a buffer over two allocations meaning the buffer will be split over a memory region. 444 */ 445 #if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0) 446 if (!opts->env_context || strstr(opts->env_context, "--legacy-mem") == NULL) { 447 args = push_arg(args, &argcount, _sprintf_alloc("%s", "--match-allocations")); 448 if (args == NULL) { 449 return -1; 450 } 451 } 452 #endif 453 454 if (opts->shm_id < 0) { 455 args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk_pid%d", 456 getpid())); 457 if (args == NULL) { 458 return -1; 459 } 460 } else { 461 args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk%d", 462 opts->shm_id)); 463 if (args == NULL) { 464 return -1; 465 } 466 467 /* set the process type */ 468 args = push_arg(args, &argcount, _sprintf_alloc("--proc-type=auto")); 469 if (args == NULL) { 470 return -1; 471 } 472 } 473 #endif 474 475 g_eal_cmdline = args; 476 g_eal_cmdline_argcount = argcount; 477 return argcount; 478 } 479 480 int 481 spdk_env_dpdk_post_init(bool legacy_mem) 482 { 483 int rc; 484 485 pci_env_init(); 486 487 rc = mem_map_init(legacy_mem); 488 if (rc < 0) { 489 fprintf(stderr, "Failed to allocate mem_map\n"); 490 return rc; 491 } 492 493 rc = vtophys_init(); 494 if (rc < 0) { 495 fprintf(stderr, "Failed to initialize vtophys\n"); 496 return rc; 497 } 498 499 return 0; 500 } 501 502 void 503 spdk_env_dpdk_post_fini(void) 504 { 505 pci_env_fini(); 506 507 free_args(g_eal_cmdline, g_eal_cmdline_argcount); 508 g_eal_cmdline = NULL; 509 g_eal_cmdline_argcount = 0; 510 } 511 512 int 513 spdk_env_init(const struct spdk_env_opts *opts) 514 { 515 char **dpdk_args = NULL; 516 int i, rc; 517 int orig_optind; 518 bool legacy_mem; 519 520 /* If SPDK env has been initialized before, then only pci env requires 521 * reinitialization. 522 */ 523 if (g_external_init == false) { 524 if (opts != NULL) { 525 fprintf(stderr, "Invalid arguments to reinitialize SPDK env\n"); 526 return -EINVAL; 527 } 528 529 printf("Starting %s / %s reinitialization...\n", SPDK_VERSION_STRING, rte_version()); 530 pci_env_reinit(); 531 532 return 0; 533 } 534 535 if (opts == NULL) { 536 fprintf(stderr, "NULL arguments to initialize DPDK\n"); 537 return -EINVAL; 538 } 539 540 rc = build_eal_cmdline(opts); 541 if (rc < 0) { 542 fprintf(stderr, "Invalid arguments to initialize DPDK\n"); 543 return -EINVAL; 544 } 545 546 printf("Starting %s / %s initialization...\n", SPDK_VERSION_STRING, rte_version()); 547 printf("[ DPDK EAL parameters: "); 548 for (i = 0; i < g_eal_cmdline_argcount; i++) { 549 printf("%s ", g_eal_cmdline[i]); 550 } 551 printf("]\n"); 552 553 /* DPDK rearranges the array we pass to it, so make a copy 554 * before passing so we can still free the individual strings 555 * correctly. 556 */ 557 dpdk_args = calloc(g_eal_cmdline_argcount, sizeof(char *)); 558 if (dpdk_args == NULL) { 559 fprintf(stderr, "Failed to allocate dpdk_args\n"); 560 return -ENOMEM; 561 } 562 memcpy(dpdk_args, g_eal_cmdline, sizeof(char *) * g_eal_cmdline_argcount); 563 564 fflush(stdout); 565 orig_optind = optind; 566 optind = 1; 567 rc = rte_eal_init(g_eal_cmdline_argcount, dpdk_args); 568 optind = orig_optind; 569 570 free(dpdk_args); 571 572 if (rc < 0) { 573 if (rte_errno == EALREADY) { 574 fprintf(stderr, "DPDK already initialized\n"); 575 } else { 576 fprintf(stderr, "Failed to initialize DPDK\n"); 577 } 578 return -rte_errno; 579 } 580 581 legacy_mem = false; 582 if (opts->env_context && strstr(opts->env_context, "--legacy-mem") != NULL) { 583 legacy_mem = true; 584 } 585 586 rc = spdk_env_dpdk_post_init(legacy_mem); 587 if (rc == 0) { 588 g_external_init = false; 589 } 590 591 return rc; 592 } 593 594 void 595 spdk_env_fini(void) 596 { 597 spdk_env_dpdk_post_fini(); 598 } 599 600 bool 601 spdk_env_dpdk_external_init(void) 602 { 603 return g_external_init; 604 } 605