1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include "spdk/stdinc.h" 35 36 #include "env_internal.h" 37 38 #include "spdk/version.h" 39 #include "spdk/env_dpdk.h" 40 41 #include <rte_config.h> 42 #include <rte_eal.h> 43 #include <rte_errno.h> 44 #include <rte_vfio.h> 45 46 #define SPDK_ENV_DPDK_DEFAULT_NAME "spdk" 47 #define SPDK_ENV_DPDK_DEFAULT_SHM_ID -1 48 #define SPDK_ENV_DPDK_DEFAULT_MEM_SIZE -1 49 #define SPDK_ENV_DPDK_DEFAULT_MASTER_CORE -1 50 #define SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL -1 51 #define SPDK_ENV_DPDK_DEFAULT_CORE_MASK "0x1" 52 53 static char **g_eal_cmdline; 54 static int g_eal_cmdline_argcount; 55 static bool g_external_init = true; 56 57 static char * 58 _sprintf_alloc(const char *format, ...) 59 { 60 va_list args; 61 va_list args_copy; 62 char *buf; 63 size_t bufsize; 64 int rc; 65 66 va_start(args, format); 67 68 /* Try with a small buffer first. */ 69 bufsize = 32; 70 71 /* Limit maximum buffer size to something reasonable so we don't loop forever. */ 72 while (bufsize <= 1024 * 1024) { 73 buf = malloc(bufsize); 74 if (buf == NULL) { 75 va_end(args); 76 return NULL; 77 } 78 79 va_copy(args_copy, args); 80 rc = vsnprintf(buf, bufsize, format, args_copy); 81 va_end(args_copy); 82 83 /* 84 * If vsnprintf() returned a count within our current buffer size, we are done. 85 * The count does not include the \0 terminator, so rc == bufsize is not OK. 86 */ 87 if (rc >= 0 && (size_t)rc < bufsize) { 88 va_end(args); 89 return buf; 90 } 91 92 /* 93 * vsnprintf() should return the required space, but some libc versions do not 94 * implement this correctly, so just double the buffer size and try again. 95 * 96 * We don't need the data in buf, so rather than realloc(), use free() and malloc() 97 * again to avoid a copy. 98 */ 99 free(buf); 100 bufsize *= 2; 101 } 102 103 va_end(args); 104 return NULL; 105 } 106 107 static void 108 spdk_env_unlink_shared_files(void) 109 { 110 /* Starting with DPDK 18.05, there are more files with unpredictable paths 111 * and filenames. The --no-shconf option prevents from creating them, but 112 * only for DPDK 18.08+. For DPDK 18.05 we just leave them be. 113 */ 114 #if RTE_VERSION < RTE_VERSION_NUM(18, 05, 0, 0) 115 char buffer[PATH_MAX]; 116 117 snprintf(buffer, PATH_MAX, "/var/run/.spdk_pid%d_hugepage_info", getpid()); 118 if (unlink(buffer)) { 119 fprintf(stderr, "Unable to unlink shared memory file: %s. Error code: %d\n", buffer, errno); 120 } 121 #endif 122 } 123 124 void 125 spdk_env_opts_init(struct spdk_env_opts *opts) 126 { 127 if (!opts) { 128 return; 129 } 130 131 memset(opts, 0, sizeof(*opts)); 132 133 opts->name = SPDK_ENV_DPDK_DEFAULT_NAME; 134 opts->core_mask = SPDK_ENV_DPDK_DEFAULT_CORE_MASK; 135 opts->shm_id = SPDK_ENV_DPDK_DEFAULT_SHM_ID; 136 opts->mem_size = SPDK_ENV_DPDK_DEFAULT_MEM_SIZE; 137 opts->master_core = SPDK_ENV_DPDK_DEFAULT_MASTER_CORE; 138 opts->mem_channel = SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL; 139 } 140 141 static void 142 spdk_free_args(char **args, int argcount) 143 { 144 int i; 145 146 for (i = 0; i < argcount; i++) { 147 free(args[i]); 148 } 149 150 if (argcount) { 151 free(args); 152 } 153 } 154 155 static char ** 156 spdk_push_arg(char *args[], int *argcount, char *arg) 157 { 158 char **tmp; 159 160 if (arg == NULL) { 161 fprintf(stderr, "%s: NULL arg supplied\n", __func__); 162 spdk_free_args(args, *argcount); 163 return NULL; 164 } 165 166 tmp = realloc(args, sizeof(char *) * (*argcount + 1)); 167 if (tmp == NULL) { 168 free(arg); 169 spdk_free_args(args, *argcount); 170 return NULL; 171 } 172 173 tmp[*argcount] = arg; 174 (*argcount)++; 175 176 return tmp; 177 } 178 179 #if defined(__linux__) && defined(__x86_64__) 180 181 /* TODO: Can likely get this value from rlimits in the future */ 182 #define SPDK_IOMMU_VA_REQUIRED_WIDTH 48 183 #define VTD_CAP_MGAW_SHIFT 16 184 #define VTD_CAP_MGAW_MASK (0x3F << VTD_CAP_MGAW_SHIFT) 185 186 static int 187 spdk_get_iommu_width(void) 188 { 189 DIR *dir; 190 FILE *file; 191 struct dirent *entry; 192 char mgaw_path[64]; 193 char buf[64]; 194 char *end; 195 long long int val; 196 int width, tmp; 197 198 dir = opendir("/sys/devices/virtual/iommu/"); 199 if (dir == NULL) { 200 return -EINVAL; 201 } 202 203 width = 0; 204 205 while ((entry = readdir(dir)) != NULL) { 206 /* Find directories named "dmar0", "dmar1", etc */ 207 if (strncmp(entry->d_name, "dmar", sizeof("dmar") - 1) != 0) { 208 continue; 209 } 210 211 tmp = snprintf(mgaw_path, sizeof(mgaw_path), "/sys/devices/virtual/iommu/%s/intel-iommu/cap", 212 entry->d_name); 213 if ((unsigned)tmp >= sizeof(mgaw_path)) { 214 continue; 215 } 216 217 file = fopen(mgaw_path, "r"); 218 if (file == NULL) { 219 continue; 220 } 221 222 if (fgets(buf, sizeof(buf), file) == NULL) { 223 fclose(file); 224 continue; 225 } 226 227 val = strtoll(buf, &end, 16); 228 if (val == LLONG_MIN || val == LLONG_MAX) { 229 fclose(file); 230 continue; 231 } 232 233 tmp = ((val & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1; 234 if (width == 0 || tmp < width) { 235 width = tmp; 236 } 237 238 fclose(file); 239 } 240 241 closedir(dir); 242 243 return width; 244 } 245 246 #endif 247 248 static int 249 spdk_build_eal_cmdline(const struct spdk_env_opts *opts) 250 { 251 int argcount = 0; 252 char **args; 253 254 args = NULL; 255 256 /* set the program name */ 257 args = spdk_push_arg(args, &argcount, _sprintf_alloc("%s", opts->name)); 258 if (args == NULL) { 259 return -1; 260 } 261 262 /* disable shared configuration files when in single process mode. This allows for cleaner shutdown */ 263 if (opts->shm_id < 0) { 264 args = spdk_push_arg(args, &argcount, _sprintf_alloc("%s", "--no-shconf")); 265 if (args == NULL) { 266 return -1; 267 } 268 } 269 270 /* set the coremask */ 271 /* NOTE: If coremask starts with '[' and ends with ']' it is a core list 272 */ 273 if (opts->core_mask[0] == '[') { 274 char *l_arg = _sprintf_alloc("-l %s", opts->core_mask + 1); 275 276 if (l_arg != NULL) { 277 int len = strlen(l_arg); 278 279 if (l_arg[len - 1] == ']') { 280 l_arg[len - 1] = '\0'; 281 } 282 } 283 args = spdk_push_arg(args, &argcount, l_arg); 284 } else { 285 args = spdk_push_arg(args, &argcount, _sprintf_alloc("-c %s", opts->core_mask)); 286 } 287 288 if (args == NULL) { 289 return -1; 290 } 291 292 /* set the memory channel number */ 293 if (opts->mem_channel > 0) { 294 args = spdk_push_arg(args, &argcount, _sprintf_alloc("-n %d", opts->mem_channel)); 295 if (args == NULL) { 296 return -1; 297 } 298 } 299 300 /* set the memory size */ 301 if (opts->mem_size >= 0) { 302 args = spdk_push_arg(args, &argcount, _sprintf_alloc("-m %d", opts->mem_size)); 303 if (args == NULL) { 304 return -1; 305 } 306 } 307 308 /* set the master core */ 309 if (opts->master_core > 0) { 310 args = spdk_push_arg(args, &argcount, _sprintf_alloc("--master-lcore=%d", 311 opts->master_core)); 312 if (args == NULL) { 313 return -1; 314 } 315 } 316 317 /* set no pci if enabled */ 318 if (opts->no_pci) { 319 args = spdk_push_arg(args, &argcount, _sprintf_alloc("--no-pci")); 320 if (args == NULL) { 321 return -1; 322 } 323 } 324 325 /* create just one hugetlbfs file */ 326 if (opts->hugepage_single_segments) { 327 args = spdk_push_arg(args, &argcount, _sprintf_alloc("--single-file-segments")); 328 if (args == NULL) { 329 return -1; 330 } 331 } 332 333 /* unlink hugepages after initialization */ 334 if (opts->unlink_hugepage) { 335 args = spdk_push_arg(args, &argcount, _sprintf_alloc("--huge-unlink")); 336 if (args == NULL) { 337 return -1; 338 } 339 } 340 341 /* use a specific hugetlbfs mount */ 342 if (opts->hugedir) { 343 args = spdk_push_arg(args, &argcount, _sprintf_alloc("--huge-dir=%s", opts->hugedir)); 344 if (args == NULL) { 345 return -1; 346 } 347 } 348 349 #if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0) && RTE_VERSION < RTE_VERSION_NUM(18, 5, 1, 0) 350 /* Dynamic memory management is buggy in DPDK 18.05.0. Don't use it. */ 351 if (!opts->env_context || strstr(opts->env_context, "--legacy-mem") == NULL) { 352 args = spdk_push_arg(args, &argcount, _sprintf_alloc("--legacy-mem")); 353 if (args == NULL) { 354 return -1; 355 } 356 } 357 #endif 358 359 if (opts->num_pci_addr) { 360 size_t i; 361 char bdf[32]; 362 struct spdk_pci_addr *pci_addr = 363 opts->pci_blacklist ? opts->pci_blacklist : opts->pci_whitelist; 364 365 for (i = 0; i < opts->num_pci_addr; i++) { 366 spdk_pci_addr_fmt(bdf, 32, &pci_addr[i]); 367 args = spdk_push_arg(args, &argcount, _sprintf_alloc("%s=%s", 368 (opts->pci_blacklist ? "--pci-blacklist" : "--pci-whitelist"), 369 bdf)); 370 if (args == NULL) { 371 return -1; 372 } 373 } 374 } 375 376 /* The following log-level options are not understood by older DPDKs */ 377 #if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0) 378 /* Lower default EAL loglevel to RTE_LOG_NOTICE - normal, but significant messages. 379 * This can be overridden by specifying the same option in opts->env_context 380 */ 381 args = spdk_push_arg(args, &argcount, strdup("--log-level=lib.eal:6")); 382 if (args == NULL) { 383 return -1; 384 } 385 386 /* Lower default CRYPTO loglevel to RTE_LOG_ERR to avoid a ton of init msgs. 387 * This can be overridden by specifying the same option in opts->env_context 388 */ 389 args = spdk_push_arg(args, &argcount, strdup("--log-level=lib.cryptodev:5")); 390 if (args == NULL) { 391 return -1; 392 } 393 394 /* `user1` log type is used by rte_vhost, which prints an INFO log for each received 395 * vhost user message. We don't want that. The same log type is also used by a couple 396 * of other DPDK libs, but none of which we make use right now. If necessary, this can 397 * be overridden via opts->env_context. 398 */ 399 args = spdk_push_arg(args, &argcount, strdup("--log-level=user1:6")); 400 if (args == NULL) { 401 return -1; 402 } 403 #endif 404 405 if (opts->env_context) { 406 args = spdk_push_arg(args, &argcount, strdup(opts->env_context)); 407 if (args == NULL) { 408 return -1; 409 } 410 } 411 412 #ifdef __linux__ 413 414 /* When using vfio with enable_unsafe_noiommu_mode=Y, we need iova-mode=pa, 415 * but DPDK guesses it should be iova-mode=va. Add a check and force 416 * iova-mode=pa here. */ 417 if (rte_vfio_noiommu_is_enabled()) { 418 args = spdk_push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); 419 if (args == NULL) { 420 return -1; 421 } 422 } 423 424 #if defined(__x86_64__) 425 /* DPDK by default guesses that it should be using iova-mode=va so that it can 426 * support running as an unprivileged user. However, some systems (especially 427 * virtual machines) don't have an IOMMU capable of handling the full virtual 428 * address space and DPDK doesn't currently catch that. Add a check in SPDK 429 * and force iova-mode=pa here. */ 430 if (spdk_get_iommu_width() < SPDK_IOMMU_VA_REQUIRED_WIDTH) { 431 args = spdk_push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); 432 if (args == NULL) { 433 return -1; 434 } 435 } 436 #elif defined(__PPC64__) 437 /* On Linux + PowerPC, DPDK doesn't support VA mode at all. Unfortunately, it doesn't correctly 438 * auto-detect at the moment, so we'll just force it here. */ 439 args = spdk_push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); 440 if (args == NULL) { 441 return -1; 442 } 443 #endif 444 445 446 /* Set the base virtual address - it must be an address that is not in the 447 * ASAN shadow region, otherwise ASAN-enabled builds will ignore the 448 * mmap hint. 449 * 450 * Ref: https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm 451 */ 452 args = spdk_push_arg(args, &argcount, _sprintf_alloc("--base-virtaddr=0x200000000000")); 453 if (args == NULL) { 454 return -1; 455 } 456 457 /* --match-allocation prevents DPDK from merging or splitting system memory allocations under the hood. 458 * This is critical for RDMA when attempting to use an rte_mempool based buffer pool. If DPDK merges two 459 * physically or IOVA contiguous memory regions, then when we go to allocate a buffer pool, it can split 460 * the memory for a buffer over two allocations meaning the buffer will be split over a memory region. 461 */ 462 #if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0) 463 if (!opts->env_context || strstr(opts->env_context, "--legacy-mem") == NULL) { 464 args = spdk_push_arg(args, &argcount, _sprintf_alloc("%s", "--match-allocations")); 465 if (args == NULL) { 466 return -1; 467 } 468 } 469 #endif 470 471 if (opts->shm_id < 0) { 472 args = spdk_push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk_pid%d", 473 getpid())); 474 if (args == NULL) { 475 return -1; 476 } 477 } else { 478 args = spdk_push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk%d", 479 opts->shm_id)); 480 if (args == NULL) { 481 return -1; 482 } 483 484 /* set the process type */ 485 args = spdk_push_arg(args, &argcount, _sprintf_alloc("--proc-type=auto")); 486 if (args == NULL) { 487 return -1; 488 } 489 } 490 #endif 491 492 g_eal_cmdline = args; 493 g_eal_cmdline_argcount = argcount; 494 return argcount; 495 } 496 497 int 498 spdk_env_dpdk_post_init(bool legacy_mem) 499 { 500 int rc; 501 502 pci_init(); 503 504 rc = mem_map_init(legacy_mem); 505 if (rc < 0) { 506 fprintf(stderr, "Failed to allocate mem_map\n"); 507 return rc; 508 } 509 510 rc = vtophys_init(); 511 if (rc < 0) { 512 fprintf(stderr, "Failed to initialize vtophys\n"); 513 return rc; 514 } 515 516 return 0; 517 } 518 519 void 520 spdk_env_dpdk_post_fini(void) 521 { 522 pci_fini(); 523 524 spdk_free_args(g_eal_cmdline, g_eal_cmdline_argcount); 525 } 526 527 int 528 spdk_env_init(const struct spdk_env_opts *opts) 529 { 530 char **dpdk_args = NULL; 531 int i, rc; 532 int orig_optind; 533 bool legacy_mem; 534 535 g_external_init = false; 536 537 rc = spdk_build_eal_cmdline(opts); 538 if (rc < 0) { 539 fprintf(stderr, "Invalid arguments to initialize DPDK\n"); 540 return -EINVAL; 541 } 542 543 printf("Starting %s / %s initialization...\n", SPDK_VERSION_STRING, rte_version()); 544 printf("[ DPDK EAL parameters: "); 545 for (i = 0; i < g_eal_cmdline_argcount; i++) { 546 printf("%s ", g_eal_cmdline[i]); 547 } 548 printf("]\n"); 549 550 /* DPDK rearranges the array we pass to it, so make a copy 551 * before passing so we can still free the individual strings 552 * correctly. 553 */ 554 dpdk_args = calloc(g_eal_cmdline_argcount, sizeof(char *)); 555 if (dpdk_args == NULL) { 556 fprintf(stderr, "Failed to allocate dpdk_args\n"); 557 return -ENOMEM; 558 } 559 memcpy(dpdk_args, g_eal_cmdline, sizeof(char *) * g_eal_cmdline_argcount); 560 561 fflush(stdout); 562 orig_optind = optind; 563 optind = 1; 564 rc = rte_eal_init(g_eal_cmdline_argcount, dpdk_args); 565 optind = orig_optind; 566 567 free(dpdk_args); 568 569 if (rc < 0) { 570 if (rte_errno == EALREADY) { 571 fprintf(stderr, "DPDK already initialized\n"); 572 } else { 573 fprintf(stderr, "Failed to initialize DPDK\n"); 574 } 575 return -rte_errno; 576 } 577 578 if (opts->shm_id < 0 && !opts->hugepage_single_segments) { 579 /* 580 * Unlink hugepage and config info files after init. This will ensure they get 581 * deleted on app exit, even if the app crashes and does not exit normally. 582 * Only do this when not in multi-process mode, since for multi-process other 583 * apps will need to open these files. These files are not created for 584 * "single file segments". 585 */ 586 spdk_env_unlink_shared_files(); 587 } 588 589 legacy_mem = false; 590 if (opts->env_context && strstr(opts->env_context, "--legacy-mem") != NULL) { 591 legacy_mem = true; 592 } 593 594 return spdk_env_dpdk_post_init(legacy_mem); 595 } 596 597 void 598 spdk_env_fini(void) 599 { 600 spdk_env_dpdk_post_fini(); 601 } 602 603 bool 604 spdk_env_dpdk_external_init(void) 605 { 606 return g_external_init; 607 } 608