xref: /spdk/lib/env_dpdk/init.c (revision a6dbe3721eb3b5990707fc3e378c95e505dd8ab5)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2017 Intel Corporation.
3  *   All rights reserved.
4  */
5 
6 #include "spdk/stdinc.h"
7 
8 #include "env_internal.h"
9 
10 #include "spdk/version.h"
11 #include "spdk/env_dpdk.h"
12 #include "spdk/log.h"
13 
14 #include <rte_config.h>
15 #include <rte_eal.h>
16 #include <rte_errno.h>
17 #include <rte_vfio.h>
18 
19 #define SPDK_ENV_DPDK_DEFAULT_NAME		"spdk"
20 #define SPDK_ENV_DPDK_DEFAULT_SHM_ID		-1
21 #define SPDK_ENV_DPDK_DEFAULT_MEM_SIZE		-1
22 #define SPDK_ENV_DPDK_DEFAULT_MAIN_CORE		-1
23 #define SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL	-1
24 #define SPDK_ENV_DPDK_DEFAULT_CORE_MASK		"0x1"
25 #define SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR	0x200000000000
26 
27 #if RTE_VERSION < RTE_VERSION_NUM(20, 11, 0, 0)
28 #define DPDK_ALLOW_PARAM	"--pci-whitelist"
29 #define DPDK_BLOCK_PARAM	"--pci-blacklist"
30 #define DPDK_MAIN_CORE_PARAM	"--master-lcore"
31 #else
32 #define DPDK_ALLOW_PARAM	"--allow"
33 #define DPDK_BLOCK_PARAM	"--block"
34 #define DPDK_MAIN_CORE_PARAM	"--main-lcore"
35 #endif
36 
37 static char **g_eal_cmdline;
38 static int g_eal_cmdline_argcount;
39 static bool g_external_init = true;
40 
/*
 * printf-style formatting into a freshly malloc()'d buffer.
 *
 * The buffer starts at 32 bytes and is doubled until the formatted text plus
 * its NUL terminator fits.  NULL is returned if an allocation fails or if the
 * text still does not fit once the next size would exceed 1 MiB.  The caller
 * owns the returned string and releases it with free().
 */
static char *
_sprintf_alloc(const char *format, ...)
{
	const size_t size_limit = 1024 * 1024;
	va_list ap;
	char *result = NULL;
	size_t size;

	va_start(ap, format);

	/* The upper bound on size guarantees the loop terminates. */
	for (size = 32; size <= size_limit; size *= 2) {
		va_list ap_attempt;
		char *candidate;
		int written;

		candidate = malloc(size);
		if (candidate == NULL) {
			break;
		}

		/* Each attempt consumes its own copy of the argument list. */
		va_copy(ap_attempt, ap);
		written = vsnprintf(candidate, size, format, ap_attempt);
		va_end(ap_attempt);

		/*
		 * The returned count excludes the terminator, so a count equal to
		 * the buffer size means the output was truncated.
		 */
		if (written >= 0 && (size_t)written < size) {
			result = candidate;
			break;
		}

		/*
		 * The required length reported by vsnprintf() is not trusted on
		 * every libc, so simply retry with twice the space.  The old
		 * contents are useless, hence free() + malloc() instead of
		 * realloc(), which would copy them.
		 */
		free(candidate);
	}

	va_end(ap);
	return result;
}
90 
91 void
92 spdk_env_opts_init(struct spdk_env_opts *opts)
93 {
94 	if (!opts) {
95 		return;
96 	}
97 
98 	memset(opts, 0, sizeof(*opts));
99 
100 	opts->name = SPDK_ENV_DPDK_DEFAULT_NAME;
101 	opts->core_mask = SPDK_ENV_DPDK_DEFAULT_CORE_MASK;
102 	opts->shm_id = SPDK_ENV_DPDK_DEFAULT_SHM_ID;
103 	opts->mem_size = SPDK_ENV_DPDK_DEFAULT_MEM_SIZE;
104 	opts->main_core = SPDK_ENV_DPDK_DEFAULT_MAIN_CORE;
105 	opts->mem_channel = SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL;
106 	opts->base_virtaddr = SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR;
107 }
108 
/*
 * Release an argument vector: the first argcount strings, then the vector
 * itself.  Nothing is done when args is NULL or argcount is zero.
 */
static void
free_args(char **args, int argcount)
{
	int idx = 0;

	if (args == NULL || argcount == 0) {
		return;
	}

	while (idx < argcount) {
		free(args[idx]);
		idx++;
	}

	free(args);
}
126 
/*
 * Append one heap-allocated string to an argument vector.
 *
 * args     - current vector (may be NULL when empty)
 * argcount - in/out: number of entries in the vector
 * arg      - string to append; ownership passes to the vector
 *
 * Returns the (possibly relocated) vector on success.  On failure - arg is
 * NULL or the vector cannot be grown - arg and the entire existing vector are
 * freed and NULL is returned, so the caller has nothing left to clean up.
 * *argcount is only updated on success.
 */
static char **
push_arg(char *args[], int *argcount, char *arg)
{
	int count = *argcount;
	char **grown;

	if (arg == NULL) {
		SPDK_ERRLOG("%s: NULL arg supplied\n", __func__);
		free_args(args, count);
		return NULL;
	}

	grown = realloc(args, sizeof(char *) * (count + 1));
	if (grown == NULL) {
		free(arg);
		free_args(args, count);
		return NULL;
	}

	grown[count] = arg;
	*argcount = count + 1;

	return grown;
}
150 
151 #if defined(__linux__) && defined(__x86_64__)
152 
153 /* TODO: Can likely get this value from rlimits in the future */
154 #define SPDK_IOMMU_VA_REQUIRED_WIDTH 48
155 #define VTD_CAP_MGAW_SHIFT 16
156 #define VTD_CAP_MGAW_MASK (0x3F << VTD_CAP_MGAW_SHIFT)
157 #define RD_AMD_CAP_VASIZE_SHIFT 15
158 #define RD_AMD_CAP_VASIZE_MASK (0x7F << RD_AMD_CAP_VASIZE_SHIFT)
159 
160 static int
161 get_amd_iommu_width(void)
162 {
163 	FILE *file;
164 	char buf[64];
165 	char *end;
166 	long long int amd_cap;
167 
168 	file = fopen("/sys/class/iommu/ivhd2/amd-iommu/cap", "r");
169 	if (file == NULL) {
170 		return 0;
171 	}
172 
173 	if (fgets(buf, sizeof(buf), file) == NULL) {
174 		fclose(file);
175 		return 0;
176 	}
177 
178 	amd_cap = strtoll(buf, &end, 16);
179 	if (amd_cap == LLONG_MIN || amd_cap == LLONG_MAX) {
180 		fclose(file);
181 		return 0;
182 	}
183 
184 	fclose(file);
185 	return (amd_cap & RD_AMD_CAP_VASIZE_MASK) >> RD_AMD_CAP_VASIZE_SHIFT;
186 }
187 
188 static int
189 get_iommu_width(void)
190 {
191 	DIR *dir;
192 	FILE *file;
193 	struct dirent *entry;
194 	char mgaw_path[64];
195 	char buf[64];
196 	char *end;
197 	long long int val;
198 	int width, tmp;
199 	struct stat s;
200 
201 	if (stat("/sys/class/iommu/ivhd2/amd-iommu", &s) == 0) {
202 		return get_amd_iommu_width();
203 	}
204 
205 	dir = opendir("/sys/devices/virtual/iommu/");
206 	if (dir == NULL) {
207 		return -EINVAL;
208 	}
209 
210 	width = 0;
211 
212 	while ((entry = readdir(dir)) != NULL) {
213 		/* Find directories named "dmar0", "dmar1", etc */
214 		if (strncmp(entry->d_name, "dmar", sizeof("dmar") - 1) != 0) {
215 			continue;
216 		}
217 
218 		tmp = snprintf(mgaw_path, sizeof(mgaw_path), "/sys/devices/virtual/iommu/%s/intel-iommu/cap",
219 			       entry->d_name);
220 		if ((unsigned)tmp >= sizeof(mgaw_path)) {
221 			continue;
222 		}
223 
224 		file = fopen(mgaw_path, "r");
225 		if (file == NULL) {
226 			continue;
227 		}
228 
229 		if (fgets(buf, sizeof(buf), file) == NULL) {
230 			fclose(file);
231 			continue;
232 		}
233 
234 		val = strtoll(buf, &end, 16);
235 		if (val == LLONG_MIN || val == LLONG_MAX) {
236 			fclose(file);
237 			continue;
238 		}
239 
240 		tmp = ((val & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1;
241 		if (width == 0 || tmp < width) {
242 			width = tmp;
243 		}
244 
245 		fclose(file);
246 	}
247 
248 	closedir(dir);
249 
250 	return width;
251 }
252 
253 #endif
254 
255 static int
256 build_eal_cmdline(const struct spdk_env_opts *opts)
257 {
258 	int argcount = 0;
259 	char **args;
260 
261 	args = NULL;
262 
263 	/* set the program name */
264 	args = push_arg(args, &argcount, _sprintf_alloc("%s", opts->name));
265 	if (args == NULL) {
266 		return -1;
267 	}
268 
269 	/* disable shared configuration files when in single process mode. This allows for cleaner shutdown */
270 	if (opts->shm_id < 0) {
271 		args = push_arg(args, &argcount, _sprintf_alloc("%s", "--no-shconf"));
272 		if (args == NULL) {
273 			return -1;
274 		}
275 	}
276 
277 	/*
278 	 * Set the coremask:
279 	 *
280 	 * - if it starts with '-', we presume it's literal EAL arguments such
281 	 *   as --lcores.
282 	 *
283 	 * - if it starts with '[', we presume it's a core list to use with the
284 	 *   -l option.
285 	 *
286 	 * - otherwise, it's a CPU mask of the form "0xff.." as expected by the
287 	 *   -c option.
288 	 */
289 	if (opts->core_mask[0] == '-') {
290 		args = push_arg(args, &argcount, _sprintf_alloc("%s", opts->core_mask));
291 	} else if (opts->core_mask[0] == '[') {
292 		char *l_arg = _sprintf_alloc("-l %s", opts->core_mask + 1);
293 
294 		if (l_arg != NULL) {
295 			int len = strlen(l_arg);
296 
297 			if (l_arg[len - 1] == ']') {
298 				l_arg[len - 1] = '\0';
299 			}
300 		}
301 		args = push_arg(args, &argcount, l_arg);
302 	} else {
303 		args = push_arg(args, &argcount, _sprintf_alloc("-c %s", opts->core_mask));
304 	}
305 
306 	if (args == NULL) {
307 		return -1;
308 	}
309 
310 	/* set the memory channel number */
311 	if (opts->mem_channel > 0) {
312 		args = push_arg(args, &argcount, _sprintf_alloc("-n %d", opts->mem_channel));
313 		if (args == NULL) {
314 			return -1;
315 		}
316 	}
317 
318 	/* set the memory size */
319 	if (opts->mem_size >= 0) {
320 		args = push_arg(args, &argcount, _sprintf_alloc("-m %d", opts->mem_size));
321 		if (args == NULL) {
322 			return -1;
323 		}
324 	}
325 
326 	/* set the main core */
327 	if (opts->main_core > 0) {
328 		args = push_arg(args, &argcount, _sprintf_alloc("%s=%d",
329 				DPDK_MAIN_CORE_PARAM, opts->main_core));
330 		if (args == NULL) {
331 			return -1;
332 		}
333 	}
334 
335 	/* set no pci  if enabled */
336 	if (opts->no_pci) {
337 		args = push_arg(args, &argcount, _sprintf_alloc("--no-pci"));
338 		if (args == NULL) {
339 			return -1;
340 		}
341 	}
342 
343 	/* create just one hugetlbfs file */
344 	if (opts->hugepage_single_segments) {
345 		args = push_arg(args, &argcount, _sprintf_alloc("--single-file-segments"));
346 		if (args == NULL) {
347 			return -1;
348 		}
349 	}
350 
351 	/* unlink hugepages after initialization */
352 	/* Note: Automatically unlink hugepage when shm_id < 0, since it means we're not using
353 	 * multi-process so we don't need the hugepage links anymore.  But we need to make sure
354 	 * we don't specify --huge-unlink implicitly if --single-file-segments was specified since
355 	 * DPDK doesn't support that.
356 	 */
357 	if (opts->unlink_hugepage ||
358 	    (opts->shm_id < 0 && !opts->hugepage_single_segments)) {
359 		args = push_arg(args, &argcount, _sprintf_alloc("--huge-unlink"));
360 		if (args == NULL) {
361 			return -1;
362 		}
363 	}
364 
365 	/* use a specific hugetlbfs mount */
366 	if (opts->hugedir) {
367 		args = push_arg(args, &argcount, _sprintf_alloc("--huge-dir=%s", opts->hugedir));
368 		if (args == NULL) {
369 			return -1;
370 		}
371 	}
372 
373 	if (opts->num_pci_addr) {
374 		size_t i;
375 		char bdf[32];
376 		struct spdk_pci_addr *pci_addr =
377 				opts->pci_blocked ? opts->pci_blocked : opts->pci_allowed;
378 
379 		for (i = 0; i < opts->num_pci_addr; i++) {
380 			spdk_pci_addr_fmt(bdf, 32, &pci_addr[i]);
381 			args = push_arg(args, &argcount, _sprintf_alloc("%s=%s",
382 					(opts->pci_blocked ? DPDK_BLOCK_PARAM : DPDK_ALLOW_PARAM),
383 					bdf));
384 			if (args == NULL) {
385 				return -1;
386 			}
387 		}
388 	}
389 
390 	/* Lower default EAL loglevel to RTE_LOG_NOTICE - normal, but significant messages.
391 	 * This can be overridden by specifying the same option in opts->env_context
392 	 */
393 	args = push_arg(args, &argcount, strdup("--log-level=lib.eal:6"));
394 	if (args == NULL) {
395 		return -1;
396 	}
397 
398 	/* Lower default CRYPTO loglevel to RTE_LOG_ERR to avoid a ton of init msgs.
399 	 * This can be overridden by specifying the same option in opts->env_context
400 	 */
401 	args = push_arg(args, &argcount, strdup("--log-level=lib.cryptodev:5"));
402 	if (args == NULL) {
403 		return -1;
404 	}
405 
406 	/* `user1` log type is used by rte_vhost, which prints an INFO log for each received
407 	 * vhost user message. We don't want that. The same log type is also used by a couple
408 	 * of other DPDK libs, but none of which we make use right now. If necessary, this can
409 	 * be overridden via opts->env_context.
410 	 */
411 	args = push_arg(args, &argcount, strdup("--log-level=user1:6"));
412 	if (args == NULL) {
413 		return -1;
414 	}
415 
416 	if (opts->env_context) {
417 		char *ptr = strdup(opts->env_context);
418 		char *tok = strtok(ptr, " \t");
419 
420 		/* DPDK expects each argument as a separate string in the argv
421 		 * array, so we need to tokenize here in case the caller
422 		 * passed multiple arguments in the env_context string.
423 		 */
424 		while (tok != NULL) {
425 			args = push_arg(args, &argcount, strdup(tok));
426 			tok = strtok(NULL, " \t");
427 		}
428 
429 		free(ptr);
430 	}
431 
432 #ifdef __linux__
433 
434 	if (opts->iova_mode) {
435 		args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=%s", opts->iova_mode));
436 		if (args == NULL) {
437 			return -1;
438 		}
439 	} else {
440 		/* When using vfio with enable_unsafe_noiommu_mode=Y, we need iova-mode=pa,
441 		 * but DPDK guesses it should be iova-mode=va. Add a check and force
442 		 * iova-mode=pa here. */
443 		if (rte_vfio_noiommu_is_enabled()) {
444 			args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
445 			if (args == NULL) {
446 				return -1;
447 			}
448 		}
449 
450 #if defined(__x86_64__)
451 		/* DPDK by default guesses that it should be using iova-mode=va so that it can
452 		 * support running as an unprivileged user. However, some systems (especially
453 		 * virtual machines) don't have an IOMMU capable of handling the full virtual
454 		 * address space and DPDK doesn't currently catch that. Add a check in SPDK
455 		 * and force iova-mode=pa here. */
456 		if (get_iommu_width() < SPDK_IOMMU_VA_REQUIRED_WIDTH) {
457 			args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
458 			if (args == NULL) {
459 				return -1;
460 			}
461 		}
462 #elif defined(__PPC64__)
463 		/* On Linux + PowerPC, DPDK doesn't support VA mode at all. Unfortunately, it doesn't correctly
464 		 * auto-detect at the moment, so we'll just force it here. */
465 		args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
466 		if (args == NULL) {
467 			return -1;
468 		}
469 #endif
470 	}
471 
472 
473 	/* Set the base virtual address - it must be an address that is not in the
474 	 * ASAN shadow region, otherwise ASAN-enabled builds will ignore the
475 	 * mmap hint.
476 	 *
477 	 * Ref: https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm
478 	 */
479 	args = push_arg(args, &argcount, _sprintf_alloc("--base-virtaddr=0x%" PRIx64, opts->base_virtaddr));
480 	if (args == NULL) {
481 		return -1;
482 	}
483 
484 	/* --match-allocation prevents DPDK from merging or splitting system memory allocations under the hood.
485 	 * This is critical for RDMA when attempting to use an rte_mempool based buffer pool. If DPDK merges two
486 	 * physically or IOVA contiguous memory regions, then when we go to allocate a buffer pool, it can split
487 	 * the memory for a buffer over two allocations meaning the buffer will be split over a memory region.
488 	 */
489 	if (!opts->env_context || strstr(opts->env_context, "--legacy-mem") == NULL) {
490 		args = push_arg(args, &argcount, _sprintf_alloc("%s", "--match-allocations"));
491 		if (args == NULL) {
492 			return -1;
493 		}
494 	}
495 
496 	if (opts->shm_id < 0) {
497 		args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk_pid%d",
498 				getpid()));
499 		if (args == NULL) {
500 			return -1;
501 		}
502 	} else {
503 		args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk%d",
504 				opts->shm_id));
505 		if (args == NULL) {
506 			return -1;
507 		}
508 
509 		/* set the process type */
510 		args = push_arg(args, &argcount, _sprintf_alloc("--proc-type=auto"));
511 		if (args == NULL) {
512 			return -1;
513 		}
514 	}
515 #endif
516 
517 	g_eal_cmdline = args;
518 	g_eal_cmdline_argcount = argcount;
519 	return argcount;
520 }
521 
/*
 * Bring up the SPDK-side environment pieces: PCI environment, memory map and
 * vtophys, in that order.
 *
 * legacy_mem is passed through to mem_map_init().
 *
 * Returns 0 on success.  The first step that reports a negative value stops
 * the sequence; an error is logged and that value is returned.
 */
int
spdk_env_dpdk_post_init(bool legacy_mem)
{
	int status;

	status = pci_env_init();
	if (status < 0) {
		SPDK_ERRLOG("pci_env_init() failed\n");
		return status;
	}

	status = mem_map_init(legacy_mem);
	if (status < 0) {
		SPDK_ERRLOG("Failed to allocate mem_map\n");
		return status;
	}

	status = vtophys_init();
	if (status < 0) {
		SPDK_ERRLOG("Failed to initialize vtophys\n");
		return status;
	}

	return 0;
}
547 
548 void
549 spdk_env_dpdk_post_fini(void)
550 {
551 	pci_env_fini();
552 
553 	free_args(g_eal_cmdline, g_eal_cmdline_argcount);
554 	g_eal_cmdline = NULL;
555 	g_eal_cmdline_argcount = 0;
556 }
557 
558 int
559 spdk_env_init(const struct spdk_env_opts *opts)
560 {
561 	char **dpdk_args = NULL;
562 	int i, rc;
563 	int orig_optind;
564 	bool legacy_mem;
565 
566 	/* If SPDK env has been initialized before, then only pci env requires
567 	 * reinitialization.
568 	 */
569 	if (g_external_init == false) {
570 		if (opts != NULL) {
571 			fprintf(stderr, "Invalid arguments to reinitialize SPDK env\n");
572 			return -EINVAL;
573 		}
574 
575 		printf("Starting %s / %s reinitialization...\n", SPDK_VERSION_STRING, rte_version());
576 		pci_env_reinit();
577 
578 		return 0;
579 	}
580 
581 	if (opts == NULL) {
582 		fprintf(stderr, "NULL arguments to initialize DPDK\n");
583 		return -EINVAL;
584 	}
585 
586 	rc = build_eal_cmdline(opts);
587 	if (rc < 0) {
588 		SPDK_ERRLOG("Invalid arguments to initialize DPDK\n");
589 		return -EINVAL;
590 	}
591 
592 	SPDK_PRINTF("Starting %s / %s initialization...\n", SPDK_VERSION_STRING, rte_version());
593 	SPDK_PRINTF("[ DPDK EAL parameters: ");
594 	for (i = 0; i < g_eal_cmdline_argcount; i++) {
595 		SPDK_PRINTF("%s ", g_eal_cmdline[i]);
596 	}
597 	SPDK_PRINTF("]\n");
598 
599 	/* DPDK rearranges the array we pass to it, so make a copy
600 	 * before passing so we can still free the individual strings
601 	 * correctly.
602 	 */
603 	dpdk_args = calloc(g_eal_cmdline_argcount, sizeof(char *));
604 	if (dpdk_args == NULL) {
605 		SPDK_ERRLOG("Failed to allocate dpdk_args\n");
606 		return -ENOMEM;
607 	}
608 	memcpy(dpdk_args, g_eal_cmdline, sizeof(char *) * g_eal_cmdline_argcount);
609 
610 	fflush(stdout);
611 	orig_optind = optind;
612 	optind = 1;
613 	rc = rte_eal_init(g_eal_cmdline_argcount, dpdk_args);
614 	optind = orig_optind;
615 
616 	free(dpdk_args);
617 
618 	if (rc < 0) {
619 		if (rte_errno == EALREADY) {
620 			SPDK_ERRLOG("DPDK already initialized\n");
621 		} else {
622 			SPDK_ERRLOG("Failed to initialize DPDK\n");
623 		}
624 		return -rte_errno;
625 	}
626 
627 	legacy_mem = false;
628 	if (opts->env_context && strstr(opts->env_context, "--legacy-mem") != NULL) {
629 		legacy_mem = true;
630 	}
631 
632 	rc = spdk_env_dpdk_post_init(legacy_mem);
633 	if (rc == 0) {
634 		g_external_init = false;
635 	}
636 
637 	return rc;
638 }
639 
640 /* We use priority 101 which is the highest priority level available
641  * to applications (the toolchains reserve 1 to 100 for internal usage).
642  * This ensures this destructor runs last, after any other destructors
643  * that might still need the environment up and running.
644  */
645 __attribute__((destructor(101))) static void
646 dpdk_cleanup(void)
647 {
648 	/* Only call rte_eal_cleanup if the SPDK env library called rte_eal_init. */
649 	if (!g_external_init) {
650 		rte_eal_cleanup();
651 	}
652 }
653 
/*
 * Public teardown entry point: delegates to spdk_env_dpdk_post_fini().
 */
void
spdk_env_fini(void)
{
	spdk_env_dpdk_post_fini();
}
659 
660 bool
661 spdk_env_dpdk_external_init(void)
662 {
663 	return g_external_init;
664 }
665