xref: /spdk/lib/env_dpdk/init.c (revision 60982c759db49b4f4579f16e3b24df0725ba4b94)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2017 Intel Corporation.
3  *   All rights reserved.
4  */
5 
6 #include "spdk/stdinc.h"
7 
8 #include "env_internal.h"
9 
10 #include "spdk/version.h"
11 #include "spdk/env_dpdk.h"
12 #include "spdk/log.h"
13 
14 #include <rte_config.h>
15 #include <rte_eal.h>
16 #include <rte_errno.h>
17 #include <rte_vfio.h>
18 
19 #define SPDK_ENV_DPDK_DEFAULT_NAME		"spdk"
20 #define SPDK_ENV_DPDK_DEFAULT_SHM_ID		-1
21 #define SPDK_ENV_DPDK_DEFAULT_MEM_SIZE		-1
22 #define SPDK_ENV_DPDK_DEFAULT_MAIN_CORE		-1
23 #define SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL	-1
24 #define SPDK_ENV_DPDK_DEFAULT_CORE_MASK		"0x1"
25 #define SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR	0x200000000000
26 
27 #if RTE_VERSION < RTE_VERSION_NUM(20, 11, 0, 0)
28 #define DPDK_ALLOW_PARAM	"--pci-whitelist"
29 #define DPDK_BLOCK_PARAM	"--pci-blacklist"
30 #define DPDK_MAIN_CORE_PARAM	"--master-lcore"
31 #else
32 #define DPDK_ALLOW_PARAM	"--allow"
33 #define DPDK_BLOCK_PARAM	"--block"
34 #define DPDK_MAIN_CORE_PARAM	"--main-lcore"
35 #endif
36 
37 static char **g_eal_cmdline;
38 static int g_eal_cmdline_argcount;
39 static bool g_external_init = true;
40 
/*
 * printf-style formatting into a freshly malloc()'ed buffer.
 *
 * The buffer starts at 32 bytes and is doubled until the formatted output, including
 * its terminating NUL, fits. Buffers larger than 1 MiB are never attempted.
 *
 * Returns the formatted string (to be released with free()), or NULL when an allocation
 * fails or the output does not fit within the size limit.
 */
static char *
_sprintf_alloc(const char *format, ...)
{
	va_list ap;
	char *result = NULL;
	size_t cap;

	va_start(ap, format);

	/* The upper bound on the capacity keeps a misbehaving vsnprintf() from looping forever. */
	for (cap = 32; cap <= 1024 * 1024; cap *= 2) {
		va_list attempt;
		char *candidate;
		int written;

		candidate = malloc(cap);
		if (candidate == NULL) {
			break;
		}

		/* Each formatting attempt consumes its own copy of the argument list. */
		va_copy(attempt, ap);
		written = vsnprintf(candidate, cap, format, attempt);
		va_end(attempt);

		/*
		 * The returned count excludes the NUL terminator, so a count equal to the
		 * capacity means the output was truncated.
		 */
		if (written >= 0 && (size_t)written < cap) {
			result = candidate;
			break;
		}

		/*
		 * The reported length is deliberately not trusted (some libc versions get it
		 * wrong); the size is simply doubled. The old contents are not needed, so
		 * free() + malloc() is used rather than realloc() to avoid a copy.
		 */
		free(candidate);
	}

	va_end(ap);
	return result;
}
90 
91 void
92 spdk_env_opts_init(struct spdk_env_opts *opts)
93 {
94 	if (!opts) {
95 		return;
96 	}
97 
98 	memset(opts, 0, sizeof(*opts));
99 
100 	opts->name = SPDK_ENV_DPDK_DEFAULT_NAME;
101 	opts->core_mask = SPDK_ENV_DPDK_DEFAULT_CORE_MASK;
102 	opts->shm_id = SPDK_ENV_DPDK_DEFAULT_SHM_ID;
103 	opts->mem_size = SPDK_ENV_DPDK_DEFAULT_MEM_SIZE;
104 	opts->main_core = SPDK_ENV_DPDK_DEFAULT_MAIN_CORE;
105 	opts->mem_channel = SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL;
106 	opts->base_virtaddr = SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR;
107 }
108 
/*
 * Free the first argcount strings of an argument vector and then, when argcount is
 * non-zero, the vector itself. A NULL vector is ignored.
 */
static void
free_args(char **args, int argcount)
{
	int idx;

	if (args != NULL) {
		for (idx = 0; idx < argcount; idx++) {
			free(args[idx]);
		}

		if (argcount != 0) {
			free(args);
		}
	}
}
126 
/*
 * Append arg to the argument vector args, which currently holds *argcount entries.
 *
 * On success the (possibly moved) vector is returned, *argcount is incremented and the
 * vector takes ownership of arg. On failure - arg is NULL or the vector cannot be
 * grown - arg and the entire existing vector are freed and NULL is returned, so the
 * caller must not touch the old vector afterwards.
 */
static char **
push_arg(char *args[], int *argcount, char *arg)
{
	char **grown;

	if (arg == NULL) {
		SPDK_ERRLOG("%s: NULL arg supplied\n", __func__);
		free_args(args, *argcount);
		return NULL;
	}

	/* Make room for one more pointer. */
	grown = realloc(args, sizeof(char *) * (*argcount + 1));
	if (grown != NULL) {
		grown[(*argcount)++] = arg;
		return grown;
	}

	/* realloc() left the original vector untouched; release it along with arg. */
	free(arg);
	free_args(args, *argcount);
	return NULL;
}
150 
151 #if defined(__linux__) && defined(__x86_64__)
152 
/* TODO: Can likely get this value from rlimits in the future */
#define SPDK_IOMMU_VA_REQUIRED_WIDTH 48
#define VTD_CAP_MGAW_SHIFT 16
#define VTD_CAP_MGAW_MASK (0x3F << VTD_CAP_MGAW_SHIFT)
#define RD_AMD_CAP_VASIZE_SHIFT 15
#define RD_AMD_CAP_VASIZE_MASK (0x7F << RD_AMD_CAP_VASIZE_SHIFT)

/*
 * Determine the narrowest address width, in bits, advertised by the IOMMUs exposed
 * through sysfs.
 *
 * Every Intel (dmar*) and AMD (ivhd*) "cap" file that matches is read as a hexadecimal
 * capability value. The width is the masked field of that value plus one, and the
 * smallest width across all readable files is returned. Files that cannot be opened or
 * parsed are skipped.
 *
 * Returns 0 if no capability value could be read at all.
 */
static int
get_iommu_width(void)
{
	int width = 0;
	glob_t glob_results = {0};

	/* Break * and / into separate strings to appease check_format.sh comment style check. */
	glob("/sys/devices/virtual/iommu/dmar*" "/intel-iommu/cap", 0, NULL, &glob_results);
	glob("/sys/class/iommu/ivhd*" "/amd-iommu/cap", GLOB_APPEND, NULL, &glob_results);

	for (size_t i = 0; i < glob_results.gl_pathc; i++) {
		/* Inspect the i-th match; every IOMMU must be considered, not just the first one. */
		const char *filename = glob_results.gl_pathv[i];
		FILE *file = fopen(filename, "r");
		uint64_t cap_reg = 0;
		int bits = 0;

		if (file == NULL) {
			continue;
		}

		if (fscanf(file, "%" PRIx64, &cap_reg) == 1) {
			if (strstr(filename, "intel-iommu") != NULL) {
				/* We have an Intel IOMMU */
				bits = ((cap_reg & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1;
			} else if (strstr(filename, "amd-iommu") != NULL) {
				/* We have an AMD IOMMU */
				bits = ((cap_reg & RD_AMD_CAP_VASIZE_MASK) >> RD_AMD_CAP_VASIZE_SHIFT) + 1;
			}
		}

		fclose(file);

		/* Keep the minimum over all IOMMUs; bits stays 0 when nothing was decoded. */
		if (bits > 0 && (width == 0 || bits < width)) {
			width = bits;
		}
	}

	globfree(&glob_results);
	return width;
}
203 
204 #endif
205 
206 static int
207 build_eal_cmdline(const struct spdk_env_opts *opts)
208 {
209 	int argcount = 0;
210 	char **args;
211 
212 	args = NULL;
213 
214 	/* set the program name */
215 	args = push_arg(args, &argcount, _sprintf_alloc("%s", opts->name));
216 	if (args == NULL) {
217 		return -1;
218 	}
219 
220 	/* disable shared configuration files when in single process mode. This allows for cleaner shutdown */
221 	if (opts->shm_id < 0) {
222 		args = push_arg(args, &argcount, _sprintf_alloc("%s", "--no-shconf"));
223 		if (args == NULL) {
224 			return -1;
225 		}
226 	}
227 
228 	/* Either lcore_map or core_mask must be set. If both, or none specified, fail */
229 	if ((opts->core_mask == NULL) == (opts->lcore_map == NULL)) {
230 		if (opts->core_mask && opts->lcore_map) {
231 			fprintf(stderr,
232 				"Both, lcore map and core mask are provided, while only one can be set\n");
233 		} else {
234 			fprintf(stderr, "Core mask or lcore map must be specified\n");
235 		}
236 		free_args(args, argcount);
237 		return -1;
238 	}
239 
240 	if (opts->lcore_map) {
241 		/* If lcore list is set, generate --lcores parameter */
242 		args = push_arg(args, &argcount, _sprintf_alloc("--lcores=%s", opts->lcore_map));
243 	} else if (opts->core_mask[0] == '-') {
244 		/*
245 		 * Set the coremask:
246 		 *
247 		 * - if it starts with '-', we presume it's literal EAL arguments such
248 		 *   as --lcores.
249 		 *
250 		 * - if it starts with '[', we presume it's a core list to use with the
251 		 *   -l option.
252 		 *
253 		 * - otherwise, it's a CPU mask of the form "0xff.." as expected by the
254 		 *   -c option.
255 		 */
256 		args = push_arg(args, &argcount, _sprintf_alloc("%s", opts->core_mask));
257 	} else if (opts->core_mask[0] == '[') {
258 		char *l_arg = _sprintf_alloc("-l %s", opts->core_mask + 1);
259 
260 		if (l_arg != NULL) {
261 			int len = strlen(l_arg);
262 
263 			if (l_arg[len - 1] == ']') {
264 				l_arg[len - 1] = '\0';
265 			}
266 		}
267 		args = push_arg(args, &argcount, l_arg);
268 	} else {
269 		args = push_arg(args, &argcount, _sprintf_alloc("-c %s", opts->core_mask));
270 	}
271 
272 	if (args == NULL) {
273 		return -1;
274 	}
275 
276 	/* set the memory channel number */
277 	if (opts->mem_channel > 0) {
278 		args = push_arg(args, &argcount, _sprintf_alloc("-n %d", opts->mem_channel));
279 		if (args == NULL) {
280 			return -1;
281 		}
282 	}
283 
284 	/* set the memory size */
285 	if (opts->mem_size >= 0) {
286 		args = push_arg(args, &argcount, _sprintf_alloc("-m %d", opts->mem_size));
287 		if (args == NULL) {
288 			return -1;
289 		}
290 	}
291 
292 	/* set the main core */
293 	if (opts->main_core > 0) {
294 		args = push_arg(args, &argcount, _sprintf_alloc("%s=%d",
295 				DPDK_MAIN_CORE_PARAM, opts->main_core));
296 		if (args == NULL) {
297 			return -1;
298 		}
299 	}
300 
301 	/* set no pci  if enabled */
302 	if (opts->no_pci) {
303 		args = push_arg(args, &argcount, _sprintf_alloc("--no-pci"));
304 		if (args == NULL) {
305 			return -1;
306 		}
307 	}
308 
309 	if (opts->env_context && strstr(opts->env_context, "--no-huge") != NULL) {
310 		if (opts->hugepage_single_segments || opts->unlink_hugepage || opts->hugedir) {
311 			fprintf(stderr, "--no-huge invalid with other hugepage options\n");
312 			free_args(args, argcount);
313 			return -1;
314 		}
315 	} else {
316 		/* create just one hugetlbfs file */
317 		if (opts->hugepage_single_segments) {
318 			args = push_arg(args, &argcount, _sprintf_alloc("--single-file-segments"));
319 			if (args == NULL) {
320 				return -1;
321 			}
322 		}
323 
324 		/* unlink hugepages after initialization */
325 		/* Note: Automatically unlink hugepage when shm_id < 0, since it means we're not using
326 		 * multi-process so we don't need the hugepage links anymore.  But we need to make sure
327 		 * we don't specify --huge-unlink implicitly if --single-file-segments was specified since
328 		 * DPDK doesn't support that.
329 		 */
330 		if (opts->unlink_hugepage ||
331 		    (opts->shm_id < 0 && !opts->hugepage_single_segments)) {
332 			args = push_arg(args, &argcount, _sprintf_alloc("--huge-unlink"));
333 			if (args == NULL) {
334 				return -1;
335 			}
336 		}
337 
338 		/* use a specific hugetlbfs mount */
339 		if (opts->hugedir) {
340 			args = push_arg(args, &argcount, _sprintf_alloc("--huge-dir=%s", opts->hugedir));
341 			if (args == NULL) {
342 				return -1;
343 			}
344 		}
345 	}
346 
347 	if (opts->num_pci_addr) {
348 		size_t i;
349 		char bdf[32];
350 		struct spdk_pci_addr *pci_addr =
351 				opts->pci_blocked ? opts->pci_blocked : opts->pci_allowed;
352 
353 		for (i = 0; i < opts->num_pci_addr; i++) {
354 			spdk_pci_addr_fmt(bdf, 32, &pci_addr[i]);
355 			args = push_arg(args, &argcount, _sprintf_alloc("%s=%s",
356 					(opts->pci_blocked ? DPDK_BLOCK_PARAM : DPDK_ALLOW_PARAM),
357 					bdf));
358 			if (args == NULL) {
359 				return -1;
360 			}
361 		}
362 	}
363 
364 	/* Lower default EAL loglevel to RTE_LOG_NOTICE - normal, but significant messages.
365 	 * This can be overridden by specifying the same option in opts->env_context
366 	 */
367 	args = push_arg(args, &argcount, strdup("--log-level=lib.eal:6"));
368 	if (args == NULL) {
369 		return -1;
370 	}
371 
372 	/* Lower default CRYPTO loglevel to RTE_LOG_ERR to avoid a ton of init msgs.
373 	 * This can be overridden by specifying the same option in opts->env_context
374 	 */
375 	args = push_arg(args, &argcount, strdup("--log-level=lib.cryptodev:5"));
376 	if (args == NULL) {
377 		return -1;
378 	}
379 
380 	/* `user1` log type is used by rte_vhost, which prints an INFO log for each received
381 	 * vhost user message. We don't want that. The same log type is also used by a couple
382 	 * of other DPDK libs, but none of which we make use right now. If necessary, this can
383 	 * be overridden via opts->env_context.
384 	 */
385 	args = push_arg(args, &argcount, strdup("--log-level=user1:6"));
386 	if (args == NULL) {
387 		return -1;
388 	}
389 
390 	if (opts->env_context) {
391 		char *ptr = strdup(opts->env_context);
392 		char *tok = strtok(ptr, " \t");
393 
394 		/* DPDK expects each argument as a separate string in the argv
395 		 * array, so we need to tokenize here in case the caller
396 		 * passed multiple arguments in the env_context string.
397 		 */
398 		while (tok != NULL) {
399 			args = push_arg(args, &argcount, strdup(tok));
400 			tok = strtok(NULL, " \t");
401 		}
402 
403 		free(ptr);
404 	}
405 
406 #ifdef __linux__
407 
408 	if (opts->iova_mode) {
409 		args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=%s", opts->iova_mode));
410 		if (args == NULL) {
411 			return -1;
412 		}
413 	} else {
414 		/* When using vfio with enable_unsafe_noiommu_mode=Y, we need iova-mode=pa,
415 		 * but DPDK guesses it should be iova-mode=va. Add a check and force
416 		 * iova-mode=pa here. */
417 		if (rte_vfio_noiommu_is_enabled()) {
418 			args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
419 			if (args == NULL) {
420 				return -1;
421 			}
422 		}
423 
424 #if defined(__x86_64__)
425 		/* DPDK by default guesses that it should be using iova-mode=va so that it can
426 		 * support running as an unprivileged user. However, some systems (especially
427 		 * virtual machines) don't have an IOMMU capable of handling the full virtual
428 		 * address space and DPDK doesn't currently catch that. Add a check in SPDK
429 		 * and force iova-mode=pa here. */
430 		if (get_iommu_width() < SPDK_IOMMU_VA_REQUIRED_WIDTH) {
431 			args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
432 			if (args == NULL) {
433 				return -1;
434 			}
435 		}
436 #elif defined(__PPC64__)
437 		/* On Linux + PowerPC, DPDK doesn't support VA mode at all. Unfortunately, it doesn't correctly
438 		 * auto-detect at the moment, so we'll just force it here. */
439 		args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
440 		if (args == NULL) {
441 			return -1;
442 		}
443 #endif
444 	}
445 
446 
447 	/* Set the base virtual address - it must be an address that is not in the
448 	 * ASAN shadow region, otherwise ASAN-enabled builds will ignore the
449 	 * mmap hint.
450 	 *
451 	 * Ref: https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm
452 	 */
453 	args = push_arg(args, &argcount, _sprintf_alloc("--base-virtaddr=0x%" PRIx64, opts->base_virtaddr));
454 	if (args == NULL) {
455 		return -1;
456 	}
457 
458 	/* --match-allocation prevents DPDK from merging or splitting system memory allocations under the hood.
459 	 * This is critical for RDMA when attempting to use an rte_mempool based buffer pool. If DPDK merges two
460 	 * physically or IOVA contiguous memory regions, then when we go to allocate a buffer pool, it can split
461 	 * the memory for a buffer over two allocations meaning the buffer will be split over a memory region.
462 	 */
463 	if (!opts->env_context || strstr(opts->env_context, "--legacy-mem") == NULL) {
464 		args = push_arg(args, &argcount, _sprintf_alloc("%s", "--match-allocations"));
465 		if (args == NULL) {
466 			return -1;
467 		}
468 	}
469 
470 	if (opts->shm_id < 0) {
471 		args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk_pid%d",
472 				getpid()));
473 		if (args == NULL) {
474 			return -1;
475 		}
476 	} else {
477 		args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk%d",
478 				opts->shm_id));
479 		if (args == NULL) {
480 			return -1;
481 		}
482 
483 		/* set the process type */
484 		args = push_arg(args, &argcount, _sprintf_alloc("--proc-type=auto"));
485 		if (args == NULL) {
486 			return -1;
487 		}
488 	}
489 
490 	/* --vfio-vf-token used for VF initialized by vfio_pci driver. */
491 	if (opts->vf_token) {
492 		args = push_arg(args, &argcount, _sprintf_alloc("--vfio-vf-token=%s",
493 				opts->vf_token));
494 		if (args == NULL) {
495 			return -1;
496 		}
497 	}
498 #endif
499 
500 	g_eal_cmdline = args;
501 	g_eal_cmdline_argcount = argcount;
502 	return argcount;
503 }
504 
/*
 * Run the SPDK initialization steps that follow EAL initialization: the PCI
 * environment, the memory map (told whether legacy memory mode is in use) and
 * vtophys, in that order.
 *
 * Returns 0 on success. The first step that reports a negative value stops the
 * sequence, and that value is returned after logging an error.
 */
int
spdk_env_dpdk_post_init(bool legacy_mem)
{
	int status;

	status = pci_env_init();
	if (status < 0) {
		SPDK_ERRLOG("pci_env_init() failed\n");
		return status;
	}

	status = mem_map_init(legacy_mem);
	if (status < 0) {
		SPDK_ERRLOG("Failed to allocate mem_map\n");
		return status;
	}

	status = vtophys_init();
	if (status < 0) {
		SPDK_ERRLOG("Failed to initialize vtophys\n");
		return status;
	}

	return 0;
}
530 
531 void
532 spdk_env_dpdk_post_fini(void)
533 {
534 	pci_env_fini();
535 
536 	free_args(g_eal_cmdline, g_eal_cmdline_argcount);
537 	g_eal_cmdline = NULL;
538 	g_eal_cmdline_argcount = 0;
539 }
540 
541 int
542 spdk_env_init(const struct spdk_env_opts *opts)
543 {
544 	char **dpdk_args = NULL;
545 	char *args_print = NULL, *args_tmp = NULL;
546 	int i, rc;
547 	int orig_optind;
548 	bool legacy_mem;
549 
550 	/* If SPDK env has been initialized before, then only pci env requires
551 	 * reinitialization.
552 	 */
553 	if (g_external_init == false) {
554 		if (opts != NULL) {
555 			fprintf(stderr, "Invalid arguments to reinitialize SPDK env\n");
556 			return -EINVAL;
557 		}
558 
559 		printf("Starting %s / %s reinitialization...\n", SPDK_VERSION_STRING, rte_version());
560 		pci_env_reinit();
561 
562 		return 0;
563 	}
564 
565 	if (opts == NULL) {
566 		fprintf(stderr, "NULL arguments to initialize DPDK\n");
567 		return -EINVAL;
568 	}
569 
570 	rc = build_eal_cmdline(opts);
571 	if (rc < 0) {
572 		SPDK_ERRLOG("Invalid arguments to initialize DPDK\n");
573 		return -EINVAL;
574 	}
575 
576 	SPDK_PRINTF("Starting %s / %s initialization...\n", SPDK_VERSION_STRING, rte_version());
577 
578 	args_print = _sprintf_alloc("[ DPDK EAL parameters: ");
579 	if (args_print == NULL) {
580 		return -ENOMEM;
581 	}
582 	for (i = 0; i < g_eal_cmdline_argcount; i++) {
583 		args_tmp = args_print;
584 		args_print = _sprintf_alloc("%s%s ", args_tmp, g_eal_cmdline[i]);
585 		if (args_print == NULL) {
586 			free(args_tmp);
587 			return -ENOMEM;
588 		}
589 		free(args_tmp);
590 	}
591 	SPDK_PRINTF("%s]\n", args_print);
592 	free(args_print);
593 
594 	/* DPDK rearranges the array we pass to it, so make a copy
595 	 * before passing so we can still free the individual strings
596 	 * correctly.
597 	 */
598 	dpdk_args = calloc(g_eal_cmdline_argcount, sizeof(char *));
599 	if (dpdk_args == NULL) {
600 		SPDK_ERRLOG("Failed to allocate dpdk_args\n");
601 		return -ENOMEM;
602 	}
603 	memcpy(dpdk_args, g_eal_cmdline, sizeof(char *) * g_eal_cmdline_argcount);
604 
605 	fflush(stdout);
606 	orig_optind = optind;
607 	optind = 1;
608 	rc = rte_eal_init(g_eal_cmdline_argcount, dpdk_args);
609 	optind = orig_optind;
610 
611 	free(dpdk_args);
612 
613 	if (rc < 0) {
614 		if (rte_errno == EALREADY) {
615 			SPDK_ERRLOG("DPDK already initialized\n");
616 		} else {
617 			SPDK_ERRLOG("Failed to initialize DPDK\n");
618 		}
619 		return -rte_errno;
620 	}
621 
622 	legacy_mem = false;
623 	if (opts->env_context && strstr(opts->env_context, "--legacy-mem") != NULL) {
624 		legacy_mem = true;
625 	}
626 
627 	rc = spdk_env_dpdk_post_init(legacy_mem);
628 	if (rc == 0) {
629 		g_external_init = false;
630 	}
631 
632 	return rc;
633 }
634 
635 /* We use priority 101 which is the highest priority level available
636  * to applications (the toolchains reserve 1 to 100 for internal usage).
637  * This ensures this destructor runs last, after any other destructors
638  * that might still need the environment up and running.
639  */
640 __attribute__((destructor(101))) static void
641 dpdk_cleanup(void)
642 {
643 	/* Only call rte_eal_cleanup if the SPDK env library called rte_eal_init. */
644 	if (!g_external_init) {
645 		rte_eal_cleanup();
646 	}
647 }
648 
/*
 * Tear down the SPDK environment. All of the work is delegated to
 * spdk_env_dpdk_post_fini().
 */
void
spdk_env_fini(void)
{
	spdk_env_dpdk_post_fini();
	return;
}
654 
655 bool
656 spdk_env_dpdk_external_init(void)
657 {
658 	return g_external_init;
659 }
660