xref: /spdk/lib/env_dpdk/init.c (revision 12fbe739a31b09aff0d05f354d4f3bbef99afc55)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2017 Intel Corporation.
3  *   All rights reserved.
4  */
5 
6 #include "spdk/stdinc.h"
7 
8 #include "env_internal.h"
9 
10 #include "spdk/version.h"
11 #include "spdk/env_dpdk.h"
12 #include "spdk/log.h"
13 
14 #include <rte_config.h>
15 #include <rte_eal.h>
16 #include <rte_errno.h>
17 #include <rte_vfio.h>
18 
/* Default values that spdk_env_opts_init() stores into a freshly initialized struct spdk_env_opts. */
#define SPDK_ENV_DPDK_DEFAULT_NAME "spdk"
#define SPDK_ENV_DPDK_DEFAULT_SHM_ID (-1)
#define SPDK_ENV_DPDK_DEFAULT_MEM_SIZE (-1)
#define SPDK_ENV_DPDK_DEFAULT_MAIN_CORE (-1)
#define SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL (-1)
#define SPDK_ENV_DPDK_DEFAULT_CORE_MASK "0x1"
#define SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR 0x200000000000

/* EAL option names spelled out once so the command-line builder does not repeat the literals. */
#define DPDK_ALLOW_PARAM "--allow"
#define DPDK_BLOCK_PARAM "--block"
#define DPDK_MAIN_CORE_PARAM "--main-lcore"

/* EAL argument vector published by build_eal_cmdline() and released by spdk_env_dpdk_post_fini(). */
static char **g_eal_cmdline;
/* Number of entries in g_eal_cmdline. */
static int g_eal_cmdline_argcount;
/* Remains true until spdk_env_init() has run to successful completion. */
static bool g_external_init = true;
34 
/*
 * printf-style formatting into a newly malloc()'d, NUL-terminated string.
 *
 * Returns the string (the caller must free() it), or NULL if malloc() fails or
 * the formatted text does not fit in the largest buffer tried (1 MiB).
 */
static char *
_sprintf_alloc(const char *format, ...)
{
	/* Cap the buffer size so the retry loop is guaranteed to terminate. */
	const size_t max_len = 1024 * 1024;
	va_list ap;
	char *out = NULL;
	size_t len;

	va_start(ap, format);

	/* Begin with a small buffer and double it until the output fits. */
	for (len = 32; len <= max_len; len *= 2) {
		va_list ap_try;
		int written;

		out = malloc(len);
		if (out == NULL) {
			break;
		}

		/* vsnprintf() consumes its va_list, so every attempt works on a copy. */
		va_copy(ap_try, ap);
		written = vsnprintf(out, len, format, ap_try);
		va_end(ap_try);

		/*
		 * The returned count excludes the terminator, so only a count strictly
		 * below the buffer size means the whole string was stored.
		 */
		if (written >= 0 && (size_t)written < len) {
			break;
		}

		/*
		 * The return value is not trusted as the required size (some libc
		 * versions get it wrong); the partial output is useless, so drop the
		 * buffer and allocate a bigger one instead of realloc()-copying it.
		 */
		free(out);
		out = NULL;
	}

	va_end(ap);
	return out;
}
84 
85 void
86 spdk_env_opts_init(struct spdk_env_opts *opts)
87 {
88 	if (!opts) {
89 		return;
90 	}
91 
92 	memset(opts, 0, sizeof(*opts));
93 
94 	opts->name = SPDK_ENV_DPDK_DEFAULT_NAME;
95 	opts->core_mask = SPDK_ENV_DPDK_DEFAULT_CORE_MASK;
96 	opts->shm_id = SPDK_ENV_DPDK_DEFAULT_SHM_ID;
97 	opts->mem_size = SPDK_ENV_DPDK_DEFAULT_MEM_SIZE;
98 	opts->main_core = SPDK_ENV_DPDK_DEFAULT_MAIN_CORE;
99 	opts->mem_channel = SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL;
100 	opts->base_virtaddr = SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR;
101 }
102 
/*
 * Release an argument vector: the first argcount strings, then the array itself.
 *
 * A NULL vector is a no-op. The array is only freed when argcount is non-zero.
 */
static void
free_args(char **args, int argcount)
{
	int remaining = argcount;

	if (args == NULL) {
		return;
	}

	/* Walk the strings from the back; the order of release does not matter. */
	while (remaining-- > 0) {
		free(args[remaining]);
	}

	if (argcount != 0) {
		free(args);
	}
}
120 
/*
 * Append arg to the argument vector, growing the vector by one slot.
 *
 * On success the (possibly moved) vector is returned, *argcount is incremented
 * and the vector takes ownership of arg.
 *
 * On failure NULL is returned and nothing is left for the caller to clean up:
 * the existing vector and its strings are freed, and so is arg when it was
 * non-NULL. A NULL arg is treated as a failure, which lets callers pass the
 * result of an allocating call directly.
 */
static char **
push_arg(char *args[], int *argcount, char *arg)
{
	const int used = *argcount;
	char **grown;

	if (arg == NULL) {
		SPDK_ERRLOG("%s: NULL arg supplied\n", __func__);
		free_args(args, used);
		return NULL;
	}

	grown = realloc(args, sizeof(char *) * (used + 1));
	if (grown == NULL) {
		/* realloc() left the old vector untouched, so it still has to be released. */
		free(arg);
		free_args(args, used);
		return NULL;
	}

	grown[used] = arg;
	*argcount = used + 1;

	return grown;
}
144 
145 #if defined(__linux__) && defined(__x86_64__)
146 
/* TODO: Can likely get this value from rlimits in the future */
#define SPDK_IOMMU_VA_REQUIRED_WIDTH 48
#define VTD_CAP_MGAW_SHIFT 16
#define VTD_CAP_MGAW_MASK (0x3F << VTD_CAP_MGAW_SHIFT)
#define RD_AMD_CAP_VASIZE_SHIFT 15
#define RD_AMD_CAP_VASIZE_MASK (0x7F << RD_AMD_CAP_VASIZE_SHIFT)

/*
 * Determine the smallest address width advertised by the IOMMU capability
 * files found in sysfs.
 *
 * Every matching Intel ("intel-iommu") and AMD ("amd-iommu") cap file is read
 * as a hexadecimal register value; the width field is extracted with the
 * masks above and incremented by one. Files that cannot be opened or parsed
 * are skipped.
 *
 * Returns the minimum width over all readable files, or 0 when none was found.
 */
static int
get_iommu_width(void)
{
	int width = 0;
	glob_t glob_results = {0};

	/*
	 * Break * and / into separate strings to appease check_format.sh comment style check.
	 * The glob() results are deliberately not checked: no match simply leaves
	 * gl_pathc at 0 and the loop below does nothing.
	 */
	glob("/sys/devices/virtual/iommu/dmar*" "/intel-iommu/cap", 0, NULL, &glob_results);
	glob("/sys/class/iommu/ivhd*" "/amd-iommu/cap", GLOB_APPEND, NULL, &glob_results);

	for (size_t i = 0; i < glob_results.gl_pathc; i++) {
		/* Index with i so that every IOMMU is examined, not just the first match. */
		const char *filename = glob_results.gl_pathv[i];
		FILE *file = fopen(filename, "r");
		uint64_t cap_reg = 0;
		int mgaw = 0;

		if (file == NULL) {
			continue;
		}

		if (fscanf(file, "%" SCNx64, &cap_reg) == 1) {
			if (strstr(filename, "intel-iommu") != NULL) {
				/* We have an Intel IOMMU */
				mgaw = ((cap_reg & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1;
			} else if (strstr(filename, "amd-iommu") != NULL) {
				/* We have an AMD IOMMU */
				mgaw = ((cap_reg & RD_AMD_CAP_VASIZE_MASK) >> RD_AMD_CAP_VASIZE_SHIFT) + 1;
			}

			/* Keep the narrowest width seen so far; mgaw stays 0 for unrecognized paths. */
			if (mgaw > 0 && (width == 0 || mgaw < width)) {
				width = mgaw;
			}
		}

		fclose(file);
	}

	globfree(&glob_results);
	return width;
}
197 
198 #endif
199 
200 static int
201 build_eal_cmdline(const struct spdk_env_opts *opts)
202 {
203 	int argcount = 0;
204 	char **args;
205 
206 	args = NULL;
207 
208 	/* set the program name */
209 	args = push_arg(args, &argcount, _sprintf_alloc("%s", opts->name));
210 	if (args == NULL) {
211 		return -1;
212 	}
213 
214 	/* disable shared configuration files when in single process mode. This allows for cleaner shutdown */
215 	if (opts->shm_id < 0) {
216 		args = push_arg(args, &argcount, _sprintf_alloc("%s", "--no-shconf"));
217 		if (args == NULL) {
218 			return -1;
219 		}
220 	}
221 
222 	/* Either lcore_map or core_mask must be set. If both, or none specified, fail */
223 	if ((opts->core_mask == NULL) == (opts->lcore_map == NULL)) {
224 		if (opts->core_mask && opts->lcore_map) {
225 			fprintf(stderr,
226 				"Both, lcore map and core mask are provided, while only one can be set\n");
227 		} else {
228 			fprintf(stderr, "Core mask or lcore map must be specified\n");
229 		}
230 		free_args(args, argcount);
231 		return -1;
232 	}
233 
234 	if (opts->lcore_map) {
235 		/* If lcore list is set, generate --lcores parameter */
236 		args = push_arg(args, &argcount, _sprintf_alloc("--lcores=%s", opts->lcore_map));
237 	} else if (opts->core_mask[0] == '-') {
238 		/*
239 		 * Set the coremask:
240 		 *
241 		 * - if it starts with '-', we presume it's literal EAL arguments such
242 		 *   as --lcores.
243 		 *
244 		 * - if it starts with '[', we presume it's a core list to use with the
245 		 *   -l option.
246 		 *
247 		 * - otherwise, it's a CPU mask of the form "0xff.." as expected by the
248 		 *   -c option.
249 		 */
250 		args = push_arg(args, &argcount, _sprintf_alloc("%s", opts->core_mask));
251 	} else if (opts->core_mask[0] == '[') {
252 		char *l_arg = _sprintf_alloc("-l %s", opts->core_mask + 1);
253 
254 		if (l_arg != NULL) {
255 			int len = strlen(l_arg);
256 
257 			if (l_arg[len - 1] == ']') {
258 				l_arg[len - 1] = '\0';
259 			}
260 		}
261 		args = push_arg(args, &argcount, l_arg);
262 	} else {
263 		args = push_arg(args, &argcount, _sprintf_alloc("-c %s", opts->core_mask));
264 	}
265 
266 	if (args == NULL) {
267 		return -1;
268 	}
269 
270 	/* set the memory channel number */
271 	if (opts->mem_channel > 0) {
272 		args = push_arg(args, &argcount, _sprintf_alloc("-n %d", opts->mem_channel));
273 		if (args == NULL) {
274 			return -1;
275 		}
276 	}
277 
278 	/* set the memory size */
279 	if (opts->mem_size >= 0) {
280 		args = push_arg(args, &argcount, _sprintf_alloc("-m %d", opts->mem_size));
281 		if (args == NULL) {
282 			return -1;
283 		}
284 	}
285 
286 	/* set the main core */
287 	if (opts->main_core > 0) {
288 		args = push_arg(args, &argcount, _sprintf_alloc("%s=%d",
289 				DPDK_MAIN_CORE_PARAM, opts->main_core));
290 		if (args == NULL) {
291 			return -1;
292 		}
293 	}
294 
295 	/* set no pci  if enabled */
296 	if (opts->no_pci) {
297 		args = push_arg(args, &argcount, _sprintf_alloc("--no-pci"));
298 		if (args == NULL) {
299 			return -1;
300 		}
301 	}
302 
303 	if (opts->env_context && strstr(opts->env_context, "--no-huge") != NULL) {
304 		if (opts->hugepage_single_segments || opts->unlink_hugepage || opts->hugedir) {
305 			fprintf(stderr, "--no-huge invalid with other hugepage options\n");
306 			free_args(args, argcount);
307 			return -1;
308 		}
309 	} else {
310 		/* create just one hugetlbfs file */
311 		if (opts->hugepage_single_segments) {
312 			args = push_arg(args, &argcount, _sprintf_alloc("--single-file-segments"));
313 			if (args == NULL) {
314 				return -1;
315 			}
316 		}
317 
318 		/* unlink hugepages after initialization */
319 		/* Note: Automatically unlink hugepage when shm_id < 0, since it means we're not using
320 		 * multi-process so we don't need the hugepage links anymore.  But we need to make sure
321 		 * we don't specify --huge-unlink implicitly if --single-file-segments was specified since
322 		 * DPDK doesn't support that.
323 		 */
324 		if (opts->unlink_hugepage ||
325 		    (opts->shm_id < 0 && !opts->hugepage_single_segments)) {
326 			args = push_arg(args, &argcount, _sprintf_alloc("--huge-unlink"));
327 			if (args == NULL) {
328 				return -1;
329 			}
330 		}
331 
332 		/* use a specific hugetlbfs mount */
333 		if (opts->hugedir) {
334 			args = push_arg(args, &argcount, _sprintf_alloc("--huge-dir=%s", opts->hugedir));
335 			if (args == NULL) {
336 				return -1;
337 			}
338 		}
339 	}
340 
341 	if (opts->num_pci_addr) {
342 		size_t i;
343 		char bdf[32];
344 		struct spdk_pci_addr *pci_addr =
345 				opts->pci_blocked ? opts->pci_blocked : opts->pci_allowed;
346 
347 		for (i = 0; i < opts->num_pci_addr; i++) {
348 			spdk_pci_addr_fmt(bdf, 32, &pci_addr[i]);
349 			args = push_arg(args, &argcount, _sprintf_alloc("%s=%s",
350 					(opts->pci_blocked ? DPDK_BLOCK_PARAM : DPDK_ALLOW_PARAM),
351 					bdf));
352 			if (args == NULL) {
353 				return -1;
354 			}
355 		}
356 	}
357 
358 	/* Disable DPDK telemetry information by default, can be modified with env_context.
359 	 * Prevents creation of dpdk_telemetry socket and additional pthread for it.
360 	 */
361 	args = push_arg(args, &argcount, _sprintf_alloc("--no-telemetry"));
362 	if (args == NULL) {
363 		return -1;
364 	}
365 
366 	/* Lower default EAL loglevel to RTE_LOG_NOTICE - normal, but significant messages.
367 	 * This can be overridden by specifying the same option in opts->env_context
368 	 */
369 	args = push_arg(args, &argcount, strdup("--log-level=lib.eal:6"));
370 	if (args == NULL) {
371 		return -1;
372 	}
373 
374 	/* Lower default CRYPTO loglevel to RTE_LOG_ERR to avoid a ton of init msgs.
375 	 * This can be overridden by specifying the same option in opts->env_context
376 	 */
377 	args = push_arg(args, &argcount, strdup("--log-level=lib.cryptodev:5"));
378 	if (args == NULL) {
379 		return -1;
380 	}
381 
382 	/* `user1` log type is used by rte_vhost, which prints an INFO log for each received
383 	 * vhost user message. We don't want that. The same log type is also used by a couple
384 	 * of other DPDK libs, but none of which we make use right now. If necessary, this can
385 	 * be overridden via opts->env_context.
386 	 */
387 	args = push_arg(args, &argcount, strdup("--log-level=user1:6"));
388 	if (args == NULL) {
389 		return -1;
390 	}
391 
392 	if (opts->env_context) {
393 		char *ptr = strdup(opts->env_context);
394 		char *tok = strtok(ptr, " \t");
395 
396 		/* DPDK expects each argument as a separate string in the argv
397 		 * array, so we need to tokenize here in case the caller
398 		 * passed multiple arguments in the env_context string.
399 		 */
400 		while (tok != NULL) {
401 			args = push_arg(args, &argcount, strdup(tok));
402 			tok = strtok(NULL, " \t");
403 		}
404 
405 		free(ptr);
406 	}
407 
408 #ifdef __linux__
409 
410 	if (opts->iova_mode) {
411 		args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=%s", opts->iova_mode));
412 		if (args == NULL) {
413 			return -1;
414 		}
415 	} else {
416 		/* When using vfio with enable_unsafe_noiommu_mode=Y, we need iova-mode=pa,
417 		 * but DPDK guesses it should be iova-mode=va. Add a check and force
418 		 * iova-mode=pa here. */
419 		if (rte_vfio_noiommu_is_enabled()) {
420 			args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
421 			if (args == NULL) {
422 				return -1;
423 			}
424 		}
425 
426 #if defined(__x86_64__)
427 		/* DPDK by default guesses that it should be using iova-mode=va so that it can
428 		 * support running as an unprivileged user. However, some systems (especially
429 		 * virtual machines) don't have an IOMMU capable of handling the full virtual
430 		 * address space and DPDK doesn't currently catch that. Add a check in SPDK
431 		 * and force iova-mode=pa here. */
432 		if (get_iommu_width() < SPDK_IOMMU_VA_REQUIRED_WIDTH) {
433 			args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
434 			if (args == NULL) {
435 				return -1;
436 			}
437 		}
438 #elif defined(__PPC64__)
439 		/* On Linux + PowerPC, DPDK doesn't support VA mode at all. Unfortunately, it doesn't correctly
440 		 * auto-detect at the moment, so we'll just force it here. */
441 		args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
442 		if (args == NULL) {
443 			return -1;
444 		}
445 #endif
446 	}
447 
448 
449 	/* Set the base virtual address - it must be an address that is not in the
450 	 * ASAN shadow region, otherwise ASAN-enabled builds will ignore the
451 	 * mmap hint.
452 	 *
453 	 * Ref: https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm
454 	 */
455 	args = push_arg(args, &argcount, _sprintf_alloc("--base-virtaddr=0x%" PRIx64, opts->base_virtaddr));
456 	if (args == NULL) {
457 		return -1;
458 	}
459 
460 	/* --match-allocation prevents DPDK from merging or splitting system memory allocations under the hood.
461 	 * This is critical for RDMA when attempting to use an rte_mempool based buffer pool. If DPDK merges two
462 	 * physically or IOVA contiguous memory regions, then when we go to allocate a buffer pool, it can split
463 	 * the memory for a buffer over two allocations meaning the buffer will be split over a memory region.
464 	 */
465 	if (!opts->env_context || strstr(opts->env_context, "--legacy-mem") == NULL) {
466 		args = push_arg(args, &argcount, _sprintf_alloc("%s", "--match-allocations"));
467 		if (args == NULL) {
468 			return -1;
469 		}
470 	}
471 
472 	if (opts->shm_id < 0) {
473 		args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk_pid%d",
474 				getpid()));
475 		if (args == NULL) {
476 			return -1;
477 		}
478 	} else {
479 		args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk%d",
480 				opts->shm_id));
481 		if (args == NULL) {
482 			return -1;
483 		}
484 
485 		/* set the process type */
486 		args = push_arg(args, &argcount, _sprintf_alloc("--proc-type=auto"));
487 		if (args == NULL) {
488 			return -1;
489 		}
490 	}
491 
492 	/* --vfio-vf-token used for VF initialized by vfio_pci driver. */
493 	if (opts->vf_token) {
494 		args = push_arg(args, &argcount, _sprintf_alloc("--vfio-vf-token=%s",
495 				opts->vf_token));
496 		if (args == NULL) {
497 			return -1;
498 		}
499 	}
500 #endif
501 
502 	g_eal_cmdline = args;
503 	g_eal_cmdline_argcount = argcount;
504 	return argcount;
505 }
506 
/*
 * Bring up the SPDK-side environment pieces in order: PCI, the memory map
 * (told whether legacy memory mode is in use), then vtophys.
 *
 * Stops at the first step that reports a negative code, logs which step it
 * was and returns that code. Returns 0 when all three steps succeed.
 */
int
spdk_env_dpdk_post_init(bool legacy_mem)
{
	int rc;

	if ((rc = pci_env_init()) < 0) {
		SPDK_ERRLOG("pci_env_init() failed\n");
	} else if ((rc = mem_map_init(legacy_mem)) < 0) {
		SPDK_ERRLOG("Failed to allocate mem_map\n");
	} else if ((rc = vtophys_init()) < 0) {
		SPDK_ERRLOG("Failed to initialize vtophys\n");
	} else {
		/* Non-negative results from the individual steps all collapse to 0. */
		rc = 0;
	}

	return rc;
}
532 
533 void
534 spdk_env_dpdk_post_fini(void)
535 {
536 	pci_env_fini();
537 
538 	free_args(g_eal_cmdline, g_eal_cmdline_argcount);
539 	g_eal_cmdline = NULL;
540 	g_eal_cmdline_argcount = 0;
541 }
542 
543 int
544 spdk_env_init(const struct spdk_env_opts *opts)
545 {
546 	char **dpdk_args = NULL;
547 	char *args_print = NULL, *args_tmp = NULL;
548 	int i, rc;
549 	int orig_optind;
550 	bool legacy_mem;
551 
552 	/* If SPDK env has been initialized before, then only pci env requires
553 	 * reinitialization.
554 	 */
555 	if (g_external_init == false) {
556 		if (opts != NULL) {
557 			fprintf(stderr, "Invalid arguments to reinitialize SPDK env\n");
558 			return -EINVAL;
559 		}
560 
561 		printf("Starting %s / %s reinitialization...\n", SPDK_VERSION_STRING, rte_version());
562 		pci_env_reinit();
563 
564 		return 0;
565 	}
566 
567 	if (opts == NULL) {
568 		fprintf(stderr, "NULL arguments to initialize DPDK\n");
569 		return -EINVAL;
570 	}
571 
572 	rc = build_eal_cmdline(opts);
573 	if (rc < 0) {
574 		SPDK_ERRLOG("Invalid arguments to initialize DPDK\n");
575 		return -EINVAL;
576 	}
577 
578 	SPDK_PRINTF("Starting %s / %s initialization...\n", SPDK_VERSION_STRING, rte_version());
579 
580 	args_print = _sprintf_alloc("[ DPDK EAL parameters: ");
581 	if (args_print == NULL) {
582 		return -ENOMEM;
583 	}
584 	for (i = 0; i < g_eal_cmdline_argcount; i++) {
585 		args_tmp = args_print;
586 		args_print = _sprintf_alloc("%s%s ", args_tmp, g_eal_cmdline[i]);
587 		if (args_print == NULL) {
588 			free(args_tmp);
589 			return -ENOMEM;
590 		}
591 		free(args_tmp);
592 	}
593 	SPDK_PRINTF("%s]\n", args_print);
594 	free(args_print);
595 
596 	/* DPDK rearranges the array we pass to it, so make a copy
597 	 * before passing so we can still free the individual strings
598 	 * correctly.
599 	 */
600 	dpdk_args = calloc(g_eal_cmdline_argcount, sizeof(char *));
601 	if (dpdk_args == NULL) {
602 		SPDK_ERRLOG("Failed to allocate dpdk_args\n");
603 		return -ENOMEM;
604 	}
605 	memcpy(dpdk_args, g_eal_cmdline, sizeof(char *) * g_eal_cmdline_argcount);
606 
607 	fflush(stdout);
608 	orig_optind = optind;
609 	optind = 1;
610 	rc = rte_eal_init(g_eal_cmdline_argcount, dpdk_args);
611 	optind = orig_optind;
612 
613 	free(dpdk_args);
614 
615 	if (rc < 0) {
616 		if (rte_errno == EALREADY) {
617 			SPDK_ERRLOG("DPDK already initialized\n");
618 		} else {
619 			SPDK_ERRLOG("Failed to initialize DPDK\n");
620 		}
621 		return -rte_errno;
622 	}
623 
624 	legacy_mem = false;
625 	if (opts->env_context && strstr(opts->env_context, "--legacy-mem") != NULL) {
626 		legacy_mem = true;
627 	}
628 
629 	rc = spdk_env_dpdk_post_init(legacy_mem);
630 	if (rc == 0) {
631 		g_external_init = false;
632 	}
633 
634 	return rc;
635 }
636 
637 /* We use priority 101 which is the highest priority level available
638  * to applications (the toolchains reserve 1 to 100 for internal usage).
639  * This ensures this destructor runs last, after any other destructors
640  * that might still need the environment up and running.
641  */
642 __attribute__((destructor(101))) static void
643 dpdk_cleanup(void)
644 {
645 	/* Only call rte_eal_cleanup if the SPDK env library called rte_eal_init. */
646 	if (!g_external_init) {
647 		rte_eal_cleanup();
648 	}
649 }
650 
/*
 * Public teardown entry point; all of the work is delegated to
 * spdk_env_dpdk_post_fini().
 */
void
spdk_env_fini(void)
{
	spdk_env_dpdk_post_fini();
}
656 
657 bool
658 spdk_env_dpdk_external_init(void)
659 {
660 	return g_external_init;
661 }
662