xref: /spdk/lib/env_dpdk/init.c (revision 927f1fd57bd004df581518466ec4c1b8083e5d23)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "env_internal.h"
37 
38 #include "spdk/version.h"
39 #include "spdk/env_dpdk.h"
40 #include "spdk/log.h"
41 
42 #include <rte_config.h>
43 #include <rte_eal.h>
44 #include <rte_errno.h>
45 #include <rte_vfio.h>
46 
/* Defaults applied by spdk_env_opts_init(). */
#define SPDK_ENV_DPDK_DEFAULT_NAME		"spdk"
#define SPDK_ENV_DPDK_DEFAULT_CORE_MASK		"0x1"
#define SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR	0x200000000000
/* Negative values are used as the defaults for the numeric options below. */
#define SPDK_ENV_DPDK_DEFAULT_SHM_ID		(-1)
#define SPDK_ENV_DPDK_DEFAULT_MEM_SIZE		(-1)
#define SPDK_ENV_DPDK_DEFAULT_MAIN_CORE		(-1)
#define SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL	(-1)
54 
55 #if RTE_VERSION < RTE_VERSION_NUM(20, 11, 0, 0)
56 #define DPDK_ALLOW_PARAM	"--pci-whitelist"
57 #define DPDK_BLOCK_PARAM	"--pci-blacklist"
58 #define DPDK_MAIN_CORE_PARAM	"--master-lcore"
59 #else
60 #define DPDK_ALLOW_PARAM	"--allow"
61 #define DPDK_BLOCK_PARAM	"--block"
62 #define DPDK_MAIN_CORE_PARAM	"--main-lcore"
63 #endif
64 
/* EAL argument vector built by build_eal_cmdline(); released in spdk_env_dpdk_post_fini(). */
static char **g_eal_cmdline = NULL;
/* Number of entries in g_eal_cmdline. */
static int g_eal_cmdline_argcount = 0;
/* Stays true until spdk_env_init() has completed a full initialization itself. */
static bool g_external_init = true;
68 
/*
 * Format a string into a freshly malloc()'d buffer.
 *
 * The buffer starts at 32 bytes and is doubled until the formatted text
 * (including its terminator) fits, or until the size would exceed 1 MiB.
 *
 * Returns the formatted string, which the caller must free(), or NULL when an
 * allocation fails or the output does not fit within the size limit.
 */
static char *
_sprintf_alloc(const char *format, ...)
{
	const size_t max_size = 1024 * 1024;
	size_t size;
	char *out = NULL;
	va_list ap;

	va_start(ap, format);

	/* Bounding the size keeps a misbehaving vsnprintf() from looping forever. */
	for (size = 32; size <= max_size; size *= 2) {
		va_list ap_try;
		char *candidate;
		int written;

		candidate = malloc(size);
		if (candidate == NULL) {
			break;
		}

		/* Each attempt consumes its own copy so the original list stays reusable. */
		va_copy(ap_try, ap);
		written = vsnprintf(candidate, size, format, ap_try);
		va_end(ap_try);

		/*
		 * The returned count excludes the terminator, so a count equal to
		 * the buffer size means the output was truncated.
		 */
		if (written >= 0 && (size_t)written < size) {
			out = candidate;
			break;
		}

		/*
		 * Some libc versions do not report the required length correctly,
		 * so simply retry with twice the space.  The old contents are not
		 * needed, hence free() + malloc() rather than realloc().
		 */
		free(candidate);
	}

	va_end(ap);
	return out;
}
118 
119 void
120 spdk_env_opts_init(struct spdk_env_opts *opts)
121 {
122 	if (!opts) {
123 		return;
124 	}
125 
126 	memset(opts, 0, sizeof(*opts));
127 
128 	opts->name = SPDK_ENV_DPDK_DEFAULT_NAME;
129 	opts->core_mask = SPDK_ENV_DPDK_DEFAULT_CORE_MASK;
130 	opts->shm_id = SPDK_ENV_DPDK_DEFAULT_SHM_ID;
131 	opts->mem_size = SPDK_ENV_DPDK_DEFAULT_MEM_SIZE;
132 	opts->main_core = SPDK_ENV_DPDK_DEFAULT_MAIN_CORE;
133 	opts->mem_channel = SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL;
134 	opts->base_virtaddr = SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR;
135 }
136 
/*
 * Release an argument vector and every string it holds.
 *
 * args:     heap-allocated array of heap-allocated strings, or NULL.
 * argcount: number of valid entries in args.
 *
 * The first argcount strings are freed, then the array itself.  Passing NULL
 * for args is a no-op.
 */
static void
free_args(char **args, int argcount)
{
	int i;

	if (args == NULL) {
		return;
	}

	for (i = 0; i < argcount; i++) {
		free(args[i]);
	}

	/* The array is its own allocation and must go even when it holds no strings. */
	free(args);
}
154 
/*
 * Append one string to a growable argument vector.
 *
 * args:     current vector; may be NULL when nothing has been pushed yet.
 * argcount: in/out entry count, incremented on success.
 * arg:      heap-allocated string to append; it is stored, not copied.
 *
 * Returns the (possibly relocated) vector on success.  On failure the existing
 * vector is released through free_args() and NULL is returned; when the failure
 * is the array growth, arg is freed as well.  A NULL arg is reported and
 * treated as a failure, which lets callers pass an allocator's result directly.
 */
static char **
push_arg(char *args[], int *argcount, char *arg)
{
	char **grown;
	int count = *argcount;

	if (arg == NULL) {
		SPDK_ERRLOG("%s: NULL arg supplied\n", __func__);
		free_args(args, count);
		return NULL;
	}

	grown = realloc(args, sizeof(char *) * (count + 1));
	if (grown == NULL) {
		/* realloc() left the old vector intact, so it still has to be released. */
		free(arg);
		free_args(args, count);
		return NULL;
	}

	grown[count] = arg;
	*argcount = count + 1;

	return grown;
}
178 
179 #if defined(__linux__) && defined(__x86_64__)
180 
/* TODO: Can likely get this value from rlimits in the future */
#define SPDK_IOMMU_VA_REQUIRED_WIDTH 48
#define VTD_CAP_MGAW_SHIFT 16
#define VTD_CAP_MGAW_MASK (0x3F << VTD_CAP_MGAW_SHIFT)

/*
 * Find the narrowest address width among the Intel IOMMUs listed in sysfs.
 *
 * Every "dmar*" entry under /sys/devices/virtual/iommu/ is probed for an
 * intel-iommu/cap file, whose first line is parsed as a hexadecimal value.
 * The 6-bit field selected by VTD_CAP_MGAW_MASK, plus one, is taken as that
 * unit's width.  Entries whose file is missing, unreadable, or does not start
 * with a hexadecimal number are skipped.
 *
 * Returns the smallest width found, 0 when no entry yielded a usable value,
 * or -EINVAL when the sysfs directory cannot be opened.
 */
static int
get_iommu_width(void)
{
	DIR *dir;
	struct dirent *entry;
	int width = 0;

	dir = opendir("/sys/devices/virtual/iommu/");
	if (dir == NULL) {
		return -EINVAL;
	}

	while ((entry = readdir(dir)) != NULL) {
		char mgaw_path[64];
		char buf[64];
		char *end;
		char *line;
		FILE *file;
		unsigned long long cap;
		int len, mgaw;

		/* Find directories named "dmar0", "dmar1", etc */
		if (strncmp(entry->d_name, "dmar", sizeof("dmar") - 1) != 0) {
			continue;
		}

		len = snprintf(mgaw_path, sizeof(mgaw_path), "/sys/devices/virtual/iommu/%s/intel-iommu/cap",
			       entry->d_name);
		if (len < 0 || (size_t)len >= sizeof(mgaw_path)) {
			/* Path did not fit; this cannot be one of the short dmarN names. */
			continue;
		}

		file = fopen(mgaw_path, "r");
		if (file == NULL) {
			continue;
		}

		/* Only the first line is needed, so the file can be closed right away. */
		line = fgets(buf, sizeof(buf), file);
		fclose(file);
		if (line == NULL) {
			continue;
		}

		/*
		 * Parse as unsigned so that a value with the top bit set is not
		 * rejected, and require at least one digit so that an empty or
		 * malformed file is not mistaken for a capability value of 0.
		 */
		errno = 0;
		cap = strtoull(buf, &end, 16);
		if (end == buf || errno == ERANGE) {
			continue;
		}

		mgaw = (int)((cap & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1;
		if (width == 0 || mgaw < width) {
			width = mgaw;
		}
	}

	closedir(dir);

	return width;
}
245 
246 #endif
247 
248 static int
249 build_eal_cmdline(const struct spdk_env_opts *opts)
250 {
251 	int argcount = 0;
252 	char **args;
253 
254 	args = NULL;
255 
256 	/* set the program name */
257 	args = push_arg(args, &argcount, _sprintf_alloc("%s", opts->name));
258 	if (args == NULL) {
259 		return -1;
260 	}
261 
262 	/* disable shared configuration files when in single process mode. This allows for cleaner shutdown */
263 	if (opts->shm_id < 0) {
264 		args = push_arg(args, &argcount, _sprintf_alloc("%s", "--no-shconf"));
265 		if (args == NULL) {
266 			return -1;
267 		}
268 	}
269 
270 	/*
271 	 * Set the coremask:
272 	 *
273 	 * - if it starts with '-', we presume it's literal EAL arguments such
274 	 *   as --lcores.
275 	 *
276 	 * - if it starts with '[', we presume it's a core list to use with the
277 	 *   -l option.
278 	 *
279 	 * - otherwise, it's a CPU mask of the form "0xff.." as expected by the
280 	 *   -c option.
281 	 */
282 	if (opts->core_mask[0] == '-') {
283 		args = push_arg(args, &argcount, _sprintf_alloc("%s", opts->core_mask));
284 	} else if (opts->core_mask[0] == '[') {
285 		char *l_arg = _sprintf_alloc("-l %s", opts->core_mask + 1);
286 
287 		if (l_arg != NULL) {
288 			int len = strlen(l_arg);
289 
290 			if (l_arg[len - 1] == ']') {
291 				l_arg[len - 1] = '\0';
292 			}
293 		}
294 		args = push_arg(args, &argcount, l_arg);
295 	} else {
296 		args = push_arg(args, &argcount, _sprintf_alloc("-c %s", opts->core_mask));
297 	}
298 
299 	if (args == NULL) {
300 		return -1;
301 	}
302 
303 	/* set the memory channel number */
304 	if (opts->mem_channel > 0) {
305 		args = push_arg(args, &argcount, _sprintf_alloc("-n %d", opts->mem_channel));
306 		if (args == NULL) {
307 			return -1;
308 		}
309 	}
310 
311 	/* set the memory size */
312 	if (opts->mem_size >= 0) {
313 		args = push_arg(args, &argcount, _sprintf_alloc("-m %d", opts->mem_size));
314 		if (args == NULL) {
315 			return -1;
316 		}
317 	}
318 
319 	/* set the main core */
320 	if (opts->main_core > 0) {
321 		args = push_arg(args, &argcount, _sprintf_alloc("%s=%d",
322 				DPDK_MAIN_CORE_PARAM, opts->main_core));
323 		if (args == NULL) {
324 			return -1;
325 		}
326 	}
327 
328 	/* set no pci  if enabled */
329 	if (opts->no_pci) {
330 		args = push_arg(args, &argcount, _sprintf_alloc("--no-pci"));
331 		if (args == NULL) {
332 			return -1;
333 		}
334 	}
335 
336 	/* create just one hugetlbfs file */
337 	if (opts->hugepage_single_segments) {
338 		args = push_arg(args, &argcount, _sprintf_alloc("--single-file-segments"));
339 		if (args == NULL) {
340 			return -1;
341 		}
342 	}
343 
344 	/* unlink hugepages after initialization */
345 	/* Note: Automatically unlink hugepage when shm_id < 0, since it means we're not using
346 	 * multi-process so we don't need the hugepage links anymore.  But we need to make sure
347 	 * we don't specify --huge-unlink implicitly if --single-file-segments was specified since
348 	 * DPDK doesn't support that.
349 	 */
350 	if (opts->unlink_hugepage ||
351 	    (opts->shm_id < 0 && !opts->hugepage_single_segments)) {
352 		args = push_arg(args, &argcount, _sprintf_alloc("--huge-unlink"));
353 		if (args == NULL) {
354 			return -1;
355 		}
356 	}
357 
358 	/* use a specific hugetlbfs mount */
359 	if (opts->hugedir) {
360 		args = push_arg(args, &argcount, _sprintf_alloc("--huge-dir=%s", opts->hugedir));
361 		if (args == NULL) {
362 			return -1;
363 		}
364 	}
365 
366 	if (opts->num_pci_addr) {
367 		size_t i;
368 		char bdf[32];
369 		struct spdk_pci_addr *pci_addr =
370 				opts->pci_blocked ? opts->pci_blocked : opts->pci_allowed;
371 
372 		for (i = 0; i < opts->num_pci_addr; i++) {
373 			spdk_pci_addr_fmt(bdf, 32, &pci_addr[i]);
374 			args = push_arg(args, &argcount, _sprintf_alloc("%s=%s",
375 					(opts->pci_blocked ? DPDK_BLOCK_PARAM : DPDK_ALLOW_PARAM),
376 					bdf));
377 			if (args == NULL) {
378 				return -1;
379 			}
380 		}
381 	}
382 
383 	/* Lower default EAL loglevel to RTE_LOG_NOTICE - normal, but significant messages.
384 	 * This can be overridden by specifying the same option in opts->env_context
385 	 */
386 	args = push_arg(args, &argcount, strdup("--log-level=lib.eal:6"));
387 	if (args == NULL) {
388 		return -1;
389 	}
390 
391 	/* Lower default CRYPTO loglevel to RTE_LOG_ERR to avoid a ton of init msgs.
392 	 * This can be overridden by specifying the same option in opts->env_context
393 	 */
394 	args = push_arg(args, &argcount, strdup("--log-level=lib.cryptodev:5"));
395 	if (args == NULL) {
396 		return -1;
397 	}
398 
399 	/* `user1` log type is used by rte_vhost, which prints an INFO log for each received
400 	 * vhost user message. We don't want that. The same log type is also used by a couple
401 	 * of other DPDK libs, but none of which we make use right now. If necessary, this can
402 	 * be overridden via opts->env_context.
403 	 */
404 	args = push_arg(args, &argcount, strdup("--log-level=user1:6"));
405 	if (args == NULL) {
406 		return -1;
407 	}
408 
409 	if (opts->env_context) {
410 		char *ptr = strdup(opts->env_context);
411 		char *tok = strtok(ptr, " \t");
412 
413 		/* DPDK expects each argument as a separate string in the argv
414 		 * array, so we need to tokenize here in case the caller
415 		 * passed multiple arguments in the env_context string.
416 		 */
417 		while (tok != NULL) {
418 			args = push_arg(args, &argcount, strdup(tok));
419 			tok = strtok(NULL, " \t");
420 		}
421 
422 		free(ptr);
423 	}
424 
425 #ifdef __linux__
426 
427 	if (opts->iova_mode) {
428 		args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=%s", opts->iova_mode));
429 		if (args == NULL) {
430 			return -1;
431 		}
432 	} else {
433 		/* When using vfio with enable_unsafe_noiommu_mode=Y, we need iova-mode=pa,
434 		 * but DPDK guesses it should be iova-mode=va. Add a check and force
435 		 * iova-mode=pa here. */
436 		if (rte_vfio_noiommu_is_enabled()) {
437 			args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
438 			if (args == NULL) {
439 				return -1;
440 			}
441 		}
442 
443 #if defined(__x86_64__)
444 		/* DPDK by default guesses that it should be using iova-mode=va so that it can
445 		 * support running as an unprivileged user. However, some systems (especially
446 		 * virtual machines) don't have an IOMMU capable of handling the full virtual
447 		 * address space and DPDK doesn't currently catch that. Add a check in SPDK
448 		 * and force iova-mode=pa here. */
449 		if (get_iommu_width() < SPDK_IOMMU_VA_REQUIRED_WIDTH) {
450 			args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
451 			if (args == NULL) {
452 				return -1;
453 			}
454 		}
455 #elif defined(__PPC64__)
456 		/* On Linux + PowerPC, DPDK doesn't support VA mode at all. Unfortunately, it doesn't correctly
457 		 * auto-detect at the moment, so we'll just force it here. */
458 		args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
459 		if (args == NULL) {
460 			return -1;
461 		}
462 #endif
463 	}
464 
465 
466 	/* Set the base virtual address - it must be an address that is not in the
467 	 * ASAN shadow region, otherwise ASAN-enabled builds will ignore the
468 	 * mmap hint.
469 	 *
470 	 * Ref: https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm
471 	 */
472 	args = push_arg(args, &argcount, _sprintf_alloc("--base-virtaddr=0x%" PRIx64, opts->base_virtaddr));
473 	if (args == NULL) {
474 		return -1;
475 	}
476 
477 	/* --match-allocation prevents DPDK from merging or splitting system memory allocations under the hood.
478 	 * This is critical for RDMA when attempting to use an rte_mempool based buffer pool. If DPDK merges two
479 	 * physically or IOVA contiguous memory regions, then when we go to allocate a buffer pool, it can split
480 	 * the memory for a buffer over two allocations meaning the buffer will be split over a memory region.
481 	 */
482 	if (!opts->env_context || strstr(opts->env_context, "--legacy-mem") == NULL) {
483 		args = push_arg(args, &argcount, _sprintf_alloc("%s", "--match-allocations"));
484 		if (args == NULL) {
485 			return -1;
486 		}
487 	}
488 
489 	if (opts->shm_id < 0) {
490 		args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk_pid%d",
491 				getpid()));
492 		if (args == NULL) {
493 			return -1;
494 		}
495 	} else {
496 		args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk%d",
497 				opts->shm_id));
498 		if (args == NULL) {
499 			return -1;
500 		}
501 
502 		/* set the process type */
503 		args = push_arg(args, &argcount, _sprintf_alloc("--proc-type=auto"));
504 		if (args == NULL) {
505 			return -1;
506 		}
507 	}
508 #endif
509 
510 	g_eal_cmdline = args;
511 	g_eal_cmdline_argcount = argcount;
512 	return argcount;
513 }
514 
/*
 * Bring up the SPDK layers that sit on top of an already initialized DPDK:
 * the PCI environment, the memory map, and vtophys, in that order.
 *
 * legacy_mem is forwarded unchanged to mem_map_init().
 *
 * Returns 0 on success, or the negative value reported by the step that
 * failed.  Nothing set up by earlier steps is rolled back on failure.
 */
int
spdk_env_dpdk_post_init(bool legacy_mem)
{
	int status;

	pci_env_init();

	status = mem_map_init(legacy_mem);
	if (status >= 0) {
		status = vtophys_init();
		if (status >= 0) {
			return 0;
		}

		SPDK_ERRLOG("Failed to initialize vtophys\n");
	} else {
		SPDK_ERRLOG("Failed to allocate mem_map\n");
	}

	return status;
}
536 
537 void
538 spdk_env_dpdk_post_fini(void)
539 {
540 	pci_env_fini();
541 
542 	free_args(g_eal_cmdline, g_eal_cmdline_argcount);
543 	g_eal_cmdline = NULL;
544 	g_eal_cmdline_argcount = 0;
545 }
546 
547 int
548 spdk_env_init(const struct spdk_env_opts *opts)
549 {
550 	char **dpdk_args = NULL;
551 	int i, rc;
552 	int orig_optind;
553 	bool legacy_mem;
554 
555 	/* If SPDK env has been initialized before, then only pci env requires
556 	 * reinitialization.
557 	 */
558 	if (g_external_init == false) {
559 		if (opts != NULL) {
560 			fprintf(stderr, "Invalid arguments to reinitialize SPDK env\n");
561 			return -EINVAL;
562 		}
563 
564 		printf("Starting %s / %s reinitialization...\n", SPDK_VERSION_STRING, rte_version());
565 		pci_env_reinit();
566 
567 		return 0;
568 	}
569 
570 	if (opts == NULL) {
571 		fprintf(stderr, "NULL arguments to initialize DPDK\n");
572 		return -EINVAL;
573 	}
574 
575 	rc = build_eal_cmdline(opts);
576 	if (rc < 0) {
577 		SPDK_ERRLOG("Invalid arguments to initialize DPDK\n");
578 		return -EINVAL;
579 	}
580 
581 	SPDK_PRINTF("Starting %s / %s initialization...\n", SPDK_VERSION_STRING, rte_version());
582 	SPDK_PRINTF("[ DPDK EAL parameters: ");
583 	for (i = 0; i < g_eal_cmdline_argcount; i++) {
584 		SPDK_PRINTF("%s ", g_eal_cmdline[i]);
585 	}
586 	SPDK_PRINTF("]\n");
587 
588 	/* DPDK rearranges the array we pass to it, so make a copy
589 	 * before passing so we can still free the individual strings
590 	 * correctly.
591 	 */
592 	dpdk_args = calloc(g_eal_cmdline_argcount, sizeof(char *));
593 	if (dpdk_args == NULL) {
594 		SPDK_ERRLOG("Failed to allocate dpdk_args\n");
595 		return -ENOMEM;
596 	}
597 	memcpy(dpdk_args, g_eal_cmdline, sizeof(char *) * g_eal_cmdline_argcount);
598 
599 	fflush(stdout);
600 	orig_optind = optind;
601 	optind = 1;
602 	rc = rte_eal_init(g_eal_cmdline_argcount, dpdk_args);
603 	optind = orig_optind;
604 
605 	free(dpdk_args);
606 
607 	if (rc < 0) {
608 		if (rte_errno == EALREADY) {
609 			SPDK_ERRLOG("DPDK already initialized\n");
610 		} else {
611 			SPDK_ERRLOG("Failed to initialize DPDK\n");
612 		}
613 		return -rte_errno;
614 	}
615 
616 	legacy_mem = false;
617 	if (opts->env_context && strstr(opts->env_context, "--legacy-mem") != NULL) {
618 		legacy_mem = true;
619 	}
620 
621 	rc = spdk_env_dpdk_post_init(legacy_mem);
622 	if (rc == 0) {
623 		g_external_init = false;
624 	}
625 
626 	return rc;
627 }
628 
629 /* We use priority 101 which is the highest priority level available
630  * to applications (the toolchains reserve 1 to 100 for internal usage).
631  * This ensures this destructor runs last, after any other destructors
632  * that might still need the environment up and running.
633  */
634 __attribute__((destructor(101))) static void
635 dpdk_cleanup(void)
636 {
637 	/* Only call rte_eal_cleanup if the SPDK env library called rte_eal_init. */
638 	if (!g_external_init) {
639 		rte_eal_cleanup();
640 	}
641 }
642 
/*
 * Public teardown entry point; all of the work is delegated to
 * spdk_env_dpdk_post_fini().
 */
void
spdk_env_fini(void)
{
	spdk_env_dpdk_post_fini();
}
648 
649 bool
650 spdk_env_dpdk_external_init(void)
651 {
652 	return g_external_init;
653 }
654