xref: /spdk/lib/env_dpdk/init.c (revision ceea3088870a3919d6bdfe61d7adba11b9733fb7)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "env_internal.h"
37 
38 #include "spdk/version.h"
39 #include "spdk/env_dpdk.h"
40 #include "spdk/log.h"
41 
42 #include <rte_config.h>
43 #include <rte_eal.h>
44 #include <rte_errno.h>
45 #include <rte_vfio.h>
46 
/*
 * Default values installed by spdk_env_opts_init(). Negative defaults are
 * parenthesized so that the macros expand safely inside any expression.
 */
#define SPDK_ENV_DPDK_DEFAULT_NAME		"spdk"
#define SPDK_ENV_DPDK_DEFAULT_SHM_ID		(-1)
#define SPDK_ENV_DPDK_DEFAULT_MEM_SIZE		(-1)
#define SPDK_ENV_DPDK_DEFAULT_MASTER_CORE	(-1)
#define SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL	(-1)
#define SPDK_ENV_DPDK_DEFAULT_CORE_MASK		"0x1"
#define SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR	0x200000000000

/* EAL argument vector produced by build_eal_cmdline() and its element count. */
static char **g_eal_cmdline;
static int g_eal_cmdline_argcount;

/* Stays true until spdk_env_init() has completed a full initialization itself. */
static bool g_external_init = true;
58 
/*
 * printf-style formatting into a freshly malloc()ed buffer.
 *
 * Returns the NUL-terminated result, which the caller must free(), or NULL when
 * an allocation fails or the output does not fit in the largest buffer tried
 * (1 MiB).
 */
static char *
_sprintf_alloc(const char *format, ...)
{
	const size_t max_size = 1024 * 1024;
	va_list ap;
	char *out = NULL;
	size_t size;

	va_start(ap, format);

	/* Start small and double on every miss; the upper bound guarantees termination. */
	for (size = 32; size <= max_size; size *= 2) {
		va_list ap_try;
		char *candidate;
		int written;

		candidate = malloc(size);
		if (candidate == NULL) {
			break;
		}

		/* vsnprintf() consumes the list, so each attempt works on its own copy. */
		va_copy(ap_try, ap);
		written = vsnprintf(candidate, size, format, ap_try);
		va_end(ap_try);

		/*
		 * The returned count excludes the \0 terminator, so it has to be
		 * strictly smaller than the buffer for the output to be complete.
		 */
		if (written >= 0 && (size_t)written < size) {
			out = candidate;
			break;
		}

		/*
		 * The reported length is not trusted (some libc versions get it wrong),
		 * and the partial output is worthless, so free() and allocate anew
		 * instead of paying for a realloc() copy.
		 */
		free(candidate);
	}

	va_end(ap);
	return out;
}
108 
109 void
110 spdk_env_opts_init(struct spdk_env_opts *opts)
111 {
112 	if (!opts) {
113 		return;
114 	}
115 
116 	memset(opts, 0, sizeof(*opts));
117 
118 	opts->name = SPDK_ENV_DPDK_DEFAULT_NAME;
119 	opts->core_mask = SPDK_ENV_DPDK_DEFAULT_CORE_MASK;
120 	opts->shm_id = SPDK_ENV_DPDK_DEFAULT_SHM_ID;
121 	opts->mem_size = SPDK_ENV_DPDK_DEFAULT_MEM_SIZE;
122 	opts->master_core = SPDK_ENV_DPDK_DEFAULT_MASTER_CORE;
123 	opts->mem_channel = SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL;
124 	opts->base_virtaddr = SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR;
125 }
126 
/*
 * Release an argument vector: the first argcount strings and then the array
 * itself.
 *
 * args may be NULL, in which case nothing happens. The array is released even
 * when argcount is 0, so an allocated-but-empty vector is not leaked.
 */
static void
free_args(char **args, int argcount)
{
	int i;

	if (args == NULL) {
		return;
	}

	for (i = 0; i < argcount; i++) {
		free(args[i]);
	}

	free(args);
}
144 
/*
 * Append arg to the argument vector, growing the vector by one slot.
 *
 * Takes ownership of arg. On success the (possibly relocated) vector is
 * returned and *argcount is incremented. On failure - arg is NULL or the
 * vector cannot be grown - arg and the whole existing vector are freed and
 * NULL is returned, so the caller has nothing left to clean up.
 */
static char **
push_arg(char *args[], int *argcount, char *arg)
{
	const int count = *argcount;
	char **grown;

	/* A NULL arg normally means the allocation that produced it failed. */
	if (arg == NULL) {
		SPDK_ERRLOG("%s: NULL arg supplied\n", __func__);
		free_args(args, count);
		return NULL;
	}

	grown = realloc(args, sizeof(char *) * (count + 1));
	if (grown != NULL) {
		grown[count] = arg;
		*argcount = count + 1;
		return grown;
	}

	/* realloc() left the old vector intact, so release it along with arg. */
	free(arg);
	free_args(args, count);
	return NULL;
}
168 
#if defined(__linux__) && defined(__x86_64__)

/* TODO: Can likely get this value from rlimits in the future */
#define SPDK_IOMMU_VA_REQUIRED_WIDTH 48
#define VTD_CAP_MGAW_SHIFT 16
#define VTD_CAP_MGAW_MASK (0x3F << VTD_CAP_MGAW_SHIFT)

/*
 * Determine the smallest address width reported by the "dmar*" IOMMUs listed
 * under /sys/devices/virtual/iommu/.
 *
 * For each unit the hexadecimal capability value is read from its
 * intel-iommu/cap attribute and the MGAW field is extracted; the field stores
 * the width minus one.
 *
 * Returns the minimum width found, 0 if no unit yielded a usable value, or
 * -EINVAL if the sysfs directory cannot be opened.
 */
static int
get_iommu_width(void)
{
	DIR *dir;
	FILE *file;
	struct dirent *entry;
	char mgaw_path[64];
	char buf[64];
	char *end;
	unsigned long long val;
	int width, tmp;

	dir = opendir("/sys/devices/virtual/iommu/");
	if (dir == NULL) {
		return -EINVAL;
	}

	width = 0;

	while ((entry = readdir(dir)) != NULL) {
		/* Find directories named "dmar0", "dmar1", etc */
		if (strncmp(entry->d_name, "dmar", sizeof("dmar") - 1) != 0) {
			continue;
		}

		tmp = snprintf(mgaw_path, sizeof(mgaw_path), "/sys/devices/virtual/iommu/%s/intel-iommu/cap",
			       entry->d_name);
		/* The unsigned comparison rejects both truncation and a negative error return. */
		if ((unsigned)tmp >= sizeof(mgaw_path)) {
			continue;
		}

		file = fopen(mgaw_path, "r");
		if (file == NULL) {
			continue;
		}

		if (fgets(buf, sizeof(buf), file) == NULL) {
			fclose(file);
			continue;
		}

		/* The line is all that is needed; nothing below touches the file again. */
		fclose(file);

		/*
		 * The capability value is a full 64-bit quantity, so parse it as
		 * unsigned: a signed conversion would saturate, and thereby discard
		 * the unit, whenever the top bit is set. Units whose attribute holds
		 * no hex digits or overflows are skipped.
		 */
		errno = 0;
		val = strtoull(buf, &end, 16);
		if (end == buf || errno == ERANGE) {
			continue;
		}

		tmp = (int)((val & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1;
		if (width == 0 || tmp < width) {
			width = tmp;
		}
	}

	closedir(dir);

	return width;
}

#endif
237 
238 static int
239 build_eal_cmdline(const struct spdk_env_opts *opts)
240 {
241 	int argcount = 0;
242 	char **args;
243 
244 	args = NULL;
245 
246 	/* set the program name */
247 	args = push_arg(args, &argcount, _sprintf_alloc("%s", opts->name));
248 	if (args == NULL) {
249 		return -1;
250 	}
251 
252 	/* disable shared configuration files when in single process mode. This allows for cleaner shutdown */
253 	if (opts->shm_id < 0) {
254 		args = push_arg(args, &argcount, _sprintf_alloc("%s", "--no-shconf"));
255 		if (args == NULL) {
256 			return -1;
257 		}
258 	}
259 
260 	/* set the coremask */
261 	/* NOTE: If coremask starts with '[' and ends with ']' it is a core list
262 	 */
263 	if (opts->core_mask[0] == '[') {
264 		char *l_arg = _sprintf_alloc("-l %s", opts->core_mask + 1);
265 
266 		if (l_arg != NULL) {
267 			int len = strlen(l_arg);
268 
269 			if (l_arg[len - 1] == ']') {
270 				l_arg[len - 1] = '\0';
271 			}
272 		}
273 		args = push_arg(args, &argcount, l_arg);
274 	} else {
275 		args = push_arg(args, &argcount, _sprintf_alloc("-c %s", opts->core_mask));
276 	}
277 
278 	if (args == NULL) {
279 		return -1;
280 	}
281 
282 	/* set the memory channel number */
283 	if (opts->mem_channel > 0) {
284 		args = push_arg(args, &argcount, _sprintf_alloc("-n %d", opts->mem_channel));
285 		if (args == NULL) {
286 			return -1;
287 		}
288 	}
289 
290 	/* set the memory size */
291 	if (opts->mem_size >= 0) {
292 		args = push_arg(args, &argcount, _sprintf_alloc("-m %d", opts->mem_size));
293 		if (args == NULL) {
294 			return -1;
295 		}
296 	}
297 
298 	/* set the master core */
299 	if (opts->master_core > 0) {
300 		args = push_arg(args, &argcount, _sprintf_alloc("--master-lcore=%d",
301 				opts->master_core));
302 		if (args == NULL) {
303 			return -1;
304 		}
305 	}
306 
307 	/* set no pci  if enabled */
308 	if (opts->no_pci) {
309 		args = push_arg(args, &argcount, _sprintf_alloc("--no-pci"));
310 		if (args == NULL) {
311 			return -1;
312 		}
313 	}
314 
315 	/* create just one hugetlbfs file */
316 	if (opts->hugepage_single_segments) {
317 		args = push_arg(args, &argcount, _sprintf_alloc("--single-file-segments"));
318 		if (args == NULL) {
319 			return -1;
320 		}
321 	}
322 
323 	/* unlink hugepages after initialization */
324 	if (opts->unlink_hugepage) {
325 		args = push_arg(args, &argcount, _sprintf_alloc("--huge-unlink"));
326 		if (args == NULL) {
327 			return -1;
328 		}
329 	}
330 
331 	/* use a specific hugetlbfs mount */
332 	if (opts->hugedir) {
333 		args = push_arg(args, &argcount, _sprintf_alloc("--huge-dir=%s", opts->hugedir));
334 		if (args == NULL) {
335 			return -1;
336 		}
337 	}
338 
339 	if (opts->num_pci_addr) {
340 		size_t i;
341 		char bdf[32];
342 		struct spdk_pci_addr *pci_addr =
343 				opts->pci_blacklist ? opts->pci_blacklist : opts->pci_whitelist;
344 
345 		for (i = 0; i < opts->num_pci_addr; i++) {
346 			spdk_pci_addr_fmt(bdf, 32, &pci_addr[i]);
347 			args = push_arg(args, &argcount, _sprintf_alloc("%s=%s",
348 					(opts->pci_blacklist ? "--pci-blacklist" : "--pci-whitelist"),
349 					bdf));
350 			if (args == NULL) {
351 				return -1;
352 			}
353 		}
354 	}
355 
356 	/* Lower default EAL loglevel to RTE_LOG_NOTICE - normal, but significant messages.
357 	 * This can be overridden by specifying the same option in opts->env_context
358 	 */
359 	args = push_arg(args, &argcount, strdup("--log-level=lib.eal:6"));
360 	if (args == NULL) {
361 		return -1;
362 	}
363 
364 	/* Lower default CRYPTO loglevel to RTE_LOG_ERR to avoid a ton of init msgs.
365 	 * This can be overridden by specifying the same option in opts->env_context
366 	 */
367 	args = push_arg(args, &argcount, strdup("--log-level=lib.cryptodev:5"));
368 	if (args == NULL) {
369 		return -1;
370 	}
371 
372 	/* `user1` log type is used by rte_vhost, which prints an INFO log for each received
373 	 * vhost user message. We don't want that. The same log type is also used by a couple
374 	 * of other DPDK libs, but none of which we make use right now. If necessary, this can
375 	 * be overridden via opts->env_context.
376 	 */
377 	args = push_arg(args, &argcount, strdup("--log-level=user1:6"));
378 	if (args == NULL) {
379 		return -1;
380 	}
381 
382 	if (opts->env_context) {
383 		args = push_arg(args, &argcount, strdup(opts->env_context));
384 		if (args == NULL) {
385 			return -1;
386 		}
387 	}
388 
389 #ifdef __linux__
390 
391 	if (opts->iova_mode) {
392 		args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=%s", opts->iova_mode));
393 		if (args == NULL) {
394 			return -1;
395 		}
396 	} else {
397 		/* When using vfio with enable_unsafe_noiommu_mode=Y, we need iova-mode=pa,
398 		 * but DPDK guesses it should be iova-mode=va. Add a check and force
399 		 * iova-mode=pa here. */
400 		if (rte_vfio_noiommu_is_enabled()) {
401 			args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
402 			if (args == NULL) {
403 				return -1;
404 			}
405 		}
406 
407 #if defined(__x86_64__)
408 		/* DPDK by default guesses that it should be using iova-mode=va so that it can
409 		 * support running as an unprivileged user. However, some systems (especially
410 		 * virtual machines) don't have an IOMMU capable of handling the full virtual
411 		 * address space and DPDK doesn't currently catch that. Add a check in SPDK
412 		 * and force iova-mode=pa here. */
413 		if (get_iommu_width() < SPDK_IOMMU_VA_REQUIRED_WIDTH) {
414 			args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
415 			if (args == NULL) {
416 				return -1;
417 			}
418 		}
419 #elif defined(__PPC64__)
420 		/* On Linux + PowerPC, DPDK doesn't support VA mode at all. Unfortunately, it doesn't correctly
421 		 * auto-detect at the moment, so we'll just force it here. */
422 		args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
423 		if (args == NULL) {
424 			return -1;
425 		}
426 #endif
427 	}
428 
429 
430 	/* Set the base virtual address - it must be an address that is not in the
431 	 * ASAN shadow region, otherwise ASAN-enabled builds will ignore the
432 	 * mmap hint.
433 	 *
434 	 * Ref: https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm
435 	 */
436 	args = push_arg(args, &argcount, _sprintf_alloc("--base-virtaddr=0x%" PRIx64, opts->base_virtaddr));
437 	if (args == NULL) {
438 		return -1;
439 	}
440 
441 	/* --match-allocation prevents DPDK from merging or splitting system memory allocations under the hood.
442 	 * This is critical for RDMA when attempting to use an rte_mempool based buffer pool. If DPDK merges two
443 	 * physically or IOVA contiguous memory regions, then when we go to allocate a buffer pool, it can split
444 	 * the memory for a buffer over two allocations meaning the buffer will be split over a memory region.
445 	 */
446 #if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0)
447 	if (!opts->env_context || strstr(opts->env_context, "--legacy-mem") == NULL) {
448 		args = push_arg(args, &argcount, _sprintf_alloc("%s", "--match-allocations"));
449 		if (args == NULL) {
450 			return -1;
451 		}
452 	}
453 #endif
454 
455 	if (opts->shm_id < 0) {
456 		args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk_pid%d",
457 				getpid()));
458 		if (args == NULL) {
459 			return -1;
460 		}
461 	} else {
462 		args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk%d",
463 				opts->shm_id));
464 		if (args == NULL) {
465 			return -1;
466 		}
467 
468 		/* set the process type */
469 		args = push_arg(args, &argcount, _sprintf_alloc("--proc-type=auto"));
470 		if (args == NULL) {
471 			return -1;
472 		}
473 	}
474 #endif
475 
476 	g_eal_cmdline = args;
477 	g_eal_cmdline_argcount = argcount;
478 	return argcount;
479 }
480 
/*
 * Bring up the SPDK pieces layered on top of the EAL: the PCI environment,
 * the memory map and then vtophys, in that order.
 *
 * legacy_mem is forwarded to mem_map_init(). Returns 0 on success, or the
 * negative value reported by the step that failed.
 */
int
spdk_env_dpdk_post_init(bool legacy_mem)
{
	int status;

	pci_env_init();

	status = mem_map_init(legacy_mem);
	if (status < 0) {
		SPDK_ERRLOG("Failed to allocate mem_map\n");
		return status;
	}

	status = vtophys_init();
	if (status >= 0) {
		return 0;
	}

	SPDK_ERRLOG("Failed to initialize vtophys\n");
	return status;
}
502 
503 void
504 spdk_env_dpdk_post_fini(void)
505 {
506 	pci_env_fini();
507 
508 	free_args(g_eal_cmdline, g_eal_cmdline_argcount);
509 	g_eal_cmdline = NULL;
510 	g_eal_cmdline_argcount = 0;
511 }
512 
513 int
514 spdk_env_init(const struct spdk_env_opts *opts)
515 {
516 	char **dpdk_args = NULL;
517 	int i, rc;
518 	int orig_optind;
519 	bool legacy_mem;
520 
521 	/* If SPDK env has been initialized before, then only pci env requires
522 	 * reinitialization.
523 	 */
524 	if (g_external_init == false) {
525 		if (opts != NULL) {
526 			fprintf(stderr, "Invalid arguments to reinitialize SPDK env\n");
527 			return -EINVAL;
528 		}
529 
530 		printf("Starting %s / %s reinitialization...\n", SPDK_VERSION_STRING, rte_version());
531 		pci_env_reinit();
532 
533 		return 0;
534 	}
535 
536 	if (opts == NULL) {
537 		fprintf(stderr, "NULL arguments to initialize DPDK\n");
538 		return -EINVAL;
539 	}
540 
541 	rc = build_eal_cmdline(opts);
542 	if (rc < 0) {
543 		SPDK_ERRLOG("Invalid arguments to initialize DPDK\n");
544 		return -EINVAL;
545 	}
546 
547 	SPDK_PRINTF("Starting %s / %s initialization...\n", SPDK_VERSION_STRING, rte_version());
548 	SPDK_PRINTF("[ DPDK EAL parameters: ");
549 	for (i = 0; i < g_eal_cmdline_argcount; i++) {
550 		SPDK_PRINTF("%s ", g_eal_cmdline[i]);
551 	}
552 	SPDK_PRINTF("]\n");
553 
554 	/* DPDK rearranges the array we pass to it, so make a copy
555 	 * before passing so we can still free the individual strings
556 	 * correctly.
557 	 */
558 	dpdk_args = calloc(g_eal_cmdline_argcount, sizeof(char *));
559 	if (dpdk_args == NULL) {
560 		SPDK_ERRLOG("Failed to allocate dpdk_args\n");
561 		return -ENOMEM;
562 	}
563 	memcpy(dpdk_args, g_eal_cmdline, sizeof(char *) * g_eal_cmdline_argcount);
564 
565 	fflush(stdout);
566 	orig_optind = optind;
567 	optind = 1;
568 	rc = rte_eal_init(g_eal_cmdline_argcount, dpdk_args);
569 	optind = orig_optind;
570 
571 	free(dpdk_args);
572 
573 	if (rc < 0) {
574 		if (rte_errno == EALREADY) {
575 			SPDK_ERRLOG("DPDK already initialized\n");
576 		} else {
577 			SPDK_ERRLOG("Failed to initialize DPDK\n");
578 		}
579 		return -rte_errno;
580 	}
581 
582 	legacy_mem = false;
583 	if (opts->env_context && strstr(opts->env_context, "--legacy-mem") != NULL) {
584 		legacy_mem = true;
585 	}
586 
587 	rc = spdk_env_dpdk_post_init(legacy_mem);
588 	if (rc == 0) {
589 		g_external_init = false;
590 	}
591 
592 	return rc;
593 }
594 
/*
 * Tear down the SPDK environment. All of the work is done by
 * spdk_env_dpdk_post_fini().
 */
void
spdk_env_fini(void)
{
	spdk_env_dpdk_post_fini();
}
600 
601 bool
602 spdk_env_dpdk_external_init(void)
603 {
604 	return g_external_init;
605 }
606