xref: /spdk/lib/env_dpdk/init.c (revision da60639f86dd88295eb46c2d76f9c327db92d7b3)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "env_internal.h"
37 
38 #include "spdk/version.h"
39 #include "spdk/env_dpdk.h"
40 
41 #include <rte_config.h>
42 #include <rte_eal.h>
43 #include <rte_errno.h>
44 #include <rte_vfio.h>
45 
/*
 * Default values installed by spdk_env_opts_init().
 *
 * The negative defaults act as "not set" markers for build_eal_cmdline():
 * a negative shm_id selects the single-process options, a negative mem_size
 * omits "-m", and master_core / mem_channel are only passed when greater
 * than zero.
 */
#define SPDK_ENV_DPDK_DEFAULT_NAME		"spdk"
#define SPDK_ENV_DPDK_DEFAULT_SHM_ID		-1
#define SPDK_ENV_DPDK_DEFAULT_MEM_SIZE		-1
#define SPDK_ENV_DPDK_DEFAULT_MASTER_CORE	-1
#define SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL	-1
#define SPDK_ENV_DPDK_DEFAULT_CORE_MASK		"0x1"
/* Passed to the EAL as --base-virtaddr on Linux builds. */
#define SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR	0x200000000000
53 
/*
 * EAL argument vector produced by build_eal_cmdline() and its element count.
 * The strings stay allocated until spdk_env_dpdk_post_fini() releases them.
 */
static char **g_eal_cmdline;
static int g_eal_cmdline_argcount;
/*
 * Starts out true and is cleared by spdk_env_init() once it has initialized
 * the environment successfully; reported by spdk_env_dpdk_external_init().
 */
static bool g_external_init = true;
57 
/*
 * printf-style formatting into a heap buffer.
 *
 * The buffer starts at 32 bytes and is doubled until the formatted string
 * (including its terminator) fits, up to a 1 MiB cap.
 *
 * Returns a malloc()ed string the caller must free(), or NULL when an
 * allocation fails or the result does not fit within the cap.
 */
static char *
_sprintf_alloc(const char *format, ...)
{
	/* Upper bound on the buffer size so a misbehaving libc cannot make us loop forever. */
	const size_t max_capacity = 1024 * 1024;
	size_t capacity;
	va_list ap;

	va_start(ap, format);

	for (capacity = 32; capacity <= max_capacity; capacity *= 2) {
		va_list ap_attempt;
		char *out;
		int written;

		out = malloc(capacity);
		if (out == NULL) {
			break;
		}

		/* Every attempt consumes its own copy of the argument list. */
		va_copy(ap_attempt, ap);
		written = vsnprintf(out, capacity, format, ap_attempt);
		va_end(ap_attempt);

		/*
		 * The reported length excludes the terminating \0, so a value equal
		 * to the capacity means the output was truncated.
		 */
		if (written >= 0 && (size_t)written < capacity) {
			va_end(ap);
			return out;
		}

		/*
		 * Some libc versions do not report the required size correctly, so
		 * simply retry with twice the space. The old contents are useless,
		 * hence free() + malloc() instead of realloc() to skip the copy.
		 */
		free(out);
	}

	va_end(ap);
	return NULL;
}
107 
108 void
109 spdk_env_opts_init(struct spdk_env_opts *opts)
110 {
111 	if (!opts) {
112 		return;
113 	}
114 
115 	memset(opts, 0, sizeof(*opts));
116 
117 	opts->name = SPDK_ENV_DPDK_DEFAULT_NAME;
118 	opts->core_mask = SPDK_ENV_DPDK_DEFAULT_CORE_MASK;
119 	opts->shm_id = SPDK_ENV_DPDK_DEFAULT_SHM_ID;
120 	opts->mem_size = SPDK_ENV_DPDK_DEFAULT_MEM_SIZE;
121 	opts->master_core = SPDK_ENV_DPDK_DEFAULT_MASTER_CORE;
122 	opts->mem_channel = SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL;
123 	opts->base_virtaddr = SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR;
124 }
125 
/*
 * Release an argument vector built by push_arg().
 *
 * Frees the first argcount strings and then the array itself. A NULL array
 * is ignored, and the array is only released when argcount is non-zero.
 */
static void
free_args(char **args, int argcount)
{
	int remaining = argcount;

	if (!args) {
		return;
	}

	/* Walk the strings from the back; the order of release does not matter. */
	while (remaining > 0) {
		remaining--;
		free(args[remaining]);
	}

	if (argcount != 0) {
		free(args);
	}
}
143 
/*
 * Append one heap-allocated string to an argument vector.
 *
 * args     - current vector (may be NULL when empty)
 * argcount - in/out element count; incremented on success
 * arg      - string to append; ownership passes to the vector
 *
 * Returns the (possibly moved) vector on success. On failure - a NULL arg
 * or an allocation error - the whole vector is released with free_args(),
 * arg itself is freed if it was non-NULL, and NULL is returned, so callers
 * have nothing left to clean up.
 */
static char **
push_arg(char *args[], int *argcount, char *arg)
{
	const int count = *argcount;
	char **grown;

	if (arg == NULL) {
		fprintf(stderr, "%s: NULL arg supplied\n", __func__);
		free_args(args, count);
		return NULL;
	}

	/* Grow by exactly one slot; the old pointer stays valid if this fails. */
	grown = realloc(args, sizeof(*grown) * (count + 1));
	if (grown == NULL) {
		free(arg);
		free_args(args, count);
		return NULL;
	}

	grown[count] = arg;
	*argcount = count + 1;

	return grown;
}
167 
#if defined(__linux__) && defined(__x86_64__)

/* TODO: Can likely get this value from rlimits in the future */
#define SPDK_IOMMU_VA_REQUIRED_WIDTH 48
#define VTD_CAP_MGAW_SHIFT 16
#define VTD_CAP_MGAW_MASK (0x3F << VTD_CAP_MGAW_SHIFT)

/*
 * Report the smallest address width among the IOMMUs exposed in sysfs.
 *
 * Each "dmar*" entry under /sys/devices/virtual/iommu/ is expected to provide
 * an intel-iommu/cap file holding the capability register as a hexadecimal
 * string. The width of one unit is the MGAW field selected by
 * VTD_CAP_MGAW_MASK, plus one.
 *
 * Returns -EINVAL when the sysfs directory cannot be opened, 0 when no unit
 * yielded a readable capability value, and the minimum width otherwise.
 */
static int
get_iommu_width(void)
{
	DIR *dir;
	FILE *file;
	struct dirent *entry;
	char mgaw_path[64];
	char buf[64];
	char *end;
	char *line;
	unsigned long long val;
	int width, tmp;

	dir = opendir("/sys/devices/virtual/iommu/");
	if (dir == NULL) {
		return -EINVAL;
	}

	width = 0;

	while ((entry = readdir(dir)) != NULL) {
		/* Find directories named "dmar0", "dmar1", etc */
		if (strncmp(entry->d_name, "dmar", sizeof("dmar") - 1) != 0) {
			continue;
		}

		tmp = snprintf(mgaw_path, sizeof(mgaw_path), "/sys/devices/virtual/iommu/%s/intel-iommu/cap",
			       entry->d_name);
		if (tmp < 0 || (size_t)tmp >= sizeof(mgaw_path)) {
			/* Formatting failed or the path was truncated. */
			continue;
		}

		file = fopen(mgaw_path, "r");
		if (file == NULL) {
			continue;
		}

		/* Only the first line is needed, so the stream can be closed right away. */
		line = fgets(buf, sizeof(buf), file);
		fclose(file);
		if (line == NULL) {
			continue;
		}

		/*
		 * The capability register is a 64-bit unsigned quantity, so parse it
		 * as unsigned: a signed conversion would saturate and discard any
		 * value that has its top bit set. Lines without any hex digits and
		 * out-of-range values are skipped.
		 */
		errno = 0;
		val = strtoull(buf, &end, 16);
		if (end == buf || errno == ERANGE) {
			continue;
		}

		tmp = (int)((val & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1;
		if (width == 0 || tmp < width) {
			width = tmp;
		}
	}

	closedir(dir);

	return width;
}

#endif
236 
237 static int
238 build_eal_cmdline(const struct spdk_env_opts *opts)
239 {
240 	int argcount = 0;
241 	char **args;
242 
243 	args = NULL;
244 
245 	/* set the program name */
246 	args = push_arg(args, &argcount, _sprintf_alloc("%s", opts->name));
247 	if (args == NULL) {
248 		return -1;
249 	}
250 
251 	/* disable shared configuration files when in single process mode. This allows for cleaner shutdown */
252 	if (opts->shm_id < 0) {
253 		args = push_arg(args, &argcount, _sprintf_alloc("%s", "--no-shconf"));
254 		if (args == NULL) {
255 			return -1;
256 		}
257 	}
258 
259 	/* set the coremask */
260 	/* NOTE: If coremask starts with '[' and ends with ']' it is a core list
261 	 */
262 	if (opts->core_mask[0] == '[') {
263 		char *l_arg = _sprintf_alloc("-l %s", opts->core_mask + 1);
264 
265 		if (l_arg != NULL) {
266 			int len = strlen(l_arg);
267 
268 			if (l_arg[len - 1] == ']') {
269 				l_arg[len - 1] = '\0';
270 			}
271 		}
272 		args = push_arg(args, &argcount, l_arg);
273 	} else {
274 		args = push_arg(args, &argcount, _sprintf_alloc("-c %s", opts->core_mask));
275 	}
276 
277 	if (args == NULL) {
278 		return -1;
279 	}
280 
281 	/* set the memory channel number */
282 	if (opts->mem_channel > 0) {
283 		args = push_arg(args, &argcount, _sprintf_alloc("-n %d", opts->mem_channel));
284 		if (args == NULL) {
285 			return -1;
286 		}
287 	}
288 
289 	/* set the memory size */
290 	if (opts->mem_size >= 0) {
291 		args = push_arg(args, &argcount, _sprintf_alloc("-m %d", opts->mem_size));
292 		if (args == NULL) {
293 			return -1;
294 		}
295 	}
296 
297 	/* set the master core */
298 	if (opts->master_core > 0) {
299 		args = push_arg(args, &argcount, _sprintf_alloc("--master-lcore=%d",
300 				opts->master_core));
301 		if (args == NULL) {
302 			return -1;
303 		}
304 	}
305 
306 	/* set no pci  if enabled */
307 	if (opts->no_pci) {
308 		args = push_arg(args, &argcount, _sprintf_alloc("--no-pci"));
309 		if (args == NULL) {
310 			return -1;
311 		}
312 	}
313 
314 	/* create just one hugetlbfs file */
315 	if (opts->hugepage_single_segments) {
316 		args = push_arg(args, &argcount, _sprintf_alloc("--single-file-segments"));
317 		if (args == NULL) {
318 			return -1;
319 		}
320 	}
321 
322 	/* unlink hugepages after initialization */
323 	if (opts->unlink_hugepage) {
324 		args = push_arg(args, &argcount, _sprintf_alloc("--huge-unlink"));
325 		if (args == NULL) {
326 			return -1;
327 		}
328 	}
329 
330 	/* use a specific hugetlbfs mount */
331 	if (opts->hugedir) {
332 		args = push_arg(args, &argcount, _sprintf_alloc("--huge-dir=%s", opts->hugedir));
333 		if (args == NULL) {
334 			return -1;
335 		}
336 	}
337 
338 	if (opts->num_pci_addr) {
339 		size_t i;
340 		char bdf[32];
341 		struct spdk_pci_addr *pci_addr =
342 				opts->pci_blacklist ? opts->pci_blacklist : opts->pci_whitelist;
343 
344 		for (i = 0; i < opts->num_pci_addr; i++) {
345 			spdk_pci_addr_fmt(bdf, 32, &pci_addr[i]);
346 			args = push_arg(args, &argcount, _sprintf_alloc("%s=%s",
347 					(opts->pci_blacklist ? "--pci-blacklist" : "--pci-whitelist"),
348 					bdf));
349 			if (args == NULL) {
350 				return -1;
351 			}
352 		}
353 	}
354 
355 	/* Lower default EAL loglevel to RTE_LOG_NOTICE - normal, but significant messages.
356 	 * This can be overridden by specifying the same option in opts->env_context
357 	 */
358 	args = push_arg(args, &argcount, strdup("--log-level=lib.eal:6"));
359 	if (args == NULL) {
360 		return -1;
361 	}
362 
363 	/* Lower default CRYPTO loglevel to RTE_LOG_ERR to avoid a ton of init msgs.
364 	 * This can be overridden by specifying the same option in opts->env_context
365 	 */
366 	args = push_arg(args, &argcount, strdup("--log-level=lib.cryptodev:5"));
367 	if (args == NULL) {
368 		return -1;
369 	}
370 
371 	/* `user1` log type is used by rte_vhost, which prints an INFO log for each received
372 	 * vhost user message. We don't want that. The same log type is also used by a couple
373 	 * of other DPDK libs, but none of which we make use right now. If necessary, this can
374 	 * be overridden via opts->env_context.
375 	 */
376 	args = push_arg(args, &argcount, strdup("--log-level=user1:6"));
377 	if (args == NULL) {
378 		return -1;
379 	}
380 
381 	if (opts->env_context) {
382 		args = push_arg(args, &argcount, strdup(opts->env_context));
383 		if (args == NULL) {
384 			return -1;
385 		}
386 	}
387 
388 #ifdef __linux__
389 
390 	if (opts->iova_mode) {
391 		args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=%s", opts->iova_mode));
392 		if (args == NULL) {
393 			return -1;
394 		}
395 	} else {
396 		/* When using vfio with enable_unsafe_noiommu_mode=Y, we need iova-mode=pa,
397 		 * but DPDK guesses it should be iova-mode=va. Add a check and force
398 		 * iova-mode=pa here. */
399 		if (rte_vfio_noiommu_is_enabled()) {
400 			args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
401 			if (args == NULL) {
402 				return -1;
403 			}
404 		}
405 
406 #if defined(__x86_64__)
407 		/* DPDK by default guesses that it should be using iova-mode=va so that it can
408 		 * support running as an unprivileged user. However, some systems (especially
409 		 * virtual machines) don't have an IOMMU capable of handling the full virtual
410 		 * address space and DPDK doesn't currently catch that. Add a check in SPDK
411 		 * and force iova-mode=pa here. */
412 		if (get_iommu_width() < SPDK_IOMMU_VA_REQUIRED_WIDTH) {
413 			args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
414 			if (args == NULL) {
415 				return -1;
416 			}
417 		}
418 #elif defined(__PPC64__)
419 		/* On Linux + PowerPC, DPDK doesn't support VA mode at all. Unfortunately, it doesn't correctly
420 		 * auto-detect at the moment, so we'll just force it here. */
421 		args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
422 		if (args == NULL) {
423 			return -1;
424 		}
425 #endif
426 	}
427 
428 
429 	/* Set the base virtual address - it must be an address that is not in the
430 	 * ASAN shadow region, otherwise ASAN-enabled builds will ignore the
431 	 * mmap hint.
432 	 *
433 	 * Ref: https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm
434 	 */
435 	args = push_arg(args, &argcount, _sprintf_alloc("--base-virtaddr=0x%" PRIx64, opts->base_virtaddr));
436 	if (args == NULL) {
437 		return -1;
438 	}
439 
440 	/* --match-allocation prevents DPDK from merging or splitting system memory allocations under the hood.
441 	 * This is critical for RDMA when attempting to use an rte_mempool based buffer pool. If DPDK merges two
442 	 * physically or IOVA contiguous memory regions, then when we go to allocate a buffer pool, it can split
443 	 * the memory for a buffer over two allocations meaning the buffer will be split over a memory region.
444 	 */
445 #if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0)
446 	if (!opts->env_context || strstr(opts->env_context, "--legacy-mem") == NULL) {
447 		args = push_arg(args, &argcount, _sprintf_alloc("%s", "--match-allocations"));
448 		if (args == NULL) {
449 			return -1;
450 		}
451 	}
452 #endif
453 
454 	if (opts->shm_id < 0) {
455 		args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk_pid%d",
456 				getpid()));
457 		if (args == NULL) {
458 			return -1;
459 		}
460 	} else {
461 		args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk%d",
462 				opts->shm_id));
463 		if (args == NULL) {
464 			return -1;
465 		}
466 
467 		/* set the process type */
468 		args = push_arg(args, &argcount, _sprintf_alloc("--proc-type=auto"));
469 		if (args == NULL) {
470 			return -1;
471 		}
472 	}
473 #endif
474 
475 	g_eal_cmdline = args;
476 	g_eal_cmdline_argcount = argcount;
477 	return argcount;
478 }
479 
/*
 * Bring up the SPDK-side state that sits on top of an initialized EAL:
 * the PCI environment, the memory map and the vtophys translation.
 *
 * legacy_mem is forwarded to mem_map_init().
 *
 * Returns 0 on success, or the negative value reported by the step that
 * failed (after printing a message to stderr).
 */
int
spdk_env_dpdk_post_init(bool legacy_mem)
{
	int status;

	pci_env_init();

	status = mem_map_init(legacy_mem);
	if (status >= 0) {
		status = vtophys_init();
		if (status >= 0) {
			return 0;
		}

		fprintf(stderr, "Failed to initialize vtophys\n");
		return status;
	}

	fprintf(stderr, "Failed to allocate mem_map\n");
	return status;
}
501 
502 void
503 spdk_env_dpdk_post_fini(void)
504 {
505 	pci_env_fini();
506 
507 	free_args(g_eal_cmdline, g_eal_cmdline_argcount);
508 	g_eal_cmdline = NULL;
509 	g_eal_cmdline_argcount = 0;
510 }
511 
512 int
513 spdk_env_init(const struct spdk_env_opts *opts)
514 {
515 	char **dpdk_args = NULL;
516 	int i, rc;
517 	int orig_optind;
518 	bool legacy_mem;
519 
520 	/* If SPDK env has been initialized before, then only pci env requires
521 	 * reinitialization.
522 	 */
523 	if (g_external_init == false) {
524 		if (opts != NULL) {
525 			fprintf(stderr, "Invalid arguments to reinitialize SPDK env\n");
526 			return -EINVAL;
527 		}
528 
529 		printf("Starting %s / %s reinitialization...\n", SPDK_VERSION_STRING, rte_version());
530 		pci_env_reinit();
531 
532 		return 0;
533 	}
534 
535 	if (opts == NULL) {
536 		fprintf(stderr, "NULL arguments to initialize DPDK\n");
537 		return -EINVAL;
538 	}
539 
540 	rc = build_eal_cmdline(opts);
541 	if (rc < 0) {
542 		fprintf(stderr, "Invalid arguments to initialize DPDK\n");
543 		return -EINVAL;
544 	}
545 
546 	printf("Starting %s / %s initialization...\n", SPDK_VERSION_STRING, rte_version());
547 	printf("[ DPDK EAL parameters: ");
548 	for (i = 0; i < g_eal_cmdline_argcount; i++) {
549 		printf("%s ", g_eal_cmdline[i]);
550 	}
551 	printf("]\n");
552 
553 	/* DPDK rearranges the array we pass to it, so make a copy
554 	 * before passing so we can still free the individual strings
555 	 * correctly.
556 	 */
557 	dpdk_args = calloc(g_eal_cmdline_argcount, sizeof(char *));
558 	if (dpdk_args == NULL) {
559 		fprintf(stderr, "Failed to allocate dpdk_args\n");
560 		return -ENOMEM;
561 	}
562 	memcpy(dpdk_args, g_eal_cmdline, sizeof(char *) * g_eal_cmdline_argcount);
563 
564 	fflush(stdout);
565 	orig_optind = optind;
566 	optind = 1;
567 	rc = rte_eal_init(g_eal_cmdline_argcount, dpdk_args);
568 	optind = orig_optind;
569 
570 	free(dpdk_args);
571 
572 	if (rc < 0) {
573 		if (rte_errno == EALREADY) {
574 			fprintf(stderr, "DPDK already initialized\n");
575 		} else {
576 			fprintf(stderr, "Failed to initialize DPDK\n");
577 		}
578 		return -rte_errno;
579 	}
580 
581 	legacy_mem = false;
582 	if (opts->env_context && strstr(opts->env_context, "--legacy-mem") != NULL) {
583 		legacy_mem = true;
584 	}
585 
586 	rc = spdk_env_dpdk_post_init(legacy_mem);
587 	if (rc == 0) {
588 		g_external_init = false;
589 	}
590 
591 	return rc;
592 }
593 
/*
 * Public teardown entry point; all of the work is delegated to
 * spdk_env_dpdk_post_fini().
 */
void
spdk_env_fini(void)
{
	spdk_env_dpdk_post_fini();
}
599 
/*
 * Report the g_external_init flag: true until spdk_env_init() has completed
 * a successful initialization, false afterwards.
 */
bool
spdk_env_dpdk_external_init(void)
{
	return g_external_init;
}
605