xref: /spdk/lib/env_dpdk/init.c (revision 6b6a3ff91f77970587950b17ddb58bb65d690c8b)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "env_internal.h"
37 
38 #include "spdk/version.h"
39 #include "spdk/env_dpdk.h"
40 
41 #include <rte_config.h>
42 #include <rte_eal.h>
43 #include <rte_errno.h>
44 #include <rte_vfio.h>
45 
/*
 * Defaults installed by spdk_env_opts_init().
 *
 * The negative numeric defaults act as "not set": build_eal_cmdline() only
 * emits the matching EAL option when the value passes its sign check, and a
 * negative shm_id selects the single-process code paths.
 */
#define SPDK_ENV_DPDK_DEFAULT_NAME		"spdk"
#define SPDK_ENV_DPDK_DEFAULT_SHM_ID		(-1)
#define SPDK_ENV_DPDK_DEFAULT_MEM_SIZE		(-1)
#define SPDK_ENV_DPDK_DEFAULT_MASTER_CORE	(-1)
#define SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL	(-1)
#define SPDK_ENV_DPDK_DEFAULT_CORE_MASK		"0x1"
#define SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR	0x200000000000

/* Argument vector produced by build_eal_cmdline(); released in spdk_env_dpdk_post_fini(). */
static char **g_eal_cmdline = NULL;
/* Number of entries in g_eal_cmdline. */
static int g_eal_cmdline_argcount = 0;
/* Stays true until spdk_env_init() is called; reported by spdk_env_dpdk_external_init(). */
static bool g_external_init = true;
57 
/*
 * printf-style formatting into a freshly malloc()ed string.
 *
 * Starts with a 32-byte buffer and doubles it until the formatted text
 * (including the terminating NUL) fits. Buffers larger than 1 MiB are never
 * tried, so oversized results yield NULL.
 *
 * \param format printf-style format string, followed by its arguments.
 * \return the formatted string, which the caller must free(), or NULL if an
 * allocation fails or the result does not fit within the size cap.
 */
static char *
_sprintf_alloc(const char *format, ...)
{
	/* Upper bound on the buffer size so the retry loop always terminates. */
	const size_t max_capacity = 1024 * 1024;
	va_list ap;
	size_t capacity;

	va_start(ap, format);

	for (capacity = 32; capacity <= max_capacity; capacity *= 2) {
		va_list ap_attempt;
		char *out;
		int needed;

		out = malloc(capacity);
		if (out == NULL) {
			break;
		}

		/* Each attempt consumes its own copy of the argument list. */
		va_copy(ap_attempt, ap);
		needed = vsnprintf(out, capacity, format, ap_attempt);
		va_end(ap_attempt);

		/*
		 * The returned count excludes the NUL terminator, so a count equal
		 * to the capacity means the output was truncated.
		 */
		if (needed >= 0 && (size_t)needed < capacity) {
			va_end(ap);
			return out;
		}

		/*
		 * Do not trust the reported length (some libc versions get it wrong);
		 * simply retry with twice the space. The old contents are useless, so
		 * free() + malloc() is used instead of realloc() to avoid a copy.
		 */
		free(out);
	}

	va_end(ap);
	return NULL;
}
107 
108 static void
109 env_unlink_shared_files(void)
110 {
111 	/* Starting with DPDK 18.05, there are more files with unpredictable paths
112 	 * and filenames. The --no-shconf option prevents from creating them, but
113 	 * only for DPDK 18.08+. For DPDK 18.05 we just leave them be.
114 	 */
115 #if RTE_VERSION < RTE_VERSION_NUM(18, 05, 0, 0)
116 	char buffer[PATH_MAX];
117 
118 	snprintf(buffer, PATH_MAX, "/var/run/.spdk_pid%d_hugepage_info", getpid());
119 	if (unlink(buffer)) {
120 		fprintf(stderr, "Unable to unlink shared memory file: %s. Error code: %d\n", buffer, errno);
121 	}
122 #endif
123 }
124 
125 void
126 spdk_env_opts_init(struct spdk_env_opts *opts)
127 {
128 	if (!opts) {
129 		return;
130 	}
131 
132 	memset(opts, 0, sizeof(*opts));
133 
134 	opts->name = SPDK_ENV_DPDK_DEFAULT_NAME;
135 	opts->core_mask = SPDK_ENV_DPDK_DEFAULT_CORE_MASK;
136 	opts->shm_id = SPDK_ENV_DPDK_DEFAULT_SHM_ID;
137 	opts->mem_size = SPDK_ENV_DPDK_DEFAULT_MEM_SIZE;
138 	opts->master_core = SPDK_ENV_DPDK_DEFAULT_MASTER_CORE;
139 	opts->mem_channel = SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL;
140 	opts->base_virtaddr = SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR;
141 }
142 
/*
 * Release an argument vector: each of the first argcount strings, then the
 * array itself.
 *
 * The array pointer is only freed when argcount is non-zero, so calling this
 * with (NULL, 0) is a no-op.
 *
 * \param args array of heap-allocated strings.
 * \param argcount number of valid entries in args.
 */
static void
free_args(char **args, int argcount)
{
	int idx = 0;

	while (idx < argcount) {
		free(args[idx]);
		idx++;
	}

	if (argcount != 0) {
		free(args);
	}
}
156 
/*
 * Append one string to a growable argument vector.
 *
 * Ownership of arg passes to the vector on success. On any failure the whole
 * existing vector is released through free_args() (and arg too, when it is
 * non-NULL), so the caller must not touch either afterwards.
 *
 * \param args current vector, or NULL when empty.
 * \param argcount in/out: number of entries; incremented on success.
 * \param arg heap-allocated string to append. NULL is treated as an error,
 * which lets callers pass the result of an allocating call directly.
 * \return the (possibly moved) vector, or NULL on failure.
 */
static char **
push_arg(char *args[], int *argcount, char *arg)
{
	const int used = *argcount;
	char **grown;

	if (arg != NULL) {
		grown = realloc(args, sizeof(char *) * (used + 1));
		if (grown != NULL) {
			grown[used] = arg;
			*argcount = used + 1;
			return grown;
		}

		/* Growing failed: the new string was never stored, drop it here. */
		free(arg);
	} else {
		fprintf(stderr, "%s: NULL arg supplied\n", __func__);
	}

	/* Shared failure tail: release everything accumulated so far. */
	free_args(args, used);
	return NULL;
}
180 
#if defined(__linux__) && defined(__x86_64__)

/* TODO: Can likely get this value from rlimits in the future */
#define SPDK_IOMMU_VA_REQUIRED_WIDTH 48
/* MGAW field: six bits starting at bit 16 of the value read from intel-iommu/cap. */
#define VTD_CAP_MGAW_SHIFT 16
#define VTD_CAP_MGAW_MASK (0x3F << VTD_CAP_MGAW_SHIFT)

/*
 * Scan the "dmar*" entries under /sys/devices/virtual/iommu/ and report the
 * smallest width found.
 *
 * For each entry the hexadecimal value in intel-iommu/cap is parsed and the
 * width is taken as the MGAW field plus one. Entries whose cap file cannot be
 * opened, read or parsed are skipped.
 *
 * \return the minimum width over all readable entries, 0 if none was usable,
 * or -EINVAL if the sysfs directory cannot be opened.
 */
static int
get_iommu_width(void)
{
	DIR *iommu_dir;
	struct dirent *de;
	int min_width = 0;

	iommu_dir = opendir("/sys/devices/virtual/iommu/");
	if (iommu_dir == NULL) {
		return -EINVAL;
	}

	for (de = readdir(iommu_dir); de != NULL; de = readdir(iommu_dir)) {
		char cap_path[64];
		char line[64];
		char *got;
		FILE *cap_file;
		long long int cap;
		int path_len, mgaw;

		/* Only directories named "dmar0", "dmar1", etc are of interest. */
		if (strncmp(de->d_name, "dmar", sizeof("dmar") - 1) != 0) {
			continue;
		}

		path_len = snprintf(cap_path, sizeof(cap_path), "/sys/devices/virtual/iommu/%s/intel-iommu/cap",
				    de->d_name);
		if ((unsigned)path_len >= sizeof(cap_path)) {
			/* Path would have been truncated; skip this entry. */
			continue;
		}

		cap_file = fopen(cap_path, "r");
		if (cap_file == NULL) {
			continue;
		}

		/* Only one line is needed, so the file can be closed right after the read. */
		got = fgets(line, sizeof(line), cap_file);
		fclose(cap_file);
		if (got == NULL) {
			continue;
		}

		cap = strtoll(line, NULL, 16);
		if (cap == LLONG_MIN || cap == LLONG_MAX) {
			continue;
		}

		mgaw = ((cap & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1;
		if (min_width == 0 || mgaw < min_width) {
			min_width = mgaw;
		}
	}

	closedir(iommu_dir);

	return min_width;
}

#endif
249 
250 static int
251 build_eal_cmdline(const struct spdk_env_opts *opts)
252 {
253 	int argcount = 0;
254 	char **args;
255 
256 	args = NULL;
257 
258 	/* set the program name */
259 	args = push_arg(args, &argcount, _sprintf_alloc("%s", opts->name));
260 	if (args == NULL) {
261 		return -1;
262 	}
263 
264 	/* disable shared configuration files when in single process mode. This allows for cleaner shutdown */
265 	if (opts->shm_id < 0) {
266 		args = push_arg(args, &argcount, _sprintf_alloc("%s", "--no-shconf"));
267 		if (args == NULL) {
268 			return -1;
269 		}
270 	}
271 
272 	/* set the coremask */
273 	/* NOTE: If coremask starts with '[' and ends with ']' it is a core list
274 	 */
275 	if (opts->core_mask[0] == '[') {
276 		char *l_arg = _sprintf_alloc("-l %s", opts->core_mask + 1);
277 
278 		if (l_arg != NULL) {
279 			int len = strlen(l_arg);
280 
281 			if (l_arg[len - 1] == ']') {
282 				l_arg[len - 1] = '\0';
283 			}
284 		}
285 		args = push_arg(args, &argcount, l_arg);
286 	} else {
287 		args = push_arg(args, &argcount, _sprintf_alloc("-c %s", opts->core_mask));
288 	}
289 
290 	if (args == NULL) {
291 		return -1;
292 	}
293 
294 	/* set the memory channel number */
295 	if (opts->mem_channel > 0) {
296 		args = push_arg(args, &argcount, _sprintf_alloc("-n %d", opts->mem_channel));
297 		if (args == NULL) {
298 			return -1;
299 		}
300 	}
301 
302 	/* set the memory size */
303 	if (opts->mem_size >= 0) {
304 		args = push_arg(args, &argcount, _sprintf_alloc("-m %d", opts->mem_size));
305 		if (args == NULL) {
306 			return -1;
307 		}
308 	}
309 
310 	/* set the master core */
311 	if (opts->master_core > 0) {
312 		args = push_arg(args, &argcount, _sprintf_alloc("--master-lcore=%d",
313 				opts->master_core));
314 		if (args == NULL) {
315 			return -1;
316 		}
317 	}
318 
319 	/* set no pci  if enabled */
320 	if (opts->no_pci) {
321 		args = push_arg(args, &argcount, _sprintf_alloc("--no-pci"));
322 		if (args == NULL) {
323 			return -1;
324 		}
325 	}
326 
327 	/* create just one hugetlbfs file */
328 	if (opts->hugepage_single_segments) {
329 		args = push_arg(args, &argcount, _sprintf_alloc("--single-file-segments"));
330 		if (args == NULL) {
331 			return -1;
332 		}
333 	}
334 
335 	/* unlink hugepages after initialization */
336 	if (opts->unlink_hugepage) {
337 		args = push_arg(args, &argcount, _sprintf_alloc("--huge-unlink"));
338 		if (args == NULL) {
339 			return -1;
340 		}
341 	}
342 
343 	/* use a specific hugetlbfs mount */
344 	if (opts->hugedir) {
345 		args = push_arg(args, &argcount, _sprintf_alloc("--huge-dir=%s", opts->hugedir));
346 		if (args == NULL) {
347 			return -1;
348 		}
349 	}
350 
351 #if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0) && RTE_VERSION < RTE_VERSION_NUM(18, 5, 1, 0)
352 	/* Dynamic memory management is buggy in DPDK 18.05.0. Don't use it. */
353 	if (!opts->env_context || strstr(opts->env_context, "--legacy-mem") == NULL) {
354 		args = push_arg(args, &argcount, _sprintf_alloc("--legacy-mem"));
355 		if (args == NULL) {
356 			return -1;
357 		}
358 	}
359 #endif
360 
361 	if (opts->num_pci_addr) {
362 		size_t i;
363 		char bdf[32];
364 		struct spdk_pci_addr *pci_addr =
365 				opts->pci_blacklist ? opts->pci_blacklist : opts->pci_whitelist;
366 
367 		for (i = 0; i < opts->num_pci_addr; i++) {
368 			spdk_pci_addr_fmt(bdf, 32, &pci_addr[i]);
369 			args = push_arg(args, &argcount, _sprintf_alloc("%s=%s",
370 					(opts->pci_blacklist ? "--pci-blacklist" : "--pci-whitelist"),
371 					bdf));
372 			if (args == NULL) {
373 				return -1;
374 			}
375 		}
376 	}
377 
378 	/* The following log-level options are not understood by older DPDKs */
379 #if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
380 	/* Lower default EAL loglevel to RTE_LOG_NOTICE - normal, but significant messages.
381 	 * This can be overridden by specifying the same option in opts->env_context
382 	 */
383 	args = push_arg(args, &argcount, strdup("--log-level=lib.eal:6"));
384 	if (args == NULL) {
385 		return -1;
386 	}
387 
388 	/* Lower default CRYPTO loglevel to RTE_LOG_ERR to avoid a ton of init msgs.
389 	 * This can be overridden by specifying the same option in opts->env_context
390 	 */
391 	args = push_arg(args, &argcount, strdup("--log-level=lib.cryptodev:5"));
392 	if (args == NULL) {
393 		return -1;
394 	}
395 
396 	/* `user1` log type is used by rte_vhost, which prints an INFO log for each received
397 	 * vhost user message. We don't want that. The same log type is also used by a couple
398 	 * of other DPDK libs, but none of which we make use right now. If necessary, this can
399 	 * be overridden via opts->env_context.
400 	 */
401 	args = push_arg(args, &argcount, strdup("--log-level=user1:6"));
402 	if (args == NULL) {
403 		return -1;
404 	}
405 #endif
406 
407 	if (opts->env_context) {
408 		args = push_arg(args, &argcount, strdup(opts->env_context));
409 		if (args == NULL) {
410 			return -1;
411 		}
412 	}
413 
414 #ifdef __linux__
415 
416 	if (opts->iova_mode) {
417 		args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=%s", opts->iova_mode));
418 		if (args == NULL) {
419 			return -1;
420 		}
421 	} else {
422 		/* When using vfio with enable_unsafe_noiommu_mode=Y, we need iova-mode=pa,
423 		 * but DPDK guesses it should be iova-mode=va. Add a check and force
424 		 * iova-mode=pa here. */
425 		if (rte_vfio_noiommu_is_enabled()) {
426 			args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
427 			if (args == NULL) {
428 				return -1;
429 			}
430 		}
431 
432 #if defined(__x86_64__)
433 		/* DPDK by default guesses that it should be using iova-mode=va so that it can
434 		 * support running as an unprivileged user. However, some systems (especially
435 		 * virtual machines) don't have an IOMMU capable of handling the full virtual
436 		 * address space and DPDK doesn't currently catch that. Add a check in SPDK
437 		 * and force iova-mode=pa here. */
438 		if (get_iommu_width() < SPDK_IOMMU_VA_REQUIRED_WIDTH) {
439 			args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
440 			if (args == NULL) {
441 				return -1;
442 			}
443 		}
444 #elif defined(__PPC64__)
445 		/* On Linux + PowerPC, DPDK doesn't support VA mode at all. Unfortunately, it doesn't correctly
446 		 * auto-detect at the moment, so we'll just force it here. */
447 		args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
448 		if (args == NULL) {
449 			return -1;
450 		}
451 #endif
452 	}
453 
454 
455 	/* Set the base virtual address - it must be an address that is not in the
456 	 * ASAN shadow region, otherwise ASAN-enabled builds will ignore the
457 	 * mmap hint.
458 	 *
459 	 * Ref: https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm
460 	 */
461 	args = push_arg(args, &argcount, _sprintf_alloc("--base-virtaddr=0x%" PRIx64, opts->base_virtaddr));
462 	if (args == NULL) {
463 		return -1;
464 	}
465 
466 	/* --match-allocation prevents DPDK from merging or splitting system memory allocations under the hood.
467 	 * This is critical for RDMA when attempting to use an rte_mempool based buffer pool. If DPDK merges two
468 	 * physically or IOVA contiguous memory regions, then when we go to allocate a buffer pool, it can split
469 	 * the memory for a buffer over two allocations meaning the buffer will be split over a memory region.
470 	 */
471 #if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0)
472 	if (!opts->env_context || strstr(opts->env_context, "--legacy-mem") == NULL) {
473 		args = push_arg(args, &argcount, _sprintf_alloc("%s", "--match-allocations"));
474 		if (args == NULL) {
475 			return -1;
476 		}
477 	}
478 #endif
479 
480 	if (opts->shm_id < 0) {
481 		args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk_pid%d",
482 				getpid()));
483 		if (args == NULL) {
484 			return -1;
485 		}
486 	} else {
487 		args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk%d",
488 				opts->shm_id));
489 		if (args == NULL) {
490 			return -1;
491 		}
492 
493 		/* set the process type */
494 		args = push_arg(args, &argcount, _sprintf_alloc("--proc-type=auto"));
495 		if (args == NULL) {
496 			return -1;
497 		}
498 	}
499 #endif
500 
501 	g_eal_cmdline = args;
502 	g_eal_cmdline_argcount = argcount;
503 	return argcount;
504 }
505 
/*
 * Bring up the SPDK env pieces that sit on top of DPDK: the PCI environment,
 * the memory map, and vtophys, in that order.
 *
 * \param legacy_mem forwarded unchanged to mem_map_init().
 * \return 0 on success, or the negative value returned by the step that
 * failed (a message is printed to stderr in that case).
 */
int
spdk_env_dpdk_post_init(bool legacy_mem)
{
	int status;

	pci_env_init();

	status = mem_map_init(legacy_mem);
	if (status < 0) {
		fprintf(stderr, "Failed to allocate mem_map\n");
	} else {
		status = vtophys_init();
		if (status < 0) {
			fprintf(stderr, "Failed to initialize vtophys\n");
		} else {
			/* Normalize any non-negative result to plain success. */
			status = 0;
		}
	}

	return status;
}
527 
528 void
529 spdk_env_dpdk_post_fini(void)
530 {
531 	pci_env_fini();
532 
533 	free_args(g_eal_cmdline, g_eal_cmdline_argcount);
534 }
535 
536 int
537 spdk_env_init(const struct spdk_env_opts *opts)
538 {
539 	char **dpdk_args = NULL;
540 	int i, rc;
541 	int orig_optind;
542 	bool legacy_mem;
543 
544 	g_external_init = false;
545 
546 	rc = build_eal_cmdline(opts);
547 	if (rc < 0) {
548 		fprintf(stderr, "Invalid arguments to initialize DPDK\n");
549 		return -EINVAL;
550 	}
551 
552 	printf("Starting %s / %s initialization...\n", SPDK_VERSION_STRING, rte_version());
553 	printf("[ DPDK EAL parameters: ");
554 	for (i = 0; i < g_eal_cmdline_argcount; i++) {
555 		printf("%s ", g_eal_cmdline[i]);
556 	}
557 	printf("]\n");
558 
559 	/* DPDK rearranges the array we pass to it, so make a copy
560 	 * before passing so we can still free the individual strings
561 	 * correctly.
562 	 */
563 	dpdk_args = calloc(g_eal_cmdline_argcount, sizeof(char *));
564 	if (dpdk_args == NULL) {
565 		fprintf(stderr, "Failed to allocate dpdk_args\n");
566 		return -ENOMEM;
567 	}
568 	memcpy(dpdk_args, g_eal_cmdline, sizeof(char *) * g_eal_cmdline_argcount);
569 
570 	fflush(stdout);
571 	orig_optind = optind;
572 	optind = 1;
573 	rc = rte_eal_init(g_eal_cmdline_argcount, dpdk_args);
574 	optind = orig_optind;
575 
576 	free(dpdk_args);
577 
578 	if (rc < 0) {
579 		if (rte_errno == EALREADY) {
580 			fprintf(stderr, "DPDK already initialized\n");
581 		} else {
582 			fprintf(stderr, "Failed to initialize DPDK\n");
583 		}
584 		return -rte_errno;
585 	}
586 
587 	if (opts->shm_id < 0 && !opts->hugepage_single_segments) {
588 		/*
589 		 * Unlink hugepage and config info files after init.  This will ensure they get
590 		 *  deleted on app exit, even if the app crashes and does not exit normally.
591 		 *  Only do this when not in multi-process mode, since for multi-process other
592 		 *  apps will need to open these files. These files are not created for
593 		 *  "single file segments".
594 		 */
595 		env_unlink_shared_files();
596 	}
597 
598 	legacy_mem = false;
599 	if (opts->env_context && strstr(opts->env_context, "--legacy-mem") != NULL) {
600 		legacy_mem = true;
601 	}
602 
603 	return spdk_env_dpdk_post_init(legacy_mem);
604 }
605 
/*
 * Tear down the environment set up by spdk_env_init().
 *
 * All of the work is delegated to spdk_env_dpdk_post_fini().
 */
606 void
607 spdk_env_fini(void)
608 {
609 	spdk_env_dpdk_post_fini();
610 }
611 
/*
 * Report whether DPDK was initialized outside of this library.
 *
 * \return the value of g_external_init, which starts out true and is set to
 * false as soon as spdk_env_init() is called.
 */
612 bool
613 spdk_env_dpdk_external_init(void)
614 {
615 	return g_external_init;
616 }
617