xref: /spdk/lib/env_dpdk/init.c (revision e316ec90b21eb3fea2dfa930261266dceb1f05aa)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "env_internal.h"
37 
38 #include "spdk/version.h"
39 #include "spdk/env_dpdk.h"
40 
41 #include <rte_config.h>
42 #include <rte_eal.h>
43 #include <rte_errno.h>
44 #include <rte_vfio.h>
45 
/*
 * Default option values installed by spdk_env_opts_init().
 *
 * The -1 values act as "not specified" markers for spdk_build_eal_cmdline():
 * a negative shm_id selects the single-process options, and the memory size,
 * master core and memory channel arguments are only emitted when the
 * corresponding field is non-negative / positive.
 */
#define SPDK_ENV_DPDK_DEFAULT_NAME		"spdk"
#define SPDK_ENV_DPDK_DEFAULT_SHM_ID		-1
#define SPDK_ENV_DPDK_DEFAULT_MEM_SIZE		-1
#define SPDK_ENV_DPDK_DEFAULT_MASTER_CORE	-1
#define SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL	-1
#define SPDK_ENV_DPDK_DEFAULT_CORE_MASK		"0x1"

/* EAL argument vector built by spdk_build_eal_cmdline(); released in spdk_env_dpdk_post_fini(). */
static char **g_eal_cmdline;
/* Number of entries in g_eal_cmdline. */
static int g_eal_cmdline_argcount;
/* Stays true until spdk_env_init() runs; reported by spdk_env_dpdk_external_init(). */
static bool g_external_init = true;
56 
/*
 * printf-style formatting into a freshly malloc()ed string.
 *
 * Returns the formatted string (caller frees it), or NULL if memory could not
 * be allocated or the result does not fit in the largest buffer tried (1 MiB).
 */
static char *
_sprintf_alloc(const char *format, ...)
{
	/* Upper bound on the buffer size so the retry loop always terminates. */
	const size_t max_capacity = 1024 * 1024;
	size_t capacity;
	va_list ap;

	va_start(ap, format);

	/* Start small and double the buffer until the output fits. */
	for (capacity = 32; capacity <= max_capacity; capacity *= 2) {
		va_list ap_try;
		char *out;
		int written;

		out = malloc(capacity);
		if (out == NULL) {
			break;
		}

		/* Format from a copy so the original list can be reused on the next attempt. */
		va_copy(ap_try, ap);
		written = vsnprintf(out, capacity, format, ap_try);
		va_end(ap_try);

		/*
		 * The returned count excludes the terminating \0, so a count equal
		 * to the capacity means the output was truncated.
		 */
		if (written >= 0 && (size_t)written < capacity) {
			va_end(ap);
			return out;
		}

		/*
		 * Some libc versions do not report the required size correctly, so
		 * the count is not trusted; just retry with a bigger buffer. The old
		 * contents are useless, hence free() + malloc() instead of realloc().
		 */
		free(out);
	}

	va_end(ap);
	return NULL;
}
106 
107 static void
108 spdk_env_unlink_shared_files(void)
109 {
110 	/* Starting with DPDK 18.05, there are more files with unpredictable paths
111 	 * and filenames. The --no-shconf option prevents from creating them, but
112 	 * only for DPDK 18.08+. For DPDK 18.05 we just leave them be.
113 	 */
114 #if RTE_VERSION < RTE_VERSION_NUM(18, 05, 0, 0)
115 	char buffer[PATH_MAX];
116 
117 	snprintf(buffer, PATH_MAX, "/var/run/.spdk_pid%d_hugepage_info", getpid());
118 	if (unlink(buffer)) {
119 		fprintf(stderr, "Unable to unlink shared memory file: %s. Error code: %d\n", buffer, errno);
120 	}
121 #endif
122 }
123 
124 void
125 spdk_env_opts_init(struct spdk_env_opts *opts)
126 {
127 	if (!opts) {
128 		return;
129 	}
130 
131 	memset(opts, 0, sizeof(*opts));
132 
133 	opts->name = SPDK_ENV_DPDK_DEFAULT_NAME;
134 	opts->core_mask = SPDK_ENV_DPDK_DEFAULT_CORE_MASK;
135 	opts->shm_id = SPDK_ENV_DPDK_DEFAULT_SHM_ID;
136 	opts->mem_size = SPDK_ENV_DPDK_DEFAULT_MEM_SIZE;
137 	opts->master_core = SPDK_ENV_DPDK_DEFAULT_MASTER_CORE;
138 	opts->mem_channel = SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL;
139 }
140 
/*
 * Release an argument vector: each of the first argcount strings, then the
 * array itself. Nothing is freed when argcount is zero.
 */
static void
spdk_free_args(char **args, int argcount)
{
	int idx;

	if (argcount == 0) {
		return;
	}

	for (idx = 0; idx < argcount; idx++) {
		free(args[idx]);
	}

	free(args);
}
154 
/*
 * Append arg to the argument vector args, growing the array by one slot.
 *
 * On success the (possibly moved) array is returned, *argcount is incremented
 * and the array owns arg. On failure - arg is NULL or the array cannot be
 * grown - arg and the whole existing vector are freed and NULL is returned,
 * so the caller must not touch args afterwards.
 */
static char **
spdk_push_arg(char *args[], int *argcount, char *arg)
{
	char **grown;
	int count = *argcount;

	/* A NULL arg normally means the caller's allocation of the string failed. */
	if (arg == NULL) {
		fprintf(stderr, "%s: NULL arg supplied\n", __func__);
		spdk_free_args(args, count);
		return NULL;
	}

	grown = realloc(args, sizeof(char *) * (count + 1));
	if (grown == NULL) {
		free(arg);
		spdk_free_args(args, count);
		return NULL;
	}

	grown[count] = arg;
	*argcount = count + 1;

	return grown;
}
178 
179 #if defined(__linux__) && defined(__x86_64__)
180 
/* TODO: Can likely get this value from rlimits in the future */
#define SPDK_IOMMU_VA_REQUIRED_WIDTH 48
/* Position and mask of the 6-bit MGAW field inside the IOMMU capability value. */
#define VTD_CAP_MGAW_SHIFT 16
#define VTD_CAP_MGAW_MASK (0x3F << VTD_CAP_MGAW_SHIFT)

/*
 * Determine the smallest address width advertised by the Intel IOMMUs listed
 * under /sys/devices/virtual/iommu/.
 *
 * For every "dmar*" entry the hexadecimal value in intel-iommu/cap is read and
 * the MGAW field is extracted; the width is that field plus one. Entries whose
 * cap file cannot be opened, read or parsed are skipped.
 *
 * Returns the minimum width found, 0 if no usable entry exists, or -EINVAL if
 * the sysfs directory cannot be opened.
 */
static int
spdk_get_iommu_width(void)
{
	DIR *dir;
	struct dirent *entry;
	int width = 0;

	dir = opendir("/sys/devices/virtual/iommu/");
	if (dir == NULL) {
		return -EINVAL;
	}

	while ((entry = readdir(dir)) != NULL) {
		char mgaw_path[64];
		char buf[64];
		char *end;
		char *line;
		FILE *file;
		unsigned long long cap;
		int rc, mgaw;

		/* Find directories named "dmar0", "dmar1", etc */
		if (strncmp(entry->d_name, "dmar", sizeof("dmar") - 1) != 0) {
			continue;
		}

		rc = snprintf(mgaw_path, sizeof(mgaw_path), "/sys/devices/virtual/iommu/%s/intel-iommu/cap",
			      entry->d_name);
		if (rc < 0 || (size_t)rc >= sizeof(mgaw_path)) {
			/* Path was truncated (or formatting failed); skip this entry. */
			continue;
		}

		file = fopen(mgaw_path, "r");
		if (file == NULL) {
			continue;
		}

		/* The file is only needed for this single read, so close it right away. */
		line = fgets(buf, sizeof(buf), file);
		fclose(file);
		if (line == NULL) {
			continue;
		}

		/*
		 * The capability value is a full 64-bit quantity, so it has to be
		 * parsed as unsigned: a signed conversion saturates as soon as the
		 * top bit is set and the entry would be wrongly discarded. Also
		 * reject input with no hex digits at all, which would otherwise be
		 * read as 0 and reported as a width of 1.
		 */
		errno = 0;
		cap = strtoull(buf, &end, 16);
		if (end == buf || errno == ERANGE) {
			continue;
		}

		mgaw = (int)((cap & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1;
		if (width == 0 || mgaw < width) {
			width = mgaw;
		}
	}

	closedir(dir);

	return width;
}
245 
246 #endif
247 
248 static int
249 spdk_build_eal_cmdline(const struct spdk_env_opts *opts)
250 {
251 	int argcount = 0;
252 	char **args;
253 
254 	args = NULL;
255 
256 	/* set the program name */
257 	args = spdk_push_arg(args, &argcount, _sprintf_alloc("%s", opts->name));
258 	if (args == NULL) {
259 		return -1;
260 	}
261 
262 	/* disable shared configuration files when in single process mode. This allows for cleaner shutdown */
263 	if (opts->shm_id < 0) {
264 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("%s", "--no-shconf"));
265 		if (args == NULL) {
266 			return -1;
267 		}
268 	}
269 
270 	/* set the coremask */
271 	/* NOTE: If coremask starts with '[' and ends with ']' it is a core list
272 	 */
273 	if (opts->core_mask[0] == '[') {
274 		char *l_arg = _sprintf_alloc("-l %s", opts->core_mask + 1);
275 
276 		if (l_arg != NULL) {
277 			int len = strlen(l_arg);
278 
279 			if (l_arg[len - 1] == ']') {
280 				l_arg[len - 1] = '\0';
281 			}
282 		}
283 		args = spdk_push_arg(args, &argcount, l_arg);
284 	} else {
285 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("-c %s", opts->core_mask));
286 	}
287 
288 	if (args == NULL) {
289 		return -1;
290 	}
291 
292 	/* set the memory channel number */
293 	if (opts->mem_channel > 0) {
294 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("-n %d", opts->mem_channel));
295 		if (args == NULL) {
296 			return -1;
297 		}
298 	}
299 
300 	/* set the memory size */
301 	if (opts->mem_size >= 0) {
302 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("-m %d", opts->mem_size));
303 		if (args == NULL) {
304 			return -1;
305 		}
306 	}
307 
308 	/* set the master core */
309 	if (opts->master_core > 0) {
310 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("--master-lcore=%d",
311 				     opts->master_core));
312 		if (args == NULL) {
313 			return -1;
314 		}
315 	}
316 
317 	/* set no pci  if enabled */
318 	if (opts->no_pci) {
319 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("--no-pci"));
320 		if (args == NULL) {
321 			return -1;
322 		}
323 	}
324 
325 	/* create just one hugetlbfs file */
326 	if (opts->hugepage_single_segments) {
327 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("--single-file-segments"));
328 		if (args == NULL) {
329 			return -1;
330 		}
331 	}
332 
333 	/* unlink hugepages after initialization */
334 	if (opts->unlink_hugepage) {
335 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("--huge-unlink"));
336 		if (args == NULL) {
337 			return -1;
338 		}
339 	}
340 
341 	/* use a specific hugetlbfs mount */
342 	if (opts->hugedir) {
343 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("--huge-dir=%s", opts->hugedir));
344 		if (args == NULL) {
345 			return -1;
346 		}
347 	}
348 
349 #if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0) && RTE_VERSION < RTE_VERSION_NUM(18, 5, 1, 0)
350 	/* Dynamic memory management is buggy in DPDK 18.05.0. Don't use it. */
351 	if (!opts->env_context || strstr(opts->env_context, "--legacy-mem") == NULL) {
352 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("--legacy-mem"));
353 		if (args == NULL) {
354 			return -1;
355 		}
356 	}
357 #endif
358 
359 	if (opts->num_pci_addr) {
360 		size_t i;
361 		char bdf[32];
362 		struct spdk_pci_addr *pci_addr =
363 				opts->pci_blacklist ? opts->pci_blacklist : opts->pci_whitelist;
364 
365 		for (i = 0; i < opts->num_pci_addr; i++) {
366 			spdk_pci_addr_fmt(bdf, 32, &pci_addr[i]);
367 			args = spdk_push_arg(args, &argcount, _sprintf_alloc("%s=%s",
368 					     (opts->pci_blacklist ? "--pci-blacklist" : "--pci-whitelist"),
369 					     bdf));
370 			if (args == NULL) {
371 				return -1;
372 			}
373 		}
374 	}
375 
376 	/* The following log-level options are not understood by older DPDKs */
377 #if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
378 	/* Lower default EAL loglevel to RTE_LOG_NOTICE - normal, but significant messages.
379 	 * This can be overridden by specifying the same option in opts->env_context
380 	 */
381 	args = spdk_push_arg(args, &argcount, strdup("--log-level=lib.eal:6"));
382 	if (args == NULL) {
383 		return -1;
384 	}
385 
386 	/* Lower default CRYPTO loglevel to RTE_LOG_ERR to avoid a ton of init msgs.
387 	 * This can be overridden by specifying the same option in opts->env_context
388 	 */
389 	args = spdk_push_arg(args, &argcount, strdup("--log-level=lib.cryptodev:5"));
390 	if (args == NULL) {
391 		return -1;
392 	}
393 
394 	/* `user1` log type is used by rte_vhost, which prints an INFO log for each received
395 	 * vhost user message. We don't want that. The same log type is also used by a couple
396 	 * of other DPDK libs, but none of which we make use right now. If necessary, this can
397 	 * be overridden via opts->env_context.
398 	 */
399 	args = spdk_push_arg(args, &argcount, strdup("--log-level=user1:6"));
400 	if (args == NULL) {
401 		return -1;
402 	}
403 #endif
404 
405 	if (opts->env_context) {
406 		args = spdk_push_arg(args, &argcount, strdup(opts->env_context));
407 		if (args == NULL) {
408 			return -1;
409 		}
410 	}
411 
412 #ifdef __linux__
413 
414 	/* When using vfio with enable_unsafe_noiommu_mode=Y, we need iova-mode=pa,
415 	 * but DPDK guesses it should be iova-mode=va. Add a check and force
416 	 * iova-mode=pa here. */
417 	if (rte_vfio_noiommu_is_enabled()) {
418 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
419 		if (args == NULL) {
420 			return -1;
421 		}
422 	}
423 
424 #if defined(__x86_64__)
425 	/* DPDK by default guesses that it should be using iova-mode=va so that it can
426 	 * support running as an unprivileged user. However, some systems (especially
427 	 * virtual machines) don't have an IOMMU capable of handling the full virtual
428 	 * address space and DPDK doesn't currently catch that. Add a check in SPDK
429 	 * and force iova-mode=pa here. */
430 	if (spdk_get_iommu_width() < SPDK_IOMMU_VA_REQUIRED_WIDTH) {
431 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
432 		if (args == NULL) {
433 			return -1;
434 		}
435 	}
436 #elif defined(__PPC64__)
437 	/* On Linux + PowerPC, DPDK doesn't support VA mode at all. Unfortunately, it doesn't correctly
438 	 * auto-detect at the moment, so we'll just force it here. */
439 	args = spdk_push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
440 	if (args == NULL) {
441 		return -1;
442 	}
443 #endif
444 
445 
446 	/* Set the base virtual address - it must be an address that is not in the
447 	 * ASAN shadow region, otherwise ASAN-enabled builds will ignore the
448 	 * mmap hint.
449 	 *
450 	 * Ref: https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm
451 	 */
452 	args = spdk_push_arg(args, &argcount, _sprintf_alloc("--base-virtaddr=0x200000000000"));
453 	if (args == NULL) {
454 		return -1;
455 	}
456 
457 	/* --match-allocation prevents DPDK from merging or splitting system memory allocations under the hood.
458 	 * This is critical for RDMA when attempting to use an rte_mempool based buffer pool. If DPDK merges two
459 	 * physically or IOVA contiguous memory regions, then when we go to allocate a buffer pool, it can split
460 	 * the memory for a buffer over two allocations meaning the buffer will be split over a memory region.
461 	 */
462 #if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0)
463 	if (!opts->env_context || strstr(opts->env_context, "--legacy-mem") == NULL) {
464 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("%s", "--match-allocations"));
465 		if (args == NULL) {
466 			return -1;
467 		}
468 	}
469 #endif
470 
471 	if (opts->shm_id < 0) {
472 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk_pid%d",
473 				     getpid()));
474 		if (args == NULL) {
475 			return -1;
476 		}
477 	} else {
478 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk%d",
479 				     opts->shm_id));
480 		if (args == NULL) {
481 			return -1;
482 		}
483 
484 		/* set the process type */
485 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("--proc-type=auto"));
486 		if (args == NULL) {
487 			return -1;
488 		}
489 	}
490 #endif
491 
492 	g_eal_cmdline = args;
493 	g_eal_cmdline_argcount = argcount;
494 	return argcount;
495 }
496 
/*
 * Bring up the SPDK environment pieces that follow DPDK EAL initialization:
 * PCI, the memory map (passed legacy_mem) and vtophys, in that order.
 *
 * Returns 0 on success, or the negative value returned by the step that
 * failed; the failure is also reported on stderr.
 */
int
spdk_env_dpdk_post_init(bool legacy_mem)
{
	int status;

	spdk_pci_init();

	status = spdk_mem_map_init(legacy_mem);
	if (status < 0) {
		fprintf(stderr, "Failed to allocate mem_map\n");
	} else {
		status = spdk_vtophys_init();
		if (status < 0) {
			fprintf(stderr, "Failed to initialize vtophys\n");
		} else {
			status = 0;
		}
	}

	return status;
}
518 
519 void
520 spdk_env_dpdk_post_fini(void)
521 {
522 	spdk_pci_fini();
523 
524 	spdk_free_args(g_eal_cmdline, g_eal_cmdline_argcount);
525 }
526 
527 int
528 spdk_env_init(const struct spdk_env_opts *opts)
529 {
530 	char **dpdk_args = NULL;
531 	int i, rc;
532 	int orig_optind;
533 	bool legacy_mem;
534 
535 	g_external_init = false;
536 
537 	rc = spdk_build_eal_cmdline(opts);
538 	if (rc < 0) {
539 		fprintf(stderr, "Invalid arguments to initialize DPDK\n");
540 		return -EINVAL;
541 	}
542 
543 	printf("Starting %s / %s initialization...\n", SPDK_VERSION_STRING, rte_version());
544 	printf("[ DPDK EAL parameters: ");
545 	for (i = 0; i < g_eal_cmdline_argcount; i++) {
546 		printf("%s ", g_eal_cmdline[i]);
547 	}
548 	printf("]\n");
549 
550 	/* DPDK rearranges the array we pass to it, so make a copy
551 	 * before passing so we can still free the individual strings
552 	 * correctly.
553 	 */
554 	dpdk_args = calloc(g_eal_cmdline_argcount, sizeof(char *));
555 	if (dpdk_args == NULL) {
556 		fprintf(stderr, "Failed to allocate dpdk_args\n");
557 		return -ENOMEM;
558 	}
559 	memcpy(dpdk_args, g_eal_cmdline, sizeof(char *) * g_eal_cmdline_argcount);
560 
561 	fflush(stdout);
562 	orig_optind = optind;
563 	optind = 1;
564 	rc = rte_eal_init(g_eal_cmdline_argcount, dpdk_args);
565 	optind = orig_optind;
566 
567 	free(dpdk_args);
568 
569 	if (rc < 0) {
570 		if (rte_errno == EALREADY) {
571 			fprintf(stderr, "DPDK already initialized\n");
572 		} else {
573 			fprintf(stderr, "Failed to initialize DPDK\n");
574 		}
575 		return -rte_errno;
576 	}
577 
578 	if (opts->shm_id < 0 && !opts->hugepage_single_segments) {
579 		/*
580 		 * Unlink hugepage and config info files after init.  This will ensure they get
581 		 *  deleted on app exit, even if the app crashes and does not exit normally.
582 		 *  Only do this when not in multi-process mode, since for multi-process other
583 		 *  apps will need to open these files. These files are not created for
584 		 *  "single file segments".
585 		 */
586 		spdk_env_unlink_shared_files();
587 	}
588 
589 	legacy_mem = false;
590 	if (opts->env_context && strstr(opts->env_context, "--legacy-mem") != NULL) {
591 		legacy_mem = true;
592 	}
593 
594 	return spdk_env_dpdk_post_init(legacy_mem);
595 }
596 
/*
 * Tear down the environment. All of the work is delegated to
 * spdk_env_dpdk_post_fini().
 */
void
spdk_env_fini(void)
{
	spdk_env_dpdk_post_fini();
}
602 
/*
 * Report the g_external_init flag: true as long as spdk_env_init() has not
 * been called in this process, false afterwards.
 */
bool
spdk_env_dpdk_external_init(void)
{
	return g_external_init;
}
608