xref: /spdk/lib/env_dpdk/init.c (revision 1914de09202410b6881d8bcff916ce78fb749ad6)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "env_internal.h"
37 
38 #include "spdk/version.h"
39 #include "spdk/env_dpdk.h"
40 
41 #include <rte_config.h>
42 #include <rte_eal.h>
43 #include <rte_errno.h>
44 
/*
 * Default values written into struct spdk_env_opts by spdk_env_opts_init().
 *
 * spdk_build_eal_cmdline() only emits the matching EAL option when shm_id and
 * mem_size are non-negative and when master_core and mem_channel are positive,
 * so the -1 defaults below mean "do not pass this option to DPDK".
 */
#define SPDK_ENV_DPDK_DEFAULT_NAME		"spdk"
#define SPDK_ENV_DPDK_DEFAULT_SHM_ID		-1
#define SPDK_ENV_DPDK_DEFAULT_MEM_SIZE		-1
#define SPDK_ENV_DPDK_DEFAULT_MASTER_CORE	-1
#define SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL	-1
#define SPDK_ENV_DPDK_DEFAULT_CORE_MASK		"0x1"

/* EAL argument vector built by spdk_build_eal_cmdline(); released in spdk_env_dpdk_post_fini(). */
static char **g_eal_cmdline;
/* Number of entries in g_eal_cmdline. */
static int g_eal_cmdline_argcount;
/* Stays true until spdk_env_init() runs; reported by spdk_env_dpdk_external_init(). */
static bool g_external_init = true;
55 
/*
 * Format a string into a freshly malloc()ed buffer.
 *
 * The buffer starts at 32 bytes and is doubled until the formatted output
 * (including its terminator) fits. The search stops once the buffer size
 * would exceed 1 MiB so that a misbehaving vsnprintf() cannot loop forever.
 *
 * Returns the formatted string, which the caller must free(), or NULL if an
 * allocation failed or the output does not fit within the size limit.
 */
static char *
_sprintf_alloc(const char *format, ...)
{
	const size_t max_size = 1024 * 1024;
	va_list ap;
	char *result = NULL;
	size_t cap;

	va_start(ap, format);

	for (cap = 32; cap <= max_size; cap *= 2) {
		va_list ap_try;
		char *candidate;
		int written;

		candidate = malloc(cap);
		if (candidate == NULL) {
			break;
		}

		/* Each attempt consumes its own copy of the argument list. */
		va_copy(ap_try, ap);
		written = vsnprintf(candidate, cap, format, ap_try);
		va_end(ap_try);

		/*
		 * The returned count excludes the terminator, so only a count strictly
		 * below the buffer size means the whole string was stored.
		 */
		if (written >= 0 && (size_t)written < cap) {
			result = candidate;
			break;
		}

		/*
		 * Some libc versions do not report the required size reliably, so the
		 * count is not trusted: drop this buffer (its contents are not needed,
		 * hence no realloc()) and retry with twice the space.
		 */
		free(candidate);
	}

	va_end(ap);
	return result;
}
105 
106 static void
107 spdk_env_unlink_shared_files(void)
108 {
109 	/* Starting with DPDK 18.05, there are more files with unpredictable paths
110 	 * and filenames. The --no-shconf option prevents from creating them, but
111 	 * only for DPDK 18.08+. For DPDK 18.05 we just leave them be.
112 	 */
113 #if RTE_VERSION < RTE_VERSION_NUM(18, 05, 0, 0)
114 	char buffer[PATH_MAX];
115 
116 	snprintf(buffer, PATH_MAX, "/var/run/.spdk_pid%d_hugepage_info", getpid());
117 	if (unlink(buffer)) {
118 		fprintf(stderr, "Unable to unlink shared memory file: %s. Error code: %d\n", buffer, errno);
119 	}
120 #endif
121 }
122 
123 void
124 spdk_env_opts_init(struct spdk_env_opts *opts)
125 {
126 	if (!opts) {
127 		return;
128 	}
129 
130 	memset(opts, 0, sizeof(*opts));
131 
132 	opts->name = SPDK_ENV_DPDK_DEFAULT_NAME;
133 	opts->core_mask = SPDK_ENV_DPDK_DEFAULT_CORE_MASK;
134 	opts->shm_id = SPDK_ENV_DPDK_DEFAULT_SHM_ID;
135 	opts->mem_size = SPDK_ENV_DPDK_DEFAULT_MEM_SIZE;
136 	opts->master_core = SPDK_ENV_DPDK_DEFAULT_MASTER_CORE;
137 	opts->mem_channel = SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL;
138 }
139 
140 static void
141 spdk_free_args(char **args, int argcount)
142 {
143 	int i;
144 
145 	for (i = 0; i < argcount; i++) {
146 		free(args[i]);
147 	}
148 
149 	if (argcount) {
150 		free(args);
151 	}
152 }
153 
/*
 * Append one heap-allocated string to an argument vector.
 *
 * args     - current vector (may be NULL when *argcount is 0)
 * argcount - in/out number of entries in args
 * arg      - string to append; ownership passes to the vector
 *
 * Returns the (possibly moved) vector on success. On any failure - arg is
 * NULL or the vector cannot be grown - arg and the whole existing vector are
 * freed and NULL is returned, so the caller has nothing left to clean up.
 */
static char **
spdk_push_arg(char *args[], int *argcount, char *arg)
{
	char **grown;
	int count = *argcount;

	if (arg != NULL) {
		grown = realloc(args, sizeof(char *) * (count + 1));
		if (grown != NULL) {
			grown[count] = arg;
			*argcount = count + 1;
			return grown;
		}

		/* Growing failed: the new string has no home, drop it. */
		free(arg);
	} else {
		fprintf(stderr, "%s: NULL arg supplied\n", __func__);
	}

	/* Shared failure path: release everything collected so far. */
	spdk_free_args(args, count);
	return NULL;
}
177 
178 #if defined(__linux__) && defined(__x86_64__)
179 
/* TODO: Can likely get this value from rlimits in the future */
#define SPDK_IOMMU_VA_REQUIRED_WIDTH 48
#define VTD_CAP_MGAW_SHIFT 16
#define VTD_CAP_MGAW_MASK (0x3F << VTD_CAP_MGAW_SHIFT)

/*
 * Determine the narrowest address width among the "dmarN" entries found
 * under /sys/devices/virtual/iommu/.
 *
 * For every such entry the hexadecimal value in "intel-iommu/cap" is read,
 * the 6-bit field selected by VTD_CAP_MGAW_MASK is extracted, and the width
 * is that field plus one. Entries that cannot be opened, read or parsed are
 * skipped.
 *
 * Returns the smallest width found, 0 if no entry produced a value, or
 * -EINVAL if the iommu directory itself cannot be opened.
 */
static int
spdk_get_iommu_width(void)
{
	DIR *dir;
	struct dirent *entry;
	int width = 0;

	dir = opendir("/sys/devices/virtual/iommu/");
	if (dir == NULL) {
		return -EINVAL;
	}

	while ((entry = readdir(dir)) != NULL) {
		char mgaw_path[64];
		char buf[64];
		char *line;
		char *end;
		FILE *file;
		unsigned long long val;
		int tmp;

		/* Find directories named "dmar0", "dmar1", etc */
		if (strncmp(entry->d_name, "dmar", sizeof("dmar") - 1) != 0) {
			continue;
		}

		tmp = snprintf(mgaw_path, sizeof(mgaw_path), "/sys/devices/virtual/iommu/%s/intel-iommu/cap",
			       entry->d_name);
		if (tmp < 0 || (size_t)tmp >= sizeof(mgaw_path)) {
			/* Formatting failed or the path was truncated. */
			continue;
		}

		file = fopen(mgaw_path, "r");
		if (file == NULL) {
			continue;
		}

		/* Only the first line is needed, so the file can be closed right away. */
		line = fgets(buf, sizeof(buf), file);
		fclose(file);
		if (line == NULL) {
			continue;
		}

		/*
		 * Parse as unsigned: the register value is a full 64-bit quantity, and a
		 * signed conversion would saturate (and be discarded) whenever the top
		 * bit is set. Also reject input that contains no hex digits at all
		 * instead of treating it as 0.
		 */
		errno = 0;
		val = strtoull(buf, &end, 16);
		if (end == buf || errno == ERANGE) {
			continue;
		}

		tmp = (int)((val & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1;
		if (width == 0 || tmp < width) {
			width = tmp;
		}
	}

	closedir(dir);

	return width;
}
244 
245 #endif
246 
247 static int
248 spdk_build_eal_cmdline(const struct spdk_env_opts *opts)
249 {
250 	int argcount = 0;
251 	char **args;
252 
253 	args = NULL;
254 
255 	/* set the program name */
256 	args = spdk_push_arg(args, &argcount, _sprintf_alloc("%s", opts->name));
257 	if (args == NULL) {
258 		return -1;
259 	}
260 
261 	/* disable shared configuration files when in single process mode. This allows for cleaner shutdown */
262 	if (opts->shm_id < 0) {
263 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("%s", "--no-shconf"));
264 		if (args == NULL) {
265 			return -1;
266 		}
267 	}
268 
269 	/* set the coremask */
270 	/* NOTE: If coremask starts with '[' and ends with ']' it is a core list
271 	 */
272 	if (opts->core_mask[0] == '[') {
273 		char *l_arg = _sprintf_alloc("-l %s", opts->core_mask + 1);
274 
275 		if (l_arg != NULL) {
276 			int len = strlen(l_arg);
277 
278 			if (l_arg[len - 1] == ']') {
279 				l_arg[len - 1] = '\0';
280 			}
281 		}
282 		args = spdk_push_arg(args, &argcount, l_arg);
283 	} else {
284 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("-c %s", opts->core_mask));
285 	}
286 
287 	if (args == NULL) {
288 		return -1;
289 	}
290 
291 	/* set the memory channel number */
292 	if (opts->mem_channel > 0) {
293 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("-n %d", opts->mem_channel));
294 		if (args == NULL) {
295 			return -1;
296 		}
297 	}
298 
299 	/* set the memory size */
300 	if (opts->mem_size >= 0) {
301 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("-m %d", opts->mem_size));
302 		if (args == NULL) {
303 			return -1;
304 		}
305 	}
306 
307 	/* set the master core */
308 	if (opts->master_core > 0) {
309 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("--master-lcore=%d",
310 				     opts->master_core));
311 		if (args == NULL) {
312 			return -1;
313 		}
314 	}
315 
316 	/* set no pci  if enabled */
317 	if (opts->no_pci) {
318 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("--no-pci"));
319 		if (args == NULL) {
320 			return -1;
321 		}
322 	}
323 
324 	/* create just one hugetlbfs file */
325 	if (opts->hugepage_single_segments) {
326 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("--single-file-segments"));
327 		if (args == NULL) {
328 			return -1;
329 		}
330 	}
331 
332 	/* unlink hugepages after initialization */
333 	if (opts->unlink_hugepage) {
334 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("--huge-unlink"));
335 		if (args == NULL) {
336 			return -1;
337 		}
338 	}
339 
340 	/* use a specific hugetlbfs mount */
341 	if (opts->hugedir) {
342 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("--huge-dir=%s", opts->hugedir));
343 		if (args == NULL) {
344 			return -1;
345 		}
346 	}
347 
348 #if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0) && RTE_VERSION < RTE_VERSION_NUM(18, 5, 1, 0)
349 	/* Dynamic memory management is buggy in DPDK 18.05.0. Don't use it. */
350 	if (!opts->env_context || strstr(opts->env_context, "--legacy-mem") == NULL) {
351 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("--legacy-mem"));
352 		if (args == NULL) {
353 			return -1;
354 		}
355 	}
356 #endif
357 
358 	if (opts->num_pci_addr) {
359 		size_t i;
360 		char bdf[32];
361 		struct spdk_pci_addr *pci_addr =
362 				opts->pci_blacklist ? opts->pci_blacklist : opts->pci_whitelist;
363 
364 		for (i = 0; i < opts->num_pci_addr; i++) {
365 			spdk_pci_addr_fmt(bdf, 32, &pci_addr[i]);
366 			args = spdk_push_arg(args, &argcount, _sprintf_alloc("%s=%s",
367 					     (opts->pci_blacklist ? "--pci-blacklist" : "--pci-whitelist"),
368 					     bdf));
369 			if (args == NULL) {
370 				return -1;
371 			}
372 		}
373 	}
374 
375 	/* The following log-level options are not understood by older DPDKs */
376 #if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
377 	/* Lower default EAL loglevel to RTE_LOG_NOTICE - normal, but significant messages.
378 	 * This can be overridden by specifying the same option in opts->env_context
379 	 */
380 	args = spdk_push_arg(args, &argcount, strdup("--log-level=lib.eal:6"));
381 	if (args == NULL) {
382 		return -1;
383 	}
384 
385 	/* Lower default CRYPTO loglevel to RTE_LOG_ERR to avoid a ton of init msgs.
386 	 * This can be overridden by specifying the same option in opts->env_context
387 	 */
388 	args = spdk_push_arg(args, &argcount, strdup("--log-level=lib.cryptodev:5"));
389 	if (args == NULL) {
390 		return -1;
391 	}
392 
393 	/* `user1` log type is used by rte_vhost, which prints an INFO log for each received
394 	 * vhost user message. We don't want that. The same log type is also used by a couple
395 	 * of other DPDK libs, but none of which we make use right now. If necessary, this can
396 	 * be overridden via opts->env_context.
397 	 */
398 	args = spdk_push_arg(args, &argcount, strdup("--log-level=user1:6"));
399 	if (args == NULL) {
400 		return -1;
401 	}
402 #endif
403 
404 	if (opts->env_context) {
405 		args = spdk_push_arg(args, &argcount, strdup(opts->env_context));
406 		if (args == NULL) {
407 			return -1;
408 		}
409 	}
410 
411 #ifdef __linux__
412 
413 #if defined(__x86_64__)
414 	/* DPDK by default guesses that it should be using iova-mode=va so that it can
415 	 * support running as an unprivileged user. However, some systems (especially
416 	 * virtual machines) don't have an IOMMU capable of handling the full virtual
417 	 * address space and DPDK doesn't currently catch that. Add a check in SPDK
418 	 * and force iova-mode=pa here. */
419 	if (spdk_get_iommu_width() < SPDK_IOMMU_VA_REQUIRED_WIDTH) {
420 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
421 		if (args == NULL) {
422 			return -1;
423 		}
424 	}
425 #elif defined(__PPC64__)
426 	/* On Linux + PowerPC, DPDK doesn't support VA mode at all. Unfortunately, it doesn't correctly
427 	 * auto-detect at the moment, so we'll just force it here. */
428 	args = spdk_push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
429 	if (args == NULL) {
430 		return -1;
431 	}
432 #endif
433 
434 
435 	/* Set the base virtual address - it must be an address that is not in the
436 	 * ASAN shadow region, otherwise ASAN-enabled builds will ignore the
437 	 * mmap hint.
438 	 *
439 	 * Ref: https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm
440 	 */
441 	args = spdk_push_arg(args, &argcount, _sprintf_alloc("--base-virtaddr=0x200000000000"));
442 	if (args == NULL) {
443 		return -1;
444 	}
445 
446 	/* --match-allocation prevents DPDK from merging or splitting system memory allocations under the hood.
447 	 * This is critical for RDMA when attempting to use an rte_mempool based buffer pool. If DPDK merges two
448 	 * physically or IOVA contiguous memory regions, then when we go to allocate a buffer pool, it can split
449 	 * the memory for a buffer over two allocations meaning the buffer will be split over a memory region.
450 	 */
451 #if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0)
452 	if (!opts->env_context || strstr(opts->env_context, "--legacy-mem") == NULL) {
453 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("%s", "--match-allocations"));
454 		if (args == NULL) {
455 			return -1;
456 		}
457 	}
458 #endif
459 
460 	if (opts->shm_id < 0) {
461 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk_pid%d",
462 				     getpid()));
463 		if (args == NULL) {
464 			return -1;
465 		}
466 	} else {
467 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk%d",
468 				     opts->shm_id));
469 		if (args == NULL) {
470 			return -1;
471 		}
472 
473 		/* set the process type */
474 		args = spdk_push_arg(args, &argcount, _sprintf_alloc("--proc-type=auto"));
475 		if (args == NULL) {
476 			return -1;
477 		}
478 	}
479 #endif
480 
481 	g_eal_cmdline = args;
482 	g_eal_cmdline_argcount = argcount;
483 	return argcount;
484 }
485 
/*
 * Bring up the SPDK environment pieces that run on top of an initialized
 * DPDK: PCI, the memory map, and vtophys, in that order.
 *
 * legacy_mem is forwarded to spdk_mem_map_init().
 *
 * Returns 0 on success, or the negative value reported by the step that
 * failed (a message is printed to stderr in that case).
 */
int
spdk_env_dpdk_post_init(bool legacy_mem)
{
	int status;

	spdk_pci_init();

	status = spdk_mem_map_init(legacy_mem);
	if (status >= 0) {
		status = spdk_vtophys_init();
		if (status >= 0) {
			return 0;
		}

		fprintf(stderr, "Failed to initialize vtophys\n");
		return status;
	}

	fprintf(stderr, "Failed to allocate mem_map\n");
	return status;
}
507 
508 void
509 spdk_env_dpdk_post_fini(void)
510 {
511 	spdk_pci_fini();
512 
513 	spdk_free_args(g_eal_cmdline, g_eal_cmdline_argcount);
514 }
515 
516 int
517 spdk_env_init(const struct spdk_env_opts *opts)
518 {
519 	char **dpdk_args = NULL;
520 	int i, rc;
521 	int orig_optind;
522 	bool legacy_mem;
523 
524 	g_external_init = false;
525 
526 	rc = spdk_build_eal_cmdline(opts);
527 	if (rc < 0) {
528 		fprintf(stderr, "Invalid arguments to initialize DPDK\n");
529 		return -EINVAL;
530 	}
531 
532 	printf("Starting %s / %s initialization...\n", SPDK_VERSION_STRING, rte_version());
533 	printf("[ DPDK EAL parameters: ");
534 	for (i = 0; i < g_eal_cmdline_argcount; i++) {
535 		printf("%s ", g_eal_cmdline[i]);
536 	}
537 	printf("]\n");
538 
539 	/* DPDK rearranges the array we pass to it, so make a copy
540 	 * before passing so we can still free the individual strings
541 	 * correctly.
542 	 */
543 	dpdk_args = calloc(g_eal_cmdline_argcount, sizeof(char *));
544 	if (dpdk_args == NULL) {
545 		fprintf(stderr, "Failed to allocate dpdk_args\n");
546 		return -ENOMEM;
547 	}
548 	memcpy(dpdk_args, g_eal_cmdline, sizeof(char *) * g_eal_cmdline_argcount);
549 
550 	fflush(stdout);
551 	orig_optind = optind;
552 	optind = 1;
553 	rc = rte_eal_init(g_eal_cmdline_argcount, dpdk_args);
554 	optind = orig_optind;
555 
556 	free(dpdk_args);
557 
558 	if (rc < 0) {
559 		if (rte_errno == EALREADY) {
560 			fprintf(stderr, "DPDK already initialized\n");
561 		} else {
562 			fprintf(stderr, "Failed to initialize DPDK\n");
563 		}
564 		return -rte_errno;
565 	}
566 
567 	if (opts->shm_id < 0 && !opts->hugepage_single_segments) {
568 		/*
569 		 * Unlink hugepage and config info files after init.  This will ensure they get
570 		 *  deleted on app exit, even if the app crashes and does not exit normally.
571 		 *  Only do this when not in multi-process mode, since for multi-process other
572 		 *  apps will need to open these files. These files are not created for
573 		 *  "single file segments".
574 		 */
575 		spdk_env_unlink_shared_files();
576 	}
577 
578 	legacy_mem = false;
579 	if (opts->env_context && strstr(opts->env_context, "--legacy-mem") != NULL) {
580 		legacy_mem = true;
581 	}
582 
583 	return spdk_env_dpdk_post_init(legacy_mem);
584 }
585 
/*
 * Public teardown entry point; all of the work is done by
 * spdk_env_dpdk_post_fini().
 */
void
spdk_env_fini(void)
{
	spdk_env_dpdk_post_fini();
}
591 
592 bool
593 spdk_env_dpdk_external_init(void)
594 {
595 	return g_external_init;
596 }
597