xref: /spdk/lib/env_dpdk/memory.c (revision 45a053c5777494f4e8ce4bc1191c9de3920377f7)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2017 Intel Corporation.
3  *   All rights reserved.
4  */
5 
6 #include "spdk/stdinc.h"
7 
8 #include "env_internal.h"
9 #include "pci_dpdk.h"
10 
11 #include <rte_config.h>
12 #include <rte_memory.h>
13 #include <rte_eal_memconfig.h>
14 #include <rte_dev.h>
15 #include <rte_pci.h>
16 
17 #include "spdk_internal/assert.h"
18 
19 #include "spdk/assert.h"
20 #include "spdk/likely.h"
21 #include "spdk/queue.h"
22 #include "spdk/util.h"
23 #include "spdk/memory.h"
24 #include "spdk/env_dpdk.h"
25 #include "spdk/log.h"
26 
27 #ifdef __linux__
28 #include <linux/version.h>
29 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
30 #include <linux/vfio.h>
31 #include <rte_vfio.h>
32 
33 struct spdk_vfio_dma_map {
34 	struct vfio_iommu_type1_dma_map map;
35 	TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
36 };
37 
38 struct vfio_cfg {
39 	int fd;
40 	bool enabled;
41 	bool noiommu_enabled;
42 	unsigned device_ref;
43 	TAILQ_HEAD(, spdk_vfio_dma_map) maps;
44 	pthread_mutex_t mutex;
45 };
46 
47 static struct vfio_cfg g_vfio = {
48 	.fd = -1,
49 	.enabled = false,
50 	.noiommu_enabled = false,
51 	.device_ref = 0,
52 	.maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
53 	.mutex = PTHREAD_MUTEX_INITIALIZER
54 };
55 #endif
56 #endif
57 
58 #if DEBUG
59 #define DEBUG_PRINT(...) SPDK_ERRLOG(__VA_ARGS__)
60 #else
61 #define DEBUG_PRINT(...)
62 #endif
63 
64 #define FN_2MB_TO_4KB(fn)	((fn) << (SHIFT_2MB - SHIFT_4KB))
65 #define FN_4KB_TO_2MB(fn)	((fn) >> (SHIFT_2MB - SHIFT_4KB))
66 
67 #define MAP_256TB_IDX(vfn_2mb)	((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
68 #define MAP_1GB_IDX(vfn_2mb)	((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))
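
/* Worked example with a hypothetical, 2MB-aligned address: for
 * vaddr = 0x7f8040600000, vfn_2mb = vaddr >> SHIFT_2MB = 0x3fc0203, so
 * MAP_256TB_IDX(vfn_2mb) = 0x1fe01 (bits [30..47] of vaddr) and
 * MAP_1GB_IDX(vfn_2mb) = 0x3 (bits [21..29] of vaddr).
 */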
69 
70 /* Page is registered */
71 #define REG_MAP_REGISTERED	(1ULL << 62)
72 
73 /* A notification region barrier. The 2MB translation entry that's marked
74  * with this flag must be unregistered separately. This allows contiguous
75  * regions to be unregistered in the same chunks in which they were registered.
76  */
77 #define REG_MAP_NOTIFY_START	(1ULL << 63)
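
/* As an illustration of how these flags combine: the first 2MB page of a
 * registered region stores (REG_MAP_REGISTERED | REG_MAP_NOTIFY_START), i.e.
 * 0xC000000000000000, in the registrations map, while the remaining pages of
 * that region store just REG_MAP_REGISTERED (0x4000000000000000); see
 * spdk_mem_register() below.
 */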
78 
79 /* Translation of a single 2MB page. */
80 struct map_2mb {
81 	uint64_t translation_2mb;
82 };
83 
84 /* Second-level map table indexed by bits [21..29] of the virtual address.
85  * Each entry contains the address translation, or the map's default (error)
86  * translation for entries that haven't been set yet.
87  */
88 struct map_1gb {
89 	struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
90 };
91 
92 /* Top-level map table indexed by bits [30..47] of the virtual address.
93  * Each entry points to a second-level map table or NULL.
94  */
95 struct map_256tb {
96 	struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
97 };
98 
99 /* Page-granularity memory address translation */
100 struct spdk_mem_map {
101 	struct map_256tb map_256tb;
102 	pthread_mutex_t mutex;
103 	uint64_t default_translation;
104 	struct spdk_mem_map_ops ops;
105 	void *cb_ctx;
106 	TAILQ_ENTRY(spdk_mem_map) tailq;
107 };
108 
109 /* Registrations map. The 64 bit translations are bit fields with the
110  * following layout (starting with the low bits):
111  *    0 - 61 : reserved
112  *   62 - 63 : flags
113  */
114 static struct spdk_mem_map *g_mem_reg_map;
115 static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps =
116 	TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
117 static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;
118 
119 static bool g_legacy_mem;
120 static bool g_huge_pages = true;
121 
122 /*
123  * Walk the currently registered memory via the main memory registration map
124  * and call the new map's notify callback for each virtually contiguous region.
125  */
126 static int
127 mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
128 {
129 	size_t idx_256tb;
130 	uint64_t idx_1gb;
131 	uint64_t contig_start = UINT64_MAX;
132 	uint64_t contig_end = UINT64_MAX;
133 	struct map_1gb *map_1gb;
134 	int rc;
135 
136 	if (!g_mem_reg_map) {
137 		return -EINVAL;
138 	}
139 
140 	/* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
141 	pthread_mutex_lock(&g_mem_reg_map->mutex);
142 
143 	for (idx_256tb = 0;
144 	     idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
145 	     idx_256tb++) {
146 		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];
147 
148 		if (!map_1gb) {
149 			if (contig_start != UINT64_MAX) {
150 				/* End of a virtually contiguous range */
151 				rc = map->ops.notify_cb(map->cb_ctx, map, action,
152 							(void *)contig_start,
153 							contig_end - contig_start + VALUE_2MB);
154 				/* Don't bother handling unregister failures. It can't be any worse */
155 				if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
156 					goto err_unregister;
157 				}
158 			}
159 			contig_start = UINT64_MAX;
160 			continue;
161 		}
162 
163 		for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
164 			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
165 			    (contig_start == UINT64_MAX ||
166 			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
167 				/* Rebuild the virtual address from the indexes */
168 				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
169 
170 				if (contig_start == UINT64_MAX) {
171 					contig_start = vaddr;
172 				}
173 
174 				contig_end = vaddr;
175 			} else {
176 				if (contig_start != UINT64_MAX) {
177 					/* End of a virtually contiguous range */
178 					rc = map->ops.notify_cb(map->cb_ctx, map, action,
179 								(void *)contig_start,
180 								contig_end - contig_start + VALUE_2MB);
181 					/* Don't bother handling unregister failures. It can't be any worse */
182 					if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
183 						goto err_unregister;
184 					}
185 
186 					/* This page might be a part of a neighbouring region, so process
187 					 * it again. The idx_1gb will be incremented immediately.
188 					 */
189 					idx_1gb--;
190 				}
191 				contig_start = UINT64_MAX;
192 			}
193 		}
194 	}
195 
196 	pthread_mutex_unlock(&g_mem_reg_map->mutex);
197 	return 0;
198 
199 err_unregister:
200 	/* Unwind to the first empty translation so we don't unregister
201 	 * a region that just failed to register.
202 	 */
203 	idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
204 	idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
205 	contig_start = UINT64_MAX;
206 	contig_end = UINT64_MAX;
207 
208 	/* Unregister any memory we managed to register before the failure */
209 	for (; idx_256tb < SIZE_MAX; idx_256tb--) {
210 		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];
211 
212 		if (!map_1gb) {
213 			if (contig_end != UINT64_MAX) {
214 				/* End of a virtually contiguous range */
215 				map->ops.notify_cb(map->cb_ctx, map,
216 						   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
217 						   (void *)contig_start,
218 						   contig_end - contig_start + VALUE_2MB);
219 			}
220 			contig_end = UINT64_MAX;
221 			continue;
222 		}
223 
224 		for (; idx_1gb < UINT64_MAX; idx_1gb--) {
225 			/* Rebuild the virtual address from the indexes */
226 			uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
227 			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
228 			    (contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
229 
230 				if (contig_end == UINT64_MAX) {
231 					contig_end = vaddr;
232 				}
233 				contig_start = vaddr;
234 			} else {
235 				if (contig_end != UINT64_MAX) {
236 					if (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) {
237 						contig_start = vaddr;
238 					}
239 					/* End of a virtually contiguous range */
240 					map->ops.notify_cb(map->cb_ctx, map,
241 							   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
242 							   (void *)contig_start,
243 							   contig_end - contig_start + VALUE_2MB);
244 				}
245 				contig_end = UINT64_MAX;
246 			}
247 		}
248 		idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
249 	}
250 
251 	pthread_mutex_unlock(&g_mem_reg_map->mutex);
252 	return rc;
253 }
254 
255 struct spdk_mem_map *
256 spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
257 {
258 	struct spdk_mem_map *map;
259 	int rc;
260 	size_t i;
261 
262 	map = calloc(1, sizeof(*map));
263 	if (map == NULL) {
264 		return NULL;
265 	}
266 
267 	if (pthread_mutex_init(&map->mutex, NULL)) {
268 		free(map);
269 		return NULL;
270 	}
271 
272 	map->default_translation = default_translation;
273 	map->cb_ctx = cb_ctx;
274 	if (ops) {
275 		map->ops = *ops;
276 	}
277 
278 	if (ops && ops->notify_cb) {
279 		pthread_mutex_lock(&g_spdk_mem_map_mutex);
280 		rc = mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
281 		if (rc != 0) {
282 			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
283 			DEBUG_PRINT("Initial mem_map notify failed\n");
284 			pthread_mutex_destroy(&map->mutex);
285 			for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
286 				free(map->map_256tb.map[i]);
287 			}
288 			free(map);
289 			return NULL;
290 		}
291 		TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
292 		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
293 	}
294 
295 	return map;
296 }
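
/* Minimal usage sketch; my_notify_cb and MY_DEFAULT_TRANSLATION are
 * hypothetical caller-side names:
 *
 *   static int my_notify_cb(void *cb_ctx, struct spdk_mem_map *map,
 *                           enum spdk_mem_map_notify_action action,
 *                           void *vaddr, size_t len) { ... return 0; }
 *
 *   const struct spdk_mem_map_ops ops = { .notify_cb = my_notify_cb };
 *   struct spdk_mem_map *map = spdk_mem_map_alloc(MY_DEFAULT_TRANSLATION, &ops, NULL);
 *   ...
 *   spdk_mem_map_free(&map);
 *
 * Because of the mem_map_notify_walk() call above, the callback fires right
 * away for every region already present in the registrations map, so a newly
 * allocated map starts out consistent with existing registrations.
 */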
297 
298 void
299 spdk_mem_map_free(struct spdk_mem_map **pmap)
300 {
301 	struct spdk_mem_map *map;
302 	size_t i;
303 
304 	if (!pmap) {
305 		return;
306 	}
307 
308 	map = *pmap;
309 
310 	if (!map) {
311 		return;
312 	}
313 
314 	if (map->ops.notify_cb) {
315 		pthread_mutex_lock(&g_spdk_mem_map_mutex);
316 		mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
317 		TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
318 		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
319 	}
320 
321 	for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
322 		free(map->map_256tb.map[i]);
323 	}
324 
325 	pthread_mutex_destroy(&map->mutex);
326 
327 	free(map);
328 	*pmap = NULL;
329 }
330 
331 int
332 spdk_mem_register(void *vaddr, size_t len)
333 {
334 	struct spdk_mem_map *map;
335 	int rc;
336 	void *seg_vaddr;
337 	size_t seg_len;
338 	uint64_t reg;
339 
340 	if ((uintptr_t)vaddr & ~MASK_256TB) {
341 		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
342 		return -EINVAL;
343 	}
344 
345 	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
346 		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
347 			    __func__, vaddr, len);
348 		return -EINVAL;
349 	}
350 
351 	if (len == 0) {
352 		return 0;
353 	}
354 
355 	pthread_mutex_lock(&g_spdk_mem_map_mutex);
356 
357 	seg_vaddr = vaddr;
358 	seg_len = len;
359 	while (seg_len > 0) {
360 		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
361 		if (reg & REG_MAP_REGISTERED) {
362 			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
363 			return -EBUSY;
364 		}
365 		seg_vaddr += VALUE_2MB;
366 		seg_len -= VALUE_2MB;
367 	}
368 
369 	seg_vaddr = vaddr;
370 	seg_len = 0;
371 	while (len > 0) {
372 		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
373 					     seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
374 		seg_len += VALUE_2MB;
375 		vaddr += VALUE_2MB;
376 		len -= VALUE_2MB;
377 	}
378 
379 	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
380 		rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
381 		if (rc != 0) {
382 			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
383 			return rc;
384 		}
385 	}
386 
387 	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
388 	return 0;
389 }
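
/* Usage sketch; buf is a hypothetical 2MB-aligned, hugepage-backed buffer:
 *
 *   rc = spdk_mem_register(buf, 4 * VALUE_2MB);
 *   ...
 *   rc = spdk_mem_unregister(buf, 4 * VALUE_2MB);
 *
 * Both vaddr and len must be multiples of VALUE_2MB (otherwise -EINVAL),
 * registering a range that overlaps an existing registration fails with
 * -EBUSY, and an unregistration must cover whole previously registered
 * regions (otherwise -ERANGE).
 */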
390 
391 int
392 spdk_mem_unregister(void *vaddr, size_t len)
393 {
394 	struct spdk_mem_map *map;
395 	int rc;
396 	void *seg_vaddr;
397 	size_t seg_len;
398 	uint64_t reg, newreg;
399 
400 	if ((uintptr_t)vaddr & ~MASK_256TB) {
401 		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
402 		return -EINVAL;
403 	}
404 
405 	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
406 		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
407 			    __func__, vaddr, len);
408 		return -EINVAL;
409 	}
410 
411 	pthread_mutex_lock(&g_spdk_mem_map_mutex);
412 
413 	/* The first page must be a start of a region. Also check if it's
414 	 * registered to make sure we don't return -ERANGE for non-registered
415 	 * regions.
416 	 */
417 	reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
418 	if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
419 		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
420 		return -ERANGE;
421 	}
422 
423 	seg_vaddr = vaddr;
424 	seg_len = len;
425 	while (seg_len > 0) {
426 		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
427 		if ((reg & REG_MAP_REGISTERED) == 0) {
428 			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
429 			return -EINVAL;
430 		}
431 		seg_vaddr += VALUE_2MB;
432 		seg_len -= VALUE_2MB;
433 	}
434 
435 	newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
436 	/* If the next page is registered, it must be a start of a region as well,
437 	 * otherwise we'd be unregistering only a part of a region.
438 	 */
439 	if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
440 		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
441 		return -ERANGE;
442 	}
443 	seg_vaddr = vaddr;
444 	seg_len = 0;
445 
446 	while (len > 0) {
447 		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
448 		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);
449 
450 		if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
451 			TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
452 				rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
453 				if (rc != 0) {
454 					pthread_mutex_unlock(&g_spdk_mem_map_mutex);
455 					return rc;
456 				}
457 			}
458 
459 			seg_vaddr = vaddr;
460 			seg_len = VALUE_2MB;
461 		} else {
462 			seg_len += VALUE_2MB;
463 		}
464 
465 		vaddr += VALUE_2MB;
466 		len -= VALUE_2MB;
467 	}
468 
469 	if (seg_len > 0) {
470 		TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
471 			rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
472 			if (rc != 0) {
473 				pthread_mutex_unlock(&g_spdk_mem_map_mutex);
474 				return rc;
475 			}
476 		}
477 	}
478 
479 	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
480 	return 0;
481 }
482 
483 int
484 spdk_mem_reserve(void *vaddr, size_t len)
485 {
486 	struct spdk_mem_map *map;
487 	void *seg_vaddr;
488 	size_t seg_len;
489 	uint64_t reg;
490 
491 	if ((uintptr_t)vaddr & ~MASK_256TB) {
492 		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
493 		return -EINVAL;
494 	}
495 
496 	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
497 		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
498 			    __func__, vaddr, len);
499 		return -EINVAL;
500 	}
501 
502 	if (len == 0) {
503 		return 0;
504 	}
505 
506 	pthread_mutex_lock(&g_spdk_mem_map_mutex);
507 
508 	/* Check if any part of this range is already registered */
509 	seg_vaddr = vaddr;
510 	seg_len = len;
511 	while (seg_len > 0) {
512 		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
513 		if (reg & REG_MAP_REGISTERED) {
514 			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
515 			return -EBUSY;
516 		}
517 		seg_vaddr += VALUE_2MB;
518 		seg_len -= VALUE_2MB;
519 	}
520 
521 	/* Simply set the translation to the memory map's default. This allocates the space in the
522 	 * map but does not provide a valid translation. */
523 	spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, len,
524 				     g_mem_reg_map->default_translation);
525 
526 	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
527 		spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, map->default_translation);
528 	}
529 
530 	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
531 	return 0;
532 }
533 
534 static struct map_1gb *
535 mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
536 {
537 	struct map_1gb *map_1gb;
538 	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
539 	size_t i;
540 
541 	if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
542 		return NULL;
543 	}
544 
545 	map_1gb = map->map_256tb.map[idx_256tb];
546 
547 	if (!map_1gb) {
548 		pthread_mutex_lock(&map->mutex);
549 
550 		/* Recheck to make sure nobody else got the mutex first. */
551 		map_1gb = map->map_256tb.map[idx_256tb];
552 		if (!map_1gb) {
553 			map_1gb = malloc(sizeof(struct map_1gb));
554 			if (map_1gb) {
555 				/* initialize all entries to default translation */
556 				for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
557 					map_1gb->map[i].translation_2mb = map->default_translation;
558 				}
559 				map->map_256tb.map[idx_256tb] = map_1gb;
560 			}
561 		}
562 
563 		pthread_mutex_unlock(&map->mutex);
564 
565 		if (!map_1gb) {
566 			DEBUG_PRINT("allocation failed\n");
567 			return NULL;
568 		}
569 	}
570 
571 	return map_1gb;
572 }
573 
574 int
575 spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
576 			     uint64_t translation)
577 {
578 	uint64_t vfn_2mb;
579 	struct map_1gb *map_1gb;
580 	uint64_t idx_1gb;
581 	struct map_2mb *map_2mb;
582 
583 	if ((uintptr_t)vaddr & ~MASK_256TB) {
584 		DEBUG_PRINT("invalid usermode virtual address %" PRIu64 "\n", vaddr);
585 		return -EINVAL;
586 	}
587 
588 	/* For now, only 2 MB-aligned registrations are supported */
589 	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
590 		DEBUG_PRINT("invalid %s parameters, vaddr=%" PRIu64 " len=%" PRIu64 "\n",
591 			    __func__, vaddr, size);
592 		return -EINVAL;
593 	}
594 
595 	vfn_2mb = vaddr >> SHIFT_2MB;
596 
597 	while (size) {
598 		map_1gb = mem_map_get_map_1gb(map, vfn_2mb);
599 		if (!map_1gb) {
600 			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
601 			return -ENOMEM;
602 		}
603 
604 		idx_1gb = MAP_1GB_IDX(vfn_2mb);
605 		map_2mb = &map_1gb->map[idx_1gb];
606 		map_2mb->translation_2mb = translation;
607 
608 		size -= VALUE_2MB;
609 		vfn_2mb++;
610 	}
611 
612 	return 0;
613 }
614 
615 int
616 spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
617 {
618 	return spdk_mem_map_set_translation(map, vaddr, size, map->default_translation);
619 }
620 
621 inline uint64_t
622 spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
623 {
624 	const struct map_1gb *map_1gb;
625 	const struct map_2mb *map_2mb;
626 	uint64_t idx_256tb;
627 	uint64_t idx_1gb;
628 	uint64_t vfn_2mb;
629 	uint64_t cur_size;
630 	uint64_t prev_translation;
631 	uint64_t orig_translation;
632 
633 	if (spdk_unlikely(vaddr & ~MASK_256TB)) {
634 		DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
635 		return map->default_translation;
636 	}
637 
638 	vfn_2mb = vaddr >> SHIFT_2MB;
639 	idx_256tb = MAP_256TB_IDX(vfn_2mb);
640 	idx_1gb = MAP_1GB_IDX(vfn_2mb);
641 
642 	map_1gb = map->map_256tb.map[idx_256tb];
643 	if (spdk_unlikely(!map_1gb)) {
644 		return map->default_translation;
645 	}
646 
647 	cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
648 	map_2mb = &map_1gb->map[idx_1gb];
649 	if (size == NULL || map->ops.are_contiguous == NULL ||
650 	    map_2mb->translation_2mb == map->default_translation) {
651 		if (size != NULL) {
652 			*size = spdk_min(*size, cur_size);
653 		}
654 		return map_2mb->translation_2mb;
655 	}
656 
657 	orig_translation = map_2mb->translation_2mb;
658 	prev_translation = orig_translation;
659 	while (cur_size < *size) {
660 		vfn_2mb++;
661 		idx_256tb = MAP_256TB_IDX(vfn_2mb);
662 		idx_1gb = MAP_1GB_IDX(vfn_2mb);
663 
664 		map_1gb = map->map_256tb.map[idx_256tb];
665 		if (spdk_unlikely(!map_1gb)) {
666 			break;
667 		}
668 
669 		map_2mb = &map_1gb->map[idx_1gb];
670 		if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
671 			break;
672 		}
673 
674 		cur_size += VALUE_2MB;
675 		prev_translation = map_2mb->translation_2mb;
676 	}
677 
678 	*size = spdk_min(*size, cur_size);
679 	return orig_translation;
680 }
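
/* Lookup sketch with hypothetical values: after
 *
 *   spdk_mem_map_set_translation(map, 0x200000, VALUE_2MB, 0x12340000000);
 *
 * a call such as
 *
 *   uint64_t size = VALUE_2MB;
 *   uint64_t t = spdk_mem_map_translate(map, 0x200000 + 0x1000, &size);
 *
 * returns the stored value 0x12340000000 (callers add the in-page offset
 * themselves, as spdk_vtophys() does) and trims size down to
 * VALUE_2MB - 0x1000. When the map provides an are_contiguous() callback, the
 * lookup additionally walks subsequent 2MB entries and reports the length of
 * the contiguous run, never exceeding the caller-supplied size.
 */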
681 
682 static void
683 memory_hotplug_cb(enum rte_mem_event event_type,
684 		  const void *addr, size_t len, void *arg)
685 {
686 	if (event_type == RTE_MEM_EVENT_ALLOC) {
687 		spdk_mem_register((void *)addr, len);
688 
689 		if (!spdk_env_dpdk_external_init()) {
690 			return;
691 		}
692 
693 		/* When the user initializes DPDK separately, we can't be sure
694 		 * that the --match-allocations RTE flag was specified. Without
695 		 * this flag, DPDK can free memory in different units than it was
696 		 * allocated in, which doesn't work with things like RDMA MRs.
697 		 *
698 		 * For such cases, we mark segments so they aren't freed.
699 		 */
700 		while (len > 0) {
701 			struct rte_memseg *seg;
702 
703 			seg = rte_mem_virt2memseg(addr, NULL);
704 			assert(seg != NULL);
705 			seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
706 			addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
707 			len -= seg->hugepage_sz;
708 		}
709 	} else if (event_type == RTE_MEM_EVENT_FREE) {
710 		spdk_mem_unregister((void *)addr, len);
711 	}
712 }
713 
714 static int
715 memory_iter_cb(const struct rte_memseg_list *msl,
716 	       const struct rte_memseg *ms, size_t len, void *arg)
717 {
718 	return spdk_mem_register(ms->addr, len);
719 }
720 
721 int
722 mem_map_init(bool legacy_mem)
723 {
724 	g_legacy_mem = legacy_mem;
725 
726 	g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
727 	if (g_mem_reg_map == NULL) {
728 		DEBUG_PRINT("memory registration map allocation failed\n");
729 		return -ENOMEM;
730 	}
731 
732 	/*
733 	 * Walk all DPDK memory segments and register them
734 	 * with the main memory map
735 	 */
736 	if (g_huge_pages) {
737 		rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
738 		rte_memseg_contig_walk(memory_iter_cb, NULL);
739 	}
740 	return 0;
741 }
742 
743 bool
744 spdk_iommu_is_enabled(void)
745 {
746 #if VFIO_ENABLED
747 	return g_vfio.enabled && !g_vfio.noiommu_enabled;
748 #else
749 	return false;
750 #endif
751 }
752 
753 struct spdk_vtophys_pci_device {
754 	struct rte_pci_device *pci_device;
755 	TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
756 };
757 
758 static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
759 static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
760 	TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);
761 
762 static struct spdk_mem_map *g_vtophys_map;
763 static struct spdk_mem_map *g_phys_ref_map;
764 
765 #if VFIO_ENABLED
766 static int
767 _vfio_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
768 {
769 	struct spdk_vfio_dma_map *dma_map;
770 	int ret;
771 
772 	dma_map = calloc(1, sizeof(*dma_map));
773 	if (dma_map == NULL) {
774 		return -ENOMEM;
775 	}
776 
777 	dma_map->map.argsz = sizeof(dma_map->map);
778 	dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
779 	dma_map->map.vaddr = vaddr;
780 	dma_map->map.iova = iova;
781 	dma_map->map.size = size;
782 
783 	if (g_vfio.device_ref == 0) {
784 		/* VFIO requires at least one device (IOMMU group) to be added to
785 		 * a VFIO container before it is possible to perform any IOMMU
786 		 * operations on that container. This memory will be mapped once
787 		 * the first device (IOMMU group) is hotplugged.
788 		 *
789 		 * Since the vfio container is managed internally by DPDK, it is
790 		 * also possible that some device is already in that container, but
791 		 * it's not managed by SPDK - e.g. a NIC attached internally
792 		 * inside DPDK. We could map the memory straight away in such a
793 		 * scenario, but there's no need to do it. DPDK devices clearly
794 		 * don't need our mappings and hence we defer the mapping
795 		 * unconditionally until the first SPDK-managed device is
796 		 * hotplugged.
797 		 */
798 		goto out_insert;
799 	}
800 
801 	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
802 	if (ret) {
803 		/* In some cases the vfio container doesn't have an IOMMU group; it's safe to ignore the error then */
804 		SPDK_NOTICELOG("Cannot set up DMA mapping, error %d, ignored\n", errno);
805 	}
806 
807 out_insert:
808 	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
809 	return 0;
810 }
811 
812 
813 static int
814 vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
815 {
816 	uint64_t refcount;
817 	int ret;
818 
819 	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
820 	assert(refcount < UINT64_MAX);
821 	if (refcount > 0) {
822 		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
823 		return 0;
824 	}
825 
826 	pthread_mutex_lock(&g_vfio.mutex);
827 	ret = _vfio_iommu_map_dma(vaddr, iova, size);
828 	pthread_mutex_unlock(&g_vfio.mutex);
829 	if (ret) {
830 		return ret;
831 	}
832 
833 	spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
834 	return 0;
835 }
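
/* Note on the refcounting above: g_phys_ref_map is keyed by IOVA, so if the
 * same IOVA is mapped again (e.g. when the same physical page is reachable
 * through two different virtual addresses in iova-mode=pa), VFIO_IOMMU_MAP_DMA
 * is issued only for the first reference and the mapping is torn down only
 * after the last reference is dropped in vtophys_iommu_unmap_dma().
 */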
836 
837 int
838 vtophys_iommu_map_dma_bar(uint64_t vaddr, uint64_t iova, uint64_t size)
839 {
840 	int ret;
841 
842 	pthread_mutex_lock(&g_vfio.mutex);
843 	ret = _vfio_iommu_map_dma(vaddr, iova, size);
844 	pthread_mutex_unlock(&g_vfio.mutex);
845 
846 	return ret;
847 }
848 
849 static int
850 _vfio_iommu_unmap_dma(struct spdk_vfio_dma_map *dma_map)
851 {
852 	struct vfio_iommu_type1_dma_unmap unmap = {};
853 	int ret;
854 
855 	if (g_vfio.device_ref == 0) {
856 		/* Memory is not mapped anymore, just remove its references */
857 		goto out_remove;
858 	}
859 
860 	unmap.argsz = sizeof(unmap);
861 	unmap.flags = 0;
862 	unmap.iova = dma_map->map.iova;
863 	unmap.size = dma_map->map.size;
864 	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
865 	if (ret) {
866 		SPDK_NOTICELOG("Cannot clear DMA mapping, error %d, ignored\n", errno);
867 	}
868 
869 out_remove:
870 	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
871 	free(dma_map);
872 	return 0;
873 }
874 
875 static int
876 vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
877 {
878 	struct spdk_vfio_dma_map *dma_map;
879 	uint64_t refcount;
880 	int ret;
881 
882 	pthread_mutex_lock(&g_vfio.mutex);
883 	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
884 		if (dma_map->map.iova == iova) {
885 			break;
886 		}
887 	}
888 
889 	if (dma_map == NULL) {
890 		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
891 		pthread_mutex_unlock(&g_vfio.mutex);
892 		return -ENXIO;
893 	}
894 
895 	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
896 	assert(refcount < UINT64_MAX);
897 	if (refcount > 0) {
898 		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount - 1);
899 	}
900 
901 	/* We still have outstanding references, don't clear it. */
902 	if (refcount > 1) {
903 		pthread_mutex_unlock(&g_vfio.mutex);
904 		return 0;
905 	}
906 
907 	/* don't support partial or multiple-page unmap for now */
908 	assert(dma_map->map.size == size);
909 
910 	ret = _vfio_iommu_unmap_dma(dma_map);
911 	pthread_mutex_unlock(&g_vfio.mutex);
912 
913 	return ret;
914 }
915 
916 int
917 vtophys_iommu_unmap_dma_bar(uint64_t vaddr)
918 {
919 	struct spdk_vfio_dma_map *dma_map;
920 	int ret;
921 
922 	pthread_mutex_lock(&g_vfio.mutex);
923 	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
924 		if (dma_map->map.vaddr == vaddr) {
925 			break;
926 		}
927 	}
928 
929 	if (dma_map == NULL) {
930 		DEBUG_PRINT("Cannot clear DMA mapping for address %"PRIx64" - it's not mapped\n", vaddr);
931 		pthread_mutex_unlock(&g_vfio.mutex);
932 		return -ENXIO;
933 	}
934 
935 	ret = _vfio_iommu_unmap_dma(dma_map);
936 	pthread_mutex_unlock(&g_vfio.mutex);
937 	return ret;
938 }
939 #endif
940 
941 static uint64_t
942 vtophys_get_paddr_memseg(uint64_t vaddr)
943 {
944 	uintptr_t paddr;
945 	struct rte_memseg *seg;
946 
947 	seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
948 	if (seg != NULL) {
949 		paddr = seg->iova;
950 		if (paddr == RTE_BAD_IOVA) {
951 			return SPDK_VTOPHYS_ERROR;
952 		}
953 		paddr += (vaddr - (uintptr_t)seg->addr);
954 		return paddr;
955 	}
956 
957 	return SPDK_VTOPHYS_ERROR;
958 }
959 
960 /* Try to get the paddr from /proc/self/pagemap */
961 static uint64_t
962 vtophys_get_paddr_pagemap(uint64_t vaddr)
963 {
964 	uintptr_t paddr;
965 
966 	/* Silence static analyzers */
967 	assert(vaddr != 0);
968 	paddr = rte_mem_virt2iova((void *)vaddr);
969 	if (paddr == RTE_BAD_IOVA) {
970 		/*
971 		 * The vaddr may be valid but doesn't have a backing page
972 		 * assigned yet.  Touch the page to ensure a backing page
973 		 * gets assigned, then try to translate again.
974 		 */
975 		rte_atomic64_read((rte_atomic64_t *)vaddr);
976 		paddr = rte_mem_virt2iova((void *)vaddr);
977 	}
978 	if (paddr == RTE_BAD_IOVA) {
979 		/* Unable to get to the physical address. */
980 		return SPDK_VTOPHYS_ERROR;
981 	}
982 
983 	return paddr;
984 }
985 
986 static uint64_t
987 pci_device_vtophys(struct rte_pci_device *dev, uint64_t vaddr, size_t len)
988 {
989 	struct rte_mem_resource *res;
990 	uint64_t paddr;
991 	unsigned r;
992 
993 	for (r = 0; r < PCI_MAX_RESOURCE; r++) {
994 		res = dpdk_pci_device_get_mem_resource(dev, r);
995 
996 		if (res->phys_addr == 0 || vaddr < (uint64_t)res->addr ||
997 		    (vaddr + len) >= (uint64_t)res->addr + res->len) {
998 			continue;
999 		}
1000 
1001 #if VFIO_ENABLED
1002 		if (spdk_iommu_is_enabled() && rte_eal_iova_mode() == RTE_IOVA_VA) {
1003 			/*
1004 			 * The IOMMU is on and we're using IOVA == VA. The BAR was
1005 			 * automatically registered when it was mapped, so just return
1006 			 * the virtual address here.
1007 			 */
1008 			return vaddr;
1009 		}
1010 #endif
1011 		paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
1012 		return paddr;
1013 	}
1014 
1015 	return SPDK_VTOPHYS_ERROR;
1016 }
1017 
1018 /* Try to get the paddr from pci devices */
1019 static uint64_t
1020 vtophys_get_paddr_pci(uint64_t vaddr, size_t len)
1021 {
1022 	struct spdk_vtophys_pci_device *vtophys_dev;
1023 	uintptr_t paddr;
1024 	struct rte_pci_device	*dev;
1025 
1026 	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1027 	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
1028 		dev = vtophys_dev->pci_device;
1029 		paddr = pci_device_vtophys(dev, vaddr, len);
1030 		if (paddr != SPDK_VTOPHYS_ERROR) {
1031 			pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1032 			return paddr;
1033 		}
1034 	}
1035 	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1036 
1037 	return SPDK_VTOPHYS_ERROR;
1038 }
1039 
1040 static int
1041 vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
1042 	       enum spdk_mem_map_notify_action action,
1043 	       void *vaddr, size_t len)
1044 {
1045 	int rc = 0;
1046 	uint64_t paddr;
1047 
1048 	if ((uintptr_t)vaddr & ~MASK_256TB) {
1049 		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
1050 		return -EINVAL;
1051 	}
1052 
1053 	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
1054 		DEBUG_PRINT("invalid parameters, vaddr=%p len=%ju\n",
1055 			    vaddr, len);
1056 		return -EINVAL;
1057 	}
1058 
1059 	/* Get the physical address from the DPDK memsegs */
1060 	paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
1061 
1062 	switch (action) {
1063 	case SPDK_MEM_MAP_NOTIFY_REGISTER:
1064 		if (paddr == SPDK_VTOPHYS_ERROR) {
1065 			/* This is not an address that DPDK is managing. */
1066 
1067 			/* Check if this is a PCI BAR. They need special handling */
1068 			paddr = vtophys_get_paddr_pci((uint64_t)vaddr, len);
1069 			if (paddr != SPDK_VTOPHYS_ERROR) {
1070 				/* Get paddr for each 2MB chunk in this address range */
1071 				while (len > 0) {
1072 					paddr = vtophys_get_paddr_pci((uint64_t)vaddr, VALUE_2MB);
1073 					if (paddr == SPDK_VTOPHYS_ERROR) {
1074 						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1075 						return -EFAULT;
1076 					}
1077 
1078 					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1079 					if (rc != 0) {
1080 						return rc;
1081 					}
1082 
1083 					vaddr += VALUE_2MB;
1084 					len -= VALUE_2MB;
1085 				}
1086 
1087 				return 0;
1088 			}
1089 
1090 #if VFIO_ENABLED
1091 			enum rte_iova_mode iova_mode;
1092 
1093 			iova_mode = rte_eal_iova_mode();
1094 
1095 			if (spdk_iommu_is_enabled() && iova_mode == RTE_IOVA_VA) {
1096 				/* We'll use the virtual address as the iova to match DPDK. */
1097 				paddr = (uint64_t)vaddr;
1098 				rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len);
1099 				if (rc) {
1100 					return -EFAULT;
1101 				}
1102 				while (len > 0) {
1103 					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1104 					if (rc != 0) {
1105 						return rc;
1106 					}
1107 					vaddr += VALUE_2MB;
1108 					paddr += VALUE_2MB;
1109 					len -= VALUE_2MB;
1110 				}
1111 			} else
1112 #endif
1113 			{
1114 				/* Get the physical address from /proc/self/pagemap. */
1115 				paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
1116 				if (paddr == SPDK_VTOPHYS_ERROR) {
1117 					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1118 					return -EFAULT;
1119 				}
1120 
1121 				/* Get paddr for each 2MB chunk in this address range */
1122 				while (len > 0) {
1123 					/* Get the physical address from /proc/self/pagemap. */
1124 					paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
1125 
1126 					if (paddr == SPDK_VTOPHYS_ERROR) {
1127 						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1128 						return -EFAULT;
1129 					}
1130 
1131 					if (paddr & MASK_2MB) {
1132 						DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
1133 						return -EINVAL;
1134 					}
1135 #if VFIO_ENABLED
1136 					/* If the IOMMU is on, but DPDK is using iova-mode=pa, register this
1137 					 * memory with the IOMMU using the physical address so the IOVA matches. */
1138 					if (spdk_iommu_is_enabled()) {
1139 						rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
1140 						if (rc) {
1141 							DEBUG_PRINT("Unable to assign vaddr %p to paddr 0x%" PRIx64 "\n", vaddr, paddr);
1142 							return -EFAULT;
1143 						}
1144 					}
1145 #endif
1146 
1147 					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1148 					if (rc != 0) {
1149 						return rc;
1150 					}
1151 
1152 					vaddr += VALUE_2MB;
1153 					len -= VALUE_2MB;
1154 				}
1155 			}
1156 		} else {
1157 			/* This is an address managed by DPDK. Just setup the translations. */
1158 			while (len > 0) {
1159 				paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
1160 				if (paddr == SPDK_VTOPHYS_ERROR) {
1161 					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1162 					return -EFAULT;
1163 				}
1164 
1165 				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1166 				if (rc != 0) {
1167 					return rc;
1168 				}
1169 
1170 				vaddr += VALUE_2MB;
1171 				len -= VALUE_2MB;
1172 			}
1173 		}
1174 
1175 		break;
1176 	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
1177 #if VFIO_ENABLED
1178 		if (paddr == SPDK_VTOPHYS_ERROR) {
1179 			/*
1180 			 * This is not an address that DPDK is managing.
1181 			 */
1182 
1183 			/* Check if this is a PCI BAR. They need special handling */
1184 			paddr = vtophys_get_paddr_pci((uint64_t)vaddr, len);
1185 			if (paddr != SPDK_VTOPHYS_ERROR) {
1186 				/* Get paddr for each 2MB chunk in this address range */
1187 				while (len > 0) {
1188 					paddr = vtophys_get_paddr_pci((uint64_t)vaddr, VALUE_2MB);
1189 					if (paddr == SPDK_VTOPHYS_ERROR) {
1190 						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1191 						return -EFAULT;
1192 					}
1193 
1194 					rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
1195 					if (rc != 0) {
1196 						return rc;
1197 					}
1198 
1199 					vaddr += VALUE_2MB;
1200 					len -= VALUE_2MB;
1201 				}
1202 
1203 				return 0;
1204 			}
1205 
1206 			/* If vfio is enabled,
1207 			 * we need to unmap the range from the IOMMU
1208 			 */
1209 			if (spdk_iommu_is_enabled()) {
1210 				uint64_t buffer_len = len;
1211 				uint8_t *va = vaddr;
1212 				enum rte_iova_mode iova_mode;
1213 
1214 				iova_mode = rte_eal_iova_mode();
1215 				/*
1216 				 * In virtual address mode, the region is contiguous and can be done in
1217 				 * one unmap.
1218 				 */
1219 				if (iova_mode == RTE_IOVA_VA) {
1220 					paddr = spdk_mem_map_translate(map, (uint64_t)va, &buffer_len);
1221 					if (buffer_len != len || paddr != (uintptr_t)va) {
1222 						DEBUG_PRINT("Unmapping %p with length %lu failed because "
1223 							    "translation had address 0x%" PRIx64 " and length %lu\n",
1224 							    va, len, paddr, buffer_len);
1225 						return -EINVAL;
1226 					}
1227 					rc = vtophys_iommu_unmap_dma(paddr, len);
1228 					if (rc) {
1229 						DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
1230 						return -EFAULT;
1231 					}
1232 				} else if (iova_mode == RTE_IOVA_PA) {
1233 					/* Get paddr for each 2MB chunk in this address range */
1234 					while (buffer_len > 0) {
1235 						paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);
1236 
1237 						if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
1238 							DEBUG_PRINT("could not get phys addr for %p\n", va);
1239 							return -EFAULT;
1240 						}
1241 
1242 						rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
1243 						if (rc) {
1244 							DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
1245 							return -EFAULT;
1246 						}
1247 
1248 						va += VALUE_2MB;
1249 						buffer_len -= VALUE_2MB;
1250 					}
1251 				}
1252 			}
1253 		}
1254 #endif
1255 		while (len > 0) {
1256 			rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
1257 			if (rc != 0) {
1258 				return rc;
1259 			}
1260 
1261 			vaddr += VALUE_2MB;
1262 			len -= VALUE_2MB;
1263 		}
1264 
1265 		break;
1266 	default:
1267 		SPDK_UNREACHABLE();
1268 	}
1269 
1270 	return rc;
1271 }
1272 
1273 static int
1274 vtophys_check_contiguous_entries(uint64_t paddr1, uint64_t paddr2)
1275 {
1276 	/* This function is always called with paddrs for two consecutive
1277 	 * 2MB chunks in virtual address space, so those chunks are physically
1278 	 * contiguous only if the physical addresses are exactly 2MB apart
1279 	 * from each other as well.
1280 	 */
1281 	return (paddr2 - paddr1 == VALUE_2MB);
1282 }
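
/* For example, paddr1 = 0x100200000 and paddr2 = 0x100400000 are treated as
 * contiguous (exactly VALUE_2MB apart), while paddr1 = 0x100200000 and
 * paddr2 = 0x200000000 are not, even though each chunk is individually valid.
 */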
1283 
1284 #if VFIO_ENABLED
1285 
1286 static bool
1287 vfio_enabled(void)
1288 {
1289 	return rte_vfio_is_enabled("vfio_pci");
1290 }
1291 
1292 /* Check if IOMMU is enabled on the system */
1293 static bool
1294 has_iommu_groups(void)
1295 {
1296 	int count = 0;
1297 	DIR *dir = opendir("/sys/kernel/iommu_groups");
1298 
1299 	if (dir == NULL) {
1300 		return false;
1301 	}
1302 
1303 	while (count < 3 && readdir(dir) != NULL) {
1304 		count++;
1305 	}
1306 
1307 	closedir(dir);
1308 	/* there will always be ./ and ../ entries */
1309 	return count > 2;
1310 }
1311 
1312 static bool
1313 vfio_noiommu_enabled(void)
1314 {
1315 	return rte_vfio_noiommu_is_enabled();
1316 }
1317 
1318 static void
1319 vtophys_iommu_init(void)
1320 {
1321 	char proc_fd_path[PATH_MAX + 1];
1322 	char link_path[PATH_MAX + 1];
1323 	const char vfio_path[] = "/dev/vfio/vfio";
1324 	DIR *dir;
1325 	struct dirent *d;
1326 
1327 	if (!vfio_enabled()) {
1328 		return;
1329 	}
1330 
1331 	if (vfio_noiommu_enabled()) {
1332 		g_vfio.noiommu_enabled = true;
1333 	} else if (!has_iommu_groups()) {
1334 		return;
1335 	}
1336 
1337 	dir = opendir("/proc/self/fd");
1338 	if (!dir) {
1339 		DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
1340 		return;
1341 	}
1342 
1343 	while ((d = readdir(dir)) != NULL) {
1344 		if (d->d_type != DT_LNK) {
1345 			continue;
1346 		}
1347 
1348 		snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
1349 		if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
1350 			continue;
1351 		}
1352 
1353 		if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
1354 			sscanf(d->d_name, "%d", &g_vfio.fd);
1355 			break;
1356 		}
1357 	}
1358 
1359 	closedir(dir);
1360 
1361 	if (g_vfio.fd < 0) {
1362 		DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
1363 		return;
1364 	}
1365 
1366 	g_vfio.enabled = true;
1367 
1368 	return;
1369 }
1370 
1371 #endif
1372 
1373 void
1374 vtophys_pci_device_added(struct rte_pci_device *pci_device)
1375 {
1376 	struct spdk_vtophys_pci_device *vtophys_dev;
1377 
1378 	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1379 
1380 	vtophys_dev = calloc(1, sizeof(*vtophys_dev));
1381 	if (vtophys_dev) {
1382 		vtophys_dev->pci_device = pci_device;
1383 		TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
1384 	} else {
1385 		DEBUG_PRINT("Memory allocation error\n");
1386 	}
1387 	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1388 
1389 #if VFIO_ENABLED
1390 	struct spdk_vfio_dma_map *dma_map;
1391 	int ret;
1392 
1393 	if (!g_vfio.enabled) {
1394 		return;
1395 	}
1396 
1397 	pthread_mutex_lock(&g_vfio.mutex);
1398 	g_vfio.device_ref++;
1399 	if (g_vfio.device_ref > 1) {
1400 		pthread_mutex_unlock(&g_vfio.mutex);
1401 		return;
1402 	}
1403 
1404 	/* This is the first SPDK device using DPDK vfio. This means that the first
1405 	 * IOMMU group might have just been added to the DPDK vfio container.
1406 	 * From this point on, it is certain that the memory can be mapped.
1407 	 */
1408 	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
1409 		ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
1410 		if (ret) {
1411 			DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
1412 			break;
1413 		}
1414 	}
1415 	pthread_mutex_unlock(&g_vfio.mutex);
1416 #endif
1417 }
1418 
1419 void
1420 vtophys_pci_device_removed(struct rte_pci_device *pci_device)
1421 {
1422 	struct spdk_vtophys_pci_device *vtophys_dev;
1423 
1424 	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1425 	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
1426 		if (vtophys_dev->pci_device == pci_device) {
1427 			TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
1428 			free(vtophys_dev);
1429 			break;
1430 		}
1431 	}
1432 	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1433 
1434 #if VFIO_ENABLED
1435 	struct spdk_vfio_dma_map *dma_map;
1436 	int ret;
1437 
1438 	if (!g_vfio.enabled) {
1439 		return;
1440 	}
1441 
1442 	pthread_mutex_lock(&g_vfio.mutex);
1443 	assert(g_vfio.device_ref > 0);
1444 	g_vfio.device_ref--;
1445 	if (g_vfio.device_ref > 0) {
1446 		pthread_mutex_unlock(&g_vfio.mutex);
1447 		return;
1448 	}
1449 
1450 	/* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
1451 	 * any additional devices using its vfio container, all the mappings
1452 	 * will be automatically removed by the Linux vfio driver. We unmap
1453 	 * the memory manually to be able to easily re-map it later regardless
1454 	 * of other, external factors.
1455 	 */
1456 	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
1457 		struct vfio_iommu_type1_dma_unmap unmap = {};
1458 		unmap.argsz = sizeof(unmap);
1459 		unmap.flags = 0;
1460 		unmap.iova = dma_map->map.iova;
1461 		unmap.size = dma_map->map.size;
1462 		ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
1463 		if (ret) {
1464 			DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
1465 			break;
1466 		}
1467 	}
1468 	pthread_mutex_unlock(&g_vfio.mutex);
1469 #endif
1470 }
1471 
1472 int
1473 vtophys_init(void)
1474 {
1475 	const struct spdk_mem_map_ops vtophys_map_ops = {
1476 		.notify_cb = vtophys_notify,
1477 		.are_contiguous = vtophys_check_contiguous_entries,
1478 	};
1479 
1480 	const struct spdk_mem_map_ops phys_ref_map_ops = {
1481 		.notify_cb = NULL,
1482 		.are_contiguous = NULL,
1483 	};
1484 
1485 #if VFIO_ENABLED
1486 	vtophys_iommu_init();
1487 #endif
1488 
1489 	g_phys_ref_map = spdk_mem_map_alloc(0, &phys_ref_map_ops, NULL);
1490 	if (g_phys_ref_map == NULL) {
1491 		DEBUG_PRINT("phys_ref map allocation failed.\n");
1492 		return -ENOMEM;
1493 	}
1494 
1495 	if (g_huge_pages) {
1496 		g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
1497 		if (g_vtophys_map == NULL) {
1498 			DEBUG_PRINT("vtophys map allocation failed\n");
1499 			spdk_mem_map_free(&g_phys_ref_map);
1500 			return -ENOMEM;
1501 		}
1502 	}
1503 	return 0;
1504 }
1505 
1506 uint64_t
1507 spdk_vtophys(const void *buf, uint64_t *size)
1508 {
1509 	uint64_t vaddr, paddr_2mb;
1510 
1511 	if (!g_huge_pages) {
1512 		return SPDK_VTOPHYS_ERROR;
1513 	}
1514 
1515 	vaddr = (uint64_t)buf;
1516 	paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);
1517 
1518 	/*
1519 	 * SPDK_VTOPHYS_ERROR has all bits set. If we still combined the translation with
1520 	 * the buf offset using bitwise-or, a failed lookup would stay SPDK_VTOPHYS_ERROR.
1521 	 * However, PCI vtophys translations can be unaligned, so we use + rather than |
1522 	 * and must therefore check the return value before the addition.
1523 	 */
1524 	SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
1525 	if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
1526 		return SPDK_VTOPHYS_ERROR;
1527 	} else {
1528 		return paddr_2mb + (vaddr & MASK_2MB);
1529 	}
1530 }
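
/* Usage sketch; buf is a hypothetical buffer obtained from spdk_dma_malloc()
 * or other registered, hugepage-backed memory:
 *
 *   uint64_t len = io_size;
 *   uint64_t phys = spdk_vtophys(buf, &len);
 *   if (phys == SPDK_VTOPHYS_ERROR) {
 *           ... buf is not registered with the vtophys map ...
 *   }
 *
 * On return, len is trimmed to the number of physically contiguous bytes
 * starting at buf, so callers building scatter-gather lists can advance buf
 * by len and call spdk_vtophys() again until the whole buffer is covered.
 */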
1531 
1532 int
1533 spdk_mem_get_fd_and_offset(void *vaddr, uint64_t *offset)
1534 {
1535 	struct rte_memseg *seg;
1536 	int ret, fd;
1537 
1538 	seg = rte_mem_virt2memseg(vaddr, NULL);
1539 	if (!seg) {
1540 		SPDK_ERRLOG("memory %p doesn't exist\n", vaddr);
1541 		return -ENOENT;
1542 	}
1543 
1544 	fd = rte_memseg_get_fd_thread_unsafe(seg);
1545 	if (fd < 0) {
1546 		return fd;
1547 	}
1548 
1549 	ret = rte_memseg_get_fd_offset_thread_unsafe(seg, offset);
1550 	if (ret < 0) {
1551 		return ret;
1552 	}
1553 
1554 	return fd;
1555 }
1556 
1557 void
1558 mem_disable_huge_pages(void)
1559 {
1560 	g_huge_pages = false;
1561 }
1562