xref: /spdk/lib/env_dpdk/memory.c (revision a6dbe3721eb3b5990707fc3e378c95e505dd8ab5)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2017 Intel Corporation.
3  *   All rights reserved.
4  */
5 
6 #include "spdk/stdinc.h"
7 
8 #include "env_internal.h"
9 #include "pci_dpdk.h"
10 
11 #include <rte_config.h>
12 #include <rte_memory.h>
13 #include <rte_eal_memconfig.h>
14 #include <rte_dev.h>
15 #include <rte_pci.h>
16 
17 #include "spdk_internal/assert.h"
18 
19 #include "spdk/assert.h"
20 #include "spdk/likely.h"
21 #include "spdk/queue.h"
22 #include "spdk/util.h"
23 #include "spdk/memory.h"
24 #include "spdk/env_dpdk.h"
25 #include "spdk/log.h"
26 
27 #ifdef __linux__
28 #include <linux/version.h>
29 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
30 #include <linux/vfio.h>
31 #include <rte_vfio.h>
32 
33 struct spdk_vfio_dma_map {
34 	struct vfio_iommu_type1_dma_map map;
35 	TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
36 };
37 
38 struct vfio_cfg {
39 	int fd;
40 	bool enabled;
41 	bool noiommu_enabled;
42 	unsigned device_ref;
43 	TAILQ_HEAD(, spdk_vfio_dma_map) maps;
44 	pthread_mutex_t mutex;
45 };
46 
47 static struct vfio_cfg g_vfio = {
48 	.fd = -1,
49 	.enabled = false,
50 	.noiommu_enabled = false,
51 	.device_ref = 0,
52 	.maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
53 	.mutex = PTHREAD_MUTEX_INITIALIZER
54 };
55 #endif
56 #endif
57 
58 #if DEBUG
59 #define DEBUG_PRINT(...) SPDK_ERRLOG(__VA_ARGS__)
60 #else
61 #define DEBUG_PRINT(...)
62 #endif
63 
64 #define FN_2MB_TO_4KB(fn)	((fn) << (SHIFT_2MB - SHIFT_4KB))
65 #define FN_4KB_TO_2MB(fn)	((fn) >> (SHIFT_2MB - SHIFT_4KB))
66 
67 #define MAP_256TB_IDX(vfn_2mb)	((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
68 #define MAP_1GB_IDX(vfn_2mb)	((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))
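
/* Illustrative sketch (not compiled): how a virtual address is split into the
 * two map indexes above, assuming the usual SHIFT_2MB = 21 and SHIFT_1GB = 30
 * definitions from spdk/memory.h:
 *
 *   uint64_t vaddr     = 0x7f0040200000;          // example 2MB-aligned address
 *   uint64_t vfn_2mb   = vaddr >> SHIFT_2MB;      // 2MB virtual frame number
 *   uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);  // bits [30..47] pick the map_1gb table
 *   uint64_t idx_1gb   = MAP_1GB_IDX(vfn_2mb);    // bits [21..29] pick the map_2mb entry
 */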
69 
70 /* Page is registered */
71 #define REG_MAP_REGISTERED	(1ULL << 62)
72 
73 /* A notification region barrier. The 2MB translation entry that's marked
74  * with this flag must be unregistered separately. This allows contiguous
75  * regions to be unregistered in the same chunks they were registered.
76  */
77 #define REG_MAP_NOTIFY_START	(1ULL << 63)
78 
79 /* Translation of a single 2MB page. */
80 struct map_2mb {
81 	uint64_t translation_2mb;
82 };
83 
84 /* Second-level map table indexed by bits [21..29] of the virtual address.
85  * Each entry contains the address translation, or the map's default
86  * translation for entries that haven't been set yet.
87  */
88 struct map_1gb {
89 	struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
90 };
91 
92 /* Top-level map table indexed by bits [30..47] of the virtual address.
93  * Each entry points to a second-level map table or NULL.
94  */
95 struct map_256tb {
96 	struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
97 };
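
/* Size sketch, assuming the usual SHIFT_2MB/SHIFT_1GB/SHIFT_256TB values of
 * 21/30/48: the top-level table holds 1 << (48 - 30) = 262144 pointers (2 MiB
 * per map), and each on-demand map_1gb holds 1 << (30 - 21) = 512 eight-byte
 * entries (4 KiB).
 */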
98 
99 /* Page-granularity memory address translation */
100 struct spdk_mem_map {
101 	struct map_256tb map_256tb;
102 	pthread_mutex_t mutex;
103 	uint64_t default_translation;
104 	struct spdk_mem_map_ops ops;
105 	void *cb_ctx;
106 	TAILQ_ENTRY(spdk_mem_map) tailq;
107 };
108 
109 /* Registrations map. The 64-bit translations are bit fields with the
110  * following layout (starting with the low bits):
111  *    0 - 61 : reserved
112  *   62 - 63 : flags
113  */
114 static struct spdk_mem_map *g_mem_reg_map;
115 static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps =
116 	TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
117 static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;
118 
119 static bool g_legacy_mem;
120 
121 /*
122  * Walk the currently registered memory via the main memory registration map
123  * and call the new map's notify callback for each virtually contiguous region.
124  */
125 static int
126 mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
127 {
128 	size_t idx_256tb;
129 	uint64_t idx_1gb;
130 	uint64_t contig_start = UINT64_MAX;
131 	uint64_t contig_end = UINT64_MAX;
132 	struct map_1gb *map_1gb;
133 	int rc;
134 
135 	if (!g_mem_reg_map) {
136 		return -EINVAL;
137 	}
138 
139 	/* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
140 	pthread_mutex_lock(&g_mem_reg_map->mutex);
141 
142 	for (idx_256tb = 0;
143 	     idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
144 	     idx_256tb++) {
145 		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];
146 
147 		if (!map_1gb) {
148 			if (contig_start != UINT64_MAX) {
149 				/* End of a virtually contiguous range */
150 				rc = map->ops.notify_cb(map->cb_ctx, map, action,
151 							(void *)contig_start,
152 							contig_end - contig_start + VALUE_2MB);
153 				/* Don't bother handling unregister failures. It can't be any worse */
154 				if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
155 					goto err_unregister;
156 				}
157 			}
158 			contig_start = UINT64_MAX;
159 			continue;
160 		}
161 
162 		for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
163 			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
164 			    (contig_start == UINT64_MAX ||
165 			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
166 				/* Rebuild the virtual address from the indexes */
167 				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
168 
169 				if (contig_start == UINT64_MAX) {
170 					contig_start = vaddr;
171 				}
172 
173 				contig_end = vaddr;
174 			} else {
175 				if (contig_start != UINT64_MAX) {
176 					/* End of a virtually contiguous range */
177 					rc = map->ops.notify_cb(map->cb_ctx, map, action,
178 								(void *)contig_start,
179 								contig_end - contig_start + VALUE_2MB);
180 					/* Don't bother handling unregister failures. It can't be any worse */
181 					if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
182 						goto err_unregister;
183 					}
184 
185 					/* This page might be part of a neighbouring region, so process
186 					 * it again. The idx_1gb will be incremented immediately.
187 					 */
188 					idx_1gb--;
189 				}
190 				contig_start = UINT64_MAX;
191 			}
192 		}
193 	}
194 
195 	pthread_mutex_unlock(&g_mem_reg_map->mutex);
196 	return 0;
197 
198 err_unregister:
199 	/* Unwind to the first empty translation so we don't unregister
200 	 * a region that just failed to register.
201 	 */
202 	idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
203 	idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
204 	contig_start = UINT64_MAX;
205 	contig_end = UINT64_MAX;
206 
207 	/* Unregister any memory we managed to register before the failure */
208 	for (; idx_256tb < SIZE_MAX; idx_256tb--) {
209 		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];
210 
211 		if (!map_1gb) {
212 			if (contig_end != UINT64_MAX) {
213 				/* End of a virtually contiguous range */
214 				map->ops.notify_cb(map->cb_ctx, map,
215 						   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
216 						   (void *)contig_start,
217 						   contig_end - contig_start + VALUE_2MB);
218 			}
219 			contig_end = UINT64_MAX;
220 			continue;
221 		}
222 
223 		for (; idx_1gb < UINT64_MAX; idx_1gb--) {
224 			/* Rebuild the virtual address from the indexes */
225 			uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
226 			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
227 			    (contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
228 
229 				if (contig_end == UINT64_MAX) {
230 					contig_end = vaddr;
231 				}
232 				contig_start = vaddr;
233 			} else {
234 				if (contig_end != UINT64_MAX) {
235 					if (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) {
236 						contig_start = vaddr;
237 					}
238 					/* End of a virtually contiguous range */
239 					map->ops.notify_cb(map->cb_ctx, map,
240 							   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
241 							   (void *)contig_start,
242 							   contig_end - contig_start + VALUE_2MB);
243 				}
244 				contig_end = UINT64_MAX;
245 			}
246 		}
247 		idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
248 	}
249 
250 	pthread_mutex_unlock(&g_mem_reg_map->mutex);
251 	return rc;
252 }
253 
254 struct spdk_mem_map *
255 spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
256 {
257 	struct spdk_mem_map *map;
258 	int rc;
259 	size_t i;
260 
261 	map = calloc(1, sizeof(*map));
262 	if (map == NULL) {
263 		return NULL;
264 	}
265 
266 	if (pthread_mutex_init(&map->mutex, NULL)) {
267 		free(map);
268 		return NULL;
269 	}
270 
271 	map->default_translation = default_translation;
272 	map->cb_ctx = cb_ctx;
273 	if (ops) {
274 		map->ops = *ops;
275 	}
276 
277 	if (ops && ops->notify_cb) {
278 		pthread_mutex_lock(&g_spdk_mem_map_mutex);
279 		rc = mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
280 		if (rc != 0) {
281 			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
282 			DEBUG_PRINT("Initial mem_map notify failed\n");
283 			pthread_mutex_destroy(&map->mutex);
284 			for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
285 				free(map->map_256tb.map[i]);
286 			}
287 			free(map);
288 			return NULL;
289 		}
290 		TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
291 		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
292 	}
293 
294 	return map;
295 }
296 
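/* Usage sketch (not compiled): a consumer map that mirrors registered memory,
 * e.g. to keep a driver-specific handle per 2MB page. The callback and variable
 * names here are hypothetical; only spdk_mem_map_alloc()/spdk_mem_map_free()
 * and the spdk_mem_map_ops fields come from the public API.
 *
 *   static int
 *   my_mem_notify(void *cb_ctx, struct spdk_mem_map *map,
 *                 enum spdk_mem_map_notify_action action, void *vaddr, size_t size)
 *   {
 *           // REGISTER: set up a translation for [vaddr, vaddr + size)
 *           // UNREGISTER: tear it down again
 *           return 0;
 *   }
 *
 *   static const struct spdk_mem_map_ops my_ops = {
 *           .notify_cb = my_mem_notify,
 *           .are_contiguous = NULL,
 *   };
 *
 *   struct spdk_mem_map *my_map = spdk_mem_map_alloc(0, &my_ops, NULL);
 *   ...
 *   spdk_mem_map_free(&my_map);
 */
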
297 void
298 spdk_mem_map_free(struct spdk_mem_map **pmap)
299 {
300 	struct spdk_mem_map *map;
301 	size_t i;
302 
303 	if (!pmap) {
304 		return;
305 	}
306 
307 	map = *pmap;
308 
309 	if (!map) {
310 		return;
311 	}
312 
313 	if (map->ops.notify_cb) {
314 		pthread_mutex_lock(&g_spdk_mem_map_mutex);
315 		mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
316 		TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
317 		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
318 	}
319 
320 	for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
321 		free(map->map_256tb.map[i]);
322 	}
323 
324 	pthread_mutex_destroy(&map->mutex);
325 
326 	free(map);
327 	*pmap = NULL;
328 }
329 
330 int
331 spdk_mem_register(void *vaddr, size_t len)
332 {
333 	struct spdk_mem_map *map;
334 	int rc;
335 	void *seg_vaddr;
336 	size_t seg_len;
337 	uint64_t reg;
338 
339 	if ((uintptr_t)vaddr & ~MASK_256TB) {
340 		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
341 		return -EINVAL;
342 	}
343 
344 	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
345 		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
346 			    __func__, vaddr, len);
347 		return -EINVAL;
348 	}
349 
350 	if (len == 0) {
351 		return 0;
352 	}
353 
354 	pthread_mutex_lock(&g_spdk_mem_map_mutex);
355 
356 	seg_vaddr = vaddr;
357 	seg_len = len;
358 	while (seg_len > 0) {
359 		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
360 		if (reg & REG_MAP_REGISTERED) {
361 			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
362 			return -EBUSY;
363 		}
364 		seg_vaddr += VALUE_2MB;
365 		seg_len -= VALUE_2MB;
366 	}
367 
368 	seg_vaddr = vaddr;
369 	seg_len = 0;
370 	while (len > 0) {
371 		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
372 					     seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
373 		seg_len += VALUE_2MB;
374 		vaddr += VALUE_2MB;
375 		len -= VALUE_2MB;
376 	}
377 
378 	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
379 		rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
380 		if (rc != 0) {
381 			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
382 			return rc;
383 		}
384 	}
385 
386 	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
387 	return 0;
388 }
389 
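/* Usage sketch (not compiled): registering an application-owned buffer so that
 * every registered map gets a REGISTER notification for it (e.g. the vtophys
 * map below). Both the address and the length must be multiples of 2MB; `buf`
 * and `len` are hypothetical.
 *
 *   if (spdk_mem_register(buf, len) == 0) {
 *           ... perform I/O that requires the memory to be registered ...
 *           spdk_mem_unregister(buf, len);
 *   }
 */
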
390 int
391 spdk_mem_unregister(void *vaddr, size_t len)
392 {
393 	struct spdk_mem_map *map;
394 	int rc;
395 	void *seg_vaddr;
396 	size_t seg_len;
397 	uint64_t reg, newreg;
398 
399 	if ((uintptr_t)vaddr & ~MASK_256TB) {
400 		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
401 		return -EINVAL;
402 	}
403 
404 	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
405 		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
406 			    __func__, vaddr, len);
407 		return -EINVAL;
408 	}
409 
410 	pthread_mutex_lock(&g_spdk_mem_map_mutex);
411 
412 	/* The first page must be the start of a region. Also check if it's
413 	 * registered to make sure we don't return -ERANGE for non-registered
414 	 * regions.
415 	 */
416 	reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
417 	if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
418 		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
419 		return -ERANGE;
420 	}
421 
422 	seg_vaddr = vaddr;
423 	seg_len = len;
424 	while (seg_len > 0) {
425 		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
426 		if ((reg & REG_MAP_REGISTERED) == 0) {
427 			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
428 			return -EINVAL;
429 		}
430 		seg_vaddr += VALUE_2MB;
431 		seg_len -= VALUE_2MB;
432 	}
433 
434 	newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
435 	/* If the next page is registered, it must be the start of a region as well,
436 	 * otherwise we'd be unregistering only part of a region.
437 	 */
438 	if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
439 		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
440 		return -ERANGE;
441 	}
442 	seg_vaddr = vaddr;
443 	seg_len = 0;
444 
445 	while (len > 0) {
446 		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
447 		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);
448 
449 		if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
450 			TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
451 				rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
452 				if (rc != 0) {
453 					pthread_mutex_unlock(&g_spdk_mem_map_mutex);
454 					return rc;
455 				}
456 			}
457 
458 			seg_vaddr = vaddr;
459 			seg_len = VALUE_2MB;
460 		} else {
461 			seg_len += VALUE_2MB;
462 		}
463 
464 		vaddr += VALUE_2MB;
465 		len -= VALUE_2MB;
466 	}
467 
468 	if (seg_len > 0) {
469 		TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
470 			rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
471 			if (rc != 0) {
472 				pthread_mutex_unlock(&g_spdk_mem_map_mutex);
473 				return rc;
474 			}
475 		}
476 	}
477 
478 	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
479 	return 0;
480 }
481 
482 int
483 spdk_mem_reserve(void *vaddr, size_t len)
484 {
485 	struct spdk_mem_map *map;
486 	void *seg_vaddr;
487 	size_t seg_len;
488 	uint64_t reg;
489 
490 	if ((uintptr_t)vaddr & ~MASK_256TB) {
491 		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
492 		return -EINVAL;
493 	}
494 
495 	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
496 		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
497 			    __func__, vaddr, len);
498 		return -EINVAL;
499 	}
500 
501 	if (len == 0) {
502 		return 0;
503 	}
504 
505 	pthread_mutex_lock(&g_spdk_mem_map_mutex);
506 
507 	/* Check if any part of this range is already registered */
508 	seg_vaddr = vaddr;
509 	seg_len = len;
510 	while (seg_len > 0) {
511 		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
512 		if (reg & REG_MAP_REGISTERED) {
513 			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
514 			return -EBUSY;
515 		}
516 		seg_vaddr += VALUE_2MB;
517 		seg_len -= VALUE_2MB;
518 	}
519 
520 	/* Simply set the translation to the memory map's default. This allocates the space in the
521 	 * map but does not provide a valid translation. */
522 	spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, len,
523 				     g_mem_reg_map->default_translation);
524 
525 	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
526 		spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, map->default_translation);
527 	}
528 
529 	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
530 	return 0;
531 }
532 
533 static struct map_1gb *
534 mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
535 {
536 	struct map_1gb *map_1gb;
537 	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
538 	size_t i;
539 
540 	if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
541 		return NULL;
542 	}
543 
544 	map_1gb = map->map_256tb.map[idx_256tb];
545 
546 	if (!map_1gb) {
547 		pthread_mutex_lock(&map->mutex);
548 
549 		/* Recheck to make sure nobody else got the mutex first. */
550 		map_1gb = map->map_256tb.map[idx_256tb];
551 		if (!map_1gb) {
552 			map_1gb = malloc(sizeof(struct map_1gb));
553 			if (map_1gb) {
554 				/* initialize all entries to default translation */
555 				for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
556 					map_1gb->map[i].translation_2mb = map->default_translation;
557 				}
558 				map->map_256tb.map[idx_256tb] = map_1gb;
559 			}
560 		}
561 
562 		pthread_mutex_unlock(&map->mutex);
563 
564 		if (!map_1gb) {
565 			DEBUG_PRINT("allocation failed\n");
566 			return NULL;
567 		}
568 	}
569 
570 	return map_1gb;
571 }
572 
573 int
574 spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
575 			     uint64_t translation)
576 {
577 	uint64_t vfn_2mb;
578 	struct map_1gb *map_1gb;
579 	uint64_t idx_1gb;
580 	struct map_2mb *map_2mb;
581 
582 	if ((uintptr_t)vaddr & ~MASK_256TB) {
583 		DEBUG_PRINT("invalid usermode virtual address %" PRIu64 "\n", vaddr);
584 		return -EINVAL;
585 	}
586 
587 	/* For now, only 2 MB-aligned registrations are supported */
588 	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
589 		DEBUG_PRINT("invalid %s parameters, vaddr=%" PRIu64 " len=%" PRIu64 "\n",
590 			    __func__, vaddr, size);
591 		return -EINVAL;
592 	}
593 
594 	vfn_2mb = vaddr >> SHIFT_2MB;
595 
596 	while (size) {
597 		map_1gb = mem_map_get_map_1gb(map, vfn_2mb);
598 		if (!map_1gb) {
599 			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
600 			return -ENOMEM;
601 		}
602 
603 		idx_1gb = MAP_1GB_IDX(vfn_2mb);
604 		map_2mb = &map_1gb->map[idx_1gb];
605 		map_2mb->translation_2mb = translation;
606 
607 		size -= VALUE_2MB;
608 		vfn_2mb++;
609 	}
610 
611 	return 0;
612 }
613 
614 int
615 spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
616 {
617 	return spdk_mem_map_set_translation(map, vaddr, size, map->default_translation);
618 }
619 
620 inline uint64_t
621 spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
622 {
623 	const struct map_1gb *map_1gb;
624 	const struct map_2mb *map_2mb;
625 	uint64_t idx_256tb;
626 	uint64_t idx_1gb;
627 	uint64_t vfn_2mb;
628 	uint64_t cur_size;
629 	uint64_t prev_translation;
630 	uint64_t orig_translation;
631 
632 	if (spdk_unlikely(vaddr & ~MASK_256TB)) {
633 		DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
634 		return map->default_translation;
635 	}
636 
637 	vfn_2mb = vaddr >> SHIFT_2MB;
638 	idx_256tb = MAP_256TB_IDX(vfn_2mb);
639 	idx_1gb = MAP_1GB_IDX(vfn_2mb);
640 
641 	map_1gb = map->map_256tb.map[idx_256tb];
642 	if (spdk_unlikely(!map_1gb)) {
643 		return map->default_translation;
644 	}
645 
646 	cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
647 	map_2mb = &map_1gb->map[idx_1gb];
648 	if (size == NULL || map->ops.are_contiguous == NULL ||
649 	    map_2mb->translation_2mb == map->default_translation) {
650 		if (size != NULL) {
651 			*size = spdk_min(*size, cur_size);
652 		}
653 		return map_2mb->translation_2mb;
654 	}
655 
656 	orig_translation = map_2mb->translation_2mb;
657 	prev_translation = orig_translation;
658 	while (cur_size < *size) {
659 		vfn_2mb++;
660 		idx_256tb = MAP_256TB_IDX(vfn_2mb);
661 		idx_1gb = MAP_1GB_IDX(vfn_2mb);
662 
663 		map_1gb = map->map_256tb.map[idx_256tb];
664 		if (spdk_unlikely(!map_1gb)) {
665 			break;
666 		}
667 
668 		map_2mb = &map_1gb->map[idx_1gb];
669 		if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
670 			break;
671 		}
672 
673 		cur_size += VALUE_2MB;
674 		prev_translation = map_2mb->translation_2mb;
675 	}
676 
677 	*size = spdk_min(*size, cur_size);
678 	return orig_translation;
679 }
680 
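/* Usage sketch (not compiled): translating a range. On input *size is the
 * number of bytes the caller wants covered; on output it is clamped to how many
 * bytes the returned translation is valid for (neighbouring 2MB entries are
 * merged only when the map provides an are_contiguous callback). `map`,
 * `io_buf` and `io_len` are hypothetical.
 *
 *   uint64_t len = io_len;
 *   uint64_t tr = spdk_mem_map_translate(map, (uint64_t)io_buf, &len);
 *   // tr is the stored translation, or the map's default_translation if
 *   // nothing was stored for io_buf. len may have been reduced.
 *   if (len < io_len) {
 *           // the translation only covers the first len bytes; split the request
 *   }
 */
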
681 static void
682 memory_hotplug_cb(enum rte_mem_event event_type,
683 		  const void *addr, size_t len, void *arg)
684 {
685 	if (event_type == RTE_MEM_EVENT_ALLOC) {
686 		spdk_mem_register((void *)addr, len);
687 
688 		if (!spdk_env_dpdk_external_init()) {
689 			return;
690 		}
691 
692 		/* When the user initializes DPDK separately, we can't
693 		 * be sure that the --match-allocations RTE flag was specified.
694 		 * Without this flag, DPDK can free memory in different units
695 		 * than it was allocated in, which doesn't work with things like RDMA MRs.
696 		 *
697 		 * For such cases, we mark segments so they aren't freed.
698 		 */
699 		while (len > 0) {
700 			struct rte_memseg *seg;
701 
702 			seg = rte_mem_virt2memseg(addr, NULL);
703 			assert(seg != NULL);
704 			seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
705 			addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
706 			len -= seg->hugepage_sz;
707 		}
708 	} else if (event_type == RTE_MEM_EVENT_FREE) {
709 		spdk_mem_unregister((void *)addr, len);
710 	}
711 }
712 
713 static int
714 memory_iter_cb(const struct rte_memseg_list *msl,
715 	       const struct rte_memseg *ms, size_t len, void *arg)
716 {
717 	return spdk_mem_register(ms->addr, len);
718 }
719 
720 int
721 mem_map_init(bool legacy_mem)
722 {
723 	g_legacy_mem = legacy_mem;
724 
725 	g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
726 	if (g_mem_reg_map == NULL) {
727 		DEBUG_PRINT("memory registration map allocation failed\n");
728 		return -ENOMEM;
729 	}
730 
731 	/*
732 	 * Walk all DPDK memory segments and register them
733 	 * with the main memory map
734 	 */
735 	rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
736 	rte_memseg_contig_walk(memory_iter_cb, NULL);
737 	return 0;
738 }
739 
740 bool
741 spdk_iommu_is_enabled(void)
742 {
743 #if VFIO_ENABLED
744 	return g_vfio.enabled && !g_vfio.noiommu_enabled;
745 #else
746 	return false;
747 #endif
748 }
749 
750 struct spdk_vtophys_pci_device {
751 	struct rte_pci_device *pci_device;
752 	TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
753 };
754 
755 static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
756 static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
757 	TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);
758 
759 static struct spdk_mem_map *g_vtophys_map;
760 static struct spdk_mem_map *g_phys_ref_map;
761 
762 #if VFIO_ENABLED
763 static int
764 _vfio_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
765 {
766 	struct spdk_vfio_dma_map *dma_map;
767 	int ret;
768 
769 	dma_map = calloc(1, sizeof(*dma_map));
770 	if (dma_map == NULL) {
771 		return -ENOMEM;
772 	}
773 
774 	dma_map->map.argsz = sizeof(dma_map->map);
775 	dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
776 	dma_map->map.vaddr = vaddr;
777 	dma_map->map.iova = iova;
778 	dma_map->map.size = size;
779 
780 	if (g_vfio.device_ref == 0) {
781 		/* VFIO requires at least one device (IOMMU group) to be added to
782 		 * a VFIO container before it is possible to perform any IOMMU
783 		 * operations on that container. This memory will be mapped once
784 		 * the first device (IOMMU group) is hotplugged.
785 		 *
786 		 * Since the vfio container is managed internally by DPDK, it is
787 		 * also possible that some device is already in that container, but
788 		 * it's not managed by SPDK - e.g. a NIC attached internally
789 		 * inside DPDK. We could map the memory straight away in such
790 		 * scenario, but there's no need to do it. DPDK devices clearly
791 		 * don't need our mappings and hence we defer the mapping
792 		 * unconditionally until the first SPDK-managed device is
793 		 * hotplugged.
794 		 */
795 		goto out_insert;
796 	}
797 
798 	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
799 	if (ret) {
800 		/* In some cases the vfio container doesn't have an IOMMU group; it's safe to ignore the error in that case */
801 		SPDK_NOTICELOG("Cannot set up DMA mapping, error %d, ignored\n", errno);
802 	}
803 
804 out_insert:
805 	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
806 	return 0;
807 }
808 
809 
810 static int
811 vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
812 {
813 	uint64_t refcount;
814 	int ret;
815 
816 	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
817 	assert(refcount < UINT64_MAX);
818 	if (refcount > 0) {
819 		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
820 		return 0;
821 	}
822 
823 	pthread_mutex_lock(&g_vfio.mutex);
824 	ret = _vfio_iommu_map_dma(vaddr, iova, size);
825 	pthread_mutex_unlock(&g_vfio.mutex);
826 	if (ret) {
827 		return ret;
828 	}
829 
830 	spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
831 	return 0;
832 }
833 
834 int
835 vtophys_iommu_map_dma_bar(uint64_t vaddr, uint64_t iova, uint64_t size)
836 {
837 	int ret;
838 
839 	pthread_mutex_lock(&g_vfio.mutex);
840 	ret = _vfio_iommu_map_dma(vaddr, iova, size);
841 	pthread_mutex_unlock(&g_vfio.mutex);
842 
843 	return ret;
844 }
845 
846 static int
847 _vfio_iommu_unmap_dma(struct spdk_vfio_dma_map *dma_map)
848 {
849 	struct vfio_iommu_type1_dma_unmap unmap = {};
850 	int ret;
851 
852 	if (g_vfio.device_ref == 0) {
853 		/* Memory is not mapped anymore, just remove its references */
854 		goto out_remove;
855 	}
856 
857 	unmap.argsz = sizeof(unmap);
858 	unmap.flags = 0;
859 	unmap.iova = dma_map->map.iova;
860 	unmap.size = dma_map->map.size;
861 	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
862 	if (ret) {
863 		SPDK_NOTICELOG("Cannot clear DMA mapping, error %d, ignored\n", errno);
864 	}
865 
866 out_remove:
867 	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
868 	free(dma_map);
869 	return 0;
870 }
871 
872 static int
873 vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
874 {
875 	struct spdk_vfio_dma_map *dma_map;
876 	uint64_t refcount;
877 	int ret;
878 
879 	pthread_mutex_lock(&g_vfio.mutex);
880 	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
881 		if (dma_map->map.iova == iova) {
882 			break;
883 		}
884 	}
885 
886 	if (dma_map == NULL) {
887 		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
888 		pthread_mutex_unlock(&g_vfio.mutex);
889 		return -ENXIO;
890 	}
891 
892 	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
893 	assert(refcount < UINT64_MAX);
894 	if (refcount > 0) {
895 		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount - 1);
896 	}
897 
898 	/* We still have outstanding references, don't clear the mapping. */
899 	if (refcount > 1) {
900 		pthread_mutex_unlock(&g_vfio.mutex);
901 		return 0;
902 	}
903 
904 	/* We don't support partial or multi-page unmaps for now */
905 	assert(dma_map->map.size == size);
906 
907 	ret = _vfio_iommu_unmap_dma(dma_map);
908 	pthread_mutex_unlock(&g_vfio.mutex);
909 
910 	return ret;
911 }
912 
913 int
914 vtophys_iommu_unmap_dma_bar(uint64_t vaddr)
915 {
916 	struct spdk_vfio_dma_map *dma_map;
917 	int ret;
918 
919 	pthread_mutex_lock(&g_vfio.mutex);
920 	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
921 		if (dma_map->map.vaddr == vaddr) {
922 			break;
923 		}
924 	}
925 
926 	if (dma_map == NULL) {
927 		DEBUG_PRINT("Cannot clear DMA mapping for address %"PRIx64" - it's not mapped\n", vaddr);
928 		pthread_mutex_unlock(&g_vfio.mutex);
929 		return -ENXIO;
930 	}
931 
932 	ret = _vfio_iommu_unmap_dma(dma_map);
933 	pthread_mutex_unlock(&g_vfio.mutex);
934 	return ret;
935 }
936 #endif
937 
938 static uint64_t
939 vtophys_get_paddr_memseg(uint64_t vaddr)
940 {
941 	uintptr_t paddr;
942 	struct rte_memseg *seg;
943 
944 	seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
945 	if (seg != NULL) {
946 		paddr = seg->iova;
947 		if (paddr == RTE_BAD_IOVA) {
948 			return SPDK_VTOPHYS_ERROR;
949 		}
950 		paddr += (vaddr - (uintptr_t)seg->addr);
951 		return paddr;
952 	}
953 
954 	return SPDK_VTOPHYS_ERROR;
955 }
956 
957 /* Try to get the paddr from /proc/self/pagemap */
958 static uint64_t
959 vtophys_get_paddr_pagemap(uint64_t vaddr)
960 {
961 	uintptr_t paddr;
962 
963 	/* Silence static analyzers */
964 	assert(vaddr != 0);
965 	paddr = rte_mem_virt2iova((void *)vaddr);
966 	if (paddr == RTE_BAD_IOVA) {
967 		/*
968 		 * The vaddr may be valid but doesn't have a backing page
969 		 * assigned yet.  Touch the page to ensure a backing page
970 		 * gets assigned, then try to translate again.
971 		 */
972 		rte_atomic64_read((rte_atomic64_t *)vaddr);
973 		paddr = rte_mem_virt2iova((void *)vaddr);
974 	}
975 	if (paddr == RTE_BAD_IOVA) {
976 		/* Unable to get to the physical address. */
977 		return SPDK_VTOPHYS_ERROR;
978 	}
979 
980 	return paddr;
981 }
982 
983 static uint64_t
984 pci_device_vtophys(struct rte_pci_device *dev, uint64_t vaddr, size_t len)
985 {
986 	struct rte_mem_resource *res;
987 	uint64_t paddr;
988 	unsigned r;
989 
990 	for (r = 0; r < PCI_MAX_RESOURCE; r++) {
991 		res = dpdk_pci_device_get_mem_resource(dev, r);
992 
993 		if (res->phys_addr == 0 || vaddr < (uint64_t)res->addr ||
994 		    (vaddr + len) >= (uint64_t)res->addr + res->len) {
995 			continue;
996 		}
997 
998 #if VFIO_ENABLED
999 		if (spdk_iommu_is_enabled() && rte_eal_iova_mode() == RTE_IOVA_VA) {
1000 			/*
1001 			 * The IOMMU is on and we're using IOVA == VA. The BAR was
1002 			 * automatically registered when it was mapped, so just return
1003 			 * the virtual address here.
1004 			 */
1005 			return vaddr;
1006 		}
1007 #endif
1008 		paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
1009 		return paddr;
1010 	}
1011 
1012 	return SPDK_VTOPHYS_ERROR;
1013 }
1014 
1015 /* Try to get the paddr from pci devices */
1016 static uint64_t
1017 vtophys_get_paddr_pci(uint64_t vaddr, size_t len)
1018 {
1019 	struct spdk_vtophys_pci_device *vtophys_dev;
1020 	uintptr_t paddr;
1021 	struct rte_pci_device	*dev;
1022 
1023 	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1024 	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
1025 		dev = vtophys_dev->pci_device;
1026 		paddr = pci_device_vtophys(dev, vaddr, len);
1027 		if (paddr != SPDK_VTOPHYS_ERROR) {
1028 			pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1029 			return paddr;
1030 		}
1031 	}
1032 	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1033 
1034 	return SPDK_VTOPHYS_ERROR;
1035 }
1036 
1037 static int
1038 vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
1039 	       enum spdk_mem_map_notify_action action,
1040 	       void *vaddr, size_t len)
1041 {
1042 	int rc = 0;
1043 	uint64_t paddr;
1044 
1045 	if ((uintptr_t)vaddr & ~MASK_256TB) {
1046 		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
1047 		return -EINVAL;
1048 	}
1049 
1050 	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
1051 		DEBUG_PRINT("invalid parameters, vaddr=%p len=%ju\n",
1052 			    vaddr, len);
1053 		return -EINVAL;
1054 	}
1055 
1056 	/* Get the physical address from the DPDK memsegs */
1057 	paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
1058 
1059 	switch (action) {
1060 	case SPDK_MEM_MAP_NOTIFY_REGISTER:
1061 		if (paddr == SPDK_VTOPHYS_ERROR) {
1062 			/* This is not an address that DPDK is managing. */
1063 
1064 			/* Check if this is a PCI BAR. They need special handling */
1065 			paddr = vtophys_get_paddr_pci((uint64_t)vaddr, len);
1066 			if (paddr != SPDK_VTOPHYS_ERROR) {
1067 				/* Get paddr for each 2MB chunk in this address range */
1068 				while (len > 0) {
1069 					paddr = vtophys_get_paddr_pci((uint64_t)vaddr, VALUE_2MB);
1070 					if (paddr == SPDK_VTOPHYS_ERROR) {
1071 						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1072 						return -EFAULT;
1073 					}
1074 
1075 					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1076 					if (rc != 0) {
1077 						return rc;
1078 					}
1079 
1080 					vaddr += VALUE_2MB;
1081 					len -= VALUE_2MB;
1082 				}
1083 
1084 				return 0;
1085 			}
1086 
1087 #if VFIO_ENABLED
1088 			enum rte_iova_mode iova_mode;
1089 
1090 			iova_mode = rte_eal_iova_mode();
1091 
1092 			if (spdk_iommu_is_enabled() && iova_mode == RTE_IOVA_VA) {
1093 				/* We'll use the virtual address as the iova to match DPDK. */
1094 				paddr = (uint64_t)vaddr;
1095 				rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len);
1096 				if (rc) {
1097 					return -EFAULT;
1098 				}
1099 				while (len > 0) {
1100 					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1101 					if (rc != 0) {
1102 						return rc;
1103 					}
1104 					vaddr += VALUE_2MB;
1105 					paddr += VALUE_2MB;
1106 					len -= VALUE_2MB;
1107 				}
1108 			} else
1109 #endif
1110 			{
1111 				/* Get the physical address from /proc/self/pagemap. */
1112 				paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
1113 				if (paddr == SPDK_VTOPHYS_ERROR) {
1114 					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1115 					return -EFAULT;
1116 				}
1117 
1118 				/* Get paddr for each 2MB chunk in this address range */
1119 				while (len > 0) {
1120 					/* Get the physical address from /proc/self/pagemap. */
1121 					paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
1122 
1123 					if (paddr == SPDK_VTOPHYS_ERROR) {
1124 						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1125 						return -EFAULT;
1126 					}
1127 
1128 					if (paddr & MASK_2MB) {
1129 						DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
1130 						return -EINVAL;
1131 					}
1132 #if VFIO_ENABLED
1133 					/* If the IOMMU is on, but DPDK is using iova-mode=pa, we want to register this memory
1134 					 * with the IOMMU using the physical address to match. */
1135 					if (spdk_iommu_is_enabled()) {
1136 						rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
1137 						if (rc) {
1138 							DEBUG_PRINT("Unable to assign vaddr %p to paddr 0x%" PRIx64 "\n", vaddr, paddr);
1139 							return -EFAULT;
1140 						}
1141 					}
1142 #endif
1143 
1144 					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1145 					if (rc != 0) {
1146 						return rc;
1147 					}
1148 
1149 					vaddr += VALUE_2MB;
1150 					len -= VALUE_2MB;
1151 				}
1152 			}
1153 		} else {
1154 			/* This is an address managed by DPDK. Just set up the translations. */
1155 			while (len > 0) {
1156 				paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
1157 				if (paddr == SPDK_VTOPHYS_ERROR) {
1158 					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1159 					return -EFAULT;
1160 				}
1161 
1162 				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1163 				if (rc != 0) {
1164 					return rc;
1165 				}
1166 
1167 				vaddr += VALUE_2MB;
1168 				len -= VALUE_2MB;
1169 			}
1170 		}
1171 
1172 		break;
1173 	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
1174 #if VFIO_ENABLED
1175 		if (paddr == SPDK_VTOPHYS_ERROR) {
1176 			/*
1177 			 * This is not an address that DPDK is managing.
1178 			 */
1179 
1180 			/* Check if this is a PCI BAR. They need special handling */
1181 			paddr = vtophys_get_paddr_pci((uint64_t)vaddr, len);
1182 			if (paddr != SPDK_VTOPHYS_ERROR) {
1183 				/* Get paddr for each 2MB chunk in this address range */
1184 				while (len > 0) {
1185 					paddr = vtophys_get_paddr_pci((uint64_t)vaddr, VALUE_2MB);
1186 					if (paddr == SPDK_VTOPHYS_ERROR) {
1187 						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1188 						return -EFAULT;
1189 					}
1190 
1191 					rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
1192 					if (rc != 0) {
1193 						return rc;
1194 					}
1195 
1196 					vaddr += VALUE_2MB;
1197 					len -= VALUE_2MB;
1198 				}
1199 
1200 				return 0;
1201 			}
1202 
1203 			/* If vfio is enabled,
1204 			 * we need to unmap the range from the IOMMU
1205 			 */
1206 			if (spdk_iommu_is_enabled()) {
1207 				uint64_t buffer_len = len;
1208 				uint8_t *va = vaddr;
1209 				enum rte_iova_mode iova_mode;
1210 
1211 				iova_mode = rte_eal_iova_mode();
1212 				/*
1213 				 * In virtual address mode, the region is contiguous and can be
1214 				 * unmapped in a single call.
1215 				 */
1216 				if (iova_mode == RTE_IOVA_VA) {
1217 					paddr = spdk_mem_map_translate(map, (uint64_t)va, &buffer_len);
1218 					if (buffer_len != len || paddr != (uintptr_t)va) {
1219 						DEBUG_PRINT("Unmapping %p with length %lu failed because "
1220 							    "translation had address 0x%" PRIx64 " and length %lu\n",
1221 							    va, len, paddr, buffer_len);
1222 						return -EINVAL;
1223 					}
1224 					rc = vtophys_iommu_unmap_dma(paddr, len);
1225 					if (rc) {
1226 						DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
1227 						return -EFAULT;
1228 					}
1229 				} else if (iova_mode == RTE_IOVA_PA) {
1230 					/* Get paddr for each 2MB chunk in this address range */
1231 					while (buffer_len > 0) {
1232 						paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);
1233 
1234 						if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
1235 							DEBUG_PRINT("could not get phys addr for %p\n", va);
1236 							return -EFAULT;
1237 						}
1238 
1239 						rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
1240 						if (rc) {
1241 							DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
1242 							return -EFAULT;
1243 						}
1244 
1245 						va += VALUE_2MB;
1246 						buffer_len -= VALUE_2MB;
1247 					}
1248 				}
1249 			}
1250 		}
1251 #endif
1252 		while (len > 0) {
1253 			rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
1254 			if (rc != 0) {
1255 				return rc;
1256 			}
1257 
1258 			vaddr += VALUE_2MB;
1259 			len -= VALUE_2MB;
1260 		}
1261 
1262 		break;
1263 	default:
1264 		SPDK_UNREACHABLE();
1265 	}
1266 
1267 	return rc;
1268 }
1269 
1270 static int
1271 vtophys_check_contiguous_entries(uint64_t paddr1, uint64_t paddr2)
1272 {
1273 	/* This function is always called with paddrs for two consecutive
1274 	 * 2MB chunks in virtual address space, so those chunks are physically
1275 	 * contiguous only if their physical addresses are exactly 2MB apart
1276 	 * from each other as well.
1277 	 */
1278 	return (paddr2 - paddr1 == VALUE_2MB);
1279 }
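
/* For example (hypothetical values): if two neighbouring virtual 2MB pages
 * translate to 0x100200000 and 0x100400000, spdk_mem_map_translate() may merge
 * them into a single contiguous range; translations that are further apart, or
 * in descending order, are not merged.
 */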
1280 
1281 #if VFIO_ENABLED
1282 
1283 static bool
1284 vfio_enabled(void)
1285 {
1286 	return rte_vfio_is_enabled("vfio_pci");
1287 }
1288 
1289 /* Check if IOMMU is enabled on the system */
1290 static bool
1291 has_iommu_groups(void)
1292 {
1293 	int count = 0;
1294 	DIR *dir = opendir("/sys/kernel/iommu_groups");
1295 
1296 	if (dir == NULL) {
1297 		return false;
1298 	}
1299 
1300 	while (count < 3 && readdir(dir) != NULL) {
1301 		count++;
1302 	}
1303 
1304 	closedir(dir);
1305 	/* there will always be ./ and ../ entries */
1306 	return count > 2;
1307 }
1308 
1309 static bool
1310 vfio_noiommu_enabled(void)
1311 {
1312 	return rte_vfio_noiommu_is_enabled();
1313 }
1314 
1315 static void
1316 vtophys_iommu_init(void)
1317 {
1318 	char proc_fd_path[PATH_MAX + 1];
1319 	char link_path[PATH_MAX + 1];
1320 	const char vfio_path[] = "/dev/vfio/vfio";
1321 	DIR *dir;
1322 	struct dirent *d;
1323 
1324 	if (!vfio_enabled()) {
1325 		return;
1326 	}
1327 
1328 	if (vfio_noiommu_enabled()) {
1329 		g_vfio.noiommu_enabled = true;
1330 	} else if (!has_iommu_groups()) {
1331 		return;
1332 	}
1333 
1334 	dir = opendir("/proc/self/fd");
1335 	if (!dir) {
1336 		DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
1337 		return;
1338 	}
1339 
1340 	while ((d = readdir(dir)) != NULL) {
1341 		if (d->d_type != DT_LNK) {
1342 			continue;
1343 		}
1344 
1345 		snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
1346 		if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
1347 			continue;
1348 		}
1349 
1350 		if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
1351 			sscanf(d->d_name, "%d", &g_vfio.fd);
1352 			break;
1353 		}
1354 	}
1355 
1356 	closedir(dir);
1357 
1358 	if (g_vfio.fd < 0) {
1359 		DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
1360 		return;
1361 	}
1362 
1363 	g_vfio.enabled = true;
1364 
1365 	return;
1366 }
1367 
1368 #endif
1369 
1370 void
1371 vtophys_pci_device_added(struct rte_pci_device *pci_device)
1372 {
1373 	struct spdk_vtophys_pci_device *vtophys_dev;
1374 
1375 	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1376 
1377 	vtophys_dev = calloc(1, sizeof(*vtophys_dev));
1378 	if (vtophys_dev) {
1379 		vtophys_dev->pci_device = pci_device;
1380 		TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
1381 	} else {
1382 		DEBUG_PRINT("Memory allocation error\n");
1383 	}
1384 	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1385 
1386 #if VFIO_ENABLED
1387 	struct spdk_vfio_dma_map *dma_map;
1388 	int ret;
1389 
1390 	if (!g_vfio.enabled) {
1391 		return;
1392 	}
1393 
1394 	pthread_mutex_lock(&g_vfio.mutex);
1395 	g_vfio.device_ref++;
1396 	if (g_vfio.device_ref > 1) {
1397 		pthread_mutex_unlock(&g_vfio.mutex);
1398 		return;
1399 	}
1400 
1401 	/* This is the first SPDK device using DPDK vfio. This means that the first
1402 	 * IOMMU group might have just been added to the DPDK vfio container.
1403 	 * From this point on, it is certain that the memory can be mapped.
1404 	 */
1405 	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
1406 		ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
1407 		if (ret) {
1408 			DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
1409 			break;
1410 		}
1411 	}
1412 	pthread_mutex_unlock(&g_vfio.mutex);
1413 #endif
1414 }
1415 
1416 void
1417 vtophys_pci_device_removed(struct rte_pci_device *pci_device)
1418 {
1419 	struct spdk_vtophys_pci_device *vtophys_dev;
1420 
1421 	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1422 	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
1423 		if (vtophys_dev->pci_device == pci_device) {
1424 			TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
1425 			free(vtophys_dev);
1426 			break;
1427 		}
1428 	}
1429 	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1430 
1431 #if VFIO_ENABLED
1432 	struct spdk_vfio_dma_map *dma_map;
1433 	int ret;
1434 
1435 	if (!g_vfio.enabled) {
1436 		return;
1437 	}
1438 
1439 	pthread_mutex_lock(&g_vfio.mutex);
1440 	assert(g_vfio.device_ref > 0);
1441 	g_vfio.device_ref--;
1442 	if (g_vfio.device_ref > 0) {
1443 		pthread_mutex_unlock(&g_vfio.mutex);
1444 		return;
1445 	}
1446 
1447 	/* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
1448 	 * any additional devices using its vfio container, all the mappings
1449 	 * will be automatically removed by the Linux vfio driver. We unmap
1450 	 * the memory manually to be able to easily re-map it later regardless
1451 	 * of other, external factors.
1452 	 */
1453 	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
1454 		struct vfio_iommu_type1_dma_unmap unmap = {};
1455 		unmap.argsz = sizeof(unmap);
1456 		unmap.flags = 0;
1457 		unmap.iova = dma_map->map.iova;
1458 		unmap.size = dma_map->map.size;
1459 		ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
1460 		if (ret) {
1461 			DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
1462 			break;
1463 		}
1464 	}
1465 	pthread_mutex_unlock(&g_vfio.mutex);
1466 #endif
1467 }
1468 
1469 int
1470 vtophys_init(void)
1471 {
1472 	const struct spdk_mem_map_ops vtophys_map_ops = {
1473 		.notify_cb = vtophys_notify,
1474 		.are_contiguous = vtophys_check_contiguous_entries,
1475 	};
1476 
1477 	const struct spdk_mem_map_ops phys_ref_map_ops = {
1478 		.notify_cb = NULL,
1479 		.are_contiguous = NULL,
1480 	};
1481 
1482 #if VFIO_ENABLED
1483 	vtophys_iommu_init();
1484 #endif
1485 
1486 	g_phys_ref_map = spdk_mem_map_alloc(0, &phys_ref_map_ops, NULL);
1487 	if (g_phys_ref_map == NULL) {
1488 		DEBUG_PRINT("phys_ref map allocation failed.\n");
1489 		return -ENOMEM;
1490 	}
1491 
1492 	g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
1493 	if (g_vtophys_map == NULL) {
1494 		DEBUG_PRINT("vtophys map allocation failed\n");
1495 		spdk_mem_map_free(&g_phys_ref_map);
1496 		return -ENOMEM;
1497 	}
1498 	return 0;
1499 }
1500 
1501 uint64_t
1502 spdk_vtophys(const void *buf, uint64_t *size)
1503 {
1504 	uint64_t vaddr, paddr_2mb;
1505 
1506 	vaddr = (uint64_t)buf;
1507 	paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);
1508 
1509 	/*
1510 	 * SPDK_VTOPHYS_ERROR has all bits set. Bitwise-or'ing the buf offset into it
1511 	 * would leave the result unchanged, but since we now add the offset instead
1512 	 * of or'ing it (PCI vtophys translations can be unaligned), we must check the
1513 	 * return value before the addition.
1514 	 */
1515 	SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
1516 	if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
1517 		return SPDK_VTOPHYS_ERROR;
1518 	} else {
1519 		return paddr_2mb + (vaddr & MASK_2MB);
1520 	}
1521 }
1522 
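/* Usage sketch (not compiled): looking up the physical address of a buffer and
 * how far that mapping stays physically contiguous. `buf` and `buf_len` are
 * hypothetical.
 *
 *   uint64_t len = buf_len;
 *   uint64_t phys = spdk_vtophys(buf, &len);
 *   if (phys == SPDK_VTOPHYS_ERROR) {
 *           // buf is not registered (see spdk_mem_register()) or has no mapping
 *   } else if (len < buf_len) {
 *           // only the first len bytes are physically contiguous; split the
 *           // transfer or use an SGL
 *   }
 */
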
1523 int
1524 spdk_mem_get_fd_and_offset(void *vaddr, uint64_t *offset)
1525 {
1526 	struct rte_memseg *seg;
1527 	int ret, fd;
1528 
1529 	seg = rte_mem_virt2memseg(vaddr, NULL);
1530 	if (!seg) {
1531 		SPDK_ERRLOG("memory %p doesn't exist\n", vaddr);
1532 		return -ENOENT;
1533 	}
1534 
1535 	fd = rte_memseg_get_fd_thread_unsafe(seg);
1536 	if (fd < 0) {
1537 		return fd;
1538 	}
1539 
1540 	ret = rte_memseg_get_fd_offset_thread_unsafe(seg, offset);
1541 	if (ret < 0) {
1542 		return ret;
1543 	}
1544 
1545 	return fd;
1546 }
1547