xref: /spdk/lib/env_dpdk/memory.c (revision bb488d2829a9b7863daab45917dd2174905cc0ae)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "env_internal.h"
37 
38 #include <rte_config.h>
39 #include <rte_eal_memconfig.h>
40 
41 #include "spdk_internal/assert.h"
42 #include "spdk_internal/memory.h"
43 
44 #include "spdk/assert.h"
45 #include "spdk/likely.h"
46 #include "spdk/queue.h"
47 #include "spdk/util.h"
48 #include "spdk/env_dpdk.h"
49 
50 #ifdef __FreeBSD__
51 #define SPDK_VFIO_ENABLED 0
52 #else
53 #include <linux/version.h>
54 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
55 #define SPDK_VFIO_ENABLED 1
56 #include <linux/vfio.h>
57 #include <rte_vfio.h>
58 
59 struct spdk_vfio_dma_map {
60 	struct vfio_iommu_type1_dma_map map;
61 	struct vfio_iommu_type1_dma_unmap unmap;
62 	TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
63 };
64 
65 struct vfio_cfg {
66 	int fd;
67 	bool enabled;
68 	bool noiommu_enabled;
69 	unsigned device_ref;
70 	TAILQ_HEAD(, spdk_vfio_dma_map) maps;
71 	pthread_mutex_t mutex;
72 };
73 
74 static struct vfio_cfg g_vfio = {
75 	.fd = -1,
76 	.enabled = false,
77 	.noiommu_enabled = false,
78 	.device_ref = 0,
79 	.maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
80 	.mutex = PTHREAD_MUTEX_INITIALIZER
81 };
82 
83 #else
84 #define SPDK_VFIO_ENABLED 0
85 #endif
86 #endif
87 
88 #if DEBUG
89 #define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__)
90 #else
91 #define DEBUG_PRINT(...)
92 #endif
93 
94 #define FN_2MB_TO_4KB(fn)	(fn << (SHIFT_2MB - SHIFT_4KB))
95 #define FN_4KB_TO_2MB(fn)	(fn >> (SHIFT_2MB - SHIFT_4KB))
96 
97 #define MAP_256TB_IDX(vfn_2mb)	((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
98 #define MAP_1GB_IDX(vfn_2mb)	((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))
99 
100 /* Page is registered */
101 #define REG_MAP_REGISTERED	(1ULL << 62)
102 
103 /* A notification region barrier. The 2MB translation entry that's marked
104  * with this flag must be unregistered separately. This allows contiguous
105  * regions to be unregistered in the same chunks they were registered.
106  */
107 #define REG_MAP_NOTIFY_START	(1ULL << 63)
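/*
 * For example, a single 6 MB spdk_mem_register() call marks its three 2MB
 * entries in the registration map as:
 *
 *   page 0: REG_MAP_REGISTERED | REG_MAP_NOTIFY_START
 *   page 1: REG_MAP_REGISTERED
 *   page 2: REG_MAP_REGISTERED
 *
 * spdk_mem_unregister() uses the REG_MAP_NOTIFY_START markers to find region
 * boundaries and refuses to unregister only part of a region.
 */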
108 
109 /* Translation of a single 2MB page. */
110 struct map_2mb {
111 	uint64_t translation_2mb;
112 };
113 
114 /* Second-level map table indexed by bits [21..29] of the virtual address.
115  * Each entry holds either the address translation or the map's default
116  * translation for entries that have not been set yet.
117  */
118 struct map_1gb {
119 	struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
120 };
121 
122 /* Top-level map table indexed by bits [30..47] of the virtual address.
123  * Each entry points to a second-level map table or NULL.
124  */
125 struct map_256tb {
126 	struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
127 };
128 
129 /* Page-granularity memory address translation */
130 struct spdk_mem_map {
131 	struct map_256tb map_256tb;
132 	pthread_mutex_t mutex;
133 	uint64_t default_translation;
134 	struct spdk_mem_map_ops ops;
135 	void *cb_ctx;
136 	TAILQ_ENTRY(spdk_mem_map) tailq;
137 };
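/*
 * For illustration, a lookup decomposes a virtual address as follows
 * (assuming the SHIFT_* values from spdk_internal/memory.h, i.e. 2MB pages
 * and a 48-bit / 256 TB user address space):
 *
 *   uint64_t vfn_2mb   = vaddr >> SHIFT_2MB;       // 2MB virtual frame number
 *   uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);   // bits [30..47] of vaddr
 *   uint64_t idx_1gb   = MAP_1GB_IDX(vfn_2mb);     // bits [21..29] of vaddr
 *   struct map_2mb *entry =
 *           &map->map_256tb.map[idx_256tb]->map[idx_1gb];
 *
 * The first-level pointer may be NULL for address ranges that have never been
 * touched, in which case the map's default translation applies.
 */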
138 
139 /* Registrations map. The 64 bit translations are bit fields with the
140  * following layout (starting with the low bits):
141  *    0 - 61 : reserved
142  *   62 - 63 : flags
143  */
144 static struct spdk_mem_map *g_mem_reg_map;
145 static TAILQ_HEAD(, spdk_mem_map) g_spdk_mem_maps = TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
146 static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;
147 
148 /*
149  * Walk the currently registered memory via the main memory registration map
150  * and call the new map's notify callback for each virtually contiguous region.
151  */
152 static int
153 spdk_mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
154 {
155 	size_t idx_256tb;
156 	uint64_t idx_1gb;
157 	uint64_t contig_start = UINT64_MAX;
158 	uint64_t contig_end = UINT64_MAX;
159 	struct map_1gb *map_1gb;
160 	int rc;
161 
162 	if (!g_mem_reg_map) {
163 		return -EINVAL;
164 	}
165 
166 	/* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
167 	pthread_mutex_lock(&g_mem_reg_map->mutex);
168 
169 	for (idx_256tb = 0;
170 	     idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
171 	     idx_256tb++) {
172 		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];
173 
174 		if (!map_1gb) {
175 			if (contig_start != UINT64_MAX) {
176 				/* End of a virtually contiguous range */
177 				rc = map->ops.notify_cb(map->cb_ctx, map, action,
178 							(void *)contig_start,
179 							contig_end - contig_start + VALUE_2MB);
180 				/* Don't bother handling unregister failures. It can't be any worse */
181 				if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
182 					goto err_unregister;
183 				}
184 			}
185 			contig_start = UINT64_MAX;
186 			continue;
187 		}
188 
189 		for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
190 			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
191 			    (contig_start == UINT64_MAX ||
192 			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
193 				/* Rebuild the virtual address from the indexes */
194 				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
195 
196 				if (contig_start == UINT64_MAX) {
197 					contig_start = vaddr;
198 				}
199 
200 				contig_end = vaddr;
201 			} else {
202 				if (contig_start != UINT64_MAX) {
203 					/* End of a virtually contiguous range */
204 					rc = map->ops.notify_cb(map->cb_ctx, map, action,
205 								(void *)contig_start,
206 								contig_end - contig_start + VALUE_2MB);
207 					/* Don't bother handling unregister failures. It can't be any worse */
208 					if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
209 						goto err_unregister;
210 					}
211 
212 					/* This page might be part of a neighbouring region, so process
213 					 * it again. The idx_1gb will be incremented immediately.
214 					 */
215 					idx_1gb--;
216 				}
217 				contig_start = UINT64_MAX;
218 			}
219 		}
220 	}
221 
222 	pthread_mutex_unlock(&g_mem_reg_map->mutex);
223 	return 0;
224 
225 err_unregister:
226 	/* Unwind to the first empty translation so we don't unregister
227 	 * a region that just failed to register.
228 	 */
229 	idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
230 	idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
231 	contig_start = UINT64_MAX;
232 	contig_end = UINT64_MAX;
233 
234 	/* Unregister any memory we managed to register before the failure */
235 	for (; idx_256tb < SIZE_MAX; idx_256tb--) {
236 		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];
237 
238 		if (!map_1gb) {
239 			if (contig_end != UINT64_MAX) {
240 				/* End of a virtually contiguous range */
241 				map->ops.notify_cb(map->cb_ctx, map,
242 						   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
243 						   (void *)contig_start,
244 						   contig_end - contig_start + VALUE_2MB);
245 			}
246 			contig_end = UINT64_MAX;
247 			continue;
248 		}
249 
250 		for (; idx_1gb < UINT64_MAX; idx_1gb--) {
251 			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
252 			    (contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
253 				/* Rebuild the virtual address from the indexes */
254 				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
255 
256 				if (contig_end == UINT64_MAX) {
257 					contig_end = vaddr;
258 				}
259 				contig_start = vaddr;
260 			} else {
261 				if (contig_end != UINT64_MAX) {
262 					/* End of a virtually contiguous range */
263 					map->ops.notify_cb(map->cb_ctx, map,
264 							   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
265 							   (void *)contig_start,
266 							   contig_end - contig_start + VALUE_2MB);
267 					idx_1gb++;
268 				}
269 				contig_end = UINT64_MAX;
270 			}
271 		}
272 		idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
273 	}
274 
275 	pthread_mutex_unlock(&g_mem_reg_map->mutex);
276 	return rc;
277 }
278 
279 struct spdk_mem_map *
280 spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
281 {
282 	struct spdk_mem_map *map;
283 	int rc;
284 
285 	map = calloc(1, sizeof(*map));
286 	if (map == NULL) {
287 		return NULL;
288 	}
289 
290 	if (pthread_mutex_init(&map->mutex, NULL)) {
291 		free(map);
292 		return NULL;
293 	}
294 
295 	map->default_translation = default_translation;
296 	map->cb_ctx = cb_ctx;
297 	if (ops) {
298 		map->ops = *ops;
299 	}
300 
301 	if (ops && ops->notify_cb) {
302 		pthread_mutex_lock(&g_spdk_mem_map_mutex);
303 		rc = spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
304 		if (rc != 0) {
305 			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
306 			DEBUG_PRINT("Initial mem_map notify failed\n");
307 			pthread_mutex_destroy(&map->mutex);
308 			free(map);
309 			return NULL;
310 		}
311 		TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
312 		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
313 	}
314 
315 	return map;
316 }
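/*
 * Usage sketch (names here are hypothetical): a consumer allocates a map with
 * a notify callback, which is immediately invoked for every currently
 * registered region and later for each new registration or unregistration.
 *
 *   static int
 *   my_notify(void *cb_ctx, struct spdk_mem_map *map,
 *             enum spdk_mem_map_notify_action action, void *vaddr, size_t len)
 *   {
 *           // set up or tear down per-region state for [vaddr, vaddr + len)
 *           return 0;
 *   }
 *
 *   static const struct spdk_mem_map_ops my_ops = {
 *           .notify_cb = my_notify,
 *           .are_contiguous = NULL,
 *   };
 *
 *   struct spdk_mem_map *m = spdk_mem_map_alloc(UINT64_MAX, &my_ops, NULL);
 *   ...
 *   spdk_mem_map_free(&m);
 */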
317 
318 void
319 spdk_mem_map_free(struct spdk_mem_map **pmap)
320 {
321 	struct spdk_mem_map *map;
322 	size_t i;
323 
324 	if (!pmap) {
325 		return;
326 	}
327 
328 	map = *pmap;
329 
330 	if (!map) {
331 		return;
332 	}
333 
334 	if (map->ops.notify_cb) {
335 		pthread_mutex_lock(&g_spdk_mem_map_mutex);
336 		spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
337 		TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
338 		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
339 	}
340 
341 	for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
342 		free(map->map_256tb.map[i]);
343 	}
344 
345 	pthread_mutex_destroy(&map->mutex);
346 
347 	free(map);
348 	*pmap = NULL;
349 }
350 
351 int
352 spdk_mem_register(void *vaddr, size_t len)
353 {
354 	struct spdk_mem_map *map;
355 	int rc;
356 	void *seg_vaddr;
357 	size_t seg_len;
358 	uint64_t reg;
359 
360 	if ((uintptr_t)vaddr & ~MASK_256TB) {
361 		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
362 		return -EINVAL;
363 	}
364 
365 	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
366 		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
367 			    __func__, vaddr, len);
368 		return -EINVAL;
369 	}
370 
371 	if (len == 0) {
372 		return 0;
373 	}
374 
375 	pthread_mutex_lock(&g_spdk_mem_map_mutex);
376 
377 	seg_vaddr = vaddr;
378 	seg_len = len;
379 	while (seg_len > 0) {
380 		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
381 		if (reg & REG_MAP_REGISTERED) {
382 			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
383 			return -EBUSY;
384 		}
385 		seg_vaddr += VALUE_2MB;
386 		seg_len -= VALUE_2MB;
387 	}
388 
389 	seg_vaddr = vaddr;
390 	seg_len = 0;
391 	while (len > 0) {
392 		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
393 					     seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
394 		seg_len += VALUE_2MB;
395 		vaddr += VALUE_2MB;
396 		len -= VALUE_2MB;
397 	}
398 
399 	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
400 		rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
401 		if (rc != 0) {
402 			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
403 			return rc;
404 		}
405 	}
406 
407 	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
408 	return 0;
409 }
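/*
 * Usage sketch: both vaddr and len must be 2MB aligned, and a range that
 * overlaps an existing registration is rejected with -EBUSY. The mmap()
 * parameters below are illustrative only; whether the buffer is then
 * translatable by spdk_vtophys() depends on the IOMMU/hugepage setup.
 *
 *   size_t len = 2 * VALUE_2MB;
 *   void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *                    MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
 *   if (buf != MAP_FAILED && spdk_mem_register(buf, len) == 0) {
 *           // all registered mem maps have been notified about [buf, buf + len)
 *           spdk_mem_unregister(buf, len);
 *   }
 */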
410 
411 int
412 spdk_mem_unregister(void *vaddr, size_t len)
413 {
414 	struct spdk_mem_map *map;
415 	int rc;
416 	void *seg_vaddr;
417 	size_t seg_len;
418 	uint64_t reg, newreg;
419 
420 	if ((uintptr_t)vaddr & ~MASK_256TB) {
421 		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
422 		return -EINVAL;
423 	}
424 
425 	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
426 		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
427 			    __func__, vaddr, len);
428 		return -EINVAL;
429 	}
430 
431 	pthread_mutex_lock(&g_spdk_mem_map_mutex);
432 
433 	/* The first page must be a start of a region. Also check if it's
434 	 * registered to make sure we don't return -ERANGE for non-registered
435 	 * regions.
436 	 */
437 	reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
438 	if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
439 		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
440 		return -ERANGE;
441 	}
442 
443 	seg_vaddr = vaddr;
444 	seg_len = len;
445 	while (seg_len > 0) {
446 		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
447 		if ((reg & REG_MAP_REGISTERED) == 0) {
448 			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
449 			return -EINVAL;
450 		}
451 		seg_vaddr += VALUE_2MB;
452 		seg_len -= VALUE_2MB;
453 	}
454 
455 	newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
456 	/* If the next page is registered, it must be a start of a region as well,
457 	 * otherwise we'd be unregistering only a part of a region.
458 	 */
459 	if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
460 		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
461 		return -ERANGE;
462 	}
463 	seg_vaddr = vaddr;
464 	seg_len = 0;
465 
466 	while (len > 0) {
467 		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
468 		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);
469 
470 		if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
471 			TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
472 				rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
473 				if (rc != 0) {
474 					pthread_mutex_unlock(&g_spdk_mem_map_mutex);
475 					return rc;
476 				}
477 			}
478 
479 			seg_vaddr = vaddr;
480 			seg_len = VALUE_2MB;
481 		} else {
482 			seg_len += VALUE_2MB;
483 		}
484 
485 		vaddr += VALUE_2MB;
486 		len -= VALUE_2MB;
487 	}
488 
489 	if (seg_len > 0) {
490 		TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
491 			rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
492 			if (rc != 0) {
493 				pthread_mutex_unlock(&g_spdk_mem_map_mutex);
494 				return rc;
495 			}
496 		}
497 	}
498 
499 	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
500 	return 0;
501 }
502 
503 static struct map_1gb *
504 spdk_mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
505 {
506 	struct map_1gb *map_1gb;
507 	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
508 	size_t i;
509 
510 	if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
511 		return NULL;
512 	}
513 
514 	map_1gb = map->map_256tb.map[idx_256tb];
515 
516 	if (!map_1gb) {
517 		pthread_mutex_lock(&map->mutex);
518 
519 		/* Recheck to make sure nobody else got the mutex first. */
520 		map_1gb = map->map_256tb.map[idx_256tb];
521 		if (!map_1gb) {
522 			map_1gb = malloc(sizeof(struct map_1gb));
523 			if (map_1gb) {
524 				/* initialize all entries to default translation */
525 				for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
526 					map_1gb->map[i].translation_2mb = map->default_translation;
527 				}
528 				map->map_256tb.map[idx_256tb] = map_1gb;
529 			}
530 		}
531 
532 		pthread_mutex_unlock(&map->mutex);
533 
534 		if (!map_1gb) {
535 			DEBUG_PRINT("allocation failed\n");
536 			return NULL;
537 		}
538 	}
539 
540 	return map_1gb;
541 }
542 
543 int
544 spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
545 			     uint64_t translation)
546 {
547 	uint64_t vfn_2mb;
548 	struct map_1gb *map_1gb;
549 	uint64_t idx_1gb;
550 	struct map_2mb *map_2mb;
551 
552 	if ((uintptr_t)vaddr & ~MASK_256TB) {
553 		DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr);
554 		return -EINVAL;
555 	}
556 
557 	/* For now, only 2 MB-aligned registrations are supported */
558 	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
559 		DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n",
560 			    __func__, vaddr, size);
561 		return -EINVAL;
562 	}
563 
564 	vfn_2mb = vaddr >> SHIFT_2MB;
565 
566 	while (size) {
567 		map_1gb = spdk_mem_map_get_map_1gb(map, vfn_2mb);
568 		if (!map_1gb) {
569 			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
570 			return -ENOMEM;
571 		}
572 
573 		idx_1gb = MAP_1GB_IDX(vfn_2mb);
574 		map_2mb = &map_1gb->map[idx_1gb];
575 		map_2mb->translation_2mb = translation;
576 
577 		size -= VALUE_2MB;
578 		vfn_2mb++;
579 	}
580 
581 	return 0;
582 }
583 
584 int
585 spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
586 {
587 	uint64_t vfn_2mb;
588 	struct map_1gb *map_1gb;
589 	uint64_t idx_1gb;
590 	struct map_2mb *map_2mb;
591 
592 	if ((uintptr_t)vaddr & ~MASK_256TB) {
593 		DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr);
594 		return -EINVAL;
595 	}
596 
597 	/* For now, only 2 MB-aligned registrations are supported */
598 	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
599 		DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n",
600 			    __func__, vaddr, size);
601 		return -EINVAL;
602 	}
603 
604 	vfn_2mb = vaddr >> SHIFT_2MB;
605 
606 	while (size) {
607 		map_1gb = spdk_mem_map_get_map_1gb(map, vfn_2mb);
608 		if (!map_1gb) {
609 			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
610 			return -ENOMEM;
611 		}
612 
613 		idx_1gb = MAP_1GB_IDX(vfn_2mb);
614 		map_2mb = &map_1gb->map[idx_1gb];
615 		map_2mb->translation_2mb = map->default_translation;
616 
617 		size -= VALUE_2MB;
618 		vfn_2mb++;
619 	}
620 
621 	return 0;
622 }
623 
624 inline uint64_t
625 spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
626 {
627 	const struct map_1gb *map_1gb;
628 	const struct map_2mb *map_2mb;
629 	uint64_t idx_256tb;
630 	uint64_t idx_1gb;
631 	uint64_t vfn_2mb;
632 	uint64_t cur_size;
633 	uint64_t prev_translation;
634 	uint64_t orig_translation;
635 
636 	if (spdk_unlikely(vaddr & ~MASK_256TB)) {
637 		DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
638 		return map->default_translation;
639 	}
640 
641 	vfn_2mb = vaddr >> SHIFT_2MB;
642 	idx_256tb = MAP_256TB_IDX(vfn_2mb);
643 	idx_1gb = MAP_1GB_IDX(vfn_2mb);
644 
645 	map_1gb = map->map_256tb.map[idx_256tb];
646 	if (spdk_unlikely(!map_1gb)) {
647 		return map->default_translation;
648 	}
649 
650 	cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
651 	map_2mb = &map_1gb->map[idx_1gb];
652 	if (size == NULL || map->ops.are_contiguous == NULL ||
653 	    map_2mb->translation_2mb == map->default_translation) {
654 		if (size != NULL) {
655 			*size = spdk_min(*size, cur_size);
656 		}
657 		return map_2mb->translation_2mb;
658 	}
659 
660 	orig_translation = map_2mb->translation_2mb;
661 	prev_translation = orig_translation;
662 	while (cur_size < *size) {
663 		vfn_2mb++;
664 		idx_256tb = MAP_256TB_IDX(vfn_2mb);
665 		idx_1gb = MAP_1GB_IDX(vfn_2mb);
666 
667 		map_1gb = map->map_256tb.map[idx_256tb];
668 		if (spdk_unlikely(!map_1gb)) {
669 			break;
670 		}
671 
672 		map_2mb = &map_1gb->map[idx_1gb];
673 		if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
674 			break;
675 		}
676 
677 		cur_size += VALUE_2MB;
678 		prev_translation = map_2mb->translation_2mb;
679 	}
680 
681 	*size = spdk_min(*size, cur_size);
682 	return orig_translation;
683 }
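/*
 * Usage sketch: when a size is passed in and the map provides an
 * are_contiguous callback, the translation is extended over neighbouring 2MB
 * entries and *size is clamped to the contiguous length found. io_len below
 * is a hypothetical request length.
 *
 *   uint64_t io_len = ...;
 *   uint64_t translation = spdk_mem_map_translate(map, (uint64_t)buf, &io_len);
 *   // If translation equals the map's default_translation, buf has no
 *   // registered translation; otherwise [buf, buf + io_len) maps to one
 *   // contiguous translation starting at 'translation'.
 */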
684 
685 #if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
686 static void
687 memory_hotplug_cb(enum rte_mem_event event_type,
688 		  const void *addr, size_t len, void *arg)
689 {
690 	if (event_type == RTE_MEM_EVENT_ALLOC) {
691 		spdk_mem_register((void *)addr, len);
692 
693 #if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0)
694 		if (!spdk_env_dpdk_external_init()) {
695 			return;
696 		}
697 #endif
698 
699 		/* Prior to DPDK 19.02, we have to worry about DPDK
700 		 * freeing memory in different units than it was allocated.
701 		 * That doesn't work with things like RDMA MRs.  So for
702 		 * those versions of DPDK, mark each segment so that DPDK
703 		 * won't later free it.  That ensures we don't have to deal
704 		 * with that scenario.
705 		 *
706 		 * DPDK 19.02 added the --match-allocations RTE flag to
707 		 * avoid this condition.
708 		 *
709 		 * Note: if the user initialized DPDK separately, we can't
710 		 * be sure that --match-allocations was specified, so need
711 		 * to still mark the segments so they aren't freed.
712 		 */
713 		while (len > 0) {
714 			struct rte_memseg *seg;
715 
716 			seg = rte_mem_virt2memseg(addr, NULL);
717 			assert(seg != NULL);
718 			seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
719 			addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
720 			len -= seg->hugepage_sz;
721 		}
722 	} else if (event_type == RTE_MEM_EVENT_FREE) {
723 		spdk_mem_unregister((void *)addr, len);
724 	}
725 }
726 
727 static int
728 memory_iter_cb(const struct rte_memseg_list *msl,
729 	       const struct rte_memseg *ms, size_t len, void *arg)
730 {
731 	return spdk_mem_register(ms->addr, len);
732 }
733 #endif
734 
735 int
736 spdk_mem_map_init(void)
737 {
738 	g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
739 	if (g_mem_reg_map == NULL) {
740 		DEBUG_PRINT("memory registration map allocation failed\n");
741 		return -1;
742 	}
743 
744 	/*
745 	 * Walk all DPDK memory segments and register them
746 	 * with the master memory map
747 	 */
748 #if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
749 	rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
750 	rte_memseg_contig_walk(memory_iter_cb, NULL);
751 #else
752 	struct rte_mem_config *mcfg;
753 	size_t seg_idx;
754 
755 	mcfg = rte_eal_get_configuration()->mem_config;
756 	for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) {
757 		struct rte_memseg *seg = &mcfg->memseg[seg_idx];
758 
759 		if (seg->addr == NULL) {
760 			break;
761 		}
762 
763 		spdk_mem_register(seg->addr, seg->len);
764 	}
765 #endif
766 	return 0;
767 }
768 
769 bool
770 spdk_iommu_is_enabled(void)
771 {
772 #if SPDK_VFIO_ENABLED
773 	return g_vfio.enabled && !g_vfio.noiommu_enabled;
774 #else
775 	return false;
776 #endif
777 }
778 
779 struct spdk_vtophys_pci_device {
780 	struct rte_pci_device *pci_device;
781 	TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
782 };
783 
784 static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
785 static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
786 	TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);
787 
788 static struct spdk_mem_map *g_vtophys_map;
789 
790 #if SPDK_VFIO_ENABLED
791 static int
792 vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
793 {
794 	struct spdk_vfio_dma_map *dma_map;
795 	int ret;
796 
797 	dma_map = calloc(1, sizeof(*dma_map));
798 	if (dma_map == NULL) {
799 		return -ENOMEM;
800 	}
801 
802 	dma_map->map.argsz = sizeof(dma_map->map);
803 	dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
804 	dma_map->map.vaddr = vaddr;
805 	dma_map->map.iova = iova;
806 	dma_map->map.size = size;
807 
808 	dma_map->unmap.argsz = sizeof(dma_map->unmap);
809 	dma_map->unmap.flags = 0;
810 	dma_map->unmap.iova = iova;
811 	dma_map->unmap.size = size;
812 
813 	pthread_mutex_lock(&g_vfio.mutex);
814 	if (g_vfio.device_ref == 0) {
815 		/* VFIO requires at least one device (IOMMU group) to be added to
816 		 * a VFIO container before it is possible to perform any IOMMU
817 		 * operations on that container. This memory will be mapped once
818 		 * the first device (IOMMU group) is hotplugged.
819 		 *
820 		 * Since the vfio container is managed internally by DPDK, it is
821 		 * also possible that some device is already in that container, but
822 		 * it's not managed by SPDK - e.g. a NIC attached internally
823 		 * inside DPDK. We could map the memory straight away in such
824 		 * scenario, but there's no need to do it. DPDK devices clearly
825 		 * don't need our mappings and hence we defer the mapping
826 		 * unconditionally until the first SPDK-managed device is
827 		 * hotplugged.
828 		 */
829 		goto out_insert;
830 	}
831 
832 	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
833 	if (ret) {
834 		DEBUG_PRINT("Cannot set up DMA mapping, error %d\n", errno);
835 		pthread_mutex_unlock(&g_vfio.mutex);
836 		free(dma_map);
837 		return ret;
838 	}
839 
840 out_insert:
841 	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
842 	pthread_mutex_unlock(&g_vfio.mutex);
843 	return 0;
844 }
845 
846 static int
847 vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
848 {
849 	struct spdk_vfio_dma_map *dma_map;
850 	int ret;
851 
852 	pthread_mutex_lock(&g_vfio.mutex);
853 	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
854 		if (dma_map->map.iova == iova) {
855 			break;
856 		}
857 	}
858 
859 	if (dma_map == NULL) {
860 		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
861 		pthread_mutex_unlock(&g_vfio.mutex);
862 		return -ENXIO;
863 	}
864 
865 	/* We don't support partial or multiple-page unmaps for now */
866 	assert(dma_map->map.size == size);
867 
868 	if (g_vfio.device_ref == 0) {
869 		/* Memory is not mapped anymore, just remove its references */
870 		goto out_remove;
871 	}
872 
873 
874 	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
875 	if (ret) {
876 		DEBUG_PRINT("Cannot clear DMA mapping, error %d\n", errno);
877 		pthread_mutex_unlock(&g_vfio.mutex);
878 		return ret;
879 	}
880 
881 out_remove:
882 	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
883 	pthread_mutex_unlock(&g_vfio.mutex);
884 	free(dma_map);
885 	return 0;
886 }
887 #endif
888 
889 static uint64_t
890 vtophys_get_paddr_memseg(uint64_t vaddr)
891 {
892 	uintptr_t paddr;
893 	struct rte_memseg *seg;
894 
895 #if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
896 	seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
897 	if (seg != NULL) {
898 		paddr = seg->phys_addr;
899 		if (paddr == RTE_BAD_IOVA) {
900 			return SPDK_VTOPHYS_ERROR;
901 		}
902 		paddr += (vaddr - (uintptr_t)seg->addr);
903 		return paddr;
904 	}
905 #else
906 	struct rte_mem_config *mcfg;
907 	uint32_t seg_idx;
908 
909 	mcfg = rte_eal_get_configuration()->mem_config;
910 	for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) {
911 		seg = &mcfg->memseg[seg_idx];
912 		if (seg->addr == NULL) {
913 			break;
914 		}
915 
916 		if (vaddr >= (uintptr_t)seg->addr &&
917 		    vaddr < ((uintptr_t)seg->addr + seg->len)) {
918 			paddr = seg->phys_addr;
919 			if (paddr == RTE_BAD_IOVA) {
920 				return SPDK_VTOPHYS_ERROR;
921 			}
922 			paddr += (vaddr - (uintptr_t)seg->addr);
923 			return paddr;
924 		}
925 	}
926 #endif
927 
928 	return SPDK_VTOPHYS_ERROR;
929 }
930 
931 /* Try to get the paddr from /proc/self/pagemap */
932 static uint64_t
933 vtophys_get_paddr_pagemap(uint64_t vaddr)
934 {
935 	uintptr_t paddr;
936 
937 	paddr = rte_mem_virt2iova((void *)vaddr);
938 	if (paddr == RTE_BAD_IOVA) {
939 		/*
940 		 * The vaddr may be valid but doesn't have a backing page
941 		 * assigned yet.  Touch the page to ensure a backing page
942 		 * gets assigned, then try to translate again.
943 		 */
944 		rte_atomic64_read((rte_atomic64_t *)vaddr);
945 		paddr = rte_mem_virt2iova((void *)vaddr);
946 	}
947 	if (paddr == RTE_BAD_IOVA) {
948 		/* Unable to get to the physical address. */
949 		return SPDK_VTOPHYS_ERROR;
950 	}
951 
952 	return paddr;
953 }
954 
955 /* Try to get the paddr from pci devices */
956 static uint64_t
957 vtophys_get_paddr_pci(uint64_t vaddr)
958 {
959 	struct spdk_vtophys_pci_device *vtophys_dev;
960 	uintptr_t paddr;
961 	struct rte_pci_device	*dev;
962 	struct rte_mem_resource *res;
963 	unsigned r;
964 
965 	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
966 	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
967 		dev = vtophys_dev->pci_device;
968 
969 		for (r = 0; r < PCI_MAX_RESOURCE; r++) {
970 			res = &dev->mem_resource[r];
971 			if (res->phys_addr && vaddr >= (uint64_t)res->addr &&
972 			    vaddr < (uint64_t)res->addr + res->len) {
973 				paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
974 				DEBUG_PRINT("%s: %p -> %p\n", __func__, (void *)vaddr,
975 					    (void *)paddr);
976 				pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
977 				return paddr;
978 			}
979 		}
980 	}
981 	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
982 
983 	return SPDK_VTOPHYS_ERROR;
984 }
985 
986 static int
987 spdk_vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
988 		    enum spdk_mem_map_notify_action action,
989 		    void *vaddr, size_t len)
990 {
991 	int rc = 0, pci_phys = 0;
992 	uint64_t paddr;
993 
994 	if ((uintptr_t)vaddr & ~MASK_256TB) {
995 		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
996 		return -EINVAL;
997 	}
998 
999 	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
1000 		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
1001 			    __func__, vaddr, len);
1002 		return -EINVAL;
1003 	}
1004 
1005 	while (len > 0) {
1006 		/* Get the physical address from the DPDK memsegs */
1007 		paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
1008 
1009 		switch (action) {
1010 		case SPDK_MEM_MAP_NOTIFY_REGISTER:
1011 			if (paddr == SPDK_VTOPHYS_ERROR) {
1012 				/* This is not an address that DPDK is managing. */
1013 #if SPDK_VFIO_ENABLED
1014 				if (spdk_iommu_is_enabled()) {
1015 					/* We'll use the virtual address as the iova. DPDK
1016 					 * currently uses physical addresses as the iovas (or counts
1017 					 * up from 0 if it can't get physical addresses), so
1018 					 * the range of user space virtual addresses and physical
1019 					 * addresses will never overlap.
1020 					 */
1021 					paddr = (uint64_t)vaddr;
1022 					rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
1023 					if (rc) {
1024 						return -EFAULT;
1025 					}
1026 				} else
1027 #endif
1028 				{
1029 					/* Get the physical address from /proc/self/pagemap. */
1030 					paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
1031 					if (paddr == SPDK_VTOPHYS_ERROR) {
1032 						/* Get the physical address from PCI devices */
1033 						paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
1034 						if (paddr == SPDK_VTOPHYS_ERROR) {
1035 							DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1036 							return -EFAULT;
1037 						}
1038 						pci_phys = 1;
1039 					}
1040 				}
1041 			}
1042 			/* Since a PCI paddr may not be 2MB aligned, skip this check for it. */
1043 			if (!pci_phys && (paddr & MASK_2MB)) {
1044 				DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
1045 				return -EINVAL;
1046 			}
1047 
1048 			rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1049 			break;
1050 		case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
1051 #if SPDK_VFIO_ENABLED
1052 			if (paddr == SPDK_VTOPHYS_ERROR) {
1053 				/*
1054 				 * This is not an address that DPDK is managing. If vfio is enabled,
1055 				 * we need to unmap the range from the IOMMU
1056 				 */
1057 				if (spdk_iommu_is_enabled()) {
1058 					uint64_t buffer_len = VALUE_2MB;
1059 					paddr = spdk_mem_map_translate(map, (uint64_t)vaddr, &buffer_len);
1060 					if (buffer_len != VALUE_2MB) {
1061 						return -EINVAL;
1062 					}
1063 					rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
1064 					if (rc) {
1065 						return -EFAULT;
1066 					}
1067 				}
1068 			}
1069 #endif
1070 			rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
1071 			break;
1072 		default:
1073 			SPDK_UNREACHABLE();
1074 		}
1075 
1076 		if (rc != 0) {
1077 			return rc;
1078 		}
1079 		vaddr += VALUE_2MB;
1080 		len -= VALUE_2MB;
1081 	}
1082 
1083 	return rc;
1084 }
1085 
1086 #if SPDK_VFIO_ENABLED
1087 
1088 static bool
1089 spdk_vfio_enabled(void)
1090 {
1091 	return rte_vfio_is_enabled("vfio_pci");
1092 }
1093 
1094 /* Check if IOMMU is enabled on the system */
1095 static bool
1096 has_iommu_groups(void)
1097 {
1098 	struct dirent *d;
1099 	int count = 0;
1100 	DIR *dir = opendir("/sys/kernel/iommu_groups");
1101 
1102 	if (dir == NULL) {
1103 		return false;
1104 	}
1105 
1106 	while (count < 3 && (d = readdir(dir)) != NULL) {
1107 		count++;
1108 	}
1109 
1110 	closedir(dir);
1111 	/* there will always be ./ and ../ entries */
1112 	return count > 2;
1113 }
1114 
1115 static bool
1116 spdk_vfio_noiommu_enabled(void)
1117 {
1118 	return rte_vfio_noiommu_is_enabled();
1119 }
1120 
1121 static void
1122 spdk_vtophys_iommu_init(void)
1123 {
1124 	char proc_fd_path[PATH_MAX + 1];
1125 	char link_path[PATH_MAX + 1];
1126 	const char vfio_path[] = "/dev/vfio/vfio";
1127 	DIR *dir;
1128 	struct dirent *d;
1129 
1130 	if (!spdk_vfio_enabled()) {
1131 		return;
1132 	}
1133 
1134 	if (spdk_vfio_noiommu_enabled()) {
1135 		g_vfio.noiommu_enabled = true;
1136 	} else if (!has_iommu_groups()) {
1137 		return;
1138 	}
1139 
1140 	dir = opendir("/proc/self/fd");
1141 	if (!dir) {
1142 		DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
1143 		return;
1144 	}
1145 
1146 	while ((d = readdir(dir)) != NULL) {
1147 		if (d->d_type != DT_LNK) {
1148 			continue;
1149 		}
1150 
1151 		snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
1152 		if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
1153 			continue;
1154 		}
1155 
1156 		if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
1157 			sscanf(d->d_name, "%d", &g_vfio.fd);
1158 			break;
1159 		}
1160 	}
1161 
1162 	closedir(dir);
1163 
1164 	if (g_vfio.fd < 0) {
1165 		DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
1166 		return;
1167 	}
1168 
1169 	g_vfio.enabled = true;
1170 
1171 	return;
1172 }
1173 #endif
1174 
1175 void
1176 spdk_vtophys_pci_device_added(struct rte_pci_device *pci_device)
1177 {
1178 	struct spdk_vtophys_pci_device *vtophys_dev;
1179 
1180 	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1181 
1182 	vtophys_dev = calloc(1, sizeof(*vtophys_dev));
1183 	if (vtophys_dev) {
1184 		vtophys_dev->pci_device = pci_device;
1185 		TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
1186 	} else {
1187 		DEBUG_PRINT("Memory allocation error\n");
1188 	}
1189 	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1190 
1191 #if SPDK_VFIO_ENABLED
1192 	struct spdk_vfio_dma_map *dma_map;
1193 	int ret;
1194 
1195 	if (!g_vfio.enabled) {
1196 		return;
1197 	}
1198 
1199 	pthread_mutex_lock(&g_vfio.mutex);
1200 	g_vfio.device_ref++;
1201 	if (g_vfio.device_ref > 1) {
1202 		pthread_mutex_unlock(&g_vfio.mutex);
1203 		return;
1204 	}
1205 
1206 	/* This is the first SPDK device using DPDK vfio. This means that the first
1207 	 * IOMMU group might have just been added to the DPDK vfio container.
1208 	 * From this point on, it is certain that the memory can be mapped.
1209 	 */
1210 	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
1211 		ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
1212 		if (ret) {
1213 			DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
1214 			break;
1215 		}
1216 	}
1217 	pthread_mutex_unlock(&g_vfio.mutex);
1218 #endif
1219 }
1220 
1221 void
1222 spdk_vtophys_pci_device_removed(struct rte_pci_device *pci_device)
1223 {
1224 	struct spdk_vtophys_pci_device *vtophys_dev;
1225 
1226 	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1227 	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
1228 		if (vtophys_dev->pci_device == pci_device) {
1229 			TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
1230 			free(vtophys_dev);
1231 			break;
1232 		}
1233 	}
1234 	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1235 
1236 #if SPDK_VFIO_ENABLED
1237 	struct spdk_vfio_dma_map *dma_map;
1238 	int ret;
1239 
1240 	if (!g_vfio.enabled) {
1241 		return;
1242 	}
1243 
1244 	pthread_mutex_lock(&g_vfio.mutex);
1245 	assert(g_vfio.device_ref > 0);
1246 	g_vfio.device_ref--;
1247 	if (g_vfio.device_ref > 0) {
1248 		pthread_mutex_unlock(&g_vfio.mutex);
1249 		return;
1250 	}
1251 
1252 	/* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
1253 	 * any additional devices using its vfio container, all the mappings
1254 	 * will be automatically removed by the Linux vfio driver. We unmap
1255 	 * the memory manually to be able to easily re-map it later regardless
1256 	 * of other, external factors.
1257 	 */
1258 	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
1259 		ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
1260 		if (ret) {
1261 			DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
1262 			break;
1263 		}
1264 	}
1265 	pthread_mutex_unlock(&g_vfio.mutex);
1266 #endif
1267 }
1268 
1269 int
1270 spdk_vtophys_init(void)
1271 {
1272 	const struct spdk_mem_map_ops vtophys_map_ops = {
1273 		.notify_cb = spdk_vtophys_notify,
1274 		.are_contiguous = NULL
1275 	};
1276 
1277 #if SPDK_VFIO_ENABLED
1278 	spdk_vtophys_iommu_init();
1279 #endif
1280 
1281 	g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
1282 	if (g_vtophys_map == NULL) {
1283 		DEBUG_PRINT("vtophys map allocation failed\n");
1284 		return -1;
1285 	}
1286 	return 0;
1287 }
1288 
1289 uint64_t
1290 spdk_vtophys(void *buf, uint64_t *size)
1291 {
1292 	uint64_t vaddr, paddr_2mb;
1293 
1294 	vaddr = (uint64_t)buf;
1295 	paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);
1296 
1297 	/*
1298 	 * SPDK_VTOPHYS_ERROR has all bits set. Because PCI vtophys translations may not
1299 	 * be 2MB aligned, the page offset below is added to (rather than OR'd into) the
1300 	 * translation, so the return value must be checked for SPDK_VTOPHYS_ERROR before
1301 	 * the addition.
1302 	 */
1303 	SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
1304 	if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
1305 		return SPDK_VTOPHYS_ERROR;
1306 	} else {
1307 		return paddr_2mb + (vaddr & MASK_2MB);
1308 	}
1309 }
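/*
 * Usage sketch: callers must check for SPDK_VTOPHYS_ERROR before using the
 * result; buf and buf_len are hypothetical and buf is assumed to point into
 * registered (e.g. DPDK-allocated) memory.
 *
 *   uint64_t len = buf_len;
 *   uint64_t phys = spdk_vtophys(buf, &len);
 *   if (phys == SPDK_VTOPHYS_ERROR) {
 *           // buf is not backed by a registered translation
 *   }
 */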
1310