xref: /spdk/lib/env_dpdk/memory.c (revision dfc989439662457d39bac524be72e8ea1c20e817)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (c) Intel Corporation.
3  *   All rights reserved.
4  */
5 
6 #include "spdk/stdinc.h"
7 
8 #include "env_internal.h"
9 #include "pci_dpdk.h"
10 
11 #include <rte_config.h>
12 #include <rte_memory.h>
13 #include <rte_eal_memconfig.h>
14 
15 #include "spdk_internal/assert.h"
16 
17 #include "spdk/assert.h"
18 #include "spdk/likely.h"
19 #include "spdk/queue.h"
20 #include "spdk/util.h"
21 #include "spdk/memory.h"
22 #include "spdk/env_dpdk.h"
23 #include "spdk/log.h"
24 
25 #ifndef __linux__
26 #define VFIO_ENABLED 0
27 #else
28 #include <linux/version.h>
29 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
30 #define VFIO_ENABLED 1
31 #include <linux/vfio.h>
32 #include <rte_vfio.h>
33 
34 struct spdk_vfio_dma_map {
35 	struct vfio_iommu_type1_dma_map map;
36 	TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
37 };
38 
39 struct vfio_cfg {
40 	int fd;
41 	bool enabled;
42 	bool noiommu_enabled;
43 	unsigned device_ref;
44 	TAILQ_HEAD(, spdk_vfio_dma_map) maps;
45 	pthread_mutex_t mutex;
46 };
47 
48 static struct vfio_cfg g_vfio = {
49 	.fd = -1,
50 	.enabled = false,
51 	.noiommu_enabled = false,
52 	.device_ref = 0,
53 	.maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
54 	.mutex = PTHREAD_MUTEX_INITIALIZER
55 };
56 
57 #else
58 #define VFIO_ENABLED 0
59 #endif
60 #endif
61 
62 #if DEBUG
63 #define DEBUG_PRINT(...) SPDK_ERRLOG(__VA_ARGS__)
64 #else
65 #define DEBUG_PRINT(...)
66 #endif
67 
68 #define FN_2MB_TO_4KB(fn)	((fn) << (SHIFT_2MB - SHIFT_4KB))
69 #define FN_4KB_TO_2MB(fn)	((fn) >> (SHIFT_2MB - SHIFT_4KB))
70 
71 #define MAP_256TB_IDX(vfn_2mb)	((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
72 #define MAP_1GB_IDX(vfn_2mb)	((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))
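/* A worked example of the index math above (hypothetical address, assuming
 * SHIFT_2MB == 21 and SHIFT_1GB == 30 from spdk/memory.h): for
 * vaddr = 0x4000200000,
 *
 *   vfn_2mb         = vaddr >> SHIFT_2MB = 0x20001
 *   MAP_256TB_IDX() = 0x20001 >> 9       = 0x100
 *   MAP_1GB_IDX()   = 0x20001 & 0x1ff    = 0x1
 *
 * i.e. the translation lives in slot 1 of the map_1gb table found at slot
 * 0x100 of the map_256tb table.
 */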
73 
74 /* Page is registered */
75 #define REG_MAP_REGISTERED	(1ULL << 62)
76 
77 /* A notification region barrier. The 2MB translation entry that's marked
78  * with this flag must be unregistered separately. This allows contiguous
79  * regions to be unregistered in the same chunks they were registered.
80  */
81 #define REG_MAP_NOTIFY_START	(1ULL << 63)
82 
83 /* Translation of a single 2MB page. */
84 struct map_2mb {
85 	uint64_t translation_2mb;
86 };
87 
88 /* Second-level map table indexed by bits [21..29] of the virtual address.
89  * Each entry contains the address translation, or the map's default
90  * translation for entries that haven't been set yet.
91  */
92 struct map_1gb {
93 	struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
94 };
95 
96 /* Top-level map table indexed by bits [30..47] of the virtual address.
97  * Each entry points to a second-level map table or NULL.
98  */
99 struct map_256tb {
100 	struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
101 };
102 
103 /* Page-granularity memory address translation */
104 struct spdk_mem_map {
105 	struct map_256tb map_256tb;
106 	pthread_mutex_t mutex;
107 	uint64_t default_translation;
108 	struct spdk_mem_map_ops ops;
109 	void *cb_ctx;
110 	TAILQ_ENTRY(spdk_mem_map) tailq;
111 };
112 
113 /* Registrations map. The 64-bit translations are bit fields with the
114  * following layout (starting with the low bits):
115  *    0 - 61 : reserved
116  *   62 - 63 : flags
117  */
118 static struct spdk_mem_map *g_mem_reg_map;
119 static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps =
120 	TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
121 static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;
122 
123 static bool g_legacy_mem;
124 
125 /*
126  * Walk the currently registered memory via the main memory registration map
127  * and call the new map's notify callback for each virtually contiguous region.
128  */
129 static int
130 mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
131 {
132 	size_t idx_256tb;
133 	uint64_t idx_1gb;
134 	uint64_t contig_start = UINT64_MAX;
135 	uint64_t contig_end = UINT64_MAX;
136 	struct map_1gb *map_1gb;
137 	int rc;
138 
139 	if (!g_mem_reg_map) {
140 		return -EINVAL;
141 	}
142 
143 	/* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
144 	pthread_mutex_lock(&g_mem_reg_map->mutex);
145 
146 	for (idx_256tb = 0;
147 	     idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
148 	     idx_256tb++) {
149 		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];
150 
151 		if (!map_1gb) {
152 			if (contig_start != UINT64_MAX) {
153 				/* End of a virtually contiguous range */
154 				rc = map->ops.notify_cb(map->cb_ctx, map, action,
155 							(void *)contig_start,
156 							contig_end - contig_start + VALUE_2MB);
157 				/* Don't bother handling unregister failures. It can't be any worse */
158 				if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
159 					goto err_unregister;
160 				}
161 			}
162 			contig_start = UINT64_MAX;
163 			continue;
164 		}
165 
166 		for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
167 			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
168 			    (contig_start == UINT64_MAX ||
169 			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
170 				/* Rebuild the virtual address from the indexes */
171 				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
172 
173 				if (contig_start == UINT64_MAX) {
174 					contig_start = vaddr;
175 				}
176 
177 				contig_end = vaddr;
178 			} else {
179 				if (contig_start != UINT64_MAX) {
180 					/* End of a virtually contiguous range */
181 					rc = map->ops.notify_cb(map->cb_ctx, map, action,
182 								(void *)contig_start,
183 								contig_end - contig_start + VALUE_2MB);
184 					/* Don't bother handling unregister failures. It can't be any worse */
185 					if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
186 						goto err_unregister;
187 					}
188 
189 					/* This page might be a part of a neighbour region, so process
190 					 * it again. The idx_1gb will be incremented immediately.
191 					 */
192 					idx_1gb--;
193 				}
194 				contig_start = UINT64_MAX;
195 			}
196 		}
197 	}
198 
199 	pthread_mutex_unlock(&g_mem_reg_map->mutex);
200 	return 0;
201 
202 err_unregister:
203 	/* Unwind to the first empty translation so we don't unregister
204 	 * a region that just failed to register.
205 	 */
206 	idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
207 	idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
208 	contig_start = UINT64_MAX;
209 	contig_end = UINT64_MAX;
210 
211 	/* Unregister any memory we managed to register before the failure */
212 	for (; idx_256tb < SIZE_MAX; idx_256tb--) {
213 		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];
214 
215 		if (!map_1gb) {
216 			if (contig_end != UINT64_MAX) {
217 				/* End of a virtually contiguous range */
218 				map->ops.notify_cb(map->cb_ctx, map,
219 						   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
220 						   (void *)contig_start,
221 						   contig_end - contig_start + VALUE_2MB);
222 			}
223 			contig_end = UINT64_MAX;
224 			continue;
225 		}
226 
227 		for (; idx_1gb < UINT64_MAX; idx_1gb--) {
228 			/* Rebuild the virtual address from the indexes */
229 			uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
230 			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
231 			    (contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
232 
233 				if (contig_end == UINT64_MAX) {
234 					contig_end = vaddr;
235 				}
236 				contig_start = vaddr;
237 			} else {
238 				if (contig_end != UINT64_MAX) {
239 					if (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) {
240 						contig_start = vaddr;
241 					}
242 					/* End of a virtually contiguous range */
243 					map->ops.notify_cb(map->cb_ctx, map,
244 							   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
245 							   (void *)contig_start,
246 							   contig_end - contig_start + VALUE_2MB);
247 				}
248 				contig_end = UINT64_MAX;
249 			}
250 		}
251 		idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
252 	}
253 
254 	pthread_mutex_unlock(&g_mem_reg_map->mutex);
255 	return rc;
256 }
257 
258 struct spdk_mem_map *
259 spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
260 {
261 	struct spdk_mem_map *map;
262 	int rc;
263 	size_t i;
264 
265 	map = calloc(1, sizeof(*map));
266 	if (map == NULL) {
267 		return NULL;
268 	}
269 
270 	if (pthread_mutex_init(&map->mutex, NULL)) {
271 		free(map);
272 		return NULL;
273 	}
274 
275 	map->default_translation = default_translation;
276 	map->cb_ctx = cb_ctx;
277 	if (ops) {
278 		map->ops = *ops;
279 	}
280 
281 	if (ops && ops->notify_cb) {
282 		pthread_mutex_lock(&g_spdk_mem_map_mutex);
283 		rc = mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
284 		if (rc != 0) {
285 			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
286 			DEBUG_PRINT("Initial mem_map notify failed\n");
287 			pthread_mutex_destroy(&map->mutex);
288 			for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
289 				free(map->map_256tb.map[i]);
290 			}
291 			free(map);
292 			return NULL;
293 		}
294 		TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
295 		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
296 	}
297 
298 	return map;
299 }
300 
301 void
302 spdk_mem_map_free(struct spdk_mem_map **pmap)
303 {
304 	struct spdk_mem_map *map;
305 	size_t i;
306 
307 	if (!pmap) {
308 		return;
309 	}
310 
311 	map = *pmap;
312 
313 	if (!map) {
314 		return;
315 	}
316 
317 	if (map->ops.notify_cb) {
318 		pthread_mutex_lock(&g_spdk_mem_map_mutex);
319 		mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
320 		TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
321 		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
322 	}
323 
324 	for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
325 		free(map->map_256tb.map[i]);
326 	}
327 
328 	pthread_mutex_destroy(&map->mutex);
329 
330 	free(map);
331 	*pmap = NULL;
332 }
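/* Illustrative usage sketch (not part of this file; the callback name and
 * context below are hypothetical): a consumer allocates a map with a notify
 * callback so it can mirror registered memory, e.g. into a device-specific
 * translation table.
 *
 *   static int
 *   my_notify(void *cb_ctx, struct spdk_mem_map *map,
 *             enum spdk_mem_map_notify_action action, void *vaddr, size_t len)
 *   {
 *       // program or unprogram [vaddr, vaddr + len) as appropriate
 *       return 0;
 *   }
 *
 *   static const struct spdk_mem_map_ops my_ops = { .notify_cb = my_notify };
 *
 *   struct spdk_mem_map *map = spdk_mem_map_alloc(UINT64_MAX, &my_ops, NULL);
 *   ...
 *   spdk_mem_map_free(&map);
 */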
333 
334 int
335 spdk_mem_register(void *vaddr, size_t len)
336 {
337 	struct spdk_mem_map *map;
338 	int rc;
339 	void *seg_vaddr;
340 	size_t seg_len;
341 	uint64_t reg;
342 
343 	if ((uintptr_t)vaddr & ~MASK_256TB) {
344 		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
345 		return -EINVAL;
346 	}
347 
348 	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
349 		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
350 			    __func__, vaddr, len);
351 		return -EINVAL;
352 	}
353 
354 	if (len == 0) {
355 		return 0;
356 	}
357 
358 	pthread_mutex_lock(&g_spdk_mem_map_mutex);
359 
360 	seg_vaddr = vaddr;
361 	seg_len = len;
362 	while (seg_len > 0) {
363 		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
364 		if (reg & REG_MAP_REGISTERED) {
365 			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
366 			return -EBUSY;
367 		}
368 		seg_vaddr += VALUE_2MB;
369 		seg_len -= VALUE_2MB;
370 	}
371 
372 	seg_vaddr = vaddr;
373 	seg_len = 0;
374 	while (len > 0) {
375 		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
376 					     seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
377 		seg_len += VALUE_2MB;
378 		vaddr += VALUE_2MB;
379 		len -= VALUE_2MB;
380 	}
381 
382 	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
383 		rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
384 		if (rc != 0) {
385 			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
386 			return rc;
387 		}
388 	}
389 
390 	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
391 	return 0;
392 }
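/* Illustrative sketch (hypothetical buffer, not part of this file): both the
 * address and the length passed to spdk_mem_register()/spdk_mem_unregister()
 * must be 2MB aligned, so callers typically register whole hugepage-backed
 * allocations, e.g. on a system whose default hugepage size is 2MB:
 *
 *   void *buf = mmap(NULL, 4 * VALUE_2MB, PROT_READ | PROT_WRITE,
 *                    MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
 *   if (buf != MAP_FAILED && spdk_mem_register(buf, 4 * VALUE_2MB) == 0) {
 *       ... use the buffer for DMA ...
 *       spdk_mem_unregister(buf, 4 * VALUE_2MB);
 *   }
 */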
393 
394 int
395 spdk_mem_unregister(void *vaddr, size_t len)
396 {
397 	struct spdk_mem_map *map;
398 	int rc;
399 	void *seg_vaddr;
400 	size_t seg_len;
401 	uint64_t reg, newreg;
402 
403 	if ((uintptr_t)vaddr & ~MASK_256TB) {
404 		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
405 		return -EINVAL;
406 	}
407 
408 	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
409 		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
410 			    __func__, vaddr, len);
411 		return -EINVAL;
412 	}
413 
414 	pthread_mutex_lock(&g_spdk_mem_map_mutex);
415 
416 	/* The first page must be the start of a region. Only return -ERANGE if the
417 	 * page is registered but isn't a region start; non-registered pages are
418 	 * reported as -EINVAL further below.
419 	 */
420 	reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
421 	if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
422 		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
423 		return -ERANGE;
424 	}
425 
426 	seg_vaddr = vaddr;
427 	seg_len = len;
428 	while (seg_len > 0) {
429 		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
430 		if ((reg & REG_MAP_REGISTERED) == 0) {
431 			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
432 			return -EINVAL;
433 		}
434 		seg_vaddr += VALUE_2MB;
435 		seg_len -= VALUE_2MB;
436 	}
437 
438 	newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
439 	/* If the next page is registered, it must be a start of a region as well,
440 	 * otherwise we'd be unregistering only a part of a region.
441 	 */
442 	if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
443 		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
444 		return -ERANGE;
445 	}
446 	seg_vaddr = vaddr;
447 	seg_len = 0;
448 
449 	while (len > 0) {
450 		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
451 		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);
452 
453 		if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
454 			TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
455 				rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
456 				if (rc != 0) {
457 					pthread_mutex_unlock(&g_spdk_mem_map_mutex);
458 					return rc;
459 				}
460 			}
461 
462 			seg_vaddr = vaddr;
463 			seg_len = VALUE_2MB;
464 		} else {
465 			seg_len += VALUE_2MB;
466 		}
467 
468 		vaddr += VALUE_2MB;
469 		len -= VALUE_2MB;
470 	}
471 
472 	if (seg_len > 0) {
473 		TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
474 			rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
475 			if (rc != 0) {
476 				pthread_mutex_unlock(&g_spdk_mem_map_mutex);
477 				return rc;
478 			}
479 		}
480 	}
481 
482 	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
483 	return 0;
484 }
485 
486 int
487 spdk_mem_reserve(void *vaddr, size_t len)
488 {
489 	struct spdk_mem_map *map;
490 	void *seg_vaddr;
491 	size_t seg_len;
492 	uint64_t reg;
493 
494 	if ((uintptr_t)vaddr & ~MASK_256TB) {
495 		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
496 		return -EINVAL;
497 	}
498 
499 	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
500 		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
501 			    __func__, vaddr, len);
502 		return -EINVAL;
503 	}
504 
505 	if (len == 0) {
506 		return 0;
507 	}
508 
509 	pthread_mutex_lock(&g_spdk_mem_map_mutex);
510 
511 	/* Check if any part of this range is already registered */
512 	seg_vaddr = vaddr;
513 	seg_len = len;
514 	while (seg_len > 0) {
515 		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
516 		if (reg & REG_MAP_REGISTERED) {
517 			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
518 			return -EBUSY;
519 		}
520 		seg_vaddr += VALUE_2MB;
521 		seg_len -= VALUE_2MB;
522 	}
523 
524 	/* Simply set the translation to the memory map's default. This allocates the space in the
525 	 * map but does not provide a valid translation. */
526 	spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, len,
527 				     g_mem_reg_map->default_translation);
528 
529 	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
530 		spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, map->default_translation);
531 	}
532 
533 	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
534 	return 0;
535 }
536 
537 static struct map_1gb *
538 mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
539 {
540 	struct map_1gb *map_1gb;
541 	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
542 	size_t i;
543 
544 	if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
545 		return NULL;
546 	}
547 
548 	map_1gb = map->map_256tb.map[idx_256tb];
549 
550 	if (!map_1gb) {
551 		pthread_mutex_lock(&map->mutex);
552 
553 		/* Recheck to make sure nobody else got the mutex first. */
554 		map_1gb = map->map_256tb.map[idx_256tb];
555 		if (!map_1gb) {
556 			map_1gb = malloc(sizeof(struct map_1gb));
557 			if (map_1gb) {
558 				/* initialize all entries to default translation */
559 				for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
560 					map_1gb->map[i].translation_2mb = map->default_translation;
561 				}
562 				map->map_256tb.map[idx_256tb] = map_1gb;
563 			}
564 		}
565 
566 		pthread_mutex_unlock(&map->mutex);
567 
568 		if (!map_1gb) {
569 			DEBUG_PRINT("allocation failed\n");
570 			return NULL;
571 		}
572 	}
573 
574 	return map_1gb;
575 }
576 
577 int
578 spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
579 			     uint64_t translation)
580 {
581 	uint64_t vfn_2mb;
582 	struct map_1gb *map_1gb;
583 	uint64_t idx_1gb;
584 	struct map_2mb *map_2mb;
585 
586 	if ((uintptr_t)vaddr & ~MASK_256TB) {
587 		DEBUG_PRINT("invalid usermode virtual address %" PRIu64 "\n", vaddr);
588 		return -EINVAL;
589 	}
590 
591 	/* For now, only 2 MB-aligned registrations are supported */
592 	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
593 		DEBUG_PRINT("invalid %s parameters, vaddr=%" PRIu64 " len=%" PRIu64 "\n",
594 			    __func__, vaddr, size);
595 		return -EINVAL;
596 	}
597 
598 	vfn_2mb = vaddr >> SHIFT_2MB;
599 
600 	while (size) {
601 		map_1gb = mem_map_get_map_1gb(map, vfn_2mb);
602 		if (!map_1gb) {
603 			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
604 			return -ENOMEM;
605 		}
606 
607 		idx_1gb = MAP_1GB_IDX(vfn_2mb);
608 		map_2mb = &map_1gb->map[idx_1gb];
609 		map_2mb->translation_2mb = translation;
610 
611 		size -= VALUE_2MB;
612 		vfn_2mb++;
613 	}
614 
615 	return 0;
616 }
617 
618 int
619 spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
620 {
621 	return spdk_mem_map_set_translation(map, vaddr, size, map->default_translation);
622 }
623 
624 inline uint64_t
625 spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
626 {
627 	const struct map_1gb *map_1gb;
628 	const struct map_2mb *map_2mb;
629 	uint64_t idx_256tb;
630 	uint64_t idx_1gb;
631 	uint64_t vfn_2mb;
632 	uint64_t cur_size;
633 	uint64_t prev_translation;
634 	uint64_t orig_translation;
635 
636 	if (spdk_unlikely(vaddr & ~MASK_256TB)) {
637 		DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
638 		return map->default_translation;
639 	}
640 
641 	vfn_2mb = vaddr >> SHIFT_2MB;
642 	idx_256tb = MAP_256TB_IDX(vfn_2mb);
643 	idx_1gb = MAP_1GB_IDX(vfn_2mb);
644 
645 	map_1gb = map->map_256tb.map[idx_256tb];
646 	if (spdk_unlikely(!map_1gb)) {
647 		return map->default_translation;
648 	}
649 
650 	cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
651 	map_2mb = &map_1gb->map[idx_1gb];
652 	if (size == NULL || map->ops.are_contiguous == NULL ||
653 	    map_2mb->translation_2mb == map->default_translation) {
654 		if (size != NULL) {
655 			*size = spdk_min(*size, cur_size);
656 		}
657 		return map_2mb->translation_2mb;
658 	}
659 
660 	orig_translation = map_2mb->translation_2mb;
661 	prev_translation = orig_translation;
662 	while (cur_size < *size) {
663 		vfn_2mb++;
664 		idx_256tb = MAP_256TB_IDX(vfn_2mb);
665 		idx_1gb = MAP_1GB_IDX(vfn_2mb);
666 
667 		map_1gb = map->map_256tb.map[idx_256tb];
668 		if (spdk_unlikely(!map_1gb)) {
669 			break;
670 		}
671 
672 		map_2mb = &map_1gb->map[idx_1gb];
673 		if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
674 			break;
675 		}
676 
677 		cur_size += VALUE_2MB;
678 		prev_translation = map_2mb->translation_2mb;
679 	}
680 
681 	*size = spdk_min(*size, cur_size);
682 	return orig_translation;
683 }
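/* Illustrative sketch (hypothetical map and buffer): when a size hint is
 * passed in, spdk_mem_map_translate() shrinks it to the largest run that the
 * map's are_contiguous() callback considers contiguous (or to the remainder
 * of the current 2MB page when there is no such callback).
 *
 *   uint64_t size = io_length;
 *   uint64_t translation = spdk_mem_map_translate(map, (uint64_t)buf, &size);
 *   // size may now be smaller than io_length; the caller splits the I/O.
 */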
684 
685 static void
686 memory_hotplug_cb(enum rte_mem_event event_type,
687 		  const void *addr, size_t len, void *arg)
688 {
689 	if (event_type == RTE_MEM_EVENT_ALLOC) {
690 		spdk_mem_register((void *)addr, len);
691 
692 		if (!spdk_env_dpdk_external_init()) {
693 			return;
694 		}
695 
696 		/* When the user initialized DPDK separately, we can't
697 		 * be sure that the --match-allocations RTE flag was specified.
698 		 * Without this flag, DPDK can free memory in different units
699 		 * than it was allocated, which doesn't work with things like RDMA MRs.
700 		 *
701 		 * For such cases, we mark segments so they aren't freed.
702 		 */
703 		while (len > 0) {
704 			struct rte_memseg *seg;
705 
706 			seg = rte_mem_virt2memseg(addr, NULL);
707 			assert(seg != NULL);
708 			seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
709 			addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
710 			len -= seg->hugepage_sz;
711 		}
712 	} else if (event_type == RTE_MEM_EVENT_FREE) {
713 		spdk_mem_unregister((void *)addr, len);
714 	}
715 }
716 
717 static int
718 memory_iter_cb(const struct rte_memseg_list *msl,
719 	       const struct rte_memseg *ms, size_t len, void *arg)
720 {
721 	return spdk_mem_register(ms->addr, len);
722 }
723 
724 int
725 mem_map_init(bool legacy_mem)
726 {
727 	g_legacy_mem = legacy_mem;
728 
729 	g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
730 	if (g_mem_reg_map == NULL) {
731 		DEBUG_PRINT("memory registration map allocation failed\n");
732 		return -ENOMEM;
733 	}
734 
735 	/*
736 	 * Walk all DPDK memory segments and register them
737 	 * with the main memory map
738 	 */
739 	rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
740 	rte_memseg_contig_walk(memory_iter_cb, NULL);
741 	return 0;
742 }
743 
744 bool
745 spdk_iommu_is_enabled(void)
746 {
747 #if VFIO_ENABLED
748 	return g_vfio.enabled && !g_vfio.noiommu_enabled;
749 #else
750 	return false;
751 #endif
752 }
753 
754 struct spdk_vtophys_pci_device {
755 	struct rte_pci_device *pci_device;
756 	TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
757 };
758 
759 static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
760 static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
761 	TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);
762 
763 static struct spdk_mem_map *g_vtophys_map;
764 static struct spdk_mem_map *g_phys_ref_map;
765 
766 #if VFIO_ENABLED
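/* Map a vaddr/iova range into the VFIO container shared with DPDK. The
 * mapping is reference counted per iova via g_phys_ref_map, so repeated
 * requests for the same range issue only a single VFIO_IOMMU_MAP_DMA ioctl.
 */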
767 static int
768 vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
769 {
770 	struct spdk_vfio_dma_map *dma_map;
771 	uint64_t refcount;
772 	int ret;
773 
774 	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
775 	assert(refcount < UINT64_MAX);
776 	if (refcount > 0) {
777 		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
778 		return 0;
779 	}
780 
781 	dma_map = calloc(1, sizeof(*dma_map));
782 	if (dma_map == NULL) {
783 		return -ENOMEM;
784 	}
785 
786 	dma_map->map.argsz = sizeof(dma_map->map);
787 	dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
788 	dma_map->map.vaddr = vaddr;
789 	dma_map->map.iova = iova;
790 	dma_map->map.size = size;
791 
792 	pthread_mutex_lock(&g_vfio.mutex);
793 	if (g_vfio.device_ref == 0) {
794 		/* VFIO requires at least one device (IOMMU group) to be added to
795 		 * a VFIO container before it is possible to perform any IOMMU
796 		 * operations on that container. This memory will be mapped once
797 		 * the first device (IOMMU group) is hotplugged.
798 		 *
799 		 * Since the vfio container is managed internally by DPDK, it is
800 		 * also possible that some device is already in that container, but
801 		 * it's not managed by SPDK - e.g. a NIC attached internally
802 		 * inside DPDK. We could map the memory straight away in such
803 		 * scenario, but there's no need to do it. DPDK devices clearly
804 		 * don't need our mappings and hence we defer the mapping
805 		 * unconditionally until the first SPDK-managed device is
806 		 * hotplugged.
807 		 */
808 		goto out_insert;
809 	}
810 
811 	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
812 	if (ret) {
813 		/* In some cases the vfio container doesn't have an IOMMU group; the mapping failure is safe to ignore then. */
814 		SPDK_NOTICELOG("Cannot set up DMA mapping, error %d, ignored\n", errno);
815 	}
816 
817 out_insert:
818 	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
819 	pthread_mutex_unlock(&g_vfio.mutex);
820 	spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
821 	return 0;
822 }
823 
824 static int
825 vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
826 {
827 	struct spdk_vfio_dma_map *dma_map;
828 	uint64_t refcount;
829 	int ret;
830 	struct vfio_iommu_type1_dma_unmap unmap = {};
831 
832 	pthread_mutex_lock(&g_vfio.mutex);
833 	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
834 		if (dma_map->map.iova == iova) {
835 			break;
836 		}
837 	}
838 
839 	if (dma_map == NULL) {
840 		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
841 		pthread_mutex_unlock(&g_vfio.mutex);
842 		return -ENXIO;
843 	}
844 
845 	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
846 	assert(refcount < UINT64_MAX);
847 	if (refcount > 0) {
848 		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount - 1);
849 	}
850 
851 	/* We still have outstanding references, don't clear the mapping. */
852 	if (refcount > 1) {
853 		pthread_mutex_unlock(&g_vfio.mutex);
854 		return 0;
855 	}
856 
857 	/* Partial or multiple-page unmaps are not supported for now. */
858 	assert(dma_map->map.size == size);
859 
860 	if (g_vfio.device_ref == 0) {
861 		/* Memory is not mapped anymore, just remove its references */
862 		goto out_remove;
863 	}
864 
865 	unmap.argsz = sizeof(unmap);
866 	unmap.flags = 0;
867 	unmap.iova = dma_map->map.iova;
868 	unmap.size = dma_map->map.size;
869 	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
870 	if (ret) {
871 		SPDK_NOTICELOG("Cannot clear DMA mapping, error %d, ignored\n", errno);
872 	}
873 
874 out_remove:
875 	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
876 	pthread_mutex_unlock(&g_vfio.mutex);
877 	free(dma_map);
878 	return 0;
879 }
880 #endif
881 
882 static uint64_t
883 vtophys_get_paddr_memseg(uint64_t vaddr)
884 {
885 	uintptr_t paddr;
886 	struct rte_memseg *seg;
887 
888 	seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
889 	if (seg != NULL) {
890 		paddr = seg->iova;
891 		if (paddr == RTE_BAD_IOVA) {
892 			return SPDK_VTOPHYS_ERROR;
893 		}
894 		paddr += (vaddr - (uintptr_t)seg->addr);
895 		return paddr;
896 	}
897 
898 	return SPDK_VTOPHYS_ERROR;
899 }
900 
901 /* Try to get the paddr from /proc/self/pagemap */
902 static uint64_t
903 vtophys_get_paddr_pagemap(uint64_t vaddr)
904 {
905 	uintptr_t paddr;
906 
907 	/* Silence static analyzers */
908 	assert(vaddr != 0);
909 	paddr = rte_mem_virt2iova((void *)vaddr);
910 	if (paddr == RTE_BAD_IOVA) {
911 		/*
912 		 * The vaddr may be valid but doesn't have a backing page
913 		 * assigned yet.  Touch the page to ensure a backing page
914 		 * gets assigned, then try to translate again.
915 		 */
916 		rte_atomic64_read((rte_atomic64_t *)vaddr);
917 		paddr = rte_mem_virt2iova((void *)vaddr);
918 	}
919 	if (paddr == RTE_BAD_IOVA) {
920 		/* Unable to get to the physical address. */
921 		return SPDK_VTOPHYS_ERROR;
922 	}
923 
924 	return paddr;
925 }
926 
927 /* Try to get the paddr from pci devices */
928 static uint64_t
929 vtophys_get_paddr_pci(uint64_t vaddr)
930 {
931 	struct spdk_vtophys_pci_device *vtophys_dev;
932 	uintptr_t paddr;
933 	struct rte_pci_device	*dev;
934 
935 	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
936 	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
937 		dev = vtophys_dev->pci_device;
938 		paddr = dpdk_pci_device_vtophys(dev, vaddr);
939 		if (paddr != SPDK_VTOPHYS_ERROR) {
940 			pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
941 			return paddr;
942 		}
943 	}
944 	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
945 
946 	return SPDK_VTOPHYS_ERROR;
947 }
948 
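/* Notify callback for the vtophys map. On REGISTER it resolves a physical
 * address (or iova) for each 2MB chunk, preferring DPDK memsegs, then
 * /proc/self/pagemap, then PCI device BARs; for memory not managed by DPDK it
 * also programs the IOMMU through VFIO when one is in use, and then records
 * the translation. On UNREGISTER it undoes any IOMMU mapping and clears the
 * translations.
 */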
949 static int
950 vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
951 	       enum spdk_mem_map_notify_action action,
952 	       void *vaddr, size_t len)
953 {
954 	int rc = 0, pci_phys = 0;
955 	uint64_t paddr;
956 
957 	if ((uintptr_t)vaddr & ~MASK_256TB) {
958 		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
959 		return -EINVAL;
960 	}
961 
962 	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
963 		DEBUG_PRINT("invalid parameters, vaddr=%p len=%ju\n",
964 			    vaddr, len);
965 		return -EINVAL;
966 	}
967 
968 	/* Get the physical address from the DPDK memsegs */
969 	paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
970 
971 	switch (action) {
972 	case SPDK_MEM_MAP_NOTIFY_REGISTER:
973 		if (paddr == SPDK_VTOPHYS_ERROR) {
974 			/* This is not an address that DPDK is managing. */
975 #if VFIO_ENABLED
976 			enum rte_iova_mode iova_mode;
977 
978 			iova_mode = rte_eal_iova_mode();
979 
980 			if (spdk_iommu_is_enabled() && iova_mode == RTE_IOVA_VA) {
981 				/* We'll use the virtual address as the iova to match DPDK. */
982 				paddr = (uint64_t)vaddr;
983 				rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len);
984 				if (rc) {
985 					return -EFAULT;
986 				}
987 				while (len > 0) {
988 					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
989 					if (rc != 0) {
990 						return rc;
991 					}
992 					vaddr += VALUE_2MB;
993 					paddr += VALUE_2MB;
994 					len -= VALUE_2MB;
995 				}
996 			} else
997 #endif
998 			{
999 				/* Get the physical address from /proc/self/pagemap. */
1000 				paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
1001 				if (paddr == SPDK_VTOPHYS_ERROR) {
1002 					/* Get the physical address from PCI devices */
1003 					paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
1004 					if (paddr == SPDK_VTOPHYS_ERROR) {
1005 						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1006 						return -EFAULT;
1007 					}
1008 					/* The beginning of this address range points to a PCI resource,
1009 					 * so the rest must point to a PCI resource as well.
1010 					 */
1011 					pci_phys = 1;
1012 				}
1013 
1014 				/* Get paddr for each 2MB chunk in this address range */
1015 				while (len > 0) {
1016 					/* Get the physical address from /proc/self/pagemap. */
1017 					if (pci_phys) {
1018 						paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
1019 					} else {
1020 						paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
1021 					}
1022 
1023 					if (paddr == SPDK_VTOPHYS_ERROR) {
1024 						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1025 						return -EFAULT;
1026 					}
1027 
1028 					/* Physical addresses obtained from PCI devices may not be 2MB aligned, so skip this check for them. */
1029 					if (!pci_phys && (paddr & MASK_2MB)) {
1030 						DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
1031 						return -EINVAL;
1032 					}
1033 #if VFIO_ENABLED
1034 					/* If the IOMMU is on, but DPDK is using iova-mode=pa, we want to register this memory
1035 					 * with the IOMMU using the physical address to match. */
1036 					if (spdk_iommu_is_enabled()) {
1037 						rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
1038 						if (rc) {
1039 							DEBUG_PRINT("Unable to assign vaddr %p to paddr 0x%" PRIx64 "\n", vaddr, paddr);
1040 							return -EFAULT;
1041 						}
1042 					}
1043 #endif
1044 
1045 					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1046 					if (rc != 0) {
1047 						return rc;
1048 					}
1049 
1050 					vaddr += VALUE_2MB;
1051 					len -= VALUE_2MB;
1052 				}
1053 			}
1054 		} else {
1055 			/* This is an address managed by DPDK. Just setup the translations. */
1056 			while (len > 0) {
1057 				paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
1058 				if (paddr == SPDK_VTOPHYS_ERROR) {
1059 					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1060 					return -EFAULT;
1061 				}
1062 
1063 				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1064 				if (rc != 0) {
1065 					return rc;
1066 				}
1067 
1068 				vaddr += VALUE_2MB;
1069 				len -= VALUE_2MB;
1070 			}
1071 		}
1072 
1073 		break;
1074 	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
1075 #if VFIO_ENABLED
1076 		if (paddr == SPDK_VTOPHYS_ERROR) {
1077 			/*
1078 			 * This is not an address that DPDK is managing. If vfio is enabled,
1079 			 * we need to unmap the range from the IOMMU
1080 			 */
1081 			if (spdk_iommu_is_enabled()) {
1082 				uint64_t buffer_len = len;
1083 				uint8_t *va = vaddr;
1084 				enum rte_iova_mode iova_mode;
1085 
1086 				iova_mode = rte_eal_iova_mode();
1087 				/*
1088 				 * In virtual address mode, the region is contiguous and can be done in
1089 				 * one unmap.
1090 				 */
1091 				if (iova_mode == RTE_IOVA_VA) {
1092 					paddr = spdk_mem_map_translate(map, (uint64_t)va, &buffer_len);
1093 					if (buffer_len != len || paddr != (uintptr_t)va) {
1094 						DEBUG_PRINT("Unmapping %p with length %lu failed because "
1095 							    "translation had address 0x%" PRIx64 " and length %lu\n",
1096 							    va, len, paddr, buffer_len);
1097 						return -EINVAL;
1098 					}
1099 					rc = vtophys_iommu_unmap_dma(paddr, len);
1100 					if (rc) {
1101 						DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
1102 						return -EFAULT;
1103 					}
1104 				} else if (iova_mode == RTE_IOVA_PA) {
1105 					/* Get paddr for each 2MB chunk in this address range */
1106 					while (buffer_len > 0) {
1107 						paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);
1108 
1109 						if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
1110 							DEBUG_PRINT("could not get phys addr for %p\n", va);
1111 							return -EFAULT;
1112 						}
1113 
1114 						rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
1115 						if (rc) {
1116 							DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
1117 							return -EFAULT;
1118 						}
1119 
1120 						va += VALUE_2MB;
1121 						buffer_len -= VALUE_2MB;
1122 					}
1123 				}
1124 			}
1125 		}
1126 #endif
1127 		while (len > 0) {
1128 			rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
1129 			if (rc != 0) {
1130 				return rc;
1131 			}
1132 
1133 			vaddr += VALUE_2MB;
1134 			len -= VALUE_2MB;
1135 		}
1136 
1137 		break;
1138 	default:
1139 		SPDK_UNREACHABLE();
1140 	}
1141 
1142 	return rc;
1143 }
1144 
1145 static int
1146 vtophys_check_contiguous_entries(uint64_t paddr1, uint64_t paddr2)
1147 {
1148 	/* This function is always called with paddrs for two subsequent
1149 	 * 2MB chunks in virtual address space, so those chunks will be only
1150 	 * physically contiguous if the physical addresses are 2MB apart
1151 	 * from each other as well.
1152 	 */
1153 	return (paddr2 - paddr1 == VALUE_2MB);
1154 }
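/* For example, paddr1 == 0x100000000 and paddr2 == 0x100200000 (exactly
 * VALUE_2MB apart) are treated as contiguous, while two pages that are
 * further apart, or in reverse order, are not.
 */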
1155 
1156 #if VFIO_ENABLED
1157 
1158 static bool
1159 vfio_enabled(void)
1160 {
1161 	return rte_vfio_is_enabled("vfio_pci");
1162 }
1163 
1164 /* Check if IOMMU is enabled on the system */
1165 static bool
1166 has_iommu_groups(void)
1167 {
1168 	int count = 0;
1169 	DIR *dir = opendir("/sys/kernel/iommu_groups");
1170 
1171 	if (dir == NULL) {
1172 		return false;
1173 	}
1174 
1175 	while (count < 3 && readdir(dir) != NULL) {
1176 		count++;
1177 	}
1178 
1179 	closedir(dir);
1180 	/* there will always be ./ and ../ entries */
1181 	return count > 2;
1182 }
1183 
1184 static bool
1185 vfio_noiommu_enabled(void)
1186 {
1187 	return rte_vfio_noiommu_is_enabled();
1188 }
1189 
1190 static void
1191 vtophys_iommu_init(void)
1192 {
1193 	char proc_fd_path[PATH_MAX + 1];
1194 	char link_path[PATH_MAX + 1];
1195 	const char vfio_path[] = "/dev/vfio/vfio";
1196 	DIR *dir;
1197 	struct dirent *d;
1198 
1199 	if (!vfio_enabled()) {
1200 		return;
1201 	}
1202 
1203 	if (vfio_noiommu_enabled()) {
1204 		g_vfio.noiommu_enabled = true;
1205 	} else if (!has_iommu_groups()) {
1206 		return;
1207 	}
1208 
1209 	dir = opendir("/proc/self/fd");
1210 	if (!dir) {
1211 		DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
1212 		return;
1213 	}
1214 
1215 	while ((d = readdir(dir)) != NULL) {
1216 		if (d->d_type != DT_LNK) {
1217 			continue;
1218 		}
1219 
1220 		snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
1221 		if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
1222 			continue;
1223 		}
1224 
1225 		if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
1226 			sscanf(d->d_name, "%d", &g_vfio.fd);
1227 			break;
1228 		}
1229 	}
1230 
1231 	closedir(dir);
1232 
1233 	if (g_vfio.fd < 0) {
1234 		DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
1235 		return;
1236 	}
1237 
1238 	g_vfio.enabled = true;
1239 
1240 	return;
1241 }
1242 
1243 #endif
1244 
1245 void
1246 vtophys_pci_device_added(struct rte_pci_device *pci_device)
1247 {
1248 	struct spdk_vtophys_pci_device *vtophys_dev;
1249 
1250 	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1251 
1252 	vtophys_dev = calloc(1, sizeof(*vtophys_dev));
1253 	if (vtophys_dev) {
1254 		vtophys_dev->pci_device = pci_device;
1255 		TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
1256 	} else {
1257 		DEBUG_PRINT("Memory allocation error\n");
1258 	}
1259 	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1260 
1261 #if VFIO_ENABLED
1262 	struct spdk_vfio_dma_map *dma_map;
1263 	int ret;
1264 
1265 	if (!g_vfio.enabled) {
1266 		return;
1267 	}
1268 
1269 	pthread_mutex_lock(&g_vfio.mutex);
1270 	g_vfio.device_ref++;
1271 	if (g_vfio.device_ref > 1) {
1272 		pthread_mutex_unlock(&g_vfio.mutex);
1273 		return;
1274 	}
1275 
1276 	/* This is the first SPDK device using DPDK vfio. This means that the first
1277 	 * IOMMU group might have just been added to the DPDK vfio container.
1278 	 * From this point on it is certain that the memory can be mapped.
1279 	 */
1280 	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
1281 		ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
1282 		if (ret) {
1283 			DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
1284 			break;
1285 		}
1286 	}
1287 	pthread_mutex_unlock(&g_vfio.mutex);
1288 #endif
1289 }
1290 
1291 void
1292 vtophys_pci_device_removed(struct rte_pci_device *pci_device)
1293 {
1294 	struct spdk_vtophys_pci_device *vtophys_dev;
1295 
1296 	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1297 	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
1298 		if (vtophys_dev->pci_device == pci_device) {
1299 			TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
1300 			free(vtophys_dev);
1301 			break;
1302 		}
1303 	}
1304 	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1305 
1306 #if VFIO_ENABLED
1307 	struct spdk_vfio_dma_map *dma_map;
1308 	int ret;
1309 
1310 	if (!g_vfio.enabled) {
1311 		return;
1312 	}
1313 
1314 	pthread_mutex_lock(&g_vfio.mutex);
1315 	assert(g_vfio.device_ref > 0);
1316 	g_vfio.device_ref--;
1317 	if (g_vfio.device_ref > 0) {
1318 		pthread_mutex_unlock(&g_vfio.mutex);
1319 		return;
1320 	}
1321 
1322 	/* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
1323 	 * any additional devices using its vfio container, all the mappings
1324 	 * will be automatically removed by the Linux vfio driver. We unmap
1325 	 * the memory manually to be able to easily re-map it later regardless
1326 	 * of other, external factors.
1327 	 */
1328 	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
1329 		struct vfio_iommu_type1_dma_unmap unmap = {};
1330 		unmap.argsz = sizeof(unmap);
1331 		unmap.flags = 0;
1332 		unmap.iova = dma_map->map.iova;
1333 		unmap.size = dma_map->map.size;
1334 		ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
1335 		if (ret) {
1336 			DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
1337 			break;
1338 		}
1339 	}
1340 	pthread_mutex_unlock(&g_vfio.mutex);
1341 #endif
1342 }
1343 
1344 int
1345 vtophys_init(void)
1346 {
1347 	const struct spdk_mem_map_ops vtophys_map_ops = {
1348 		.notify_cb = vtophys_notify,
1349 		.are_contiguous = vtophys_check_contiguous_entries,
1350 	};
1351 
1352 	const struct spdk_mem_map_ops phys_ref_map_ops = {
1353 		.notify_cb = NULL,
1354 		.are_contiguous = NULL,
1355 	};
1356 
1357 #if VFIO_ENABLED
1358 	vtophys_iommu_init();
1359 #endif
1360 
1361 	g_phys_ref_map = spdk_mem_map_alloc(0, &phys_ref_map_ops, NULL);
1362 	if (g_phys_ref_map == NULL) {
1363 		DEBUG_PRINT("phys_ref map allocation failed.\n");
1364 		return -ENOMEM;
1365 	}
1366 
1367 	g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
1368 	if (g_vtophys_map == NULL) {
1369 		DEBUG_PRINT("vtophys map allocation failed\n");
1370 		spdk_mem_map_free(&g_phys_ref_map);
1371 		return -ENOMEM;
1372 	}
1373 	return 0;
1374 }
1375 
1376 uint64_t
1377 spdk_vtophys(const void *buf, uint64_t *size)
1378 {
1379 	uint64_t vaddr, paddr_2mb;
1380 
1381 	vaddr = (uint64_t)buf;
1382 	paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);
1383 
1384 	/*
1385 	 * SPDK_VTOPHYS_ERROR has all bits set, so bitwise-or'ing the buf offset into it
1386 	 * would still yield SPDK_VTOPHYS_ERROR. However, since PCI vtophys translations
1387 	 * can be unaligned, we add the offset rather than or it in, so we must check the
1388 	 * return value before the addition.
1389 	 */
1390 	SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
1391 	if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
1392 		return SPDK_VTOPHYS_ERROR;
1393 	} else {
1394 		return paddr_2mb + (vaddr & MASK_2MB);
1395 	}
1396 }
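/* Illustrative sketch (hypothetical buffer): callers building scatter-gather
 * lists typically translate in a loop, letting the size hint report how much
 * of the buffer is physically contiguous at each step.
 *
 *   uint64_t remaining = len, chunk;
 *   const uint8_t *p = buf;
 *   while (remaining > 0) {
 *       chunk = remaining;
 *       uint64_t phys = spdk_vtophys(p, &chunk);
 *       if (phys == SPDK_VTOPHYS_ERROR) {
 *           break;
 *       }
 *       // append (phys, chunk) to the scatter-gather list
 *       p += chunk;
 *       remaining -= chunk;
 *   }
 */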
1397 
1398 int
1399 spdk_mem_get_fd_and_offset(void *vaddr, uint64_t *offset)
1400 {
1401 	struct rte_memseg *seg;
1402 	int ret, fd;
1403 
1404 	seg = rte_mem_virt2memseg(vaddr, NULL);
1405 	if (!seg) {
1406 		SPDK_ERRLOG("memory %p doesn't exist\n", vaddr);
1407 		return -ENOENT;
1408 	}
1409 
1410 	fd = rte_memseg_get_fd_thread_unsafe(seg);
1411 	if (fd < 0) {
1412 		return fd;
1413 	}
1414 
1415 	ret = rte_memseg_get_fd_offset_thread_unsafe(seg, offset);
1416 	if (ret < 0) {
1417 		return ret;
1418 	}
1419 
1420 	return fd;
1421 }
1422