xref: /spdk/lib/env_dpdk/memory.c (revision c164db9ffe3718ad4e4f5bab380ccfa62c2fa672)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2017 Intel Corporation.
3  *   All rights reserved.
4  */
5 
6 #include "spdk/stdinc.h"
7 
8 #include "env_internal.h"
9 #include "pci_dpdk.h"
10 
11 #include <rte_config.h>
12 #include <rte_memory.h>
13 #include <rte_eal_memconfig.h>
14 #include <rte_dev.h>
15 #include <rte_pci.h>
16 
17 #include "spdk_internal/assert.h"
18 
19 #include "spdk/assert.h"
20 #include "spdk/likely.h"
21 #include "spdk/queue.h"
22 #include "spdk/util.h"
23 #include "spdk/memory.h"
24 #include "spdk/env_dpdk.h"
25 #include "spdk/log.h"
26 
27 #ifdef __linux__
28 #include <linux/version.h>
29 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
30 #include <linux/vfio.h>
31 #include <rte_vfio.h>
32 
33 struct spdk_vfio_dma_map {
34 	struct vfio_iommu_type1_dma_map map;
35 	TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
36 };
37 
38 struct vfio_cfg {
39 	int fd;
40 	bool enabled;
41 	bool noiommu_enabled;
42 	unsigned device_ref;
43 	TAILQ_HEAD(, spdk_vfio_dma_map) maps;
44 	pthread_mutex_t mutex;
45 };
46 
47 static struct vfio_cfg g_vfio = {
48 	.fd = -1,
49 	.enabled = false,
50 	.noiommu_enabled = false,
51 	.device_ref = 0,
52 	.maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
53 	.mutex = PTHREAD_MUTEX_INITIALIZER
54 };
55 #endif
56 #endif
57 
58 #if DEBUG
59 #define DEBUG_PRINT(...) SPDK_ERRLOG(__VA_ARGS__)
60 #else
61 #define DEBUG_PRINT(...)
62 #endif
63 
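/* Convert a page frame number between 2MB granularity and 4KB granularity
 * (SHIFT_2MB and SHIFT_4KB come from spdk/memory.h).
 */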
64 #define FN_2MB_TO_4KB(fn)	(fn << (SHIFT_2MB - SHIFT_4KB))
65 #define FN_4KB_TO_2MB(fn)	(fn >> (SHIFT_2MB - SHIFT_4KB))
66 
67 #define MAP_256TB_IDX(vfn_2mb)	((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
68 #define MAP_1GB_IDX(vfn_2mb)	((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))
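
/* Illustrative example: for vaddr = 0x40200000 (1 GB + 2 MB),
 * vfn_2mb = vaddr >> SHIFT_2MB = 0x201, so MAP_256TB_IDX(vfn_2mb) = 1 and
 * MAP_1GB_IDX(vfn_2mb) = 1, i.e. the second first-level slot and the second
 * 2MB entry within that 1GB table.
 */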
69 
70 /* Page is registered */
71 #define REG_MAP_REGISTERED	(1ULL << 62)
72 
73 /* A notification region barrier. The 2MB translation entry that's marked
74  * with this flag must be unregistered separately. This allows contiguous
75  * regions to be unregistered in the same chunks they were registered.
76  */
77 #define REG_MAP_NOTIFY_START	(1ULL << 63)
78 
79 /* Translation of a single 2MB page. */
80 struct map_2mb {
81 	uint64_t translation_2mb;
82 };
83 
84 /* Second-level map table indexed by bits [21..29] of the virtual address.
85  * Each entry contains the address translation for that 2MB page, or the
86  * map's default translation if no translation has been set yet.
87  */
88 struct map_1gb {
89 	struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
90 };
91 
92 /* Top-level map table indexed by bits [30..47] of the virtual address.
93  * Each entry points to a second-level map table or NULL.
94  */
95 struct map_256tb {
96 	struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
97 };
98 
99 /* Page-granularity memory address translation */
100 struct spdk_mem_map {
101 	struct map_256tb map_256tb;
102 	pthread_mutex_t mutex;
103 	uint64_t default_translation;
104 	struct spdk_mem_map_ops ops;
105 	void *cb_ctx;
106 	TAILQ_ENTRY(spdk_mem_map) tailq;
107 };
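
/* Putting the two levels together, a lookup walks the trie roughly like the
 * sketch below (error handling omitted; see spdk_mem_map_translate() for the
 * real implementation):
 *
 *   uint64_t vfn_2mb = vaddr >> SHIFT_2MB;
 *   struct map_1gb *map_1gb = map->map_256tb.map[MAP_256TB_IDX(vfn_2mb)];
 *   uint64_t translation = map_1gb != NULL ?
 *                          map_1gb->map[MAP_1GB_IDX(vfn_2mb)].translation_2mb :
 *                          map->default_translation;
 */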
108 
109 /* Registrations map. The 64-bit translations are bit fields with the
110  * following layout (starting with the low bits):
111  *    0 - 61 : reserved
112  *   62 - 63 : flags
113  */
114 static struct spdk_mem_map *g_mem_reg_map;
115 static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps =
116 	TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
117 static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;
118 
119 static bool g_legacy_mem;
120 static bool g_huge_pages = true;
121 
122 /*
123  * Walk the currently registered memory via the main memory registration map
124  * and call the new map's notify callback for each virtually contiguous region.
125  */
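/*
 * For example, if pages covering [0x200000, 0x800000) and [0xa00000, 0xe00000)
 * were each registered with a single spdk_mem_register() call, the walk below
 * invokes notify_cb twice: once with (vaddr = 0x200000, len = 0x600000) and
 * once with (vaddr = 0xa00000, len = 0x400000).
 */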
126 static int
127 mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
128 {
129 	size_t idx_256tb;
130 	uint64_t idx_1gb;
131 	uint64_t contig_start = UINT64_MAX;
132 	uint64_t contig_end = UINT64_MAX;
133 	struct map_1gb *map_1gb;
134 	int rc;
135 
136 	if (!g_mem_reg_map) {
137 		return -EINVAL;
138 	}
139 
140 	/* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
141 	pthread_mutex_lock(&g_mem_reg_map->mutex);
142 
143 	for (idx_256tb = 0;
144 	     idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
145 	     idx_256tb++) {
146 		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];
147 
148 		if (!map_1gb) {
149 			if (contig_start != UINT64_MAX) {
150 				/* End of a virtually contiguous range */
151 				rc = map->ops.notify_cb(map->cb_ctx, map, action,
152 							(void *)contig_start,
153 							contig_end - contig_start + VALUE_2MB);
154 				/* Don't bother handling unregister failures. It can't be any worse */
155 				if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
156 					goto err_unregister;
157 				}
158 			}
159 			contig_start = UINT64_MAX;
160 			continue;
161 		}
162 
163 		for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
164 			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
165 			    (contig_start == UINT64_MAX ||
166 			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
167 				/* Rebuild the virtual address from the indexes */
168 				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
169 
170 				if (contig_start == UINT64_MAX) {
171 					contig_start = vaddr;
172 				}
173 
174 				contig_end = vaddr;
175 			} else {
176 				if (contig_start != UINT64_MAX) {
177 					/* End of a virtually contiguous range */
178 					rc = map->ops.notify_cb(map->cb_ctx, map, action,
179 								(void *)contig_start,
180 								contig_end - contig_start + VALUE_2MB);
181 					/* Don't bother handling unregister failures. It can't be any worse */
182 					if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
183 						goto err_unregister;
184 					}
185 
186 					/* This page might be part of a neighbouring region, so process
187 					 * it again. idx_1gb will be incremented immediately by the loop.
188 					 */
189 					idx_1gb--;
190 				}
191 				contig_start = UINT64_MAX;
192 			}
193 		}
194 	}
195 
196 	pthread_mutex_unlock(&g_mem_reg_map->mutex);
197 	return 0;
198 
199 err_unregister:
200 	/* Unwind to the first empty translation so we don't unregister
201 	 * a region that just failed to register.
202 	 */
203 	idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
204 	idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
205 	contig_start = UINT64_MAX;
206 	contig_end = UINT64_MAX;
207 
208 	/* Unregister any memory we managed to register before the failure */
209 	for (; idx_256tb < SIZE_MAX; idx_256tb--) {
210 		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];
211 
212 		if (!map_1gb) {
213 			if (contig_end != UINT64_MAX) {
214 				/* End of a virtually contiguous range */
215 				map->ops.notify_cb(map->cb_ctx, map,
216 						   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
217 						   (void *)contig_start,
218 						   contig_end - contig_start + VALUE_2MB);
219 			}
220 			contig_end = UINT64_MAX;
221 			continue;
222 		}
223 
224 		for (; idx_1gb < UINT64_MAX; idx_1gb--) {
225 			/* Rebuild the virtual address from the indexes */
226 			uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
227 			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
228 			    (contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
229 
230 				if (contig_end == UINT64_MAX) {
231 					contig_end = vaddr;
232 				}
233 				contig_start = vaddr;
234 			} else {
235 				if (contig_end != UINT64_MAX) {
236 					if (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) {
237 						contig_start = vaddr;
238 					}
239 					/* End of a virtually contiguous range */
240 					map->ops.notify_cb(map->cb_ctx, map,
241 							   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
242 							   (void *)contig_start,
243 							   contig_end - contig_start + VALUE_2MB);
244 				}
245 				contig_end = UINT64_MAX;
246 			}
247 		}
248 		idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
249 	}
250 
251 	pthread_mutex_unlock(&g_mem_reg_map->mutex);
252 	return rc;
253 }
254 
255 struct spdk_mem_map *
256 spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
257 {
258 	struct spdk_mem_map *map;
259 	int rc;
260 	size_t i;
261 
262 	map = calloc(1, sizeof(*map));
263 	if (map == NULL) {
264 		return NULL;
265 	}
266 
267 	if (pthread_mutex_init(&map->mutex, NULL)) {
268 		free(map);
269 		return NULL;
270 	}
271 
272 	map->default_translation = default_translation;
273 	map->cb_ctx = cb_ctx;
274 	if (ops) {
275 		map->ops = *ops;
276 	}
277 
278 	if (ops && ops->notify_cb) {
279 		pthread_mutex_lock(&g_spdk_mem_map_mutex);
280 		rc = mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
281 		if (rc != 0) {
282 			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
283 			DEBUG_PRINT("Initial mem_map notify failed\n");
284 			pthread_mutex_destroy(&map->mutex);
285 			for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
286 				free(map->map_256tb.map[i]);
287 			}
288 			free(map);
289 			return NULL;
290 		}
291 		TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
292 		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
293 	}
294 
295 	return map;
296 }
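
/*
 * Minimal usage sketch (hypothetical consumer, not part of this file):
 * allocate a map whose notify callback tracks registered regions, then query
 * it with spdk_mem_map_translate().
 *
 *   static int
 *   my_notify(void *cb_ctx, struct spdk_mem_map *map,
 *             enum spdk_mem_map_notify_action action, void *vaddr, size_t len)
 *   {
 *           // e.g. program an IOMMU or fill a lookup table for [vaddr, vaddr + len)
 *           return 0;
 *   }
 *
 *   static const struct spdk_mem_map_ops my_ops = {
 *           .notify_cb = my_notify,
 *           .are_contiguous = NULL,
 *   };
 *
 *   struct spdk_mem_map *my_map = spdk_mem_map_alloc(UINT64_MAX, &my_ops, NULL);
 */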
297 
298 void
299 spdk_mem_map_free(struct spdk_mem_map **pmap)
300 {
301 	struct spdk_mem_map *map;
302 	size_t i;
303 
304 	if (!pmap) {
305 		return;
306 	}
307 
308 	map = *pmap;
309 
310 	if (!map) {
311 		return;
312 	}
313 
314 	if (map->ops.notify_cb) {
315 		pthread_mutex_lock(&g_spdk_mem_map_mutex);
316 		mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
317 		TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
318 		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
319 	}
320 
321 	for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
322 		free(map->map_256tb.map[i]);
323 	}
324 
325 	pthread_mutex_destroy(&map->mutex);
326 
327 	free(map);
328 	*pmap = NULL;
329 }
330 
331 int
332 spdk_mem_register(void *_vaddr, size_t len)
333 {
334 	struct spdk_mem_map *map;
335 	int rc;
336 	uint64_t vaddr = (uintptr_t)_vaddr;
337 	uint64_t seg_vaddr;
338 	size_t seg_len;
339 	uint64_t reg;
340 
341 	if ((uintptr_t)vaddr & ~MASK_256TB) {
342 		DEBUG_PRINT("invalid usermode virtual address %jx\n", vaddr);
343 		return -EINVAL;
344 	}
345 
346 	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
347 		DEBUG_PRINT("invalid %s parameters, vaddr=%jx len=%ju\n",
348 			    __func__, vaddr, len);
349 		return -EINVAL;
350 	}
351 
352 	if (len == 0) {
353 		return 0;
354 	}
355 
356 	pthread_mutex_lock(&g_spdk_mem_map_mutex);
357 
358 	seg_vaddr = vaddr;
359 	seg_len = len;
360 	while (seg_len > 0) {
361 		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
362 		if (reg & REG_MAP_REGISTERED) {
363 			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
364 			return -EBUSY;
365 		}
366 		seg_vaddr += VALUE_2MB;
367 		seg_len -= VALUE_2MB;
368 	}
369 
370 	seg_vaddr = vaddr;
371 	seg_len = 0;
372 	while (len > 0) {
373 		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
374 					     seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
375 		seg_len += VALUE_2MB;
376 		vaddr += VALUE_2MB;
377 		len -= VALUE_2MB;
378 	}
379 
380 	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
381 		rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER,
382 					(void *)seg_vaddr, seg_len);
383 		if (rc != 0) {
384 			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
385 			return rc;
386 		}
387 	}
388 
389 	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
390 	return 0;
391 }
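
/*
 * Usage sketch (hypothetical caller, assuming the buffer is backed by memory
 * for which every registered map can produce a translation, e.g. pinned or
 * hugepage-backed memory): both the address and the length must be 2MB
 * aligned, and the range should later be released with spdk_mem_unregister()
 * starting at the same address.
 *
 *   void *buf = NULL;
 *
 *   if (posix_memalign(&buf, VALUE_2MB, 4 * VALUE_2MB) == 0) {
 *           if (spdk_mem_register(buf, 4 * VALUE_2MB) == 0) {
 *                   // buf is now visible to all registered mem maps
 *                   spdk_mem_unregister(buf, 4 * VALUE_2MB);
 *           }
 *           free(buf);
 *   }
 */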
392 
393 int
394 spdk_mem_unregister(void *_vaddr, size_t len)
395 {
396 	struct spdk_mem_map *map;
397 	int rc;
398 	uint64_t vaddr = (uintptr_t)_vaddr;
399 	uint64_t seg_vaddr;
400 	size_t seg_len;
401 	uint64_t reg, newreg;
402 
403 	if ((uintptr_t)vaddr & ~MASK_256TB) {
404 		DEBUG_PRINT("invalid usermode virtual address %jx\n", vaddr);
405 		return -EINVAL;
406 	}
407 
408 	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
409 		DEBUG_PRINT("invalid %s parameters, vaddr=%jx len=%ju\n",
410 			    __func__, vaddr, len);
411 		return -EINVAL;
412 	}
413 
414 	pthread_mutex_lock(&g_spdk_mem_map_mutex);
415 
416 	/* The first page must be the start of a region. Also check if it's
417 	 * registered to make sure we don't return -ERANGE for non-registered
418 	 * regions.
419 	 */
420 	reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
421 	if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
422 		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
423 		return -ERANGE;
424 	}
425 
426 	seg_vaddr = vaddr;
427 	seg_len = len;
428 	while (seg_len > 0) {
429 		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
430 		if ((reg & REG_MAP_REGISTERED) == 0) {
431 			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
432 			return -EINVAL;
433 		}
434 		seg_vaddr += VALUE_2MB;
435 		seg_len -= VALUE_2MB;
436 	}
437 
438 	newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
439 	/* If the next page is registered, it must be the start of a region as well,
440 	 * otherwise we'd be unregistering only a part of a region.
441 	 */
442 	if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
443 		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
444 		return -ERANGE;
445 	}
446 	seg_vaddr = vaddr;
447 	seg_len = 0;
448 
449 	while (len > 0) {
450 		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
451 		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);
452 
453 		if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
454 			TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
455 				rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER,
456 							(void *)seg_vaddr, seg_len);
457 				if (rc != 0) {
458 					pthread_mutex_unlock(&g_spdk_mem_map_mutex);
459 					return rc;
460 				}
461 			}
462 
463 			seg_vaddr = vaddr;
464 			seg_len = VALUE_2MB;
465 		} else {
466 			seg_len += VALUE_2MB;
467 		}
468 
469 		vaddr += VALUE_2MB;
470 		len -= VALUE_2MB;
471 	}
472 
473 	if (seg_len > 0) {
474 		TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
475 			rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER,
476 						(void *)seg_vaddr, seg_len);
477 			if (rc != 0) {
478 				pthread_mutex_unlock(&g_spdk_mem_map_mutex);
479 				return rc;
480 			}
481 		}
482 	}
483 
484 	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
485 	return 0;
486 }
487 
488 int
489 spdk_mem_reserve(void *vaddr, size_t len)
490 {
491 	struct spdk_mem_map *map;
492 	void *seg_vaddr;
493 	size_t seg_len;
494 	uint64_t reg;
495 
496 	if ((uintptr_t)vaddr & ~MASK_256TB) {
497 		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
498 		return -EINVAL;
499 	}
500 
501 	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
502 		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
503 			    __func__, vaddr, len);
504 		return -EINVAL;
505 	}
506 
507 	if (len == 0) {
508 		return 0;
509 	}
510 
511 	pthread_mutex_lock(&g_spdk_mem_map_mutex);
512 
513 	/* Check if any part of this range is already registered */
514 	seg_vaddr = vaddr;
515 	seg_len = len;
516 	while (seg_len > 0) {
517 		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
518 		if (reg & REG_MAP_REGISTERED) {
519 			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
520 			return -EBUSY;
521 		}
522 		seg_vaddr += VALUE_2MB;
523 		seg_len -= VALUE_2MB;
524 	}
525 
526 	/* Simply set the translation to the memory map's default. This allocates the space in the
527 	 * map but does not provide a valid translation. */
528 	spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, len,
529 				     g_mem_reg_map->default_translation);
530 
531 	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
532 		spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, map->default_translation);
533 	}
534 
535 	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
536 	return 0;
537 }
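
/*
 * Usage sketch (hypothetical): pre-populate the second-level tables for an
 * address range in the registration map and in every other registered map,
 * without marking the range as registered or providing a translation. The
 * range must be 2MB aligned and must not overlap an existing registration.
 *
 *   spdk_mem_reserve(addr, 2 * VALUE_2MB);
 */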
538 
539 static struct map_1gb *
540 mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
541 {
542 	struct map_1gb *map_1gb;
543 	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
544 	size_t i;
545 
546 	if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
547 		return NULL;
548 	}
549 
550 	map_1gb = map->map_256tb.map[idx_256tb];
551 
552 	if (!map_1gb) {
553 		pthread_mutex_lock(&map->mutex);
554 
555 		/* Recheck to make sure nobody else got the mutex first. */
556 		map_1gb = map->map_256tb.map[idx_256tb];
557 		if (!map_1gb) {
558 			map_1gb = malloc(sizeof(struct map_1gb));
559 			if (map_1gb) {
560 				/* initialize all entries to default translation */
561 				for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
562 					map_1gb->map[i].translation_2mb = map->default_translation;
563 				}
564 				map->map_256tb.map[idx_256tb] = map_1gb;
565 			}
566 		}
567 
568 		pthread_mutex_unlock(&map->mutex);
569 
570 		if (!map_1gb) {
571 			DEBUG_PRINT("allocation failed\n");
572 			return NULL;
573 		}
574 	}
575 
576 	return map_1gb;
577 }
578 
579 int
580 spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
581 			     uint64_t translation)
582 {
583 	uint64_t vfn_2mb;
584 	struct map_1gb *map_1gb;
585 	uint64_t idx_1gb;
586 	struct map_2mb *map_2mb;
587 
588 	if ((uintptr_t)vaddr & ~MASK_256TB) {
589 		DEBUG_PRINT("invalid usermode virtual address %" PRIu64 "\n", vaddr);
590 		return -EINVAL;
591 	}
592 
593 	/* For now, only 2 MB-aligned registrations are supported */
594 	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
595 		DEBUG_PRINT("invalid %s parameters, vaddr=%" PRIu64 " len=%" PRIu64 "\n",
596 			    __func__, vaddr, size);
597 		return -EINVAL;
598 	}
599 
600 	vfn_2mb = vaddr >> SHIFT_2MB;
601 
602 	while (size) {
603 		map_1gb = mem_map_get_map_1gb(map, vfn_2mb);
604 		if (!map_1gb) {
605 			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
606 			return -ENOMEM;
607 		}
608 
609 		idx_1gb = MAP_1GB_IDX(vfn_2mb);
610 		map_2mb = &map_1gb->map[idx_1gb];
611 		map_2mb->translation_2mb = translation;
612 
613 		size -= VALUE_2MB;
614 		vfn_2mb++;
615 	}
616 
617 	return 0;
618 }
619 
620 int
621 spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
622 {
623 	return spdk_mem_map_set_translation(map, vaddr, size, map->default_translation);
624 }
625 
626 inline uint64_t
627 spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
628 {
629 	const struct map_1gb *map_1gb;
630 	const struct map_2mb *map_2mb;
631 	uint64_t idx_256tb;
632 	uint64_t idx_1gb;
633 	uint64_t vfn_2mb;
634 	uint64_t cur_size;
635 	uint64_t prev_translation;
636 	uint64_t orig_translation;
637 
638 	if (spdk_unlikely(vaddr & ~MASK_256TB)) {
639 		DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
640 		return map->default_translation;
641 	}
642 
643 	vfn_2mb = vaddr >> SHIFT_2MB;
644 	idx_256tb = MAP_256TB_IDX(vfn_2mb);
645 	idx_1gb = MAP_1GB_IDX(vfn_2mb);
646 
647 	map_1gb = map->map_256tb.map[idx_256tb];
648 	if (spdk_unlikely(!map_1gb)) {
649 		return map->default_translation;
650 	}
651 
652 	cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
653 	map_2mb = &map_1gb->map[idx_1gb];
654 	if (size == NULL || map->ops.are_contiguous == NULL ||
655 	    map_2mb->translation_2mb == map->default_translation) {
656 		if (size != NULL) {
657 			*size = spdk_min(*size, cur_size);
658 		}
659 		return map_2mb->translation_2mb;
660 	}
661 
662 	orig_translation = map_2mb->translation_2mb;
663 	prev_translation = orig_translation;
664 	while (cur_size < *size) {
665 		vfn_2mb++;
666 		idx_256tb = MAP_256TB_IDX(vfn_2mb);
667 		idx_1gb = MAP_1GB_IDX(vfn_2mb);
668 
669 		map_1gb = map->map_256tb.map[idx_256tb];
670 		if (spdk_unlikely(!map_1gb)) {
671 			break;
672 		}
673 
674 		map_2mb = &map_1gb->map[idx_1gb];
675 		if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
676 			break;
677 		}
678 
679 		cur_size += VALUE_2MB;
680 		prev_translation = map_2mb->translation_2mb;
681 	}
682 
683 	*size = spdk_min(*size, cur_size);
684 	return orig_translation;
685 }
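
/*
 * Usage sketch (hypothetical; my_map and my_default_translation are
 * placeholders): look up a translation and learn how many bytes starting at
 * vaddr share a contiguous translation. Merging across 2MB entries only
 * happens when the map was created with an are_contiguous callback.
 *
 *   uint64_t size = 4 * VALUE_2MB;
 *   uint64_t translation = spdk_mem_map_translate(my_map, vaddr, &size);
 *
 *   if (translation != my_default_translation) {
 *           // the first 'size' bytes starting at vaddr translate contiguously
 *   }
 */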
686 
687 static void
688 memory_hotplug_cb(enum rte_mem_event event_type,
689 		  const void *addr, size_t len, void *arg)
690 {
691 	if (event_type == RTE_MEM_EVENT_ALLOC) {
692 		spdk_mem_register((void *)addr, len);
693 
694 		if (!spdk_env_dpdk_external_init()) {
695 			return;
696 		}
697 
698 		/* When the user initialized DPDK separately, we can't
699 		 * be sure that the --match-allocations RTE flag was specified.
700 		 * Without this flag, DPDK can free memory in different units
701 		 * than it was allocated in, which doesn't work with things like RDMA MRs.
702 		 *
703 		 * For such cases, we mark segments so they aren't freed.
704 		 */
705 		while (len > 0) {
706 			struct rte_memseg *seg;
707 
708 			seg = rte_mem_virt2memseg(addr, NULL);
709 			assert(seg != NULL);
710 			seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
711 			addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
712 			len -= seg->hugepage_sz;
713 		}
714 	} else if (event_type == RTE_MEM_EVENT_FREE) {
715 		spdk_mem_unregister((void *)addr, len);
716 	}
717 }
718 
719 static int
720 memory_iter_cb(const struct rte_memseg_list *msl,
721 	       const struct rte_memseg *ms, size_t len, void *arg)
722 {
723 	return spdk_mem_register(ms->addr, len);
724 }
725 
726 int
727 mem_map_init(bool legacy_mem)
728 {
729 	g_legacy_mem = legacy_mem;
730 
731 	g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
732 	if (g_mem_reg_map == NULL) {
733 		DEBUG_PRINT("memory registration map allocation failed\n");
734 		return -ENOMEM;
735 	}
736 
737 	/*
738 	 * Walk all DPDK memory segments and register them
739 	 * with the main memory map
740 	 */
741 	if (g_huge_pages) {
742 		rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
743 		rte_memseg_contig_walk(memory_iter_cb, NULL);
744 	}
745 	return 0;
746 }
747 
748 bool
749 spdk_iommu_is_enabled(void)
750 {
751 #if VFIO_ENABLED
752 	return g_vfio.enabled && !g_vfio.noiommu_enabled;
753 #else
754 	return false;
755 #endif
756 }
757 
758 struct spdk_vtophys_pci_device {
759 	struct rte_pci_device *pci_device;
760 	TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
761 };
762 
763 static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
764 static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
765 	TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);
766 
767 static struct spdk_mem_map *g_vtophys_map;
768 static struct spdk_mem_map *g_phys_ref_map;
769 static struct spdk_mem_map *g_numa_map;
770 
771 #if VFIO_ENABLED
772 static int
773 _vfio_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
774 {
775 	struct spdk_vfio_dma_map *dma_map;
776 	int ret;
777 
778 	dma_map = calloc(1, sizeof(*dma_map));
779 	if (dma_map == NULL) {
780 		return -ENOMEM;
781 	}
782 
783 	dma_map->map.argsz = sizeof(dma_map->map);
784 	dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
785 	dma_map->map.vaddr = vaddr;
786 	dma_map->map.iova = iova;
787 	dma_map->map.size = size;
788 
789 	if (g_vfio.device_ref == 0) {
790 		/* VFIO requires at least one device (IOMMU group) to be added to
791 		 * a VFIO container before it is possible to perform any IOMMU
792 		 * operations on that container. This memory will be mapped once
793 		 * the first device (IOMMU group) is hotplugged.
794 		 *
795 		 * Since the vfio container is managed internally by DPDK, it is
796 		 * also possible that some device is already in that container, but
797 		 * it's not managed by SPDK - e.g. a NIC attached internally
798 		 * inside DPDK. We could map the memory straight away in such a
799 		 * scenario, but there's no need to do it. DPDK devices clearly
800 		 * don't need our mappings and hence we defer the mapping
801 		 * unconditionally until the first SPDK-managed device is
802 		 * hotplugged.
803 		 */
804 		goto out_insert;
805 	}
806 
807 	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
808 	if (ret) {
809 		/* There are cases where the vfio container doesn't have an IOMMU group; it's safe to ignore the error in that case. */
810 		SPDK_NOTICELOG("Cannot set up DMA mapping, error %d, ignored\n", errno);
811 	}
812 
813 out_insert:
814 	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
815 	return 0;
816 }
817 
818 
819 static int
820 vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
821 {
822 	uint64_t refcount;
823 	int ret;
824 
825 	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
826 	assert(refcount < UINT64_MAX);
827 	if (refcount > 0) {
828 		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
829 		return 0;
830 	}
831 
832 	pthread_mutex_lock(&g_vfio.mutex);
833 	ret = _vfio_iommu_map_dma(vaddr, iova, size);
834 	pthread_mutex_unlock(&g_vfio.mutex);
835 	if (ret) {
836 		return ret;
837 	}
838 
839 	spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
840 	return 0;
841 }
842 
843 int
844 vtophys_iommu_map_dma_bar(uint64_t vaddr, uint64_t iova, uint64_t size)
845 {
846 	int ret;
847 
848 	pthread_mutex_lock(&g_vfio.mutex);
849 	ret = _vfio_iommu_map_dma(vaddr, iova, size);
850 	pthread_mutex_unlock(&g_vfio.mutex);
851 
852 	return ret;
853 }
854 
855 static int
856 _vfio_iommu_unmap_dma(struct spdk_vfio_dma_map *dma_map)
857 {
858 	struct vfio_iommu_type1_dma_unmap unmap = {};
859 	int ret;
860 
861 	if (g_vfio.device_ref == 0) {
862 		/* Memory is not mapped anymore, just remove its references */
863 		goto out_remove;
864 	}
865 
866 	unmap.argsz = sizeof(unmap);
867 	unmap.flags = 0;
868 	unmap.iova = dma_map->map.iova;
869 	unmap.size = dma_map->map.size;
870 	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
871 	if (ret) {
872 		SPDK_NOTICELOG("Cannot clear DMA mapping, error %d, ignored\n", errno);
873 	}
874 
875 out_remove:
876 	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
877 	free(dma_map);
878 	return 0;
879 }
880 
881 static int
882 vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
883 {
884 	struct spdk_vfio_dma_map *dma_map;
885 	uint64_t refcount;
886 	int ret;
887 
888 	pthread_mutex_lock(&g_vfio.mutex);
889 	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
890 		if (dma_map->map.iova == iova) {
891 			break;
892 		}
893 	}
894 
895 	if (dma_map == NULL) {
896 		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
897 		pthread_mutex_unlock(&g_vfio.mutex);
898 		return -ENXIO;
899 	}
900 
901 	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
902 	assert(refcount < UINT64_MAX);
903 	if (refcount > 0) {
904 		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount - 1);
905 	}
906 
907 	/* We still have outstanding references, don't clear the mapping. */
908 	if (refcount > 1) {
909 		pthread_mutex_unlock(&g_vfio.mutex);
910 		return 0;
911 	}
912 
913 	/* Partial or multi-page unmaps are not supported for now */
914 	assert(dma_map->map.size == size);
915 
916 	ret = _vfio_iommu_unmap_dma(dma_map);
917 	pthread_mutex_unlock(&g_vfio.mutex);
918 
919 	return ret;
920 }
921 
922 int
923 vtophys_iommu_unmap_dma_bar(uint64_t vaddr)
924 {
925 	struct spdk_vfio_dma_map *dma_map;
926 	int ret;
927 
928 	pthread_mutex_lock(&g_vfio.mutex);
929 	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
930 		if (dma_map->map.vaddr == vaddr) {
931 			break;
932 		}
933 	}
934 
935 	if (dma_map == NULL) {
936 		DEBUG_PRINT("Cannot clear DMA mapping for address %"PRIx64" - it's not mapped\n", vaddr);
937 		pthread_mutex_unlock(&g_vfio.mutex);
938 		return -ENXIO;
939 	}
940 
941 	ret = _vfio_iommu_unmap_dma(dma_map);
942 	pthread_mutex_unlock(&g_vfio.mutex);
943 	return ret;
944 }
945 #endif
946 
947 static uint64_t
948 vtophys_get_paddr_memseg(uint64_t vaddr)
949 {
950 	uintptr_t paddr;
951 	struct rte_memseg *seg;
952 
953 	seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
954 	if (seg != NULL) {
955 		paddr = seg->iova;
956 		if (paddr == RTE_BAD_IOVA) {
957 			return SPDK_VTOPHYS_ERROR;
958 		}
959 		paddr += (vaddr - (uintptr_t)seg->addr);
960 		return paddr;
961 	}
962 
963 	return SPDK_VTOPHYS_ERROR;
964 }
965 
966 /* Try to get the paddr from /proc/self/pagemap */
967 static uint64_t
968 vtophys_get_paddr_pagemap(uint64_t vaddr)
969 {
970 	uintptr_t paddr;
971 
972 	/* Silence static analyzers */
973 	assert(vaddr != 0);
974 	paddr = rte_mem_virt2iova((void *)vaddr);
975 	if (paddr == RTE_BAD_IOVA) {
976 		/*
977 		 * The vaddr may be valid but doesn't have a backing page
978 		 * assigned yet.  Touch the page to ensure a backing page
979 		 * gets assigned, then try to translate again.
980 		 */
981 		rte_atomic64_read((rte_atomic64_t *)vaddr);
982 		paddr = rte_mem_virt2iova((void *)vaddr);
983 	}
984 	if (paddr == RTE_BAD_IOVA) {
985 		/* Unable to get to the physical address. */
986 		return SPDK_VTOPHYS_ERROR;
987 	}
988 
989 	return paddr;
990 }
991 
992 static uint64_t
993 pci_device_vtophys(struct rte_pci_device *dev, uint64_t vaddr, size_t len)
994 {
995 	struct rte_mem_resource *res;
996 	uint64_t paddr;
997 	unsigned r;
998 
999 	for (r = 0; r < PCI_MAX_RESOURCE; r++) {
1000 		res = dpdk_pci_device_get_mem_resource(dev, r);
1001 
1002 		if (res->phys_addr == 0 || vaddr < (uint64_t)res->addr ||
1003 		    (vaddr + len) >= (uint64_t)res->addr + res->len) {
1004 			continue;
1005 		}
1006 
1007 #if VFIO_ENABLED
1008 		if (spdk_iommu_is_enabled() && rte_eal_iova_mode() == RTE_IOVA_VA) {
1009 			/*
1010 			 * The IOMMU is on and we're using IOVA == VA. The BAR was
1011 			 * automatically registered when it was mapped, so just return
1012 			 * the virtual address here.
1013 			 */
1014 			return vaddr;
1015 		}
1016 #endif
1017 		paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
1018 		return paddr;
1019 	}
1020 
1021 	return SPDK_VTOPHYS_ERROR;
1022 }
1023 
1024 /* Try to get the paddr from pci devices */
1025 static uint64_t
1026 vtophys_get_paddr_pci(uint64_t vaddr, size_t len)
1027 {
1028 	struct spdk_vtophys_pci_device *vtophys_dev;
1029 	uintptr_t paddr;
1030 	struct rte_pci_device	*dev;
1031 
1032 	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1033 	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
1034 		dev = vtophys_dev->pci_device;
1035 		paddr = pci_device_vtophys(dev, vaddr, len);
1036 		if (paddr != SPDK_VTOPHYS_ERROR) {
1037 			pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1038 			return paddr;
1039 		}
1040 	}
1041 	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1042 
1043 	return SPDK_VTOPHYS_ERROR;
1044 }
1045 
1046 static int
1047 vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
1048 	       enum spdk_mem_map_notify_action action,
1049 	       void *vaddr, size_t len)
1050 {
1051 	int rc = 0;
1052 	uint64_t paddr;
1053 
1054 	if ((uintptr_t)vaddr & ~MASK_256TB) {
1055 		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
1056 		return -EINVAL;
1057 	}
1058 
1059 	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
1060 		DEBUG_PRINT("invalid parameters, vaddr=%p len=%ju\n",
1061 			    vaddr, len);
1062 		return -EINVAL;
1063 	}
1064 
1065 	/* Get the physical address from the DPDK memsegs */
1066 	paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
1067 
1068 	switch (action) {
1069 	case SPDK_MEM_MAP_NOTIFY_REGISTER:
1070 		if (paddr == SPDK_VTOPHYS_ERROR) {
1071 			/* This is not an address that DPDK is managing. */
1072 
1073 			/* Check if this is a PCI BAR. They need special handling */
1074 			paddr = vtophys_get_paddr_pci((uint64_t)vaddr, len);
1075 			if (paddr != SPDK_VTOPHYS_ERROR) {
1076 				/* Get paddr for each 2MB chunk in this address range */
1077 				while (len > 0) {
1078 					paddr = vtophys_get_paddr_pci((uint64_t)vaddr, VALUE_2MB);
1079 					if (paddr == SPDK_VTOPHYS_ERROR) {
1080 						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1081 						return -EFAULT;
1082 					}
1083 
1084 					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1085 					if (rc != 0) {
1086 						return rc;
1087 					}
1088 
1089 					vaddr += VALUE_2MB;
1090 					len -= VALUE_2MB;
1091 				}
1092 
1093 				return 0;
1094 			}
1095 
1096 #if VFIO_ENABLED
1097 			enum rte_iova_mode iova_mode;
1098 
1099 			iova_mode = rte_eal_iova_mode();
1100 
1101 			if (spdk_iommu_is_enabled() && iova_mode == RTE_IOVA_VA) {
1102 				/* We'll use the virtual address as the iova to match DPDK. */
1103 				paddr = (uint64_t)vaddr;
1104 				rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len);
1105 				if (rc) {
1106 					return -EFAULT;
1107 				}
1108 				while (len > 0) {
1109 					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1110 					if (rc != 0) {
1111 						return rc;
1112 					}
1113 					vaddr += VALUE_2MB;
1114 					paddr += VALUE_2MB;
1115 					len -= VALUE_2MB;
1116 				}
1117 			} else
1118 #endif
1119 			{
1120 				/* Get the physical address from /proc/self/pagemap. */
1121 				paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
1122 				if (paddr == SPDK_VTOPHYS_ERROR) {
1123 					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1124 					return -EFAULT;
1125 				}
1126 
1127 				/* Get paddr for each 2MB chunk in this address range */
1128 				while (len > 0) {
1129 					/* Get the physical address from /proc/self/pagemap. */
1130 					paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
1131 
1132 					if (paddr == SPDK_VTOPHYS_ERROR) {
1133 						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1134 						return -EFAULT;
1135 					}
1136 
1137 					if (paddr & MASK_2MB) {
1138 						DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
1139 						return -EINVAL;
1140 					}
1141 #if VFIO_ENABLED
1142 					/* If the IOMMU is on, but DPDK is using iova-mode=pa, we want to register this memory
1143 					 * with the IOMMU using the physical address to match. */
1144 					if (spdk_iommu_is_enabled()) {
1145 						rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
1146 						if (rc) {
1147 							DEBUG_PRINT("Unable to assign vaddr %p to paddr 0x%" PRIx64 "\n", vaddr, paddr);
1148 							return -EFAULT;
1149 						}
1150 					}
1151 #endif
1152 
1153 					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1154 					if (rc != 0) {
1155 						return rc;
1156 					}
1157 
1158 					vaddr += VALUE_2MB;
1159 					len -= VALUE_2MB;
1160 				}
1161 			}
1162 		} else {
1163 			/* This is an address managed by DPDK. Just setup the translations. */
1164 			while (len > 0) {
1165 				paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
1166 				if (paddr == SPDK_VTOPHYS_ERROR) {
1167 					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1168 					return -EFAULT;
1169 				}
1170 
1171 				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1172 				if (rc != 0) {
1173 					return rc;
1174 				}
1175 
1176 				vaddr += VALUE_2MB;
1177 				len -= VALUE_2MB;
1178 			}
1179 		}
1180 
1181 		break;
1182 	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
1183 #if VFIO_ENABLED
1184 		if (paddr == SPDK_VTOPHYS_ERROR) {
1185 			/*
1186 			 * This is not an address that DPDK is managing.
1187 			 */
1188 
1189 			/* Check if this is a PCI BAR. They need special handling */
1190 			paddr = vtophys_get_paddr_pci((uint64_t)vaddr, len);
1191 			if (paddr != SPDK_VTOPHYS_ERROR) {
1192 				/* Get paddr for each 2MB chunk in this address range */
1193 				while (len > 0) {
1194 					paddr = vtophys_get_paddr_pci((uint64_t)vaddr, VALUE_2MB);
1195 					if (paddr == SPDK_VTOPHYS_ERROR) {
1196 						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1197 						return -EFAULT;
1198 					}
1199 
1200 					rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
1201 					if (rc != 0) {
1202 						return rc;
1203 					}
1204 
1205 					vaddr += VALUE_2MB;
1206 					len -= VALUE_2MB;
1207 				}
1208 
1209 				return 0;
1210 			}
1211 
1212 			/* If vfio is enabled,
1213 			 * we need to unmap the range from the IOMMU
1214 			 */
1215 			if (spdk_iommu_is_enabled()) {
1216 				uint64_t buffer_len = len;
1217 				uint8_t *va = vaddr;
1218 				enum rte_iova_mode iova_mode;
1219 
1220 				iova_mode = rte_eal_iova_mode();
1221 				/*
1222 				 * In virtual address mode, the region is contiguous and can be
1223 				 * unmapped with a single call.
1224 				 */
1225 				if (iova_mode == RTE_IOVA_VA) {
1226 					paddr = spdk_mem_map_translate(map, (uint64_t)va, &buffer_len);
1227 					if (buffer_len != len || paddr != (uintptr_t)va) {
1228 						DEBUG_PRINT("Unmapping %p with length %lu failed because "
1229 							    "translation had address 0x%" PRIx64 " and length %lu\n",
1230 							    va, len, paddr, buffer_len);
1231 						return -EINVAL;
1232 					}
1233 					rc = vtophys_iommu_unmap_dma(paddr, len);
1234 					if (rc) {
1235 						DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
1236 						return -EFAULT;
1237 					}
1238 				} else if (iova_mode == RTE_IOVA_PA) {
1239 					/* Get paddr for each 2MB chunk in this address range */
1240 					while (buffer_len > 0) {
1241 						paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);
1242 
1243 						if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
1244 							DEBUG_PRINT("could not get phys addr for %p\n", va);
1245 							return -EFAULT;
1246 						}
1247 
1248 						rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
1249 						if (rc) {
1250 							DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
1251 							return -EFAULT;
1252 						}
1253 
1254 						va += VALUE_2MB;
1255 						buffer_len -= VALUE_2MB;
1256 					}
1257 				}
1258 			}
1259 		}
1260 #endif
1261 		while (len > 0) {
1262 			rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
1263 			if (rc != 0) {
1264 				return rc;
1265 			}
1266 
1267 			vaddr += VALUE_2MB;
1268 			len -= VALUE_2MB;
1269 		}
1270 
1271 		break;
1272 	default:
1273 		SPDK_UNREACHABLE();
1274 	}
1275 
1276 	return rc;
1277 }
1278 
1279 static int
1280 numa_notify(void *cb_ctx, struct spdk_mem_map *map,
1281 	    enum spdk_mem_map_notify_action action,
1282 	    void *vaddr, size_t len)
1283 {
1284 	struct rte_memseg *seg;
1285 
1286 	/* We always return 0 from here, even if we aren't able to get a
1287 	 * memseg for the address. This can happen in non-DPDK memory
1288 	 * registration paths, for example vhost or vfio-user. That is OK,
1289 	 * spdk_mem_get_numa_id() just returns SPDK_ENV_NUMA_ID_ANY for
1290 	 * that kind of memory. If we return an error here, the
1291 	 * spdk_mem_register() from vhost or vfio-user would fail, which is
1292 	 * not what we want.
1293 	 */
1294 	seg = rte_mem_virt2memseg(vaddr, NULL);
1295 	if (seg == NULL) {
1296 		return 0;
1297 	}
1298 
1299 	switch (action) {
1300 	case SPDK_MEM_MAP_NOTIFY_REGISTER:
1301 		spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, seg->socket_id);
1302 		break;
1303 	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
1304 		spdk_mem_map_clear_translation(map, (uint64_t)vaddr, len);
1305 		break;
1306 	default:
1307 		break;
1308 	}
1309 
1310 	return 0;
1311 }
1312 
1313 static int
1314 vtophys_check_contiguous_entries(uint64_t paddr1, uint64_t paddr2)
1315 {
1316 	/* This function is always called with the paddrs of two consecutive
1317 	 * 2MB chunks in virtual address space, so those chunks are only
1318 	 * physically contiguous if the physical addresses are exactly 2MB
1319 	 * apart from each other as well.
1320 	 */
1321 	return (paddr2 - paddr1 == VALUE_2MB);
1322 }
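
/* For example, paddr1 = 0x100000000 and paddr2 = 0x100200000 are treated as
 * contiguous (exactly VALUE_2MB apart), whereas paddr2 = 0x100400000 is not.
 */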
1323 
1324 #if VFIO_ENABLED
1325 
1326 static bool
1327 vfio_enabled(void)
1328 {
1329 	return rte_vfio_is_enabled("vfio_pci");
1330 }
1331 
1332 /* Check if IOMMU is enabled on the system */
1333 static bool
1334 has_iommu_groups(void)
1335 {
1336 	int count = 0;
1337 	DIR *dir = opendir("/sys/kernel/iommu_groups");
1338 
1339 	if (dir == NULL) {
1340 		return false;
1341 	}
1342 
1343 	while (count < 3 && readdir(dir) != NULL) {
1344 		count++;
1345 	}
1346 
1347 	closedir(dir);
1348 	/* there will always be ./ and ../ entries */
1349 	return count > 2;
1350 }
1351 
1352 static bool
1353 vfio_noiommu_enabled(void)
1354 {
1355 	return rte_vfio_noiommu_is_enabled();
1356 }
1357 
1358 static void
1359 vtophys_iommu_init(void)
1360 {
1361 	char proc_fd_path[PATH_MAX + 1];
1362 	char link_path[PATH_MAX + 1];
1363 	const char vfio_path[] = "/dev/vfio/vfio";
1364 	DIR *dir;
1365 	struct dirent *d;
1366 
1367 	if (!vfio_enabled()) {
1368 		return;
1369 	}
1370 
1371 	if (vfio_noiommu_enabled()) {
1372 		g_vfio.noiommu_enabled = true;
1373 	} else if (!has_iommu_groups()) {
1374 		return;
1375 	}
1376 
1377 	dir = opendir("/proc/self/fd");
1378 	if (!dir) {
1379 		DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
1380 		return;
1381 	}
1382 
1383 	while ((d = readdir(dir)) != NULL) {
1384 		if (d->d_type != DT_LNK) {
1385 			continue;
1386 		}
1387 
1388 		snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
1389 		if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
1390 			continue;
1391 		}
1392 
1393 		if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
1394 			sscanf(d->d_name, "%d", &g_vfio.fd);
1395 			break;
1396 		}
1397 	}
1398 
1399 	closedir(dir);
1400 
1401 	if (g_vfio.fd < 0) {
1402 		DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
1403 		return;
1404 	}
1405 
1406 	g_vfio.enabled = true;
1407 
1408 	return;
1409 }
1410 
1411 #endif
1412 
1413 void
1414 vtophys_pci_device_added(struct rte_pci_device *pci_device)
1415 {
1416 	struct spdk_vtophys_pci_device *vtophys_dev;
1417 
1418 	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1419 
1420 	vtophys_dev = calloc(1, sizeof(*vtophys_dev));
1421 	if (vtophys_dev) {
1422 		vtophys_dev->pci_device = pci_device;
1423 		TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
1424 	} else {
1425 		DEBUG_PRINT("Memory allocation error\n");
1426 	}
1427 	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1428 
1429 #if VFIO_ENABLED
1430 	struct spdk_vfio_dma_map *dma_map;
1431 	int ret;
1432 
1433 	if (!g_vfio.enabled) {
1434 		return;
1435 	}
1436 
1437 	pthread_mutex_lock(&g_vfio.mutex);
1438 	g_vfio.device_ref++;
1439 	if (g_vfio.device_ref > 1) {
1440 		pthread_mutex_unlock(&g_vfio.mutex);
1441 		return;
1442 	}
1443 
1444 	/* This is the first SPDK device using DPDK vfio. This means that the first
1445 	 * IOMMU group might have just been added to the DPDK vfio container.
1446 	 * From this point on it is certain that the memory can be mapped.
1447 	 */
1448 	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
1449 		ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
1450 		if (ret) {
1451 			DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
1452 			break;
1453 		}
1454 	}
1455 	pthread_mutex_unlock(&g_vfio.mutex);
1456 #endif
1457 }
1458 
1459 void
1460 vtophys_pci_device_removed(struct rte_pci_device *pci_device)
1461 {
1462 	struct spdk_vtophys_pci_device *vtophys_dev;
1463 
1464 	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1465 	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
1466 		if (vtophys_dev->pci_device == pci_device) {
1467 			TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
1468 			free(vtophys_dev);
1469 			break;
1470 		}
1471 	}
1472 	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1473 
1474 #if VFIO_ENABLED
1475 	struct spdk_vfio_dma_map *dma_map;
1476 	int ret;
1477 
1478 	if (!g_vfio.enabled) {
1479 		return;
1480 	}
1481 
1482 	pthread_mutex_lock(&g_vfio.mutex);
1483 	assert(g_vfio.device_ref > 0);
1484 	g_vfio.device_ref--;
1485 	if (g_vfio.device_ref > 0) {
1486 		pthread_mutex_unlock(&g_vfio.mutex);
1487 		return;
1488 	}
1489 
1490 	/* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
1491 	 * any additional devices using its vfio container, all the mappings
1492 	 * will be automatically removed by the Linux vfio driver. We unmap
1493 	 * the memory manually to be able to easily re-map it later regardless
1494 	 * of other, external factors.
1495 	 */
1496 	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
1497 		struct vfio_iommu_type1_dma_unmap unmap = {};
1498 		unmap.argsz = sizeof(unmap);
1499 		unmap.flags = 0;
1500 		unmap.iova = dma_map->map.iova;
1501 		unmap.size = dma_map->map.size;
1502 		ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
1503 		if (ret) {
1504 			DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
1505 			break;
1506 		}
1507 	}
1508 	pthread_mutex_unlock(&g_vfio.mutex);
1509 #endif
1510 }
1511 
1512 int
1513 vtophys_init(void)
1514 {
1515 	const struct spdk_mem_map_ops vtophys_map_ops = {
1516 		.notify_cb = vtophys_notify,
1517 		.are_contiguous = vtophys_check_contiguous_entries,
1518 	};
1519 
1520 	const struct spdk_mem_map_ops phys_ref_map_ops = {
1521 		.notify_cb = NULL,
1522 		.are_contiguous = NULL,
1523 	};
1524 
1525 	const struct spdk_mem_map_ops numa_map_ops = {
1526 		.notify_cb = numa_notify,
1527 		.are_contiguous = NULL,
1528 	};
1529 
1530 #if VFIO_ENABLED
1531 	vtophys_iommu_init();
1532 #endif
1533 
1534 	g_phys_ref_map = spdk_mem_map_alloc(0, &phys_ref_map_ops, NULL);
1535 	if (g_phys_ref_map == NULL) {
1536 		DEBUG_PRINT("phys_ref map allocation failed.\n");
1537 		return -ENOMEM;
1538 	}
1539 
1540 	g_numa_map = spdk_mem_map_alloc(SPDK_ENV_NUMA_ID_ANY, &numa_map_ops, NULL);
1541 	if (g_numa_map == NULL) {
1542 		DEBUG_PRINT("numa map allocation failed.\n");
1543 		spdk_mem_map_free(&g_phys_ref_map);
1544 		return -ENOMEM;
1545 	}
1546 
1547 	if (g_huge_pages) {
1548 		g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
1549 		if (g_vtophys_map == NULL) {
1550 			DEBUG_PRINT("vtophys map allocation failed\n");
1551 			spdk_mem_map_free(&g_numa_map);
1552 			spdk_mem_map_free(&g_phys_ref_map);
1553 			return -ENOMEM;
1554 		}
1555 	}
1556 	return 0;
1557 }
1558 
1559 uint64_t
1560 spdk_vtophys(const void *buf, uint64_t *size)
1561 {
1562 	uint64_t vaddr, paddr_2mb;
1563 
1564 	if (!g_huge_pages) {
1565 		return SPDK_VTOPHYS_ERROR;
1566 	}
1567 
1568 	vaddr = (uint64_t)buf;
1569 	paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);
1570 
1571 	/*
1572 	 * SPDK_VTOPHYS_ERROR has all bits set. Bitwise-or'ing it with the offset within
1573 	 * the 2MB page would still yield SPDK_VTOPHYS_ERROR, but since we now add the
1574 	 * offset instead (PCI BAR translations may be unaligned), we must check the
1575 	 * return value before performing the addition.
1576 	 */
1577 	SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
1578 	if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
1579 		return SPDK_VTOPHYS_ERROR;
1580 	} else {
1581 		return paddr_2mb + (vaddr & MASK_2MB);
1582 	}
1583 }
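
/*
 * Usage sketch (hypothetical caller): translate a buffer address and find out
 * how much of it is physically contiguous.
 *
 *   uint64_t size = buf_len;
 *   uint64_t phys = spdk_vtophys(buf, &size);
 *
 *   if (phys == SPDK_VTOPHYS_ERROR) {
 *           // buf is not registered or has no valid translation
 *   } else if (size < buf_len) {
 *           // only the first 'size' bytes starting at buf are contiguous
 *   }
 */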
1584 
1585 int32_t
1586 spdk_mem_get_numa_id(const void *buf, uint64_t *size)
1587 {
1588 	return spdk_mem_map_translate(g_numa_map, (uint64_t)buf, size);
1589 }
1590 
1591 int
1592 spdk_mem_get_fd_and_offset(void *vaddr, uint64_t *offset)
1593 {
1594 	struct rte_memseg *seg;
1595 	int ret, fd;
1596 
1597 	seg = rte_mem_virt2memseg(vaddr, NULL);
1598 	if (!seg) {
1599 		SPDK_ERRLOG("memory %p doesn't exist\n", vaddr);
1600 		return -ENOENT;
1601 	}
1602 
1603 	fd = rte_memseg_get_fd_thread_unsafe(seg);
1604 	if (fd < 0) {
1605 		return fd;
1606 	}
1607 
1608 	ret = rte_memseg_get_fd_offset_thread_unsafe(seg, offset);
1609 	if (ret < 0) {
1610 		return ret;
1611 	}
1612 
1613 	return fd;
1614 }
1615 
1616 void
1617 mem_disable_huge_pages(void)
1618 {
1619 	g_huge_pages = false;
1620 }
1621