/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (C) 2017 Intel Corporation.
 *   All rights reserved.
 */

#include "spdk/stdinc.h"

#include "env_internal.h"
#include "pci_dpdk.h"

#include <rte_config.h>
#include <rte_memory.h>
#include <rte_eal_memconfig.h>
#include <rte_dev.h>
#include <rte_pci.h>

#include "spdk_internal/assert.h"

#include "spdk/assert.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/util.h"
#include "spdk/memory.h"
#include "spdk/env_dpdk.h"
#include "spdk/log.h"

#ifdef __linux__
#include <linux/version.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
#include <linux/vfio.h>
#include <rte_vfio.h>

struct spdk_vfio_dma_map {
	struct vfio_iommu_type1_dma_map map;
	TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
};

struct vfio_cfg {
	int fd;
	bool enabled;
	bool noiommu_enabled;
	unsigned device_ref;
	TAILQ_HEAD(, spdk_vfio_dma_map) maps;
	pthread_mutex_t mutex;
};

static struct vfio_cfg g_vfio = {
	.fd = -1,
	.enabled = false,
	.noiommu_enabled = false,
	.device_ref = 0,
	.maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
	.mutex = PTHREAD_MUTEX_INITIALIZER
};
#endif
#endif

#if DEBUG
#define DEBUG_PRINT(...) SPDK_ERRLOG(__VA_ARGS__)
#else
#define DEBUG_PRINT(...)
#endif

#define FN_2MB_TO_4KB(fn)	(fn << (SHIFT_2MB - SHIFT_4KB))
#define FN_4KB_TO_2MB(fn)	(fn >> (SHIFT_2MB - SHIFT_4KB))

#define MAP_256TB_IDX(vfn_2mb)	((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
#define MAP_1GB_IDX(vfn_2mb)	((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))
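
/* Worked example of the index split (illustrative; assumes SHIFT_2MB = 21,
 * SHIFT_1GB = 30 and SHIFT_256TB = 47 as defined in spdk/memory.h):
 *
 *	vaddr     = 0x7f0040200000
 *	vfn_2mb   = vaddr >> SHIFT_2MB      = 0x3f80201
 *	idx_256tb = MAP_256TB_IDX(vfn_2mb)  = vfn_2mb >> 9    = 0x1fc01
 *	idx_1gb   = MAP_1GB_IDX(vfn_2mb)    = vfn_2mb & 0x1ff = 0x1
 *
 * i.e. bits [30..47] of the address pick the top-level slot and bits
 * [21..29] pick the 2MB entry inside that slot's 1GB table.
 */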

/* Page is registered */
#define REG_MAP_REGISTERED	(1ULL << 62)

/* A notification region barrier. The 2MB translation entry that's marked
 * with this flag must be unregistered separately. This allows contiguous
 * regions to be unregistered in the same chunks they were registered.
 */
#define REG_MAP_NOTIFY_START	(1ULL << 63)

/* Translation of a single 2MB page. */
struct map_2mb {
	uint64_t translation_2mb;
};

/* Second-level map table indexed by bits [21..29] of the virtual address.
 * Each entry contains the address translation, or the map's default
 * translation for entries that haven't been set yet.
 */
struct map_1gb {
	struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
};

/* Top-level map table indexed by bits [30..47] of the virtual address.
 * Each entry points to a second-level map table or NULL.
 */
struct map_256tb {
	struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
};

/* Page-granularity memory address translation */
struct spdk_mem_map {
	struct map_256tb map_256tb;
	pthread_mutex_t mutex;
	uint64_t default_translation;
	struct spdk_mem_map_ops ops;
	void *cb_ctx;
	TAILQ_ENTRY(spdk_mem_map) tailq;
};

/* Registrations map. The 64 bit translations are bit fields with the
 * following layout (starting with the low bits):
 *    0 - 61 : reserved
 *   62 - 63 : flags
 */
static struct spdk_mem_map *g_mem_reg_map;
static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps =
	TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;

static bool g_legacy_mem;
static bool g_huge_pages = true;

/*
 * Walk the currently registered memory via the main memory registration map
 * and call the new map's notify callback for each virtually contiguous region.
 */
static int
mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
{
	size_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t contig_start = UINT64_MAX;
	uint64_t contig_end = UINT64_MAX;
	struct map_1gb *map_1gb;
	int rc;

	if (!g_mem_reg_map) {
		return -EINVAL;
	}

	/* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
	pthread_mutex_lock(&g_mem_reg_map->mutex);

	for (idx_256tb = 0;
	     idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
	     idx_256tb++) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_start != UINT64_MAX) {
				/* End of a virtually contiguous range */
				rc = map->ops.notify_cb(map->cb_ctx, map, action,
							(void *)contig_start,
							contig_end - contig_start + VALUE_2MB);
				/* Don't bother handling unregister failures. It can't be any worse */
				if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
					goto err_unregister;
				}
			}
			contig_start = UINT64_MAX;
			continue;
		}

		for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_start == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_start == UINT64_MAX) {
					contig_start = vaddr;
				}

				contig_end = vaddr;
			} else {
				if (contig_start != UINT64_MAX) {
					/* End of a virtually contiguous range */
					rc = map->ops.notify_cb(map->cb_ctx, map, action,
								(void *)contig_start,
								contig_end - contig_start + VALUE_2MB);
					/* Don't bother handling unregister failures. It can't be any worse */
					if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
						goto err_unregister;
					}

					/* This page might be a part of a neighbour region, so process
					 * it again. The idx_1gb will be incremented immediately.
					 */
					idx_1gb--;
				}
				contig_start = UINT64_MAX;
			}
		}
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return 0;

err_unregister:
	/* Unwind to the first empty translation so we don't unregister
	 * a region that just failed to register.
	 */
	idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
	idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
	contig_start = UINT64_MAX;
	contig_end = UINT64_MAX;

	/* Unregister any memory we managed to register before the failure */
	for (; idx_256tb < SIZE_MAX; idx_256tb--) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_end != UINT64_MAX) {
				/* End of a virtually contiguous range */
				map->ops.notify_cb(map->cb_ctx, map,
						   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
						   (void *)contig_start,
						   contig_end - contig_start + VALUE_2MB);
			}
			contig_end = UINT64_MAX;
			continue;
		}

		for (; idx_1gb < UINT64_MAX; idx_1gb--) {
			/* Rebuild the virtual address from the indexes */
			uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {

				if (contig_end == UINT64_MAX) {
					contig_end = vaddr;
				}
				contig_start = vaddr;
			} else {
				if (contig_end != UINT64_MAX) {
					if (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) {
						contig_start = vaddr;
					}
					/* End of a virtually contiguous range */
					map->ops.notify_cb(map->cb_ctx, map,
							   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
							   (void *)contig_start,
							   contig_end - contig_start + VALUE_2MB);
				}
				contig_end = UINT64_MAX;
			}
		}
		idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return rc;
}

struct spdk_mem_map *
spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
{
	struct spdk_mem_map *map;
	int rc;
	size_t i;

	map = calloc(1, sizeof(*map));
	if (map == NULL) {
		return NULL;
	}

	if (pthread_mutex_init(&map->mutex, NULL)) {
		free(map);
		return NULL;
	}

	map->default_translation = default_translation;
	map->cb_ctx = cb_ctx;
	if (ops) {
		map->ops = *ops;
	}

	if (ops && ops->notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		rc = mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			DEBUG_PRINT("Initial mem_map notify failed\n");
			pthread_mutex_destroy(&map->mutex);
			for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
				free(map->map_256tb.map[i]);
			}
			free(map);
			return NULL;
		}
		TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	return map;
}
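
/* Illustrative usage sketch (not part of this file): mirror registered
 * memory into a consumer map with a caller-defined notify callback. The
 * names my_notify and my_ops are hypothetical.
 *
 *	static int
 *	my_notify(void *cb_ctx, struct spdk_mem_map *map,
 *		  enum spdk_mem_map_notify_action action, void *vaddr, size_t len)
 *	{
 *		if (action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
 *			return spdk_mem_map_set_translation(map, (uint64_t)vaddr,
 *							    len, (uint64_t)vaddr);
 *		}
 *		return spdk_mem_map_clear_translation(map, (uint64_t)vaddr, len);
 *	}
 *
 *	const struct spdk_mem_map_ops my_ops = { .notify_cb = my_notify };
 *	struct spdk_mem_map *m = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &my_ops, NULL);
 *
 * Because of the mem_map_notify_walk() above, my_notify() sees every
 * already-registered region before the map is added to g_spdk_mem_maps.
 */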

void
spdk_mem_map_free(struct spdk_mem_map **pmap)
{
	struct spdk_mem_map *map;
	size_t i;

	if (!pmap) {
		return;
	}

	map = *pmap;

	if (!map) {
		return;
	}

	if (map->ops.notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
		TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
		free(map->map_256tb.map[i]);
	}

	pthread_mutex_destroy(&map->mutex);

	free(map);
	*pmap = NULL;
}

int
spdk_mem_register(void *_vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	uint64_t vaddr = (uintptr_t)_vaddr;
	uint64_t seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %jx\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%jx len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	seg_vaddr = vaddr;
	seg_len = 0;
	while (len > 0) {
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
					     seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
		seg_len += VALUE_2MB;
		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER,
					(void *)seg_vaddr, seg_len);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return rc;
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}
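
/* Illustrative sketch (hypothetical caller code): both the address and the
 * length passed to spdk_mem_register() must be 2MB aligned, so a buffer is
 * typically allocated with that granularity.
 *
 *	void *buf = NULL;
 *	size_t len = 4 * VALUE_2MB;
 *
 *	if (posix_memalign(&buf, VALUE_2MB, len) == 0 &&
 *	    spdk_mem_register(buf, len) == 0) {
 *		// buf is now visible to every map on g_spdk_mem_maps
 *	}
 *
 * Registering any of these 2MB pages a second time fails with -EBUSY,
 * since the pre-scan above finds REG_MAP_REGISTERED already set.
 */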

int
spdk_mem_unregister(void *_vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	uint64_t vaddr = (uintptr_t)_vaddr;
	uint64_t seg_vaddr;
	size_t seg_len;
	uint64_t reg, newreg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %jx\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%jx len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* The first page must be a start of a region. Also check if it's
	 * registered to make sure we don't return -ERANGE for non-registered
	 * regions.
	 */
	reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
	if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if ((reg & REG_MAP_REGISTERED) == 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EINVAL;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
	/* If the next page is registered, it must be a start of a region as well,
	 * otherwise we'd be unregistering only a part of a region.
	 */
	if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}
	seg_vaddr = vaddr;
	seg_len = 0;

	while (len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);

		if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
			TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
				rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER,
							(void *)seg_vaddr, seg_len);
				if (rc != 0) {
					pthread_mutex_unlock(&g_spdk_mem_map_mutex);
					return rc;
				}
			}

			seg_vaddr = vaddr;
			seg_len = VALUE_2MB;
		} else {
			seg_len += VALUE_2MB;
		}

		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	if (seg_len > 0) {
		TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
			rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER,
						(void *)seg_vaddr, seg_len);
			if (rc != 0) {
				pthread_mutex_unlock(&g_spdk_mem_map_mutex);
				return rc;
			}
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}
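
/* Illustrative sketch of the region-granularity rule enforced above:
 * memory must be unregistered in the same chunks it was registered,
 * because only the first page of a registration carries
 * REG_MAP_NOTIFY_START (buf as in the hypothetical example above):
 *
 *	spdk_mem_register(buf, 4 * VALUE_2MB);
 *	spdk_mem_unregister(buf, 2 * VALUE_2MB);                  // -ERANGE
 *	spdk_mem_unregister(buf + 2 * VALUE_2MB, 2 * VALUE_2MB);  // -ERANGE
 *	spdk_mem_unregister(buf, 4 * VALUE_2MB);                  // 0
 */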

int
spdk_mem_reserve(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* Check if any part of this range is already registered */
	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	/* Simply set the translation to the memory map's default. This allocates the space in the
	 * map but does not provide a valid translation. */
	spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, len,
				     g_mem_reg_map->default_translation);

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, map->default_translation);
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}
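
/* Illustrative sketch: spdk_mem_reserve() claims address space in every
 * map without providing a usable translation, e.g. for a range that will
 * be populated later (hypothetical example):
 *
 *	if (spdk_mem_reserve(vaddr, 8 * VALUE_2MB) == 0) {
 *		// each map now holds its default translation for the range
 *	}
 */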

static struct map_1gb *
mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
{
	struct map_1gb *map_1gb;
	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
	size_t i;

	if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
		return NULL;
	}

	map_1gb = map->map_256tb.map[idx_256tb];

	if (!map_1gb) {
		pthread_mutex_lock(&map->mutex);

		/* Recheck to make sure nobody else got the mutex first. */
		map_1gb = map->map_256tb.map[idx_256tb];
		if (!map_1gb) {
			map_1gb = malloc(sizeof(struct map_1gb));
			if (map_1gb) {
				/* initialize all entries to default translation */
				for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
					map_1gb->map[i].translation_2mb = map->default_translation;
				}
				map->map_256tb.map[idx_256tb] = map_1gb;
			}
		}

		pthread_mutex_unlock(&map->mutex);

		if (!map_1gb) {
			DEBUG_PRINT("allocation failed\n");
			return NULL;
		}
	}

	return map_1gb;
}
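
/* Note: the unlocked read above pairs with a re-check under map->mutex
 * (double-checked locking). The second-level table is fully initialized
 * to the default translation before its pointer is stored, so concurrent
 * lookups that see a non-NULL pointer are expected to find entries
 * already set to the default translation.
 */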

int
spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
			     uint64_t translation)
{
	uint64_t vfn_2mb;
	struct map_1gb *map_1gb;
	uint64_t idx_1gb;
	struct map_2mb *map_2mb;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %" PRIu64 "\n", vaddr);
		return -EINVAL;
	}

	/* For now, only 2 MB-aligned registrations are supported */
	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%" PRIu64 " len=%" PRIu64 "\n",
			    __func__, vaddr, size);
		return -EINVAL;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;

	while (size) {
		map_1gb = mem_map_get_map_1gb(map, vfn_2mb);
		if (!map_1gb) {
			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
			return -ENOMEM;
		}

		idx_1gb = MAP_1GB_IDX(vfn_2mb);
		map_2mb = &map_1gb->map[idx_1gb];
		map_2mb->translation_2mb = translation;

		size -= VALUE_2MB;
		vfn_2mb++;
	}

	return 0;
}

int
spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
{
	return spdk_mem_map_set_translation(map, vaddr, size, map->default_translation);
}

inline uint64_t
spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
{
	const struct map_1gb *map_1gb;
	const struct map_2mb *map_2mb;
	uint64_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t vfn_2mb;
	uint64_t cur_size;
	uint64_t prev_translation;
	uint64_t orig_translation;

	if (spdk_unlikely(vaddr & ~MASK_256TB)) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
		return map->default_translation;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;
	idx_256tb = MAP_256TB_IDX(vfn_2mb);
	idx_1gb = MAP_1GB_IDX(vfn_2mb);

	map_1gb = map->map_256tb.map[idx_256tb];
	if (spdk_unlikely(!map_1gb)) {
		return map->default_translation;
	}

	cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
	map_2mb = &map_1gb->map[idx_1gb];
	if (size == NULL || map->ops.are_contiguous == NULL ||
	    map_2mb->translation_2mb == map->default_translation) {
		if (size != NULL) {
			*size = spdk_min(*size, cur_size);
		}
		return map_2mb->translation_2mb;
	}

	orig_translation = map_2mb->translation_2mb;
	prev_translation = orig_translation;
	while (cur_size < *size) {
		vfn_2mb++;
		idx_256tb = MAP_256TB_IDX(vfn_2mb);
		idx_1gb = MAP_1GB_IDX(vfn_2mb);

		map_1gb = map->map_256tb.map[idx_256tb];
		if (spdk_unlikely(!map_1gb)) {
			break;
		}

		map_2mb = &map_1gb->map[idx_1gb];
		if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
			break;
		}

		cur_size += VALUE_2MB;
		prev_translation = map_2mb->translation_2mb;
	}

	*size = spdk_min(*size, cur_size);
	return orig_translation;
}
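
/* Illustrative sketch of the size out-parameter (hypothetical caller):
 *
 *	uint64_t size = 16 * VALUE_2MB;
 *	uint64_t tr = spdk_mem_map_translate(map, (uint64_t)vaddr, &size);
 *
 * On return, size is clamped to the number of bytes starting at vaddr for
 * which tr stays valid: at most the remainder of one 2MB page when
 * are_contiguous is NULL, possibly more when neighbouring translations
 * satisfy are_contiguous() and coalesce.
 */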

static void
memory_hotplug_cb(enum rte_mem_event event_type,
		  const void *addr, size_t len, void *arg)
{
	if (event_type == RTE_MEM_EVENT_ALLOC) {
		spdk_mem_register((void *)addr, len);

		if (!spdk_env_dpdk_external_init()) {
			return;
		}

		/* When the user initialized DPDK separately, we can't
		 * be sure that the --match-allocations RTE flag was specified.
		 * Without this flag, DPDK can free memory in different units
		 * than it was allocated in, which doesn't work with things like RDMA MRs.
		 *
		 * For such cases, we mark segments so they aren't freed.
		 */
		while (len > 0) {
			struct rte_memseg *seg;

			seg = rte_mem_virt2memseg(addr, NULL);
			assert(seg != NULL);
			seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
			addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
			len -= seg->hugepage_sz;
		}
	} else if (event_type == RTE_MEM_EVENT_FREE) {
		spdk_mem_unregister((void *)addr, len);
	}
}

static int
memory_iter_cb(const struct rte_memseg_list *msl,
	       const struct rte_memseg *ms, size_t len, void *arg)
{
	return spdk_mem_register(ms->addr, len);
}

int
mem_map_init(bool legacy_mem)
{
	g_legacy_mem = legacy_mem;

	g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
	if (g_mem_reg_map == NULL) {
		DEBUG_PRINT("memory registration map allocation failed\n");
		return -ENOMEM;
	}

	/*
	 * Walk all DPDK memory segments and register them
	 * with the main memory map
	 */
	if (g_huge_pages) {
		rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
		rte_memseg_contig_walk(memory_iter_cb, NULL);
	}
	return 0;
}

bool
spdk_iommu_is_enabled(void)
{
#if VFIO_ENABLED
	return g_vfio.enabled && !g_vfio.noiommu_enabled;
#else
	return false;
#endif
}

struct spdk_vtophys_pci_device {
	struct rte_pci_device *pci_device;
	TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
};

static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
	TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);

static struct spdk_mem_map *g_vtophys_map;
static struct spdk_mem_map *g_phys_ref_map;
static struct spdk_mem_map *g_numa_map;

#if VFIO_ENABLED
static int
_vfio_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	dma_map = calloc(1, sizeof(*dma_map));
	if (dma_map == NULL) {
		return -ENOMEM;
	}

	dma_map->map.argsz = sizeof(dma_map->map);
	dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	dma_map->map.vaddr = vaddr;
	dma_map->map.iova = iova;
	dma_map->map.size = size;

	if (g_vfio.device_ref == 0) {
		/* VFIO requires at least one device (IOMMU group) to be added to
		 * a VFIO container before it is possible to perform any IOMMU
		 * operations on that container. This memory will be mapped once
		 * the first device (IOMMU group) is hotplugged.
		 *
		 * Since the vfio container is managed internally by DPDK, it is
		 * also possible that some device is already in that container, but
		 * it's not managed by SPDK - e.g. a NIC attached internally
		 * inside DPDK. We could map the memory straight away in such a
		 * scenario, but there's no need to do it. DPDK devices clearly
		 * don't need our mappings and hence we defer the mapping
		 * unconditionally until the first SPDK-managed device is
		 * hotplugged.
		 */
		goto out_insert;
	}

	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
	if (ret) {
		/* In some cases the vfio container doesn't have an IOMMU group; it's safe to ignore the error then */
		SPDK_NOTICELOG("Cannot set up DMA mapping, error %d, ignored\n", errno);
	}

out_insert:
	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
	return 0;
}


static int
vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	uint64_t refcount;
	int ret;

	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
	assert(refcount < UINT64_MAX);
	if (refcount > 0) {
		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
		return 0;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	ret = _vfio_iommu_map_dma(vaddr, iova, size);
	pthread_mutex_unlock(&g_vfio.mutex);
	if (ret) {
		return ret;
	}

	spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
	return 0;
}
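
/* Illustrative note: the g_phys_ref_map refcount means repeated mappings
 * of the same iova issue only one VFIO_IOMMU_MAP_DMA ioctl. Hypothetical
 * sequence:
 *
 *	vtophys_iommu_map_dma(va, iova, VALUE_2MB);   // refcount 0 -> 1, map ioctl
 *	vtophys_iommu_map_dma(va, iova, VALUE_2MB);   // refcount 1 -> 2, no ioctl
 *	vtophys_iommu_unmap_dma(iova, VALUE_2MB);     // refcount 2 -> 1, no ioctl
 *	vtophys_iommu_unmap_dma(iova, VALUE_2MB);     // refcount 1 -> 0, unmap ioctl
 *
 * (vtophys_iommu_unmap_dma() is defined further below.)
 */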

int
vtophys_iommu_map_dma_bar(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	int ret;

	pthread_mutex_lock(&g_vfio.mutex);
	ret = _vfio_iommu_map_dma(vaddr, iova, size);
	pthread_mutex_unlock(&g_vfio.mutex);

	return ret;
}

static int
_vfio_iommu_unmap_dma(struct spdk_vfio_dma_map *dma_map)
{
	struct vfio_iommu_type1_dma_unmap unmap = {};
	int ret;

	if (g_vfio.device_ref == 0) {
		/* Memory is not mapped anymore, just remove its references */
		goto out_remove;
	}

	unmap.argsz = sizeof(unmap);
	unmap.flags = 0;
	unmap.iova = dma_map->map.iova;
	unmap.size = dma_map->map.size;
	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
	if (ret) {
		SPDK_NOTICELOG("Cannot clear DMA mapping, error %d, ignored\n", errno);
	}

out_remove:
	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
	free(dma_map);
	return 0;
}

static int
vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	uint64_t refcount;
	int ret;

	pthread_mutex_lock(&g_vfio.mutex);
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		if (dma_map->map.iova == iova) {
			break;
		}
	}

	if (dma_map == NULL) {
		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
		pthread_mutex_unlock(&g_vfio.mutex);
		return -ENXIO;
	}

	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
	assert(refcount < UINT64_MAX);
	if (refcount > 0) {
		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount - 1);
	}

	/* We still have outstanding references, don't clear it. */
	if (refcount > 1) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return 0;
	}

	/* We don't support partial or multiple-page unmaps for now */
914ba9853b9SJim Harris 	assert(dma_map->map.size == size);
915ba9853b9SJim Harris 
916a36bc251SBen Walker 	ret = _vfio_iommu_unmap_dma(dma_map);
917ba9853b9SJim Harris 	pthread_mutex_unlock(&g_vfio.mutex);
918a36bc251SBen Walker 
919a36bc251SBen Walker 	return ret;
920a36bc251SBen Walker }
921a36bc251SBen Walker 
922a36bc251SBen Walker int
923a36bc251SBen Walker vtophys_iommu_unmap_dma_bar(uint64_t vaddr)
924a36bc251SBen Walker {
925a36bc251SBen Walker 	struct spdk_vfio_dma_map *dma_map;
926a36bc251SBen Walker 	int ret;
927a36bc251SBen Walker 
928a36bc251SBen Walker 	pthread_mutex_lock(&g_vfio.mutex);
929a36bc251SBen Walker 	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
930a36bc251SBen Walker 		if (dma_map->map.vaddr == vaddr) {
931a36bc251SBen Walker 			break;
932a36bc251SBen Walker 		}
933a36bc251SBen Walker 	}
934a36bc251SBen Walker 
935a36bc251SBen Walker 	if (dma_map == NULL) {
936a36bc251SBen Walker 		DEBUG_PRINT("Cannot clear DMA mapping for address %"PRIx64" - it's not mapped\n", vaddr);
937a36bc251SBen Walker 		pthread_mutex_unlock(&g_vfio.mutex);
938a36bc251SBen Walker 		return -ENXIO;
939a36bc251SBen Walker 	}
940a36bc251SBen Walker 
941a36bc251SBen Walker 	ret = _vfio_iommu_unmap_dma(dma_map);
942a36bc251SBen Walker 	pthread_mutex_unlock(&g_vfio.mutex);
943a36bc251SBen Walker 	return ret;
944ba9853b9SJim Harris }
945ba9853b9SJim Harris #endif
946ba9853b9SJim Harris 
947f74d069eSJim Harris static uint64_t
948f74d069eSJim Harris vtophys_get_paddr_memseg(uint64_t vaddr)
949f74d069eSJim Harris {
950f74d069eSJim Harris 	uintptr_t paddr;
951f74d069eSJim Harris 	struct rte_memseg *seg;
952f74d069eSJim Harris 
953f74d069eSJim Harris 	seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
954f74d069eSJim Harris 	if (seg != NULL) {
955ab856faaSTomasz Zawadzki 		paddr = seg->iova;
956f74d069eSJim Harris 		if (paddr == RTE_BAD_IOVA) {
957f74d069eSJim Harris 			return SPDK_VTOPHYS_ERROR;
958f74d069eSJim Harris 		}
959f74d069eSJim Harris 		paddr += (vaddr - (uintptr_t)seg->addr);
960f74d069eSJim Harris 		return paddr;
961f74d069eSJim Harris 	}
962f74d069eSJim Harris 
963f74d069eSJim Harris 	return SPDK_VTOPHYS_ERROR;
964f74d069eSJim Harris }
965f74d069eSJim Harris 
966f74d069eSJim Harris /* Try to get the paddr from /proc/self/pagemap */
967f74d069eSJim Harris static uint64_t
968f74d069eSJim Harris vtophys_get_paddr_pagemap(uint64_t vaddr)
969f74d069eSJim Harris {
970f74d069eSJim Harris 	uintptr_t paddr;
971f74d069eSJim Harris 
972a7ff5ff5SDarek Stojaczyk 	/* Silence static analyzers */
973a7ff5ff5SDarek Stojaczyk 	assert(vaddr != 0);
9746f48bf7cSDarek Stojaczyk 	paddr = rte_mem_virt2iova((void *)vaddr);
9756f48bf7cSDarek Stojaczyk 	if (paddr == RTE_BAD_IOVA) {
976f74d069eSJim Harris 		/*
977f74d069eSJim Harris 		 * The vaddr may be valid but doesn't have a backing page
978f74d069eSJim Harris 		 * assigned yet.  Touch the page to ensure a backing page
979f74d069eSJim Harris 		 * gets assigned, then try to translate again.
980f74d069eSJim Harris 		 */
981f74d069eSJim Harris 		rte_atomic64_read((rte_atomic64_t *)vaddr);
9826f48bf7cSDarek Stojaczyk 		paddr = rte_mem_virt2iova((void *)vaddr);
983f74d069eSJim Harris 	}
9846f48bf7cSDarek Stojaczyk 	if (paddr == RTE_BAD_IOVA) {
985f74d069eSJim Harris 		/* Unable to get to the physical address. */
986f74d069eSJim Harris 		return SPDK_VTOPHYS_ERROR;
987f74d069eSJim Harris 	}
988f74d069eSJim Harris 
989f74d069eSJim Harris 	return paddr;
990f74d069eSJim Harris }
991f74d069eSJim Harris 
992c7f50109SJim Harris static uint64_t
993c7f50109SJim Harris pci_device_vtophys(struct rte_pci_device *dev, uint64_t vaddr, size_t len)
994c7f50109SJim Harris {
995c7f50109SJim Harris 	struct rte_mem_resource *res;
996c7f50109SJim Harris 	uint64_t paddr;
997c7f50109SJim Harris 	unsigned r;
998c7f50109SJim Harris 
999c7f50109SJim Harris 	for (r = 0; r < PCI_MAX_RESOURCE; r++) {
1000c7f50109SJim Harris 		res = dpdk_pci_device_get_mem_resource(dev, r);
1001c7f50109SJim Harris 
1002c7f50109SJim Harris 		if (res->phys_addr == 0 || vaddr < (uint64_t)res->addr ||
1003c7f50109SJim Harris 		    (vaddr + len) >= (uint64_t)res->addr + res->len) {
1004c7f50109SJim Harris 			continue;
1005c7f50109SJim Harris 		}
1006c7f50109SJim Harris 
1007c7f50109SJim Harris #if VFIO_ENABLED
1008c7f50109SJim Harris 		if (spdk_iommu_is_enabled() && rte_eal_iova_mode() == RTE_IOVA_VA) {
1009c7f50109SJim Harris 			/*
1010c7f50109SJim Harris 			 * The IOMMU is on and we're using IOVA == VA. The BAR was
1011c7f50109SJim Harris 			 * automatically registered when it was mapped, so just return
1012c7f50109SJim Harris 			 * the virtual address here.
1013c7f50109SJim Harris 			 */
1014c7f50109SJim Harris 			return vaddr;
1015c7f50109SJim Harris 		}
1016c7f50109SJim Harris #endif
1017c7f50109SJim Harris 		paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
1018c7f50109SJim Harris 		return paddr;
1019c7f50109SJim Harris 	}
1020c7f50109SJim Harris 
1021c7f50109SJim Harris 	return SPDK_VTOPHYS_ERROR;
1022c7f50109SJim Harris }
1023c7f50109SJim Harris 
1024f74d069eSJim Harris /* Try to get the paddr from pci devices */
1025f74d069eSJim Harris static uint64_t
1026a36bc251SBen Walker vtophys_get_paddr_pci(uint64_t vaddr, size_t len)
1027f74d069eSJim Harris {
1028f74d069eSJim Harris 	struct spdk_vtophys_pci_device *vtophys_dev;
1029f74d069eSJim Harris 	uintptr_t paddr;
1030f74d069eSJim Harris 	struct rte_pci_device	*dev;
1031f74d069eSJim Harris 
1032f74d069eSJim Harris 	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1033f74d069eSJim Harris 	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
1034f74d069eSJim Harris 		dev = vtophys_dev->pci_device;
1035c7f50109SJim Harris 		paddr = pci_device_vtophys(dev, vaddr, len);
10362bb7185fSJim Harris 		if (paddr != SPDK_VTOPHYS_ERROR) {
1037f74d069eSJim Harris 			pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1038f74d069eSJim Harris 			return paddr;
1039f74d069eSJim Harris 		}
1040f74d069eSJim Harris 	}
1041f74d069eSJim Harris 	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1042f74d069eSJim Harris 
1043f74d069eSJim Harris 	return SPDK_VTOPHYS_ERROR;
1044f74d069eSJim Harris }
1045f74d069eSJim Harris 
1046f74d069eSJim Harris static int
10473456377bSSeth Howell vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
1048f74d069eSJim Harris 	       enum spdk_mem_map_notify_action action,
1049f74d069eSJim Harris 	       void *vaddr, size_t len)
1050f74d069eSJim Harris {
1051a36bc251SBen Walker 	int rc = 0;
1052f74d069eSJim Harris 	uint64_t paddr;
1053f74d069eSJim Harris 
1054f74d069eSJim Harris 	if ((uintptr_t)vaddr & ~MASK_256TB) {
1055f74d069eSJim Harris 		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
1056f74d069eSJim Harris 		return -EINVAL;
1057f74d069eSJim Harris 	}
1058f74d069eSJim Harris 
1059f74d069eSJim Harris 	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
10608a252783SSeth Howell 		DEBUG_PRINT("invalid parameters, vaddr=%p len=%ju\n",
10618a252783SSeth Howell 			    vaddr, len);
1062f74d069eSJim Harris 		return -EINVAL;
1063f74d069eSJim Harris 	}
1064f74d069eSJim Harris 
1065f74d069eSJim Harris 	/* Get the physical address from the DPDK memsegs */
1066f74d069eSJim Harris 	paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
1067f74d069eSJim Harris 
1068f74d069eSJim Harris 	switch (action) {
1069f74d069eSJim Harris 	case SPDK_MEM_MAP_NOTIFY_REGISTER:
1070f74d069eSJim Harris 		if (paddr == SPDK_VTOPHYS_ERROR) {
1071ba9853b9SJim Harris 			/* This is not an address that DPDK is managing. */
1072a36bc251SBen Walker 
1073a36bc251SBen Walker 			/* Check if this is a PCI BAR. They need special handling */
1074a36bc251SBen Walker 			paddr = vtophys_get_paddr_pci((uint64_t)vaddr, len);
1075a36bc251SBen Walker 			if (paddr != SPDK_VTOPHYS_ERROR) {
1076a36bc251SBen Walker 				/* Get paddr for each 2MB chunk in this address range */
1077a36bc251SBen Walker 				while (len > 0) {
1078a36bc251SBen Walker 					paddr = vtophys_get_paddr_pci((uint64_t)vaddr, VALUE_2MB);
1079a36bc251SBen Walker 					if (paddr == SPDK_VTOPHYS_ERROR) {
1080a36bc251SBen Walker 						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1081a36bc251SBen Walker 						return -EFAULT;
1082a36bc251SBen Walker 					}
1083a36bc251SBen Walker 
1084a36bc251SBen Walker 					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1085a36bc251SBen Walker 					if (rc != 0) {
1086a36bc251SBen Walker 						return rc;
1087a36bc251SBen Walker 					}
1088a36bc251SBen Walker 
1089a36bc251SBen Walker 					vaddr += VALUE_2MB;
1090a36bc251SBen Walker 					len -= VALUE_2MB;
1091a36bc251SBen Walker 				}
1092a36bc251SBen Walker 
1093a36bc251SBen Walker 				return 0;
1094a36bc251SBen Walker 			}
1095a36bc251SBen Walker 
10963456377bSSeth Howell #if VFIO_ENABLED
1097dd7cd80cSBen Walker 			enum rte_iova_mode iova_mode;
1098dd7cd80cSBen Walker 
1099dd7cd80cSBen Walker 			iova_mode = rte_eal_iova_mode();
1100dd7cd80cSBen Walker 
1101dd7cd80cSBen Walker 			if (spdk_iommu_is_enabled() && iova_mode == RTE_IOVA_VA) {
1102dd7cd80cSBen Walker 				/* We'll use the virtual address as the iova to match DPDK. */
1103f74d069eSJim Harris 				paddr = (uint64_t)vaddr;
1104ba9853b9SJim Harris 				rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len);
1105ba9853b9SJim Harris 				if (rc) {
1106ba9853b9SJim Harris 					return -EFAULT;
1107f74d069eSJim Harris 				}
1108ba9853b9SJim Harris 				while (len > 0) {
1109ba9853b9SJim Harris 					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1110aaac4888SMaciej Szwed 					if (rc != 0) {
1111ba9853b9SJim Harris 						return rc;
1112aaac4888SMaciej Szwed 					}
1113aaac4888SMaciej Szwed 					vaddr += VALUE_2MB;
1114ba9853b9SJim Harris 					paddr += VALUE_2MB;
1115ba9853b9SJim Harris 					len -= VALUE_2MB;
1116aaac4888SMaciej Szwed 				}
1117f74d069eSJim Harris 			} else
1118f74d069eSJim Harris #endif
1119f74d069eSJim Harris 			{
1120f74d069eSJim Harris 				/* Get the physical address from /proc/self/pagemap. */
1121f74d069eSJim Harris 				paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
1122f74d069eSJim Harris 				if (paddr == SPDK_VTOPHYS_ERROR) {
1123f74d069eSJim Harris 					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1124ba9853b9SJim Harris 					return -EFAULT;
1125f74d069eSJim Harris 				}
1126be04cfc3SDarek Stojaczyk 
1127be04cfc3SDarek Stojaczyk 				/* Get paddr for each 2MB chunk in this address range */
1128be04cfc3SDarek Stojaczyk 				while (len > 0) {
1129be04cfc3SDarek Stojaczyk 					/* Get the physical address from /proc/self/pagemap. */
1130be04cfc3SDarek Stojaczyk 					paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
1131be04cfc3SDarek Stojaczyk 
1132be04cfc3SDarek Stojaczyk 					if (paddr == SPDK_VTOPHYS_ERROR) {
1133be04cfc3SDarek Stojaczyk 						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1134ba9853b9SJim Harris 						return -EFAULT;
1135f74d069eSJim Harris 					}
1136be04cfc3SDarek Stojaczyk 
1137a36bc251SBen Walker 					if (paddr & MASK_2MB) {
1138f74d069eSJim Harris 						DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
1139ba9853b9SJim Harris 						return -EINVAL;
1140f74d069eSJim Harris 					}
11413456377bSSeth Howell #if VFIO_ENABLED
1142dd7cd80cSBen Walker 					/* If the IOMMU is on but DPDK is using iova-mode=pa, register this
1143dd7cd80cSBen Walker 					 * memory with the IOMMU using the physical address as the iova to match DPDK. */
1144dd7cd80cSBen Walker 					if (spdk_iommu_is_enabled()) {
1145ba9853b9SJim Harris 						rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
1146ba9853b9SJim Harris 						if (rc) {
1147ba9853b9SJim Harris 							DEBUG_PRINT("Unable to assign vaddr %p to paddr 0x%" PRIx64 "\n", vaddr, paddr);
1148ba9853b9SJim Harris 							return -EFAULT;
1149dd7cd80cSBen Walker 						}
1150dd7cd80cSBen Walker 					}
1151dd7cd80cSBen Walker #endif
1152f74d069eSJim Harris 
1153f74d069eSJim Harris 					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1154be04cfc3SDarek Stojaczyk 					if (rc != 0) {
1155ba9853b9SJim Harris 						return rc;
1156be04cfc3SDarek Stojaczyk 					}
1157be04cfc3SDarek Stojaczyk 
1158be04cfc3SDarek Stojaczyk 					vaddr += VALUE_2MB;
1159be04cfc3SDarek Stojaczyk 					len -= VALUE_2MB;
1160be04cfc3SDarek Stojaczyk 				}
1161be04cfc3SDarek Stojaczyk 			}
1162be04cfc3SDarek Stojaczyk 		} else {
1163be04cfc3SDarek Stojaczyk 			/* This is an address managed by DPDK. Just setup the translations. */
1164be04cfc3SDarek Stojaczyk 			while (len > 0) {
1165be04cfc3SDarek Stojaczyk 				paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
1166be04cfc3SDarek Stojaczyk 				if (paddr == SPDK_VTOPHYS_ERROR) {
1167be04cfc3SDarek Stojaczyk 					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1168be04cfc3SDarek Stojaczyk 					return -EFAULT;
1169be04cfc3SDarek Stojaczyk 				}
1170be04cfc3SDarek Stojaczyk 
1171be04cfc3SDarek Stojaczyk 				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1172be04cfc3SDarek Stojaczyk 				if (rc != 0) {
1173be04cfc3SDarek Stojaczyk 					return rc;
1174be04cfc3SDarek Stojaczyk 				}
1175be04cfc3SDarek Stojaczyk 
1176be04cfc3SDarek Stojaczyk 				vaddr += VALUE_2MB;
1177be04cfc3SDarek Stojaczyk 				len -= VALUE_2MB;
1178be04cfc3SDarek Stojaczyk 			}
1179be04cfc3SDarek Stojaczyk 		}
1180be04cfc3SDarek Stojaczyk 
1181f74d069eSJim Harris 		break;
1182f74d069eSJim Harris 	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
11833456377bSSeth Howell #if VFIO_ENABLED
1184ba9853b9SJim Harris 		if (paddr == SPDK_VTOPHYS_ERROR) {
1185ba9853b9SJim Harris 			/*
1186a36bc251SBen Walker 			 * This is not an address that DPDK is managing.
1187a36bc251SBen Walker 			 */
1188a36bc251SBen Walker 
1189a36bc251SBen Walker 			/* Check if this is a PCI BAR. They need special handling */
1190a36bc251SBen Walker 			paddr = vtophys_get_paddr_pci((uint64_t)vaddr, len);
1191a36bc251SBen Walker 			if (paddr != SPDK_VTOPHYS_ERROR) {
1192a36bc251SBen Walker 				/* Get paddr for each 2MB chunk in this address range */
1193a36bc251SBen Walker 				while (len > 0) {
1194a36bc251SBen Walker 					paddr = vtophys_get_paddr_pci((uint64_t)vaddr, VALUE_2MB);
1195a36bc251SBen Walker 					if (paddr == SPDK_VTOPHYS_ERROR) {
1196a36bc251SBen Walker 						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1197a36bc251SBen Walker 						return -EFAULT;
1198a36bc251SBen Walker 					}
1199a36bc251SBen Walker 
1200a36bc251SBen Walker 					rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
1201a36bc251SBen Walker 					if (rc != 0) {
1202a36bc251SBen Walker 						return rc;
1203a36bc251SBen Walker 					}
1204a36bc251SBen Walker 
1205a36bc251SBen Walker 					vaddr += VALUE_2MB;
1206a36bc251SBen Walker 					len -= VALUE_2MB;
1207a36bc251SBen Walker 				}
1208a36bc251SBen Walker 
1209a36bc251SBen Walker 				return 0;
1210a36bc251SBen Walker 			}
1211a36bc251SBen Walker 
1212a36bc251SBen Walker 			/* If vfio is enabled, we need to unmap
1213ba9853b9SJim Harris 			 * the range from the IOMMU.
1214ba9853b9SJim Harris 			 */
121588179a65SSeth Howell 			if (spdk_iommu_is_enabled()) {
1216be04cfc3SDarek Stojaczyk 				uint64_t buffer_len = len;
1217dd7cd80cSBen Walker 				uint8_t *va = vaddr;
1218dd7cd80cSBen Walker 				enum rte_iova_mode iova_mode;
1219dd7cd80cSBen Walker 
1220dd7cd80cSBen Walker 				iova_mode = rte_eal_iova_mode();
1221dd7cd80cSBen Walker 				/*
1222dd7cd80cSBen Walker 				 * In virtual address mode, the IOVA range is contiguous, so it can
1223dd7cd80cSBen Walker 				 * be removed with a single unmap.
1224dd7cd80cSBen Walker 				 */
1225dd7cd80cSBen Walker 				if (iova_mode == RTE_IOVA_VA) {
1226dd7cd80cSBen Walker 					paddr = spdk_mem_map_translate(map, (uint64_t)va, &buffer_len);
1227dd7cd80cSBen Walker 					if (buffer_len != len || paddr != (uintptr_t)va) {
1228dd7cd80cSBen Walker 						DEBUG_PRINT("Unmapping %p with length %lu failed because "
1229dd7cd80cSBen Walker 							    "translation had address 0x%" PRIx64 " and length %lu\n",
1230dd7cd80cSBen Walker 							    va, len, paddr, buffer_len);
1231f74d069eSJim Harris 						return -EINVAL;
1232f74d069eSJim Harris 					}
1233ba9853b9SJim Harris 					rc = vtophys_iommu_unmap_dma(paddr, len);
1234ba9853b9SJim Harris 					if (rc) {
1235ba9853b9SJim Harris 						DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
1236ba9853b9SJim Harris 						return -EFAULT;
1237f74d069eSJim Harris 					}
1238dd7cd80cSBen Walker 				} else if (iova_mode == RTE_IOVA_PA) {
1239dd7cd80cSBen Walker 					/* Get paddr for each 2MB chunk in this address range */
1240ba9853b9SJim Harris 					while (buffer_len > 0) {
1241dd7cd80cSBen Walker 						paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);
1242dd7cd80cSBen Walker 
1243dd7cd80cSBen Walker 						if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
1244dd7cd80cSBen Walker 							DEBUG_PRINT("could not get phys addr for %p\n", va);
1245dd7cd80cSBen Walker 							return -EFAULT;
1246dd7cd80cSBen Walker 						}
1247dd7cd80cSBen Walker 
1248ba9853b9SJim Harris 						rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
1249ba9853b9SJim Harris 						if (rc) {
1250ba9853b9SJim Harris 							DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
1251ba9853b9SJim Harris 							return -EFAULT;
1252dd7cd80cSBen Walker 						}
1253dd7cd80cSBen Walker 
1254ba9853b9SJim Harris 						va += VALUE_2MB;
1255ba9853b9SJim Harris 						buffer_len -= VALUE_2MB;
1256dd7cd80cSBen Walker 					}
1257dd7cd80cSBen Walker 				}
1258f74d069eSJim Harris 			}
1259f74d069eSJim Harris 		}
1260f74d069eSJim Harris #endif
1261be04cfc3SDarek Stojaczyk 		while (len > 0) {
1262f74d069eSJim Harris 			rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
1263f74d069eSJim Harris 			if (rc != 0) {
1264f74d069eSJim Harris 				return rc;
1265f74d069eSJim Harris 			}
1266be04cfc3SDarek Stojaczyk 
1267f74d069eSJim Harris 			vaddr += VALUE_2MB;
1268f74d069eSJim Harris 			len -= VALUE_2MB;
1269f74d069eSJim Harris 		}
1270f74d069eSJim Harris 
1271be04cfc3SDarek Stojaczyk 		break;
1272be04cfc3SDarek Stojaczyk 	default:
1273be04cfc3SDarek Stojaczyk 		SPDK_UNREACHABLE();
1274be04cfc3SDarek Stojaczyk 	}
1275be04cfc3SDarek Stojaczyk 
1276f74d069eSJim Harris 	return rc;
1277f74d069eSJim Harris }
1278f74d069eSJim Harris 
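/*
 * Editor's sketch (not part of the upstream source, compiled out): how the
 * REGISTER path above is typically exercised. Registering hugepage-backed
 * memory that DPDK does not manage makes vtophys_notify() resolve a physical
 * address for every 2MB chunk (via PCI BARs, the IOMMU, or pagemap) and store
 * the translations. Assumes 2MB hugepages are configured; error handling is
 * abbreviated.
 */
#if 0
static int
example_register_external_buffer(void)
{
	size_t len = 4 * VALUE_2MB;
	uint64_t size, paddr;
	void *buf;

	/* Anonymous, hugepage-backed mapping; 2MB-aligned by construction. */
	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (buf == MAP_FAILED) {
		return -ENOMEM;
	}

	/* Triggers SPDK_MEM_MAP_NOTIFY_REGISTER on every registered mem map. */
	if (spdk_mem_register(buf, len) != 0) {
		munmap(buf, len);
		return -EFAULT;
	}

	size = len;
	paddr = spdk_vtophys(buf, &size);
	if (paddr == SPDK_VTOPHYS_ERROR) {
		/* should not happen once the range is registered */
	}

	spdk_mem_unregister(buf, len);
	munmap(buf, len);
	return 0;
}
#endif
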
127943f4e393SDarek Stojaczyk static int
1280*40c9acf6SJim Harris numa_notify(void *cb_ctx, struct spdk_mem_map *map,
1281*40c9acf6SJim Harris 	    enum spdk_mem_map_notify_action action,
1282*40c9acf6SJim Harris 	    void *vaddr, size_t len)
1283*40c9acf6SJim Harris {
1284*40c9acf6SJim Harris 	struct rte_memseg *seg;
1285*40c9acf6SJim Harris 
1286*40c9acf6SJim Harris 	/* We always return 0 from here, even if we aren't able to get a
1287*40c9acf6SJim Harris 	 * memseg for the address. This can happen in non-DPDK memory
1288*40c9acf6SJim Harris 	 * registration paths, for example vhost or vfio-user. That is OK:
1289*40c9acf6SJim Harris 	 * spdk_mem_get_numa_id() just returns SPDK_ENV_NUMA_ID_ANY for
1290*40c9acf6SJim Harris 	 * that kind of memory. If we returned an error here, the
1291*40c9acf6SJim Harris 	 * spdk_mem_register() from vhost or vfio-user would fail, which is
1292*40c9acf6SJim Harris 	 * not what we want.
1293*40c9acf6SJim Harris 	 */
1294*40c9acf6SJim Harris 	seg = rte_mem_virt2memseg(vaddr, NULL);
1295*40c9acf6SJim Harris 	if (seg == NULL) {
1296*40c9acf6SJim Harris 		return 0;
1297*40c9acf6SJim Harris 	}
1298*40c9acf6SJim Harris 
1299*40c9acf6SJim Harris 	switch (action) {
1300*40c9acf6SJim Harris 	case SPDK_MEM_MAP_NOTIFY_REGISTER:
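		/* The "translation" stored for this range is the memseg's socket id itself. */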
1301*40c9acf6SJim Harris 		spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, seg->socket_id);
1302*40c9acf6SJim Harris 		break;
1303*40c9acf6SJim Harris 	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
1304*40c9acf6SJim Harris 		spdk_mem_map_clear_translation(map, (uint64_t)vaddr, len);
1305*40c9acf6SJim Harris 		break;
1306*40c9acf6SJim Harris 	default:
1307*40c9acf6SJim Harris 		break;
1308*40c9acf6SJim Harris 	}
1309*40c9acf6SJim Harris 
1310*40c9acf6SJim Harris 	return 0;
1311*40c9acf6SJim Harris }
1312*40c9acf6SJim Harris 
1313*40c9acf6SJim Harris static int
131443f4e393SDarek Stojaczyk vtophys_check_contiguous_entries(uint64_t paddr1, uint64_t paddr2)
131543f4e393SDarek Stojaczyk {
131643f4e393SDarek Stojaczyk 	/* This function is always called with paddrs for two consecutive
131743f4e393SDarek Stojaczyk 	 * 2MB chunks in virtual address space, so those chunks are only
131843f4e393SDarek Stojaczyk 	 * physically contiguous if their physical addresses are exactly
131943f4e393SDarek Stojaczyk 	 * 2MB apart as well.
132043f4e393SDarek Stojaczyk 	 */
132143f4e393SDarek Stojaczyk 	return (paddr2 - paddr1 == VALUE_2MB);
132243f4e393SDarek Stojaczyk }
132343f4e393SDarek Stojaczyk 
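/*
 * Editor's note, worked example: for two virtually adjacent chunks with
 * paddr1 = 0x200000000 and paddr2 = 0x200200000, paddr2 - paddr1 == VALUE_2MB
 * (0x200000), so they form one physically contiguous 4MB region and
 * spdk_mem_map_translate() can merge them when reporting a contiguous length.
 * With paddr2 = 0x300000000 the difference is not VALUE_2MB, so the reported
 * region ends at the 2MB boundary.
 */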
13243456377bSSeth Howell #if VFIO_ENABLED
1325f74d069eSJim Harris 
1326f74d069eSJim Harris static bool
13273456377bSSeth Howell vfio_enabled(void)
1328f74d069eSJim Harris {
1329f74d069eSJim Harris 	return rte_vfio_is_enabled("vfio_pci");
1330f74d069eSJim Harris }
1331f74d069eSJim Harris 
1332f74d069eSJim Harris /* Check if IOMMU is enabled on the system */
1333f74d069eSJim Harris static bool
1334f74d069eSJim Harris has_iommu_groups(void)
1335f74d069eSJim Harris {
1336f74d069eSJim Harris 	int count = 0;
1337f74d069eSJim Harris 	DIR *dir = opendir("/sys/kernel/iommu_groups");
1338f74d069eSJim Harris 
1339f74d069eSJim Harris 	if (dir == NULL) {
1340f74d069eSJim Harris 		return false;
1341f74d069eSJim Harris 	}
1342f74d069eSJim Harris 
13432a53883aSyidong0635 	while (count < 3 && readdir(dir) != NULL) {
1344f74d069eSJim Harris 		count++;
1345f74d069eSJim Harris 	}
1346f74d069eSJim Harris 
1347f74d069eSJim Harris 	closedir(dir);
1348f74d069eSJim Harris 	/* there will always be ./ and ../ entries */
1349f74d069eSJim Harris 	return count > 2;
1350f74d069eSJim Harris }
1351f74d069eSJim Harris 
1352f74d069eSJim Harris static bool
13533456377bSSeth Howell vfio_noiommu_enabled(void)
1354f74d069eSJim Harris {
1355f74d069eSJim Harris 	return rte_vfio_noiommu_is_enabled();
1356f74d069eSJim Harris }
1357f74d069eSJim Harris 
1358f74d069eSJim Harris static void
13593456377bSSeth Howell vtophys_iommu_init(void)
1360f74d069eSJim Harris {
1361f74d069eSJim Harris 	char proc_fd_path[PATH_MAX + 1];
1362f74d069eSJim Harris 	char link_path[PATH_MAX + 1];
1363f74d069eSJim Harris 	const char vfio_path[] = "/dev/vfio/vfio";
1364f74d069eSJim Harris 	DIR *dir;
1365f74d069eSJim Harris 	struct dirent *d;
1366f74d069eSJim Harris 
13673456377bSSeth Howell 	if (!vfio_enabled()) {
1368f74d069eSJim Harris 		return;
1369f74d069eSJim Harris 	}
1370f74d069eSJim Harris 
13713456377bSSeth Howell 	if (vfio_noiommu_enabled()) {
1372f74d069eSJim Harris 		g_vfio.noiommu_enabled = true;
1373f74d069eSJim Harris 	} else if (!has_iommu_groups()) {
1374f74d069eSJim Harris 		return;
1375f74d069eSJim Harris 	}
1376f74d069eSJim Harris 
1377f74d069eSJim Harris 	dir = opendir("/proc/self/fd");
1378f74d069eSJim Harris 	if (!dir) {
1379f74d069eSJim Harris 		DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
1380f74d069eSJim Harris 		return;
1381f74d069eSJim Harris 	}
1382f74d069eSJim Harris 
1383f74d069eSJim Harris 	while ((d = readdir(dir)) != NULL) {
1384f74d069eSJim Harris 		if (d->d_type != DT_LNK) {
1385f74d069eSJim Harris 			continue;
1386f74d069eSJim Harris 		}
1387f74d069eSJim Harris 
1388f74d069eSJim Harris 		snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
1389f74d069eSJim Harris 		if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
1390f74d069eSJim Harris 			continue;
1391f74d069eSJim Harris 		}
1392f74d069eSJim Harris 
1393f74d069eSJim Harris 		if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
1394f74d069eSJim Harris 			sscanf(d->d_name, "%d", &g_vfio.fd);
1395f74d069eSJim Harris 			break;
1396f74d069eSJim Harris 		}
1397f74d069eSJim Harris 	}
1398f74d069eSJim Harris 
1399f74d069eSJim Harris 	closedir(dir);
1400f74d069eSJim Harris 
1401f74d069eSJim Harris 	if (g_vfio.fd < 0) {
1402f74d069eSJim Harris 		DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
1403f74d069eSJim Harris 		return;
1404f74d069eSJim Harris 	}
1405f74d069eSJim Harris 
1406f74d069eSJim Harris 	g_vfio.enabled = true;
1409f74d069eSJim Harris }
14108f7d9ec2SBen Walker 
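/*
 * Editor's sketch (hypothetical, compiled out): what a DMA mapping through
 * the discovered container fd looks like at the ioctl level. The real helper,
 * vtophys_iommu_map_dma() (defined earlier in this file), additionally
 * records the mapping in g_vfio.maps so it can be replayed in
 * vtophys_pci_device_added() below.
 */
#if 0
static int
example_vfio_map(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	struct vfio_iommu_type1_dma_map dma_map = {
		.argsz = sizeof(dma_map),
		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
		.vaddr = vaddr,
		.iova = iova,
		.size = size,
	};

	/* Program the IOMMU so devices can DMA to [iova, iova + size). */
	if (ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map) != 0) {
		return -errno;
	}

	return 0;
}
#endif
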
1411f74d069eSJim Harris #endif
1412f74d069eSJim Harris 
1413f74d069eSJim Harris void
141415d0ae62SSeth Howell vtophys_pci_device_added(struct rte_pci_device *pci_device)
1415f74d069eSJim Harris {
1416f74d069eSJim Harris 	struct spdk_vtophys_pci_device *vtophys_dev;
1417f74d069eSJim Harris 
1418f74d069eSJim Harris 	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1419f74d069eSJim Harris 
1420f74d069eSJim Harris 	vtophys_dev = calloc(1, sizeof(*vtophys_dev));
1421f74d069eSJim Harris 	if (vtophys_dev) {
1422f74d069eSJim Harris 		vtophys_dev->pci_device = pci_device;
1423f74d069eSJim Harris 		TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
1424f74d069eSJim Harris 	} else {
1425f74d069eSJim Harris 		DEBUG_PRINT("Memory allocation error\n");
1426f74d069eSJim Harris 	}
1427f74d069eSJim Harris 	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
14287fed70f1SChangpeng Liu 
14297fed70f1SChangpeng Liu #if VFIO_ENABLED
14307fed70f1SChangpeng Liu 	struct spdk_vfio_dma_map *dma_map;
14317fed70f1SChangpeng Liu 	int ret;
14327fed70f1SChangpeng Liu 
14337fed70f1SChangpeng Liu 	if (!g_vfio.enabled) {
14347fed70f1SChangpeng Liu 		return;
14357fed70f1SChangpeng Liu 	}
14367fed70f1SChangpeng Liu 
14377fed70f1SChangpeng Liu 	pthread_mutex_lock(&g_vfio.mutex);
14387fed70f1SChangpeng Liu 	g_vfio.device_ref++;
14397fed70f1SChangpeng Liu 	if (g_vfio.device_ref > 1) {
14407fed70f1SChangpeng Liu 		pthread_mutex_unlock(&g_vfio.mutex);
14417fed70f1SChangpeng Liu 		return;
14427fed70f1SChangpeng Liu 	}
14437fed70f1SChangpeng Liu 
14447fed70f1SChangpeng Liu 	/* This is the first SPDK device using DPDK vfio. This means that the first
14457fed70f1SChangpeng Liu 	 * IOMMU group might have just been added to the DPDK vfio container.
14467fed70f1SChangpeng Liu 	 * From this point on, it is certain that the memory can be mapped.
14477fed70f1SChangpeng Liu 	 */
14487fed70f1SChangpeng Liu 	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
14497fed70f1SChangpeng Liu 		ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
14507fed70f1SChangpeng Liu 		if (ret) {
14517fed70f1SChangpeng Liu 			DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
14527fed70f1SChangpeng Liu 			break;
14537fed70f1SChangpeng Liu 		}
14547fed70f1SChangpeng Liu 	}
14557fed70f1SChangpeng Liu 	pthread_mutex_unlock(&g_vfio.mutex);
14567fed70f1SChangpeng Liu #endif
1457f74d069eSJim Harris }
1458f74d069eSJim Harris 
1459f74d069eSJim Harris void
146015d0ae62SSeth Howell vtophys_pci_device_removed(struct rte_pci_device *pci_device)
1461f74d069eSJim Harris {
1462f74d069eSJim Harris 	struct spdk_vtophys_pci_device *vtophys_dev;
1463f74d069eSJim Harris 
1464f74d069eSJim Harris 	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1465f74d069eSJim Harris 	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
1466f74d069eSJim Harris 		if (vtophys_dev->pci_device == pci_device) {
1467f74d069eSJim Harris 			TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
1468f74d069eSJim Harris 			free(vtophys_dev);
1469f74d069eSJim Harris 			break;
1470f74d069eSJim Harris 		}
1471f74d069eSJim Harris 	}
1472f74d069eSJim Harris 	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
14737fed70f1SChangpeng Liu 
14747fed70f1SChangpeng Liu #if VFIO_ENABLED
14757fed70f1SChangpeng Liu 	struct spdk_vfio_dma_map *dma_map;
14767fed70f1SChangpeng Liu 	int ret;
14777fed70f1SChangpeng Liu 
14787fed70f1SChangpeng Liu 	if (!g_vfio.enabled) {
14797fed70f1SChangpeng Liu 		return;
14807fed70f1SChangpeng Liu 	}
14817fed70f1SChangpeng Liu 
14827fed70f1SChangpeng Liu 	pthread_mutex_lock(&g_vfio.mutex);
14837fed70f1SChangpeng Liu 	assert(g_vfio.device_ref > 0);
14847fed70f1SChangpeng Liu 	g_vfio.device_ref--;
14857fed70f1SChangpeng Liu 	if (g_vfio.device_ref > 0) {
14867fed70f1SChangpeng Liu 		pthread_mutex_unlock(&g_vfio.mutex);
14877fed70f1SChangpeng Liu 		return;
14887fed70f1SChangpeng Liu 	}
14897fed70f1SChangpeng Liu 
14907fed70f1SChangpeng Liu 	/* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
14917fed70f1SChangpeng Liu 	 * any additional devices using its vfio container, all the mappings
14927fed70f1SChangpeng Liu 	 * will be automatically removed by the Linux vfio driver. We unmap
14937fed70f1SChangpeng Liu 	 * the memory manually so that it can easily be re-mapped later,
14947fed70f1SChangpeng Liu 	 * regardless of external factors.
14957fed70f1SChangpeng Liu 	 */
14967fed70f1SChangpeng Liu 	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
14977fed70f1SChangpeng Liu 		struct vfio_iommu_type1_dma_unmap unmap = {};
14987fed70f1SChangpeng Liu 		unmap.argsz = sizeof(unmap);
14997fed70f1SChangpeng Liu 		unmap.flags = 0;
15007fed70f1SChangpeng Liu 		unmap.iova = dma_map->map.iova;
15017fed70f1SChangpeng Liu 		unmap.size = dma_map->map.size;
15027fed70f1SChangpeng Liu 		ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
15037fed70f1SChangpeng Liu 		if (ret) {
15047fed70f1SChangpeng Liu 			DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
15057fed70f1SChangpeng Liu 			break;
15067fed70f1SChangpeng Liu 		}
15077fed70f1SChangpeng Liu 	}
15087fed70f1SChangpeng Liu 	pthread_mutex_unlock(&g_vfio.mutex);
15097fed70f1SChangpeng Liu #endif
1510f74d069eSJim Harris }
1511f74d069eSJim Harris 
1512f74d069eSJim Harris int
151315d0ae62SSeth Howell vtophys_init(void)
1514f74d069eSJim Harris {
1515f74d069eSJim Harris 	const struct spdk_mem_map_ops vtophys_map_ops = {
15163456377bSSeth Howell 		.notify_cb = vtophys_notify,
151743f4e393SDarek Stojaczyk 		.are_contiguous = vtophys_check_contiguous_entries,
1518f74d069eSJim Harris 	};
1519f74d069eSJim Harris 
1520f4a63bb8SSeth Howell 	const struct spdk_mem_map_ops phys_ref_map_ops = {
1521f4a63bb8SSeth Howell 		.notify_cb = NULL,
1522f4a63bb8SSeth Howell 		.are_contiguous = NULL,
1523f4a63bb8SSeth Howell 	};
1524f4a63bb8SSeth Howell 
1525*40c9acf6SJim Harris 	const struct spdk_mem_map_ops numa_map_ops = {
1526*40c9acf6SJim Harris 		.notify_cb = numa_notify,
1527*40c9acf6SJim Harris 		.are_contiguous = NULL,
1528*40c9acf6SJim Harris 	};
1529*40c9acf6SJim Harris 
15303456377bSSeth Howell #if VFIO_ENABLED
15313456377bSSeth Howell 	vtophys_iommu_init();
1532f74d069eSJim Harris #endif
1533f74d069eSJim Harris 
1534f4a63bb8SSeth Howell 	g_phys_ref_map = spdk_mem_map_alloc(0, &phys_ref_map_ops, NULL);
1535f4a63bb8SSeth Howell 	if (g_phys_ref_map == NULL) {
1536f4a63bb8SSeth Howell 		DEBUG_PRINT("phys_ref map allocation failed.\n");
1537f4a63bb8SSeth Howell 		return -ENOMEM;
1538f4a63bb8SSeth Howell 	}
1539f4a63bb8SSeth Howell 
1540*40c9acf6SJim Harris 	g_numa_map = spdk_mem_map_alloc(SPDK_ENV_NUMA_ID_ANY, &numa_map_ops, NULL);
1541*40c9acf6SJim Harris 	if (g_numa_map == NULL) {
1542*40c9acf6SJim Harris 		DEBUG_PRINT("numa map allocation failed.\n");
1543*40c9acf6SJim Harris 		spdk_mem_map_free(&g_phys_ref_map);
1544*40c9acf6SJim Harris 		return -ENOMEM;
1545*40c9acf6SJim Harris 	}
1546*40c9acf6SJim Harris 
1547a6658c54SSarvesh Lanke 	if (g_huge_pages) {
1548f74d069eSJim Harris 		g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
1549f74d069eSJim Harris 		if (g_vtophys_map == NULL) {
1550f74d069eSJim Harris 			DEBUG_PRINT("vtophys map allocation failed.\n");
1551*40c9acf6SJim Harris 			spdk_mem_map_free(&g_numa_map);
155235dfd3eaSyidong0635 			spdk_mem_map_free(&g_phys_ref_map);
155337c0a02eSJim Harris 			return -ENOMEM;
1554f74d069eSJim Harris 		}
1555a6658c54SSarvesh Lanke 	}
1556f74d069eSJim Harris 	return 0;
1557f74d069eSJim Harris }
1558f74d069eSJim Harris 
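/*
 * Editor's sketch (hypothetical, compiled out): vtophys_init() is not called
 * directly by applications; it runs during environment setup. The opts_size
 * assignment follows the current spdk_env_opts convention and is an
 * assumption about the caller's SPDK version.
 */
#if 0
static int
example_env_bootstrap(void)
{
	struct spdk_env_opts opts;

	opts.opts_size = sizeof(opts);
	spdk_env_opts_init(&opts);
	opts.name = "vtophys_example";
	if (spdk_env_init(&opts) < 0) {
		/* env setup failed; the vtophys/numa maps were not created */
		return -1;
	}

	return 0;
}
#endif
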
1559f74d069eSJim Harris uint64_t
1560d48a7b26SBen Walker spdk_vtophys(const void *buf, uint64_t *size)
1561f74d069eSJim Harris {
1562f74d069eSJim Harris 	uint64_t vaddr, paddr_2mb;
1563f74d069eSJim Harris 
1564a6658c54SSarvesh Lanke 	if (!g_huge_pages) {
1565a6658c54SSarvesh Lanke 		return SPDK_VTOPHYS_ERROR;
1566a6658c54SSarvesh Lanke 	}
1567a6658c54SSarvesh Lanke 
1568f74d069eSJim Harris 	vaddr = (uint64_t)buf;
1569f74d069eSJim Harris 	paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);
1570f74d069eSJim Harris 
1571f74d069eSJim Harris 	/*
1572f74d069eSJim Harris 	 * SPDK_VTOPHYS_ERROR has all bits set. When the 2MB offset was combined with a
1573f74d069eSJim Harris 	 * bitwise OR, an error value propagated through unchanged and needed no check.
1574f74d069eSJim Harris 	 * Now that we add the offset instead (PCI BAR paddrs may not be 2MB aligned),
1575f74d069eSJim Harris 	 * we must check for SPDK_VTOPHYS_ERROR before the addition.
1576f74d069eSJim Harris 	 */
1577f74d069eSJim Harris 	SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
1578f74d069eSJim Harris 	if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
1579f74d069eSJim Harris 		return SPDK_VTOPHYS_ERROR;
1580f74d069eSJim Harris 	} else {
1581f74d069eSJim Harris 		return paddr_2mb + (vaddr & MASK_2MB);
1582f74d069eSJim Harris 	}
1583f74d069eSJim Harris }
1584c0bf9314SChangpeng Liu 
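/*
 * Editor's sketch (hypothetical, compiled out): typical caller-side use of
 * spdk_vtophys(). On success, *size is reduced to the number of physically
 * contiguous bytes starting at buf, as determined via
 * vtophys_check_contiguous_entries() above.
 */
#if 0
static void
example_translate(void)
{
	uint64_t size = 4096, paddr;
	void *buf = spdk_dma_zmalloc(4096, 0x1000, NULL);

	if (buf == NULL) {
		return;
	}

	paddr = spdk_vtophys(buf, &size);
	if (paddr == SPDK_VTOPHYS_ERROR) {
		/* buf is not inside registered, hugepage-backed memory */
	}

	spdk_dma_free(buf);
}
#endif
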
1585*40c9acf6SJim Harris int32_t
1586*40c9acf6SJim Harris spdk_mem_get_numa_id(const void *buf, uint64_t *size)
1587*40c9acf6SJim Harris {
1588*40c9acf6SJim Harris 	return spdk_mem_map_translate(g_numa_map, (uint64_t)buf, size);
1589*40c9acf6SJim Harris }
1590*40c9acf6SJim Harris 
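/*
 * Editor's sketch (hypothetical, compiled out): for DPDK-managed memory this
 * returns the backing memseg's socket id; for other registered memory (e.g.
 * vhost or vfio-user) it returns SPDK_ENV_NUMA_ID_ANY, per numa_notify()
 * above.
 */
#if 0
static void
example_numa_lookup(const void *buf)
{
	int32_t numa_id = spdk_mem_get_numa_id(buf, NULL);

	if (numa_id == SPDK_ENV_NUMA_ID_ANY) {
		/* no NUMA locality information is available for buf */
	}
}
#endif
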
1591c0bf9314SChangpeng Liu int
1592c0bf9314SChangpeng Liu spdk_mem_get_fd_and_offset(void *vaddr, uint64_t *offset)
1593c0bf9314SChangpeng Liu {
1594c0bf9314SChangpeng Liu 	struct rte_memseg *seg;
1595c0bf9314SChangpeng Liu 	int ret, fd;
1596c0bf9314SChangpeng Liu 
1597c0bf9314SChangpeng Liu 	seg = rte_mem_virt2memseg(vaddr, NULL);
1598c0bf9314SChangpeng Liu 	if (!seg) {
1599c0bf9314SChangpeng Liu 		SPDK_ERRLOG("memory %p doesn't exist\n", vaddr);
1600c0bf9314SChangpeng Liu 		return -ENOENT;
1601c0bf9314SChangpeng Liu 	}
1602c0bf9314SChangpeng Liu 
1603c0bf9314SChangpeng Liu 	fd = rte_memseg_get_fd_thread_unsafe(seg);
1604c0bf9314SChangpeng Liu 	if (fd < 0) {
1605c0bf9314SChangpeng Liu 		return fd;
1606c0bf9314SChangpeng Liu 	}
1607c0bf9314SChangpeng Liu 
1608c0bf9314SChangpeng Liu 	ret = rte_memseg_get_fd_offset_thread_unsafe(seg, offset);
1609c0bf9314SChangpeng Liu 	if (ret < 0) {
1610c0bf9314SChangpeng Liu 		return ret;
1611c0bf9314SChangpeng Liu 	}
1612c0bf9314SChangpeng Liu 
1613c0bf9314SChangpeng Liu 	return fd;
1614c0bf9314SChangpeng Liu }
1615a6658c54SSarvesh Lanke 
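/*
 * Editor's sketch (hypothetical, compiled out): the returned fd and offset
 * identify the hugepage file backing the address, so the region can be shared
 * with another process.
 */
#if 0
static void
example_share_memory(void *buf)
{
	uint64_t offset;
	int fd = spdk_mem_get_fd_and_offset(buf, &offset);

	if (fd < 0) {
		return;
	}

	/* A peer process receiving fd (e.g. over a UNIX domain socket) can map
	 * the same pages with mmap(..., MAP_SHARED, fd, offset). */
}
#endif
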
1616a6658c54SSarvesh Lanke void
1617a6658c54SSarvesh Lanke mem_disable_huge_pages(void)
1618a6658c54SSarvesh Lanke {
1619a6658c54SSarvesh Lanke 	g_huge_pages = false;
1620a6658c54SSarvesh Lanke }
1621