1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "env_internal.h"
37 
38 #include <rte_config.h>
39 #include <rte_eal_memconfig.h>
40 
41 #include "spdk_internal/assert.h"
42 #include "spdk_internal/memory.h"
43 
44 #include "spdk/assert.h"
45 #include "spdk/likely.h"
46 #include "spdk/queue.h"
47 #include "spdk/util.h"
48 
49 #ifdef __FreeBSD__
50 #define SPDK_VFIO_ENABLED 0
51 #else
52 #include <linux/version.h>
53 /*
54  * DPDK versions before 17.11 don't provide a way to get VFIO information in the public API,
55  * and we can't link to internal symbols when built against shared library DPDK,
56  * so disable VFIO entirely in that case.
57  */
58 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0) && \
59     (RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3) || !defined(RTE_BUILD_SHARED_LIB))
60 
61 #define SPDK_VFIO_ENABLED 1
62 #include <linux/vfio.h>
63 
64 #if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
65 #include <rte_vfio.h>
66 #else
67 /* Internal DPDK function forward declaration */
68 int pci_vfio_is_enabled(void);
69 #endif
70 
71 struct spdk_vfio_dma_map {
72 	struct vfio_iommu_type1_dma_map map;
73 	struct vfio_iommu_type1_dma_unmap unmap;
74 	TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
75 };
76 
77 struct vfio_cfg {
78 	int fd;
79 	bool enabled;
80 	bool noiommu_enabled;
81 	unsigned device_ref;
82 	TAILQ_HEAD(, spdk_vfio_dma_map) maps;
83 	pthread_mutex_t mutex;
84 };
85 
86 static struct vfio_cfg g_vfio = {
87 	.fd = -1,
88 	.enabled = false,
89 	.noiommu_enabled = false,
90 	.device_ref = 0,
91 	.maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
92 	.mutex = PTHREAD_MUTEX_INITIALIZER
93 };
94 
95 #else
96 #define SPDK_VFIO_ENABLED 0
97 #endif
98 #endif
99 
100 #if DEBUG
101 #define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__)
102 #else
103 #define DEBUG_PRINT(...)
104 #endif
105 
106 #define FN_2MB_TO_4KB(fn)	(fn << (SHIFT_2MB - SHIFT_4KB))
107 #define FN_4KB_TO_2MB(fn)	(fn >> (SHIFT_2MB - SHIFT_4KB))
108 
109 #define MAP_256TB_IDX(vfn_2mb)	((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
110 #define MAP_1GB_IDX(vfn_2mb)	((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))
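/*
 * Illustrative example of the index math above (assuming SHIFT_2MB == 21 and
 * SHIFT_1GB == 30, as defined in spdk_internal/memory.h):
 *
 *   vaddr     = 0x40200000               (1 GiB + 2 MiB)
 *   vfn_2mb   = vaddr >> SHIFT_2MB       = 513
 *   idx_256tb = MAP_256TB_IDX(vfn_2mb)   = 513 >> 9  = 1
 *   idx_1gb   = MAP_1GB_IDX(vfn_2mb)     = 513 & 511 = 1
 *
 * so the translation for this 2MB page lives in map_256tb.map[1]->map[1].
 */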
111 
112 /* Page is registered */
113 #define REG_MAP_REGISTERED	(1ULL << 62)
114 
115 /* A notification region barrier. The 2MB translation entry that's marked
116  * with this flag must be unregistered separately. This allows contiguous
117  * regions to be unregistered in the same chunks they were registered.
118  */
119 #define REG_MAP_NOTIFY_START	(1ULL << 63)
120 
121 /* Translation of a single 2MB page. */
122 struct map_2mb {
123 	uint64_t translation_2mb;
124 };
125 
126 /* Second-level map table indexed by bits [21..29] of the virtual address.
127  * Each entry contains the address translation, or the map's default
128  * translation for entries that have not been set yet.
129  */
130 struct map_1gb {
131 	struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
132 };
133 
134 /* Top-level map table indexed by bits [30..47] of the virtual address.
135  * Each entry points to a second-level map table or NULL.
136  */
137 struct map_256tb {
138 	struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
139 };
140 
141 /* Page-granularity memory address translation */
142 struct spdk_mem_map {
143 	struct map_256tb map_256tb;
144 	pthread_mutex_t mutex;
145 	uint64_t default_translation;
146 	struct spdk_mem_map_ops ops;
147 	void *cb_ctx;
148 	TAILQ_ENTRY(spdk_mem_map) tailq;
149 };
150 
151 /* Registrations map. The 64-bit translations are bit fields with the
152  * following layout (starting with the low bits):
153  *    0 - 61 : reserved
154  *   62 - 63 : flags
155  */
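/*
 * Sketch of what spdk_mem_register() (below) stores for two back-to-back 4MB
 * registrations starting at a 2MB-aligned address A (illustrative):
 *
 *   [A + 0MB] REG_MAP_REGISTERED | REG_MAP_NOTIFY_START   <- start of region 1
 *   [A + 2MB] REG_MAP_REGISTERED
 *   [A + 4MB] REG_MAP_REGISTERED | REG_MAP_NOTIFY_START   <- start of region 2
 *   [A + 6MB] REG_MAP_REGISTERED
 *
 * The REG_MAP_NOTIFY_START barrier is what lets spdk_mem_unregister() refuse
 * to split a region and lets the notify walk report regions in the same
 * chunks they were registered.
 */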
156 static struct spdk_mem_map *g_mem_reg_map;
157 static TAILQ_HEAD(, spdk_mem_map) g_spdk_mem_maps = TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
158 static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;
159 
160 /*
161  * Walk the currently registered memory via the main memory registration map
162  * and call the new map's notify callback for each virtually contiguous region.
163  */
164 static int
165 spdk_mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
166 {
167 	size_t idx_256tb;
168 	uint64_t idx_1gb;
169 	uint64_t contig_start = UINT64_MAX;
170 	uint64_t contig_end = UINT64_MAX;
171 	struct map_1gb *map_1gb;
172 	int rc;
173 
174 	if (!g_mem_reg_map) {
175 		return -EINVAL;
176 	}
177 
178 	/* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
179 	pthread_mutex_lock(&g_mem_reg_map->mutex);
180 
181 	for (idx_256tb = 0;
182 	     idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
183 	     idx_256tb++) {
184 		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];
185 
186 		if (!map_1gb) {
187 			if (contig_start != UINT64_MAX) {
188 				/* End of a virtually contiguous range */
189 				rc = map->ops.notify_cb(map->cb_ctx, map, action,
190 							(void *)contig_start,
191 							contig_end - contig_start + VALUE_2MB);
192 				/* Don't bother handling unregister failures. It can't be any worse */
193 				if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
194 					goto err_unregister;
195 				}
196 			}
197 			contig_start = UINT64_MAX;
198 			continue;
199 		}
200 
201 		for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
202 			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
203 			    (contig_start == UINT64_MAX ||
204 			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
205 				/* Rebuild the virtual address from the indexes */
206 				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
207 
208 				if (contig_start == UINT64_MAX) {
209 					contig_start = vaddr;
210 				}
211 
212 				contig_end = vaddr;
213 			} else {
214 				if (contig_start != UINT64_MAX) {
215 					/* End of a virtually contiguous range */
216 					rc = map->ops.notify_cb(map->cb_ctx, map, action,
217 								(void *)contig_start,
218 								contig_end - contig_start + VALUE_2MB);
219 					/* Don't bother handling unregister failures. It can't be any worse */
220 					if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
221 						goto err_unregister;
222 					}
223 
224 					/* This page might be part of a neighbouring region, so process
225 					 * it again. The idx_1gb will be incremented immediately.
226 					 */
227 					idx_1gb--;
228 				}
229 				contig_start = UINT64_MAX;
230 			}
231 		}
232 	}
233 
234 	pthread_mutex_unlock(&g_mem_reg_map->mutex);
235 	return 0;
236 
237 err_unregister:
238 	/* Unwind to the first empty translation so we don't unregister
239 	 * a region that just failed to register.
240 	 */
241 	idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
242 	idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
243 	contig_start = UINT64_MAX;
244 	contig_end = UINT64_MAX;
245 
246 	/* Unregister any memory we managed to register before the failure */
247 	for (; idx_256tb < SIZE_MAX; idx_256tb--) {
248 		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];
249 
250 		if (!map_1gb) {
251 			if (contig_end != UINT64_MAX) {
252 				/* End of a virtually contiguous range */
253 				map->ops.notify_cb(map->cb_ctx, map,
254 						   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
255 						   (void *)contig_start,
256 						   contig_end - contig_start + VALUE_2MB);
257 			}
258 			contig_end = UINT64_MAX;
259 			continue;
260 		}
261 
262 		for (; idx_1gb < UINT64_MAX; idx_1gb--) {
263 			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
264 			    (contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
265 				/* Rebuild the virtual address from the indexes */
266 				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
267 
268 				if (contig_end == UINT64_MAX) {
269 					contig_end = vaddr;
270 				}
271 				contig_start = vaddr;
272 			} else {
273 				if (contig_end != UINT64_MAX) {
274 					/* End of a virtually contiguous range */
275 					map->ops.notify_cb(map->cb_ctx, map,
276 							   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
277 							   (void *)contig_start,
278 							   contig_end - contig_start + VALUE_2MB);
279 					idx_1gb++;
280 				}
281 				contig_end = UINT64_MAX;
282 			}
283 		}
284 		idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
285 	}
286 
287 	pthread_mutex_unlock(&g_mem_reg_map->mutex);
288 	return rc;
289 }
290 
291 struct spdk_mem_map *
292 spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
293 {
294 	struct spdk_mem_map *map;
295 	int rc;
296 
297 	map = calloc(1, sizeof(*map));
298 	if (map == NULL) {
299 		return NULL;
300 	}
301 
302 	if (pthread_mutex_init(&map->mutex, NULL)) {
303 		free(map);
304 		return NULL;
305 	}
306 
307 	map->default_translation = default_translation;
308 	map->cb_ctx = cb_ctx;
309 	if (ops) {
310 		map->ops = *ops;
311 	}
312 
313 	if (ops && ops->notify_cb) {
314 		pthread_mutex_lock(&g_spdk_mem_map_mutex);
315 		rc = spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
316 		if (rc != 0) {
317 			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
318 			DEBUG_PRINT("Initial mem_map notify failed\n");
319 			pthread_mutex_destroy(&map->mutex);
320 			free(map);
321 			return NULL;
322 		}
323 		TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
324 		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
325 	}
326 
327 	return map;
328 }
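/*
 * Minimal usage sketch for spdk_mem_map_alloc()/spdk_mem_map_free(). The
 * callback and its name are hypothetical, not part of this file:
 *
 *   static int
 *   my_notify(void *cb_ctx, struct spdk_mem_map *map,
 *             enum spdk_mem_map_notify_action action, void *vaddr, size_t len)
 *   {
 *           // called once per virtually contiguous registered region
 *           return 0;
 *   }
 *
 *   static const struct spdk_mem_map_ops my_ops = {
 *           .notify_cb = my_notify,
 *           .are_contiguous = NULL,
 *   };
 *
 *   struct spdk_mem_map *map = spdk_mem_map_alloc(UINT64_MAX, &my_ops, NULL);
 *   ...
 *   spdk_mem_map_free(&map);
 */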
329 
330 void
331 spdk_mem_map_free(struct spdk_mem_map **pmap)
332 {
333 	struct spdk_mem_map *map;
334 	size_t i;
335 
336 	if (!pmap) {
337 		return;
338 	}
339 
340 	map = *pmap;
341 
342 	if (!map) {
343 		return;
344 	}
345 
346 	if (map->ops.notify_cb) {
347 		pthread_mutex_lock(&g_spdk_mem_map_mutex);
348 		spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
349 		TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
350 		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
351 	}
352 
353 	for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
354 		free(map->map_256tb.map[i]);
355 	}
356 
357 	pthread_mutex_destroy(&map->mutex);
358 
359 	free(map);
360 	*pmap = NULL;
361 }
362 
363 int
364 spdk_mem_register(void *vaddr, size_t len)
365 {
366 	struct spdk_mem_map *map;
367 	int rc;
368 	void *seg_vaddr;
369 	size_t seg_len;
370 	uint64_t reg;
371 
372 	if ((uintptr_t)vaddr & ~MASK_256TB) {
373 		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
374 		return -EINVAL;
375 	}
376 
377 	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
378 		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
379 			    __func__, vaddr, len);
380 		return -EINVAL;
381 	}
382 
383 	if (len == 0) {
384 		return 0;
385 	}
386 
387 	pthread_mutex_lock(&g_spdk_mem_map_mutex);
388 
389 	seg_vaddr = vaddr;
390 	seg_len = len;
391 	while (seg_len > 0) {
392 		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
393 		if (reg & REG_MAP_REGISTERED) {
394 			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
395 			return -EBUSY;
396 		}
397 		seg_vaddr += VALUE_2MB;
398 		seg_len -= VALUE_2MB;
399 	}
400 
401 	seg_vaddr = vaddr;
402 	seg_len = 0;
403 	while (len > 0) {
404 		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
405 					     seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
406 		seg_len += VALUE_2MB;
407 		vaddr += VALUE_2MB;
408 		len -= VALUE_2MB;
409 	}
410 
411 	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
412 		rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
413 		if (rc != 0) {
414 			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
415 			return rc;
416 		}
417 	}
418 
419 	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
420 	return 0;
421 }
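/*
 * Illustrative caller-side example (not part of this file). Both the address
 * and the length must be 2MB-aligned; real applications typically register
 * pinned or hugepage-backed memory:
 *
 *   size_t len = 4 * VALUE_2MB;
 *   void *buf = NULL;
 *
 *   if (posix_memalign(&buf, VALUE_2MB, len) == 0) {
 *           int rc = spdk_mem_register(buf, len);    // notifies all mem maps
 *           ...
 *           spdk_mem_unregister(buf, len);
 *           free(buf);
 *   }
 */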
422 
423 int
424 spdk_mem_unregister(void *vaddr, size_t len)
425 {
426 	struct spdk_mem_map *map;
427 	int rc;
428 	void *seg_vaddr;
429 	size_t seg_len;
430 	uint64_t reg, newreg;
431 
432 	if ((uintptr_t)vaddr & ~MASK_256TB) {
433 		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
434 		return -EINVAL;
435 	}
436 
437 	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
438 		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
439 			    __func__, vaddr, len);
440 		return -EINVAL;
441 	}
442 
443 	pthread_mutex_lock(&g_spdk_mem_map_mutex);
444 
445 	/* The first page must be a start of a region. Also check if it's
446 	 * registered to make sure we don't return -ERANGE for non-registered
447 	 * regions.
448 	 */
449 	reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
450 	if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
451 		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
452 		return -ERANGE;
453 	}
454 
455 	seg_vaddr = vaddr;
456 	seg_len = len;
457 	while (seg_len > 0) {
458 		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
459 		if ((reg & REG_MAP_REGISTERED) == 0) {
460 			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
461 			return -EINVAL;
462 		}
463 		seg_vaddr += VALUE_2MB;
464 		seg_len -= VALUE_2MB;
465 	}
466 
467 	newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
468 	/* If the next page is registered, it must be a start of a region as well,
469 	 * otherwise we'd be unregistering only a part of a region.
470 	 */
471 	if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
472 		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
473 		return -ERANGE;
474 	}
475 	seg_vaddr = vaddr;
476 	seg_len = 0;
477 
478 	while (len > 0) {
479 		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
480 		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);
481 
482 		if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
483 			TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
484 				rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
485 				if (rc != 0) {
486 					pthread_mutex_unlock(&g_spdk_mem_map_mutex);
487 					return rc;
488 				}
489 			}
490 
491 			seg_vaddr = vaddr;
492 			seg_len = VALUE_2MB;
493 		} else {
494 			seg_len += VALUE_2MB;
495 		}
496 
497 		vaddr += VALUE_2MB;
498 		len -= VALUE_2MB;
499 	}
500 
501 	if (seg_len > 0) {
502 		TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
503 			rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
504 			if (rc != 0) {
505 				pthread_mutex_unlock(&g_spdk_mem_map_mutex);
506 				return rc;
507 			}
508 		}
509 	}
510 
511 	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
512 	return 0;
513 }
514 
515 static struct map_1gb *
516 spdk_mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
517 {
518 	struct map_1gb *map_1gb;
519 	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
520 	size_t i;
521 
522 	if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
523 		return NULL;
524 	}
525 
526 	map_1gb = map->map_256tb.map[idx_256tb];
527 
528 	if (!map_1gb) {
529 		pthread_mutex_lock(&map->mutex);
530 
531 		/* Recheck to make sure nobody else got the mutex first. */
532 		map_1gb = map->map_256tb.map[idx_256tb];
533 		if (!map_1gb) {
534 			map_1gb = malloc(sizeof(struct map_1gb));
535 			if (map_1gb) {
536 				/* initialize all entries to default translation */
537 				for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
538 					map_1gb->map[i].translation_2mb = map->default_translation;
539 				}
540 				map->map_256tb.map[idx_256tb] = map_1gb;
541 			}
542 		}
543 
544 		pthread_mutex_unlock(&map->mutex);
545 
546 		if (!map_1gb) {
547 			DEBUG_PRINT("allocation failed\n");
548 			return NULL;
549 		}
550 	}
551 
552 	return map_1gb;
553 }
554 
555 int
556 spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
557 			     uint64_t translation)
558 {
559 	uint64_t vfn_2mb;
560 	struct map_1gb *map_1gb;
561 	uint64_t idx_1gb;
562 	struct map_2mb *map_2mb;
563 
564 	if ((uintptr_t)vaddr & ~MASK_256TB) {
565 		DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr);
566 		return -EINVAL;
567 	}
568 
569 	/* For now, only 2 MB-aligned registrations are supported */
570 	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
571 		DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n",
572 			    __func__, vaddr, size);
573 		return -EINVAL;
574 	}
575 
576 	vfn_2mb = vaddr >> SHIFT_2MB;
577 
578 	while (size) {
579 		map_1gb = spdk_mem_map_get_map_1gb(map, vfn_2mb);
580 		if (!map_1gb) {
581 			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
582 			return -ENOMEM;
583 		}
584 
585 		idx_1gb = MAP_1GB_IDX(vfn_2mb);
586 		map_2mb = &map_1gb->map[idx_1gb];
587 		map_2mb->translation_2mb = translation;
588 
589 		size -= VALUE_2MB;
590 		vfn_2mb++;
591 	}
592 
593 	return 0;
594 }
595 
596 int
597 spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
598 {
599 	uint64_t vfn_2mb;
600 	struct map_1gb *map_1gb;
601 	uint64_t idx_1gb;
602 	struct map_2mb *map_2mb;
603 
604 	if ((uintptr_t)vaddr & ~MASK_256TB) {
605 		DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr);
606 		return -EINVAL;
607 	}
608 
609 	/* For now, only 2 MB-aligned registrations are supported */
610 	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
611 		DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n",
612 			    __func__, vaddr, size);
613 		return -EINVAL;
614 	}
615 
616 	vfn_2mb = vaddr >> SHIFT_2MB;
617 
618 	while (size) {
619 		map_1gb = spdk_mem_map_get_map_1gb(map, vfn_2mb);
620 		if (!map_1gb) {
621 			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
622 			return -ENOMEM;
623 		}
624 
625 		idx_1gb = MAP_1GB_IDX(vfn_2mb);
626 		map_2mb = &map_1gb->map[idx_1gb];
627 		map_2mb->translation_2mb = map->default_translation;
628 
629 		size -= VALUE_2MB;
630 		vfn_2mb++;
631 	}
632 
633 	return 0;
634 }
635 
636 inline uint64_t
637 spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
638 {
639 	const struct map_1gb *map_1gb;
640 	const struct map_2mb *map_2mb;
641 	uint64_t idx_256tb;
642 	uint64_t idx_1gb;
643 	uint64_t vfn_2mb;
644 	uint64_t cur_size;
645 	uint64_t prev_translation;
646 	uint64_t orig_translation;
647 
648 	if (spdk_unlikely(vaddr & ~MASK_256TB)) {
649 		DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
650 		return map->default_translation;
651 	}
652 
653 	vfn_2mb = vaddr >> SHIFT_2MB;
654 	idx_256tb = MAP_256TB_IDX(vfn_2mb);
655 	idx_1gb = MAP_1GB_IDX(vfn_2mb);
656 
657 	map_1gb = map->map_256tb.map[idx_256tb];
658 	if (spdk_unlikely(!map_1gb)) {
659 		return map->default_translation;
660 	}
661 
662 	cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
663 	map_2mb = &map_1gb->map[idx_1gb];
664 	if (size == NULL || map->ops.are_contiguous == NULL ||
665 	    map_2mb->translation_2mb == map->default_translation) {
666 		if (size != NULL) {
667 			*size = spdk_min(*size, cur_size);
668 		}
669 		return map_2mb->translation_2mb;
670 	}
671 
672 	orig_translation = map_2mb->translation_2mb;
673 	prev_translation = orig_translation;
674 	while (cur_size < *size) {
675 		vfn_2mb++;
676 		idx_256tb = MAP_256TB_IDX(vfn_2mb);
677 		idx_1gb = MAP_1GB_IDX(vfn_2mb);
678 
679 		map_1gb = map->map_256tb.map[idx_256tb];
680 		if (spdk_unlikely(!map_1gb)) {
681 			break;
682 		}
683 
684 		map_2mb = &map_1gb->map[idx_1gb];
685 		if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
686 			break;
687 		}
688 
689 		cur_size += VALUE_2MB;
690 		prev_translation = map_2mb->translation_2mb;
691 	}
692 
693 	*size = spdk_min(*size, cur_size);
694 	return orig_translation;
695 }
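/*
 * Illustrative use of the optional size parameter (not part of this file):
 * *size is an in/out value - on input the number of bytes the caller wants to
 * access starting at vaddr, on output clamped to the bytes covered by one
 * contiguous translation (at most the rest of the current 2MB page unless
 * ops.are_contiguous is provided):
 *
 *   uint64_t len = user_len;    // 'user_len' is a hypothetical length
 *   uint64_t translation = spdk_mem_map_translate(map, vaddr, &len);
 *   // 'translation' is valid for the first 'len' bytes only; translate
 *   // vaddr + len to continue.
 */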
696 
697 #if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
698 static void
699 memory_hotplug_cb(enum rte_mem_event event_type,
700 		  const void *addr, size_t len, void *arg)
701 {
702 	if (event_type == RTE_MEM_EVENT_ALLOC) {
703 		spdk_mem_register((void *)addr, len);
704 
705 		/* Now mark each segment so that DPDK won't later free it.
706 		 * This ensures we don't have to deal with the memory
707 		 * getting freed in different units than it was allocated.
708 		 */
709 		while (len > 0) {
710 			struct rte_memseg *seg;
711 
712 			seg = rte_mem_virt2memseg(addr, NULL);
713 			assert(seg != NULL);
714 			seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
715 			addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
716 			len -= seg->hugepage_sz;
717 		}
718 	} else if (event_type == RTE_MEM_EVENT_FREE) {
719 		spdk_mem_unregister((void *)addr, len);
720 	}
721 }
722 
723 static int
724 memory_iter_cb(const struct rte_memseg_list *msl,
725 	       const struct rte_memseg *ms, size_t len, void *arg)
726 {
727 	return spdk_mem_register(ms->addr, len);
728 }
729 #endif
730 
731 int
732 spdk_mem_map_init(void)
733 {
734 	g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
735 	if (g_mem_reg_map == NULL) {
736 		DEBUG_PRINT("memory registration map allocation failed\n");
737 		return -1;
738 	}
739 
740 	/*
741 	 * Walk all DPDK memory segments and register them
742 	 * with the master memory map
743 	 */
744 #if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
745 	rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
746 	rte_memseg_contig_walk(memory_iter_cb, NULL);
747 #else
748 	struct rte_mem_config *mcfg;
749 	size_t seg_idx;
750 
751 	mcfg = rte_eal_get_configuration()->mem_config;
752 	for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) {
753 		struct rte_memseg *seg = &mcfg->memseg[seg_idx];
754 
755 		if (seg->addr == NULL) {
756 			break;
757 		}
758 
759 		spdk_mem_register(seg->addr, seg->len);
760 	}
761 #endif
762 	return 0;
763 }
764 
765 struct spdk_vtophys_pci_device {
766 	struct rte_pci_device *pci_device;
767 	TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
768 };
769 
770 static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
771 static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
772 	TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);
773 
774 static struct spdk_mem_map *g_vtophys_map;
775 
776 #if SPDK_VFIO_ENABLED
777 static int
778 vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
779 {
780 	struct spdk_vfio_dma_map *dma_map;
781 	int ret;
782 
783 	dma_map = calloc(1, sizeof(*dma_map));
784 	if (dma_map == NULL) {
785 		return -ENOMEM;
786 	}
787 
788 	dma_map->map.argsz = sizeof(dma_map->map);
789 	dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
790 	dma_map->map.vaddr = vaddr;
791 	dma_map->map.iova = iova;
792 	dma_map->map.size = size;
793 
794 	dma_map->unmap.argsz = sizeof(dma_map->unmap);
795 	dma_map->unmap.flags = 0;
796 	dma_map->unmap.iova = iova;
797 	dma_map->unmap.size = size;
798 
799 	pthread_mutex_lock(&g_vfio.mutex);
800 	if (g_vfio.device_ref == 0) {
801 		/* VFIO requires at least one device (IOMMU group) to be added to
802 		 * a VFIO container before it is possible to perform any IOMMU
803 		 * operations on that container. This memory will be mapped once
804 		 * the first device (IOMMU group) is hotplugged.
805 		 *
806 		 * Since the vfio container is managed internally by DPDK, it is
807 		 * also possible that some device is already in that container, but
808 		 * it's not managed by SPDK - e.g. a NIC attached internally
809 		 * inside DPDK. We could map the memory straight away in such a
810 		 * scenario, but there's no need to. DPDK devices clearly
811 		 * don't need our mappings and hence we defer the mapping
812 		 * unconditionally until the first SPDK-managed device is
813 		 * hotplugged.
814 		 */
815 		goto out_insert;
816 	}
817 
818 	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
819 	if (ret) {
820 		DEBUG_PRINT("Cannot set up DMA mapping, error %d\n", errno);
821 		pthread_mutex_unlock(&g_vfio.mutex);
822 		free(dma_map);
823 		return ret;
824 	}
825 
826 out_insert:
827 	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
828 	pthread_mutex_unlock(&g_vfio.mutex);
829 	return 0;
830 }
831 
832 static int
833 vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
834 {
835 	struct spdk_vfio_dma_map *dma_map;
836 	int ret;
837 
838 	pthread_mutex_lock(&g_vfio.mutex);
839 	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
840 		if (dma_map->map.iova == iova) {
841 			break;
842 		}
843 	}
844 
845 	if (dma_map == NULL) {
846 		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
847 		pthread_mutex_unlock(&g_vfio.mutex);
848 		return -ENXIO;
849 	}
850 
851 	/** don't support partial or multiple-page unmap for now */
852 	assert(dma_map->map.size == size);
853 
854 	if (g_vfio.device_ref == 0) {
855 		/* Memory is not mapped anymore, just remove its references */
856 		goto out_remove;
857 	}
858 
859 
860 	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
861 	if (ret) {
862 		DEBUG_PRINT("Cannot clear DMA mapping, error %d\n", errno);
863 		pthread_mutex_unlock(&g_vfio.mutex);
864 		return ret;
865 	}
866 
867 out_remove:
868 	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
869 	pthread_mutex_unlock(&g_vfio.mutex);
870 	free(dma_map);
871 	return 0;
872 }
873 #endif
874 
875 static uint64_t
876 vtophys_get_paddr_memseg(uint64_t vaddr)
877 {
878 	uintptr_t paddr;
879 	struct rte_memseg *seg;
880 
881 #if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
882 	seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
883 	if (seg != NULL) {
884 		paddr = seg->phys_addr;
885 		if (paddr == RTE_BAD_IOVA) {
886 			return SPDK_VTOPHYS_ERROR;
887 		}
888 		paddr += (vaddr - (uintptr_t)seg->addr);
889 		return paddr;
890 	}
891 #else
892 	struct rte_mem_config *mcfg;
893 	uint32_t seg_idx;
894 
895 	mcfg = rte_eal_get_configuration()->mem_config;
896 	for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) {
897 		seg = &mcfg->memseg[seg_idx];
898 		if (seg->addr == NULL) {
899 			break;
900 		}
901 
902 		if (vaddr >= (uintptr_t)seg->addr &&
903 		    vaddr < ((uintptr_t)seg->addr + seg->len)) {
904 			paddr = seg->phys_addr;
905 #if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
906 			if (paddr == RTE_BAD_IOVA) {
907 #else
908 			if (paddr == RTE_BAD_PHYS_ADDR) {
909 #endif
910 				return SPDK_VTOPHYS_ERROR;
911 			}
912 			paddr += (vaddr - (uintptr_t)seg->addr);
913 			return paddr;
914 		}
915 	}
916 #endif
917 
918 	return SPDK_VTOPHYS_ERROR;
919 }
920 
921 /* Try to get the paddr from /proc/self/pagemap */
922 static uint64_t
923 vtophys_get_paddr_pagemap(uint64_t vaddr)
924 {
925 	uintptr_t paddr;
926 
927 #if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
928 #define BAD_ADDR RTE_BAD_IOVA
929 #define VTOPHYS rte_mem_virt2iova
930 #else
931 #define BAD_ADDR RTE_BAD_PHYS_ADDR
932 #define VTOPHYS rte_mem_virt2phy
933 #endif
934 
935 	/*
936 	 * Note: the virt2phy/virt2iova functions have changed over time, such
937 	 * that older versions may return 0 while recent versions will never
938 	 * return 0 but RTE_BAD_PHYS_ADDR/IOVA instead.  To support older and
939 	 * newer versions, check for both return values.
940 	 */
941 	paddr = VTOPHYS((void *)vaddr);
942 	if (paddr == 0 || paddr == BAD_ADDR) {
943 		/*
944 		 * The vaddr may be valid but doesn't have a backing page
945 		 * assigned yet.  Touch the page to ensure a backing page
946 		 * gets assigned, then try to translate again.
947 		 */
948 		rte_atomic64_read((rte_atomic64_t *)vaddr);
949 		paddr = VTOPHYS((void *)vaddr);
950 	}
951 	if (paddr == 0 || paddr == BAD_ADDR) {
952 		/* Unable to get to the physical address. */
953 		return SPDK_VTOPHYS_ERROR;
954 	}
955 
956 #undef BAD_ADDR
957 #undef VTOPHYS
958 
959 	return paddr;
960 }
961 
962 /* Try to get the paddr from pci devices */
963 static uint64_t
964 vtophys_get_paddr_pci(uint64_t vaddr)
965 {
966 	struct spdk_vtophys_pci_device *vtophys_dev;
967 	uintptr_t paddr;
968 	struct rte_pci_device	*dev;
969 	struct rte_mem_resource *res;
970 	unsigned r;
971 
972 	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
973 	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
974 		dev = vtophys_dev->pci_device;
975 
976 		for (r = 0; r < PCI_MAX_RESOURCE; r++) {
977 			res = &dev->mem_resource[r];
978 			if (res->phys_addr && vaddr >= (uint64_t)res->addr &&
979 			    vaddr < (uint64_t)res->addr + res->len) {
980 				paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
981 				DEBUG_PRINT("%s: %p -> %p\n", __func__, (void *)vaddr,
982 					    (void *)paddr);
983 				pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
984 				return paddr;
985 			}
986 		}
987 	}
988 	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
989 
990 	return SPDK_VTOPHYS_ERROR;
991 }
992 
993 static int
994 spdk_vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
995 		    enum spdk_mem_map_notify_action action,
996 		    void *vaddr, size_t len)
997 {
998 	int rc = 0, pci_phys = 0;
999 	uint64_t paddr;
1000 
1001 	if ((uintptr_t)vaddr & ~MASK_256TB) {
1002 		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
1003 		return -EINVAL;
1004 	}
1005 
1006 	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
1007 		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
1008 			    __func__, vaddr, len);
1009 		return -EINVAL;
1010 	}
1011 
1012 	while (len > 0) {
1013 		/* Get the physical address from the DPDK memsegs */
1014 		paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
1015 
1016 		switch (action) {
1017 		case SPDK_MEM_MAP_NOTIFY_REGISTER:
1018 			if (paddr == SPDK_VTOPHYS_ERROR) {
1019 				/* This is not an address that DPDK is managing. */
1020 #if SPDK_VFIO_ENABLED
1021 				if (g_vfio.enabled && !g_vfio.noiommu_enabled) {
1022 					/* We'll use the virtual address as the iova. DPDK
1023 					 * currently uses physical addresses as the iovas (or counts
1024 					 * up from 0 if it can't get physical addresses), so
1025 					 * the range of user space virtual addresses and physical
1026 					 * addresses will never overlap.
1027 					 */
1028 					paddr = (uint64_t)vaddr;
1029 					rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
1030 					if (rc) {
1031 						return -EFAULT;
1032 					}
1033 				} else
1034 #endif
1035 				{
1036 					/* Get the physical address from /proc/self/pagemap. */
1037 					paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
1038 					if (paddr == SPDK_VTOPHYS_ERROR) {
1039 						/* Get the physical address from PCI devices */
1040 						paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
1041 						if (paddr == SPDK_VTOPHYS_ERROR) {
1042 							DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1043 							return -EFAULT;
1044 						}
1045 						pci_phys = 1;
1046 					}
1047 				}
1048 			}
1049 			/* A PCI paddr may break the 2MiB physical alignment, so skip this check for PCI-backed addresses. */
1050 			if (!pci_phys && (paddr & MASK_2MB)) {
1051 				DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
1052 				return -EINVAL;
1053 			}
1054 
1055 			rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1056 			break;
1057 		case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
1058 #if SPDK_VFIO_ENABLED
1059 			if (paddr == SPDK_VTOPHYS_ERROR) {
1060 				/*
1061 				 * This is not an address that DPDK is managing. If vfio is enabled,
1062 				 * we need to unmap the range from the IOMMU
1063 				 */
1064 				if (g_vfio.enabled && !g_vfio.noiommu_enabled) {
1065 					uint64_t buffer_len = VALUE_2MB;
1066 					paddr = spdk_mem_map_translate(map, (uint64_t)vaddr, &buffer_len);
1067 					if (buffer_len != VALUE_2MB) {
1068 						return -EINVAL;
1069 					}
1070 					rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
1071 					if (rc) {
1072 						return -EFAULT;
1073 					}
1074 				}
1075 			}
1076 #endif
1077 			rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
1078 			break;
1079 		default:
1080 			SPDK_UNREACHABLE();
1081 		}
1082 
1083 		if (rc != 0) {
1084 			return rc;
1085 		}
1086 		vaddr += VALUE_2MB;
1087 		len -= VALUE_2MB;
1088 	}
1089 
1090 	return rc;
1091 }
1092 
1093 #if SPDK_VFIO_ENABLED
1094 
1095 static bool
1096 spdk_vfio_enabled(void)
1097 {
1098 #if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
1099 	return rte_vfio_is_enabled("vfio_pci");
1100 #else
1101 	return pci_vfio_is_enabled();
1102 #endif
1103 }
1104 
1105 /* Check if IOMMU is enabled on the system */
1106 static bool
1107 has_iommu_groups(void)
1108 {
1109 	struct dirent *d;
1110 	int count = 0;
1111 	DIR *dir = opendir("/sys/kernel/iommu_groups");
1112 
1113 	if (dir == NULL) {
1114 		return false;
1115 	}
1116 
1117 	while (count < 3 && (d = readdir(dir)) != NULL) {
1118 		count++;
1119 	}
1120 
1121 	closedir(dir);
1122 	/* there will always be ./ and ../ entries */
1123 	return count > 2;
1124 }
1125 
1126 static bool
1127 spdk_vfio_noiommu_enabled(void)
1128 {
1129 	return rte_vfio_noiommu_is_enabled();
1130 }
1131 
1132 static void
1133 spdk_vtophys_iommu_init(void)
1134 {
1135 	char proc_fd_path[PATH_MAX + 1];
1136 	char link_path[PATH_MAX + 1];
1137 	const char vfio_path[] = "/dev/vfio/vfio";
1138 	DIR *dir;
1139 	struct dirent *d;
1140 
1141 	if (!spdk_vfio_enabled()) {
1142 		return;
1143 	}
1144 
1145 	if (spdk_vfio_noiommu_enabled()) {
1146 		g_vfio.noiommu_enabled = true;
1147 	} else if (!has_iommu_groups()) {
1148 		return;
1149 	}
1150 
1151 	dir = opendir("/proc/self/fd");
1152 	if (!dir) {
1153 		DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
1154 		return;
1155 	}
1156 
1157 	while ((d = readdir(dir)) != NULL) {
1158 		if (d->d_type != DT_LNK) {
1159 			continue;
1160 		}
1161 
1162 		snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
1163 		if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
1164 			continue;
1165 		}
1166 
1167 		if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
1168 			sscanf(d->d_name, "%d", &g_vfio.fd);
1169 			break;
1170 		}
1171 	}
1172 
1173 	closedir(dir);
1174 
1175 	if (g_vfio.fd < 0) {
1176 		DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
1177 		return;
1178 	}
1179 
1180 	g_vfio.enabled = true;
1181 
1182 	return;
1183 }
1184 #endif
1185 
1186 void
1187 spdk_vtophys_pci_device_added(struct rte_pci_device *pci_device)
1188 {
1189 	struct spdk_vtophys_pci_device *vtophys_dev;
1190 
1191 	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1192 
1193 	vtophys_dev = calloc(1, sizeof(*vtophys_dev));
1194 	if (vtophys_dev) {
1195 		vtophys_dev->pci_device = pci_device;
1196 		TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
1197 	} else {
1198 		DEBUG_PRINT("Memory allocation error\n");
1199 	}
1200 	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1201 
1202 #if SPDK_VFIO_ENABLED
1203 	struct spdk_vfio_dma_map *dma_map;
1204 	int ret;
1205 
1206 	if (!g_vfio.enabled) {
1207 		return;
1208 	}
1209 
1210 	pthread_mutex_lock(&g_vfio.mutex);
1211 	g_vfio.device_ref++;
1212 	if (g_vfio.device_ref > 1) {
1213 		pthread_mutex_unlock(&g_vfio.mutex);
1214 		return;
1215 	}
1216 
1217 	/* This is the first SPDK device using DPDK vfio. This means that the first
1218 	 * IOMMU group might have just been added to the DPDK vfio container.
1219 	 * From this point it is certain that the memory can be mapped now.
1220 	 */
1221 	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
1222 		ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
1223 		if (ret) {
1224 			DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
1225 			break;
1226 		}
1227 	}
1228 	pthread_mutex_unlock(&g_vfio.mutex);
1229 #endif
1230 }
1231 
1232 void
1233 spdk_vtophys_pci_device_removed(struct rte_pci_device *pci_device)
1234 {
1235 	struct spdk_vtophys_pci_device *vtophys_dev;
1236 
1237 	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1238 	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
1239 		if (vtophys_dev->pci_device == pci_device) {
1240 			TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
1241 			free(vtophys_dev);
1242 			break;
1243 		}
1244 	}
1245 	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1246 
1247 #if SPDK_VFIO_ENABLED
1248 	struct spdk_vfio_dma_map *dma_map;
1249 	int ret;
1250 
1251 	if (!g_vfio.enabled) {
1252 		return;
1253 	}
1254 
1255 	pthread_mutex_lock(&g_vfio.mutex);
1256 	assert(g_vfio.device_ref > 0);
1257 	g_vfio.device_ref--;
1258 	if (g_vfio.device_ref > 0) {
1259 		pthread_mutex_unlock(&g_vfio.mutex);
1260 		return;
1261 	}
1262 
1263 	/* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
1264 	 * any additional devices using its vfio container, all the mappings
1265 	 * will be automatically removed by the Linux vfio driver. We unmap
1266 	 * the memory manually to be able to easily re-map it later regardless
1267 	 * of other, external factors.
1268 	 */
1269 	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
1270 		ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
1271 		if (ret) {
1272 			DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
1273 			break;
1274 		}
1275 	}
1276 	pthread_mutex_unlock(&g_vfio.mutex);
1277 #endif
1278 }
1279 
1280 int
1281 spdk_vtophys_init(void)
1282 {
1283 	const struct spdk_mem_map_ops vtophys_map_ops = {
1284 		.notify_cb = spdk_vtophys_notify,
1285 		.are_contiguous = NULL
1286 	};
1287 
1288 #if SPDK_VFIO_ENABLED
1289 	spdk_vtophys_iommu_init();
1290 #endif
1291 
1292 	g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
1293 	if (g_vtophys_map == NULL) {
1294 		DEBUG_PRINT("vtophys map allocation failed\n");
1295 		return -1;
1296 	}
1297 	return 0;
1298 }
1299 
1300 uint64_t
1301 spdk_vtophys(void *buf, uint64_t *size)
1302 {
1303 	uint64_t vaddr, paddr_2mb;
1304 
1305 	vaddr = (uint64_t)buf;
1306 	paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);
1307 
1308 	/*
1309 	 * SPDK_VTOPHYS_ERROR has all bits set, so a bitwise-or with the buf offset would still
1310 	 * yield SPDK_VTOPHYS_ERROR. However, because PCI vtophys results can be unaligned we
1311 	 * add the offset rather than or-ing it, so the return value must be checked before
1312 	 * the addition.
1313 	 */
1314 	SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
1315 	if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
1316 		return SPDK_VTOPHYS_ERROR;
1317 	} else {
1318 		return paddr_2mb + (vaddr & MASK_2MB);
1319 	}
1320 }
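/*
 * Typical caller pattern (illustrative sketch, not part of this file):
 *
 *   uint64_t remaining = buf_len;    // 'buf_len' is a hypothetical length
 *   uint64_t phys = spdk_vtophys(buf, &remaining);
 *
 *   if (phys == SPDK_VTOPHYS_ERROR) {
 *           // buf is not registered with SPDK or has no physical backing
 *   } else {
 *           // phys is valid for the first 'remaining' bytes of buf
 *   }
 */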
1321 
1322 static int
1323 spdk_bus_scan(void)
1324 {
1325 	return 0;
1326 }
1327 
1328 static int
1329 spdk_bus_probe(void)
1330 {
1331 	return 0;
1332 }
1333 
1334 static struct rte_device *
1335 spdk_bus_find_device(const struct rte_device *start,
1336 		     rte_dev_cmp_t cmp, const void *data)
1337 {
1338 	return NULL;
1339 }
1340 
1341 #if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
1342 static enum rte_iova_mode
1343 spdk_bus_get_iommu_class(void) {
1344 	/* Since we register our PCI drivers after EAL init, we have no chance
1345 	 * of switching into RTE_IOVA_VA (virtual addresses as iova) iommu
1346 	 * class. DPDK uses RTE_IOVA_PA by default because for some platforms
1347 	 * it's the only supported mode, but then SPDK does not support those
1348 	 * platforms and doesn't mind defaulting to RTE_IOVA_VA. The rte_pci bus
1349 	 * will force RTE_IOVA_PA if RTE_IOVA_VA simply cannot be used
1350 	 * (i.e. at least one device on the system is bound to uio_pci_generic),
1351 	 * so we simply return RTE_IOVA_VA here.
1352 	 */
1353 	return RTE_IOVA_VA;
1354 }
1355 #endif
1356 
1357 struct rte_bus spdk_bus = {
1358 	.scan = spdk_bus_scan,
1359 	.probe = spdk_bus_probe,
1360 	.find_device = spdk_bus_find_device,
1361 #if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
1362 	.get_iommu_class = spdk_bus_get_iommu_class,
1363 #endif
1364 };
1365 
1366 RTE_REGISTER_BUS(spdk, spdk_bus);
1367