/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 */

#include "spdk/stdinc.h"

#include "env_internal.h"
#include "pci_dpdk.h"

#include <rte_config.h>
#include <rte_memory.h>
#include <rte_eal_memconfig.h>

#include "spdk_internal/assert.h"

#include "spdk/assert.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/util.h"
#include "spdk/memory.h"
#include "spdk/env_dpdk.h"
#include "spdk/log.h"

#ifndef __linux__
#define VFIO_ENABLED 0
#else
#include <linux/version.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
#define VFIO_ENABLED 1
#include <linux/vfio.h>
#include <rte_vfio.h>

struct spdk_vfio_dma_map {
	struct vfio_iommu_type1_dma_map map;
	TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
};

struct vfio_cfg {
	int fd;
	bool enabled;
	bool noiommu_enabled;
	unsigned device_ref;
	TAILQ_HEAD(, spdk_vfio_dma_map) maps;
	pthread_mutex_t mutex;
};

static struct vfio_cfg g_vfio = {
	.fd = -1,
	.enabled = false,
	.noiommu_enabled = false,
	.device_ref = 0,
	.maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
	.mutex = PTHREAD_MUTEX_INITIALIZER
};

#else
#define VFIO_ENABLED 0
#endif
#endif

#if DEBUG
#define DEBUG_PRINT(...) SPDK_ERRLOG(__VA_ARGS__)
#else
#define DEBUG_PRINT(...)
#endif

#define FN_2MB_TO_4KB(fn)	((fn) << (SHIFT_2MB - SHIFT_4KB))
#define FN_4KB_TO_2MB(fn)	((fn) >> (SHIFT_2MB - SHIFT_4KB))

#define MAP_256TB_IDX(vfn_2mb)	((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
#define MAP_1GB_IDX(vfn_2mb)	((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))

/* Page is registered */
#define REG_MAP_REGISTERED	(1ULL << 62)

/* A notification region barrier. The 2MB translation entry that's marked
 * with this flag must be unregistered separately. This allows contiguous
 * regions to be unregistered in the same chunks they were registered.
 */
#define REG_MAP_NOTIFY_START	(1ULL << 63)

/* Translation of a single 2MB page. */
struct map_2mb {
	uint64_t translation_2mb;
};

/* Second-level map table indexed by bits [21..29] of the virtual address.
 * Each entry contains the address translation or error for entries that haven't
 * been retrieved yet.
 */
struct map_1gb {
	struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
};

/* Top-level map table indexed by bits [30..47] of the virtual address.
 * Each entry points to a second-level map table or NULL.
 */
struct map_256tb {
	struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
};

/* Page-granularity memory address translation */
struct spdk_mem_map {
	struct map_256tb map_256tb;
	pthread_mutex_t mutex;
	uint64_t default_translation;
	struct spdk_mem_map_ops ops;
	void *cb_ctx;
	TAILQ_ENTRY(spdk_mem_map) tailq;
};

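/* Illustrative sketch (not part of the original translation code): how a
 * lookup walks the two-level map described above. The virtual address is
 * first shifted down to a 2 MB frame number, which is then split into the
 * top-level (256 TB) and second-level (1 GB) indexes. The variable names and
 * the example address are hypothetical.
 *
 *	uint64_t vaddr = 0x7f52a3e00000;             // some 2 MB-aligned address
 *	uint64_t vfn_2mb = vaddr >> SHIFT_2MB;       // 2 MB frame number
 *	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb); // bits [30..47] of vaddr
 *	uint64_t idx_1gb = MAP_1GB_IDX(vfn_2mb);     // bits [21..29] of vaddr
 *	struct map_1gb *l2 = map->map_256tb.map[idx_256tb];
 *	uint64_t xlat = l2 ? l2->map[idx_1gb].translation_2mb : map->default_translation;
 */
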
/* Registrations map. The 64 bit translations are bit fields with the
 * following layout (starting with the low bits):
 *    0 - 61 : reserved
 *   62 - 63 : flags
 */
static struct spdk_mem_map *g_mem_reg_map;
static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps =
	TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;

static bool g_legacy_mem;

/*
 * Walk the currently registered memory via the main memory registration map
 * and call the new map's notify callback for each virtually contiguous region.
 */
static int
mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
{
	size_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t contig_start = UINT64_MAX;
	uint64_t contig_end = UINT64_MAX;
	struct map_1gb *map_1gb;
	int rc;

	if (!g_mem_reg_map) {
		return -EINVAL;
	}

	/* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
	pthread_mutex_lock(&g_mem_reg_map->mutex);

	for (idx_256tb = 0;
	     idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
	     idx_256tb++) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_start != UINT64_MAX) {
				/* End of a virtually contiguous range */
				rc = map->ops.notify_cb(map->cb_ctx, map, action,
							(void *)contig_start,
							contig_end - contig_start + VALUE_2MB);
				/* Don't bother handling unregister failures. It can't be any worse */
				if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
					goto err_unregister;
				}
			}
			contig_start = UINT64_MAX;
			continue;
		}

		for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_start == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_start == UINT64_MAX) {
					contig_start = vaddr;
				}

				contig_end = vaddr;
			} else {
				if (contig_start != UINT64_MAX) {
					/* End of a virtually contiguous range */
					rc = map->ops.notify_cb(map->cb_ctx, map, action,
								(void *)contig_start,
								contig_end - contig_start + VALUE_2MB);
					/* Don't bother handling unregister failures. It can't be any worse */
					if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
						goto err_unregister;
					}

					/* This page might be a part of a neighbour region, so process
					 * it again. The idx_1gb will be incremented immediately.
					 */
					idx_1gb--;
				}
				contig_start = UINT64_MAX;
			}
		}
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return 0;

err_unregister:
	/* Unwind to the first empty translation so we don't unregister
	 * a region that just failed to register.
	 */
	idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
	idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
	contig_start = UINT64_MAX;
	contig_end = UINT64_MAX;

	/* Unregister any memory we managed to register before the failure */
	for (; idx_256tb < SIZE_MAX; idx_256tb--) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_end != UINT64_MAX) {
				/* End of a virtually contiguous range */
				map->ops.notify_cb(map->cb_ctx, map,
						   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
						   (void *)contig_start,
						   contig_end - contig_start + VALUE_2MB);
			}
			contig_end = UINT64_MAX;
			continue;
		}

		for (; idx_1gb < UINT64_MAX; idx_1gb--) {
			/* Rebuild the virtual address from the indexes */
			uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {

				if (contig_end == UINT64_MAX) {
					contig_end = vaddr;
				}
				contig_start = vaddr;
			} else {
				if (contig_end != UINT64_MAX) {
					if (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) {
						contig_start = vaddr;
					}
					/* End of a virtually contiguous range */
					map->ops.notify_cb(map->cb_ctx, map,
							   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
							   (void *)contig_start,
							   contig_end - contig_start + VALUE_2MB);
				}
				contig_end = UINT64_MAX;
			}
		}
		idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return rc;
}

struct spdk_mem_map *
spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
{
	struct spdk_mem_map *map;
	int rc;
	size_t i;

	map = calloc(1, sizeof(*map));
	if (map == NULL) {
		return NULL;
	}

	if (pthread_mutex_init(&map->mutex, NULL)) {
		free(map);
		return NULL;
	}

	map->default_translation = default_translation;
	map->cb_ctx = cb_ctx;
	if (ops) {
		map->ops = *ops;
	}

	if (ops && ops->notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		rc = mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			DEBUG_PRINT("Initial mem_map notify failed\n");
			pthread_mutex_destroy(&map->mutex);
			for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
				free(map->map_256tb.map[i]);
			}
			free(map);
			return NULL;
		}
		TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	return map;
}

void
spdk_mem_map_free(struct spdk_mem_map **pmap)
{
	struct spdk_mem_map *map;
	size_t i;

	if (!pmap) {
		return;
	}

	map = *pmap;

	if (!map) {
		return;
	}

	if (map->ops.notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
		TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
		free(map->map_256tb.map[i]);
	}

	pthread_mutex_destroy(&map->mutex);

	free(map);
	*pmap = NULL;
}

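/* Illustrative sketch (assumed, not part of the original file): a consumer
 * typically wires its own notify callback into spdk_mem_map_alloc() so it
 * hears about every registered region, both existing ones (replayed by
 * mem_map_notify_walk()) and future ones. The callback and map below are
 * hypothetical.
 *
 *	static int
 *	my_notify(void *cb_ctx, struct spdk_mem_map *map,
 *		  enum spdk_mem_map_notify_action action, void *vaddr, size_t len)
 *	{
 *		if (action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
 *			return spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, (uint64_t)vaddr);
 *		}
 *		return spdk_mem_map_clear_translation(map, (uint64_t)vaddr, len);
 *	}
 *
 *	static const struct spdk_mem_map_ops my_ops = {
 *		.notify_cb = my_notify,
 *		.are_contiguous = NULL,
 *	};
 *
 *	struct spdk_mem_map *my_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &my_ops, NULL);
 */
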
int
spdk_mem_register(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	seg_vaddr = vaddr;
	seg_len = 0;
	while (len > 0) {
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
					     seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
		seg_len += VALUE_2MB;
		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return rc;
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

int
spdk_mem_unregister(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg, newreg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* The first page must be a start of a region. Also check if it's
	 * registered to make sure we don't return -ERANGE for non-registered
	 * regions.
	 */
	reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
	if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if ((reg & REG_MAP_REGISTERED) == 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EINVAL;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
	/* If the next page is registered, it must be a start of a region as well,
	 * otherwise we'd be unregistering only a part of a region.
	 */
	if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}
	seg_vaddr = vaddr;
	seg_len = 0;

	while (len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);

		if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
			TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
				rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
				if (rc != 0) {
					pthread_mutex_unlock(&g_spdk_mem_map_mutex);
					return rc;
				}
			}

			seg_vaddr = vaddr;
			seg_len = VALUE_2MB;
		} else {
			seg_len += VALUE_2MB;
		}

		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	if (seg_len > 0) {
		TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
			rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
			if (rc != 0) {
				pthread_mutex_unlock(&g_spdk_mem_map_mutex);
				return rc;
			}
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

int
spdk_mem_reserve(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* Check if any part of this range is already registered */
	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	/* Simply set the translation to the memory map's default. This allocates the space in the
	 * map but does not provide a valid translation. */
	spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, len,
				     g_mem_reg_map->default_translation);

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, map->default_translation);
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

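/* Illustrative sketch (assumed, not part of the original file): registering a
 * 2 MB-aligned buffer so that every map in g_spdk_mem_maps gets a REGISTER
 * notification, and later unregistering it in the same chunk. The allocation
 * below is hypothetical; in a real application this would typically be
 * pinned, hugepage-backed memory, and only 2 MB-aligned address/length pairs
 * are accepted.
 *
 *	size_t len = 4 * VALUE_2MB;
 *	void *buf = NULL;
 *
 *	if (posix_memalign(&buf, VALUE_2MB, len) == 0) {
 *		if (spdk_mem_register(buf, len) == 0) {
 *			// ... use buf for DMA-capable I/O ...
 *			spdk_mem_unregister(buf, len);
 *		}
 *		free(buf);
 *	}
 */
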
static struct map_1gb *
mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
{
	struct map_1gb *map_1gb;
	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
	size_t i;

	if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
		return NULL;
	}

	map_1gb = map->map_256tb.map[idx_256tb];

	if (!map_1gb) {
		pthread_mutex_lock(&map->mutex);

		/* Recheck to make sure nobody else got the mutex first. */
		map_1gb = map->map_256tb.map[idx_256tb];
		if (!map_1gb) {
			map_1gb = malloc(sizeof(struct map_1gb));
			if (map_1gb) {
				/* initialize all entries to default translation */
				for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
					map_1gb->map[i].translation_2mb = map->default_translation;
				}
				map->map_256tb.map[idx_256tb] = map_1gb;
			}
		}

		pthread_mutex_unlock(&map->mutex);

		if (!map_1gb) {
			DEBUG_PRINT("allocation failed\n");
			return NULL;
		}
	}

	return map_1gb;
}

int
spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
			     uint64_t translation)
{
	uint64_t vfn_2mb;
	struct map_1gb *map_1gb;
	uint64_t idx_1gb;
	struct map_2mb *map_2mb;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %" PRIu64 "\n", vaddr);
		return -EINVAL;
	}

	/* For now, only 2 MB-aligned registrations are supported */
	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%" PRIu64 " len=%" PRIu64 "\n",
			    __func__, vaddr, size);
		return -EINVAL;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;

	while (size) {
		map_1gb = mem_map_get_map_1gb(map, vfn_2mb);
		if (!map_1gb) {
			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
			return -ENOMEM;
		}

		idx_1gb = MAP_1GB_IDX(vfn_2mb);
		map_2mb = &map_1gb->map[idx_1gb];
		map_2mb->translation_2mb = translation;

		size -= VALUE_2MB;
		vfn_2mb++;
	}

	return 0;
}

int
spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
{
	return spdk_mem_map_set_translation(map, vaddr, size, map->default_translation);
}

inline uint64_t
spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
{
	const struct map_1gb *map_1gb;
	const struct map_2mb *map_2mb;
	uint64_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t vfn_2mb;
	uint64_t cur_size;
	uint64_t prev_translation;
	uint64_t orig_translation;

	if (spdk_unlikely(vaddr & ~MASK_256TB)) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
		return map->default_translation;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;
	idx_256tb = MAP_256TB_IDX(vfn_2mb);
	idx_1gb = MAP_1GB_IDX(vfn_2mb);

	map_1gb = map->map_256tb.map[idx_256tb];
	if (spdk_unlikely(!map_1gb)) {
		return map->default_translation;
	}

	cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
	map_2mb = &map_1gb->map[idx_1gb];
	if (size == NULL || map->ops.are_contiguous == NULL ||
	    map_2mb->translation_2mb == map->default_translation) {
		if (size != NULL) {
			*size = spdk_min(*size, cur_size);
		}
		return map_2mb->translation_2mb;
	}

	orig_translation = map_2mb->translation_2mb;
	prev_translation = orig_translation;
	while (cur_size < *size) {
		vfn_2mb++;
		idx_256tb = MAP_256TB_IDX(vfn_2mb);
		idx_1gb = MAP_1GB_IDX(vfn_2mb);

		map_1gb = map->map_256tb.map[idx_256tb];
		if (spdk_unlikely(!map_1gb)) {
			break;
		}

		map_2mb = &map_1gb->map[idx_1gb];
		if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
			break;
		}

		cur_size += VALUE_2MB;
		prev_translation = map_2mb->translation_2mb;
	}

	*size = spdk_min(*size, cur_size);
	return orig_translation;
}

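/* Illustrative sketch (assumed, not part of the original file): storing and
 * reading back a translation. Both calls operate on whole 2 MB chunks, so the
 * address and size must be 2 MB aligned, and the same translation value is
 * written into every chunk of the range. The values below are hypothetical.
 *
 *	uint64_t vaddr = 0x200000000;
 *	spdk_mem_map_set_translation(map, vaddr, 2 * VALUE_2MB, 0xdead000000);
 *
 *	uint64_t xlat = spdk_mem_map_translate(map, vaddr + VALUE_2MB, NULL);
 *	// xlat == 0xdead000000: the same translation was written to both chunks
 *
 *	spdk_mem_map_clear_translation(map, vaddr, 2 * VALUE_2MB);
 */
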
static void
memory_hotplug_cb(enum rte_mem_event event_type,
		  const void *addr, size_t len, void *arg)
{
	if (event_type == RTE_MEM_EVENT_ALLOC) {
		spdk_mem_register((void *)addr, len);

		if (!spdk_env_dpdk_external_init()) {
			return;
		}

		/* When the user initialized DPDK separately, we can't
		 * be sure that the --match-allocations RTE flag was specified.
		 * Without this flag, DPDK can free memory in different units
		 * than it was allocated. It doesn't work with things like RDMA MRs.
		 *
		 * For such cases, we mark segments so they aren't freed.
		 */
		while (len > 0) {
			struct rte_memseg *seg;

			seg = rte_mem_virt2memseg(addr, NULL);
			assert(seg != NULL);
			seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
			addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
			len -= seg->hugepage_sz;
		}
	} else if (event_type == RTE_MEM_EVENT_FREE) {
		spdk_mem_unregister((void *)addr, len);
	}
}

static int
memory_iter_cb(const struct rte_memseg_list *msl,
	       const struct rte_memseg *ms, size_t len, void *arg)
{
	return spdk_mem_register(ms->addr, len);
}

int
mem_map_init(bool legacy_mem)
{
	g_legacy_mem = legacy_mem;

	g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
	if (g_mem_reg_map == NULL) {
		DEBUG_PRINT("memory registration map allocation failed\n");
		return -ENOMEM;
	}

	/*
	 * Walk all DPDK memory segments and register them
	 * with the main memory map
	 */
	rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
	rte_memseg_contig_walk(memory_iter_cb, NULL);
	return 0;
}

bool
spdk_iommu_is_enabled(void)
{
#if VFIO_ENABLED
	return g_vfio.enabled && !g_vfio.noiommu_enabled;
#else
	return false;
#endif
}

struct spdk_vtophys_pci_device {
	struct rte_pci_device *pci_device;
	TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
};

static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
	TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);

static struct spdk_mem_map *g_vtophys_map;
static struct spdk_mem_map *g_phys_ref_map;

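/* Illustrative sketch (assumed, not part of the original file): g_phys_ref_map
 * is used below as a per-IOVA reference counter rather than as a real address
 * translation map. The "translation" stored for an IOVA range is simply how
 * many times that range has been mapped through the IOMMU:
 *
 *	uint64_t refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
 *	spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
 *
 * Only the transition from 0 to 1 issues VFIO_IOMMU_MAP_DMA, and only the
 * transition back to 0 issues VFIO_IOMMU_UNMAP_DMA.
 */
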
#if VFIO_ENABLED
static int
vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	uint64_t refcount;
	int ret;

	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
	assert(refcount < UINT64_MAX);
	if (refcount > 0) {
		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
		return 0;
	}

	dma_map = calloc(1, sizeof(*dma_map));
	if (dma_map == NULL) {
		return -ENOMEM;
	}

	dma_map->map.argsz = sizeof(dma_map->map);
	dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	dma_map->map.vaddr = vaddr;
	dma_map->map.iova = iova;
	dma_map->map.size = size;

	pthread_mutex_lock(&g_vfio.mutex);
	if (g_vfio.device_ref == 0) {
		/* VFIO requires at least one device (IOMMU group) to be added to
		 * a VFIO container before it is possible to perform any IOMMU
		 * operations on that container. This memory will be mapped once
		 * the first device (IOMMU group) is hotplugged.
		 *
		 * Since the vfio container is managed internally by DPDK, it is
		 * also possible that some device is already in that container, but
		 * it's not managed by SPDK - e.g. a NIC attached internally
		 * inside DPDK. We could map the memory straight away in such a
		 * scenario, but there's no need to do it. DPDK devices clearly
		 * don't need our mappings and hence we defer the mapping
		 * unconditionally until the first SPDK-managed device is
		 * hotplugged.
		 */
		goto out_insert;
	}

	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
	if (ret) {
		/* In some cases the vfio container doesn't have an IOMMU group; it's safe to ignore the error then. */
		SPDK_NOTICELOG("Cannot set up DMA mapping, error %d, ignored\n", errno);
	}

out_insert:
	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
	pthread_mutex_unlock(&g_vfio.mutex);
	spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
	return 0;
}

static int
vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	uint64_t refcount;
	int ret;
	struct vfio_iommu_type1_dma_unmap unmap = {};

	pthread_mutex_lock(&g_vfio.mutex);
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		if (dma_map->map.iova == iova) {
			break;
		}
	}

	if (dma_map == NULL) {
		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
		pthread_mutex_unlock(&g_vfio.mutex);
		return -ENXIO;
	}

	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
	assert(refcount < UINT64_MAX);
	if (refcount > 0) {
		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount - 1);
	}

	/* We still have outstanding references, don't clear it. */
	if (refcount > 1) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return 0;
	}

	/* We don't support partial or multiple-page unmap for now */
	assert(dma_map->map.size == size);

	if (g_vfio.device_ref == 0) {
		/* Memory is not mapped anymore, just remove its references */
		goto out_remove;
	}

	unmap.argsz = sizeof(unmap);
	unmap.flags = 0;
	unmap.iova = dma_map->map.iova;
	unmap.size = dma_map->map.size;
	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
	if (ret) {
		SPDK_NOTICELOG("Cannot clear DMA mapping, error %d, ignored\n", errno);
	}

out_remove:
	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
	pthread_mutex_unlock(&g_vfio.mutex);
	free(dma_map);
	return 0;
}
#endif

static uint64_t
vtophys_get_paddr_memseg(uint64_t vaddr)
{
	uintptr_t paddr;
	struct rte_memseg *seg;

	seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
	if (seg != NULL) {
		paddr = seg->iova;
		if (paddr == RTE_BAD_IOVA) {
			return SPDK_VTOPHYS_ERROR;
		}
		paddr += (vaddr - (uintptr_t)seg->addr);
		return paddr;
	}

	return SPDK_VTOPHYS_ERROR;
}

/* Try to get the paddr from /proc/self/pagemap */
static uint64_t
vtophys_get_paddr_pagemap(uint64_t vaddr)
{
	uintptr_t paddr;

	/* Silence static analyzers */
	assert(vaddr != 0);
	paddr = rte_mem_virt2iova((void *)vaddr);
	if (paddr == RTE_BAD_IOVA) {
		/*
		 * The vaddr may be valid but doesn't have a backing page
		 * assigned yet. Touch the page to ensure a backing page
		 * gets assigned, then try to translate again.
		 */
		rte_atomic64_read((rte_atomic64_t *)vaddr);
		paddr = rte_mem_virt2iova((void *)vaddr);
	}
	if (paddr == RTE_BAD_IOVA) {
		/* Unable to get to the physical address. */
		return SPDK_VTOPHYS_ERROR;
	}

	return paddr;
}

/* Try to get the paddr from pci devices */
static uint64_t
vtophys_get_paddr_pci(uint64_t vaddr)
{
	struct spdk_vtophys_pci_device *vtophys_dev;
	uintptr_t paddr;
	struct rte_pci_device *dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		dev = vtophys_dev->pci_device;
		paddr = dpdk_pci_device_vtophys(dev, vaddr);
		if (paddr != SPDK_VTOPHYS_ERROR) {
			pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
			return paddr;
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

	return SPDK_VTOPHYS_ERROR;
}

static int
vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
	       enum spdk_mem_map_notify_action action,
	       void *vaddr, size_t len)
{
	int rc = 0, pci_phys = 0;
	uint64_t paddr;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid parameters, vaddr=%p len=%ju\n",
			    vaddr, len);
		return -EINVAL;
	}

	/* Get the physical address from the DPDK memsegs */
	paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);

	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		if (paddr == SPDK_VTOPHYS_ERROR) {
			/* This is not an address that DPDK is managing. */
#if VFIO_ENABLED
			enum rte_iova_mode iova_mode;

			iova_mode = rte_eal_iova_mode();

			if (spdk_iommu_is_enabled() && iova_mode == RTE_IOVA_VA) {
				/* We'll use the virtual address as the iova to match DPDK. */
				paddr = (uint64_t)vaddr;
				rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len);
				if (rc) {
					return -EFAULT;
				}
				while (len > 0) {
					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}
					vaddr += VALUE_2MB;
					paddr += VALUE_2MB;
					len -= VALUE_2MB;
				}
			} else
#endif
			{
				/* Get the physical address from /proc/self/pagemap. */
				paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
				if (paddr == SPDK_VTOPHYS_ERROR) {
					/* Get the physical address from PCI devices */
					paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}
					/* The beginning of this address range points to a PCI resource,
					 * so the rest must point to a PCI resource as well.
					 */
					pci_phys = 1;
				}

				/* Get paddr for each 2MB chunk in this address range */
				while (len > 0) {
					/* Get the physical address from /proc/self/pagemap. */
					if (pci_phys) {
						paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
					} else {
						paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
					}

					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}

					/* A PCI paddr can break the 2MiB physical alignment, so skip this check in that case. */
					if (!pci_phys && (paddr & MASK_2MB)) {
						DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
						return -EINVAL;
					}
#if VFIO_ENABLED
					/* If the IOMMU is on, but DPDK is using iova-mode=pa, we want to register this memory
					 * with the IOMMU using the physical address to match.
					 */
					if (spdk_iommu_is_enabled()) {
						rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
						if (rc) {
							DEBUG_PRINT("Unable to assign vaddr %p to paddr 0x%" PRIx64 "\n", vaddr, paddr);
							return -EFAULT;
						}
					}
#endif

					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}

					vaddr += VALUE_2MB;
					len -= VALUE_2MB;
				}
			}
		} else {
			/* This is an address managed by DPDK. Just setup the translations. */
			while (len > 0) {
				paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
				if (paddr == SPDK_VTOPHYS_ERROR) {
					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
					return -EFAULT;
				}

				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
				if (rc != 0) {
					return rc;
				}

				vaddr += VALUE_2MB;
				len -= VALUE_2MB;
			}
		}

		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
#if VFIO_ENABLED
		if (paddr == SPDK_VTOPHYS_ERROR) {
			/*
			 * This is not an address that DPDK is managing. If vfio is enabled,
			 * we need to unmap the range from the IOMMU
			 */
			if (spdk_iommu_is_enabled()) {
				uint64_t buffer_len = len;
				uint8_t *va = vaddr;
				enum rte_iova_mode iova_mode;

				iova_mode = rte_eal_iova_mode();
				/*
				 * In virtual address mode, the region is contiguous and can be done in
				 * one unmap.
				 */
				if (iova_mode == RTE_IOVA_VA) {
					paddr = spdk_mem_map_translate(map, (uint64_t)va, &buffer_len);
					if (buffer_len != len || paddr != (uintptr_t)va) {
						DEBUG_PRINT("Unmapping %p with length %lu failed because "
							    "translation had address 0x%" PRIx64 " and length %lu\n",
							    va, len, paddr, buffer_len);
						return -EINVAL;
					}
					rc = vtophys_iommu_unmap_dma(paddr, len);
					if (rc) {
						DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
						return -EFAULT;
					}
				} else if (iova_mode == RTE_IOVA_PA) {
					/* Get paddr for each 2MB chunk in this address range */
					while (buffer_len > 0) {
						paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);

						if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
							DEBUG_PRINT("could not get phys addr for %p\n", va);
							return -EFAULT;
						}

						rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
						if (rc) {
							DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
							return -EFAULT;
						}

						va += VALUE_2MB;
						buffer_len -= VALUE_2MB;
					}
				}
			}
		}
#endif
		while (len > 0) {
			rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
			if (rc != 0) {
				return rc;
			}

			vaddr += VALUE_2MB;
			len -= VALUE_2MB;
		}

		break;
	default:
		SPDK_UNREACHABLE();
	}

	return rc;
}

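/* Illustrative sketch (assumed, not part of the original file): the
 * are_contiguous callback below is what lets spdk_mem_map_translate() merge
 * adjacent 2 MB chunks. If two neighbouring chunks store physical addresses
 * that are exactly VALUE_2MB apart, a lookup asking for more than 2 MB can be
 * satisfied without splitting. The values are hypothetical.
 *
 *	// chunk 0 -> 0x40000000, chunk 1 -> 0x40200000 (2 MB apart)
 *	uint64_t len = 2 * VALUE_2MB;
 *	uint64_t paddr = spdk_mem_map_translate(g_vtophys_map, vaddr, &len);
 *	// len stays 4 MB because vtophys_check_contiguous_entries() returned true
 */
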
static int
vtophys_check_contiguous_entries(uint64_t paddr1, uint64_t paddr2)
{
	/* This function is always called with paddrs for two subsequent
	 * 2MB chunks in virtual address space, so those chunks will only be
	 * physically contiguous if the physical addresses are 2MB apart
	 * from each other as well.
	 */
	return (paddr2 - paddr1 == VALUE_2MB);
}

#if VFIO_ENABLED

static bool
vfio_enabled(void)
{
	return rte_vfio_is_enabled("vfio_pci");
}

/* Check if IOMMU is enabled on the system */
static bool
has_iommu_groups(void)
{
	int count = 0;
	DIR *dir = opendir("/sys/kernel/iommu_groups");

	if (dir == NULL) {
		return false;
	}

	while (count < 3 && readdir(dir) != NULL) {
		count++;
	}

	closedir(dir);
	/* there will always be ./ and ../ entries */
	return count > 2;
}

static bool
vfio_noiommu_enabled(void)
{
	return rte_vfio_noiommu_is_enabled();
}

static void
vtophys_iommu_init(void)
{
	char proc_fd_path[PATH_MAX + 1];
	char link_path[PATH_MAX + 1];
	const char vfio_path[] = "/dev/vfio/vfio";
	DIR *dir;
	struct dirent *d;

	if (!vfio_enabled()) {
		return;
	}

	if (vfio_noiommu_enabled()) {
		g_vfio.noiommu_enabled = true;
	} else if (!has_iommu_groups()) {
		return;
	}

	dir = opendir("/proc/self/fd");
	if (!dir) {
		DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
		return;
	}

	while ((d = readdir(dir)) != NULL) {
		if (d->d_type != DT_LNK) {
			continue;
		}

		snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
		if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
			continue;
		}

		if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
			sscanf(d->d_name, "%d", &g_vfio.fd);
			break;
		}
	}

	closedir(dir);

	if (g_vfio.fd < 0) {
		DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
		return;
	}

	g_vfio.enabled = true;

	return;
}

#endif

void
vtophys_pci_device_added(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);

	vtophys_dev = calloc(1, sizeof(*vtophys_dev));
	if (vtophys_dev) {
		vtophys_dev->pci_device = pci_device;
		TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
	} else {
		DEBUG_PRINT("Memory allocation error\n");
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	g_vfio.device_ref++;
	if (g_vfio.device_ref > 1) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the first SPDK device using DPDK vfio. This means that the first
	 * IOMMU group might have just been added to the DPDK vfio container.
	 * From this point it is certain that the memory can be mapped now.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
		if (ret) {
			DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}

void
vtophys_pci_device_removed(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		if (vtophys_dev->pci_device == pci_device) {
			TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
			free(vtophys_dev);
			break;
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	assert(g_vfio.device_ref > 0);
	g_vfio.device_ref--;
	if (g_vfio.device_ref > 0) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
	 * any additional devices using its vfio container, all the mappings
	 * will be automatically removed by the Linux vfio driver. We unmap
	 * the memory manually to be able to easily re-map it later regardless
	 * of other, external factors.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		struct vfio_iommu_type1_dma_unmap unmap = {};
		unmap.argsz = sizeof(unmap);
		unmap.flags = 0;
		unmap.iova = dma_map->map.iova;
		unmap.size = dma_map->map.size;
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
		if (ret) {
			DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}

int
vtophys_init(void)
{
	const struct spdk_mem_map_ops vtophys_map_ops = {
		.notify_cb = vtophys_notify,
		.are_contiguous = vtophys_check_contiguous_entries,
	};

	const struct spdk_mem_map_ops phys_ref_map_ops = {
		.notify_cb = NULL,
		.are_contiguous = NULL,
	};

#if VFIO_ENABLED
	vtophys_iommu_init();
#endif

	g_phys_ref_map = spdk_mem_map_alloc(0, &phys_ref_map_ops, NULL);
	if (g_phys_ref_map == NULL) {
		DEBUG_PRINT("phys_ref map allocation failed.\n");
		return -ENOMEM;
	}

	g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
	if (g_vtophys_map == NULL) {
		DEBUG_PRINT("vtophys map allocation failed\n");
		spdk_mem_map_free(&g_phys_ref_map);
		return -ENOMEM;
	}
	return 0;
}

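/* Illustrative sketch (assumed, not part of the original file): typical use of
 * the vtophys map built above. A caller translates a registered buffer and
 * checks for SPDK_VTOPHYS_ERROR before using the result; the optional size
 * output tells it how many bytes the returned physical address covers
 * contiguously. The buffer and lengths below are hypothetical.
 *
 *	uint64_t mapping_len = io_len;
 *	uint64_t phys = spdk_vtophys(io_buf, &mapping_len);
 *
 *	if (phys == SPDK_VTOPHYS_ERROR) {
 *		// buffer was never registered (or lost its translation)
 *	} else if (mapping_len < io_len) {
 *		// physically fragmented: split the I/O or translate the remainder separately
 *	}
 */
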
uint64_t
spdk_vtophys(const void *buf, uint64_t *size)
{
	uint64_t vaddr, paddr_2mb;

	vaddr = (uint64_t)buf;
	paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);

	/*
	 * SPDK_VTOPHYS_ERROR has all bits set. When the buf offset was bitwise-or'd in,
	 * a failed lookup would still produce SPDK_VTOPHYS_ERROR. Now that we add the
	 * offset instead (PCI vtophys results may not be 2MB aligned), we must check
	 * the return value before the addition.
	 */
	SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
	if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
		return SPDK_VTOPHYS_ERROR;
	} else {
		return paddr_2mb + (vaddr & MASK_2MB);
	}
}

int
spdk_mem_get_fd_and_offset(void *vaddr, uint64_t *offset)
{
	struct rte_memseg *seg;
	int ret, fd;

	seg = rte_mem_virt2memseg(vaddr, NULL);
	if (!seg) {
		SPDK_ERRLOG("memory %p doesn't exist\n", vaddr);
		return -ENOENT;
	}

	fd = rte_memseg_get_fd_thread_unsafe(seg);
	if (fd < 0) {
		return fd;
	}

	ret = rte_memseg_get_fd_offset_thread_unsafe(seg, offset);
	if (ret < 0) {
		return ret;
	}

	return fd;
}
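
/* Illustrative sketch (assumed, not part of the original file): the fd/offset
 * pair returned above identifies the hugepage file backing a DPDK memseg, so a
 * caller can re-map the same memory or share it with another process. The
 * variables below are hypothetical.
 *
 *	uint64_t offset = 0;
 *	int fd = spdk_mem_get_fd_and_offset(buf, &offset);
 *
 *	if (fd >= 0) {
 *		void *clone = mmap(NULL, VALUE_2MB, PROT_READ | PROT_WRITE,
 *				   MAP_SHARED, fd, offset);
 *		// clone now aliases the same physical memory as buf
 *	}
 */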