/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) Intel Corporation.
 * All rights reserved.
 */

#include "spdk/stdinc.h"

#include "env_internal.h"

#include <rte_config.h>
#include <rte_memory.h>
#include <rte_eal_memconfig.h>

#include "spdk_internal/assert.h"

#include "spdk/assert.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/util.h"
#include "spdk/memory.h"
#include "spdk/env_dpdk.h"
#include "spdk/log.h"

#ifndef __linux__
#define VFIO_ENABLED 0
#else
#include <linux/version.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
#define VFIO_ENABLED 1
#include <linux/vfio.h>
#include <rte_vfio.h>

struct spdk_vfio_dma_map {
	struct vfio_iommu_type1_dma_map map;
	TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
};

struct vfio_cfg {
	int fd;
	bool enabled;
	bool noiommu_enabled;
	unsigned device_ref;
	TAILQ_HEAD(, spdk_vfio_dma_map) maps;
	pthread_mutex_t mutex;
};

static struct vfio_cfg g_vfio = {
	.fd = -1,
	.enabled = false,
	.noiommu_enabled = false,
	.device_ref = 0,
	.maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
	.mutex = PTHREAD_MUTEX_INITIALIZER
};

#else
#define VFIO_ENABLED 0
#endif
#endif

#if DEBUG
#define DEBUG_PRINT(...) SPDK_ERRLOG(__VA_ARGS__)
#else
#define DEBUG_PRINT(...)
#endif

#define FN_2MB_TO_4KB(fn)	(fn << (SHIFT_2MB - SHIFT_4KB))
#define FN_4KB_TO_2MB(fn)	(fn >> (SHIFT_2MB - SHIFT_4KB))

#define MAP_256TB_IDX(vfn_2mb)	((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
#define MAP_1GB_IDX(vfn_2mb)	((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))

/* Page is registered */
#define REG_MAP_REGISTERED	(1ULL << 62)

/* A notification region barrier. The 2MB translation entry that's marked
 * with this flag must be unregistered separately. This allows contiguous
 * regions to be unregistered in the same chunks they were registered.
 */
#define REG_MAP_NOTIFY_START	(1ULL << 63)

/* Translation of a single 2MB page. */
struct map_2mb {
	uint64_t translation_2mb;
};

/* Second-level map table indexed by bits [21..29] of the virtual address.
 * Each entry contains the address translation or error for entries that haven't
 * been retrieved yet.
 */
struct map_1gb {
	struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
};

/* Top-level map table indexed by bits [30..47] of the virtual address.
 * Each entry points to a second-level map table or NULL.
 */
struct map_256tb {
	struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
};

/* Page-granularity memory address translation */
struct spdk_mem_map {
	struct map_256tb map_256tb;
	pthread_mutex_t mutex;
	uint64_t default_translation;
	struct spdk_mem_map_ops ops;
	void *cb_ctx;
	TAILQ_ENTRY(spdk_mem_map) tailq;
};

/* Registrations map. The 64 bit translations are bit fields with the
 * following layout (starting with the low bits):
 *    0 - 61 : reserved
 *   62 - 63 : flags
 */
static struct spdk_mem_map *g_mem_reg_map;
static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps =
	TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;

static bool g_legacy_mem;
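/*
 * Illustrative sketch (not from the original sources): how a 2 MiB-aligned
 * virtual address decomposes into the two lookup indexes defined above.
 * For example, vaddr = 0x40200000 (1 GiB + 2 MiB):
 *
 *   vfn_2mb   = vaddr >> SHIFT_2MB     = 0x201
 *   idx_256tb = MAP_256TB_IDX(vfn_2mb) = 0x201 >> 9    = 1
 *   idx_1gb   = MAP_1GB_IDX(vfn_2mb)   = 0x201 & 0x1ff = 1
 *
 * so its translation lives in map_256tb.map[1]->map[1].translation_2mb.
 */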
/*
 * Walk the currently registered memory via the main memory registration map
 * and call the new map's notify callback for each virtually contiguous region.
 */
static int
mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
{
	size_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t contig_start = UINT64_MAX;
	uint64_t contig_end = UINT64_MAX;
	struct map_1gb *map_1gb;
	int rc;

	if (!g_mem_reg_map) {
		return -EINVAL;
	}

	/* Hold the memory registration map mutex so no new registrations
	 * can be added while we are looping.
	 */
	pthread_mutex_lock(&g_mem_reg_map->mutex);

	for (idx_256tb = 0;
	     idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
	     idx_256tb++) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_start != UINT64_MAX) {
				/* End of a virtually contiguous range */
				rc = map->ops.notify_cb(map->cb_ctx, map, action,
							(void *)contig_start,
							contig_end - contig_start + VALUE_2MB);
				/* Don't bother handling unregister failures. It can't be any worse */
				if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
					goto err_unregister;
				}
			}
			contig_start = UINT64_MAX;
			continue;
		}

		for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_start == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_start == UINT64_MAX) {
					contig_start = vaddr;
				}

				contig_end = vaddr;
			} else {
				if (contig_start != UINT64_MAX) {
					/* End of a virtually contiguous range */
					rc = map->ops.notify_cb(map->cb_ctx, map, action,
								(void *)contig_start,
								contig_end - contig_start + VALUE_2MB);
					/* Don't bother handling unregister failures. It can't be any worse */
					if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
						goto err_unregister;
					}

					/* This page might be a part of a neighbour region, so process
					 * it again. The idx_1gb will be incremented immediately.
					 */
					idx_1gb--;
				}
				contig_start = UINT64_MAX;
			}
		}
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return 0;

err_unregister:
	/* Unwind to the first empty translation so we don't unregister
	 * a region that just failed to register.
	 */
	idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
	idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
	contig_start = UINT64_MAX;
	contig_end = UINT64_MAX;

	/* Unregister any memory we managed to register before the failure */
	for (; idx_256tb < SIZE_MAX; idx_256tb--) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_end != UINT64_MAX) {
				/* End of a virtually contiguous range */
				map->ops.notify_cb(map->cb_ctx, map,
						   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
						   (void *)contig_start,
						   contig_end - contig_start + VALUE_2MB);
			}
			contig_end = UINT64_MAX;
			continue;
		}

		for (; idx_1gb < UINT64_MAX; idx_1gb--) {
			/* Rebuild the virtual address from the indexes */
			uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {

				if (contig_end == UINT64_MAX) {
					contig_end = vaddr;
				}
				contig_start = vaddr;
			} else {
				if (contig_end != UINT64_MAX) {
					if (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) {
						contig_start = vaddr;
					}
					/* End of a virtually contiguous range */
					map->ops.notify_cb(map->cb_ctx, map,
							   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
							   (void *)contig_start,
							   contig_end - contig_start + VALUE_2MB);
				}
				contig_end = UINT64_MAX;
			}
		}
		idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return rc;
}

struct spdk_mem_map *
spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
{
	struct spdk_mem_map *map;
	int rc;
	size_t i;

	map = calloc(1, sizeof(*map));
	if (map == NULL) {
		return NULL;
	}

	if (pthread_mutex_init(&map->mutex, NULL)) {
		free(map);
		return NULL;
	}

	map->default_translation = default_translation;
	map->cb_ctx = cb_ctx;
	if (ops) {
		map->ops = *ops;
	}

	if (ops && ops->notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		rc = mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			DEBUG_PRINT("Initial mem_map notify failed\n");
			pthread_mutex_destroy(&map->mutex);
			for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
				free(map->map_256tb.map[i]);
			}
			free(map);
			return NULL;
		}
		TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	return map;
}

void
spdk_mem_map_free(struct spdk_mem_map **pmap)
{
	struct spdk_mem_map *map;
	size_t i;

	if (!pmap) {
		return;
	}

	map = *pmap;

	if (!map) {
		return;
	}

	if (map->ops.notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
		TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
		free(map->map_256tb.map[i]);
	}

	pthread_mutex_destroy(&map->mutex);

	free(map);
	*pmap = NULL;
}
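/*
 * Minimal usage sketch (illustrative, not part of the original file): creating a
 * secondary map that mirrors registrations through a notify callback. The names
 * my_notify, my_ops and my_map are hypothetical.
 *
 *   static int
 *   my_notify(void *cb_ctx, struct spdk_mem_map *map,
 *             enum spdk_mem_map_notify_action action, void *vaddr, size_t len)
 *   {
 *           if (action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
 *                   // e.g. store a driver-specific handle for this region
 *                   return spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, 0x1234);
 *           }
 *           return spdk_mem_map_clear_translation(map, (uint64_t)vaddr, len);
 *   }
 *
 *   static const struct spdk_mem_map_ops my_ops = {
 *           .notify_cb = my_notify,
 *           .are_contiguous = NULL,
 *   };
 *
 *   struct spdk_mem_map *my_map = spdk_mem_map_alloc(UINT64_MAX, &my_ops, NULL);
 *   ...
 *   spdk_mem_map_free(&my_map);
 */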
int
spdk_mem_register(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	seg_vaddr = vaddr;
	seg_len = 0;
	while (len > 0) {
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
					     seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
		seg_len += VALUE_2MB;
		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return rc;
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

int
spdk_mem_unregister(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg, newreg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* The first page must be a start of a region. Also check if it's
	 * registered to make sure we don't return -ERANGE for non-registered
	 * regions.
	 */
	reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
	if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if ((reg & REG_MAP_REGISTERED) == 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EINVAL;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
	/* If the next page is registered, it must be a start of a region as well,
	 * otherwise we'd be unregistering only a part of a region.
	 */
	if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}
	seg_vaddr = vaddr;
	seg_len = 0;

	while (len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);

		if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
			TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
				rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
				if (rc != 0) {
					pthread_mutex_unlock(&g_spdk_mem_map_mutex);
					return rc;
				}
			}

			seg_vaddr = vaddr;
			seg_len = VALUE_2MB;
		} else {
			seg_len += VALUE_2MB;
		}

		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	if (seg_len > 0) {
		TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
			rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
			if (rc != 0) {
				pthread_mutex_unlock(&g_spdk_mem_map_mutex);
				return rc;
			}
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

int
spdk_mem_reserve(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* Check if any part of this range is already registered */
	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	/* Simply set the translation to the memory map's default. This allocates the space in the
	 * map but does not provide a valid translation. */
	spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, len,
				     g_mem_reg_map->default_translation);

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, map->default_translation);
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}
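/*
 * Usage sketch (illustrative only): registering an externally allocated,
 * 2 MiB-aligned buffer with all memory maps, then unregistering it. The
 * unregistration must start at a registered region boundary and cover
 * whole regions.
 *
 *   size_t len = 4 * VALUE_2MB;
 *   void *buf = NULL;
 *
 *   if (posix_memalign(&buf, VALUE_2MB, len) == 0) {
 *           if (spdk_mem_register(buf, len) == 0) {
 *                   ... use buf ...
 *                   spdk_mem_unregister(buf, len);
 *           }
 *           free(buf);
 *   }
 */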
static struct map_1gb *
mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
{
	struct map_1gb *map_1gb;
	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
	size_t i;

	if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
		return NULL;
	}

	map_1gb = map->map_256tb.map[idx_256tb];

	if (!map_1gb) {
		pthread_mutex_lock(&map->mutex);

		/* Recheck to make sure nobody else got the mutex first. */
		map_1gb = map->map_256tb.map[idx_256tb];
		if (!map_1gb) {
			map_1gb = malloc(sizeof(struct map_1gb));
			if (map_1gb) {
				/* initialize all entries to default translation */
				for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
					map_1gb->map[i].translation_2mb = map->default_translation;
				}
				map->map_256tb.map[idx_256tb] = map_1gb;
			}
		}

		pthread_mutex_unlock(&map->mutex);

		if (!map_1gb) {
			DEBUG_PRINT("allocation failed\n");
			return NULL;
		}
	}

	return map_1gb;
}

int
spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
			     uint64_t translation)
{
	uint64_t vfn_2mb;
	struct map_1gb *map_1gb;
	uint64_t idx_1gb;
	struct map_2mb *map_2mb;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %" PRIu64 "\n", vaddr);
		return -EINVAL;
	}

	/* For now, only 2 MB-aligned registrations are supported */
	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%" PRIu64 " len=%" PRIu64 "\n",
			    __func__, vaddr, size);
		return -EINVAL;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;

	while (size) {
		map_1gb = mem_map_get_map_1gb(map, vfn_2mb);
		if (!map_1gb) {
			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
			return -ENOMEM;
		}

		idx_1gb = MAP_1GB_IDX(vfn_2mb);
		map_2mb = &map_1gb->map[idx_1gb];
		map_2mb->translation_2mb = translation;

		size -= VALUE_2MB;
		vfn_2mb++;
	}

	return 0;
}

int
spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
{
	return spdk_mem_map_set_translation(map, vaddr, size, map->default_translation);
}

inline uint64_t
spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
{
	const struct map_1gb *map_1gb;
	const struct map_2mb *map_2mb;
	uint64_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t vfn_2mb;
	uint64_t cur_size;
	uint64_t prev_translation;
	uint64_t orig_translation;

	if (spdk_unlikely(vaddr & ~MASK_256TB)) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
		return map->default_translation;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;
	idx_256tb = MAP_256TB_IDX(vfn_2mb);
	idx_1gb = MAP_1GB_IDX(vfn_2mb);

	map_1gb = map->map_256tb.map[idx_256tb];
	if (spdk_unlikely(!map_1gb)) {
		return map->default_translation;
	}

	cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
	map_2mb = &map_1gb->map[idx_1gb];
	if (size == NULL || map->ops.are_contiguous == NULL ||
	    map_2mb->translation_2mb == map->default_translation) {
		if (size != NULL) {
			*size = spdk_min(*size, cur_size);
		}
		return map_2mb->translation_2mb;
	}

	orig_translation = map_2mb->translation_2mb;
	prev_translation = orig_translation;
	while (cur_size < *size) {
		vfn_2mb++;
		idx_256tb = MAP_256TB_IDX(vfn_2mb);
		idx_1gb = MAP_1GB_IDX(vfn_2mb);

		map_1gb = map->map_256tb.map[idx_256tb];
		if (spdk_unlikely(!map_1gb)) {
			break;
		}

		map_2mb = &map_1gb->map[idx_1gb];
		if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
			break;
		}

		cur_size += VALUE_2MB;
		prev_translation = map_2mb->translation_2mb;
	}

	*size = spdk_min(*size, cur_size);
	return orig_translation;
}
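/*
 * Lookup sketch (illustrative only, using the vtophys map defined later in this
 * file as an example): translating an address and asking how many bytes starting
 * at vaddr share a contiguous translation. *size is both an input (the number of
 * bytes the caller cares about) and an output (clamped to the contiguous extent
 * as judged by ops.are_contiguous).
 *
 *   uint64_t size = 8 * VALUE_2MB;
 *   uint64_t paddr = spdk_mem_map_translate(g_vtophys_map, (uint64_t)vaddr, &size);
 *
 *   if (paddr == SPDK_VTOPHYS_ERROR) {
 *           ... vaddr has no translation in this map ...
 *   } else {
 *           ... [vaddr, vaddr + size) translates contiguously to [paddr, paddr + size) ...
 *   }
 */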
static void
memory_hotplug_cb(enum rte_mem_event event_type,
		  const void *addr, size_t len, void *arg)
{
	if (event_type == RTE_MEM_EVENT_ALLOC) {
		spdk_mem_register((void *)addr, len);

		if (!spdk_env_dpdk_external_init()) {
			return;
		}

		/* When the user initialized DPDK separately, we can't
		 * be sure that --match-allocations RTE flag was specified.
		 * Without this flag, DPDK can free memory in different units
		 * than it was allocated. It doesn't work with things like RDMA MRs.
		 *
		 * For such cases, we mark segments so they aren't freed.
		 */
		while (len > 0) {
			struct rte_memseg *seg;

			seg = rte_mem_virt2memseg(addr, NULL);
			assert(seg != NULL);
			seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
			addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
			len -= seg->hugepage_sz;
		}
	} else if (event_type == RTE_MEM_EVENT_FREE) {
		spdk_mem_unregister((void *)addr, len);
	}
}

static int
memory_iter_cb(const struct rte_memseg_list *msl,
	       const struct rte_memseg *ms, size_t len, void *arg)
{
	return spdk_mem_register(ms->addr, len);
}

int
mem_map_init(bool legacy_mem)
{
	g_legacy_mem = legacy_mem;

	g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
	if (g_mem_reg_map == NULL) {
		DEBUG_PRINT("memory registration map allocation failed\n");
		return -ENOMEM;
	}

	/*
	 * Walk all DPDK memory segments and register them
	 * with the main memory map
	 */
	rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
	rte_memseg_contig_walk(memory_iter_cb, NULL);
	return 0;
}

bool
spdk_iommu_is_enabled(void)
{
#if VFIO_ENABLED
	return g_vfio.enabled && !g_vfio.noiommu_enabled;
#else
	return false;
#endif
}

struct spdk_vtophys_pci_device {
	struct rte_pci_device *pci_device;
	TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
};

static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
	TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);

static struct spdk_mem_map *g_vtophys_map;
static struct spdk_mem_map *g_phys_ref_map;

#if VFIO_ENABLED
static int
vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	uint64_t refcount;
	int ret;

	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
	assert(refcount < UINT64_MAX);
	if (refcount > 0) {
		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
		return 0;
	}

	dma_map = calloc(1, sizeof(*dma_map));
	if (dma_map == NULL) {
		return -ENOMEM;
	}

	dma_map->map.argsz = sizeof(dma_map->map);
	dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	dma_map->map.vaddr = vaddr;
	dma_map->map.iova = iova;
	dma_map->map.size = size;

	pthread_mutex_lock(&g_vfio.mutex);
	if (g_vfio.device_ref == 0) {
		/* VFIO requires at least one device (IOMMU group) to be added to
		 * a VFIO container before it is possible to perform any IOMMU
		 * operations on that container. This memory will be mapped once
		 * the first device (IOMMU group) is hotplugged.
		 *
		 * Since the vfio container is managed internally by DPDK, it is
		 * also possible that some device is already in that container, but
		 * it's not managed by SPDK - e.g. a NIC attached internally
		 * inside DPDK. We could map the memory straight away in such
		 * a scenario, but there's no need to do it. DPDK devices clearly
		 * don't need our mappings and hence we defer the mapping
		 * unconditionally until the first SPDK-managed device is
		 * hotplugged.
		 */
		goto out_insert;
	}

	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
	if (ret) {
		/* In some cases the vfio container doesn't have an IOMMU group;
		 * it's safe to ignore the error then.
		 */
		SPDK_NOTICELOG("Cannot set up DMA mapping, error %d, ignored\n", errno);
	}

out_insert:
	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
	pthread_mutex_unlock(&g_vfio.mutex);
	spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
	return 0;
}

static int
vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	uint64_t refcount;
	int ret;
	struct vfio_iommu_type1_dma_unmap unmap = {};

	pthread_mutex_lock(&g_vfio.mutex);
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		if (dma_map->map.iova == iova) {
			break;
		}
	}

	if (dma_map == NULL) {
		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
		pthread_mutex_unlock(&g_vfio.mutex);
		return -ENXIO;
	}

	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
	assert(refcount < UINT64_MAX);
	if (refcount > 0) {
		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount - 1);
	}

	/* We still have outstanding references, don't clear it. */
	if (refcount > 1) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return 0;
	}

	/** don't support partial or multiple-page unmap for now */
	assert(dma_map->map.size == size);

	if (g_vfio.device_ref == 0) {
		/* Memory is not mapped anymore, just remove its references */
		goto out_remove;
	}

	unmap.argsz = sizeof(unmap);
	unmap.flags = 0;
	unmap.iova = dma_map->map.iova;
	unmap.size = dma_map->map.size;
	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
	if (ret) {
		SPDK_NOTICELOG("Cannot clear DMA mapping, error %d, ignored\n", errno);
	}

out_remove:
	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
	pthread_mutex_unlock(&g_vfio.mutex);
	free(dma_map);
	return 0;
}
#endif

static uint64_t
vtophys_get_paddr_memseg(uint64_t vaddr)
{
	uintptr_t paddr;
	struct rte_memseg *seg;

	seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
	if (seg != NULL) {
		paddr = seg->iova;
		if (paddr == RTE_BAD_IOVA) {
			return SPDK_VTOPHYS_ERROR;
		}
		paddr += (vaddr - (uintptr_t)seg->addr);
		return paddr;
	}

	return SPDK_VTOPHYS_ERROR;
}

/* Try to get the paddr from /proc/self/pagemap */
static uint64_t
vtophys_get_paddr_pagemap(uint64_t vaddr)
{
	uintptr_t paddr;

	/* Silence static analyzers */
	assert(vaddr != 0);
	paddr = rte_mem_virt2iova((void *)vaddr);
	if (paddr == RTE_BAD_IOVA) {
		/*
		 * The vaddr may be valid but doesn't have a backing page
		 * assigned yet. Touch the page to ensure a backing page
		 * gets assigned, then try to translate again.
		 */
		rte_atomic64_read((rte_atomic64_t *)vaddr);
		paddr = rte_mem_virt2iova((void *)vaddr);
	}
	if (paddr == RTE_BAD_IOVA) {
		/* Unable to get to the physical address. */
		return SPDK_VTOPHYS_ERROR;
	}

	return paddr;
}

/* Try to get the paddr from pci devices */
static uint64_t
vtophys_get_paddr_pci(uint64_t vaddr)
{
	struct spdk_vtophys_pci_device *vtophys_dev;
	uintptr_t paddr;
	struct rte_pci_device *dev;
	struct rte_mem_resource *res;
	unsigned r;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		dev = vtophys_dev->pci_device;

		for (r = 0; r < PCI_MAX_RESOURCE; r++) {
			res = &dev->mem_resource[r];
			if (res->phys_addr && vaddr >= (uint64_t)res->addr &&
			    vaddr < (uint64_t)res->addr + res->len) {
				paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
				DEBUG_PRINT("%s: %p -> %p\n", __func__, (void *)vaddr,
					    (void *)paddr);
				pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
				return paddr;
			}
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

	return SPDK_VTOPHYS_ERROR;
}

static int
vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
	       enum spdk_mem_map_notify_action action,
	       void *vaddr, size_t len)
{
	int rc = 0, pci_phys = 0;
	uint64_t paddr;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid parameters, vaddr=%p len=%ju\n",
			    vaddr, len);
		return -EINVAL;
	}

	/* Get the physical address from the DPDK memsegs */
	paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);

	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		if (paddr == SPDK_VTOPHYS_ERROR) {
			/* This is not an address that DPDK is managing. */
#if VFIO_ENABLED
			enum rte_iova_mode iova_mode;

			iova_mode = rte_eal_iova_mode();

			if (spdk_iommu_is_enabled() && iova_mode == RTE_IOVA_VA) {
				/* We'll use the virtual address as the iova to match DPDK. */
				paddr = (uint64_t)vaddr;
				rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len);
				if (rc) {
					return -EFAULT;
				}
				while (len > 0) {
					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}
					vaddr += VALUE_2MB;
					paddr += VALUE_2MB;
					len -= VALUE_2MB;
				}
			} else
#endif
			{
				/* Get the physical address from /proc/self/pagemap. */
				paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
				if (paddr == SPDK_VTOPHYS_ERROR) {
					/* Get the physical address from PCI devices */
					paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}
					/* The beginning of this address range points to a PCI resource,
					 * so the rest must point to a PCI resource as well.
					 */
					pci_phys = 1;
				}

				/* Get paddr for each 2MB chunk in this address range */
				while (len > 0) {
					/* Get the physical address from /proc/self/pagemap. */
					if (pci_phys) {
						paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
					} else {
						paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
					}

					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}

					/* Since PCI physical addresses can break the 2MiB physical
					 * alignment, skip this check in that case.
					 */
					if (!pci_phys && (paddr & MASK_2MB)) {
						DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
						return -EINVAL;
					}
#if VFIO_ENABLED
					/* If the IOMMU is on, but DPDK is using iova-mode=pa, we want to register this memory
					 * with the IOMMU using the physical address to match. */
					if (spdk_iommu_is_enabled()) {
						rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
						if (rc) {
							DEBUG_PRINT("Unable to assign vaddr %p to paddr 0x%" PRIx64 "\n", vaddr, paddr);
							return -EFAULT;
						}
					}
#endif

					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}

					vaddr += VALUE_2MB;
					len -= VALUE_2MB;
				}
			}
		} else {
			/* This is an address managed by DPDK. Just setup the translations. */
			while (len > 0) {
				paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
				if (paddr == SPDK_VTOPHYS_ERROR) {
					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
					return -EFAULT;
				}

				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
				if (rc != 0) {
					return rc;
				}

				vaddr += VALUE_2MB;
				len -= VALUE_2MB;
			}
		}

		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
#if VFIO_ENABLED
		if (paddr == SPDK_VTOPHYS_ERROR) {
			/*
			 * This is not an address that DPDK is managing. If vfio is enabled,
			 * we need to unmap the range from the IOMMU
			 */
			if (spdk_iommu_is_enabled()) {
				uint64_t buffer_len = len;
				uint8_t *va = vaddr;
				enum rte_iova_mode iova_mode;

				iova_mode = rte_eal_iova_mode();
				/*
				 * In virtual address mode, the region is contiguous and can be done in
				 * one unmap.
				 */
				if (iova_mode == RTE_IOVA_VA) {
					paddr = spdk_mem_map_translate(map, (uint64_t)va, &buffer_len);
					if (buffer_len != len || paddr != (uintptr_t)va) {
						DEBUG_PRINT("Unmapping %p with length %lu failed because "
							    "translation had address 0x%" PRIx64 " and length %lu\n",
							    va, len, paddr, buffer_len);
						return -EINVAL;
					}
					rc = vtophys_iommu_unmap_dma(paddr, len);
					if (rc) {
						DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
						return -EFAULT;
					}
				} else if (iova_mode == RTE_IOVA_PA) {
					/* Get paddr for each 2MB chunk in this address range */
					while (buffer_len > 0) {
						paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);

						if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
							DEBUG_PRINT("could not get phys addr for %p\n", va);
							return -EFAULT;
						}

						rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
						if (rc) {
							DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
							return -EFAULT;
						}

						va += VALUE_2MB;
						buffer_len -= VALUE_2MB;
					}
				}
			}
		}
#endif
		while (len > 0) {
			rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
			if (rc != 0) {
				return rc;
			}

			vaddr += VALUE_2MB;
			len -= VALUE_2MB;
		}

		break;
	default:
		SPDK_UNREACHABLE();
	}

	return rc;
}

static int
vtophys_check_contiguous_entries(uint64_t paddr1, uint64_t paddr2)
{
	/* This function is always called with paddrs for two subsequent
	 * 2MB chunks in virtual address space, so those chunks will be only
	 * physically contiguous if the physical addresses are 2MB apart
	 * from each other as well.
	 */
	return (paddr2 - paddr1 == VALUE_2MB);
}
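/*
 * Example (illustrative only): the vtophys map considers the translations of two
 * neighbouring 2 MiB virtual pages contiguous when the physical addresses are
 * exactly VALUE_2MB apart, e.g. 0x200000000 and 0x200200000 coalesce, while
 * 0x200000000 and 0x300000000 stop the coalescing loop in spdk_mem_map_translate().
 */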
#if VFIO_ENABLED

static bool
vfio_enabled(void)
{
	return rte_vfio_is_enabled("vfio_pci");
}

/* Check if IOMMU is enabled on the system */
static bool
has_iommu_groups(void)
{
	int count = 0;
	DIR *dir = opendir("/sys/kernel/iommu_groups");

	if (dir == NULL) {
		return false;
	}

	while (count < 3 && readdir(dir) != NULL) {
		count++;
	}

	closedir(dir);
	/* there will always be ./ and ../ entries */
	return count > 2;
}

static bool
vfio_noiommu_enabled(void)
{
	return rte_vfio_noiommu_is_enabled();
}

static void
vtophys_iommu_init(void)
{
	char proc_fd_path[PATH_MAX + 1];
	char link_path[PATH_MAX + 1];
	const char vfio_path[] = "/dev/vfio/vfio";
	DIR *dir;
	struct dirent *d;

	if (!vfio_enabled()) {
		return;
	}

	if (vfio_noiommu_enabled()) {
		g_vfio.noiommu_enabled = true;
	} else if (!has_iommu_groups()) {
		return;
	}

	dir = opendir("/proc/self/fd");
	if (!dir) {
		DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
		return;
	}

	while ((d = readdir(dir)) != NULL) {
		if (d->d_type != DT_LNK) {
			continue;
		}

		snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
		if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
			continue;
		}

		if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
			sscanf(d->d_name, "%d", &g_vfio.fd);
			break;
		}
	}

	closedir(dir);

	if (g_vfio.fd < 0) {
		DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
		return;
	}

	g_vfio.enabled = true;

	return;
}

#endif

void
vtophys_pci_device_added(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);

	vtophys_dev = calloc(1, sizeof(*vtophys_dev));
	if (vtophys_dev) {
		vtophys_dev->pci_device = pci_device;
		TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
	} else {
		DEBUG_PRINT("Memory allocation error\n");
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	g_vfio.device_ref++;
	if (g_vfio.device_ref > 1) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the first SPDK device using DPDK vfio. This means that the first
	 * IOMMU group might have just been added to the DPDK vfio container.
	 * From this point it is certain that the memory can be mapped now.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
		if (ret) {
			DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}

void
vtophys_pci_device_removed(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		if (vtophys_dev->pci_device == pci_device) {
			TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
			free(vtophys_dev);
			break;
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	assert(g_vfio.device_ref > 0);
	g_vfio.device_ref--;
	if (g_vfio.device_ref > 0) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
	 * any additional devices using its vfio container, all the mappings
	 * will be automatically removed by the Linux vfio driver. We unmap
	 * the memory manually to be able to easily re-map it later regardless
	 * of other, external factors.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		struct vfio_iommu_type1_dma_unmap unmap = {};
		unmap.argsz = sizeof(unmap);
		unmap.flags = 0;
		unmap.iova = dma_map->map.iova;
		unmap.size = dma_map->map.size;
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
		if (ret) {
			DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}

int
vtophys_init(void)
{
	const struct spdk_mem_map_ops vtophys_map_ops = {
		.notify_cb = vtophys_notify,
		.are_contiguous = vtophys_check_contiguous_entries,
	};

	const struct spdk_mem_map_ops phys_ref_map_ops = {
		.notify_cb = NULL,
		.are_contiguous = NULL,
	};

#if VFIO_ENABLED
	vtophys_iommu_init();
#endif

	g_phys_ref_map = spdk_mem_map_alloc(0, &phys_ref_map_ops, NULL);
	if (g_phys_ref_map == NULL) {
		DEBUG_PRINT("phys_ref map allocation failed.\n");
		return -ENOMEM;
	}

	g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
	if (g_vtophys_map == NULL) {
		DEBUG_PRINT("vtophys map allocation failed\n");
		spdk_mem_map_free(&g_phys_ref_map);
		return -ENOMEM;
	}
	return 0;
}

uint64_t
spdk_vtophys(const void *buf, uint64_t *size)
{
	uint64_t vaddr, paddr_2mb;

	vaddr = (uint64_t)buf;
	paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);

	/*
	 * SPDK_VTOPHYS_ERROR has all bits set. Bitwise-or'ing the page offset into it
	 * would have preserved the error value, but because PCI translations may be
	 * unaligned we add the offset instead, so the return value must be checked
	 * before the addition.
	 */
	SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
	if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
		return SPDK_VTOPHYS_ERROR;
	} else {
		return paddr_2mb + (vaddr & MASK_2MB);
	}
}

int
spdk_mem_get_fd_and_offset(void *vaddr, uint64_t *offset)
{
	struct rte_memseg *seg;
	int ret, fd;

	seg = rte_mem_virt2memseg(vaddr, NULL);
	if (!seg) {
		SPDK_ERRLOG("memory %p doesn't exist\n", vaddr);
		return -ENOENT;
	}

	fd = rte_memseg_get_fd_thread_unsafe(seg);
	if (fd < 0) {
		return fd;
	}

	ret = rte_memseg_get_fd_offset_thread_unsafe(seg, offset);
	if (ret < 0) {
		return ret;
	}

	return fd;
}
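/*
 * Usage sketch (illustrative only) for spdk_vtophys() above: translating a
 * registered buffer for DMA. Passing a size lets the caller learn how many
 * bytes are physically contiguous; io_size here is a hypothetical length.
 *
 *   uint64_t len = io_size;
 *   uint64_t phys = spdk_vtophys(buf, &len);
 *
 *   if (phys == SPDK_VTOPHYS_ERROR) {
 *           ... buf was never registered ...
 *   } else if (len < io_size) {
 *           ... buffer is not physically contiguous for the whole I/O; split it ...
 *   }
 */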