/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) Intel Corporation.
 * All rights reserved.
 */

#include "spdk/stdinc.h"

#include "env_internal.h"
#include "pci_dpdk.h"

#include <rte_config.h>
#include <rte_memory.h>
#include <rte_eal_memconfig.h>
#include <rte_dev.h>
#include <rte_pci.h>

#include "spdk_internal/assert.h"

#include "spdk/assert.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/util.h"
#include "spdk/memory.h"
#include "spdk/env_dpdk.h"
#include "spdk/log.h"

#ifdef __linux__
#include <linux/version.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
#include <linux/vfio.h>
#include <rte_vfio.h>

struct spdk_vfio_dma_map {
	struct vfio_iommu_type1_dma_map map;
	TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
};

struct vfio_cfg {
	int fd;
	bool enabled;
	bool noiommu_enabled;
	unsigned device_ref;
	TAILQ_HEAD(, spdk_vfio_dma_map) maps;
	pthread_mutex_t mutex;
};

static struct vfio_cfg g_vfio = {
	.fd = -1,
	.enabled = false,
	.noiommu_enabled = false,
	.device_ref = 0,
	.maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
	.mutex = PTHREAD_MUTEX_INITIALIZER
};
#endif
#endif

#if DEBUG
#define DEBUG_PRINT(...) SPDK_ERRLOG(__VA_ARGS__)
#else
#define DEBUG_PRINT(...)
#endif

#define FN_2MB_TO_4KB(fn)	((fn) << (SHIFT_2MB - SHIFT_4KB))
#define FN_4KB_TO_2MB(fn)	((fn) >> (SHIFT_2MB - SHIFT_4KB))

#define MAP_256TB_IDX(vfn_2mb)	((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
#define MAP_1GB_IDX(vfn_2mb)	((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))

/* Page is registered */
#define REG_MAP_REGISTERED	(1ULL << 62)

/* A notification region barrier. The 2MB translation entry that's marked
 * with this flag must be unregistered separately. This allows contiguous
 * regions to be unregistered in the same chunks they were registered.
 */
#define REG_MAP_NOTIFY_START	(1ULL << 63)

/* Translation of a single 2MB page. */
struct map_2mb {
	uint64_t translation_2mb;
};

/* Second-level map table indexed by bits [21..29] of the virtual address.
 * Each entry contains the address translation, or the map's default
 * translation for entries that haven't been set yet.
 */
struct map_1gb {
	struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
};

/* Top-level map table indexed by bits [30..47] of the virtual address.
 * Each entry points to a second-level map table or NULL.
 */
struct map_256tb {
	struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
};

/* Page-granularity memory address translation */
struct spdk_mem_map {
	struct map_256tb map_256tb;
	pthread_mutex_t mutex;
	uint64_t default_translation;
	struct spdk_mem_map_ops ops;
	void *cb_ctx;
	TAILQ_ENTRY(spdk_mem_map) tailq;
};

/* Registrations map. The 64-bit translations are bit fields with the
 * following layout (starting with the low bits):
 *    0 - 61 : reserved
 *   62 - 63 : flags
 */
static struct spdk_mem_map *g_mem_reg_map;
static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps =
	TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;

static bool g_legacy_mem;
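/*
 * Illustrative sketch, deliberately kept out of the build: how a virtual
 * address is decomposed into the two map levels declared above. The sample
 * address and the function name are made up; the bit math mirrors
 * MAP_256TB_IDX()/MAP_1GB_IDX() and the lookup path used later by
 * spdk_mem_map_translate().
 */
#if 0
static uint64_t
mem_map_lookup_example(const struct spdk_mem_map *map)
{
	uint64_t vaddr = 0x00007f3f2c600000ULL;		/* hypothetical 2MB-aligned address */
	uint64_t vfn_2mb = vaddr >> SHIFT_2MB;		/* 2MB virtual frame number */
	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);	/* bits [30..47] select the map_1gb */
	uint64_t idx_1gb = MAP_1GB_IDX(vfn_2mb);	/* bits [21..29] select the map_2mb */
	const struct map_1gb *map_1gb = map->map_256tb.map[idx_256tb];

	if (map_1gb == NULL) {
		/* Nothing has been set in this 1GB region yet. */
		return map->default_translation;
	}

	/* For the registration map this value carries the REG_MAP_* flags;
	 * for other maps it is whatever spdk_mem_map_set_translation() stored.
	 */
	return map_1gb->map[idx_1gb].translation_2mb;
}
#endif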
/*
 * Walk the currently registered memory via the main memory registration map
 * and call the new map's notify callback for each virtually contiguous region.
 */
static int
mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
{
	size_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t contig_start = UINT64_MAX;
	uint64_t contig_end = UINT64_MAX;
	struct map_1gb *map_1gb;
	int rc;

	if (!g_mem_reg_map) {
		return -EINVAL;
	}

	/* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
	pthread_mutex_lock(&g_mem_reg_map->mutex);

	for (idx_256tb = 0;
	     idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
	     idx_256tb++) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_start != UINT64_MAX) {
				/* End of a virtually contiguous range */
				rc = map->ops.notify_cb(map->cb_ctx, map, action,
							(void *)contig_start,
							contig_end - contig_start + VALUE_2MB);
				/* Don't bother handling unregister failures. It can't be any worse */
				if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
					goto err_unregister;
				}
			}
			contig_start = UINT64_MAX;
			continue;
		}

		for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_start == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_start == UINT64_MAX) {
					contig_start = vaddr;
				}

				contig_end = vaddr;
			} else {
				if (contig_start != UINT64_MAX) {
					/* End of a virtually contiguous range */
					rc = map->ops.notify_cb(map->cb_ctx, map, action,
								(void *)contig_start,
								contig_end - contig_start + VALUE_2MB);
					/* Don't bother handling unregister failures. It can't be any worse */
					if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
						goto err_unregister;
					}

					/* This page might be a part of a neighbour region, so process
					 * it again. The idx_1gb will be incremented immediately.
					 */
					idx_1gb--;
				}
				contig_start = UINT64_MAX;
			}
		}
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return 0;

err_unregister:
	/* Unwind to the first empty translation so we don't unregister
	 * a region that just failed to register.
	 */
	idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
	idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
	contig_start = UINT64_MAX;
	contig_end = UINT64_MAX;

	/* Unregister any memory we managed to register before the failure */
	for (; idx_256tb < SIZE_MAX; idx_256tb--) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_end != UINT64_MAX) {
				/* End of a virtually contiguous range */
				map->ops.notify_cb(map->cb_ctx, map,
						   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
						   (void *)contig_start,
						   contig_end - contig_start + VALUE_2MB);
			}
			contig_end = UINT64_MAX;
			continue;
		}

		for (; idx_1gb < UINT64_MAX; idx_1gb--) {
			/* Rebuild the virtual address from the indexes */
			uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {

				if (contig_end == UINT64_MAX) {
					contig_end = vaddr;
				}
				contig_start = vaddr;
			} else {
				if (contig_end != UINT64_MAX) {
					if (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) {
						contig_start = vaddr;
					}
					/* End of a virtually contiguous range */
					map->ops.notify_cb(map->cb_ctx, map,
							   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
							   (void *)contig_start,
							   contig_end - contig_start + VALUE_2MB);
				}
				contig_end = UINT64_MAX;
			}
		}
		idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return rc;
}

struct spdk_mem_map *
spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
{
	struct spdk_mem_map *map;
	int rc;
	size_t i;

	map = calloc(1, sizeof(*map));
	if (map == NULL) {
		return NULL;
	}

	if (pthread_mutex_init(&map->mutex, NULL)) {
		free(map);
		return NULL;
	}

	map->default_translation = default_translation;
	map->cb_ctx = cb_ctx;
	if (ops) {
		map->ops = *ops;
	}

	if (ops && ops->notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		rc = mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			DEBUG_PRINT("Initial mem_map notify failed\n");
			pthread_mutex_destroy(&map->mutex);
			for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
				free(map->map_256tb.map[i]);
			}
			free(map);
			return NULL;
		}
		TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	return map;
}

void
spdk_mem_map_free(struct spdk_mem_map **pmap)
{
	struct spdk_mem_map *map;
	size_t i;

	if (!pmap) {
		return;
	}

	map = *pmap;

	if (!map) {
		return;
	}

	if (map->ops.notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
		TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
		free(map->map_256tb.map[i]);
	}

	pthread_mutex_destroy(&map->mutex);

	free(map);
	*pmap = NULL;
}
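/*
 * Minimal usage sketch, kept out of the build, for the allocation API above:
 * a consumer registers a notify callback and is immediately walked over every
 * region that is already registered; freeing the map triggers the mirror
 * unregister walk. The callback body and all example_* names are made up for
 * illustration only.
 */
#if 0
static int
example_notify(void *cb_ctx, struct spdk_mem_map *map,
	       enum spdk_mem_map_notify_action action, void *vaddr, size_t size)
{
	/* A real consumer would create or destroy its own per-region state here,
	 * e.g. an RDMA memory region covering [vaddr, vaddr + size).
	 */
	return 0;
}

static void
example_map_usage(void)
{
	const struct spdk_mem_map_ops ops = {
		.notify_cb = example_notify,
		.are_contiguous = NULL,
	};
	struct spdk_mem_map *map;

	map = spdk_mem_map_alloc(UINT64_MAX /* default translation */, &ops, NULL);
	if (map == NULL) {
		return;
	}

	/* ... spdk_mem_map_set_translation() / spdk_mem_map_translate() ... */

	spdk_mem_map_free(&map);
}
#endif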
int
spdk_mem_register(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	seg_vaddr = vaddr;
	seg_len = 0;
	while (len > 0) {
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
					     seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
		seg_len += VALUE_2MB;
		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return rc;
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}
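/*
 * Usage sketch, kept out of the build, for the registration API above. Both
 * address and length must be 2MB aligned, and spdk_mem_unregister() must later
 * be called on the same chunk boundaries that were registered, because of the
 * REG_MAP_NOTIFY_START marker set on the first 2MB page of each registration.
 * The allocation method is only an assumption for illustration.
 */
#if 0
static void
example_register_usage(void)
{
	size_t len = 4 * VALUE_2MB;
	void *buf;

	/* Hypothetical 2MB-aligned allocation; any 2MB-aligned mapping works. */
	if (posix_memalign(&buf, VALUE_2MB, len) != 0) {
		return;
	}

	if (spdk_mem_register(buf, len) == 0) {
		/* ... translations for this range now exist in every registered mem map ... */
		spdk_mem_unregister(buf, len);
	}

	free(buf);
}
#endif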
int
spdk_mem_unregister(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg, newreg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* The first page must be a start of a region. Also check if it's
	 * registered to make sure we don't return -ERANGE for non-registered
	 * regions.
	 */
	reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
	if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if ((reg & REG_MAP_REGISTERED) == 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EINVAL;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
	/* If the next page is registered, it must be a start of a region as well,
	 * otherwise we'd be unregistering only a part of a region.
	 */
	if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}
	seg_vaddr = vaddr;
	seg_len = 0;

	while (len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);

		if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
			TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
				rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
				if (rc != 0) {
					pthread_mutex_unlock(&g_spdk_mem_map_mutex);
					return rc;
				}
			}

			seg_vaddr = vaddr;
			seg_len = VALUE_2MB;
		} else {
			seg_len += VALUE_2MB;
		}

		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	if (seg_len > 0) {
		TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
			rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
			if (rc != 0) {
				pthread_mutex_unlock(&g_spdk_mem_map_mutex);
				return rc;
			}
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

int
spdk_mem_reserve(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* Check if any part of this range is already registered */
	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	/* Simply set the translation to the memory map's default. This allocates the space in the
	 * map but does not provide a valid translation. */
	spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, len,
				     g_mem_reg_map->default_translation);

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, map->default_translation);
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

static struct map_1gb *
mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
{
	struct map_1gb *map_1gb;
	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
	size_t i;

	if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
		return NULL;
	}

	map_1gb = map->map_256tb.map[idx_256tb];

	if (!map_1gb) {
		pthread_mutex_lock(&map->mutex);

		/* Recheck to make sure nobody else got the mutex first. */
		map_1gb = map->map_256tb.map[idx_256tb];
		if (!map_1gb) {
			map_1gb = malloc(sizeof(struct map_1gb));
			if (map_1gb) {
				/* initialize all entries to default translation */
				for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
					map_1gb->map[i].translation_2mb = map->default_translation;
				}
				map->map_256tb.map[idx_256tb] = map_1gb;
			}
		}

		pthread_mutex_unlock(&map->mutex);

		if (!map_1gb) {
			DEBUG_PRINT("allocation failed\n");
			return NULL;
		}
	}

	return map_1gb;
}

int
spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
			     uint64_t translation)
{
	uint64_t vfn_2mb;
	struct map_1gb *map_1gb;
	uint64_t idx_1gb;
	struct map_2mb *map_2mb;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %" PRIu64 "\n", vaddr);
		return -EINVAL;
	}

	/* For now, only 2 MB-aligned registrations are supported */
	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%" PRIu64 " len=%" PRIu64 "\n",
			    __func__, vaddr, size);
		return -EINVAL;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;

	while (size) {
		map_1gb = mem_map_get_map_1gb(map, vfn_2mb);
		if (!map_1gb) {
			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
			return -ENOMEM;
		}

		idx_1gb = MAP_1GB_IDX(vfn_2mb);
		map_2mb = &map_1gb->map[idx_1gb];
		map_2mb->translation_2mb = translation;

		size -= VALUE_2MB;
		vfn_2mb++;
	}

	return 0;
}

int
spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
{
	return spdk_mem_map_set_translation(map, vaddr, size, map->default_translation);
}

inline uint64_t
spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
{
	const struct map_1gb *map_1gb;
	const struct map_2mb *map_2mb;
	uint64_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t vfn_2mb;
	uint64_t cur_size;
	uint64_t prev_translation;
	uint64_t orig_translation;

	if (spdk_unlikely(vaddr & ~MASK_256TB)) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
		return map->default_translation;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;
	idx_256tb = MAP_256TB_IDX(vfn_2mb);
	idx_1gb = MAP_1GB_IDX(vfn_2mb);

	map_1gb = map->map_256tb.map[idx_256tb];
	if (spdk_unlikely(!map_1gb)) {
		return map->default_translation;
	}

	cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
	map_2mb = &map_1gb->map[idx_1gb];
	if (size == NULL || map->ops.are_contiguous == NULL ||
	    map_2mb->translation_2mb == map->default_translation) {
		if (size != NULL) {
			*size = spdk_min(*size, cur_size);
		}
		return map_2mb->translation_2mb;
	}

	orig_translation = map_2mb->translation_2mb;
	prev_translation = orig_translation;
	while (cur_size < *size) {
		vfn_2mb++;
		idx_256tb = MAP_256TB_IDX(vfn_2mb);
		idx_1gb = MAP_1GB_IDX(vfn_2mb);

		map_1gb = map->map_256tb.map[idx_256tb];
		if (spdk_unlikely(!map_1gb)) {
			break;
		}

		map_2mb = &map_1gb->map[idx_1gb];
		if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
			break;
		}

		cur_size += VALUE_2MB;
		prev_translation = map_2mb->translation_2mb;
	}

	*size = spdk_min(*size, cur_size);
	return orig_translation;
}
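/*
 * Sketch, kept out of the build, of the translation API above, showing how the
 * optional size parameter interacts with are_contiguous(): on return, *size is
 * clamped to the number of bytes reachable from vaddr through translations the
 * callback accepted as one contiguous run. The function name is made up.
 */
#if 0
static uint64_t
example_translate_usage(struct spdk_mem_map *map, void *buf)
{
	uint64_t len = 8 * VALUE_2MB;
	uint64_t translation;

	translation = spdk_mem_map_translate(map, (uint64_t)buf, &len);

	/* len now holds how many bytes starting at buf share a contiguous
	 * translation; callers typically loop, advancing buf by len each time.
	 * A result equal to map->default_translation means "not set".
	 */
	return translation;
}
#endif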
static void
memory_hotplug_cb(enum rte_mem_event event_type,
		  const void *addr, size_t len, void *arg)
{
	if (event_type == RTE_MEM_EVENT_ALLOC) {
		spdk_mem_register((void *)addr, len);

		if (!spdk_env_dpdk_external_init()) {
			return;
		}

		/* When the user initialized DPDK separately, we can't
		 * be sure that --match-allocations RTE flag was specified.
		 * Without this flag, DPDK can free memory in different units
		 * than it was allocated. It doesn't work with things like RDMA MRs.
		 *
		 * For such cases, we mark segments so they aren't freed.
		 */
		while (len > 0) {
			struct rte_memseg *seg;

			seg = rte_mem_virt2memseg(addr, NULL);
			assert(seg != NULL);
			seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
			addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
			len -= seg->hugepage_sz;
		}
	} else if (event_type == RTE_MEM_EVENT_FREE) {
		spdk_mem_unregister((void *)addr, len);
	}
}

static int
memory_iter_cb(const struct rte_memseg_list *msl,
	       const struct rte_memseg *ms, size_t len, void *arg)
{
	return spdk_mem_register(ms->addr, len);
}

int
mem_map_init(bool legacy_mem)
{
	g_legacy_mem = legacy_mem;

	g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
	if (g_mem_reg_map == NULL) {
		DEBUG_PRINT("memory registration map allocation failed\n");
		return -ENOMEM;
	}

	/*
	 * Walk all DPDK memory segments and register them
	 * with the main memory map
	 */
	rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
	rte_memseg_contig_walk(memory_iter_cb, NULL);
	return 0;
}

bool
spdk_iommu_is_enabled(void)
{
#if VFIO_ENABLED
	return g_vfio.enabled && !g_vfio.noiommu_enabled;
#else
	return false;
#endif
}

struct spdk_vtophys_pci_device {
	struct rte_pci_device *pci_device;
	TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
};

static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
	TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);

static struct spdk_mem_map *g_vtophys_map;
static struct spdk_mem_map *g_phys_ref_map;

#if VFIO_ENABLED
static int
_vfio_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	dma_map = calloc(1, sizeof(*dma_map));
	if (dma_map == NULL) {
		return -ENOMEM;
	}

	dma_map->map.argsz = sizeof(dma_map->map);
	dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	dma_map->map.vaddr = vaddr;
	dma_map->map.iova = iova;
	dma_map->map.size = size;

	if (g_vfio.device_ref == 0) {
		/* VFIO requires at least one device (IOMMU group) to be added to
		 * a VFIO container before it is possible to perform any IOMMU
		 * operations on that container. This memory will be mapped once
		 * the first device (IOMMU group) is hotplugged.
		 *
		 * Since the vfio container is managed internally by DPDK, it is
		 * also possible that some device is already in that container, but
		 * it's not managed by SPDK - e.g. a NIC attached internally
		 * inside DPDK. We could map the memory straight away in such
		 * scenario, but there's no need to do it. DPDK devices clearly
		 * don't need our mappings and hence we defer the mapping
		 * unconditionally until the first SPDK-managed device is
		 * hotplugged.
		 */
		goto out_insert;
	}

	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
	if (ret) {
		/* There are cases where the vfio container doesn't have an IOMMU group;
		 * it's safe to ignore this error in such cases.
		 */
		SPDK_NOTICELOG("Cannot set up DMA mapping, error %d, ignored\n", errno);
	}

out_insert:
	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
	return 0;
}

static int
vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	uint64_t refcount;
	int ret;

	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
	assert(refcount < UINT64_MAX);
	if (refcount > 0) {
		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
		return 0;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	ret = _vfio_iommu_map_dma(vaddr, iova, size);
	pthread_mutex_unlock(&g_vfio.mutex);
	if (ret) {
		return ret;
	}

	spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
	return 0;
}

int
vtophys_iommu_map_dma_bar(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	int ret;

	pthread_mutex_lock(&g_vfio.mutex);
	ret = _vfio_iommu_map_dma(vaddr, iova, size);
	pthread_mutex_unlock(&g_vfio.mutex);

	return ret;
}

static int
_vfio_iommu_unmap_dma(struct spdk_vfio_dma_map *dma_map)
{
	struct vfio_iommu_type1_dma_unmap unmap = {};
	int ret;

	if (g_vfio.device_ref == 0) {
		/* Memory is not mapped anymore, just remove its references */
		goto out_remove;
	}

	unmap.argsz = sizeof(unmap);
	unmap.flags = 0;
	unmap.iova = dma_map->map.iova;
	unmap.size = dma_map->map.size;
	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
	if (ret) {
		SPDK_NOTICELOG("Cannot clear DMA mapping, error %d, ignored\n", errno);
	}

out_remove:
	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
	free(dma_map);
	return 0;
}

static int
vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	uint64_t refcount;
	int ret;

	pthread_mutex_lock(&g_vfio.mutex);
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		if (dma_map->map.iova == iova) {
			break;
		}
	}

	if (dma_map == NULL) {
		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
		pthread_mutex_unlock(&g_vfio.mutex);
		return -ENXIO;
	}

	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
	assert(refcount < UINT64_MAX);
	if (refcount > 0) {
		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount - 1);
	}
	/* We still have outstanding references, don't clear it. */
	if (refcount > 1) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return 0;
	}

	/* don't support partial or multiple-page unmap for now */
	assert(dma_map->map.size == size);

	ret = _vfio_iommu_unmap_dma(dma_map);
	pthread_mutex_unlock(&g_vfio.mutex);

	return ret;
}

int
vtophys_iommu_unmap_dma_bar(uint64_t vaddr)
{
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	pthread_mutex_lock(&g_vfio.mutex);
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		if (dma_map->map.vaddr == vaddr) {
			break;
		}
	}

	if (dma_map == NULL) {
		DEBUG_PRINT("Cannot clear DMA mapping for address %"PRIx64" - it's not mapped\n", vaddr);
		pthread_mutex_unlock(&g_vfio.mutex);
		return -ENXIO;
	}

	ret = _vfio_iommu_unmap_dma(dma_map);
	pthread_mutex_unlock(&g_vfio.mutex);
	return ret;
}
#endif

static uint64_t
vtophys_get_paddr_memseg(uint64_t vaddr)
{
	uintptr_t paddr;
	struct rte_memseg *seg;

	seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
	if (seg != NULL) {
		paddr = seg->iova;
		if (paddr == RTE_BAD_IOVA) {
			return SPDK_VTOPHYS_ERROR;
		}
		paddr += (vaddr - (uintptr_t)seg->addr);
		return paddr;
	}

	return SPDK_VTOPHYS_ERROR;
}

/* Try to get the paddr from /proc/self/pagemap */
static uint64_t
vtophys_get_paddr_pagemap(uint64_t vaddr)
{
	uintptr_t paddr;

	/* Silence static analyzers */
	assert(vaddr != 0);
	paddr = rte_mem_virt2iova((void *)vaddr);
	if (paddr == RTE_BAD_IOVA) {
		/*
		 * The vaddr may be valid but doesn't have a backing page
		 * assigned yet. Touch the page to ensure a backing page
		 * gets assigned, then try to translate again.
		 */
		rte_atomic64_read((rte_atomic64_t *)vaddr);
		paddr = rte_mem_virt2iova((void *)vaddr);
	}
	if (paddr == RTE_BAD_IOVA) {
		/* Unable to get to the physical address. */
		return SPDK_VTOPHYS_ERROR;
	}

	return paddr;
}

static uint64_t
pci_device_vtophys(struct rte_pci_device *dev, uint64_t vaddr, size_t len)
{
	struct rte_mem_resource *res;
	uint64_t paddr;
	unsigned r;

	for (r = 0; r < PCI_MAX_RESOURCE; r++) {
		res = dpdk_pci_device_get_mem_resource(dev, r);

		if (res->phys_addr == 0 || vaddr < (uint64_t)res->addr ||
		    (vaddr + len) >= (uint64_t)res->addr + res->len) {
			continue;
		}

#if VFIO_ENABLED
		if (spdk_iommu_is_enabled() && rte_eal_iova_mode() == RTE_IOVA_VA) {
			/*
			 * The IOMMU is on and we're using IOVA == VA. The BAR was
			 * automatically registered when it was mapped, so just return
			 * the virtual address here.
			 */
			return vaddr;
		}
#endif
		paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
		return paddr;
	}

	return SPDK_VTOPHYS_ERROR;
}

/* Try to get the paddr from pci devices */
static uint64_t
vtophys_get_paddr_pci(uint64_t vaddr, size_t len)
{
	struct spdk_vtophys_pci_device *vtophys_dev;
	uintptr_t paddr;
	struct rte_pci_device *dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		dev = vtophys_dev->pci_device;
		paddr = pci_device_vtophys(dev, vaddr, len);
		if (paddr != SPDK_VTOPHYS_ERROR) {
			pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
			return paddr;
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

	return SPDK_VTOPHYS_ERROR;
}

static int
vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
	       enum spdk_mem_map_notify_action action,
	       void *vaddr, size_t len)
{
	int rc = 0;
	uint64_t paddr;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid parameters, vaddr=%p len=%ju\n",
			    vaddr, len);
		return -EINVAL;
	}

	/* Get the physical address from the DPDK memsegs */
	paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);

	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		if (paddr == SPDK_VTOPHYS_ERROR) {
			/* This is not an address that DPDK is managing. */

			/* Check if this is a PCI BAR. They need special handling */
			paddr = vtophys_get_paddr_pci((uint64_t)vaddr, len);
			if (paddr != SPDK_VTOPHYS_ERROR) {
				/* Get paddr for each 2MB chunk in this address range */
				while (len > 0) {
					paddr = vtophys_get_paddr_pci((uint64_t)vaddr, VALUE_2MB);
					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}

					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}

					vaddr += VALUE_2MB;
					len -= VALUE_2MB;
				}

				return 0;
			}

#if VFIO_ENABLED
			enum rte_iova_mode iova_mode;

			iova_mode = rte_eal_iova_mode();

			if (spdk_iommu_is_enabled() && iova_mode == RTE_IOVA_VA) {
				/* We'll use the virtual address as the iova to match DPDK. */
				paddr = (uint64_t)vaddr;
				rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len);
				if (rc) {
					return -EFAULT;
				}
				while (len > 0) {
					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}
					vaddr += VALUE_2MB;
					paddr += VALUE_2MB;
					len -= VALUE_2MB;
				}
			} else
#endif
			{
				/* Get the physical address from /proc/self/pagemap. */
				paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
				if (paddr == SPDK_VTOPHYS_ERROR) {
					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
					return -EFAULT;
				}

				/* Get paddr for each 2MB chunk in this address range */
				while (len > 0) {
					/* Get the physical address from /proc/self/pagemap. */
					paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);

					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}

					if (paddr & MASK_2MB) {
						DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
						return -EINVAL;
					}
#if VFIO_ENABLED
					/* If the IOMMU is on, but DPDK is using iova-mode=pa, we want to register this memory
					 * with the IOMMU using the physical address to match. */
					if (spdk_iommu_is_enabled()) {
						rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
						if (rc) {
							DEBUG_PRINT("Unable to assign vaddr %p to paddr 0x%" PRIx64 "\n", vaddr, paddr);
							return -EFAULT;
						}
					}
#endif

					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}

					vaddr += VALUE_2MB;
					len -= VALUE_2MB;
				}
			}
		} else {
			/* This is an address managed by DPDK. Just setup the translations. */
			while (len > 0) {
				paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
				if (paddr == SPDK_VTOPHYS_ERROR) {
					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
					return -EFAULT;
				}

				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
				if (rc != 0) {
					return rc;
				}

				vaddr += VALUE_2MB;
				len -= VALUE_2MB;
			}
		}

		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
#if VFIO_ENABLED
		if (paddr == SPDK_VTOPHYS_ERROR) {
			/*
			 * This is not an address that DPDK is managing.
			 */

			/* Check if this is a PCI BAR. They need special handling */
			paddr = vtophys_get_paddr_pci((uint64_t)vaddr, len);
			if (paddr != SPDK_VTOPHYS_ERROR) {
				/* Get paddr for each 2MB chunk in this address range */
				while (len > 0) {
					paddr = vtophys_get_paddr_pci((uint64_t)vaddr, VALUE_2MB);
					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}

					rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
					if (rc != 0) {
						return rc;
					}

					vaddr += VALUE_2MB;
					len -= VALUE_2MB;
				}

				return 0;
			}

			/* If vfio is enabled,
			 * we need to unmap the range from the IOMMU
			 */
			if (spdk_iommu_is_enabled()) {
				uint64_t buffer_len = len;
				uint8_t *va = vaddr;
				enum rte_iova_mode iova_mode;

				iova_mode = rte_eal_iova_mode();
				/*
				 * In virtual address mode, the region is contiguous and can be done in
				 * one unmap.
				 */
				if (iova_mode == RTE_IOVA_VA) {
					paddr = spdk_mem_map_translate(map, (uint64_t)va, &buffer_len);
					if (buffer_len != len || paddr != (uintptr_t)va) {
						DEBUG_PRINT("Unmapping %p with length %lu failed because "
							    "translation had address 0x%" PRIx64 " and length %lu\n",
							    va, len, paddr, buffer_len);
						return -EINVAL;
					}
					rc = vtophys_iommu_unmap_dma(paddr, len);
					if (rc) {
						DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
						return -EFAULT;
					}
				} else if (iova_mode == RTE_IOVA_PA) {
					/* Get paddr for each 2MB chunk in this address range */
					while (buffer_len > 0) {
						paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);

						if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
							DEBUG_PRINT("could not get phys addr for %p\n", va);
							return -EFAULT;
						}

						rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
						if (rc) {
							DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
							return -EFAULT;
						}

						va += VALUE_2MB;
						buffer_len -= VALUE_2MB;
					}
				}
			}
		}
#endif
		while (len > 0) {
			rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
			if (rc != 0) {
				return rc;
			}

			vaddr += VALUE_2MB;
			len -= VALUE_2MB;
		}

		break;
	default:
		SPDK_UNREACHABLE();
	}

	return rc;
}

static int
vtophys_check_contiguous_entries(uint64_t paddr1, uint64_t paddr2)
{
	/* This function is always called with paddrs for two subsequent
	 * 2MB chunks in virtual address space, so those chunks will be only
	 * physically contiguous if the physical addresses are 2MB apart
	 * from each other as well.
	 */
	return (paddr2 - paddr1 == VALUE_2MB);
}

#if VFIO_ENABLED

static bool
vfio_enabled(void)
{
	return rte_vfio_is_enabled("vfio_pci");
}

/* Check if IOMMU is enabled on the system */
static bool
has_iommu_groups(void)
{
	int count = 0;
	DIR *dir = opendir("/sys/kernel/iommu_groups");

	if (dir == NULL) {
		return false;
	}

	while (count < 3 && readdir(dir) != NULL) {
		count++;
	}

	closedir(dir);
	/* there will always be ./ and ../ entries */
	return count > 2;
}

static bool
vfio_noiommu_enabled(void)
{
	return rte_vfio_noiommu_is_enabled();
}

static void
vtophys_iommu_init(void)
{
	char proc_fd_path[PATH_MAX + 1];
	char link_path[PATH_MAX + 1];
	const char vfio_path[] = "/dev/vfio/vfio";
	DIR *dir;
	struct dirent *d;

	if (!vfio_enabled()) {
		return;
	}

	if (vfio_noiommu_enabled()) {
		g_vfio.noiommu_enabled = true;
	} else if (!has_iommu_groups()) {
		return;
	}

	dir = opendir("/proc/self/fd");
	if (!dir) {
		DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
		return;
	}

	while ((d = readdir(dir)) != NULL) {
		if (d->d_type != DT_LNK) {
			continue;
		}

		snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
		if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
			continue;
		}

		if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
			sscanf(d->d_name, "%d", &g_vfio.fd);
			break;
		}
	}

	closedir(dir);

	if (g_vfio.fd < 0) {
		DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
		return;
	}

	g_vfio.enabled = true;

	return;
}

#endif

void
vtophys_pci_device_added(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);

	vtophys_dev = calloc(1, sizeof(*vtophys_dev));
	if (vtophys_dev) {
		vtophys_dev->pci_device = pci_device;
		TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
	} else {
		DEBUG_PRINT("Memory allocation error\n");
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	g_vfio.device_ref++;
	if (g_vfio.device_ref > 1) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the first SPDK device using DPDK vfio. This means that the first
	 * IOMMU group might have just been added to the DPDK vfio container.
	 * From this point it is certain that the memory can be mapped now.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
		if (ret) {
			DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}

void
vtophys_pci_device_removed(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		if (vtophys_dev->pci_device == pci_device) {
			TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
			free(vtophys_dev);
			break;
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	assert(g_vfio.device_ref > 0);
	g_vfio.device_ref--;
	if (g_vfio.device_ref > 0) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
	 * any additional devices using its vfio container, all the mappings
	 * will be automatically removed by the Linux vfio driver. We unmap
	 * the memory manually to be able to easily re-map it later regardless
	 * of other, external factors.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		struct vfio_iommu_type1_dma_unmap unmap = {};
		unmap.argsz = sizeof(unmap);
		unmap.flags = 0;
		unmap.iova = dma_map->map.iova;
		unmap.size = dma_map->map.size;
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
		if (ret) {
			DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}

int
vtophys_init(void)
{
	const struct spdk_mem_map_ops vtophys_map_ops = {
		.notify_cb = vtophys_notify,
		.are_contiguous = vtophys_check_contiguous_entries,
	};

	const struct spdk_mem_map_ops phys_ref_map_ops = {
		.notify_cb = NULL,
		.are_contiguous = NULL,
	};

#if VFIO_ENABLED
	vtophys_iommu_init();
#endif

	g_phys_ref_map = spdk_mem_map_alloc(0, &phys_ref_map_ops, NULL);
	if (g_phys_ref_map == NULL) {
		DEBUG_PRINT("phys_ref map allocation failed.\n");
		return -ENOMEM;
	}

	g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
	if (g_vtophys_map == NULL) {
		DEBUG_PRINT("vtophys map allocation failed\n");
		spdk_mem_map_free(&g_phys_ref_map);
		return -ENOMEM;
	}
	return 0;
}

uint64_t
spdk_vtophys(const void *buf, uint64_t *size)
{
	uint64_t vaddr, paddr_2mb;

	vaddr = (uint64_t)buf;
	paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);

	/*
	 * SPDK_VTOPHYS_ERROR has all bits set, so bitwise-oring the buf offset into it
	 * below would still yield SPDK_VTOPHYS_ERROR. However, now that we do + rather
	 * than | (because PCI vtophys results can be unaligned), we must check the
	 * return value before the addition.
	 */
	SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
	if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
		return SPDK_VTOPHYS_ERROR;
	} else {
		return paddr_2mb + (vaddr & MASK_2MB);
	}
}

int
spdk_mem_get_fd_and_offset(void *vaddr, uint64_t *offset)
{
	struct rte_memseg *seg;
	int ret, fd;

	seg = rte_mem_virt2memseg(vaddr, NULL);
	if (!seg) {
		SPDK_ERRLOG("memory %p doesn't exist\n", vaddr);
		return -ENOENT;
	}

	fd = rte_memseg_get_fd_thread_unsafe(seg);
	if (fd < 0) {
		return fd;
	}

	ret = rte_memseg_get_fd_offset_thread_unsafe(seg, offset);
	if (ret < 0) {
		return ret;
	}

	return fd;
}
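/*
 * Usage sketch, kept out of the build, for spdk_vtophys(): translate a
 * registered buffer piecewise, honouring the contiguous size reported on each
 * call. Where the buffer comes from is left open; it only needs to belong to a
 * registered 2MB region (for example, memory obtained from spdk_dma_malloc()).
 * The function name is made up for illustration.
 */
#if 0
static void
example_vtophys_usage(void *buf, uint64_t len)
{
	while (len > 0) {
		uint64_t chunk = len;
		uint64_t paddr = spdk_vtophys(buf, &chunk);

		if (paddr == SPDK_VTOPHYS_ERROR) {
			/* Not a registered address. */
			return;
		}

		/* [paddr, paddr + chunk) is physically (or IOVA-)contiguous. */
		buf = (uint8_t *)buf + chunk;
		len -= chunk;
	}
}
#endif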