/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2017 Intel Corporation.
 * All rights reserved.
 */

#include "spdk/stdinc.h"

#include "env_internal.h"
#include "pci_dpdk.h"

#include <rte_config.h>
#include <rte_memory.h>
#include <rte_eal_memconfig.h>
#include <rte_dev.h>
#include <rte_pci.h>

#include "spdk_internal/assert.h"

#include "spdk/assert.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/util.h"
#include "spdk/memory.h"
#include "spdk/env_dpdk.h"
#include "spdk/log.h"

#ifdef __linux__
#include <linux/version.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
#include <linux/vfio.h>
#include <rte_vfio.h>

struct spdk_vfio_dma_map {
	struct vfio_iommu_type1_dma_map map;
	TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
};

struct vfio_cfg {
	int fd;
	bool enabled;
	bool noiommu_enabled;
	unsigned device_ref;
	TAILQ_HEAD(, spdk_vfio_dma_map) maps;
	pthread_mutex_t mutex;
};

static struct vfio_cfg g_vfio = {
	.fd = -1,
	.enabled = false,
	.noiommu_enabled = false,
	.device_ref = 0,
	.maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
	.mutex = PTHREAD_MUTEX_INITIALIZER
};
#endif
#endif

#if DEBUG
#define DEBUG_PRINT(...) SPDK_ERRLOG(__VA_ARGS__)
#else
#define DEBUG_PRINT(...)
#endif

#define FN_2MB_TO_4KB(fn)	(fn << (SHIFT_2MB - SHIFT_4KB))
#define FN_4KB_TO_2MB(fn)	(fn >> (SHIFT_2MB - SHIFT_4KB))

#define MAP_256TB_IDX(vfn_2mb)	((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
#define MAP_1GB_IDX(vfn_2mb)	((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))

/* Page is registered */
#define REG_MAP_REGISTERED	(1ULL << 62)

/* A notification region barrier. The 2MB translation entry that's marked
 * with this flag must be unregistered separately. This allows contiguous
 * regions to be unregistered in the same chunks they were registered.
 */
#define REG_MAP_NOTIFY_START	(1ULL << 63)

/* Translation of a single 2MB page. */
struct map_2mb {
	uint64_t translation_2mb;
};

/* Second-level map table indexed by bits [21..29] of the virtual address.
 * Each entry contains the address translation or error for entries that haven't
 * been retrieved yet.
 */
struct map_1gb {
	struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
};

/* Top-level map table indexed by bits [30..47] of the virtual address.
 * Each entry points to a second-level map table or NULL.
 */
struct map_256tb {
	struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
};

/* Page-granularity memory address translation */
struct spdk_mem_map {
	struct map_256tb map_256tb;
	pthread_mutex_t mutex;
	uint64_t default_translation;
	struct spdk_mem_map_ops ops;
	void *cb_ctx;
	TAILQ_ENTRY(spdk_mem_map) tailq;
};

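/*
 * Illustrative sketch of the two-level lookup above (hypothetical variables):
 * the 2MB virtual frame number selects a map_1gb via MAP_256TB_IDX() and a
 * map_2mb entry within it via MAP_1GB_IDX():
 *
 *	uint64_t vfn_2mb = vaddr >> SHIFT_2MB;
 *	struct map_1gb *m1 = map->map_256tb.map[MAP_256TB_IDX(vfn_2mb)];
 *	uint64_t translation = m1 ? m1->map[MAP_1GB_IDX(vfn_2mb)].translation_2mb
 *				  : map->default_translation;
 */
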
/* Registrations map. The 64 bit translations are bit fields with the
 * following layout (starting with the low bits):
 *    0 - 61 : reserved
 *   62 - 63 : flags
 */
static struct spdk_mem_map *g_mem_reg_map;
static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps =
	TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;

static bool g_legacy_mem;
static bool g_huge_pages = true;

/*
 * Walk the currently registered memory via the main memory registration map
 * and call the new map's notify callback for each virtually contiguous region.
 */
static int
mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
{
	size_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t contig_start = UINT64_MAX;
	uint64_t contig_end = UINT64_MAX;
	struct map_1gb *map_1gb;
	int rc;

	if (!g_mem_reg_map) {
		return -EINVAL;
	}

	/* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
	pthread_mutex_lock(&g_mem_reg_map->mutex);

	for (idx_256tb = 0;
	     idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
	     idx_256tb++) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_start != UINT64_MAX) {
				/* End of a virtually contiguous range */
				rc = map->ops.notify_cb(map->cb_ctx, map, action,
							(void *)contig_start,
							contig_end - contig_start + VALUE_2MB);
				/* Don't bother handling unregister failures. It can't be any worse */
				if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
					goto err_unregister;
				}
			}
			contig_start = UINT64_MAX;
			continue;
		}

		for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_start == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_start == UINT64_MAX) {
					contig_start = vaddr;
				}

				contig_end = vaddr;
			} else {
				if (contig_start != UINT64_MAX) {
					/* End of a virtually contiguous range */
					rc = map->ops.notify_cb(map->cb_ctx, map, action,
								(void *)contig_start,
								contig_end - contig_start + VALUE_2MB);
					/* Don't bother handling unregister failures. It can't be any worse */
					if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
						goto err_unregister;
					}

					/* This page might be a part of a neighbour region, so process
					 * it again. The idx_1gb will be incremented immediately.
					 */
					idx_1gb--;
				}
				contig_start = UINT64_MAX;
			}
		}
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return 0;

err_unregister:
	/* Unwind to the first empty translation so we don't unregister
	 * a region that just failed to register.
	 */
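	/* The unwind below walks the tables backwards. The loop conditions
	 * (idx_256tb < SIZE_MAX and idx_1gb < UINT64_MAX) rely on unsigned
	 * wraparound to terminate once index 0 has been processed.
	 */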
	idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
	idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
	contig_start = UINT64_MAX;
	contig_end = UINT64_MAX;

	/* Unregister any memory we managed to register before the failure */
	for (; idx_256tb < SIZE_MAX; idx_256tb--) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_end != UINT64_MAX) {
				/* End of a virtually contiguous range */
				map->ops.notify_cb(map->cb_ctx, map,
						   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
						   (void *)contig_start,
						   contig_end - contig_start + VALUE_2MB);
			}
			contig_end = UINT64_MAX;
			continue;
		}

		for (; idx_1gb < UINT64_MAX; idx_1gb--) {
			/* Rebuild the virtual address from the indexes */
			uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {

				if (contig_end == UINT64_MAX) {
					contig_end = vaddr;
				}
				contig_start = vaddr;
			} else {
				if (contig_end != UINT64_MAX) {
					if (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) {
						contig_start = vaddr;
					}
					/* End of a virtually contiguous range */
					map->ops.notify_cb(map->cb_ctx, map,
							   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
							   (void *)contig_start,
							   contig_end - contig_start + VALUE_2MB);
				}
				contig_end = UINT64_MAX;
			}
		}
		idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return rc;
}

struct spdk_mem_map *
spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
{
	struct spdk_mem_map *map;
	int rc;
	size_t i;

	map = calloc(1, sizeof(*map));
	if (map == NULL) {
		return NULL;
	}

	if (pthread_mutex_init(&map->mutex, NULL)) {
		free(map);
		return NULL;
	}

	map->default_translation = default_translation;
	map->cb_ctx = cb_ctx;
	if (ops) {
		map->ops = *ops;
	}

	if (ops && ops->notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		rc = mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			DEBUG_PRINT("Initial mem_map notify failed\n");
			pthread_mutex_destroy(&map->mutex);
			for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
				free(map->map_256tb.map[i]);
			}
			free(map);
			return NULL;
		}
		TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	return map;
}

void
spdk_mem_map_free(struct spdk_mem_map **pmap)
{
	struct spdk_mem_map *map;
	size_t i;

	if (!pmap) {
		return;
	}

	map = *pmap;

	if (!map) {
		return;
	}

	if (map->ops.notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
		TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
		free(map->map_256tb.map[i]);
	}

	pthread_mutex_destroy(&map->mutex);

	free(map);
	*pmap = NULL;
}

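/*
 * Register a 2MB-aligned virtual region with the main registration map and
 * notify every memory map's callback. Returns -EBUSY if any page in the range
 * is already registered. Minimal usage sketch (hypothetical, 2MB-aligned buf):
 *
 *	if (spdk_mem_register(buf, 2 * VALUE_2MB) == 0) {
 *		... DMA to/from buf ...
 *		spdk_mem_unregister(buf, 2 * VALUE_2MB);
 *	}
 */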
int
spdk_mem_register(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	seg_vaddr = vaddr;
	seg_len = 0;
	while (len > 0) {
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
					     seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
		seg_len += VALUE_2MB;
		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return rc;
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

int
spdk_mem_unregister(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg, newreg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* The first page must be a start of a region. Also check if it's
	 * registered to make sure we don't return -ERANGE for non-registered
	 * regions.
	 */
	reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
	if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if ((reg & REG_MAP_REGISTERED) == 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EINVAL;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
	/* If the next page is registered, it must be a start of a region as well,
	 * otherwise we'd be unregistering only a part of a region.
	 */
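	/* For example, after spdk_mem_register(vaddr, 2 * VALUE_2MB), a call to
	 * spdk_mem_unregister(vaddr, VALUE_2MB) fails here with -ERANGE because
	 * the page at vaddr + VALUE_2MB is registered but is not a region start.
	 */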
	if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}
	seg_vaddr = vaddr;
	seg_len = 0;

	while (len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);

		if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
			TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
				rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
				if (rc != 0) {
					pthread_mutex_unlock(&g_spdk_mem_map_mutex);
					return rc;
				}
			}

			seg_vaddr = vaddr;
			seg_len = VALUE_2MB;
		} else {
			seg_len += VALUE_2MB;
		}

		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	if (seg_len > 0) {
		TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
			rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
			if (rc != 0) {
				pthread_mutex_unlock(&g_spdk_mem_map_mutex);
				return rc;
			}
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

int
spdk_mem_reserve(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* Check if any part of this range is already registered */
	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	/* Simply set the translation to the memory map's default. This allocates the space in the
	 * map but does not provide a valid translation. */
	spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, len,
				     g_mem_reg_map->default_translation);

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, map->default_translation);
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

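/* Look up the second-level table that holds the translation entry for the
 * given 2MB virtual frame number, allocating it on first use and filling it
 * with the map's default translation. Returns NULL if the address is out of
 * range or the allocation fails.
 */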
static struct map_1gb *
mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
{
	struct map_1gb *map_1gb;
	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
	size_t i;

	if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
		return NULL;
	}

	map_1gb = map->map_256tb.map[idx_256tb];

	if (!map_1gb) {
		pthread_mutex_lock(&map->mutex);

		/* Recheck to make sure nobody else got the mutex first. */
		map_1gb = map->map_256tb.map[idx_256tb];
		if (!map_1gb) {
			map_1gb = malloc(sizeof(struct map_1gb));
			if (map_1gb) {
				/* initialize all entries to default translation */
				for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
					map_1gb->map[i].translation_2mb = map->default_translation;
				}
				map->map_256tb.map[idx_256tb] = map_1gb;
			}
		}

		pthread_mutex_unlock(&map->mutex);

		if (!map_1gb) {
			DEBUG_PRINT("allocation failed\n");
			return NULL;
		}
	}

	return map_1gb;
}

int
spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
			     uint64_t translation)
{
	uint64_t vfn_2mb;
	struct map_1gb *map_1gb;
	uint64_t idx_1gb;
	struct map_2mb *map_2mb;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %" PRIu64 "\n", vaddr);
		return -EINVAL;
	}

	/* For now, only 2 MB-aligned registrations are supported */
	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%" PRIu64 " len=%" PRIu64 "\n",
			    __func__, vaddr, size);
		return -EINVAL;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;

	while (size) {
		map_1gb = mem_map_get_map_1gb(map, vfn_2mb);
		if (!map_1gb) {
			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
			return -ENOMEM;
		}

		idx_1gb = MAP_1GB_IDX(vfn_2mb);
		map_2mb = &map_1gb->map[idx_1gb];
		map_2mb->translation_2mb = translation;

		size -= VALUE_2MB;
		vfn_2mb++;
	}

	return 0;
}

int
spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
{
	return spdk_mem_map_set_translation(map, vaddr, size, map->default_translation);
}

inline uint64_t
spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
{
	const struct map_1gb *map_1gb;
	const struct map_2mb *map_2mb;
	uint64_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t vfn_2mb;
	uint64_t cur_size;
	uint64_t prev_translation;
	uint64_t orig_translation;

	if (spdk_unlikely(vaddr & ~MASK_256TB)) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
		return map->default_translation;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;
	idx_256tb = MAP_256TB_IDX(vfn_2mb);
	idx_1gb = MAP_1GB_IDX(vfn_2mb);

	map_1gb = map->map_256tb.map[idx_256tb];
	if (spdk_unlikely(!map_1gb)) {
		return map->default_translation;
	}

	cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
	map_2mb = &map_1gb->map[idx_1gb];
	if (size == NULL || map->ops.are_contiguous == NULL ||
	    map_2mb->translation_2mb == map->default_translation) {
		if (size != NULL) {
			*size = spdk_min(*size, cur_size);
		}
		return map_2mb->translation_2mb;
	}

	orig_translation = map_2mb->translation_2mb;
	prev_translation = orig_translation;
	while (cur_size < *size) {
		vfn_2mb++;
		idx_256tb = MAP_256TB_IDX(vfn_2mb);
		idx_1gb = MAP_1GB_IDX(vfn_2mb);

		map_1gb = map->map_256tb.map[idx_256tb];
		if (spdk_unlikely(!map_1gb)) {
			break;
		}

		map_2mb = &map_1gb->map[idx_1gb];
		if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
			break;
		}

		cur_size += VALUE_2MB;
		prev_translation = map_2mb->translation_2mb;
	}

	*size = spdk_min(*size, cur_size);
	return orig_translation;
}

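/*
 * Usage note for spdk_mem_map_translate() above (illustrative sketch with
 * hypothetical variables): *size is both an input and an output. The caller
 * passes in the desired length and gets back the number of bytes starting at
 * vaddr that share a contiguous translation (never more than requested; when
 * no are_contiguous callback is set, at most the rest of the current 2MB page):
 *
 *	uint64_t len = buf_len;
 *	uint64_t translation = spdk_mem_map_translate(map, (uint64_t)buf, &len);
 */
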
static void
memory_hotplug_cb(enum rte_mem_event event_type,
		  const void *addr, size_t len, void *arg)
{
	if (event_type == RTE_MEM_EVENT_ALLOC) {
		spdk_mem_register((void *)addr, len);

		if (!spdk_env_dpdk_external_init()) {
			return;
		}

		/* When the user initialized DPDK separately, we can't
		 * be sure that the --match-allocations RTE flag was specified.
		 * Without this flag, DPDK can free memory in different units
		 * than it was allocated. It doesn't work with things like RDMA MRs.
		 *
		 * For such cases, we mark segments so they aren't freed.
		 */
		while (len > 0) {
			struct rte_memseg *seg;

			seg = rte_mem_virt2memseg(addr, NULL);
			assert(seg != NULL);
			seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
			addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
			len -= seg->hugepage_sz;
		}
	} else if (event_type == RTE_MEM_EVENT_FREE) {
		spdk_mem_unregister((void *)addr, len);
	}
}

static int
memory_iter_cb(const struct rte_memseg_list *msl,
	       const struct rte_memseg *ms, size_t len, void *arg)
{
	return spdk_mem_register(ms->addr, len);
}

int
mem_map_init(bool legacy_mem)
{
	g_legacy_mem = legacy_mem;

	g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
	if (g_mem_reg_map == NULL) {
		DEBUG_PRINT("memory registration map allocation failed\n");
		return -ENOMEM;
	}

	/*
	 * Walk all DPDK memory segments and register them
	 * with the main memory map
	 */
	if (g_huge_pages) {
		rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
		rte_memseg_contig_walk(memory_iter_cb, NULL);
	}
	return 0;
}

bool
spdk_iommu_is_enabled(void)
{
#if VFIO_ENABLED
	return g_vfio.enabled && !g_vfio.noiommu_enabled;
#else
	return false;
#endif
}

struct spdk_vtophys_pci_device {
	struct rte_pci_device *pci_device;
	TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
};

static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
	TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);

static struct spdk_mem_map *g_vtophys_map;
static struct spdk_mem_map *g_phys_ref_map;

#if VFIO_ENABLED
static int
_vfio_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	dma_map = calloc(1, sizeof(*dma_map));
	if (dma_map == NULL) {
		return -ENOMEM;
	}

	dma_map->map.argsz = sizeof(dma_map->map);
	dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	dma_map->map.vaddr = vaddr;
	dma_map->map.iova = iova;
	dma_map->map.size = size;

	if (g_vfio.device_ref == 0) {
		/* VFIO requires at least one device (IOMMU group) to be added to
		 * a VFIO container before it is possible to perform any IOMMU
		 * operations on that container. This memory will be mapped once
		 * the first device (IOMMU group) is hotplugged.
		 *
		 * Since the vfio container is managed internally by DPDK, it is
		 * also possible that some device is already in that container, but
		 * it's not managed by SPDK - e.g. a NIC attached internally
		 * inside DPDK. We could map the memory straight away in such
		 * scenario, but there's no need to do it. DPDK devices clearly
		 * don't need our mappings and hence we defer the mapping
		 * unconditionally until the first SPDK-managed device is
		 * hotplugged.
		 */
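		/* Defer the VFIO_IOMMU_MAP_DMA ioctl; the saved entry is replayed
		 * in vtophys_pci_device_added() once an SPDK-managed device shows up.
		 */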
		goto out_insert;
	}

	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
	if (ret) {
		/* In some cases the vfio container doesn't have an IOMMU group; it's safe to ignore the error then */
		SPDK_NOTICELOG("Cannot set up DMA mapping, error %d, ignored\n", errno);
	}

out_insert:
	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
	return 0;
}

static int
vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	uint64_t refcount;
	int ret;

	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
	assert(refcount < UINT64_MAX);
	if (refcount > 0) {
		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
		return 0;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	ret = _vfio_iommu_map_dma(vaddr, iova, size);
	pthread_mutex_unlock(&g_vfio.mutex);
	if (ret) {
		return ret;
	}

	spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
	return 0;
}

int
vtophys_iommu_map_dma_bar(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	int ret;

	pthread_mutex_lock(&g_vfio.mutex);
	ret = _vfio_iommu_map_dma(vaddr, iova, size);
	pthread_mutex_unlock(&g_vfio.mutex);

	return ret;
}

static int
_vfio_iommu_unmap_dma(struct spdk_vfio_dma_map *dma_map)
{
	struct vfio_iommu_type1_dma_unmap unmap = {};
	int ret;

	if (g_vfio.device_ref == 0) {
		/* Memory is not mapped anymore, just remove its references */
		goto out_remove;
	}

	unmap.argsz = sizeof(unmap);
	unmap.flags = 0;
	unmap.iova = dma_map->map.iova;
	unmap.size = dma_map->map.size;
	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
	if (ret) {
		SPDK_NOTICELOG("Cannot clear DMA mapping, error %d, ignored\n", errno);
	}

out_remove:
	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
	free(dma_map);
	return 0;
}

static int
vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	uint64_t refcount;
	int ret;

	pthread_mutex_lock(&g_vfio.mutex);
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		if (dma_map->map.iova == iova) {
			break;
		}
	}

	if (dma_map == NULL) {
		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
		pthread_mutex_unlock(&g_vfio.mutex);
		return -ENXIO;
	}

	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
	assert(refcount < UINT64_MAX);
	if (refcount > 0) {
		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount - 1);
	}

	/* We still have outstanding references, don't clear it. */
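	/* Note that refcount still holds the value read before the decrement
	 * above, so the IOMMU mapping is only torn down once the last
	 * reference is dropped.
	 */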
	if (refcount > 1) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return 0;
	}

	/** don't support partial or multiple-page unmap for now */
	assert(dma_map->map.size == size);

	ret = _vfio_iommu_unmap_dma(dma_map);
	pthread_mutex_unlock(&g_vfio.mutex);

	return ret;
}

int
vtophys_iommu_unmap_dma_bar(uint64_t vaddr)
{
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	pthread_mutex_lock(&g_vfio.mutex);
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		if (dma_map->map.vaddr == vaddr) {
			break;
		}
	}

	if (dma_map == NULL) {
		DEBUG_PRINT("Cannot clear DMA mapping for address %"PRIx64" - it's not mapped\n", vaddr);
		pthread_mutex_unlock(&g_vfio.mutex);
		return -ENXIO;
	}

	ret = _vfio_iommu_unmap_dma(dma_map);
	pthread_mutex_unlock(&g_vfio.mutex);
	return ret;
}
#endif

static uint64_t
vtophys_get_paddr_memseg(uint64_t vaddr)
{
	uintptr_t paddr;
	struct rte_memseg *seg;

	seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
	if (seg != NULL) {
		paddr = seg->iova;
		if (paddr == RTE_BAD_IOVA) {
			return SPDK_VTOPHYS_ERROR;
		}
		paddr += (vaddr - (uintptr_t)seg->addr);
		return paddr;
	}

	return SPDK_VTOPHYS_ERROR;
}

/* Try to get the paddr from /proc/self/pagemap */
static uint64_t
vtophys_get_paddr_pagemap(uint64_t vaddr)
{
	uintptr_t paddr;

	/* Silence static analyzers */
	assert(vaddr != 0);
	paddr = rte_mem_virt2iova((void *)vaddr);
	if (paddr == RTE_BAD_IOVA) {
		/*
		 * The vaddr may be valid but doesn't have a backing page
		 * assigned yet. Touch the page to ensure a backing page
		 * gets assigned, then try to translate again.
		 */
		rte_atomic64_read((rte_atomic64_t *)vaddr);
		paddr = rte_mem_virt2iova((void *)vaddr);
	}
	if (paddr == RTE_BAD_IOVA) {
		/* Unable to get to the physical address. */
		return SPDK_VTOPHYS_ERROR;
	}

	return paddr;
}

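/* Translate a vaddr that falls inside one of the given PCI device's BARs.
 * Returns SPDK_VTOPHYS_ERROR if [vaddr, vaddr + len) is not fully contained
 * within any of the device's memory resources.
 */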
static uint64_t
pci_device_vtophys(struct rte_pci_device *dev, uint64_t vaddr, size_t len)
{
	struct rte_mem_resource *res;
	uint64_t paddr;
	unsigned r;

	for (r = 0; r < PCI_MAX_RESOURCE; r++) {
		res = dpdk_pci_device_get_mem_resource(dev, r);

		if (res->phys_addr == 0 || vaddr < (uint64_t)res->addr ||
		    (vaddr + len) >= (uint64_t)res->addr + res->len) {
			continue;
		}

#if VFIO_ENABLED
		if (spdk_iommu_is_enabled() && rte_eal_iova_mode() == RTE_IOVA_VA) {
			/*
			 * The IOMMU is on and we're using IOVA == VA. The BAR was
			 * automatically registered when it was mapped, so just return
			 * the virtual address here.
			 */
			return vaddr;
		}
#endif
		paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
		return paddr;
	}

	return SPDK_VTOPHYS_ERROR;
}

/* Try to get the paddr from pci devices */
static uint64_t
vtophys_get_paddr_pci(uint64_t vaddr, size_t len)
{
	struct spdk_vtophys_pci_device *vtophys_dev;
	uintptr_t paddr;
	struct rte_pci_device *dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		dev = vtophys_dev->pci_device;
		paddr = pci_device_vtophys(dev, vaddr, len);
		if (paddr != SPDK_VTOPHYS_ERROR) {
			pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
			return paddr;
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

	return SPDK_VTOPHYS_ERROR;
}

static int
vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
	       enum spdk_mem_map_notify_action action,
	       void *vaddr, size_t len)
{
	int rc = 0;
	uint64_t paddr;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid parameters, vaddr=%p len=%ju\n",
			    vaddr, len);
		return -EINVAL;
	}

	/* Get the physical address from the DPDK memsegs */
	paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);

	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		if (paddr == SPDK_VTOPHYS_ERROR) {
			/* This is not an address that DPDK is managing. */

			/* Check if this is a PCI BAR. They need special handling */
			paddr = vtophys_get_paddr_pci((uint64_t)vaddr, len);
			if (paddr != SPDK_VTOPHYS_ERROR) {
				/* Get paddr for each 2MB chunk in this address range */
				while (len > 0) {
					paddr = vtophys_get_paddr_pci((uint64_t)vaddr, VALUE_2MB);
					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}

					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}

					vaddr += VALUE_2MB;
					len -= VALUE_2MB;
				}

				return 0;
			}

#if VFIO_ENABLED
			enum rte_iova_mode iova_mode;

			iova_mode = rte_eal_iova_mode();

			if (spdk_iommu_is_enabled() && iova_mode == RTE_IOVA_VA) {
				/* We'll use the virtual address as the iova to match DPDK. */
				paddr = (uint64_t)vaddr;
				rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len);
				if (rc) {
					return -EFAULT;
				}
				while (len > 0) {
					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}
					vaddr += VALUE_2MB;
					paddr += VALUE_2MB;
					len -= VALUE_2MB;
				}
			} else
#endif
			{
				/* Get the physical address from /proc/self/pagemap. */
				paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
				if (paddr == SPDK_VTOPHYS_ERROR) {
					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
					return -EFAULT;
				}

				/* Get paddr for each 2MB chunk in this address range */
				while (len > 0) {
					/* Get the physical address from /proc/self/pagemap. */
					paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);

					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}

					if (paddr & MASK_2MB) {
						DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
						return -EINVAL;
					}
#if VFIO_ENABLED
					/* If the IOMMU is on, but DPDK is using iova-mode=pa, we want to register this memory
					 * with the IOMMU using the physical address to match. */
					if (spdk_iommu_is_enabled()) {
						rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
						if (rc) {
							DEBUG_PRINT("Unable to assign vaddr %p to paddr 0x%" PRIx64 "\n", vaddr, paddr);
							return -EFAULT;
						}
					}
#endif

					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}

					vaddr += VALUE_2MB;
					len -= VALUE_2MB;
				}
			}
		} else {
			/* This is an address managed by DPDK. Just setup the translations. */
			while (len > 0) {
				paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
				if (paddr == SPDK_VTOPHYS_ERROR) {
					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
					return -EFAULT;
				}

				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
				if (rc != 0) {
					return rc;
				}

				vaddr += VALUE_2MB;
				len -= VALUE_2MB;
			}
		}

		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
#if VFIO_ENABLED
		if (paddr == SPDK_VTOPHYS_ERROR) {
			/*
			 * This is not an address that DPDK is managing.
			 */

			/* Check if this is a PCI BAR. They need special handling */
			paddr = vtophys_get_paddr_pci((uint64_t)vaddr, len);
			if (paddr != SPDK_VTOPHYS_ERROR) {
				/* Get paddr for each 2MB chunk in this address range */
				while (len > 0) {
					paddr = vtophys_get_paddr_pci((uint64_t)vaddr, VALUE_2MB);
					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}

					rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
					if (rc != 0) {
						return rc;
					}

					vaddr += VALUE_2MB;
					len -= VALUE_2MB;
				}

				return 0;
			}

			/* If vfio is enabled,
			 * we need to unmap the range from the IOMMU
			 */
			if (spdk_iommu_is_enabled()) {
				uint64_t buffer_len = len;
				uint8_t *va = vaddr;
				enum rte_iova_mode iova_mode;

				iova_mode = rte_eal_iova_mode();
				/*
				 * In virtual address mode, the region is contiguous and can be done in
				 * one unmap.
				 */
				if (iova_mode == RTE_IOVA_VA) {
					paddr = spdk_mem_map_translate(map, (uint64_t)va, &buffer_len);
					if (buffer_len != len || paddr != (uintptr_t)va) {
						DEBUG_PRINT("Unmapping %p with length %lu failed because "
							    "translation had address 0x%" PRIx64 " and length %lu\n",
							    va, len, paddr, buffer_len);
						return -EINVAL;
					}
					rc = vtophys_iommu_unmap_dma(paddr, len);
					if (rc) {
						DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
						return -EFAULT;
					}
				} else if (iova_mode == RTE_IOVA_PA) {
					/* Get paddr for each 2MB chunk in this address range */
					while (buffer_len > 0) {
						paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);

						if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
							DEBUG_PRINT("could not get phys addr for %p\n", va);
							return -EFAULT;
						}

						rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
						if (rc) {
							DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
							return -EFAULT;
						}

						va += VALUE_2MB;
						buffer_len -= VALUE_2MB;
					}
				}
			}
		}
#endif
		while (len > 0) {
			rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
			if (rc != 0) {
				return rc;
			}

			vaddr += VALUE_2MB;
			len -= VALUE_2MB;
		}

		break;
	default:
		SPDK_UNREACHABLE();
	}

	return rc;
}

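/* Wired in as the vtophys map's are_contiguous callback (see vtophys_init()),
 * which lets spdk_mem_map_translate() merge physically contiguous 2MB chunks
 * and report runs longer than a single page to spdk_vtophys() callers.
 */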
static int
vtophys_check_contiguous_entries(uint64_t paddr1, uint64_t paddr2)
{
	/* This function is always called with paddrs for two subsequent
	 * 2MB chunks in virtual address space, so those chunks will be only
	 * physically contiguous if the physical addresses are 2MB apart
	 * from each other as well.
	 */
	return (paddr2 - paddr1 == VALUE_2MB);
}

#if VFIO_ENABLED

static bool
vfio_enabled(void)
{
	return rte_vfio_is_enabled("vfio_pci");
}

/* Check if IOMMU is enabled on the system */
static bool
has_iommu_groups(void)
{
	int count = 0;
	DIR *dir = opendir("/sys/kernel/iommu_groups");

	if (dir == NULL) {
		return false;
	}

	while (count < 3 && readdir(dir) != NULL) {
		count++;
	}

	closedir(dir);
	/* there will always be ./ and ../ entries */
	return count > 2;
}

static bool
vfio_noiommu_enabled(void)
{
	return rte_vfio_noiommu_is_enabled();
}

static void
vtophys_iommu_init(void)
{
	char proc_fd_path[PATH_MAX + 1];
	char link_path[PATH_MAX + 1];
	const char vfio_path[] = "/dev/vfio/vfio";
	DIR *dir;
	struct dirent *d;

	if (!vfio_enabled()) {
		return;
	}

	if (vfio_noiommu_enabled()) {
		g_vfio.noiommu_enabled = true;
	} else if (!has_iommu_groups()) {
		return;
	}

	dir = opendir("/proc/self/fd");
	if (!dir) {
		DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
		return;
	}

	while ((d = readdir(dir)) != NULL) {
		if (d->d_type != DT_LNK) {
			continue;
		}

		snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
		if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
			continue;
		}

		if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
			sscanf(d->d_name, "%d", &g_vfio.fd);
			break;
		}
	}

	closedir(dir);

	if (g_vfio.fd < 0) {
		DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
		return;
	}

	g_vfio.enabled = true;

	return;
}

#endif

void
vtophys_pci_device_added(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);

	vtophys_dev = calloc(1, sizeof(*vtophys_dev));
	if (vtophys_dev) {
		vtophys_dev->pci_device = pci_device;
		TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
	} else {
		DEBUG_PRINT("Memory allocation error\n");
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	g_vfio.device_ref++;
	if (g_vfio.device_ref > 1) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the first SPDK device using DPDK vfio. This means that the first
	 * IOMMU group might have just been added to the DPDK vfio container.
	 * From this point it is certain that the memory can be mapped now.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
		if (ret) {
			DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}

void
vtophys_pci_device_removed(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		if (vtophys_dev->pci_device == pci_device) {
			TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
			free(vtophys_dev);
			break;
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	assert(g_vfio.device_ref > 0);
	g_vfio.device_ref--;
	if (g_vfio.device_ref > 0) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
	 * any additional devices using its vfio container, all the mappings
	 * will be automatically removed by the Linux vfio driver. We unmap
	 * the memory manually to be able to easily re-map it later regardless
	 * of other, external factors.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		struct vfio_iommu_type1_dma_unmap unmap = {};
		unmap.argsz = sizeof(unmap);
		unmap.flags = 0;
		unmap.iova = dma_map->map.iova;
		unmap.size = dma_map->map.size;
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
		if (ret) {
			DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}

int
vtophys_init(void)
{
	const struct spdk_mem_map_ops vtophys_map_ops = {
		.notify_cb = vtophys_notify,
		.are_contiguous = vtophys_check_contiguous_entries,
	};

	const struct spdk_mem_map_ops phys_ref_map_ops = {
		.notify_cb = NULL,
		.are_contiguous = NULL,
	};

#if VFIO_ENABLED
	vtophys_iommu_init();
#endif

	g_phys_ref_map = spdk_mem_map_alloc(0, &phys_ref_map_ops, NULL);
	if (g_phys_ref_map == NULL) {
		DEBUG_PRINT("phys_ref map allocation failed.\n");
		return -ENOMEM;
	}

	if (g_huge_pages) {
		g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
		if (g_vtophys_map == NULL) {
			DEBUG_PRINT("vtophys map allocation failed\n");
			spdk_mem_map_free(&g_phys_ref_map);
			return -ENOMEM;
		}
	}
	return 0;
}

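/*
 * Illustrative use of spdk_vtophys() below (hypothetical buffer): the size
 * argument is optional and, when provided, is updated to the number of bytes
 * starting at buf that are physically contiguous:
 *
 *	uint64_t len = buf_len;
 *	uint64_t phys = spdk_vtophys(buf, &len);
 *	if (phys == SPDK_VTOPHYS_ERROR) {
 *		... buf is not registered with SPDK ...
 *	}
 */
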
uint64_t
spdk_vtophys(const void *buf, uint64_t *size)
{
	uint64_t vaddr, paddr_2mb;

	if (!g_huge_pages) {
		return SPDK_VTOPHYS_ERROR;
	}

	vaddr = (uint64_t)buf;
	paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);

	/*
	 * SPDK_VTOPHYS_ERROR has all bits set. When the page offset used to be
	 * bitwise-or'd into the translation, an error lookup still produced
	 * SPDK_VTOPHYS_ERROR. Now that we add the offset instead (PCI vtophys
	 * results may be unaligned), we must check the return value before
	 * the addition.
	 */
	SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
	if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
		return SPDK_VTOPHYS_ERROR;
	} else {
		return paddr_2mb + (vaddr & MASK_2MB);
	}
}

int
spdk_mem_get_fd_and_offset(void *vaddr, uint64_t *offset)
{
	struct rte_memseg *seg;
	int ret, fd;

	seg = rte_mem_virt2memseg(vaddr, NULL);
	if (!seg) {
		SPDK_ERRLOG("memory %p doesn't exist\n", vaddr);
		return -ENOENT;
	}

	fd = rte_memseg_get_fd_thread_unsafe(seg);
	if (fd < 0) {
		return fd;
	}

	ret = rte_memseg_get_fd_offset_thread_unsafe(seg, offset);
	if (ret < 0) {
		return ret;
	}

	return fd;
}

void
mem_disable_huge_pages(void)
{
	g_huge_pages = false;
}