/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2017 Intel Corporation.
 * All rights reserved.
 */

#include "spdk/stdinc.h"

#include "env_internal.h"
#include "pci_dpdk.h"

#include <rte_config.h>
#include <rte_memory.h>
#include <rte_eal_memconfig.h>
#include <rte_dev.h>
#include <rte_pci.h>

#include "spdk_internal/assert.h"

#include "spdk/assert.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/util.h"
#include "spdk/memory.h"
#include "spdk/env_dpdk.h"
#include "spdk/log.h"

#ifdef __linux__
#include <linux/version.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
#include <linux/vfio.h>
#include <rte_vfio.h>

struct spdk_vfio_dma_map {
	struct vfio_iommu_type1_dma_map map;
	TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
};

struct vfio_cfg {
	int fd;
	bool enabled;
	bool noiommu_enabled;
	unsigned device_ref;
	TAILQ_HEAD(, spdk_vfio_dma_map) maps;
	pthread_mutex_t mutex;
};

static struct vfio_cfg g_vfio = {
	.fd = -1,
	.enabled = false,
	.noiommu_enabled = false,
	.device_ref = 0,
	.maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
	.mutex = PTHREAD_MUTEX_INITIALIZER
};
#endif
#endif

#if DEBUG
#define DEBUG_PRINT(...) SPDK_ERRLOG(__VA_ARGS__)
#else
#define DEBUG_PRINT(...)
#endif

#define FN_2MB_TO_4KB(fn)	(fn << (SHIFT_2MB - SHIFT_4KB))
#define FN_4KB_TO_2MB(fn)	(fn >> (SHIFT_2MB - SHIFT_4KB))

#define MAP_256TB_IDX(vfn_2mb)	((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
#define MAP_1GB_IDX(vfn_2mb)	((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))

/* Page is registered */
#define REG_MAP_REGISTERED	(1ULL << 62)

/* A notification region barrier. The 2MB translation entry that's marked
 * with this flag must be unregistered separately. This allows contiguous
 * regions to be unregistered in the same chunks they were registered.
 */
#define REG_MAP_NOTIFY_START	(1ULL << 63)

/* Translation of a single 2MB page. */
struct map_2mb {
	uint64_t translation_2mb;
};

/* Second-level map table indexed by bits [21..29] of the virtual address.
 * Each entry contains the address translation or error for entries that haven't
 * been retrieved yet.
 */
struct map_1gb {
	struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
};

/* Top-level map table indexed by bits [30..47] of the virtual address.
 * Each entry points to a second-level map table or NULL.
 */
struct map_256tb {
	struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
};

/* Page-granularity memory address translation */
struct spdk_mem_map {
	struct map_256tb map_256tb;
	pthread_mutex_t mutex;
	uint64_t default_translation;
	struct spdk_mem_map_ops ops;
	void *cb_ctx;
	TAILQ_ENTRY(spdk_mem_map) tailq;
};
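
/* Illustrative sketch (not part of the upstream code) of how a virtual address
 * is decomposed when walking the two-level table above; the variable names are
 * for the example only.
 *
 *	uint64_t vaddr = (uint64_t)some_buf;        // hypothetical buffer address
 *	uint64_t vfn_2mb = vaddr >> SHIFT_2MB;      // 2MB virtual frame number
 *	size_t idx_256tb = MAP_256TB_IDX(vfn_2mb);  // index into map_256tb.map[]
 *	uint64_t idx_1gb = MAP_1GB_IDX(vfn_2mb);    // index into map_1gb->map[]
 *	// translation = map->map_256tb.map[idx_256tb]->map[idx_1gb].translation_2mb
 */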

/* Registrations map. The 64-bit translations are bit fields with the
 * following layout (starting with the low bits):
 *    0 - 61 : reserved
 *   62 - 63 : flags
 */
static struct spdk_mem_map *g_mem_reg_map;
static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps =
	TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;

static bool g_legacy_mem;
static bool g_huge_pages = true;

/*
 * Walk the currently registered memory via the main memory registration map
 * and call the new map's notify callback for each virtually contiguous region.
 */
static int
mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
{
	size_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t contig_start = UINT64_MAX;
	uint64_t contig_end = UINT64_MAX;
	struct map_1gb *map_1gb;
	int rc;

	if (!g_mem_reg_map) {
		return -EINVAL;
	}

	/* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
	pthread_mutex_lock(&g_mem_reg_map->mutex);

	for (idx_256tb = 0;
	     idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
	     idx_256tb++) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_start != UINT64_MAX) {
				/* End of a virtually contiguous range */
				rc = map->ops.notify_cb(map->cb_ctx, map, action,
							(void *)contig_start,
							contig_end - contig_start + VALUE_2MB);
				/* Don't bother handling unregister failures. It can't be any worse */
				if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
					goto err_unregister;
				}
			}
			contig_start = UINT64_MAX;
			continue;
		}

		for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_start == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_start == UINT64_MAX) {
					contig_start = vaddr;
				}

				contig_end = vaddr;
			} else {
				if (contig_start != UINT64_MAX) {
					/* End of a virtually contiguous range */
					rc = map->ops.notify_cb(map->cb_ctx, map, action,
								(void *)contig_start,
								contig_end - contig_start + VALUE_2MB);
					/* Don't bother handling unregister failures. It can't be any worse */
					if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
						goto err_unregister;
					}

					/* This page might be a part of a neighbour region, so process
					 * it again. The idx_1gb will be incremented immediately.
					 */
					idx_1gb--;
				}
				contig_start = UINT64_MAX;
			}
		}
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return 0;

err_unregister:
	/* Unwind to the first empty translation so we don't unregister
	 * a region that just failed to register.
	 */
	idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
	idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
	contig_start = UINT64_MAX;
	contig_end = UINT64_MAX;

	/* Unregister any memory we managed to register before the failure */
	for (; idx_256tb < SIZE_MAX; idx_256tb--) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_end != UINT64_MAX) {
				/* End of a virtually contiguous range */
				map->ops.notify_cb(map->cb_ctx, map,
						   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
						   (void *)contig_start,
						   contig_end - contig_start + VALUE_2MB);
			}
			contig_end = UINT64_MAX;
			continue;
		}

		for (; idx_1gb < UINT64_MAX; idx_1gb--) {
			/* Rebuild the virtual address from the indexes */
			uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {

				if (contig_end == UINT64_MAX) {
					contig_end = vaddr;
				}
				contig_start = vaddr;
			} else {
				if (contig_end != UINT64_MAX) {
					if (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) {
						contig_start = vaddr;
					}
					/* End of a virtually contiguous range */
					map->ops.notify_cb(map->cb_ctx, map,
							   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
							   (void *)contig_start,
							   contig_end - contig_start + VALUE_2MB);
				}
				contig_end = UINT64_MAX;
			}
		}
		idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return rc;
}

struct spdk_mem_map *
spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
{
	struct spdk_mem_map *map;
	int rc;
	size_t i;

	map = calloc(1, sizeof(*map));
	if (map == NULL) {
		return NULL;
	}

	if (pthread_mutex_init(&map->mutex, NULL)) {
		free(map);
		return NULL;
	}

	map->default_translation = default_translation;
	map->cb_ctx = cb_ctx;
	if (ops) {
		map->ops = *ops;
	}

	if (ops && ops->notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		rc = mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			DEBUG_PRINT("Initial mem_map notify failed\n");
			pthread_mutex_destroy(&map->mutex);
			for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
				free(map->map_256tb.map[i]);
			}
			free(map);
			return NULL;
		}
		TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	return map;
}

void
spdk_mem_map_free(struct spdk_mem_map **pmap)
{
	struct spdk_mem_map *map;
	size_t i;

	if (!pmap) {
		return;
	}

	map = *pmap;

	if (!map) {
		return;
	}

	if (map->ops.notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
		TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
		free(map->map_256tb.map[i]);
	}

	pthread_mutex_destroy(&map->mutex);

	free(map);
	*pmap = NULL;
}
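
/* Minimal usage sketch for spdk_mem_map_alloc()/spdk_mem_map_free(); the
 * callback, ops and default translation below are hypothetical examples, not
 * part of this file.
 *
 *	static int my_notify(void *cb_ctx, struct spdk_mem_map *map,
 *			     enum spdk_mem_map_notify_action action,
 *			     void *vaddr, size_t len)
 *	{
 *		// called once per virtually contiguous registered region
 *		return 0;
 *	}
 *
 *	static const struct spdk_mem_map_ops my_ops = {
 *		.notify_cb = my_notify,
 *		.are_contiguous = NULL,
 *	};
 *
 *	struct spdk_mem_map *m = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &my_ops, NULL);
 *	...
 *	spdk_mem_map_free(&m);
 */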

int
spdk_mem_register(void *_vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	uint64_t vaddr = (uintptr_t)_vaddr;
	uint64_t seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %jx\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%jx len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	seg_vaddr = vaddr;
	seg_len = 0;
	while (len > 0) {
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
					     seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
		seg_len += VALUE_2MB;
		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER,
					(void *)seg_vaddr, seg_len);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return rc;
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

int
spdk_mem_unregister(void *_vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	uint64_t vaddr = (uintptr_t)_vaddr;
	uint64_t seg_vaddr;
	size_t seg_len;
	uint64_t reg, newreg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %jx\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%jx len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* The first page must be a start of a region. Also check if it's
	 * registered to make sure we don't return -ERANGE for non-registered
	 * regions.
	 */
	reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
	if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if ((reg & REG_MAP_REGISTERED) == 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EINVAL;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
	/* If the next page is registered, it must be a start of a region as well,
	 * otherwise we'd be unregistering only a part of a region.
	 */
	if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}
	seg_vaddr = vaddr;
	seg_len = 0;

	while (len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);

		if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
			TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
				rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER,
							(void *)seg_vaddr, seg_len);
				if (rc != 0) {
					pthread_mutex_unlock(&g_spdk_mem_map_mutex);
					return rc;
				}
			}

			seg_vaddr = vaddr;
			seg_len = VALUE_2MB;
		} else {
			seg_len += VALUE_2MB;
		}

		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	if (seg_len > 0) {
		TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
			rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER,
						(void *)seg_vaddr, seg_len);
			if (rc != 0) {
				pthread_mutex_unlock(&g_spdk_mem_map_mutex);
				return rc;
			}
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

int
spdk_mem_reserve(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* Check if any part of this range is already registered */
	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	/* Simply set the translation to the memory map's default. This allocates the space in the
	 * map but does not provide a valid translation. */
	spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, len,
				     g_mem_reg_map->default_translation);

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, map->default_translation);
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}
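
/* Minimal usage sketch for spdk_mem_register()/spdk_mem_unregister(); the
 * allocator below is hypothetical. Both address and length must be 2MB
 * aligned, and a region must be unregistered in the same chunk it was
 * registered.
 *
 *	void *buf = my_alloc_2mb_aligned(4 * VALUE_2MB);   // hypothetical allocator
 *	int rc = spdk_mem_register(buf, 4 * VALUE_2MB);    // notifies every mem map
 *	...
 *	rc = spdk_mem_unregister(buf, 4 * VALUE_2MB);
 */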

static struct map_1gb *
mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
{
	struct map_1gb *map_1gb;
	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
	size_t i;

	if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
		return NULL;
	}

	map_1gb = map->map_256tb.map[idx_256tb];

	if (!map_1gb) {
		pthread_mutex_lock(&map->mutex);

		/* Recheck to make sure nobody else got the mutex first. */
		map_1gb = map->map_256tb.map[idx_256tb];
		if (!map_1gb) {
			map_1gb = malloc(sizeof(struct map_1gb));
			if (map_1gb) {
				/* initialize all entries to default translation */
				for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
					map_1gb->map[i].translation_2mb = map->default_translation;
				}
				map->map_256tb.map[idx_256tb] = map_1gb;
			}
		}

		pthread_mutex_unlock(&map->mutex);

		if (!map_1gb) {
			DEBUG_PRINT("allocation failed\n");
			return NULL;
		}
	}

	return map_1gb;
}

int
spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
			     uint64_t translation)
{
	uint64_t vfn_2mb;
	struct map_1gb *map_1gb;
	uint64_t idx_1gb;
	struct map_2mb *map_2mb;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %" PRIu64 "\n", vaddr);
		return -EINVAL;
	}

	/* For now, only 2 MB-aligned registrations are supported */
	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%" PRIu64 " len=%" PRIu64 "\n",
			    __func__, vaddr, size);
		return -EINVAL;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;

	while (size) {
		map_1gb = mem_map_get_map_1gb(map, vfn_2mb);
		if (!map_1gb) {
			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
			return -ENOMEM;
		}

		idx_1gb = MAP_1GB_IDX(vfn_2mb);
		map_2mb = &map_1gb->map[idx_1gb];
		map_2mb->translation_2mb = translation;

		size -= VALUE_2MB;
		vfn_2mb++;
	}

	return 0;
}

int
spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
{
	return spdk_mem_map_set_translation(map, vaddr, size, map->default_translation);
}

inline uint64_t
spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
{
	const struct map_1gb *map_1gb;
	const struct map_2mb *map_2mb;
	uint64_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t vfn_2mb;
	uint64_t cur_size;
	uint64_t prev_translation;
	uint64_t orig_translation;

	if (spdk_unlikely(vaddr & ~MASK_256TB)) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
		return map->default_translation;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;
	idx_256tb = MAP_256TB_IDX(vfn_2mb);
	idx_1gb = MAP_1GB_IDX(vfn_2mb);

	map_1gb = map->map_256tb.map[idx_256tb];
	if (spdk_unlikely(!map_1gb)) {
		return map->default_translation;
	}

	cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
	map_2mb = &map_1gb->map[idx_1gb];
	if (size == NULL || map->ops.are_contiguous == NULL ||
	    map_2mb->translation_2mb == map->default_translation) {
		if (size != NULL) {
			*size = spdk_min(*size, cur_size);
		}
		return map_2mb->translation_2mb;
	}

	orig_translation = map_2mb->translation_2mb;
	prev_translation = orig_translation;
	while (cur_size < *size) {
		vfn_2mb++;
		idx_256tb = MAP_256TB_IDX(vfn_2mb);
		idx_1gb = MAP_1GB_IDX(vfn_2mb);

		map_1gb = map->map_256tb.map[idx_256tb];
		if (spdk_unlikely(!map_1gb)) {
			break;
		}

		map_2mb = &map_1gb->map[idx_1gb];
		if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
			break;
		}

		cur_size += VALUE_2MB;
		prev_translation = map_2mb->translation_2mb;
	}

	*size = spdk_min(*size, cur_size);
	return orig_translation;
}
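
/* Usage sketch for spdk_mem_map_translate() with the optional size argument;
 * the names are illustrative only. On input *size is the length the caller
 * cares about; on output it is clamped to how many of those bytes share a
 * contiguous translation starting at buf (2MB entries are only merged when the
 * map provides an are_contiguous callback).
 *
 *	uint64_t len = buf_len;
 *	uint64_t tr = spdk_mem_map_translate(map, (uint64_t)buf, &len);
 *	if (tr == default_translation_passed_at_alloc) {
 *		// address not registered in this map
 *	}
 */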

static void
memory_hotplug_cb(enum rte_mem_event event_type,
		  const void *addr, size_t len, void *arg)
{
	if (event_type == RTE_MEM_EVENT_ALLOC) {
		spdk_mem_register((void *)addr, len);

		if (!spdk_env_dpdk_external_init()) {
			return;
		}

		/* When the user initialized DPDK separately, we can't
		 * be sure that --match-allocations RTE flag was specified.
		 * Without this flag, DPDK can free memory in different units
		 * than it was allocated. It doesn't work with things like RDMA MRs.
		 *
		 * For such cases, we mark segments so they aren't freed.
		 */
		while (len > 0) {
			struct rte_memseg *seg;

			seg = rte_mem_virt2memseg(addr, NULL);
			assert(seg != NULL);
			seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
			addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
			len -= seg->hugepage_sz;
		}
	} else if (event_type == RTE_MEM_EVENT_FREE) {
		spdk_mem_unregister((void *)addr, len);
	}
}

static int
memory_iter_cb(const struct rte_memseg_list *msl,
	       const struct rte_memseg *ms, size_t len, void *arg)
{
	return spdk_mem_register(ms->addr, len);
}

int
mem_map_init(bool legacy_mem)
{
	g_legacy_mem = legacy_mem;

	g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
	if (g_mem_reg_map == NULL) {
		DEBUG_PRINT("memory registration map allocation failed\n");
		return -ENOMEM;
	}

	/*
	 * Walk all DPDK memory segments and register them
	 * with the main memory map
	 */
	if (g_huge_pages) {
		rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
		rte_memseg_contig_walk(memory_iter_cb, NULL);
	}
	return 0;
}

bool
spdk_iommu_is_enabled(void)
{
#if VFIO_ENABLED
	return g_vfio.enabled && !g_vfio.noiommu_enabled;
#else
	return false;
#endif
}

struct spdk_vtophys_pci_device {
	struct rte_pci_device *pci_device;
	TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
};

static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
	TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);

static struct spdk_mem_map *g_vtophys_map;
static struct spdk_mem_map *g_phys_ref_map;

#if VFIO_ENABLED
static int
_vfio_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	dma_map = calloc(1, sizeof(*dma_map));
	if (dma_map == NULL) {
		return -ENOMEM;
	}

	dma_map->map.argsz = sizeof(dma_map->map);
	dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	dma_map->map.vaddr = vaddr;
	dma_map->map.iova = iova;
	dma_map->map.size = size;

	if (g_vfio.device_ref == 0) {
		/* VFIO requires at least one device (IOMMU group) to be added to
		 * a VFIO container before it is possible to perform any IOMMU
		 * operations on that container. This memory will be mapped once
		 * the first device (IOMMU group) is hotplugged.
		 *
		 * Since the vfio container is managed internally by DPDK, it is
		 * also possible that some device is already in that container, but
		 * it's not managed by SPDK - e.g. a NIC attached internally
		 * inside DPDK. We could map the memory straight away in such
		 * scenario, but there's no need to do it. DPDK devices clearly
		 * don't need our mappings and hence we defer the mapping
		 * unconditionally until the first SPDK-managed device is
		 * hotplugged.
		 */
		goto out_insert;
	}

	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
	if (ret) {
		/* There are cases where the vfio container has no IOMMU group
		 * attached; the failure is safe to ignore in that case. */
		SPDK_NOTICELOG("Cannot set up DMA mapping, error %d, ignored\n", errno);
	}

out_insert:
	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
	return 0;
}

static int
vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	uint64_t refcount;
	int ret;

	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
	assert(refcount < UINT64_MAX);
	if (refcount > 0) {
		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
		return 0;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	ret = _vfio_iommu_map_dma(vaddr, iova, size);
	pthread_mutex_unlock(&g_vfio.mutex);
	if (ret) {
		return ret;
	}

	spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
	return 0;
}
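
/* Note on the reference counting above: g_phys_ref_map tracks, per IOVA, how
 * many registrations currently rely on a VFIO mapping. Only the first
 * reference issues VFIO_IOMMU_MAP_DMA and only the last one (see
 * vtophys_iommu_unmap_dma below) issues VFIO_IOMMU_UNMAP_DMA; intermediate
 * calls just adjust the stored count.
 */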

int
vtophys_iommu_map_dma_bar(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	int ret;

	pthread_mutex_lock(&g_vfio.mutex);
	ret = _vfio_iommu_map_dma(vaddr, iova, size);
	pthread_mutex_unlock(&g_vfio.mutex);

	return ret;
}

static int
_vfio_iommu_unmap_dma(struct spdk_vfio_dma_map *dma_map)
{
	struct vfio_iommu_type1_dma_unmap unmap = {};
	int ret;

	if (g_vfio.device_ref == 0) {
		/* Memory is not mapped anymore, just remove its references */
		goto out_remove;
	}

	unmap.argsz = sizeof(unmap);
	unmap.flags = 0;
	unmap.iova = dma_map->map.iova;
	unmap.size = dma_map->map.size;
	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
	if (ret) {
		SPDK_NOTICELOG("Cannot clear DMA mapping, error %d, ignored\n", errno);
	}

out_remove:
	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
	free(dma_map);
	return 0;
}

static int
vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	uint64_t refcount;
	int ret;

	pthread_mutex_lock(&g_vfio.mutex);
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		if (dma_map->map.iova == iova) {
			break;
		}
	}

	if (dma_map == NULL) {
		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
		pthread_mutex_unlock(&g_vfio.mutex);
		return -ENXIO;
	}

	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
	assert(refcount < UINT64_MAX);
	if (refcount > 0) {
		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount - 1);
	}

	/* We still have outstanding references, don't clear it. */
	if (refcount > 1) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return 0;
	}

	/* don't support partial or multiple-page unmap for now */
	assert(dma_map->map.size == size);

	ret = _vfio_iommu_unmap_dma(dma_map);
	pthread_mutex_unlock(&g_vfio.mutex);

	return ret;
}

int
vtophys_iommu_unmap_dma_bar(uint64_t vaddr)
{
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	pthread_mutex_lock(&g_vfio.mutex);
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		if (dma_map->map.vaddr == vaddr) {
			break;
		}
	}

	if (dma_map == NULL) {
		DEBUG_PRINT("Cannot clear DMA mapping for address %"PRIx64" - it's not mapped\n", vaddr);
		pthread_mutex_unlock(&g_vfio.mutex);
		return -ENXIO;
	}

	ret = _vfio_iommu_unmap_dma(dma_map);
	pthread_mutex_unlock(&g_vfio.mutex);
	return ret;
}
#endif

static uint64_t
vtophys_get_paddr_memseg(uint64_t vaddr)
{
	uintptr_t paddr;
	struct rte_memseg *seg;

	seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
	if (seg != NULL) {
		paddr = seg->iova;
		if (paddr == RTE_BAD_IOVA) {
			return SPDK_VTOPHYS_ERROR;
		}
		paddr += (vaddr - (uintptr_t)seg->addr);
		return paddr;
	}

	return SPDK_VTOPHYS_ERROR;
}

/* Try to get the paddr from /proc/self/pagemap */
static uint64_t
vtophys_get_paddr_pagemap(uint64_t vaddr)
{
	uintptr_t paddr;

	/* Silence static analyzers */
	assert(vaddr != 0);
	paddr = rte_mem_virt2iova((void *)vaddr);
	if (paddr == RTE_BAD_IOVA) {
		/*
		 * The vaddr may be valid but doesn't have a backing page
		 * assigned yet. Touch the page to ensure a backing page
		 * gets assigned, then try to translate again.
		 */
		rte_atomic64_read((rte_atomic64_t *)vaddr);
		paddr = rte_mem_virt2iova((void *)vaddr);
	}
	if (paddr == RTE_BAD_IOVA) {
		/* Unable to get to the physical address. */
		return SPDK_VTOPHYS_ERROR;
	}

	return paddr;
}

static uint64_t
pci_device_vtophys(struct rte_pci_device *dev, uint64_t vaddr, size_t len)
{
	struct rte_mem_resource *res;
	uint64_t paddr;
	unsigned r;

	for (r = 0; r < PCI_MAX_RESOURCE; r++) {
		res = dpdk_pci_device_get_mem_resource(dev, r);

		if (res->phys_addr == 0 || vaddr < (uint64_t)res->addr ||
		    (vaddr + len) >= (uint64_t)res->addr + res->len) {
			continue;
		}

#if VFIO_ENABLED
		if (spdk_iommu_is_enabled() && rte_eal_iova_mode() == RTE_IOVA_VA) {
			/*
			 * The IOMMU is on and we're using IOVA == VA. The BAR was
			 * automatically registered when it was mapped, so just return
			 * the virtual address here.
			 */
			return vaddr;
		}
#endif
		paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
		return paddr;
	}

	return SPDK_VTOPHYS_ERROR;
}

/* Try to get the paddr from pci devices */
static uint64_t
vtophys_get_paddr_pci(uint64_t vaddr, size_t len)
{
	struct spdk_vtophys_pci_device *vtophys_dev;
	uintptr_t paddr;
	struct rte_pci_device *dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		dev = vtophys_dev->pci_device;
		paddr = pci_device_vtophys(dev, vaddr, len);
		if (paddr != SPDK_VTOPHYS_ERROR) {
			pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
			return paddr;
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

	return SPDK_VTOPHYS_ERROR;
}

static int
vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
	       enum spdk_mem_map_notify_action action,
	       void *vaddr, size_t len)
{
	int rc = 0;
	uint64_t paddr;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid parameters, vaddr=%p len=%ju\n",
			    vaddr, len);
		return -EINVAL;
	}

	/* Get the physical address from the DPDK memsegs */
	paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);

	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		if (paddr == SPDK_VTOPHYS_ERROR) {
			/* This is not an address that DPDK is managing. */

			/* Check if this is a PCI BAR. They need special handling */
			paddr = vtophys_get_paddr_pci((uint64_t)vaddr, len);
			if (paddr != SPDK_VTOPHYS_ERROR) {
				/* Get paddr for each 2MB chunk in this address range */
				while (len > 0) {
					paddr = vtophys_get_paddr_pci((uint64_t)vaddr, VALUE_2MB);
					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}

					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}

					vaddr += VALUE_2MB;
					len -= VALUE_2MB;
				}

				return 0;
			}

#if VFIO_ENABLED
			enum rte_iova_mode iova_mode;

			iova_mode = rte_eal_iova_mode();

			if (spdk_iommu_is_enabled() && iova_mode == RTE_IOVA_VA) {
				/* We'll use the virtual address as the iova to match DPDK. */
				paddr = (uint64_t)vaddr;
				rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len);
				if (rc) {
					return -EFAULT;
				}
				while (len > 0) {
					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}
					vaddr += VALUE_2MB;
					paddr += VALUE_2MB;
					len -= VALUE_2MB;
				}
			} else
#endif
			{
				/* Get the physical address from /proc/self/pagemap. */
				paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
				if (paddr == SPDK_VTOPHYS_ERROR) {
					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
					return -EFAULT;
				}

				/* Get paddr for each 2MB chunk in this address range */
				while (len > 0) {
					/* Get the physical address from /proc/self/pagemap. */
					paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);

					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}

					if (paddr & MASK_2MB) {
						DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
						return -EINVAL;
					}
#if VFIO_ENABLED
					/* If the IOMMU is on, but DPDK is using iova-mode=pa, we want to register this memory
					 * with the IOMMU using the physical address to match. */
					if (spdk_iommu_is_enabled()) {
						rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
						if (rc) {
							DEBUG_PRINT("Unable to assign vaddr %p to paddr 0x%" PRIx64 "\n", vaddr, paddr);
							return -EFAULT;
						}
					}
#endif

					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}

					vaddr += VALUE_2MB;
					len -= VALUE_2MB;
				}
			}
		} else {
			/* This is an address managed by DPDK. Just setup the translations. */
			while (len > 0) {
				paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
				if (paddr == SPDK_VTOPHYS_ERROR) {
					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
					return -EFAULT;
				}

				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
				if (rc != 0) {
					return rc;
				}

				vaddr += VALUE_2MB;
				len -= VALUE_2MB;
			}
		}

		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
#if VFIO_ENABLED
		if (paddr == SPDK_VTOPHYS_ERROR) {
			/*
			 * This is not an address that DPDK is managing.
			 */

			/* Check if this is a PCI BAR. They need special handling */
			paddr = vtophys_get_paddr_pci((uint64_t)vaddr, len);
			if (paddr != SPDK_VTOPHYS_ERROR) {
				/* Get paddr for each 2MB chunk in this address range */
				while (len > 0) {
					paddr = vtophys_get_paddr_pci((uint64_t)vaddr, VALUE_2MB);
					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}

					rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
					if (rc != 0) {
						return rc;
					}

					vaddr += VALUE_2MB;
					len -= VALUE_2MB;
				}

				return 0;
			}

			/* If vfio is enabled,
			 * we need to unmap the range from the IOMMU
			 */
			if (spdk_iommu_is_enabled()) {
				uint64_t buffer_len = len;
				uint8_t *va = vaddr;
				enum rte_iova_mode iova_mode;

				iova_mode = rte_eal_iova_mode();
				/*
				 * In virtual address mode, the region is contiguous and can be done in
				 * one unmap.
				 */
				if (iova_mode == RTE_IOVA_VA) {
					paddr = spdk_mem_map_translate(map, (uint64_t)va, &buffer_len);
					if (buffer_len != len || paddr != (uintptr_t)va) {
						DEBUG_PRINT("Unmapping %p with length %lu failed because "
							    "translation had address 0x%" PRIx64 " and length %lu\n",
							    va, len, paddr, buffer_len);
						return -EINVAL;
					}
					rc = vtophys_iommu_unmap_dma(paddr, len);
					if (rc) {
						DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
						return -EFAULT;
					}
				} else if (iova_mode == RTE_IOVA_PA) {
					/* Get paddr for each 2MB chunk in this address range */
					while (buffer_len > 0) {
						paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);

						if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
							DEBUG_PRINT("could not get phys addr for %p\n", va);
							return -EFAULT;
						}

						rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
						if (rc) {
							DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
							return -EFAULT;
						}

						va += VALUE_2MB;
						buffer_len -= VALUE_2MB;
					}
				}
			}
		}
#endif
		while (len > 0) {
			rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
			if (rc != 0) {
				return rc;
			}

			vaddr += VALUE_2MB;
			len -= VALUE_2MB;
		}

		break;
	default:
		SPDK_UNREACHABLE();
	}

	return rc;
}
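
/* Worked example for the contiguity check defined below (addresses are
 * illustrative only): 2MB virtual chunks at vaddr and vaddr + VALUE_2MB that
 * translate to paddr1 = 0x200000000 and paddr2 = 0x200200000 differ by exactly
 * VALUE_2MB, so they can be merged into one physically contiguous range; any
 * other difference means the chunks are not physically adjacent.
 */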

static int
vtophys_check_contiguous_entries(uint64_t paddr1, uint64_t paddr2)
{
	/* This function is always called with paddrs for two subsequent
	 * 2MB chunks in virtual address space, so those chunks will be only
	 * physically contiguous if the physical addresses are 2MB apart
	 * from each other as well.
	 */
	return (paddr2 - paddr1 == VALUE_2MB);
}

#if VFIO_ENABLED

static bool
vfio_enabled(void)
{
	return rte_vfio_is_enabled("vfio_pci");
}

/* Check if IOMMU is enabled on the system */
static bool
has_iommu_groups(void)
{
	int count = 0;
	DIR *dir = opendir("/sys/kernel/iommu_groups");

	if (dir == NULL) {
		return false;
	}

	while (count < 3 && readdir(dir) != NULL) {
		count++;
	}

	closedir(dir);
	/* there will always be ./ and ../ entries */
	return count > 2;
}

static bool
vfio_noiommu_enabled(void)
{
	return rte_vfio_noiommu_is_enabled();
}

static void
vtophys_iommu_init(void)
{
	char proc_fd_path[PATH_MAX + 1];
	char link_path[PATH_MAX + 1];
	const char vfio_path[] = "/dev/vfio/vfio";
	DIR *dir;
	struct dirent *d;

	if (!vfio_enabled()) {
		return;
	}

	if (vfio_noiommu_enabled()) {
		g_vfio.noiommu_enabled = true;
	} else if (!has_iommu_groups()) {
		return;
	}

	dir = opendir("/proc/self/fd");
	if (!dir) {
		DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
		return;
	}

	while ((d = readdir(dir)) != NULL) {
		if (d->d_type != DT_LNK) {
			continue;
		}

		snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
		if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
			continue;
		}

		if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
			sscanf(d->d_name, "%d", &g_vfio.fd);
			break;
		}
	}

	closedir(dir);

	if (g_vfio.fd < 0) {
		DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
		return;
	}

	g_vfio.enabled = true;

	return;
}

#endif

void
vtophys_pci_device_added(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);

	vtophys_dev = calloc(1, sizeof(*vtophys_dev));
	if (vtophys_dev) {
		vtophys_dev->pci_device = pci_device;
		TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
	} else {
		DEBUG_PRINT("Memory allocation error\n");
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	g_vfio.device_ref++;
	if (g_vfio.device_ref > 1) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the first SPDK device using DPDK vfio. This means that the first
	 * IOMMU group might have just been added to the DPDK vfio container.
	 * From this point on it is certain that the memory can be mapped now.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
		if (ret) {
			DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}

void
vtophys_pci_device_removed(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		if (vtophys_dev->pci_device == pci_device) {
			TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
			free(vtophys_dev);
			break;
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	assert(g_vfio.device_ref > 0);
	g_vfio.device_ref--;
	if (g_vfio.device_ref > 0) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
	 * any additional devices using its vfio container, all the mappings
	 * will be automatically removed by the Linux vfio driver. We unmap
	 * the memory manually to be able to easily re-map it later regardless
	 * of other, external factors.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		struct vfio_iommu_type1_dma_unmap unmap = {};
		unmap.argsz = sizeof(unmap);
		unmap.flags = 0;
		unmap.iova = dma_map->map.iova;
		unmap.size = dma_map->map.size;
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
		if (ret) {
			DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}

int
vtophys_init(void)
{
	const struct spdk_mem_map_ops vtophys_map_ops = {
		.notify_cb = vtophys_notify,
		.are_contiguous = vtophys_check_contiguous_entries,
	};

	const struct spdk_mem_map_ops phys_ref_map_ops = {
		.notify_cb = NULL,
		.are_contiguous = NULL,
	};

#if VFIO_ENABLED
	vtophys_iommu_init();
#endif

	g_phys_ref_map = spdk_mem_map_alloc(0, &phys_ref_map_ops, NULL);
	if (g_phys_ref_map == NULL) {
		DEBUG_PRINT("phys_ref map allocation failed.\n");
		return -ENOMEM;
	}

	if (g_huge_pages) {
		g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
		if (g_vtophys_map == NULL) {
			DEBUG_PRINT("vtophys map allocation failed\n");
			spdk_mem_map_free(&g_phys_ref_map);
			return -ENOMEM;
		}
	}
	return 0;
}
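
/* Usage sketch for spdk_vtophys(); the buffer and length names are
 * illustrative. The optional size argument is clamped to the number of bytes
 * that are physically contiguous starting at buf.
 *
 *	uint64_t mapped_len = io_len;
 *	uint64_t phys = spdk_vtophys(buf, &mapped_len);
 *	if (phys == SPDK_VTOPHYS_ERROR) {
 *		// buf is not registered / not translatable
 *	} else if (mapped_len < io_len) {
 *		// the buffer spans multiple non-contiguous physical regions
 *	}
 */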

uint64_t
spdk_vtophys(const void *buf, uint64_t *size)
{
	uint64_t vaddr, paddr_2mb;

	if (!g_huge_pages) {
		return SPDK_VTOPHYS_ERROR;
	}

	vaddr = (uint64_t)buf;
	paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);

	/*
	 * SPDK_VTOPHYS_ERROR has all bits set, so if the lookup returned SPDK_VTOPHYS_ERROR,
	 * we will still bitwise-or it with the buf offset below, but the result will still be
	 * SPDK_VTOPHYS_ERROR. However now that we do + rather than | (due to PCI vtophys being
	 * unaligned) we must now check the return value before addition.
	 */
	SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
	if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
		return SPDK_VTOPHYS_ERROR;
	} else {
		return paddr_2mb + (vaddr & MASK_2MB);
	}
}

int
spdk_mem_get_fd_and_offset(void *vaddr, uint64_t *offset)
{
	struct rte_memseg *seg;
	int ret, fd;

	seg = rte_mem_virt2memseg(vaddr, NULL);
	if (!seg) {
		SPDK_ERRLOG("memory %p doesn't exist\n", vaddr);
		return -ENOENT;
	}

	fd = rte_memseg_get_fd_thread_unsafe(seg);
	if (fd < 0) {
		return fd;
	}

	ret = rte_memseg_get_fd_offset_thread_unsafe(seg, offset);
	if (ret < 0) {
		return ret;
	}

	return fd;
}

void
mem_disable_huge_pages(void)
{
	g_huge_pages = false;
}