/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2017 Intel Corporation.
 * All rights reserved.
 */

#include "spdk/stdinc.h"

#include "env_internal.h"
#include "pci_dpdk.h"

#include <rte_config.h>
#include <rte_memory.h>
#include <rte_eal_memconfig.h>
#include <rte_dev.h>
#include <rte_pci.h>

#include "spdk_internal/assert.h"

#include "spdk/assert.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/util.h"
#include "spdk/memory.h"
#include "spdk/env_dpdk.h"
#include "spdk/log.h"

#ifdef __linux__
#include <linux/version.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
#include <linux/vfio.h>
#include <rte_vfio.h>

struct spdk_vfio_dma_map {
	struct vfio_iommu_type1_dma_map map;
	TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
};

struct vfio_cfg {
	int fd;
	bool enabled;
	bool noiommu_enabled;
	unsigned device_ref;
	TAILQ_HEAD(, spdk_vfio_dma_map) maps;
	pthread_mutex_t mutex;
};

static struct vfio_cfg g_vfio = {
	.fd = -1,
	.enabled = false,
	.noiommu_enabled = false,
	.device_ref = 0,
	.maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
	.mutex = PTHREAD_MUTEX_INITIALIZER
};
#endif
#endif

#if DEBUG
#define DEBUG_PRINT(...) SPDK_ERRLOG(__VA_ARGS__)
#else
#define DEBUG_PRINT(...)
#endif

#define FN_2MB_TO_4KB(fn)	(fn << (SHIFT_2MB - SHIFT_4KB))
#define FN_4KB_TO_2MB(fn)	(fn >> (SHIFT_2MB - SHIFT_4KB))

#define MAP_256TB_IDX(vfn_2mb)	((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
#define MAP_1GB_IDX(vfn_2mb)	((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))

/* Page is registered */
#define REG_MAP_REGISTERED	(1ULL << 62)

/* A notification region barrier. The 2MB translation entry that's marked
 * with this flag must be unregistered separately. This allows contiguous
 * regions to be unregistered in the same chunks they were registered.
 */
#define REG_MAP_NOTIFY_START	(1ULL << 63)

/* Translation of a single 2MB page. */
struct map_2mb {
	uint64_t translation_2mb;
};

/* Second-level map table indexed by bits [21..29] of the virtual address.
 * Each entry contains the address translation or error for entries that haven't
 * been retrieved yet.
 */
struct map_1gb {
	struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
};

/* Top-level map table indexed by bits [30..47] of the virtual address.
 * Each entry points to a second-level map table or NULL.
 */
struct map_256tb {
	struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
};

/* Page-granularity memory address translation */
struct spdk_mem_map {
	struct map_256tb map_256tb;
	pthread_mutex_t mutex;
	uint64_t default_translation;
	struct spdk_mem_map_ops ops;
	void *cb_ctx;
	TAILQ_ENTRY(spdk_mem_map) tailq;
};
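
/* Illustrative sketch (not part of the original source): how a 2MB-aligned
 * virtual address resolves through the two-level tables above, assuming the
 * usual SHIFT_2MB/SHIFT_1GB/SHIFT_256TB definitions from spdk/memory.h.
 * The example address is arbitrary.
 *
 *	uint64_t vaddr     = 0x7f8040200000;           // example, 2MB-aligned
 *	uint64_t vfn_2mb   = vaddr >> SHIFT_2MB;       // 2MB virtual frame number
 *	size_t   idx_256tb = MAP_256TB_IDX(vfn_2mb);   // selects a struct map_1gb *
 *	uint64_t idx_1gb   = MAP_1GB_IDX(vfn_2mb);     // selects a struct map_2mb
 *
 *	translation = map->map_256tb.map[idx_256tb]->map[idx_1gb].translation_2mb;
 */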

/* Registrations map. The 64 bit translations are bit fields with the
 * following layout (starting with the low bits):
 *    0 - 61 : reserved
 *   62 - 63 : flags
 */
static struct spdk_mem_map *g_mem_reg_map;
static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps =
	TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;

static bool g_legacy_mem;
static bool g_huge_pages = true;

/*
 * Walk the currently registered memory via the main memory registration map
 * and call the new map's notify callback for each virtually contiguous region.
 */
static int
mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
{
	size_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t contig_start = UINT64_MAX;
	uint64_t contig_end = UINT64_MAX;
	struct map_1gb *map_1gb;
	int rc;

	if (!g_mem_reg_map) {
		return -EINVAL;
	}

	/* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
	pthread_mutex_lock(&g_mem_reg_map->mutex);

	for (idx_256tb = 0;
	     idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
	     idx_256tb++) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_start != UINT64_MAX) {
				/* End of a virtually contiguous range */
				rc = map->ops.notify_cb(map->cb_ctx, map, action,
							(void *)contig_start,
							contig_end - contig_start + VALUE_2MB);
				/* Don't bother handling unregister failures. It can't be any worse */
				if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
					goto err_unregister;
				}
			}
			contig_start = UINT64_MAX;
			continue;
		}

		for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_start == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_start == UINT64_MAX) {
					contig_start = vaddr;
				}

				contig_end = vaddr;
			} else {
				if (contig_start != UINT64_MAX) {
					/* End of a virtually contiguous range */
					rc = map->ops.notify_cb(map->cb_ctx, map, action,
								(void *)contig_start,
								contig_end - contig_start + VALUE_2MB);
					/* Don't bother handling unregister failures. It can't be any worse */
					if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
						goto err_unregister;
					}

					/* This page might be a part of a neighbour region, so process
					 * it again. The idx_1gb will be incremented immediately.
					 */
					idx_1gb--;
				}
				contig_start = UINT64_MAX;
			}
		}
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return 0;

err_unregister:
	/* Unwind to the first empty translation so we don't unregister
	 * a region that just failed to register.
	 */
	idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
	idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
	contig_start = UINT64_MAX;
	contig_end = UINT64_MAX;

	/* Unregister any memory we managed to register before the failure */
	for (; idx_256tb < SIZE_MAX; idx_256tb--) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_end != UINT64_MAX) {
				/* End of a virtually contiguous range */
				map->ops.notify_cb(map->cb_ctx, map,
						   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
						   (void *)contig_start,
						   contig_end - contig_start + VALUE_2MB);
			}
			contig_end = UINT64_MAX;
			continue;
		}

		for (; idx_1gb < UINT64_MAX; idx_1gb--) {
			/* Rebuild the virtual address from the indexes */
			uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_end == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {

				if (contig_end == UINT64_MAX) {
					contig_end = vaddr;
				}
				contig_start = vaddr;
			} else {
				if (contig_end != UINT64_MAX) {
					if (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) {
						contig_start = vaddr;
					}
					/* End of a virtually contiguous range */
					map->ops.notify_cb(map->cb_ctx, map,
							   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
							   (void *)contig_start,
							   contig_end - contig_start + VALUE_2MB);
				}
				contig_end = UINT64_MAX;
			}
		}
		idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return rc;
}

struct spdk_mem_map *
spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
{
	struct spdk_mem_map *map;
	int rc;
	size_t i;

	map = calloc(1, sizeof(*map));
	if (map == NULL) {
		return NULL;
	}

	if (pthread_mutex_init(&map->mutex, NULL)) {
		free(map);
		return NULL;
	}

	map->default_translation = default_translation;
	map->cb_ctx = cb_ctx;
	if (ops) {
		map->ops = *ops;
	}

	if (ops && ops->notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		rc = mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			DEBUG_PRINT("Initial mem_map notify failed\n");
			pthread_mutex_destroy(&map->mutex);
			for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
				free(map->map_256tb.map[i]);
			}
			free(map);
			return NULL;
		}
		TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	return map;
}

void
spdk_mem_map_free(struct spdk_mem_map **pmap)
{
	struct spdk_mem_map *map;
	size_t i;

	if (!pmap) {
		return;
	}

	map = *pmap;

	if (!map) {
		return;
	}

	if (map->ops.notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
		TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
		free(map->map_256tb.map[i]);
	}

	pthread_mutex_destroy(&map->mutex);

	free(map);
	*pmap = NULL;
}
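
/* Illustrative sketch (not part of the original file): a consumer would
 * typically allocate a map with its own notify callback, which is then
 * invoked for every currently registered region (via mem_map_notify_walk)
 * and for every future spdk_mem_register()/spdk_mem_unregister() call.
 * The callback, ops and context names below are hypothetical.
 *
 *	static int
 *	my_mem_notify(void *cb_ctx, struct spdk_mem_map *map,
 *		      enum spdk_mem_map_notify_action action, void *vaddr, size_t len)
 *	{
 *		// e.g. create/destroy an RDMA MR for [vaddr, vaddr + len) and
 *		// store its key with spdk_mem_map_set_translation().
 *		return 0;
 *	}
 *
 *	static const struct spdk_mem_map_ops my_ops = {
 *		.notify_cb = my_mem_notify,
 *		.are_contiguous = NULL,
 *	};
 *
 *	struct spdk_mem_map *my_map = spdk_mem_map_alloc(UINT64_MAX, &my_ops, NULL);
 *	...
 *	spdk_mem_map_free(&my_map);
 */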

int
spdk_mem_register(void *_vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	uint64_t vaddr = (uintptr_t)_vaddr;
	uint64_t seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %jx\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%jx len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	seg_vaddr = vaddr;
	seg_len = 0;
	while (len > 0) {
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
					     seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
		seg_len += VALUE_2MB;
		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER,
					(void *)seg_vaddr, seg_len);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return rc;
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

int
spdk_mem_unregister(void *_vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	uint64_t vaddr = (uintptr_t)_vaddr;
	uint64_t seg_vaddr;
	size_t seg_len;
	uint64_t reg, newreg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %jx\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%jx len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* The first page must be a start of a region. Also check if it's
	 * registered to make sure we don't return -ERANGE for non-registered
	 * regions.
	 */
	reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
	if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if ((reg & REG_MAP_REGISTERED) == 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EINVAL;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
	/* If the next page is registered, it must be a start of a region as well,
	 * otherwise we'd be unregistering only a part of a region.
	 */
	if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}
	seg_vaddr = vaddr;
	seg_len = 0;

	while (len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);

		if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
			TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
				rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER,
							(void *)seg_vaddr, seg_len);
				if (rc != 0) {
					pthread_mutex_unlock(&g_spdk_mem_map_mutex);
					return rc;
				}
			}

			seg_vaddr = vaddr;
			seg_len = VALUE_2MB;
		} else {
			seg_len += VALUE_2MB;
		}

		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	if (seg_len > 0) {
		TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
			rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER,
						(void *)seg_vaddr, seg_len);
			if (rc != 0) {
				pthread_mutex_unlock(&g_spdk_mem_map_mutex);
				return rc;
			}
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}
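
/* Illustrative sketch (not part of the original file): registering externally
 * allocated, 2MB-aligned memory so that SPDK can translate it. Regions must be
 * unregistered in the same chunks in which they were registered.
 *
 *	size_t len = 4 * VALUE_2MB;
 *	void *buf = NULL;
 *
 *	if (posix_memalign(&buf, VALUE_2MB, len) == 0 &&
 *	    spdk_mem_register(buf, len) == 0) {
 *		... use buf with spdk_vtophys() or custom mem maps ...
 *		spdk_mem_unregister(buf, len);
 *	}
 *	free(buf);
 */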

int
spdk_mem_reserve(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* Check if any part of this range is already registered */
	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	/* Simply set the translation to the memory map's default. This allocates the space in the
	 * map but does not provide a valid translation. */
	spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, len,
				     g_mem_reg_map->default_translation);

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, map->default_translation);
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}
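
/* Illustrative note (not part of the original file): as the comment above says,
 * spdk_mem_reserve() only allocates space in every map (the translations stay
 * at each map's default value). A caller might use it for a 2MB-aligned range
 * it intends to fill in later, so that the second-level tables already exist:
 *
 *	if (spdk_mem_reserve(buf, len) != 0) {
 *		... handle the error before entering the I/O path ...
 *	}
 */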

static struct map_1gb *
mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
{
	struct map_1gb *map_1gb;
	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
	size_t i;

	if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
		return NULL;
	}

	map_1gb = map->map_256tb.map[idx_256tb];

	if (!map_1gb) {
		pthread_mutex_lock(&map->mutex);

		/* Recheck to make sure nobody else got the mutex first. */
		map_1gb = map->map_256tb.map[idx_256tb];
		if (!map_1gb) {
			map_1gb = malloc(sizeof(struct map_1gb));
			if (map_1gb) {
				/* initialize all entries to default translation */
				for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
					map_1gb->map[i].translation_2mb = map->default_translation;
				}
				map->map_256tb.map[idx_256tb] = map_1gb;
			}
		}

		pthread_mutex_unlock(&map->mutex);

		if (!map_1gb) {
			DEBUG_PRINT("allocation failed\n");
			return NULL;
		}
	}

	return map_1gb;
}

int
spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
			     uint64_t translation)
{
	uint64_t vfn_2mb;
	struct map_1gb *map_1gb;
	uint64_t idx_1gb;
	struct map_2mb *map_2mb;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %" PRIu64 "\n", vaddr);
		return -EINVAL;
	}

	/* For now, only 2 MB-aligned registrations are supported */
	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%" PRIu64 " len=%" PRIu64 "\n",
			    __func__, vaddr, size);
		return -EINVAL;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;

	while (size) {
		map_1gb = mem_map_get_map_1gb(map, vfn_2mb);
		if (!map_1gb) {
			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
			return -ENOMEM;
		}

		idx_1gb = MAP_1GB_IDX(vfn_2mb);
		map_2mb = &map_1gb->map[idx_1gb];
		map_2mb->translation_2mb = translation;

		size -= VALUE_2MB;
		vfn_2mb++;
	}

	return 0;
}

int
spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
{
	return spdk_mem_map_set_translation(map, vaddr, size, map->default_translation);
}

inline uint64_t
spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
{
	const struct map_1gb *map_1gb;
	const struct map_2mb *map_2mb;
	uint64_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t vfn_2mb;
	uint64_t cur_size;
	uint64_t prev_translation;
	uint64_t orig_translation;

	if (spdk_unlikely(vaddr & ~MASK_256TB)) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
		return map->default_translation;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;
	idx_256tb = MAP_256TB_IDX(vfn_2mb);
	idx_1gb = MAP_1GB_IDX(vfn_2mb);

	map_1gb = map->map_256tb.map[idx_256tb];
	if (spdk_unlikely(!map_1gb)) {
		return map->default_translation;
	}

	cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
	map_2mb = &map_1gb->map[idx_1gb];
	if (size == NULL || map->ops.are_contiguous == NULL ||
	    map_2mb->translation_2mb == map->default_translation) {
		if (size != NULL) {
			*size = spdk_min(*size, cur_size);
		}
		return map_2mb->translation_2mb;
	}

	orig_translation = map_2mb->translation_2mb;
	prev_translation = orig_translation;
	while (cur_size < *size) {
		vfn_2mb++;
		idx_256tb = MAP_256TB_IDX(vfn_2mb);
		idx_1gb = MAP_1GB_IDX(vfn_2mb);

		map_1gb = map->map_256tb.map[idx_256tb];
		if (spdk_unlikely(!map_1gb)) {
			break;
		}

		map_2mb = &map_1gb->map[idx_1gb];
		if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
			break;
		}

		cur_size += VALUE_2MB;
		prev_translation = map_2mb->translation_2mb;
	}

	*size = spdk_min(*size, cur_size);
	return orig_translation;
}
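
/* Illustrative sketch (not part of the original file): when a map provides an
 * are_contiguous callback, the size hint passed to spdk_mem_map_translate() is
 * clamped on return to the number of bytes (starting at vaddr) whose
 * translations are contiguous. The map, default value and length below are
 * hypothetical.
 *
 *	uint64_t size = io_len;
 *	uint64_t translation = spdk_mem_map_translate(my_map, (uint64_t)buf, &size);
 *
 *	if (translation == MY_DEFAULT_TRANSLATION) {
 *		... buf is not covered by this map ...
 *	} else if (size < io_len) {
 *		... the range is not contiguous in this map; split the I/O ...
 *	}
 */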

static void
memory_hotplug_cb(enum rte_mem_event event_type,
		  const void *addr, size_t len, void *arg)
{
	if (event_type == RTE_MEM_EVENT_ALLOC) {
		spdk_mem_register((void *)addr, len);

		if (!spdk_env_dpdk_external_init()) {
			return;
		}

		/* When the user initialized DPDK separately, we can't
		 * be sure that the --match-allocations RTE flag was specified.
		 * Without this flag, DPDK can free memory in different units
		 * than it was allocated in. That doesn't work with things like RDMA MRs.
		 *
		 * For such cases, we mark segments so they aren't freed.
		 */
		while (len > 0) {
			struct rte_memseg *seg;

			seg = rte_mem_virt2memseg(addr, NULL);
			assert(seg != NULL);
			seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
			addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
			len -= seg->hugepage_sz;
		}
	} else if (event_type == RTE_MEM_EVENT_FREE) {
		spdk_mem_unregister((void *)addr, len);
	}
}

static int
memory_iter_cb(const struct rte_memseg_list *msl,
	       const struct rte_memseg *ms, size_t len, void *arg)
{
	return spdk_mem_register(ms->addr, len);
}

int
mem_map_init(bool legacy_mem)
{
	g_legacy_mem = legacy_mem;

	g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
	if (g_mem_reg_map == NULL) {
		DEBUG_PRINT("memory registration map allocation failed\n");
		return -ENOMEM;
	}

	/*
	 * Walk all DPDK memory segments and register them
	 * with the main memory map
	 */
	if (g_huge_pages) {
		rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
		rte_memseg_contig_walk(memory_iter_cb, NULL);
	}
	return 0;
}

bool
spdk_iommu_is_enabled(void)
{
#if VFIO_ENABLED
	return g_vfio.enabled && !g_vfio.noiommu_enabled;
#else
	return false;
#endif
}

struct spdk_vtophys_pci_device {
	struct rte_pci_device *pci_device;
	TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
};

static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
	TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);

static struct spdk_mem_map *g_vtophys_map;
static struct spdk_mem_map *g_phys_ref_map;
static struct spdk_mem_map *g_numa_map;

#if VFIO_ENABLED
static int
_vfio_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	dma_map = calloc(1, sizeof(*dma_map));
	if (dma_map == NULL) {
		return -ENOMEM;
	}

	dma_map->map.argsz = sizeof(dma_map->map);
	dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	dma_map->map.vaddr = vaddr;
	dma_map->map.iova = iova;
	dma_map->map.size = size;

	if (g_vfio.device_ref == 0) {
		/* VFIO requires at least one device (IOMMU group) to be added to
		 * a VFIO container before it is possible to perform any IOMMU
		 * operations on that container. This memory will be mapped once
		 * the first device (IOMMU group) is hotplugged.
		 *
		 * Since the vfio container is managed internally by DPDK, it is
		 * also possible that some device is already in that container, but
		 * it's not managed by SPDK - e.g. a NIC attached internally
		 * inside DPDK. We could map the memory straight away in such
		 * a scenario, but there's no need to do it. DPDK devices clearly
		 * don't need our mappings and hence we defer the mapping
		 * unconditionally until the first SPDK-managed device is
		 * hotplugged.
		 */
		goto out_insert;
	}

	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
	if (ret) {
		/* In some cases the vfio container doesn't have an IOMMU group;
		 * the failure is safe to ignore then. */
		SPDK_NOTICELOG("Cannot set up DMA mapping, error %d, ignored\n", errno);
	}

out_insert:
	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
	return 0;
}

static int
vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	uint64_t refcount;
	int ret;

	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
	assert(refcount < UINT64_MAX);
	if (refcount > 0) {
		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
		return 0;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	ret = _vfio_iommu_map_dma(vaddr, iova, size);
	pthread_mutex_unlock(&g_vfio.mutex);
	if (ret) {
		return ret;
	}

	spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
	return 0;
}

int
vtophys_iommu_map_dma_bar(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	int ret;

	pthread_mutex_lock(&g_vfio.mutex);
	ret = _vfio_iommu_map_dma(vaddr, iova, size);
	pthread_mutex_unlock(&g_vfio.mutex);

	return ret;
}
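
/* Illustrative note (not part of the original file): g_phys_ref_map reference
 * counts IOMMU mappings per IOVA range, so the same 2MB chunk reached through
 * several SPDK memory registrations is mapped into the vfio container only
 * once and unmapped only when the last reference goes away. Roughly:
 *
 *	vtophys_iommu_map_dma(va, iova, sz);    // refcount 0 -> 1, VFIO_IOMMU_MAP_DMA
 *	                                        // issued (or deferred until the first
 *	                                        // SPDK-managed device is added)
 *	vtophys_iommu_map_dma(va, iova, sz);    // refcount 1 -> 2, no ioctl
 *	vtophys_iommu_unmap_dma(iova, sz);      // refcount 2 -> 1, no ioctl
 *	vtophys_iommu_unmap_dma(iova, sz);      // refcount 1 -> 0, VFIO_IOMMU_UNMAP_DMA issued
 */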

static int
_vfio_iommu_unmap_dma(struct spdk_vfio_dma_map *dma_map)
{
	struct vfio_iommu_type1_dma_unmap unmap = {};
	int ret;

	if (g_vfio.device_ref == 0) {
		/* Memory is not mapped anymore, just remove its references */
		goto out_remove;
	}

	unmap.argsz = sizeof(unmap);
	unmap.flags = 0;
	unmap.iova = dma_map->map.iova;
	unmap.size = dma_map->map.size;
	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
	if (ret) {
		SPDK_NOTICELOG("Cannot clear DMA mapping, error %d, ignored\n", errno);
	}

out_remove:
	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
	free(dma_map);
	return 0;
}

static int
vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	uint64_t refcount;
	int ret;

	pthread_mutex_lock(&g_vfio.mutex);
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		if (dma_map->map.iova == iova) {
			break;
		}
	}

	if (dma_map == NULL) {
		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
		pthread_mutex_unlock(&g_vfio.mutex);
		return -ENXIO;
	}

	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
	assert(refcount < UINT64_MAX);
	if (refcount > 0) {
		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount - 1);
	}

	/* We still have outstanding references, don't clear it. */
	if (refcount > 1) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return 0;
	}

	/* don't support partial or multiple-page unmap for now */
	assert(dma_map->map.size == size);

	ret = _vfio_iommu_unmap_dma(dma_map);
	pthread_mutex_unlock(&g_vfio.mutex);

	return ret;
}

int
vtophys_iommu_unmap_dma_bar(uint64_t vaddr)
{
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	pthread_mutex_lock(&g_vfio.mutex);
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		if (dma_map->map.vaddr == vaddr) {
			break;
		}
	}

	if (dma_map == NULL) {
		DEBUG_PRINT("Cannot clear DMA mapping for address %"PRIx64" - it's not mapped\n", vaddr);
		pthread_mutex_unlock(&g_vfio.mutex);
		return -ENXIO;
	}

	ret = _vfio_iommu_unmap_dma(dma_map);
	pthread_mutex_unlock(&g_vfio.mutex);
	return ret;
}
#endif

static uint64_t
vtophys_get_paddr_memseg(uint64_t vaddr)
{
	uintptr_t paddr;
	struct rte_memseg *seg;

	seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
	if (seg != NULL) {
		paddr = seg->iova;
		if (paddr == RTE_BAD_IOVA) {
			return SPDK_VTOPHYS_ERROR;
		}
		paddr += (vaddr - (uintptr_t)seg->addr);
		return paddr;
	}

	return SPDK_VTOPHYS_ERROR;
}

/* Try to get the paddr from /proc/self/pagemap */
static uint64_t
vtophys_get_paddr_pagemap(uint64_t vaddr)
{
	uintptr_t paddr;

	/* Silence static analyzers */
	assert(vaddr != 0);
	paddr = rte_mem_virt2iova((void *)vaddr);
	if (paddr == RTE_BAD_IOVA) {
		/*
		 * The vaddr may be valid but doesn't have a backing page
		 * assigned yet. Touch the page to ensure a backing page
		 * gets assigned, then try to translate again.
		 */
		rte_atomic64_read((rte_atomic64_t *)vaddr);
		paddr = rte_mem_virt2iova((void *)vaddr);
	}
	if (paddr == RTE_BAD_IOVA) {
		/* Unable to get to the physical address. */
		return SPDK_VTOPHYS_ERROR;
	}

	return paddr;
}

static uint64_t
pci_device_vtophys(struct rte_pci_device *dev, uint64_t vaddr, size_t len)
{
	struct rte_mem_resource *res;
	uint64_t paddr;
	unsigned r;

	for (r = 0; r < PCI_MAX_RESOURCE; r++) {
		res = dpdk_pci_device_get_mem_resource(dev, r);

		if (res->phys_addr == 0 || vaddr < (uint64_t)res->addr ||
		    (vaddr + len) >= (uint64_t)res->addr + res->len) {
			continue;
		}

#if VFIO_ENABLED
		if (spdk_iommu_is_enabled() && rte_eal_iova_mode() == RTE_IOVA_VA) {
			/*
			 * The IOMMU is on and we're using IOVA == VA. The BAR was
			 * automatically registered when it was mapped, so just return
			 * the virtual address here.
			 */
			return vaddr;
		}
#endif
		paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
		return paddr;
	}

	return SPDK_VTOPHYS_ERROR;
}

/* Try to get the paddr from pci devices */
static uint64_t
vtophys_get_paddr_pci(uint64_t vaddr, size_t len)
{
	struct spdk_vtophys_pci_device *vtophys_dev;
	uintptr_t paddr;
	struct rte_pci_device *dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		dev = vtophys_dev->pci_device;
		paddr = pci_device_vtophys(dev, vaddr, len);
		if (paddr != SPDK_VTOPHYS_ERROR) {
			pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
			return paddr;
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

	return SPDK_VTOPHYS_ERROR;
}

static int
vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
	       enum spdk_mem_map_notify_action action,
	       void *vaddr, size_t len)
{
	int rc = 0;
	uint64_t paddr;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid parameters, vaddr=%p len=%ju\n",
			    vaddr, len);
		return -EINVAL;
	}

	/* Get the physical address from the DPDK memsegs */
	paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);

	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		if (paddr == SPDK_VTOPHYS_ERROR) {
			/* This is not an address that DPDK is managing. */

			/* Check if this is a PCI BAR. They need special handling */
			paddr = vtophys_get_paddr_pci((uint64_t)vaddr, len);
			if (paddr != SPDK_VTOPHYS_ERROR) {
				/* Get paddr for each 2MB chunk in this address range */
				while (len > 0) {
					paddr = vtophys_get_paddr_pci((uint64_t)vaddr, VALUE_2MB);
					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}

					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}

					vaddr += VALUE_2MB;
					len -= VALUE_2MB;
				}

				return 0;
			}

#if VFIO_ENABLED
			enum rte_iova_mode iova_mode;

			iova_mode = rte_eal_iova_mode();

			if (spdk_iommu_is_enabled() && iova_mode == RTE_IOVA_VA) {
				/* We'll use the virtual address as the iova to match DPDK. */
				paddr = (uint64_t)vaddr;
				rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len);
				if (rc) {
					return -EFAULT;
				}
				while (len > 0) {
					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}
					vaddr += VALUE_2MB;
					paddr += VALUE_2MB;
					len -= VALUE_2MB;
				}
			} else
#endif
			{
				/* Get the physical address from /proc/self/pagemap. */
				paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
				if (paddr == SPDK_VTOPHYS_ERROR) {
					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
					return -EFAULT;
				}

				/* Get paddr for each 2MB chunk in this address range */
				while (len > 0) {
					/* Get the physical address from /proc/self/pagemap. */
					paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);

					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}

					if (paddr & MASK_2MB) {
						DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
						return -EINVAL;
					}
#if VFIO_ENABLED
					/* If the IOMMU is on, but DPDK is using iova-mode=pa, we want to register this memory
					 * with the IOMMU using the physical address to match. */
					if (spdk_iommu_is_enabled()) {
						rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
						if (rc) {
							DEBUG_PRINT("Unable to assign vaddr %p to paddr 0x%" PRIx64 "\n", vaddr, paddr);
							return -EFAULT;
						}
					}
#endif

					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}

					vaddr += VALUE_2MB;
					len -= VALUE_2MB;
				}
			}
		} else {
			/* This is an address managed by DPDK. Just setup the translations. */
			while (len > 0) {
				paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
				if (paddr == SPDK_VTOPHYS_ERROR) {
					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
					return -EFAULT;
				}

				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
				if (rc != 0) {
					return rc;
				}

				vaddr += VALUE_2MB;
				len -= VALUE_2MB;
			}
		}

		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
#if VFIO_ENABLED
		if (paddr == SPDK_VTOPHYS_ERROR) {
			/*
			 * This is not an address that DPDK is managing.
			 */

			/* Check if this is a PCI BAR. They need special handling */
			paddr = vtophys_get_paddr_pci((uint64_t)vaddr, len);
			if (paddr != SPDK_VTOPHYS_ERROR) {
				/* Get paddr for each 2MB chunk in this address range */
				while (len > 0) {
					paddr = vtophys_get_paddr_pci((uint64_t)vaddr, VALUE_2MB);
					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}

					rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
					if (rc != 0) {
						return rc;
					}

					vaddr += VALUE_2MB;
					len -= VALUE_2MB;
				}

				return 0;
			}

			/* If vfio is enabled,
			 * we need to unmap the range from the IOMMU
			 */
			if (spdk_iommu_is_enabled()) {
				uint64_t buffer_len = len;
				uint8_t *va = vaddr;
				enum rte_iova_mode iova_mode;

				iova_mode = rte_eal_iova_mode();
				/*
				 * In virtual address mode, the region is contiguous and can be done in
				 * one unmap.
				 */
				if (iova_mode == RTE_IOVA_VA) {
					paddr = spdk_mem_map_translate(map, (uint64_t)va, &buffer_len);
					if (buffer_len != len || paddr != (uintptr_t)va) {
						DEBUG_PRINT("Unmapping %p with length %lu failed because "
							    "translation had address 0x%" PRIx64 " and length %lu\n",
							    va, len, paddr, buffer_len);
						return -EINVAL;
					}
					rc = vtophys_iommu_unmap_dma(paddr, len);
					if (rc) {
						DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
						return -EFAULT;
					}
				} else if (iova_mode == RTE_IOVA_PA) {
					/* Get paddr for each 2MB chunk in this address range */
					while (buffer_len > 0) {
						paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);

						if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
							DEBUG_PRINT("could not get phys addr for %p\n", va);
							return -EFAULT;
						}

						rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
						if (rc) {
							DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
							return -EFAULT;
						}

						va += VALUE_2MB;
						buffer_len -= VALUE_2MB;
					}
				}
			}
		}
#endif
		while (len > 0) {
			rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
			if (rc != 0) {
				return rc;
			}

			vaddr += VALUE_2MB;
			len -= VALUE_2MB;
		}

		break;
	default:
		SPDK_UNREACHABLE();
	}

	return rc;
}

static int
numa_notify(void *cb_ctx, struct spdk_mem_map *map,
	    enum spdk_mem_map_notify_action action,
	    void *vaddr, size_t len)
{
	struct rte_memseg *seg;

	/* We always return 0 from here, even if we aren't able to get a
	 * memseg for the address. This can happen in non-DPDK memory
	 * registration paths, for example vhost or vfio-user. That is OK,
	 * spdk_mem_get_numa_id() just returns SPDK_ENV_NUMA_ID_ANY for
	 * that kind of memory. If we returned an error here, the
	 * spdk_mem_register() from vhost or vfio-user would fail, which is
	 * not what we want.
	 */
	seg = rte_mem_virt2memseg(vaddr, NULL);
	if (seg == NULL) {
		return 0;
	}

	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, seg->socket_id);
		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
		spdk_mem_map_clear_translation(map, (uint64_t)vaddr, len);
		break;
	default:
		break;
	}

	return 0;
}

static int
vtophys_check_contiguous_entries(uint64_t paddr1, uint64_t paddr2)
{
	/* This function is always called with paddrs for two subsequent
	 * 2MB chunks in virtual address space, so those chunks will only be
	 * physically contiguous if the physical addresses are 2MB apart
	 * from each other as well.
	 */
	return (paddr2 - paddr1 == VALUE_2MB);
}

#if VFIO_ENABLED

static bool
vfio_enabled(void)
{
	return rte_vfio_is_enabled("vfio_pci");
}

/* Check if IOMMU is enabled on the system */
static bool
has_iommu_groups(void)
{
	int count = 0;
	DIR *dir = opendir("/sys/kernel/iommu_groups");

	if (dir == NULL) {
		return false;
	}

	while (count < 3 && readdir(dir) != NULL) {
		count++;
	}

	closedir(dir);
	/* there will always be ./ and ../ entries */
	return count > 2;
}

static bool
vfio_noiommu_enabled(void)
{
	return rte_vfio_noiommu_is_enabled();
}

static void
vtophys_iommu_init(void)
{
	char proc_fd_path[PATH_MAX + 1];
	char link_path[PATH_MAX + 1];
	const char vfio_path[] = "/dev/vfio/vfio";
	DIR *dir;
	struct dirent *d;

	if (!vfio_enabled()) {
		return;
	}

	if (vfio_noiommu_enabled()) {
		g_vfio.noiommu_enabled = true;
	} else if (!has_iommu_groups()) {
		return;
	}

	dir = opendir("/proc/self/fd");
	if (!dir) {
		DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
		return;
	}

	while ((d = readdir(dir)) != NULL) {
		if (d->d_type != DT_LNK) {
			continue;
		}

		snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
		if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
			continue;
		}

		if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
			sscanf(d->d_name, "%d", &g_vfio.fd);
			break;
		}
	}

	closedir(dir);

	if (g_vfio.fd < 0) {
		DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
		return;
	}

	g_vfio.enabled = true;

	return;
}

#endif

void
vtophys_pci_device_added(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);

	vtophys_dev = calloc(1, sizeof(*vtophys_dev));
	if (vtophys_dev) {
		vtophys_dev->pci_device = pci_device;
		TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
	} else {
		DEBUG_PRINT("Memory allocation error\n");
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	g_vfio.device_ref++;
	if (g_vfio.device_ref > 1) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the first SPDK device using DPDK vfio. This means that the first
	 * IOMMU group might have just been added to the DPDK vfio container.
	 * From this point on, it is certain that the memory can be mapped.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
		if (ret) {
			DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}

void
vtophys_pci_device_removed(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		if (vtophys_dev->pci_device == pci_device) {
			TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
			free(vtophys_dev);
			break;
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	assert(g_vfio.device_ref > 0);
	g_vfio.device_ref--;
	if (g_vfio.device_ref > 0) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
	 * any additional devices using its vfio container, all the mappings
	 * will be automatically removed by the Linux vfio driver. We unmap
	 * the memory manually to be able to easily re-map it later regardless
	 * of other, external factors.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		struct vfio_iommu_type1_dma_unmap unmap = {};
		unmap.argsz = sizeof(unmap);
		unmap.flags = 0;
		unmap.iova = dma_map->map.iova;
		unmap.size = dma_map->map.size;
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
		if (ret) {
			DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}

int
vtophys_init(void)
{
	const struct spdk_mem_map_ops vtophys_map_ops = {
		.notify_cb = vtophys_notify,
		.are_contiguous = vtophys_check_contiguous_entries,
	};

	const struct spdk_mem_map_ops phys_ref_map_ops = {
		.notify_cb = NULL,
		.are_contiguous = NULL,
	};

	const struct spdk_mem_map_ops numa_map_ops = {
		.notify_cb = numa_notify,
		.are_contiguous = NULL,
	};

#if VFIO_ENABLED
	vtophys_iommu_init();
#endif

	g_phys_ref_map = spdk_mem_map_alloc(0, &phys_ref_map_ops, NULL);
	if (g_phys_ref_map == NULL) {
		DEBUG_PRINT("phys_ref map allocation failed.\n");
		return -ENOMEM;
	}

	g_numa_map = spdk_mem_map_alloc(SPDK_ENV_NUMA_ID_ANY, &numa_map_ops, NULL);
	if (g_numa_map == NULL) {
		DEBUG_PRINT("numa map allocation failed.\n");
		spdk_mem_map_free(&g_phys_ref_map);
		return -ENOMEM;
	}

	if (g_huge_pages) {
		g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
		if (g_vtophys_map == NULL) {
			DEBUG_PRINT("vtophys map allocation failed\n");
			spdk_mem_map_free(&g_numa_map);
			spdk_mem_map_free(&g_phys_ref_map);
			return -ENOMEM;
		}
	}
	return 0;
}

uint64_t
spdk_vtophys(const void *buf, uint64_t *size)
{
	uint64_t vaddr, paddr_2mb;

	if (!g_huge_pages) {
		return SPDK_VTOPHYS_ERROR;
	}

	vaddr = (uint64_t)buf;
	paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);

	/*
	 * SPDK_VTOPHYS_ERROR has all bits set, so with the old bitwise-or of the
	 * buf offset, a failed lookup still produced SPDK_VTOPHYS_ERROR. Since PCI
	 * vtophys results may be unaligned, we now add the offset instead of or'ing
	 * it in, so the return value must be checked before the addition.
	 */
	SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
	if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
		return SPDK_VTOPHYS_ERROR;
	} else {
		return paddr_2mb + (vaddr & MASK_2MB);
	}
}

int32_t
spdk_mem_get_numa_id(const void *buf, uint64_t *size)
{
	return spdk_mem_map_translate(g_numa_map, (uint64_t)buf, size);
}

int
spdk_mem_get_fd_and_offset(void *vaddr, uint64_t *offset)
{
	struct rte_memseg *seg;
	int ret, fd;

	seg = rte_mem_virt2memseg(vaddr, NULL);
	if (!seg) {
		SPDK_ERRLOG("memory %p doesn't exist\n", vaddr);
		return -ENOENT;
	}

	fd = rte_memseg_get_fd_thread_unsafe(seg);
	if (fd < 0) {
		return fd;
	}

	ret = rte_memseg_get_fd_offset_thread_unsafe(seg, offset);
	if (ret < 0) {
		return ret;
	}

	return fd;
}

void
mem_disable_huge_pages(void)
{
	g_huge_pages = false;
}
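
/* Illustrative sketch (not part of the original file): translating a buffer
 * that is visible to SPDK (DPDK-allocated or registered via
 * spdk_mem_register()). The mapping_len check guards against buffers that
 * cross a physically non-contiguous boundary; buf and buf_len are hypothetical.
 *
 *	uint64_t mapping_len = buf_len;
 *	uint64_t phys = spdk_vtophys(buf, &mapping_len);
 *
 *	if (phys == SPDK_VTOPHYS_ERROR || mapping_len < buf_len) {
 *		... fall back to a bounce buffer or split the transfer ...
 *	}
 *
 *	int32_t numa_id = spdk_mem_get_numa_id(buf, NULL);   // SPDK_ENV_NUMA_ID_ANY if unknown
 */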