/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "env_internal.h"

#include <rte_dev.h>
#include <rte_config.h>
#include <rte_memory.h>
#include <rte_eal_memconfig.h>

#include "spdk_internal/assert.h"

#include "spdk/assert.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/util.h"
#include "spdk/memory.h"
#include "spdk/env_dpdk.h"
#include "spdk/log.h"

#ifndef __linux__
#define VFIO_ENABLED 0
#else
#include <linux/version.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
#define VFIO_ENABLED 1
#include <linux/vfio.h>
#include <rte_vfio.h>

struct spdk_vfio_dma_map {
	struct vfio_iommu_type1_dma_map map;
	TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
};

struct vfio_cfg {
	int fd;
	bool enabled;
	bool noiommu_enabled;
	unsigned device_ref;
	TAILQ_HEAD(, spdk_vfio_dma_map) maps;
	pthread_mutex_t mutex;
};

static struct vfio_cfg g_vfio = {
	.fd = -1,
	.enabled = false,
	.noiommu_enabled = false,
	.device_ref = 0,
	.maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
	.mutex = PTHREAD_MUTEX_INITIALIZER
};

#else
#define VFIO_ENABLED 0
#endif
#endif

#if DEBUG
#define DEBUG_PRINT(...) SPDK_ERRLOG(__VA_ARGS__)
#else
#define DEBUG_PRINT(...)
#endif

#define FN_2MB_TO_4KB(fn)	(fn << (SHIFT_2MB - SHIFT_4KB))
#define FN_4KB_TO_2MB(fn)	(fn >> (SHIFT_2MB - SHIFT_4KB))

#define MAP_256TB_IDX(vfn_2mb)	((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
#define MAP_1GB_IDX(vfn_2mb)	((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))

/* Page is registered */
#define REG_MAP_REGISTERED	(1ULL << 62)

/* A notification region barrier. The 2MB translation entry that's marked
 * with this flag must be unregistered separately. This allows contiguous
 * regions to be unregistered in the same chunks they were registered.
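 * For example, spdk_mem_register() on a 4MB range marks the first 2MB entry
 * with both REG_MAP_REGISTERED and REG_MAP_NOTIFY_START and the second entry
 * with REG_MAP_REGISTERED only, so spdk_mem_unregister() can tell where a
 * registered region begins.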
 */
#define REG_MAP_NOTIFY_START	(1ULL << 63)

/* Translation of a single 2MB page. */
struct map_2mb {
	uint64_t translation_2mb;
};

/* Second-level map table indexed by bits [21..29] of the virtual address.
 * Each entry contains the address translation, or the map's default
 * translation for entries that haven't been set yet.
 */
struct map_1gb {
	struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
};

/* Top-level map table indexed by bits [30..47] of the virtual address.
 * Each entry points to a second-level map table or NULL.
 */
struct map_256tb {
	struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
};

/* Page-granularity memory address translation */
struct spdk_mem_map {
	struct map_256tb map_256tb;
	pthread_mutex_t mutex;
	uint64_t default_translation;
	struct spdk_mem_map_ops ops;
	void *cb_ctx;
	TAILQ_ENTRY(spdk_mem_map) tailq;
};

/* Registrations map. The 64-bit translations are bit fields with the
 * following layout (starting with the low bits):
 *    0 - 61 : reserved
 *   62 - 63 : flags
 */
static struct spdk_mem_map *g_mem_reg_map;
static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps =
	TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
struct spdk_mem_region_head g_spdk_mem_regions = TAILQ_HEAD_INITIALIZER(g_spdk_mem_regions);
static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t g_spdk_mem_region_mutex = PTHREAD_MUTEX_INITIALIZER;
__thread bool g_spdk_mem_do_not_notify;

static bool g_legacy_mem;

/*
 * Walk the currently registered memory via the main memory registration map
 * and call the new map's notify callback for each virtually contiguous region.
 */
static int
mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
{
	size_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t contig_start = UINT64_MAX;
	uint64_t contig_end = UINT64_MAX;
	struct map_1gb *map_1gb;
	int rc;

	if (!g_mem_reg_map) {
		return -EINVAL;
	}

	/* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
	pthread_mutex_lock(&g_mem_reg_map->mutex);

	for (idx_256tb = 0;
	     idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
	     idx_256tb++) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_start != UINT64_MAX) {
				/* End of a virtually contiguous range */
				rc = map->ops.notify_cb(map->cb_ctx, map, action,
							(void *)contig_start,
							contig_end - contig_start + VALUE_2MB);
				/* Don't bother handling unregister failures. It can't be any worse. */
				if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
					goto err_unregister;
				}
			}
			contig_start = UINT64_MAX;
			continue;
		}

		for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_start == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_start == UINT64_MAX) {
					contig_start = vaddr;
				}

				contig_end = vaddr;
			} else {
				if (contig_start != UINT64_MAX) {
					/* End of a virtually contiguous range */
					rc = map->ops.notify_cb(map->cb_ctx, map, action,
								(void *)contig_start,
								contig_end - contig_start + VALUE_2MB);
					/* Don't bother handling unregister failures. It can't be any worse. */
					if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
						goto err_unregister;
					}

					/* This page might be a part of a neighbouring region, so process
					 * it again. The idx_1gb will be incremented immediately.
					 */
					idx_1gb--;
				}
				contig_start = UINT64_MAX;
			}
		}
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return 0;

err_unregister:
	/* Unwind to the first empty translation so we don't unregister
	 * a region that just failed to register.
	 */
	idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
	idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
	contig_start = UINT64_MAX;
	contig_end = UINT64_MAX;

	/* Unregister any memory we managed to register before the failure */
	for (; idx_256tb < SIZE_MAX; idx_256tb--) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_end != UINT64_MAX) {
				/* End of a virtually contiguous range */
				map->ops.notify_cb(map->cb_ctx, map,
						   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
						   (void *)contig_start,
						   contig_end - contig_start + VALUE_2MB);
			}
			contig_end = UINT64_MAX;
			continue;
		}

		for (; idx_1gb < UINT64_MAX; idx_1gb--) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_end == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_end == UINT64_MAX) {
					contig_end = vaddr;
				}
				contig_start = vaddr;
			} else {
				if (contig_end != UINT64_MAX) {
					/* End of a virtually contiguous range */
					map->ops.notify_cb(map->cb_ctx, map,
							   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
							   (void *)contig_start,
							   contig_end - contig_start + VALUE_2MB);
					idx_1gb++;
				}
				contig_end = UINT64_MAX;
			}
		}
		idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return rc;
}

struct spdk_mem_map *
spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
{
	struct spdk_mem_map *map;
	int rc;

	map = calloc(1, sizeof(*map));
	if (map == NULL) {
		return NULL;
	}

	if (pthread_mutex_init(&map->mutex, NULL)) {
		free(map);
		return NULL;
	}

	map->default_translation = default_translation;
	map->cb_ctx = cb_ctx;
	if (ops) {
		map->ops = *ops;
	}
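
	/* If the caller supplied a notify callback, replay all currently registered
	 * memory into the new map before publishing it on g_spdk_mem_maps, all under
	 * g_spdk_mem_map_mutex so new registrations can't race with the walk.
	 */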
	if (ops && ops->notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		rc = mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			DEBUG_PRINT("Initial mem_map notify failed\n");
			pthread_mutex_destroy(&map->mutex);
			free(map);
			return NULL;
		}
		TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	return map;
}

void
spdk_mem_map_free(struct spdk_mem_map **pmap)
{
	struct spdk_mem_map *map;
	size_t i;

	if (!pmap) {
		return;
	}

	map = *pmap;

	if (!map) {
		return;
	}

	if (map->ops.notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
		TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
		free(map->map_256tb.map[i]);
	}

	pthread_mutex_destroy(&map->mutex);

	free(map);
	*pmap = NULL;
}

int
spdk_mem_register(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	seg_vaddr = vaddr;
	seg_len = 0;
	while (len > 0) {
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
					     seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
		seg_len += VALUE_2MB;
		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return rc;
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

int
spdk_mem_unregister(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg, newreg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* The first page must be a start of a region. Also check if it's
	 * registered to make sure we don't return -ERANGE for non-registered
	 * regions.
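	 * For example, trying to unregister only the tail half of a range that was
	 * registered as a single region fails with -ERANGE rather than splitting it.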
	 */
	reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
	if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if ((reg & REG_MAP_REGISTERED) == 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EINVAL;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
	/* If the next page is registered, it must be a start of a region as well,
	 * otherwise we'd be unregistering only a part of a region.
	 */
	if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}
	seg_vaddr = vaddr;
	seg_len = 0;

	while (len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);

		if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
			TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
				rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
				if (rc != 0) {
					pthread_mutex_unlock(&g_spdk_mem_map_mutex);
					return rc;
				}
			}

			seg_vaddr = vaddr;
			seg_len = VALUE_2MB;
		} else {
			seg_len += VALUE_2MB;
		}

		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	if (seg_len > 0) {
		TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
			rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
			if (rc != 0) {
				pthread_mutex_unlock(&g_spdk_mem_map_mutex);
				return rc;
			}
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

int
spdk_mem_reserve(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* Check if any part of this range is already registered */
	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	/* Simply set the translation to the memory map's default. This allocates the space in the
	 * map but does not provide a valid translation.
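	 * Because mem_map_get_map_1gb() allocates second-level tables on first use,
	 * this also ensures that later spdk_mem_map_set_translation() calls on the
	 * reserved range won't have to allocate and so can't fail with -ENOMEM.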
	 */
	spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, len,
				     g_mem_reg_map->default_translation);

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, map->default_translation);
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

static struct map_1gb *
mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
{
	struct map_1gb *map_1gb;
	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
	size_t i;

	if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
		return NULL;
	}

	map_1gb = map->map_256tb.map[idx_256tb];

	if (!map_1gb) {
		pthread_mutex_lock(&map->mutex);

		/* Recheck to make sure nobody else got the mutex first. */
		map_1gb = map->map_256tb.map[idx_256tb];
		if (!map_1gb) {
			map_1gb = malloc(sizeof(struct map_1gb));
			if (map_1gb) {
				/* initialize all entries to default translation */
				for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
					map_1gb->map[i].translation_2mb = map->default_translation;
				}
				map->map_256tb.map[idx_256tb] = map_1gb;
			}
		}

		pthread_mutex_unlock(&map->mutex);

		if (!map_1gb) {
			DEBUG_PRINT("allocation failed\n");
			return NULL;
		}
	}

	return map_1gb;
}

int
spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
			     uint64_t translation)
{
	uint64_t vfn_2mb;
	struct map_1gb *map_1gb;
	uint64_t idx_1gb;
	struct map_2mb *map_2mb;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %" PRIu64 "\n", vaddr);
		return -EINVAL;
	}

	/* For now, only 2 MB-aligned registrations are supported */
	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%" PRIu64 " len=%" PRIu64 "\n",
			    __func__, vaddr, size);
		return -EINVAL;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;

	while (size) {
		map_1gb = mem_map_get_map_1gb(map, vfn_2mb);
		if (!map_1gb) {
			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
			return -ENOMEM;
		}

		idx_1gb = MAP_1GB_IDX(vfn_2mb);
		map_2mb = &map_1gb->map[idx_1gb];
		map_2mb->translation_2mb = translation;

		size -= VALUE_2MB;
		vfn_2mb++;
	}

	return 0;
}

int
spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
{
	return spdk_mem_map_set_translation(map, vaddr, size, map->default_translation);
}

inline uint64_t
spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
{
	const struct map_1gb *map_1gb;
	const struct map_2mb *map_2mb;
	uint64_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t vfn_2mb;
	uint64_t cur_size;
	uint64_t prev_translation;
	uint64_t orig_translation;

	if (spdk_unlikely(vaddr & ~MASK_256TB)) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
		return map->default_translation;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;
	idx_256tb = MAP_256TB_IDX(vfn_2mb);
	idx_1gb = MAP_1GB_IDX(vfn_2mb);

	map_1gb = map->map_256tb.map[idx_256tb];
	if (spdk_unlikely(!map_1gb)) {
		return map->default_translation;
	}

	cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
	map_2mb = &map_1gb->map[idx_1gb];
	if (size == NULL || map->ops.are_contiguous == NULL ||
	    map_2mb->translation_2mb == map->default_translation) {
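		/* No contiguity check requested (or possible), so return just this 2MB
		 * entry's translation, capping *size at the end of the 2MB page.
		 */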
		if (size != NULL) {
			*size = spdk_min(*size, cur_size);
		}
		return map_2mb->translation_2mb;
	}

	orig_translation = map_2mb->translation_2mb;
	prev_translation = orig_translation;
	while (cur_size < *size) {
		vfn_2mb++;
		idx_256tb = MAP_256TB_IDX(vfn_2mb);
		idx_1gb = MAP_1GB_IDX(vfn_2mb);

		map_1gb = map->map_256tb.map[idx_256tb];
		if (spdk_unlikely(!map_1gb)) {
			break;
		}

		map_2mb = &map_1gb->map[idx_1gb];
		if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
			break;
		}

		cur_size += VALUE_2MB;
		prev_translation = map_2mb->translation_2mb;
	}

	*size = spdk_min(*size, cur_size);
	return orig_translation;
}

static void
memory_hotplug_cb(enum rte_mem_event event_type,
		  const void *addr, size_t len, void *arg)
{
	if (event_type == RTE_MEM_EVENT_ALLOC) {
		spdk_mem_register((void *)addr, len);

		if (!spdk_env_dpdk_external_init()) {
			return;
		}

		/* When the user initialized DPDK separately, we can't
		 * be sure that the --match-allocations RTE flag was specified.
		 * Without this flag, DPDK can free memory in different units
		 * than it was allocated. It doesn't work with things like RDMA MRs.
		 *
		 * For such cases, we mark segments so they aren't freed.
		 */
		while (len > 0) {
			struct rte_memseg *seg;

			seg = rte_mem_virt2memseg(addr, NULL);
			assert(seg != NULL);
			seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
			addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
			len -= seg->hugepage_sz;
		}
	} else if (event_type == RTE_MEM_EVENT_FREE) {
		/* We need to postpone the spdk_mem_unregister() call here to avoid
		 * a double lock of the SPDK mutex (g_spdk_mem_map_mutex) and the DPDK
		 * mutex (memory_hotplug_lock), which may happen when one thread is
		 * calling spdk_free() and another one is calling
		 * vhost_session_mem_unregister().
		 */
		struct spdk_mem_region *region;

		region = calloc(1, sizeof(*region));
		assert(region != NULL);
		region->addr = (void *)addr;
		region->len = len;
		pthread_mutex_lock(&g_spdk_mem_region_mutex);
		TAILQ_INSERT_TAIL(&g_spdk_mem_regions, region, tailq);
		pthread_mutex_unlock(&g_spdk_mem_region_mutex);
	}
}

static int
memory_iter_cb(const struct rte_memseg_list *msl,
	       const struct rte_memseg *ms, size_t len, void *arg)
{
	return spdk_mem_register(ms->addr, len);
}

int
mem_map_init(bool legacy_mem)
{
	g_legacy_mem = legacy_mem;

	g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
	if (g_mem_reg_map == NULL) {
		DEBUG_PRINT("memory registration map allocation failed\n");
		return -ENOMEM;
	}

	/*
	 * Walk all DPDK memory segments and register them
	 * with the main memory map
	 */
	rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
	rte_memseg_contig_walk(memory_iter_cb, NULL);
	return 0;
}

bool
spdk_iommu_is_enabled(void)
{
#if VFIO_ENABLED
	return g_vfio.enabled && !g_vfio.noiommu_enabled;
#else
	return false;
#endif
}

struct spdk_vtophys_pci_device {
	struct rte_pci_device *pci_device;
	TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
};

static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
	TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);

static struct spdk_mem_map *g_vtophys_map;
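/* Reference-tracking map for physical addresses. Within this file it is only
 * allocated (see vtophys_init()); its consumers live elsewhere in the
 * env_dpdk library.
 */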
static struct spdk_mem_map *g_phys_ref_map;

static uint64_t
vtophys_get_paddr_memseg(uint64_t vaddr)
{
	uintptr_t paddr;
	struct rte_memseg *seg;

	seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
	if (seg != NULL) {
		paddr = seg->iova;
		if (paddr == RTE_BAD_IOVA) {
			return SPDK_VTOPHYS_ERROR;
		}
		paddr += (vaddr - (uintptr_t)seg->addr);
		return paddr;
	}

	return SPDK_VTOPHYS_ERROR;
}

/* Try to get the paddr from /proc/self/pagemap */
static uint64_t
vtophys_get_paddr_pagemap(uint64_t vaddr)
{
	uintptr_t paddr;

	/* Silence static analyzers */
	assert(vaddr != 0);
	paddr = rte_mem_virt2iova((void *)vaddr);
	if (paddr == RTE_BAD_IOVA) {
		/*
		 * The vaddr may be valid but doesn't have a backing page
		 * assigned yet. Touch the page to ensure a backing page
		 * gets assigned, then try to translate again.
		 */
		rte_atomic64_read((rte_atomic64_t *)vaddr);
		paddr = rte_mem_virt2iova((void *)vaddr);
	}
	if (paddr == RTE_BAD_IOVA) {
		/* Unable to get to the physical address. */
		return SPDK_VTOPHYS_ERROR;
	}

	return paddr;
}

/* Try to get the paddr from pci devices */
static uint64_t
vtophys_get_paddr_pci(uint64_t vaddr)
{
	struct spdk_vtophys_pci_device *vtophys_dev;
	uintptr_t paddr;
	struct rte_pci_device *dev;
	struct rte_mem_resource *res;
	unsigned r;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		dev = vtophys_dev->pci_device;

		for (r = 0; r < PCI_MAX_RESOURCE; r++) {
			res = &dev->mem_resource[r];
			if (res->phys_addr && vaddr >= (uint64_t)res->addr &&
			    vaddr < (uint64_t)res->addr + res->len) {
				paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
				DEBUG_PRINT("%s: %p -> %p\n", __func__, (void *)vaddr,
					    (void *)paddr);
				pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
				return paddr;
			}
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

	return SPDK_VTOPHYS_ERROR;
}

static int
vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
	       enum spdk_mem_map_notify_action action,
	       void *vaddr, size_t len)
{
	int rc = 0, pci_phys = 0;
	uint64_t paddr;
#if VFIO_ENABLED
	uint32_t num_pages, i;
	rte_iova_t *iovas = NULL;
	rte_iova_t iova;
	struct rte_dev_iterator dev_iter;
	struct rte_device *dev;
	const char *devstr = "bus=pci";
#endif

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid parameters, vaddr=%p len=%ju\n",
			    vaddr, len);
		return -EINVAL;
	}

	/* Get the physical address from the DPDK memsegs */
	paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);

	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		if (paddr == SPDK_VTOPHYS_ERROR) {
			/* This is not an address that DPDK is managing currently.
			 * Let's register it. */

#if VFIO_ENABLED
			enum rte_iova_mode iova_mode;

			iova_mode = rte_eal_iova_mode();

			if (spdk_iommu_is_enabled() && iova_mode == RTE_IOVA_VA) {
				/* We'll use the virtual address as the iova to match DPDK.
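				 * rte_extmem_register() below makes DPDK aware of the region, and
				 * rte_dev_dma_map() then programs the IOMMU mapping for each PCI device.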
				 */
				paddr = (uint64_t)vaddr;

				num_pages = len / VALUE_2MB;
				iovas = calloc(num_pages, sizeof(rte_iova_t));
				if (iovas == NULL) {
					return -ENOMEM;
				}

				for (i = 0; i < num_pages; i++) {
					iovas[i] = paddr;
					paddr += VALUE_2MB;
				}

				rc = rte_extmem_register(vaddr, len, iovas, num_pages, VALUE_2MB);
				if (rc != 0) {
					goto error;
				}

				/* For each device, map the memory */
				RTE_DEV_FOREACH(dev, devstr, &dev_iter) {
					rte_dev_dma_map(dev, vaddr, (uint64_t)vaddr, len);
				}

				for (i = 0; i < num_pages; i++) {
					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, iovas[i]);
					if (rc != 0) {
						goto error;
					}

					vaddr += VALUE_2MB;
				}

				free(iovas);
			} else
#endif
			{
				/* Get the physical address from /proc/self/pagemap. */
				paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
				if (paddr == SPDK_VTOPHYS_ERROR) {
					/* Get the physical address from PCI devices */
					paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						rc = -EFAULT;
						goto error;
					}
					/* The beginning of this address range points to a PCI resource,
					 * so the rest must point to a PCI resource as well.
					 */
					pci_phys = 1;
				}

				/* Get the paddr for each 2MB chunk in this address range */
				while (len > 0) {
					/* Get the physical address of this 2MB chunk. */
					if (pci_phys) {
						paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
					} else {
						paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
					}

					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						rc = -EFAULT;
						goto error;
					}

					/* A PCI paddr may break the 2MB physical alignment, so skip
					 * this check for PCI-backed ranges.
					 */
					if (!pci_phys && (paddr & MASK_2MB)) {
						DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
						rc = -EINVAL;
						goto error;
					}
#if VFIO_ENABLED
					/* If the IOMMU is on, but DPDK is using iova-mode=pa, we want to register
					 * this memory with the IOMMU using the physical address to match. */
					if (spdk_iommu_is_enabled()) {
						iova = paddr;

						rc = rte_extmem_register(vaddr, VALUE_2MB, &iova, 1, VALUE_2MB);
						if (rc != 0) {
							goto error;
						}

						/* For each device, map the memory */
						RTE_DEV_FOREACH(dev, devstr, &dev_iter) {
							rte_dev_dma_map(dev, vaddr, paddr, len);
						}
					}
#endif

					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						goto error;
					}

					vaddr += VALUE_2MB;
					len -= VALUE_2MB;
				}
			}
		} else {
			/* This is an address managed by DPDK. Just set up the translations.
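			 * DPDK has already made this memory visible to the IOMMU (if one is in
			 * use), so just record the vaddr -> paddr translation for each 2MB chunk.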
			 */
			while (len > 0) {
				paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
				if (paddr == SPDK_VTOPHYS_ERROR) {
					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
					return -EFAULT;
				}

				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
				if (rc != 0) {
					return rc;
				}

				vaddr += VALUE_2MB;
				len -= VALUE_2MB;
			}
		}

		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
#if VFIO_ENABLED
		if (g_spdk_mem_do_not_notify == false) {
			/* If vfio is enabled, we need to unmap the range from the IOMMU */
			if (spdk_iommu_is_enabled()) {
				uint64_t buffer_len = len;
				uint8_t *va = vaddr;
				enum rte_iova_mode iova_mode;

				iova_mode = rte_eal_iova_mode();
				/*
				 * In virtual address mode, the region is contiguous and can be
				 * unmapped in a single operation.
				 */
				if (iova_mode == RTE_IOVA_VA) {
					paddr = spdk_mem_map_translate(map, (uint64_t)va, &buffer_len);
					if (buffer_len != len || paddr != (uintptr_t)va) {
						DEBUG_PRINT("Unmapping %p with length %lu failed because "
							    "translation had address 0x%" PRIx64 " and length %lu\n",
							    va, len, paddr, buffer_len);
						return -EINVAL;
					}

					/* For each device, unmap the memory */
					RTE_DEV_FOREACH(dev, devstr, &dev_iter) {
						rte_dev_dma_unmap(dev, vaddr, (uint64_t)vaddr, len);
					}

					rc = rte_extmem_unregister(vaddr, len);
					if (rc != 0) {
						return rc;
					}
				} else if (iova_mode == RTE_IOVA_PA) {
					/* Get the paddr for each 2MB chunk in this address range */
					paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);

					if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
						DEBUG_PRINT("could not get phys addr for %p\n", va);
						return -EFAULT;
					}

					/* For each device, unmap the memory */
					RTE_DEV_FOREACH(dev, devstr, &dev_iter) {
						rte_dev_dma_unmap(dev, vaddr, (uint64_t)vaddr, len);
					}

					rc = rte_extmem_unregister(vaddr, len);
					if (rc != 0) {
						return rc;
					}
				}
			}
		}
#endif
		while (len > 0) {
			rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
			if (rc != 0) {
				return rc;
			}

			vaddr += VALUE_2MB;
			len -= VALUE_2MB;
		}

		break;
	default:
		SPDK_UNREACHABLE();
	}

	return rc;

error:
#if VFIO_ENABLED
	free(iovas);
#endif

	return rc;
}

static int
vtophys_check_contiguous_entries(uint64_t paddr1, uint64_t paddr2)
{
	/* This function is always called with paddrs for two subsequent
	 * 2MB chunks in virtual address space, so those chunks will only be
	 * physically contiguous if the physical addresses are 2MB apart
	 * from each other as well.
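	 * For example, paddrs 0x40000000 and 0x40200000 are contiguous
	 * (0x200000 == 2MB apart), while 0x40000000 and 0x80000000 are not.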
	 */
	return (paddr2 - paddr1 == VALUE_2MB);
}

#if VFIO_ENABLED

static bool
vfio_enabled(void)
{
	return rte_vfio_is_enabled("vfio_pci");
}

/* Check if IOMMU is enabled on the system */
static bool
has_iommu_groups(void)
{
	int count = 0;
	DIR *dir = opendir("/sys/kernel/iommu_groups");

	if (dir == NULL) {
		return false;
	}

	while (count < 3 && readdir(dir) != NULL) {
		count++;
	}

	closedir(dir);
	/* there will always be ./ and ../ entries */
	return count > 2;
}

static bool
vfio_noiommu_enabled(void)
{
	return rte_vfio_noiommu_is_enabled();
}

static void
vtophys_iommu_device_event(const char *device_name,
			   enum rte_dev_event_type event,
			   void *cb_arg)
{
	struct rte_dev_iterator dev_iter;
	struct rte_device *dev;

	pthread_mutex_lock(&g_vfio.mutex);

	switch (event) {
	default:
	case RTE_DEV_EVENT_ADD:
		RTE_DEV_FOREACH(dev, "bus=pci", &dev_iter) {
			if (strcmp(dev->name, device_name) == 0) {
				struct rte_pci_device *pci_dev = RTE_DEV_TO_PCI(dev);
#if RTE_VERSION < RTE_VERSION_NUM(20, 11, 0, 0)
				if (pci_dev->kdrv == RTE_KDRV_VFIO) {
#else
				if (pci_dev->kdrv == RTE_PCI_KDRV_VFIO) {
#endif
					/* This is a new PCI device using vfio */
					g_vfio.device_ref++;
				}
				break;
			}
		}

		if (g_vfio.device_ref == 1) {
			struct spdk_vfio_dma_map *dma_map;
			int ret;

			/* This is the first device registered. This means that the first
			 * IOMMU group might have just been added to the DPDK vfio container.
			 * From this point it is certain that the memory can be mapped now.
			 */
			TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
				ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
				if (ret) {
					DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
					break;
				}
			}
		}
		break;
	case RTE_DEV_EVENT_REMOVE:
		RTE_DEV_FOREACH(dev, "bus=pci", &dev_iter) {
			if (strcmp(dev->name, device_name) == 0) {
				struct rte_pci_device *pci_dev = RTE_DEV_TO_PCI(dev);
#if RTE_VERSION < RTE_VERSION_NUM(20, 11, 0, 0)
				if (pci_dev->kdrv == RTE_KDRV_VFIO) {
#else
				if (pci_dev->kdrv == RTE_PCI_KDRV_VFIO) {
#endif
					/* This is a PCI device using vfio */
					g_vfio.device_ref--;
				}
				break;
			}
		}

		if (g_vfio.device_ref == 0) {
			struct spdk_vfio_dma_map *dma_map;
			int ret;

			/* If DPDK doesn't have any additional devices using its vfio
			 * container, all the mappings will be automatically removed by
			 * the Linux vfio driver. We unmap the memory manually to be able
			 * to easily re-map it later regardless of other, external factors.
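			 * The entries stay on g_vfio.maps, so a later RTE_DEV_EVENT_ADD can
			 * re-issue VFIO_IOMMU_MAP_DMA for each of them (see the ADD case above).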
			 */
			TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
				struct vfio_iommu_type1_dma_unmap unmap = {};
				unmap.argsz = sizeof(unmap);
				unmap.flags = 0;
				unmap.iova = dma_map->map.iova;
				unmap.size = dma_map->map.size;
				ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
				if (ret) {
					DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
					break;
				}
			}
		}
		break;
	}

	pthread_mutex_unlock(&g_vfio.mutex);
}

static void
vtophys_iommu_init(void)
{
	char proc_fd_path[PATH_MAX + 1];
	char link_path[PATH_MAX + 1];
	const char vfio_path[] = "/dev/vfio/vfio";
	DIR *dir;
	struct dirent *d;
	struct rte_dev_iterator dev_iter;
	struct rte_device *dev;
	int rc;

	if (!vfio_enabled()) {
		return;
	}

	if (vfio_noiommu_enabled()) {
		g_vfio.noiommu_enabled = true;
	} else if (!has_iommu_groups()) {
		return;
	}

	dir = opendir("/proc/self/fd");
	if (!dir) {
		DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
		return;
	}

	while ((d = readdir(dir)) != NULL) {
		if (d->d_type != DT_LNK) {
			continue;
		}

		snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
		if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
			continue;
		}

		if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
			sscanf(d->d_name, "%d", &g_vfio.fd);
			break;
		}
	}

	closedir(dir);

	if (g_vfio.fd < 0) {
		DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
		return;
	}

	/* If the IOMMU is enabled, we need to track whether there are any devices present because
	 * it's only valid to perform vfio IOCTLs to the containers when there is at least
	 * one device. The device may be a DPDK device that SPDK doesn't otherwise know about, but
	 * that's ok.
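	 * Count the vfio-bound PCI devices that are already present so the hot-plug
	 * callback (vtophys_iommu_device_event) starts from an accurate reference count.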
	 */
	RTE_DEV_FOREACH(dev, "bus=pci", &dev_iter) {
		struct rte_pci_device *pci_dev = RTE_DEV_TO_PCI(dev);

#if RTE_VERSION < RTE_VERSION_NUM(20, 11, 0, 0)
		if (pci_dev->kdrv == RTE_KDRV_VFIO) {
#else
		if (pci_dev->kdrv == RTE_PCI_KDRV_VFIO) {
#endif
			/* This is a PCI device using vfio */
			g_vfio.device_ref++;
		}
	}

	if (spdk_process_is_primary()) {
		rc = rte_dev_event_callback_register(NULL, vtophys_iommu_device_event, NULL);
		if (rc) {
			DEBUG_PRINT("Failed to register device event callback\n");
			return;
		}
		rc = rte_dev_event_monitor_start();
		if (rc) {
			DEBUG_PRINT("Failed to start device event monitoring.\n");
			return;
		}
	}

	g_vfio.enabled = true;

	return;
}

static void
vtophys_iommu_fini(void)
{
	if (spdk_process_is_primary()) {
		rte_dev_event_callback_unregister(NULL, vtophys_iommu_device_event, NULL);
		rte_dev_event_monitor_stop();
	}
}

#endif

void
vtophys_pci_device_added(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);

	vtophys_dev = calloc(1, sizeof(*vtophys_dev));
	if (vtophys_dev) {
		vtophys_dev->pci_device = pci_device;
		TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
	} else {
		DEBUG_PRINT("Memory allocation error\n");
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
}

void
vtophys_pci_device_removed(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		if (vtophys_dev->pci_device == pci_device) {
			TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
			free(vtophys_dev);
			break;
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
}

int
vtophys_init(void)
{
	const struct spdk_mem_map_ops vtophys_map_ops = {
		.notify_cb = vtophys_notify,
		.are_contiguous = vtophys_check_contiguous_entries,
	};

	const struct spdk_mem_map_ops phys_ref_map_ops = {
		.notify_cb = NULL,
		.are_contiguous = NULL,
	};

#if VFIO_ENABLED
	vtophys_iommu_init();
#endif

	g_phys_ref_map = spdk_mem_map_alloc(0, &phys_ref_map_ops, NULL);
	if (g_phys_ref_map == NULL) {
		DEBUG_PRINT("phys_ref map allocation failed.\n");
		return -ENOMEM;
	}

	g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
	if (g_vtophys_map == NULL) {
		DEBUG_PRINT("vtophys map allocation failed\n");
		return -ENOMEM;
	}
	return 0;
}

void
vtophys_fini(void)
{
#if VFIO_ENABLED
	vtophys_iommu_fini();
#endif
}

uint64_t
spdk_vtophys(const void *buf, uint64_t *size)
{
	uint64_t vaddr, paddr_2mb;

	vaddr = (uint64_t)buf;
	paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);

	/*
	 * SPDK_VTOPHYS_ERROR has all bits set, so when the offset within the buffer
	 * was combined with a bitwise-or, a failed lookup still yielded
	 * SPDK_VTOPHYS_ERROR. Now that the offset is added instead (PCI vtophys
	 * addresses may be unaligned), the return value must be checked before
	 * the addition.
	 */
	SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
	if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
		return SPDK_VTOPHYS_ERROR;
	} else {
		return paddr_2mb + (vaddr & MASK_2MB);
	}
}

int
spdk_mem_get_fd_and_offset(void *vaddr, uint64_t *offset)
{
	struct rte_memseg *seg;
	int ret, fd;

	seg = rte_mem_virt2memseg(vaddr, NULL);
	if (!seg) {
		SPDK_ERRLOG("memory %p doesn't exist\n", vaddr);
		return -ENOENT;
	}

	fd = rte_memseg_get_fd_thread_unsafe(seg);
	if (fd < 0) {
		return fd;
	}

	ret = rte_memseg_get_fd_offset_thread_unsafe(seg, offset);
	if (ret < 0) {
		return ret;
	}

	return fd;
}