/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "env_internal.h"

#include <rte_dev.h>
#include <rte_config.h>
#include <rte_memory.h>
#include <rte_eal_memconfig.h>

#include "spdk_internal/assert.h"

#include "spdk/assert.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/util.h"
#include "spdk/memory.h"
#include "spdk/env_dpdk.h"
#include "spdk/log.h"

#ifndef __linux__
#define VFIO_ENABLED 0
#else
#include <linux/version.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
#define VFIO_ENABLED 1
#include <linux/vfio.h>
#include <rte_vfio.h>

struct spdk_vfio_dma_map {
	struct vfio_iommu_type1_dma_map map;
	TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
};

struct vfio_cfg {
	int fd;
	bool enabled;
	bool noiommu_enabled;
	unsigned device_ref;
	TAILQ_HEAD(, spdk_vfio_dma_map) maps;
	pthread_mutex_t mutex;
};

static struct vfio_cfg g_vfio = {
	.fd = -1,
	.enabled = false,
	.noiommu_enabled = false,
	.device_ref = 0,
	.maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
	.mutex = PTHREAD_MUTEX_INITIALIZER
};

#else
#define VFIO_ENABLED 0
#endif
#endif

#if DEBUG
#define DEBUG_PRINT(...) SPDK_ERRLOG(__VA_ARGS__)
#else
#define DEBUG_PRINT(...)
#endif

#define FN_2MB_TO_4KB(fn)	((fn) << (SHIFT_2MB - SHIFT_4KB))
#define FN_4KB_TO_2MB(fn)	((fn) >> (SHIFT_2MB - SHIFT_4KB))

#define MAP_256TB_IDX(vfn_2mb)	((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
#define MAP_1GB_IDX(vfn_2mb)	((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))

/* Page is registered */
#define REG_MAP_REGISTERED	(1ULL << 62)

/* A notification region barrier. The 2MB translation entry that's marked
 * with this flag must be unregistered separately. This allows contiguous
 * regions to be unregistered in the same chunks they were registered.
 */
#define REG_MAP_NOTIFY_START	(1ULL << 63)
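
/*
 * Illustrative sketch (not part of the build): how a virtual address is
 * decomposed into the two map levels and how the flag bits above are used.
 * The values below are hypothetical and only show the arithmetic, assuming
 * the SHIFT_* constants from spdk/memory.h.
 *
 *	uint64_t vaddr     = 0x7f0040200000;          // some 2MB-aligned address
 *	uint64_t vfn_2mb   = vaddr >> SHIFT_2MB;      // virtual frame number in 2MB units
 *	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);  // bits [30..47] -> first-level slot
 *	uint64_t idx_1gb   = MAP_1GB_IDX(vfn_2mb);    // bits [21..29] -> second-level slot
 *
 *	// In g_mem_reg_map a translation entry is then one of:
 *	//   REG_MAP_REGISTERED                         - registered page inside a region
 *	//   REG_MAP_REGISTERED | REG_MAP_NOTIFY_START  - first page of a registered region
 *	//   0                                          - page not registered
 */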
/* Translation of a single 2MB page. */
struct map_2mb {
	uint64_t translation_2mb;
};

/* Second-level map table indexed by bits [21..29] of the virtual address.
 * Each entry contains the address translation or error for entries that haven't
 * been retrieved yet.
 */
struct map_1gb {
	struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
};

/* Top-level map table indexed by bits [30..47] of the virtual address.
 * Each entry points to a second-level map table or NULL.
 */
struct map_256tb {
	struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
};

/* Page-granularity memory address translation */
struct spdk_mem_map {
	struct map_256tb map_256tb;
	pthread_mutex_t mutex;
	uint64_t default_translation;
	struct spdk_mem_map_ops ops;
	void *cb_ctx;
	TAILQ_ENTRY(spdk_mem_map) tailq;
};

/* Registrations map. The 64 bit translations are bit fields with the
 * following layout (starting with the low bits):
 *    0 - 61 : reserved
 *   62 - 63 : flags
 */
static struct spdk_mem_map *g_mem_reg_map;
static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps =
	TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;

static bool g_legacy_mem;

/*
 * Walk the currently registered memory via the main memory registration map
 * and call the new map's notify callback for each virtually contiguous region.
 */
static int
mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
{
	size_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t contig_start = UINT64_MAX;
	uint64_t contig_end = UINT64_MAX;
	struct map_1gb *map_1gb;
	int rc;

	if (!g_mem_reg_map) {
		return -EINVAL;
	}

	/* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
	pthread_mutex_lock(&g_mem_reg_map->mutex);

	for (idx_256tb = 0;
	     idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
	     idx_256tb++) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_start != UINT64_MAX) {
				/* End of a virtually contiguous range */
				rc = map->ops.notify_cb(map->cb_ctx, map, action,
							(void *)contig_start,
							contig_end - contig_start + VALUE_2MB);
				/* Don't bother handling unregister failures. It can't be any worse */
				if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
					goto err_unregister;
				}
			}
			contig_start = UINT64_MAX;
			continue;
		}

		for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_start == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_start == UINT64_MAX) {
					contig_start = vaddr;
				}

				contig_end = vaddr;
			} else {
				if (contig_start != UINT64_MAX) {
					/* End of a virtually contiguous range */
					rc = map->ops.notify_cb(map->cb_ctx, map, action,
								(void *)contig_start,
								contig_end - contig_start + VALUE_2MB);
					/* Don't bother handling unregister failures.
					 * It can't be any worse */
					if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
						goto err_unregister;
					}

					/* This page might be a part of a neighbour region, so process
					 * it again. The idx_1gb will be incremented immediately.
					 */
					idx_1gb--;
				}
				contig_start = UINT64_MAX;
			}
		}
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return 0;

err_unregister:
	/* Unwind to the first empty translation so we don't unregister
	 * a region that just failed to register.
	 */
	idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
	idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
	contig_start = UINT64_MAX;
	contig_end = UINT64_MAX;

	/* Unregister any memory we managed to register before the failure */
	for (; idx_256tb < SIZE_MAX; idx_256tb--) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_end != UINT64_MAX) {
				/* End of a virtually contiguous range */
				map->ops.notify_cb(map->cb_ctx, map,
						   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
						   (void *)contig_start,
						   contig_end - contig_start + VALUE_2MB);
			}
			contig_end = UINT64_MAX;
			continue;
		}

		for (; idx_1gb < UINT64_MAX; idx_1gb--) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_end == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_end == UINT64_MAX) {
					contig_end = vaddr;
				}
				contig_start = vaddr;
			} else {
				if (contig_end != UINT64_MAX) {
					/* End of a virtually contiguous range */
					map->ops.notify_cb(map->cb_ctx, map,
							   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
							   (void *)contig_start,
							   contig_end - contig_start + VALUE_2MB);
					idx_1gb++;
				}
				contig_end = UINT64_MAX;
			}
		}
		idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return rc;
}

struct spdk_mem_map *
spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
{
	struct spdk_mem_map *map;
	int rc;

	map = calloc(1, sizeof(*map));
	if (map == NULL) {
		return NULL;
	}

	if (pthread_mutex_init(&map->mutex, NULL)) {
		free(map);
		return NULL;
	}

	map->default_translation = default_translation;
	map->cb_ctx = cb_ctx;
	if (ops) {
		map->ops = *ops;
	}

	if (ops && ops->notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		rc = mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			DEBUG_PRINT("Initial mem_map notify failed\n");
			pthread_mutex_destroy(&map->mutex);
			free(map);
			return NULL;
		}
		TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	return map;
}
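
/*
 * Usage sketch (illustrative only, not part of the build): a consumer that
 * wants its own per-2MB translations can allocate a map with notify callbacks.
 * The callback and variable names below are hypothetical.
 *
 *	static int
 *	my_notify(void *cb_ctx, struct spdk_mem_map *map,
 *		  enum spdk_mem_map_notify_action action, void *vaddr, size_t len)
 *	{
 *		// REGISTER: translate/pin [vaddr, vaddr + len) and store the result
 *		// with spdk_mem_map_set_translation(); UNREGISTER: undo it.
 *		return 0;
 *	}
 *
 *	static const struct spdk_mem_map_ops my_ops = {
 *		.notify_cb = my_notify,
 *		.are_contiguous = NULL,
 *	};
 *
 *	struct spdk_mem_map *my_map =
 *		spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &my_ops, NULL);
 *
 *	// my_notify() has already been called for every currently registered
 *	// region, and will be called again for future spdk_mem_register() calls.
 */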
void
spdk_mem_map_free(struct spdk_mem_map **pmap)
{
	struct spdk_mem_map *map;
	size_t i;

	if (!pmap) {
		return;
	}

	map = *pmap;

	if (!map) {
		return;
	}

	if (map->ops.notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
		TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
		free(map->map_256tb.map[i]);
	}

	pthread_mutex_destroy(&map->mutex);

	free(map);
	*pmap = NULL;
}

int
spdk_mem_register(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	seg_vaddr = vaddr;
	seg_len = 0;
	while (len > 0) {
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
					     seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
		seg_len += VALUE_2MB;
		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return rc;
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}
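
/*
 * Usage sketch (illustrative only, not part of the build): registering memory
 * that was not allocated through DPDK/SPDK allocators. Both the address and
 * the length must be 2MB aligned. The buffer below is hypothetical.
 *
 *	void *buf = NULL;
 *	size_t len = 4 * VALUE_2MB;
 *
 *	if (posix_memalign(&buf, VALUE_2MB, len) == 0) {
 *		if (spdk_mem_register(buf, len) == 0) {
 *			// Every map with a notify_cb has now been told about
 *			// [buf, buf + len), so translations for it can be looked up.
 *			spdk_mem_unregister(buf, len);
 *		}
 *		free(buf);
 *	}
 */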
int
spdk_mem_unregister(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg, newreg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* The first page must be a start of a region. Also check if it's
	 * registered to make sure we don't return -ERANGE for non-registered
	 * regions.
	 */
	reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
	if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if ((reg & REG_MAP_REGISTERED) == 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EINVAL;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
	/* If the next page is registered, it must be a start of a region as well,
	 * otherwise we'd be unregistering only a part of a region.
	 */
	if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}
	seg_vaddr = vaddr;
	seg_len = 0;

	while (len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);

		if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
			TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
				rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
				if (rc != 0) {
					pthread_mutex_unlock(&g_spdk_mem_map_mutex);
					return rc;
				}
			}

			seg_vaddr = vaddr;
			seg_len = VALUE_2MB;
		} else {
			seg_len += VALUE_2MB;
		}

		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	if (seg_len > 0) {
		TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
			rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
			if (rc != 0) {
				pthread_mutex_unlock(&g_spdk_mem_map_mutex);
				return rc;
			}
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

int
spdk_mem_reserve(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* Check if any part of this range is already registered */
	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	/* Simply set the translation to the memory map's default. This allocates the space in the
	 * map but does not provide a valid translation. */
	spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, len,
				     g_mem_reg_map->default_translation);

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, map->default_translation);
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

static struct map_1gb *
mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
{
	struct map_1gb *map_1gb;
	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
	size_t i;

	if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
		return NULL;
	}

	map_1gb = map->map_256tb.map[idx_256tb];

	if (!map_1gb) {
		pthread_mutex_lock(&map->mutex);

		/* Recheck to make sure nobody else got the mutex first. */
		map_1gb = map->map_256tb.map[idx_256tb];
		if (!map_1gb) {
			map_1gb = malloc(sizeof(struct map_1gb));
			if (map_1gb) {
				/* initialize all entries to default translation */
				for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
					map_1gb->map[i].translation_2mb = map->default_translation;
				}
				map->map_256tb.map[idx_256tb] = map_1gb;
			}
		}

		pthread_mutex_unlock(&map->mutex);

		if (!map_1gb) {
			DEBUG_PRINT("allocation failed\n");
			return NULL;
		}
	}

	return map_1gb;
}

int
spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
			     uint64_t translation)
{
	uint64_t vfn_2mb;
	struct map_1gb *map_1gb;
	uint64_t idx_1gb;
	struct map_2mb *map_2mb;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %" PRIu64 "\n", vaddr);
		return -EINVAL;
	}

	/* For now, only 2 MB-aligned registrations are supported */
	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%" PRIu64 " len=%" PRIu64 "\n",
			    __func__, vaddr, size);
		return -EINVAL;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;

	while (size) {
		map_1gb = mem_map_get_map_1gb(map, vfn_2mb);
		if (!map_1gb) {
			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
			return -ENOMEM;
		}

		idx_1gb = MAP_1GB_IDX(vfn_2mb);
		map_2mb = &map_1gb->map[idx_1gb];
		map_2mb->translation_2mb = translation;

		size -= VALUE_2MB;
		vfn_2mb++;
	}

	return 0;
}

int
spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
{
	return spdk_mem_map_set_translation(map, vaddr, size, map->default_translation);
}

inline uint64_t
spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
{
	const struct map_1gb *map_1gb;
	const struct map_2mb *map_2mb;
	uint64_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t vfn_2mb;
	uint64_t cur_size;
	uint64_t prev_translation;
	uint64_t orig_translation;

	if (spdk_unlikely(vaddr & ~MASK_256TB)) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
		return map->default_translation;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;
	idx_256tb = MAP_256TB_IDX(vfn_2mb);
	idx_1gb = MAP_1GB_IDX(vfn_2mb);

	map_1gb = map->map_256tb.map[idx_256tb];
	if (spdk_unlikely(!map_1gb)) {
		return map->default_translation;
	}

	cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
	map_2mb = &map_1gb->map[idx_1gb];
	if (size == NULL || map->ops.are_contiguous == NULL ||
	    map_2mb->translation_2mb == map->default_translation) {
		if (size != NULL) {
			*size = spdk_min(*size, cur_size);
		}
		return map_2mb->translation_2mb;
	}

	orig_translation = map_2mb->translation_2mb;
	prev_translation = orig_translation;
	while (cur_size < *size) {
		vfn_2mb++;
		idx_256tb = MAP_256TB_IDX(vfn_2mb);
		idx_1gb = MAP_1GB_IDX(vfn_2mb);

		map_1gb = map->map_256tb.map[idx_256tb];
		if (spdk_unlikely(!map_1gb)) {
			break;
		}

		map_2mb = &map_1gb->map[idx_1gb];
		if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
			break;
		}

		cur_size += VALUE_2MB;
		prev_translation = map_2mb->translation_2mb;
	}

	*size = spdk_min(*size, cur_size);
	return orig_translation;
}
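
/*
 * Usage sketch (illustrative only, not part of the build): storing and looking
 * up a translation in a custom map. The values below are hypothetical.
 *
 *	uint64_t vaddr = (uint64_t)some_2mb_aligned_buf;
 *	uint64_t handle = 0x1234;	// whatever the consumer wants to store
 *	uint64_t size;
 *
 *	spdk_mem_map_set_translation(my_map, vaddr, VALUE_2MB, handle);
 *
 *	size = VALUE_2MB;
 *	// Returns 'handle'. If my_map->ops.are_contiguous is set, 'size' is
 *	// clamped to the contiguous run starting at vaddr (never increased).
 *	uint64_t result = spdk_mem_map_translate(my_map, vaddr, &size);
 *
 *	spdk_mem_map_clear_translation(my_map, vaddr, VALUE_2MB);
 */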
static void
memory_hotplug_cb(enum rte_mem_event event_type,
		  const void *addr, size_t len, void *arg)
{
	if (event_type == RTE_MEM_EVENT_ALLOC) {
		spdk_mem_register((void *)addr, len);

		if (!spdk_env_dpdk_external_init()) {
			return;
		}

		/* When the user initialized DPDK separately, we can't
		 * be sure that --match-allocations RTE flag was specified.
		 * Without this flag, DPDK can free memory in different units
		 * than it was allocated. It doesn't work with things like RDMA MRs.
		 *
		 * For such cases, we mark segments so they aren't freed.
		 */
		while (len > 0) {
			struct rte_memseg *seg;

			seg = rte_mem_virt2memseg(addr, NULL);
			assert(seg != NULL);
			seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
			addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
			len -= seg->hugepage_sz;
		}
	} else if (event_type == RTE_MEM_EVENT_FREE) {
		spdk_mem_unregister((void *)addr, len);
	}
}

static int
memory_iter_cb(const struct rte_memseg_list *msl,
	       const struct rte_memseg *ms, size_t len, void *arg)
{
	return spdk_mem_register(ms->addr, len);
}

int
mem_map_init(bool legacy_mem)
{
	g_legacy_mem = legacy_mem;

	g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
	if (g_mem_reg_map == NULL) {
		DEBUG_PRINT("memory registration map allocation failed\n");
		return -ENOMEM;
	}

	/*
	 * Walk all DPDK memory segments and register them
	 * with the main memory map
	 */
	rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
	rte_memseg_contig_walk(memory_iter_cb, NULL);
	return 0;
}

bool
spdk_iommu_is_enabled(void)
{
#if VFIO_ENABLED
	return g_vfio.enabled && !g_vfio.noiommu_enabled;
#else
	return false;
#endif
}

struct spdk_vtophys_pci_device {
	struct rte_pci_device *pci_device;
	TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
};

static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
	TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);

static struct spdk_mem_map *g_vtophys_map;
static struct spdk_mem_map *g_phys_ref_map;

#if VFIO_ENABLED
static int
vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	uint64_t refcount;
	int ret;

	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
	assert(refcount < UINT64_MAX);
	if (refcount > 0) {
		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
		return 0;
	}

	dma_map = calloc(1, sizeof(*dma_map));
	if (dma_map == NULL) {
		return -ENOMEM;
	}

	dma_map->map.argsz = sizeof(dma_map->map);
	dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	dma_map->map.vaddr = vaddr;
	dma_map->map.iova = iova;
	dma_map->map.size = size;

	pthread_mutex_lock(&g_vfio.mutex);
	if (g_vfio.device_ref == 0) {
		/* VFIO requires at least one device (IOMMU group) to be added to
		 * a VFIO container before it is possible to perform any IOMMU
		 * operations on that container. This memory will be mapped once
		 * the first device (IOMMU group) is hotplugged.
		 *
		 * Since the vfio container is managed internally by DPDK, it is
		 * also possible that some device is already in that container, but
		 * it's not managed by SPDK - e.g. a NIC attached internally
		 * inside DPDK.
		 * We could map the memory straight away in such
		 * a scenario, but there's no need to do it. DPDK devices clearly
		 * don't need our mappings and hence we defer the mapping
		 * unconditionally until the first SPDK-managed device is
		 * hotplugged.
		 */
		goto out_insert;
	}

	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
	if (ret) {
		DEBUG_PRINT("Cannot set up DMA mapping, error %d\n", errno);
		pthread_mutex_unlock(&g_vfio.mutex);
		free(dma_map);
		return ret;
	}

out_insert:
	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
	pthread_mutex_unlock(&g_vfio.mutex);
	spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
	return 0;
}

static int
vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	uint64_t refcount;
	int ret;
	struct vfio_iommu_type1_dma_unmap unmap = {};

	pthread_mutex_lock(&g_vfio.mutex);
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		if (dma_map->map.iova == iova) {
			break;
		}
	}

	if (dma_map == NULL) {
		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
		pthread_mutex_unlock(&g_vfio.mutex);
		return -ENXIO;
	}

	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
	assert(refcount < UINT64_MAX);
	if (refcount > 0) {
		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount - 1);
	}

	/* We still have outstanding references, don't clear the mapping. */
	if (refcount > 1) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return 0;
	}

	/** don't support partial or multiple-page unmap for now */
	assert(dma_map->map.size == size);

	if (g_vfio.device_ref == 0) {
		/* Memory is not mapped anymore, just remove its references */
		goto out_remove;
	}

	unmap.argsz = sizeof(unmap);
	unmap.flags = 0;
	unmap.iova = dma_map->map.iova;
	unmap.size = dma_map->map.size;
	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
	if (ret) {
		DEBUG_PRINT("Cannot clear DMA mapping, error %d\n", errno);
		pthread_mutex_unlock(&g_vfio.mutex);
		return ret;
	}

out_remove:
	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
	pthread_mutex_unlock(&g_vfio.mutex);
	free(dma_map);
	return 0;
}
#endif
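
#if VFIO_ENABLED
/*
 * Behavior sketch (illustrative only, not part of the build): the g_phys_ref_map
 * reference counts keep a single VFIO mapping per IOVA even when several callers
 * map the same range. The iova/vaddr values below are hypothetical, and at least
 * one vfio device is assumed to be attached (otherwise the ioctl is deferred as
 * described above).
 *
 *	vtophys_iommu_map_dma(vaddr, iova, VALUE_2MB);	// refcount 0 -> 1, VFIO_IOMMU_MAP_DMA issued
 *	vtophys_iommu_map_dma(vaddr, iova, VALUE_2MB);	// refcount 1 -> 2, no new ioctl
 *	vtophys_iommu_unmap_dma(iova, VALUE_2MB);	// refcount 2 -> 1, mapping kept
 *	vtophys_iommu_unmap_dma(iova, VALUE_2MB);	// refcount 1 -> 0, VFIO_IOMMU_UNMAP_DMA issued
 */
#endif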
static uint64_t
vtophys_get_paddr_memseg(uint64_t vaddr)
{
	uintptr_t paddr;
	struct rte_memseg *seg;

	seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
	if (seg != NULL) {
		paddr = seg->iova;
		if (paddr == RTE_BAD_IOVA) {
			return SPDK_VTOPHYS_ERROR;
		}
		paddr += (vaddr - (uintptr_t)seg->addr);
		return paddr;
	}

	return SPDK_VTOPHYS_ERROR;
}

/* Try to get the paddr from /proc/self/pagemap */
static uint64_t
vtophys_get_paddr_pagemap(uint64_t vaddr)
{
	uintptr_t paddr;

	/* Silence static analyzers */
	assert(vaddr != 0);
	paddr = rte_mem_virt2iova((void *)vaddr);
	if (paddr == RTE_BAD_IOVA) {
		/*
		 * The vaddr may be valid but doesn't have a backing page
		 * assigned yet. Touch the page to ensure a backing page
		 * gets assigned, then try to translate again.
		 */
		rte_atomic64_read((rte_atomic64_t *)vaddr);
		paddr = rte_mem_virt2iova((void *)vaddr);
	}
	if (paddr == RTE_BAD_IOVA) {
		/* Unable to get to the physical address. */
		return SPDK_VTOPHYS_ERROR;
	}

	return paddr;
}

/* Try to get the paddr from pci devices */
static uint64_t
vtophys_get_paddr_pci(uint64_t vaddr)
{
	struct spdk_vtophys_pci_device *vtophys_dev;
	uintptr_t paddr;
	struct rte_pci_device *dev;
	struct rte_mem_resource *res;
	unsigned r;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		dev = vtophys_dev->pci_device;

		for (r = 0; r < PCI_MAX_RESOURCE; r++) {
			res = &dev->mem_resource[r];
			if (res->phys_addr && vaddr >= (uint64_t)res->addr &&
			    vaddr < (uint64_t)res->addr + res->len) {
				paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
				DEBUG_PRINT("%s: %p -> %p\n", __func__, (void *)vaddr,
					    (void *)paddr);
				pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
				return paddr;
			}
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

	return SPDK_VTOPHYS_ERROR;
}

static int
vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
	       enum spdk_mem_map_notify_action action,
	       void *vaddr, size_t len)
{
	int rc = 0, pci_phys = 0;
	uint64_t paddr;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid parameters, vaddr=%p len=%ju\n",
			    vaddr, len);
		return -EINVAL;
	}

	/* Get the physical address from the DPDK memsegs */
	paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);

	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		if (paddr == SPDK_VTOPHYS_ERROR) {
			/* This is not an address that DPDK is managing. */
#if VFIO_ENABLED
			enum rte_iova_mode iova_mode;

			iova_mode = rte_eal_iova_mode();

			if (spdk_iommu_is_enabled() && iova_mode == RTE_IOVA_VA) {
				/* We'll use the virtual address as the iova to match DPDK. */
				paddr = (uint64_t)vaddr;
				rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len);
				if (rc) {
					return -EFAULT;
				}
				while (len > 0) {
					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}
					vaddr += VALUE_2MB;
					paddr += VALUE_2MB;
					len -= VALUE_2MB;
				}
			} else
#endif
			{
				/* Get the physical address from /proc/self/pagemap. */
				paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
				if (paddr == SPDK_VTOPHYS_ERROR) {
					/* Get the physical address from PCI devices */
					paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}
					/* The beginning of this address range points to a PCI resource,
					 * so the rest must point to a PCI resource as well.
					 */
					pci_phys = 1;
				}

				/* Get paddr for each 2MB chunk in this address range */
				while (len > 0) {
					/* Get the physical address from /proc/self/pagemap. */
					if (pci_phys) {
						paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
					} else {
						paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
					}

					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}

					/* A PCI paddr can break the 2MiB physical alignment,
					 * so skip this check for PCI-backed ranges.
					 */
					if (!pci_phys && (paddr & MASK_2MB)) {
						DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
						return -EINVAL;
					}
#if VFIO_ENABLED
					/* If the IOMMU is on, but DPDK is using iova-mode=pa, we want to register this memory
					 * with the IOMMU using the physical address to match. */
					if (spdk_iommu_is_enabled()) {
						rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
						if (rc) {
							DEBUG_PRINT("Unable to assign vaddr %p to paddr 0x%" PRIx64 "\n", vaddr, paddr);
							return -EFAULT;
						}
					}
#endif

					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}

					vaddr += VALUE_2MB;
					len -= VALUE_2MB;
				}
			}
		} else {
			/* This is an address managed by DPDK. Just setup the translations. */
			while (len > 0) {
				paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
				if (paddr == SPDK_VTOPHYS_ERROR) {
					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
					return -EFAULT;
				}

				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
				if (rc != 0) {
					return rc;
				}

				vaddr += VALUE_2MB;
				len -= VALUE_2MB;
			}
		}

		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
#if VFIO_ENABLED
		if (paddr == SPDK_VTOPHYS_ERROR) {
			/*
			 * This is not an address that DPDK is managing. If vfio is enabled,
			 * we need to unmap the range from the IOMMU
			 */
			if (spdk_iommu_is_enabled()) {
				uint64_t buffer_len = len;
				uint8_t *va = vaddr;
				enum rte_iova_mode iova_mode;

				iova_mode = rte_eal_iova_mode();
				/*
				 * In virtual address mode, the region is contiguous and can be done in
				 * one unmap.
				 */
				if (iova_mode == RTE_IOVA_VA) {
					paddr = spdk_mem_map_translate(map, (uint64_t)va, &buffer_len);
					if (buffer_len != len || paddr != (uintptr_t)va) {
						DEBUG_PRINT("Unmapping %p with length %lu failed because "
							    "translation had address 0x%" PRIx64 " and length %lu\n",
							    va, len, paddr, buffer_len);
						return -EINVAL;
					}
					rc = vtophys_iommu_unmap_dma(paddr, len);
					if (rc) {
						DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
						return -EFAULT;
					}
				} else if (iova_mode == RTE_IOVA_PA) {
					/* Get paddr for each 2MB chunk in this address range */
					while (buffer_len > 0) {
						paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);

						if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
							DEBUG_PRINT("could not get phys addr for %p\n", va);
							return -EFAULT;
						}

						rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
						if (rc) {
							DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
							return -EFAULT;
						}

						va += VALUE_2MB;
						buffer_len -= VALUE_2MB;
					}
				}
			}
		}
#endif
		while (len > 0) {
			rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
			if (rc != 0) {
				return rc;
			}

			vaddr += VALUE_2MB;
			len -= VALUE_2MB;
		}

		break;
	default:
		SPDK_UNREACHABLE();
	}

	return rc;
}

static int
vtophys_check_contiguous_entries(uint64_t paddr1, uint64_t paddr2)
{
	/* This function is always called with paddrs for two subsequent
	 * 2MB chunks in virtual address space, so those chunks will be only
	 * physically contiguous if the physical addresses are 2MB apart
	 * from each other as well.
	 */
	return (paddr2 - paddr1 == VALUE_2MB);
}

#if VFIO_ENABLED

static bool
vfio_enabled(void)
{
	return rte_vfio_is_enabled("vfio_pci");
}

/* Check if IOMMU is enabled on the system */
static bool
has_iommu_groups(void)
{
	int count = 0;
	DIR *dir = opendir("/sys/kernel/iommu_groups");

	if (dir == NULL) {
		return false;
	}

	while (count < 3 && readdir(dir) != NULL) {
		count++;
	}

	closedir(dir);
	/* there will always be ./ and ../ entries */
	return count > 2;
}

static bool
vfio_noiommu_enabled(void)
{
	return rte_vfio_noiommu_is_enabled();
}

static void
vtophys_iommu_device_event(const char *device_name,
			   enum rte_dev_event_type event,
			   void *cb_arg)
{
	struct rte_dev_iterator dev_iter;
	struct rte_device *dev;

	pthread_mutex_lock(&g_vfio.mutex);

	switch (event) {
	default:
	case RTE_DEV_EVENT_ADD:
		RTE_DEV_FOREACH(dev, "bus=pci", &dev_iter) {
			if (strcmp(dev->name, device_name) == 0) {
				struct rte_pci_device *pci_dev = RTE_DEV_TO_PCI(dev);
#if RTE_VERSION < RTE_VERSION_NUM(20, 11, 0, 0)
				if (pci_dev->kdrv == RTE_KDRV_VFIO) {
#else
				if (pci_dev->kdrv == RTE_PCI_KDRV_VFIO) {
#endif
					/* This is a new PCI device using vfio */
					g_vfio.device_ref++;
				}
				break;
			}
		}

		if (g_vfio.device_ref == 1) {
			struct spdk_vfio_dma_map *dma_map;
			int ret;

			/* This is the first device registered. This means that the first
			 * IOMMU group might have just been added to the DPDK vfio container.
			 * From this point it is certain that the memory can be mapped now.
			 */
			TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
				ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
				if (ret) {
					DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
					break;
				}
			}
		}
		break;
	case RTE_DEV_EVENT_REMOVE:
		RTE_DEV_FOREACH(dev, "bus=pci", &dev_iter) {
			if (strcmp(dev->name, device_name) == 0) {
				struct rte_pci_device *pci_dev = RTE_DEV_TO_PCI(dev);
#if RTE_VERSION < RTE_VERSION_NUM(20, 11, 0, 0)
				if (pci_dev->kdrv == RTE_KDRV_VFIO) {
#else
				if (pci_dev->kdrv == RTE_PCI_KDRV_VFIO) {
#endif
					/* This is a PCI device using vfio */
					g_vfio.device_ref--;
				}
				break;
			}
		}

		if (g_vfio.device_ref == 0) {
			struct spdk_vfio_dma_map *dma_map;
			int ret;

			/* If DPDK doesn't have any additional devices using its vfio container,
			 * all the mappings will be automatically removed by the Linux vfio driver.
			 * We unmap the memory manually to be able to easily re-map it later regardless
			 * of other, external factors.
			 */
			TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
				struct vfio_iommu_type1_dma_unmap unmap = {};
				unmap.argsz = sizeof(unmap);
				unmap.flags = 0;
				unmap.iova = dma_map->map.iova;
				unmap.size = dma_map->map.size;
				ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
				if (ret) {
					DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
					break;
				}
			}
		}
		break;
	}

	pthread_mutex_unlock(&g_vfio.mutex);
}

static void
vtophys_iommu_init(void)
{
	char proc_fd_path[PATH_MAX + 1];
	char link_path[PATH_MAX + 1];
	const char vfio_path[] = "/dev/vfio/vfio";
	DIR *dir;
	struct dirent *d;
	struct rte_dev_iterator dev_iter;
	struct rte_device *dev;
	int rc;

	if (!vfio_enabled()) {
		return;
	}

	if (vfio_noiommu_enabled()) {
		g_vfio.noiommu_enabled = true;
	} else if (!has_iommu_groups()) {
		return;
	}

	dir = opendir("/proc/self/fd");
	if (!dir) {
		DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
		return;
	}

	while ((d = readdir(dir)) != NULL) {
		if (d->d_type != DT_LNK) {
			continue;
		}

		snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
		if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
			continue;
		}

		if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
			sscanf(d->d_name, "%d", &g_vfio.fd);
			break;
		}
	}

	closedir(dir);

	if (g_vfio.fd < 0) {
		DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
		return;
	}

	/* If the IOMMU is enabled, we need to track whether there are any devices present because
	 * it's only valid to perform vfio IOCTLs on the container when there is at least
	 * one device. The device may be a DPDK device that SPDK doesn't otherwise know about, but
	 * that's ok.
	 */
	RTE_DEV_FOREACH(dev, "bus=pci", &dev_iter) {
		struct rte_pci_device *pci_dev = RTE_DEV_TO_PCI(dev);

#if RTE_VERSION < RTE_VERSION_NUM(20, 11, 0, 0)
		if (pci_dev->kdrv == RTE_KDRV_VFIO) {
#else
		if (pci_dev->kdrv == RTE_PCI_KDRV_VFIO) {
#endif
			/* This is a PCI device using vfio */
			g_vfio.device_ref++;
		}
	}

	if (spdk_process_is_primary()) {
		rc = rte_dev_event_callback_register(NULL, vtophys_iommu_device_event, NULL);
		if (rc) {
			DEBUG_PRINT("Failed to register device event callback\n");
			return;
		}
		rc = rte_dev_event_monitor_start();
		if (rc) {
			DEBUG_PRINT("Failed to start device event monitoring.\n");
			return;
		}
	}

	g_vfio.enabled = true;

	return;
}

static void
vtophys_iommu_fini(void)
{
	if (spdk_process_is_primary()) {
		rte_dev_event_callback_unregister(NULL, vtophys_iommu_device_event, NULL);
		rte_dev_event_monitor_stop();
	}
}

#endif

void
vtophys_pci_device_added(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);

	vtophys_dev = calloc(1, sizeof(*vtophys_dev));
	if (vtophys_dev) {
		vtophys_dev->pci_device = pci_device;
		TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
	} else {
		DEBUG_PRINT("Memory allocation error\n");
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
}

void
vtophys_pci_device_removed(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		if (vtophys_dev->pci_device == pci_device) {
			TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
			free(vtophys_dev);
			break;
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
}

int
vtophys_init(void)
{
	const struct spdk_mem_map_ops vtophys_map_ops = {
		.notify_cb = vtophys_notify,
		.are_contiguous = vtophys_check_contiguous_entries,
	};

	const struct spdk_mem_map_ops phys_ref_map_ops = {
		.notify_cb = NULL,
		.are_contiguous = NULL,
	};

#if VFIO_ENABLED
	vtophys_iommu_init();
#endif

	g_phys_ref_map = spdk_mem_map_alloc(0, &phys_ref_map_ops, NULL);
	if (g_phys_ref_map == NULL) {
		DEBUG_PRINT("phys_ref map allocation failed.\n");
		return -ENOMEM;
	}

	g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
	if (g_vtophys_map == NULL) {
		DEBUG_PRINT("vtophys map allocation failed\n");
		spdk_mem_map_free(&g_phys_ref_map);
		return -ENOMEM;
	}
	return 0;
}

void
vtophys_fini(void)
{
#if VFIO_ENABLED
	vtophys_iommu_fini();
#endif
}

uint64_t
spdk_vtophys(const void *buf, uint64_t *size)
{
	uint64_t vaddr, paddr_2mb;

	vaddr = (uint64_t)buf;
	paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);

	/*
	 * SPDK_VTOPHYS_ERROR has all bits set, so a failed lookup used to survive the
	 * bitwise-or of the buf offset unchanged. Now that we add the offset instead
	 * (PCI vtophys translations may not be 2MB aligned), we must check the return
	 * value before doing the addition.
	 */
	SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
	if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
		return SPDK_VTOPHYS_ERROR;
	} else {
		return paddr_2mb + (vaddr & MASK_2MB);
	}
}

int
spdk_mem_get_fd_and_offset(void *vaddr, uint64_t *offset)
{
	struct rte_memseg *seg;
	int ret, fd;

	seg = rte_mem_virt2memseg(vaddr, NULL);
	if (!seg) {
		SPDK_ERRLOG("memory %p doesn't exist\n", vaddr);
		return -ENOENT;
	}

	fd = rte_memseg_get_fd_thread_unsafe(seg);
	if (fd < 0) {
		return fd;
	}

	ret = rte_memseg_get_fd_offset_thread_unsafe(seg, offset);
	if (ret < 0) {
		return ret;
	}

	return fd;
}
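
/*
 * Usage sketch (illustrative only, not part of the build): translating a DMA
 * buffer and checking how far the physical mapping stays contiguous. The
 * buffer and io_length below are hypothetical; buf is assumed to come from an
 * SPDK allocator (e.g. spdk_dma_malloc()), so it is already registered.
 *
 *	uint64_t size = io_length;
 *	uint64_t phys = spdk_vtophys(buf, &size);
 *
 *	if (phys == SPDK_VTOPHYS_ERROR) {
 *		// buf is not registered memory
 *	} else if (size < io_length) {
 *		// Physically contiguous only for the first 'size' bytes;
 *		// split the I/O or translate the remainder separately.
 *	}
 */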