/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "env_internal.h"

#include <rte_config.h>
#include <rte_eal_memconfig.h>

#include "spdk_internal/assert.h"
#include "spdk_internal/memory.h"

#include "spdk/assert.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/util.h"
#include "spdk/env_dpdk.h"

#ifdef __FreeBSD__
#define SPDK_VFIO_ENABLED 0
#else
#include <linux/version.h>
/*
 * DPDK versions before 17.11 don't provide a way to get VFIO information in the public API,
 * and we can't link to internal symbols when built against shared library DPDK,
 * so disable VFIO entirely in that case.
 */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0) && \
	(RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3) || !defined(RTE_BUILD_SHARED_LIB))

#define SPDK_VFIO_ENABLED 1
#include <linux/vfio.h>

#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
#include <rte_vfio.h>
#else
/* Internal DPDK function forward declaration */
int pci_vfio_is_enabled(void);
#endif

struct spdk_vfio_dma_map {
	struct vfio_iommu_type1_dma_map map;
	struct vfio_iommu_type1_dma_unmap unmap;
	TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
};

struct vfio_cfg {
	int fd;
	bool enabled;
	bool noiommu_enabled;
	unsigned device_ref;
	TAILQ_HEAD(, spdk_vfio_dma_map) maps;
	pthread_mutex_t mutex;
};

static struct vfio_cfg g_vfio = {
	.fd = -1,
	.enabled = false,
	.noiommu_enabled = false,
	.device_ref = 0,
	.maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
	.mutex = PTHREAD_MUTEX_INITIALIZER
};

#else
#define SPDK_VFIO_ENABLED 0
#endif
#endif

#if DEBUG
#define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__)
#else
#define DEBUG_PRINT(...)
#endif

#define FN_2MB_TO_4KB(fn)	(fn << (SHIFT_2MB - SHIFT_4KB))
#define FN_4KB_TO_2MB(fn)	(fn >> (SHIFT_2MB - SHIFT_4KB))

#define MAP_256TB_IDX(vfn_2mb)	((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
#define MAP_1GB_IDX(vfn_2mb)	((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))

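/* Worked example (illustration only, not part of the original source):
 * assuming the usual SHIFT_2MB = 21, SHIFT_1GB = 30 and SHIFT_256TB = 48 from
 * spdk_internal/memory.h, a virtual address decomposes as
 *
 *	vfn_2mb = vaddr >> SHIFT_2MB;	// 2MB virtual frame number
 *	MAP_256TB_IDX(vfn_2mb)		// bits [30..47] of vaddr: top-level (1GB) slot
 *	MAP_1GB_IDX(vfn_2mb)		// bits [21..29] of vaddr: 2MB slot within that 1GB
 *
 * These indices select entries in the map_256tb and map_1gb tables below.
 */
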
/* Page is registered */
#define REG_MAP_REGISTERED	(1ULL << 62)

/* A notification region barrier. The 2MB translation entry that's marked
 * with this flag must be unregistered separately. This allows contiguous
 * regions to be unregistered in the same chunks they were registered.
 */
#define REG_MAP_NOTIFY_START	(1ULL << 63)

/* Translation of a single 2MB page. */
struct map_2mb {
	uint64_t translation_2mb;
};

/* Second-level map table indexed by bits [21..29] of the virtual address.
 * Each entry contains the address translation or error for entries that haven't
 * been retrieved yet.
 */
struct map_1gb {
	struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
};

/* Top-level map table indexed by bits [30..47] of the virtual address.
 * Each entry points to a second-level map table or NULL.
 */
struct map_256tb {
	struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
};

/* Page-granularity memory address translation */
struct spdk_mem_map {
	struct map_256tb map_256tb;
	pthread_mutex_t mutex;
	uint64_t default_translation;
	struct spdk_mem_map_ops ops;
	void *cb_ctx;
	TAILQ_ENTRY(spdk_mem_map) tailq;
};

/* Registrations map. The 64 bit translations are bit fields with the
 * following layout (starting with the low bits):
 *    0 - 61 : reserved
 *   62 - 63 : flags
 */
static struct spdk_mem_map *g_mem_reg_map;
static TAILQ_HEAD(, spdk_mem_map) g_spdk_mem_maps = TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;

/*
 * Walk the currently registered memory via the main memory registration map
 * and call the new map's notify callback for each virtually contiguous region.
 */
static int
spdk_mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
{
	size_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t contig_start = UINT64_MAX;
	uint64_t contig_end = UINT64_MAX;
	struct map_1gb *map_1gb;
	int rc;

	if (!g_mem_reg_map) {
		return -EINVAL;
	}

	/* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
	pthread_mutex_lock(&g_mem_reg_map->mutex);

	for (idx_256tb = 0;
	     idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
	     idx_256tb++) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_start != UINT64_MAX) {
				/* End of a virtually contiguous range */
				rc = map->ops.notify_cb(map->cb_ctx, map, action,
							(void *)contig_start,
							contig_end - contig_start + VALUE_2MB);
				/* Don't bother handling unregister failures. It can't be any worse */
				if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
					goto err_unregister;
				}
			}
			contig_start = UINT64_MAX;
			continue;
		}

		for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_start == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_start == UINT64_MAX) {
					contig_start = vaddr;
				}

				contig_end = vaddr;
			} else {
				if (contig_start != UINT64_MAX) {
					/* End of a virtually contiguous range */
					rc = map->ops.notify_cb(map->cb_ctx, map, action,
								(void *)contig_start,
								contig_end - contig_start + VALUE_2MB);
					/* Don't bother handling unregister failures. It can't be any worse */
					if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
						goto err_unregister;
					}

					/* This page might be part of a neighbouring region, so process
					 * it again. The idx_1gb will be incremented immediately.
					 */
					idx_1gb--;
				}
				contig_start = UINT64_MAX;
			}
		}
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return 0;

err_unregister:
	/* Unwind to the first empty translation so we don't unregister
	 * a region that just failed to register.
	 */
	idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
	idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
	contig_start = UINT64_MAX;
	contig_end = UINT64_MAX;

	/* Unregister any memory we managed to register before the failure */
	for (; idx_256tb < SIZE_MAX; idx_256tb--) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_end != UINT64_MAX) {
				/* End of a virtually contiguous range */
				map->ops.notify_cb(map->cb_ctx, map,
						   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
						   (void *)contig_start,
						   contig_end - contig_start + VALUE_2MB);
			}
			contig_end = UINT64_MAX;
			continue;
		}

		for (; idx_1gb < UINT64_MAX; idx_1gb--) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_end == UINT64_MAX) {
					contig_end = vaddr;
				}
				contig_start = vaddr;
			} else {
				if (contig_end != UINT64_MAX) {
					/* End of a virtually contiguous range */
					map->ops.notify_cb(map->cb_ctx, map,
							   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
							   (void *)contig_start,
							   contig_end - contig_start + VALUE_2MB);
					idx_1gb++;
				}
				contig_end = UINT64_MAX;
			}
		}
		idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return rc;
}

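/* Usage sketch (illustration only, not part of the original source): a typical
 * consumer allocates a map with a notify callback and, optionally, an
 * are_contiguous callback, then performs lookups with spdk_mem_map_translate().
 * The vtophys map created in spdk_vtophys_init() below is the in-tree example
 * of this pattern. Hypothetical names:
 *
 *	static const struct spdk_mem_map_ops my_ops = {
 *		.notify_cb = my_notify,		// called for each 2MB-granularity (un)registration
 *		.are_contiguous = NULL,		// optional; enables multi-page translations
 *	};
 *	struct spdk_mem_map *map = spdk_mem_map_alloc(MY_DEFAULT_TRANSLATION, &my_ops, my_ctx);
 */
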
struct spdk_mem_map *
spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
{
	struct spdk_mem_map *map;
	int rc;

	map = calloc(1, sizeof(*map));
	if (map == NULL) {
		return NULL;
	}

	if (pthread_mutex_init(&map->mutex, NULL)) {
		free(map);
		return NULL;
	}

	map->default_translation = default_translation;
	map->cb_ctx = cb_ctx;
	if (ops) {
		map->ops = *ops;
	}

	if (ops && ops->notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		rc = spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			DEBUG_PRINT("Initial mem_map notify failed\n");
			pthread_mutex_destroy(&map->mutex);
			free(map);
			return NULL;
		}
		TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	return map;
}

void
spdk_mem_map_free(struct spdk_mem_map **pmap)
{
	struct spdk_mem_map *map;
	size_t i;

	if (!pmap) {
		return;
	}

	map = *pmap;

	if (!map) {
		return;
	}

	if (map->ops.notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
		TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
		free(map->map_256tb.map[i]);
	}

	pthread_mutex_destroy(&map->mutex);

	free(map);
	*pmap = NULL;
}

int
spdk_mem_register(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	seg_vaddr = vaddr;
	seg_len = 0;
	while (len > 0) {
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
					     seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
		seg_len += VALUE_2MB;
		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return rc;
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

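/* Note on granularity (illustration only, not part of the original source):
 * memory must be unregistered in the same chunks it was registered, which is
 * enforced via the REG_MAP_NOTIFY_START flag stored in g_mem_reg_map. For
 * example, with a hypothetical 2MB-aligned buffer `buf`:
 *
 *	spdk_mem_register(buf, 4 * VALUE_2MB);
 *	spdk_mem_unregister(buf, 2 * VALUE_2MB);	// -ERANGE: the region continues past the range
 *	spdk_mem_unregister(buf, 4 * VALUE_2MB);	// 0: the whole region is removed
 */
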
int
spdk_mem_unregister(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg, newreg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* The first page must be a start of a region. Also check if it's
	 * registered to make sure we don't return -ERANGE for non-registered
	 * regions.
	 */
	reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
	if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if ((reg & REG_MAP_REGISTERED) == 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EINVAL;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
	/* If the next page is registered, it must be a start of a region as well,
	 * otherwise we'd be unregistering only a part of a region.
	 */
	if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}
	seg_vaddr = vaddr;
	seg_len = 0;

	while (len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);

		if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
			TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
				rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
				if (rc != 0) {
					pthread_mutex_unlock(&g_spdk_mem_map_mutex);
					return rc;
				}
			}

			seg_vaddr = vaddr;
			seg_len = VALUE_2MB;
		} else {
			seg_len += VALUE_2MB;
		}

		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	if (seg_len > 0) {
		TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
			rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
			if (rc != 0) {
				pthread_mutex_unlock(&g_spdk_mem_map_mutex);
				return rc;
			}
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

static struct map_1gb *
spdk_mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
{
	struct map_1gb *map_1gb;
	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
	size_t i;

	if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
		return NULL;
	}

	map_1gb = map->map_256tb.map[idx_256tb];

	if (!map_1gb) {
		pthread_mutex_lock(&map->mutex);

		/* Recheck to make sure nobody else got the mutex first. */
		map_1gb = map->map_256tb.map[idx_256tb];
		if (!map_1gb) {
			map_1gb = malloc(sizeof(struct map_1gb));
			if (map_1gb) {
				/* initialize all entries to default translation */
				for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
					map_1gb->map[i].translation_2mb = map->default_translation;
				}
				map->map_256tb.map[idx_256tb] = map_1gb;
			}
		}

		pthread_mutex_unlock(&map->mutex);

		if (!map_1gb) {
			DEBUG_PRINT("allocation failed\n");
			return NULL;
		}
	}

	return map_1gb;
}

int
spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
			     uint64_t translation)
{
	uint64_t vfn_2mb;
	struct map_1gb *map_1gb;
	uint64_t idx_1gb;
	struct map_2mb *map_2mb;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr);
		return -EINVAL;
	}

	/* For now, only 2 MB-aligned registrations are supported */
	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n",
			    __func__, vaddr, size);
		return -EINVAL;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;

	while (size) {
		map_1gb = spdk_mem_map_get_map_1gb(map, vfn_2mb);
		if (!map_1gb) {
			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
			return -ENOMEM;
		}

		idx_1gb = MAP_1GB_IDX(vfn_2mb);
		map_2mb = &map_1gb->map[idx_1gb];
		map_2mb->translation_2mb = translation;

		size -= VALUE_2MB;
		vfn_2mb++;
	}

	return 0;
}

int
spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
{
	uint64_t vfn_2mb;
	struct map_1gb *map_1gb;
	uint64_t idx_1gb;
	struct map_2mb *map_2mb;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr);
		return -EINVAL;
	}

	/* For now, only 2 MB-aligned registrations are supported */
	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n",
			    __func__, vaddr, size);
		return -EINVAL;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;

	while (size) {
		map_1gb = spdk_mem_map_get_map_1gb(map, vfn_2mb);
		if (!map_1gb) {
			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
			return -ENOMEM;
		}

		idx_1gb = MAP_1GB_IDX(vfn_2mb);
		map_2mb = &map_1gb->map[idx_1gb];
		map_2mb->translation_2mb = map->default_translation;

		size -= VALUE_2MB;
		vfn_2mb++;
	}

	return 0;
}

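/* Usage note (illustration only, not part of the original source): in
 * spdk_mem_map_translate() below, *size is an in/out parameter. On input it is
 * the number of bytes the caller would like translated; on output it is clamped
 * to the number of bytes covered by a contiguous translation, which can span
 * multiple 2MB pages only when the map provides an are_contiguous callback.
 * A hypothetical lookup:
 *
 *	uint64_t len = 4 * VALUE_2MB;
 *	uint64_t translation = spdk_mem_map_translate(map, (uint64_t)buf, &len);
 *	// len now holds how many of those bytes the returned translation covers
 */
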
inline uint64_t
spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
{
	const struct map_1gb *map_1gb;
	const struct map_2mb *map_2mb;
	uint64_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t vfn_2mb;
	uint64_t cur_size;
	uint64_t prev_translation;
	uint64_t orig_translation;

	if (spdk_unlikely(vaddr & ~MASK_256TB)) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
		return map->default_translation;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;
	idx_256tb = MAP_256TB_IDX(vfn_2mb);
	idx_1gb = MAP_1GB_IDX(vfn_2mb);

	map_1gb = map->map_256tb.map[idx_256tb];
	if (spdk_unlikely(!map_1gb)) {
		return map->default_translation;
	}

	cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
	map_2mb = &map_1gb->map[idx_1gb];
	if (size == NULL || map->ops.are_contiguous == NULL ||
	    map_2mb->translation_2mb == map->default_translation) {
		if (size != NULL) {
			*size = spdk_min(*size, cur_size);
		}
		return map_2mb->translation_2mb;
	}

	orig_translation = map_2mb->translation_2mb;
	prev_translation = orig_translation;
	while (cur_size < *size) {
		vfn_2mb++;
		idx_256tb = MAP_256TB_IDX(vfn_2mb);
		idx_1gb = MAP_1GB_IDX(vfn_2mb);

		map_1gb = map->map_256tb.map[idx_256tb];
		if (spdk_unlikely(!map_1gb)) {
			break;
		}

		map_2mb = &map_1gb->map[idx_1gb];
		if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
			break;
		}

		cur_size += VALUE_2MB;
		prev_translation = map_2mb->translation_2mb;
	}

	*size = spdk_min(*size, cur_size);
	return orig_translation;
}

#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
static void
memory_hotplug_cb(enum rte_mem_event event_type,
		  const void *addr, size_t len, void *arg)
{
	if (event_type == RTE_MEM_EVENT_ALLOC) {
		spdk_mem_register((void *)addr, len);

#if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0)
		if (!spdk_env_dpdk_external_init()) {
			return;
		}
#endif

		/* Prior to DPDK 19.02, we have to worry about DPDK
		 * freeing memory in different units than it was allocated in.
		 * That doesn't work with things like RDMA MRs. So for
		 * those versions of DPDK, mark each segment so that DPDK
		 * won't later free it. That ensures we don't have to deal
		 * with that scenario.
		 *
		 * DPDK 19.02 added the --match-allocations RTE flag to
		 * avoid this condition.
		 *
		 * Note: if the user initialized DPDK separately, we can't
		 * be sure that --match-allocations was specified, so we still
		 * need to mark the segments so they aren't freed.
		 */
		while (len > 0) {
			struct rte_memseg *seg;

			seg = rte_mem_virt2memseg(addr, NULL);
			assert(seg != NULL);
			seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
			addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
			len -= seg->hugepage_sz;
		}
	} else if (event_type == RTE_MEM_EVENT_FREE) {
		spdk_mem_unregister((void *)addr, len);
	}
}

static int
memory_iter_cb(const struct rte_memseg_list *msl,
	       const struct rte_memseg *ms, size_t len, void *arg)
{
	return spdk_mem_register(ms->addr, len);
}
#endif

int
spdk_mem_map_init(void)
{
	g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
	if (g_mem_reg_map == NULL) {
		DEBUG_PRINT("memory registration map allocation failed\n");
		return -1;
	}

	/*
	 * Walk all DPDK memory segments and register them
	 * with the master memory map
	 */
#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
	rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
	rte_memseg_contig_walk(memory_iter_cb, NULL);
#else
	struct rte_mem_config *mcfg;
	size_t seg_idx;

	mcfg = rte_eal_get_configuration()->mem_config;
	for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) {
		struct rte_memseg *seg = &mcfg->memseg[seg_idx];

		if (seg->addr == NULL) {
			break;
		}

		spdk_mem_register(seg->addr, seg->len);
	}
#endif
	return 0;
}

bool
spdk_iommu_is_enabled(void)
{
#if SPDK_VFIO_ENABLED
	return g_vfio.enabled && !g_vfio.noiommu_enabled;
#else
	return false;
#endif
}

struct spdk_vtophys_pci_device {
	struct rte_pci_device *pci_device;
	TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
};

static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
	TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);

static struct spdk_mem_map *g_vtophys_map;

#if SPDK_VFIO_ENABLED
static int
vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	dma_map = calloc(1, sizeof(*dma_map));
	if (dma_map == NULL) {
		return -ENOMEM;
	}

	dma_map->map.argsz = sizeof(dma_map->map);
	dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	dma_map->map.vaddr = vaddr;
	dma_map->map.iova = iova;
	dma_map->map.size = size;

	dma_map->unmap.argsz = sizeof(dma_map->unmap);
	dma_map->unmap.flags = 0;
	dma_map->unmap.iova = iova;
	dma_map->unmap.size = size;

	pthread_mutex_lock(&g_vfio.mutex);
	if (g_vfio.device_ref == 0) {
		/* VFIO requires at least one device (IOMMU group) to be added to
		 * a VFIO container before it is possible to perform any IOMMU
		 * operations on that container. This memory will be mapped once
		 * the first device (IOMMU group) is hotplugged.
		 *
		 * Since the vfio container is managed internally by DPDK, it is
		 * also possible that some device is already in that container, but
		 * it's not managed by SPDK - e.g. a NIC attached internally
		 * inside DPDK. We could map the memory straight away in such a
		 * scenario, but there's no need to do it. DPDK devices clearly
		 * don't need our mappings and hence we defer the mapping
		 * unconditionally until the first SPDK-managed device is
		 * hotplugged.
		 */
		goto out_insert;
	}

	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
	if (ret) {
		DEBUG_PRINT("Cannot set up DMA mapping, error %d\n", errno);
		pthread_mutex_unlock(&g_vfio.mutex);
		free(dma_map);
		return ret;
	}

out_insert:
	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
	pthread_mutex_unlock(&g_vfio.mutex);
	return 0;
}

static int
vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	pthread_mutex_lock(&g_vfio.mutex);
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		if (dma_map->map.iova == iova) {
			break;
		}
	}

	if (dma_map == NULL) {
		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
		pthread_mutex_unlock(&g_vfio.mutex);
		return -ENXIO;
	}

	/** don't support partial or multiple-page unmap for now */
	assert(dma_map->map.size == size);

	if (g_vfio.device_ref == 0) {
		/* Memory is not mapped anymore, just remove its references */
		goto out_remove;
	}

	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
	if (ret) {
		DEBUG_PRINT("Cannot clear DMA mapping, error %d\n", errno);
		pthread_mutex_unlock(&g_vfio.mutex);
		return ret;
	}

out_remove:
	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
	pthread_mutex_unlock(&g_vfio.mutex);
	free(dma_map);
	return 0;
}
#endif

static uint64_t
vtophys_get_paddr_memseg(uint64_t vaddr)
{
	uintptr_t paddr;
	struct rte_memseg *seg;

#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
	seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
	if (seg != NULL) {
		paddr = seg->phys_addr;
		if (paddr == RTE_BAD_IOVA) {
			return SPDK_VTOPHYS_ERROR;
		}
		paddr += (vaddr - (uintptr_t)seg->addr);
		return paddr;
	}
#else
	struct rte_mem_config *mcfg;
	uint32_t seg_idx;

	mcfg = rte_eal_get_configuration()->mem_config;
	for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) {
		seg = &mcfg->memseg[seg_idx];
		if (seg->addr == NULL) {
			break;
		}

		if (vaddr >= (uintptr_t)seg->addr &&
		    vaddr < ((uintptr_t)seg->addr + seg->len)) {
			paddr = seg->phys_addr;
#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
			if (paddr == RTE_BAD_IOVA) {
#else
			if (paddr == RTE_BAD_PHYS_ADDR) {
#endif
				return SPDK_VTOPHYS_ERROR;
			}
			paddr += (vaddr - (uintptr_t)seg->addr);
			return paddr;
		}
	}
#endif

	return SPDK_VTOPHYS_ERROR;
}

/* Try to get the paddr from /proc/self/pagemap */
static uint64_t
vtophys_get_paddr_pagemap(uint64_t vaddr)
{
	uintptr_t paddr;

#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
#define BAD_ADDR RTE_BAD_IOVA
#define VTOPHYS rte_mem_virt2iova
#else
#define BAD_ADDR RTE_BAD_PHYS_ADDR
#define VTOPHYS rte_mem_virt2phy
#endif

	/*
	 * Note: the virt2phy/virt2iova functions have changed over time, such
	 * that older versions may return 0 while recent versions will never
	 * return 0 but RTE_BAD_PHYS_ADDR/IOVA instead. To support older and
	 * newer versions, check for both return values.
	 */
	paddr = VTOPHYS((void *)vaddr);
	if (paddr == 0 || paddr == BAD_ADDR) {
		/*
		 * The vaddr may be valid but doesn't have a backing page
		 * assigned yet. Touch the page to ensure a backing page
		 * gets assigned, then try to translate again.
		 */
		rte_atomic64_read((rte_atomic64_t *)vaddr);
		paddr = VTOPHYS((void *)vaddr);
	}
	if (paddr == 0 || paddr == BAD_ADDR) {
		/* Unable to get to the physical address. */
		return SPDK_VTOPHYS_ERROR;
	}

#undef BAD_ADDR
#undef VTOPHYS

	return paddr;
}

/* Try to get the paddr from pci devices */
static uint64_t
vtophys_get_paddr_pci(uint64_t vaddr)
{
	struct spdk_vtophys_pci_device *vtophys_dev;
	uintptr_t paddr;
	struct rte_pci_device *dev;
	struct rte_mem_resource *res;
	unsigned r;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		dev = vtophys_dev->pci_device;

		for (r = 0; r < PCI_MAX_RESOURCE; r++) {
			res = &dev->mem_resource[r];
			if (res->phys_addr && vaddr >= (uint64_t)res->addr &&
			    vaddr < (uint64_t)res->addr + res->len) {
				paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
				DEBUG_PRINT("%s: %p -> %p\n", __func__, (void *)vaddr,
					    (void *)paddr);
				pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
				return paddr;
			}
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

	return SPDK_VTOPHYS_ERROR;
}

static int
spdk_vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
		    enum spdk_mem_map_notify_action action,
		    void *vaddr, size_t len)
{
	int rc = 0, pci_phys = 0;
	uint64_t paddr;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	while (len > 0) {
		/* Get the physical address from the DPDK memsegs */
		paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);

		switch (action) {
		case SPDK_MEM_MAP_NOTIFY_REGISTER:
			if (paddr == SPDK_VTOPHYS_ERROR) {
				/* This is not an address that DPDK is managing. */
#if SPDK_VFIO_ENABLED
				if (spdk_iommu_is_enabled()) {
					/* We'll use the virtual address as the iova. DPDK
					 * currently uses physical addresses as the iovas (or counts
					 * up from 0 if it can't get physical addresses), so
					 * the range of user space virtual addresses and physical
					 * addresses will never overlap.
					 */
					paddr = (uint64_t)vaddr;
					rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
					if (rc) {
						return -EFAULT;
					}
				} else
#endif
				{
					/* Get the physical address from /proc/self/pagemap. */
					paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
					if (paddr == SPDK_VTOPHYS_ERROR) {
						/* Get the physical address from PCI devices */
						paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
						if (paddr == SPDK_VTOPHYS_ERROR) {
							DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
							return -EFAULT;
						}
						pci_phys = 1;
					}
				}
			}
			/* Since a PCI paddr can break the 2MB physical alignment, skip this check for PCI-backed addresses. */
			if (!pci_phys && (paddr & MASK_2MB)) {
				DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
				return -EINVAL;
			}

			rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
			break;
		case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
#if SPDK_VFIO_ENABLED
			if (paddr == SPDK_VTOPHYS_ERROR) {
				/*
				 * This is not an address that DPDK is managing. If vfio is enabled,
				 * we need to unmap the range from the IOMMU.
				 */
				if (spdk_iommu_is_enabled()) {
					uint64_t buffer_len = VALUE_2MB;
					paddr = spdk_mem_map_translate(map, (uint64_t)vaddr, &buffer_len);
					if (buffer_len != VALUE_2MB) {
						return -EINVAL;
					}
					rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
					if (rc) {
						return -EFAULT;
					}
				}
			}
#endif
			rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
			break;
		default:
			SPDK_UNREACHABLE();
		}

		if (rc != 0) {
			return rc;
		}
		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	return rc;
}

#if SPDK_VFIO_ENABLED

static bool
spdk_vfio_enabled(void)
{
#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
	return rte_vfio_is_enabled("vfio_pci");
#else
	return pci_vfio_is_enabled();
#endif
}

/* Check if IOMMU is enabled on the system */
static bool
has_iommu_groups(void)
{
	struct dirent *d;
	int count = 0;
	DIR *dir = opendir("/sys/kernel/iommu_groups");

	if (dir == NULL) {
		return false;
	}

	while (count < 3 && (d = readdir(dir)) != NULL) {
		count++;
	}

	closedir(dir);
	/* there will always be ./ and ../ entries */
	return count > 2;
}

static bool
spdk_vfio_noiommu_enabled(void)
{
	return rte_vfio_noiommu_is_enabled();
}

static void
spdk_vtophys_iommu_init(void)
{
	char proc_fd_path[PATH_MAX + 1];
	char link_path[PATH_MAX + 1];
	const char vfio_path[] = "/dev/vfio/vfio";
	DIR *dir;
	struct dirent *d;

	if (!spdk_vfio_enabled()) {
		return;
	}

	if (spdk_vfio_noiommu_enabled()) {
		g_vfio.noiommu_enabled = true;
	} else if (!has_iommu_groups()) {
		return;
	}

	dir = opendir("/proc/self/fd");
	if (!dir) {
		DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
		return;
	}

	while ((d = readdir(dir)) != NULL) {
		if (d->d_type != DT_LNK) {
			continue;
		}

		snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
		if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
			continue;
		}

		if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
			sscanf(d->d_name, "%d", &g_vfio.fd);
			break;
		}
	}

	closedir(dir);

	if (g_vfio.fd < 0) {
		DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
		return;
	}

	g_vfio.enabled = true;

	return;
}
#endif

void
spdk_vtophys_pci_device_added(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);

	vtophys_dev = calloc(1, sizeof(*vtophys_dev));
	if (vtophys_dev) {
		vtophys_dev->pci_device = pci_device;
		TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
	} else {
		DEBUG_PRINT("Memory allocation error\n");
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if SPDK_VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	g_vfio.device_ref++;
	if (g_vfio.device_ref > 1) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the first SPDK device using DPDK vfio. This means that the first
	 * IOMMU group might have just been added to the DPDK vfio container.
	 * From this point on, it is certain that the memory can be mapped.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
		if (ret) {
			DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}

void
spdk_vtophys_pci_device_removed(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		if (vtophys_dev->pci_device == pci_device) {
			TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
			free(vtophys_dev);
			break;
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if SPDK_VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	assert(g_vfio.device_ref > 0);
	g_vfio.device_ref--;
	if (g_vfio.device_ref > 0) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
	 * any additional devices using its vfio container, all the mappings
	 * will be automatically removed by the Linux vfio driver. We unmap
	 * the memory manually to be able to easily re-map it later regardless
	 * of other, external factors.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
		if (ret) {
			DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}

int
spdk_vtophys_init(void)
{
	const struct spdk_mem_map_ops vtophys_map_ops = {
		.notify_cb = spdk_vtophys_notify,
		.are_contiguous = NULL
	};

#if SPDK_VFIO_ENABLED
	spdk_vtophys_iommu_init();
#endif

	g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
	if (g_vtophys_map == NULL) {
		DEBUG_PRINT("vtophys map allocation failed\n");
		return -1;
	}
	return 0;
}

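/* Usage sketch (illustration only, not part of the original source): callers
 * translating a DMA buffer should check for SPDK_VTOPHYS_ERROR before using
 * the result, e.g.:
 *
 *	uint64_t phys = spdk_vtophys(buf, NULL);
 *	if (phys == SPDK_VTOPHYS_ERROR) {
 *		// buf is not registered or not translatable
 *	}
 */
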
uint64_t
spdk_vtophys(void *buf, uint64_t *size)
{
	uint64_t vaddr, paddr_2mb;

	vaddr = (uint64_t)buf;
	paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);

	/*
	 * SPDK_VTOPHYS_ERROR has all bits set, so if the lookup returned SPDK_VTOPHYS_ERROR,
	 * we could still bitwise-or it with the buf offset below and the result would still be
	 * SPDK_VTOPHYS_ERROR. However, now that we use + rather than | (because a PCI paddr can
	 * be unaligned), we must check the return value before the addition.
	 */
	SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
	if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
		return SPDK_VTOPHYS_ERROR;
	} else {
		return paddr_2mb + (vaddr & MASK_2MB);
	}
}

static int
spdk_bus_scan(void)
{
	return 0;
}

static int
spdk_bus_probe(void)
{
	return 0;
}

static struct rte_device *
spdk_bus_find_device(const struct rte_device *start,
		     rte_dev_cmp_t cmp, const void *data)
{
	return NULL;
}

#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
static enum rte_iova_mode
spdk_bus_get_iommu_class(void) {
	/* Since we register our PCI drivers after EAL init, we have no chance
	 * of switching into RTE_IOVA_VA (virtual addresses as iova) iommu
	 * class. DPDK uses RTE_IOVA_PA by default because for some platforms
	 * it's the only supported mode, but then SPDK does not support those
	 * platforms and doesn't mind defaulting to RTE_IOVA_VA. The rte_pci bus
	 * will force RTE_IOVA_PA if RTE_IOVA_VA simply cannot be used
	 * (i.e. at least one device on the system is bound to uio_pci_generic),
	 * so we simply return RTE_IOVA_VA here.
	 */
	return RTE_IOVA_VA;
}
#endif

struct rte_bus spdk_bus = {
	.scan = spdk_bus_scan,
	.probe = spdk_bus_probe,
	.find_device = spdk_bus_find_device,
#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
	.get_iommu_class = spdk_bus_get_iommu_class,
#endif
};

RTE_REGISTER_BUS(spdk, spdk_bus);