/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "env_internal.h"

#include <rte_config.h>
#include <rte_eal_memconfig.h>

#include "spdk_internal/assert.h"
#include "spdk_internal/memory.h"

#include "spdk/assert.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/util.h"

#ifdef __FreeBSD__
#define SPDK_VFIO_ENABLED 0
#else
#include <linux/version.h>
/*
 * DPDK versions before 17.11 don't provide a way to get VFIO information in the public API,
 * and we can't link to internal symbols when built against shared library DPDK,
 * so disable VFIO entirely in that case.
 */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0) && \
	(RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3) || !defined(RTE_BUILD_SHARED_LIB))

#define SPDK_VFIO_ENABLED 1
#include <linux/vfio.h>

#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
#include <rte_vfio.h>
#else
/* Internal DPDK function forward declaration */
int pci_vfio_is_enabled(void);
#endif

struct spdk_vfio_dma_map {
	struct vfio_iommu_type1_dma_map map;
	struct vfio_iommu_type1_dma_unmap unmap;
	TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
};

struct vfio_cfg {
	int fd;
	bool enabled;
	bool noiommu_enabled;
	unsigned device_ref;
	TAILQ_HEAD(, spdk_vfio_dma_map) maps;
	pthread_mutex_t mutex;
};

static struct vfio_cfg g_vfio = {
	.fd = -1,
	.enabled = false,
	.noiommu_enabled = false,
	.device_ref = 0,
	.maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
	.mutex = PTHREAD_MUTEX_INITIALIZER
};

#else
#define SPDK_VFIO_ENABLED 0
#endif
#endif

#if DEBUG
#define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__)
#else
#define DEBUG_PRINT(...)
#endif

#define FN_2MB_TO_4KB(fn)	(fn << (SHIFT_2MB - SHIFT_4KB))
#define FN_4KB_TO_2MB(fn)	(fn >> (SHIFT_2MB - SHIFT_4KB))

#define MAP_256TB_IDX(vfn_2mb)	((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
#define MAP_1GB_IDX(vfn_2mb)	((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))

/* Page is registered */
#define REG_MAP_REGISTERED	(1ULL << 62)

/* A notification region barrier. The 2MB translation entry that's marked
 * with this flag must be unregistered separately. This allows contiguous
 * regions to be unregistered in the same chunks they were registered.
 */
#define REG_MAP_NOTIFY_START	(1ULL << 63)

/* Translation of a single 2MB page. */
struct map_2mb {
	uint64_t translation_2mb;
};

/* Second-level map table indexed by bits [21..29] of the virtual address.
 * Each entry contains the address translation or error for entries that haven't
 * been retrieved yet.
 */
struct map_1gb {
	struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
};

/* Top-level map table indexed by bits [30..47] of the virtual address.
 * Each entry points to a second-level map table or NULL.
 */
struct map_256tb {
	struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
};

/* Page-granularity memory address translation */
struct spdk_mem_map {
	struct map_256tb map_256tb;
	pthread_mutex_t mutex;
	uint64_t default_translation;
	struct spdk_mem_map_ops ops;
	void *cb_ctx;
	TAILQ_ENTRY(spdk_mem_map) tailq;
};

/* Registrations map. The 64 bit translations are bit fields with the
 * following layout (starting with the low bits):
 *    0 - 61 : reserved
 *   62 - 63 : flags
 */
static struct spdk_mem_map *g_mem_reg_map;
static TAILQ_HEAD(, spdk_mem_map) g_spdk_mem_maps = TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;
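
/* For illustration, assuming the usual SHIFT_* values from spdk_internal/memory.h
 * (SHIFT_2MB == 21, SHIFT_1GB == 30, SHIFT_256TB == 48): a virtual address
 * decomposes into bits [0..20] = offset within the 2MB page, bits [21..29] =
 * MAP_1GB_IDX, and bits [30..47] = MAP_256TB_IDX. A minimal sketch of the
 * two-level lookup that spdk_mem_map_translate() below performs with these
 * macros:
 *
 *	uint64_t vfn_2mb = vaddr >> SHIFT_2MB;
 *	struct map_1gb *m1 = map->map_256tb.map[MAP_256TB_IDX(vfn_2mb)];
 *	uint64_t translation = m1 ? m1->map[MAP_1GB_IDX(vfn_2mb)].translation_2mb
 *				  : map->default_translation;
 */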

/*
 * Walk the currently registered memory via the main memory registration map
 * and call the new map's notify callback for each virtually contiguous region.
 */
static int
spdk_mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
{
	size_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t contig_start = UINT64_MAX;
	uint64_t contig_end = UINT64_MAX;
	struct map_1gb *map_1gb;
	int rc;

	if (!g_mem_reg_map) {
		return -EINVAL;
	}

	/* Hold the memory registration map mutex so no new registrations
	 * can be added while we are looping.
	 */
	pthread_mutex_lock(&g_mem_reg_map->mutex);

	for (idx_256tb = 0;
	     idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
	     idx_256tb++) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_start != UINT64_MAX) {
				/* End of a virtually contiguous range */
				rc = map->ops.notify_cb(map->cb_ctx, map, action,
							(void *)contig_start,
							contig_end - contig_start + VALUE_2MB);
				/* Don't bother handling unregister failures. It can't be any worse */
				if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
					goto err_unregister;
				}
			}
			contig_start = UINT64_MAX;
			continue;
		}

		for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_start == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_start == UINT64_MAX) {
					contig_start = vaddr;
				}

				contig_end = vaddr;
			} else {
				if (contig_start != UINT64_MAX) {
					/* End of a virtually contiguous range */
					rc = map->ops.notify_cb(map->cb_ctx, map, action,
								(void *)contig_start,
								contig_end - contig_start + VALUE_2MB);
					/* Don't bother handling unregister failures. It can't be any worse */
					if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
						goto err_unregister;
					}

					/* This page might be a part of a neighbour region, so process
					 * it again. The idx_1gb will be incremented immediately.
					 */
					idx_1gb--;
				}
				contig_start = UINT64_MAX;
			}
		}
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return 0;

err_unregister:
	/* Unwind to the first empty translation so we don't unregister
	 * a region that just failed to register.
	 */
	idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
	idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
	contig_start = UINT64_MAX;
	contig_end = UINT64_MAX;

	/* Unregister any memory we managed to register before the failure */
	for (; idx_256tb < SIZE_MAX; idx_256tb--) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_end != UINT64_MAX) {
				/* End of a virtually contiguous range */
				map->ops.notify_cb(map->cb_ctx, map,
						   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
						   (void *)contig_start,
						   contig_end - contig_start + VALUE_2MB);
			}
			contig_end = UINT64_MAX;
			continue;
		}

		for (; idx_1gb < UINT64_MAX; idx_1gb--) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_end == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_end == UINT64_MAX) {
					contig_end = vaddr;
				}
				contig_start = vaddr;
			} else {
				if (contig_end != UINT64_MAX) {
					/* End of a virtually contiguous range */
					map->ops.notify_cb(map->cb_ctx, map,
							   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
							   (void *)contig_start,
							   contig_end - contig_start + VALUE_2MB);
					idx_1gb++;
				}
				contig_end = UINT64_MAX;
			}
		}
		idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return rc;
}

struct spdk_mem_map *
spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
{
	struct spdk_mem_map *map;
	int rc;

	map = calloc(1, sizeof(*map));
	if (map == NULL) {
		return NULL;
	}

	if (pthread_mutex_init(&map->mutex, NULL)) {
		free(map);
		return NULL;
	}

	map->default_translation = default_translation;
	map->cb_ctx = cb_ctx;
	if (ops) {
		map->ops = *ops;
	}

	if (ops && ops->notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		rc = spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			DEBUG_PRINT("Initial mem_map notify failed\n");
			pthread_mutex_destroy(&map->mutex);
			free(map);
			return NULL;
		}
		TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	return map;
}

void
spdk_mem_map_free(struct spdk_mem_map **pmap)
{
	struct spdk_mem_map *map;
	size_t i;

	if (!pmap) {
		return;
	}

	map = *pmap;

	if (!map) {
		return;
	}

	if (map->ops.notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
		TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
		free(map->map_256tb.map[i]);
	}

	pthread_mutex_destroy(&map->mutex);

	free(map);
	*pmap = NULL;
}
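
/* Illustrative usage sketch with hypothetical names (my_notify, my_ops, my_map):
 * a map allocated with a notify callback is called back for every registered
 * region, so it can maintain its own per-2MB-page translations.
 *
 *	static int
 *	my_notify(void *cb_ctx, struct spdk_mem_map *map,
 *		  enum spdk_mem_map_notify_action action, void *vaddr, size_t len)
 *	{
 *		if (action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
 *			return spdk_mem_map_set_translation(map, (uint64_t)vaddr,
 *							    len, (uint64_t)vaddr);
 *		}
 *		return spdk_mem_map_clear_translation(map, (uint64_t)vaddr, len);
 *	}
 *
 *	static const struct spdk_mem_map_ops my_ops = {
 *		.notify_cb = my_notify,
 *		.are_contiguous = NULL,
 *	};
 *
 *	struct spdk_mem_map *my_map = spdk_mem_map_alloc(0, &my_ops, NULL);
 */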

int
spdk_mem_register(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	seg_vaddr = vaddr;
	seg_len = 0;
	while (len > 0) {
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
					     seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
		seg_len += VALUE_2MB;
		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return rc;
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

int
spdk_mem_unregister(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg, newreg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* The first page must be a start of a region. Also check if it's
	 * registered to make sure we don't return -ERANGE for non-registered
	 * regions.
	 */
	reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
	if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if ((reg & REG_MAP_REGISTERED) == 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EINVAL;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
	/* If the next page is registered, it must be a start of a region as well,
	 * otherwise we'd be unregistering only a part of a region.
	 */
	if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}
	seg_vaddr = vaddr;
	seg_len = 0;

	while (len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);

		if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
			TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
				rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
				if (rc != 0) {
					pthread_mutex_unlock(&g_spdk_mem_map_mutex);
					return rc;
				}
			}

			seg_vaddr = vaddr;
			seg_len = VALUE_2MB;
		} else {
			seg_len += VALUE_2MB;
		}

		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	if (seg_len > 0) {
		TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
			rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
			if (rc != 0) {
				pthread_mutex_unlock(&g_spdk_mem_map_mutex);
				return rc;
			}
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}
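
/* Granularity note (illustrative): spdk_mem_register() marks the first 2MB page
 * of every call with REG_MAP_NOTIFY_START, and spdk_mem_unregister() enforces
 * those boundaries. For example, after
 *
 *	spdk_mem_register(buf, 6 * 1024 * 1024);
 *
 * the region must be unregistered as a whole; attempting to unregister only the
 * middle 2MB page (or any other partial range) returns -ERANGE.
 */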

static struct map_1gb *
spdk_mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
{
	struct map_1gb *map_1gb;
	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
	size_t i;

	if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
		return NULL;
	}

	map_1gb = map->map_256tb.map[idx_256tb];

	if (!map_1gb) {
		pthread_mutex_lock(&map->mutex);

		/* Recheck to make sure nobody else got the mutex first. */
		map_1gb = map->map_256tb.map[idx_256tb];
		if (!map_1gb) {
			map_1gb = malloc(sizeof(struct map_1gb));
			if (map_1gb) {
				/* initialize all entries to default translation */
				for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
					map_1gb->map[i].translation_2mb = map->default_translation;
				}
				map->map_256tb.map[idx_256tb] = map_1gb;
			}
		}

		pthread_mutex_unlock(&map->mutex);

		if (!map_1gb) {
			DEBUG_PRINT("allocation failed\n");
			return NULL;
		}
	}

	return map_1gb;
}

int
spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
			     uint64_t translation)
{
	uint64_t vfn_2mb;
	struct map_1gb *map_1gb;
	uint64_t idx_1gb;
	struct map_2mb *map_2mb;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr);
		return -EINVAL;
	}

	/* For now, only 2 MB-aligned registrations are supported */
	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n",
			    __func__, vaddr, size);
		return -EINVAL;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;

	while (size) {
		map_1gb = spdk_mem_map_get_map_1gb(map, vfn_2mb);
		if (!map_1gb) {
			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
			return -ENOMEM;
		}

		idx_1gb = MAP_1GB_IDX(vfn_2mb);
		map_2mb = &map_1gb->map[idx_1gb];
		map_2mb->translation_2mb = translation;

		size -= VALUE_2MB;
		vfn_2mb++;
	}

	return 0;
}

int
spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
{
	uint64_t vfn_2mb;
	struct map_1gb *map_1gb;
	uint64_t idx_1gb;
	struct map_2mb *map_2mb;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr);
		return -EINVAL;
	}

	/* For now, only 2 MB-aligned registrations are supported */
	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n",
			    __func__, vaddr, size);
		return -EINVAL;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;

	while (size) {
		map_1gb = spdk_mem_map_get_map_1gb(map, vfn_2mb);
		if (!map_1gb) {
			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
			return -ENOMEM;
		}

		idx_1gb = MAP_1GB_IDX(vfn_2mb);
		map_2mb = &map_1gb->map[idx_1gb];
		map_2mb->translation_2mb = map->default_translation;

		size -= VALUE_2MB;
		vfn_2mb++;
	}

	return 0;
}

inline uint64_t
spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
{
	const struct map_1gb *map_1gb;
	const struct map_2mb *map_2mb;
	uint64_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t vfn_2mb;
	uint64_t cur_size;
	uint64_t prev_translation;
	uint64_t orig_translation;

	if (spdk_unlikely(vaddr & ~MASK_256TB)) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
		return map->default_translation;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;
	idx_256tb = MAP_256TB_IDX(vfn_2mb);
	idx_1gb = MAP_1GB_IDX(vfn_2mb);

	map_1gb = map->map_256tb.map[idx_256tb];
	if (spdk_unlikely(!map_1gb)) {
		return map->default_translation;
	}

	cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
	map_2mb = &map_1gb->map[idx_1gb];
	if (size == NULL || map->ops.are_contiguous == NULL ||
	    map_2mb->translation_2mb == map->default_translation) {
		if (size != NULL) {
			*size = spdk_min(*size, cur_size);
		}
		return map_2mb->translation_2mb;
	}

	orig_translation = map_2mb->translation_2mb;
	prev_translation = orig_translation;
	while (cur_size < *size) {
		vfn_2mb++;
		idx_256tb = MAP_256TB_IDX(vfn_2mb);
		idx_1gb = MAP_1GB_IDX(vfn_2mb);

		map_1gb = map->map_256tb.map[idx_256tb];
		if (spdk_unlikely(!map_1gb)) {
			break;
		}

		map_2mb = &map_1gb->map[idx_1gb];
		if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
			break;
		}

		cur_size += VALUE_2MB;
		prev_translation = map_2mb->translation_2mb;
	}

	*size = spdk_min(*size, cur_size);
	return orig_translation;
}
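
/* Illustrative use of the in/out size parameter: the caller passes the number
 * of bytes it wants translated starting at vaddr and receives back the number
 * of bytes covered by the returned translation (bounded by a single 2MB page
 * unless the map provides an are_contiguous callback).
 *
 *	uint64_t len = 8192;
 *	uint64_t translation = spdk_mem_map_translate(map, (uint64_t)buf, &len);
 *
 * After the call, len holds the number of bytes starting at buf covered by
 * the returned translation; it never exceeds the requested 8192 bytes.
 */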

#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
static void
memory_hotplug_cb(enum rte_mem_event event_type,
		  const void *addr, size_t len, void *arg)
{
	if (event_type == RTE_MEM_EVENT_ALLOC) {
		spdk_mem_register((void *)addr, len);

		/* Now mark each segment so that DPDK won't later free it.
		 * This ensures we don't have to deal with the memory
		 * getting freed in different units than it was allocated.
		 */
		while (len > 0) {
			struct rte_memseg *seg;

			seg = rte_mem_virt2memseg(addr, NULL);
			assert(seg != NULL);
			seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
			addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
			len -= seg->hugepage_sz;
		}
	} else if (event_type == RTE_MEM_EVENT_FREE) {
		spdk_mem_unregister((void *)addr, len);
	}
}

static int
memory_iter_cb(const struct rte_memseg_list *msl,
	       const struct rte_memseg *ms, size_t len, void *arg)
{
	return spdk_mem_register(ms->addr, len);
}
#endif

int
spdk_mem_map_init(void)
{
	g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
	if (g_mem_reg_map == NULL) {
		DEBUG_PRINT("memory registration map allocation failed\n");
		return -1;
	}

	/*
	 * Walk all DPDK memory segments and register them
	 * with the master memory map
	 */
#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
	rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
	rte_memseg_contig_walk(memory_iter_cb, NULL);
#else
	struct rte_mem_config *mcfg;
	size_t seg_idx;

	mcfg = rte_eal_get_configuration()->mem_config;
	for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) {
		struct rte_memseg *seg = &mcfg->memseg[seg_idx];

		if (seg->addr == NULL) {
			break;
		}

		spdk_mem_register(seg->addr, seg->len);
	}
#endif
	return 0;
}

struct spdk_vtophys_pci_device {
	struct rte_pci_device *pci_device;
	TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
};

static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
	TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);

static struct spdk_mem_map *g_vtophys_map;

#if SPDK_VFIO_ENABLED
static int
vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	dma_map = calloc(1, sizeof(*dma_map));
	if (dma_map == NULL) {
		return -ENOMEM;
	}

	dma_map->map.argsz = sizeof(dma_map->map);
	dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	dma_map->map.vaddr = vaddr;
	dma_map->map.iova = iova;
	dma_map->map.size = size;

	dma_map->unmap.argsz = sizeof(dma_map->unmap);
	dma_map->unmap.flags = 0;
	dma_map->unmap.iova = iova;
	dma_map->unmap.size = size;

	pthread_mutex_lock(&g_vfio.mutex);
	if (g_vfio.device_ref == 0) {
		/* VFIO requires at least one device (IOMMU group) to be added to
		 * a VFIO container before it is possible to perform any IOMMU
		 * operations on that container. This memory will be mapped once
		 * the first device (IOMMU group) is hotplugged.
		 *
		 * Since the vfio container is managed internally by DPDK, it is
		 * also possible that some device is already in that container, but
		 * it's not managed by SPDK - e.g. a NIC attached internally
		 * inside DPDK. We could map the memory straight away in such a
		 * scenario, but there's no need to do it. DPDK devices clearly
		 * don't need our mappings and hence we defer the mapping
		 * unconditionally until the first SPDK-managed device is
		 * hotplugged.
		 */
		goto out_insert;
	}

	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
	if (ret) {
		DEBUG_PRINT("Cannot set up DMA mapping, error %d\n", errno);
		pthread_mutex_unlock(&g_vfio.mutex);
		free(dma_map);
		return ret;
	}

out_insert:
	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
	pthread_mutex_unlock(&g_vfio.mutex);
	return 0;
}

static int
vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	pthread_mutex_lock(&g_vfio.mutex);
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		if (dma_map->map.iova == iova) {
			break;
		}
	}

	if (dma_map == NULL) {
		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
		pthread_mutex_unlock(&g_vfio.mutex);
		return -ENXIO;
	}

	/* don't support partial or multiple-page unmap for now */
	assert(dma_map->map.size == size);

	if (g_vfio.device_ref == 0) {
		/* Memory is not mapped anymore, just remove its references */
		goto out_remove;
	}

	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
	if (ret) {
		DEBUG_PRINT("Cannot clear DMA mapping, error %d\n", errno);
		pthread_mutex_unlock(&g_vfio.mutex);
		return ret;
	}

out_remove:
	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
	pthread_mutex_unlock(&g_vfio.mutex);
	free(dma_map);
	return 0;
}
#endif

static uint64_t
vtophys_get_paddr_memseg(uint64_t vaddr)
{
	uintptr_t paddr;
	struct rte_memseg *seg;

#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
	seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
	if (seg != NULL) {
		paddr = seg->phys_addr;
		if (paddr == RTE_BAD_IOVA) {
			return SPDK_VTOPHYS_ERROR;
		}
		paddr += (vaddr - (uintptr_t)seg->addr);
		return paddr;
	}
#else
	struct rte_mem_config *mcfg;
	uint32_t seg_idx;

	mcfg = rte_eal_get_configuration()->mem_config;
	for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) {
		seg = &mcfg->memseg[seg_idx];
		if (seg->addr == NULL) {
			break;
		}

		if (vaddr >= (uintptr_t)seg->addr &&
		    vaddr < ((uintptr_t)seg->addr + seg->len)) {
			paddr = seg->phys_addr;
#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
			if (paddr == RTE_BAD_IOVA) {
#else
			if (paddr == RTE_BAD_PHYS_ADDR) {
#endif
				return SPDK_VTOPHYS_ERROR;
			}
			paddr += (vaddr - (uintptr_t)seg->addr);
			return paddr;
		}
	}
#endif

	return SPDK_VTOPHYS_ERROR;
}

/* Try to get the paddr from /proc/self/pagemap */
static uint64_t
vtophys_get_paddr_pagemap(uint64_t vaddr)
{
	uintptr_t paddr;

#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
#define BAD_ADDR	RTE_BAD_IOVA
#define VTOPHYS		rte_mem_virt2iova
#else
#define BAD_ADDR	RTE_BAD_PHYS_ADDR
#define VTOPHYS		rte_mem_virt2phy
#endif

	/*
	 * Note: the virt2phy/virt2iova functions have changed over time, such
	 * that older versions may return 0 while recent versions never return 0
	 * and return RTE_BAD_PHYS_ADDR/IOVA instead. To support both older and
	 * newer versions, check for both return values.
	 */
	paddr = VTOPHYS((void *)vaddr);
	if (paddr == 0 || paddr == BAD_ADDR) {
		/*
		 * The vaddr may be valid but doesn't have a backing page
		 * assigned yet. Touch the page to ensure a backing page
		 * gets assigned, then try to translate again.
		 */
		rte_atomic64_read((rte_atomic64_t *)vaddr);
		paddr = VTOPHYS((void *)vaddr);
	}
	if (paddr == 0 || paddr == BAD_ADDR) {
		/* Unable to get to the physical address. */
		return SPDK_VTOPHYS_ERROR;
	}

#undef BAD_ADDR
#undef VTOPHYS

	return paddr;
}

/* Try to get the paddr from pci devices */
static uint64_t
vtophys_get_paddr_pci(uint64_t vaddr)
{
	struct spdk_vtophys_pci_device *vtophys_dev;
	uintptr_t paddr;
	struct rte_pci_device *dev;
	struct rte_mem_resource *res;
	unsigned r;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		dev = vtophys_dev->pci_device;

		for (r = 0; r < PCI_MAX_RESOURCE; r++) {
			res = &dev->mem_resource[r];
			if (res->phys_addr && vaddr >= (uint64_t)res->addr &&
			    vaddr < (uint64_t)res->addr + res->len) {
				paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
				DEBUG_PRINT("%s: %p -> %p\n", __func__, (void *)vaddr,
					    (void *)paddr);
				pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
				return paddr;
			}
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

	return SPDK_VTOPHYS_ERROR;
}
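
/* The vtophys notify callback below resolves a physical address for each 2MB
 * page in this order: the DPDK memseg tables, /proc/self/pagemap (via
 * rte_mem_virt2phy/virt2iova), and finally the registered PCI BARs above.
 * When VFIO with an IOMMU is in use, DPDK-external memory is instead mapped
 * through the IOMMU, with the virtual address reused as the IOVA.
 */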

static int
spdk_vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
		    enum spdk_mem_map_notify_action action,
		    void *vaddr, size_t len)
{
	int rc = 0, pci_phys = 0;
	uint64_t paddr;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	while (len > 0) {
		/* Get the physical address from the DPDK memsegs */
		paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);

		switch (action) {
		case SPDK_MEM_MAP_NOTIFY_REGISTER:
			if (paddr == SPDK_VTOPHYS_ERROR) {
				/* This is not an address that DPDK is managing. */
#if SPDK_VFIO_ENABLED
				if (g_vfio.enabled && !g_vfio.noiommu_enabled) {
					/* We'll use the virtual address as the iova. DPDK
					 * currently uses physical addresses as the iovas (or counts
					 * up from 0 if it can't get physical addresses), so
					 * the range of user space virtual addresses and physical
					 * addresses will never overlap.
					 */
					paddr = (uint64_t)vaddr;
					rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
					if (rc) {
						return -EFAULT;
					}
				} else
#endif
				{
					/* Get the physical address from /proc/self/pagemap. */
					paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
					if (paddr == SPDK_VTOPHYS_ERROR) {
						/* Get the physical address from PCI devices */
						paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
						if (paddr == SPDK_VTOPHYS_ERROR) {
							DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
							return -EFAULT;
						}
						pci_phys = 1;
					}
				}
			}
			/* A PCI BAR paddr may not be 2MB aligned, so skip this check for PCI addresses. */
			if (!pci_phys && (paddr & MASK_2MB)) {
				DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
				return -EINVAL;
			}

			rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
			break;
		case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
#if SPDK_VFIO_ENABLED
			if (paddr == SPDK_VTOPHYS_ERROR) {
				/*
				 * This is not an address that DPDK is managing. If vfio is enabled,
				 * we need to unmap the range from the IOMMU
				 */
				if (g_vfio.enabled && !g_vfio.noiommu_enabled) {
					uint64_t buffer_len = VALUE_2MB;
					paddr = spdk_mem_map_translate(map, (uint64_t)vaddr, &buffer_len);
					if (buffer_len != VALUE_2MB) {
						return -EINVAL;
					}
					rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
					if (rc) {
						return -EFAULT;
					}
				}
			}
#endif
			rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
			break;
		default:
			SPDK_UNREACHABLE();
		}

		if (rc != 0) {
			return rc;
		}
		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	return rc;
}

#if SPDK_VFIO_ENABLED

static bool
spdk_vfio_enabled(void)
{
#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
	return rte_vfio_is_enabled("vfio_pci");
#else
	return pci_vfio_is_enabled();
#endif
}

/* Check if IOMMU is enabled on the system */
static bool
has_iommu_groups(void)
{
	struct dirent *d;
	int count = 0;
	DIR *dir = opendir("/sys/kernel/iommu_groups");

	if (dir == NULL) {
		return false;
	}

	while (count < 3 && (d = readdir(dir)) != NULL) {
		count++;
	}

	closedir(dir);
	/* there will always be ./ and ../ entries */
	return count > 2;
}

static bool
spdk_vfio_noiommu_enabled(void)
{
	return rte_vfio_noiommu_is_enabled();
}

static void
spdk_vtophys_iommu_init(void)
{
	char proc_fd_path[PATH_MAX + 1];
	char link_path[PATH_MAX + 1];
	const char vfio_path[] = "/dev/vfio/vfio";
	DIR *dir;
	struct dirent *d;

	if (!spdk_vfio_enabled()) {
		return;
	}

	if (spdk_vfio_noiommu_enabled()) {
		g_vfio.noiommu_enabled = true;
	} else if (!has_iommu_groups()) {
		return;
	}

	dir = opendir("/proc/self/fd");
	if (!dir) {
		DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
		return;
	}

	while ((d = readdir(dir)) != NULL) {
		if (d->d_type != DT_LNK) {
			continue;
		}

		snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
		if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
			continue;
		}

		if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
			sscanf(d->d_name, "%d", &g_vfio.fd);
			break;
		}
	}

	closedir(dir);

	if (g_vfio.fd < 0) {
		DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
		return;
	}

	g_vfio.enabled = true;

	return;
}
#endif

void
spdk_vtophys_pci_device_added(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);

	vtophys_dev = calloc(1, sizeof(*vtophys_dev));
	if (vtophys_dev) {
		vtophys_dev->pci_device = pci_device;
		TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
	} else {
		DEBUG_PRINT("Memory allocation error\n");
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if SPDK_VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	g_vfio.device_ref++;
	if (g_vfio.device_ref > 1) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the first SPDK device using DPDK vfio. This means that the first
	 * IOMMU group might have just been added to the DPDK vfio container.
	 * From this point on it is certain that the memory can be mapped.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
		if (ret) {
			DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}

void
spdk_vtophys_pci_device_removed(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		if (vtophys_dev->pci_device == pci_device) {
			TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
			free(vtophys_dev);
			break;
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if SPDK_VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	assert(g_vfio.device_ref > 0);
	g_vfio.device_ref--;
	if (g_vfio.device_ref > 0) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
	 * any additional devices using its vfio container, all the mappings
	 * will be automatically removed by the Linux vfio driver. We unmap
	 * the memory manually to be able to easily re-map it later regardless
	 * of other, external factors.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
		if (ret) {
			DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}

int
spdk_vtophys_init(void)
{
	const struct spdk_mem_map_ops vtophys_map_ops = {
		.notify_cb = spdk_vtophys_notify,
		.are_contiguous = NULL
	};

#if SPDK_VFIO_ENABLED
	spdk_vtophys_iommu_init();
#endif

	g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
	if (g_vtophys_map == NULL) {
		DEBUG_PRINT("vtophys map allocation failed\n");
		return -1;
	}
	return 0;
}

uint64_t
spdk_vtophys(void *buf, uint64_t *size)
{
	uint64_t vaddr, paddr_2mb;

	vaddr = (uint64_t)buf;
	paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);

	/*
	 * SPDK_VTOPHYS_ERROR has all bits set. The 2MB offset used to be combined
	 * with the translation using bitwise OR, which preserved the error value,
	 * but since PCI BAR addresses may not be 2MB aligned we now add the offset
	 * instead and therefore must check for SPDK_VTOPHYS_ERROR before the addition.
	 */
	SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
	if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
		return SPDK_VTOPHYS_ERROR;
	} else {
		return paddr_2mb + (vaddr & MASK_2MB);
	}
}

static int
spdk_bus_scan(void)
{
	return 0;
}

static int
spdk_bus_probe(void)
{
	return 0;
}

static struct rte_device *
spdk_bus_find_device(const struct rte_device *start,
		     rte_dev_cmp_t cmp, const void *data)
{
	return NULL;
}

#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
static enum rte_iova_mode
spdk_bus_get_iommu_class(void) {
	/* Since we register our PCI drivers after EAL init, we have no chance
	 * of switching into RTE_IOVA_VA (virtual addresses as iova) iommu
	 * class. DPDK uses RTE_IOVA_PA by default because for some platforms
	 * it's the only supported mode, but then SPDK does not support those
	 * platforms and doesn't mind defaulting to RTE_IOVA_VA. The rte_pci bus
	 * will force RTE_IOVA_PA if RTE_IOVA_VA simply cannot be used
	 * (i.e. at least one device on the system is bound to uio_pci_generic),
	 * so we simply return RTE_IOVA_VA here.
	 */
	return RTE_IOVA_VA;
}
#endif

struct rte_bus spdk_bus = {
	.scan = spdk_bus_scan,
	.probe = spdk_bus_probe,
	.find_device = spdk_bus_find_device,
#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
	.get_iommu_class = spdk_bus_get_iommu_class,
#endif
};

RTE_REGISTER_BUS(spdk, spdk_bus);