/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "env_internal.h"

#include <rte_config.h>
#include <rte_memory.h>
#include <rte_eal_memconfig.h>

#include "spdk_internal/assert.h"

#include "spdk/assert.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/util.h"
#include "spdk/memory.h"
#include "spdk/env_dpdk.h"

#ifdef __FreeBSD__
#define VFIO_ENABLED 0
#else
#include <linux/version.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
#define VFIO_ENABLED 1
#include <linux/vfio.h>
#include <rte_vfio.h>

struct spdk_vfio_dma_map {
	struct vfio_iommu_type1_dma_map map;
	struct vfio_iommu_type1_dma_unmap unmap;
	TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
};

struct vfio_cfg {
	int fd;
	bool enabled;
	bool noiommu_enabled;
	unsigned device_ref;
	TAILQ_HEAD(, spdk_vfio_dma_map) maps;
	pthread_mutex_t mutex;
};

static struct vfio_cfg g_vfio = {
	.fd = -1,
	.enabled = false,
	.noiommu_enabled = false,
	.device_ref = 0,
	.maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
	.mutex = PTHREAD_MUTEX_INITIALIZER
};

#else
#define VFIO_ENABLED 0
#endif
#endif

#if DEBUG
#define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__)
#else
#define DEBUG_PRINT(...)
#endif

#define FN_2MB_TO_4KB(fn)	(fn << (SHIFT_2MB - SHIFT_4KB))
#define FN_4KB_TO_2MB(fn)	(fn >> (SHIFT_2MB - SHIFT_4KB))

#define MAP_256TB_IDX(vfn_2mb)	((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
#define MAP_1GB_IDX(vfn_2mb)	((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))
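
/* Illustrative worked example (assumed address, not taken from the code): for
 * vaddr = 0x00007f0080400000, vfn_2mb = vaddr >> SHIFT_2MB = 0x3f80402.
 * MAP_256TB_IDX(vfn_2mb) = 0x3f80402 >> 9 = 0x1fc02 selects the second-level
 * map_1gb table, and MAP_1GB_IDX(vfn_2mb) = 0x3f80402 & 0x1ff = 0x2 selects
 * the 2MB entry within it. This is the decomposition used by the map
 * structures defined below.
 */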

/* Page is registered */
#define REG_MAP_REGISTERED	(1ULL << 62)

/* A notification region barrier. The 2MB translation entry that's marked
 * with this flag must be unregistered separately. This allows contiguous
 * regions to be unregistered in the same chunks they were registered.
 */
#define REG_MAP_NOTIFY_START	(1ULL << 63)

/* Translation of a single 2MB page. */
struct map_2mb {
	uint64_t translation_2mb;
};

/* Second-level map table indexed by bits [21..29] of the virtual address.
 * Each entry contains the address translation or error for entries that haven't
 * been retrieved yet.
 */
struct map_1gb {
	struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
};

/* Top-level map table indexed by bits [30..47] of the virtual address.
 * Each entry points to a second-level map table or NULL.
 */
struct map_256tb {
	struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
};

/* Page-granularity memory address translation */
struct spdk_mem_map {
	struct map_256tb map_256tb;
	pthread_mutex_t mutex;
	uint64_t default_translation;
	struct spdk_mem_map_ops ops;
	void *cb_ctx;
	TAILQ_ENTRY(spdk_mem_map) tailq;
};

/* Registrations map. The 64 bit translations are bit fields with the
 * following layout (starting with the low bits):
 *    0 - 61 : reserved
 *   62 - 63 : flags
 */
static struct spdk_mem_map *g_mem_reg_map;
static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps =
	TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;

static bool g_legacy_mem;

/*
 * Walk the currently registered memory via the main memory registration map
 * and call the new map's notify callback for each virtually contiguous region.
 */
static int
mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
{
	size_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t contig_start = UINT64_MAX;
	uint64_t contig_end = UINT64_MAX;
	struct map_1gb *map_1gb;
	int rc;

	if (!g_mem_reg_map) {
		return -EINVAL;
	}

	/* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
	pthread_mutex_lock(&g_mem_reg_map->mutex);

	for (idx_256tb = 0;
	     idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
	     idx_256tb++) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_start != UINT64_MAX) {
				/* End of a virtually contiguous range */
				rc = map->ops.notify_cb(map->cb_ctx, map, action,
							(void *)contig_start,
							contig_end - contig_start + VALUE_2MB);
				/* Don't bother handling unregister failures. It can't be any worse */
				if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
					goto err_unregister;
				}
			}
			contig_start = UINT64_MAX;
			continue;
		}

		for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_start == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_start == UINT64_MAX) {
					contig_start = vaddr;
				}

				contig_end = vaddr;
			} else {
				if (contig_start != UINT64_MAX) {
					/* End of a virtually contiguous range */
					rc = map->ops.notify_cb(map->cb_ctx, map, action,
								(void *)contig_start,
								contig_end - contig_start + VALUE_2MB);
					/* Don't bother handling unregister failures.
					 * It can't be any worse */
					if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
						goto err_unregister;
					}

					/* This page might be a part of a neighbour region, so process
					 * it again. The idx_1gb will be incremented immediately.
					 */
					idx_1gb--;
				}
				contig_start = UINT64_MAX;
			}
		}
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return 0;

err_unregister:
	/* Unwind to the first empty translation so we don't unregister
	 * a region that just failed to register.
	 */
	idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
	idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
	contig_start = UINT64_MAX;
	contig_end = UINT64_MAX;

	/* Unregister any memory we managed to register before the failure */
	for (; idx_256tb < SIZE_MAX; idx_256tb--) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_end != UINT64_MAX) {
				/* End of a virtually contiguous range */
				map->ops.notify_cb(map->cb_ctx, map,
						   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
						   (void *)contig_start,
						   contig_end - contig_start + VALUE_2MB);
			}
			contig_end = UINT64_MAX;
			continue;
		}

		for (; idx_1gb < UINT64_MAX; idx_1gb--) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_end == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_end == UINT64_MAX) {
					contig_end = vaddr;
				}
				contig_start = vaddr;
			} else {
				if (contig_end != UINT64_MAX) {
					/* End of a virtually contiguous range */
					map->ops.notify_cb(map->cb_ctx, map,
							   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
							   (void *)contig_start,
							   contig_end - contig_start + VALUE_2MB);
					idx_1gb++;
				}
				contig_end = UINT64_MAX;
			}
		}
		idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return rc;
}

struct spdk_mem_map *
spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
{
	struct spdk_mem_map *map;
	int rc;

	map = calloc(1, sizeof(*map));
	if (map == NULL) {
		return NULL;
	}

	if (pthread_mutex_init(&map->mutex, NULL)) {
		free(map);
		return NULL;
	}

	map->default_translation = default_translation;
	map->cb_ctx = cb_ctx;
	if (ops) {
		map->ops = *ops;
	}

	if (ops && ops->notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		rc = mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			DEBUG_PRINT("Initial mem_map notify failed\n");
			pthread_mutex_destroy(&map->mutex);
			free(map);
			return NULL;
		}
		TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	return map;
}

void
spdk_mem_map_free(struct spdk_mem_map **pmap)
{
	struct spdk_mem_map *map;
	size_t i;

	if (!pmap) {
		return;
	}

	map = *pmap;

	if (!map) {
		return;
	}

	if (map->ops.notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
		TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	for (i = 0;
	     i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
		free(map->map_256tb.map[i]);
	}

	pthread_mutex_destroy(&map->mutex);

	free(map);
	*pmap = NULL;
}

int
spdk_mem_register(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	seg_vaddr = vaddr;
	seg_len = 0;
	while (len > 0) {
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
					     seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
		seg_len += VALUE_2MB;
		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return rc;
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

int
spdk_mem_unregister(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg, newreg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* The first page must be a start of a region. Also check if it's
	 * registered to make sure we don't return -ERANGE for non-registered
	 * regions.
	 */
	reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
	if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if ((reg & REG_MAP_REGISTERED) == 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EINVAL;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
	/* If the next page is registered, it must be a start of a region as well,
	 * otherwise we'd be unregistering only a part of a region.
	 */
	if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}
	seg_vaddr = vaddr;
	seg_len = 0;

	while (len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);

		if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
			TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
				rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
				if (rc != 0) {
					pthread_mutex_unlock(&g_spdk_mem_map_mutex);
					return rc;
				}
			}

			seg_vaddr = vaddr;
			seg_len = VALUE_2MB;
		} else {
			seg_len += VALUE_2MB;
		}

		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	if (seg_len > 0) {
		TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
			rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
			if (rc != 0) {
				pthread_mutex_unlock(&g_spdk_mem_map_mutex);
				return rc;
			}
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

int
spdk_mem_reserve(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* Check if any part of this range is already registered */
	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	/* Simply set the translation to the memory map's default. This allocates the space in the
	 * map but does not provide a valid translation. */
	spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, len,
				     g_mem_reg_map->default_translation);

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, map->default_translation);
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

static struct map_1gb *
mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
{
	struct map_1gb *map_1gb;
	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
	size_t i;

	if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
		return NULL;
	}

	map_1gb = map->map_256tb.map[idx_256tb];

	if (!map_1gb) {
		pthread_mutex_lock(&map->mutex);

		/* Recheck to make sure nobody else got the mutex first.
		 */
		map_1gb = map->map_256tb.map[idx_256tb];
		if (!map_1gb) {
			map_1gb = malloc(sizeof(struct map_1gb));
			if (map_1gb) {
				/* initialize all entries to default translation */
				for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
					map_1gb->map[i].translation_2mb = map->default_translation;
				}
				map->map_256tb.map[idx_256tb] = map_1gb;
			}
		}

		pthread_mutex_unlock(&map->mutex);

		if (!map_1gb) {
			DEBUG_PRINT("allocation failed\n");
			return NULL;
		}
	}

	return map_1gb;
}

int
spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
			     uint64_t translation)
{
	uint64_t vfn_2mb;
	struct map_1gb *map_1gb;
	uint64_t idx_1gb;
	struct map_2mb *map_2mb;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr);
		return -EINVAL;
	}

	/* For now, only 2 MB-aligned registrations are supported */
	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n",
			    __func__, vaddr, size);
		return -EINVAL;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;

	while (size) {
		map_1gb = mem_map_get_map_1gb(map, vfn_2mb);
		if (!map_1gb) {
			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
			return -ENOMEM;
		}

		idx_1gb = MAP_1GB_IDX(vfn_2mb);
		map_2mb = &map_1gb->map[idx_1gb];
		map_2mb->translation_2mb = translation;

		size -= VALUE_2MB;
		vfn_2mb++;
	}

	return 0;
}

int
spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
{
	return spdk_mem_map_set_translation(map, vaddr, size, map->default_translation);
}

inline uint64_t
spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
{
	const struct map_1gb *map_1gb;
	const struct map_2mb *map_2mb;
	uint64_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t vfn_2mb;
	uint64_t cur_size;
	uint64_t prev_translation;
	uint64_t orig_translation;

	if (spdk_unlikely(vaddr & ~MASK_256TB)) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
		return map->default_translation;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;
	idx_256tb = MAP_256TB_IDX(vfn_2mb);
	idx_1gb = MAP_1GB_IDX(vfn_2mb);

	map_1gb = map->map_256tb.map[idx_256tb];
	if (spdk_unlikely(!map_1gb)) {
		return map->default_translation;
	}

	cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
	map_2mb = &map_1gb->map[idx_1gb];
	if (size == NULL || map->ops.are_contiguous == NULL ||
	    map_2mb->translation_2mb == map->default_translation) {
		if (size != NULL) {
			*size = spdk_min(*size, cur_size);
		}
		return map_2mb->translation_2mb;
	}

	orig_translation = map_2mb->translation_2mb;
	prev_translation = orig_translation;
	while (cur_size < *size) {
		vfn_2mb++;
		idx_256tb = MAP_256TB_IDX(vfn_2mb);
		idx_1gb = MAP_1GB_IDX(vfn_2mb);

		map_1gb = map->map_256tb.map[idx_256tb];
		if (spdk_unlikely(!map_1gb)) {
			break;
		}

		map_2mb = &map_1gb->map[idx_1gb];
		if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
			break;
		}

		cur_size += VALUE_2MB;
		prev_translation = map_2mb->translation_2mb;
	}

	*size = spdk_min(*size, cur_size);
	return orig_translation;
}
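
/* Illustrative sketch (not part of the library): how a consumer typically uses
 * the spdk_mem_map API defined above. The callback, ops struct, translation
 * value, and vaddr here are hypothetical; only the spdk_mem_map_* calls and the
 * spdk_mem_map_ops fields are real.
 *
 *	static int
 *	my_notify(void *cb_ctx, struct spdk_mem_map *map,
 *		  enum spdk_mem_map_notify_action action, void *vaddr, size_t len)
 *	{
 *		// Invoked for 2MB-aligned regions as they are registered/unregistered.
 *		return 0;
 *	}
 *
 *	static const struct spdk_mem_map_ops my_ops = {
 *		.notify_cb = my_notify,
 *		.are_contiguous = NULL,
 *	};
 *
 *	struct spdk_mem_map *map = spdk_mem_map_alloc(UINT64_MAX, &my_ops, NULL);
 *	// Store a per-2MB-page translation, then look it up later:
 *	spdk_mem_map_set_translation(map, vaddr, VALUE_2MB, some_translation);
 *	uint64_t t = spdk_mem_map_translate(map, vaddr, NULL);
 *	spdk_mem_map_free(&map);
 */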

#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
static void
memory_hotplug_cb(enum rte_mem_event event_type,
		  const void *addr, size_t len, void *arg)
{
	if (event_type == RTE_MEM_EVENT_ALLOC) {
		spdk_mem_register((void *)addr, len);

#if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0)
		if (!spdk_env_dpdk_external_init()) {
			return;
		}
#endif

		/* Prior to DPDK 19.02, we have to worry about DPDK
		 * freeing memory in different units than it was allocated.
		 * That doesn't work with things like RDMA MRs. So for
		 * those versions of DPDK, mark each segment so that DPDK
		 * won't later free it. That ensures we don't have to deal
		 * with that scenario.
		 *
		 * DPDK 19.02 added the --match-allocations RTE flag to
		 * avoid this condition.
		 *
		 * Note: if the user initialized DPDK separately, we can't
		 * be sure that --match-allocations was specified, so need
		 * to still mark the segments so they aren't freed.
		 */
		while (len > 0) {
			struct rte_memseg *seg;

			seg = rte_mem_virt2memseg(addr, NULL);
			assert(seg != NULL);
			seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
			addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
			len -= seg->hugepage_sz;
		}
	} else if (event_type == RTE_MEM_EVENT_FREE) {
		spdk_mem_unregister((void *)addr, len);
	}
}

static int
memory_iter_cb(const struct rte_memseg_list *msl,
	       const struct rte_memseg *ms, size_t len, void *arg)
{
	return spdk_mem_register(ms->addr, len);
}
#endif

int
mem_map_init(bool legacy_mem)
{
	g_legacy_mem = legacy_mem;

	g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
	if (g_mem_reg_map == NULL) {
		DEBUG_PRINT("memory registration map allocation failed\n");
		return -ENOMEM;
	}

	/*
	 * Walk all DPDK memory segments and register them
	 * with the master memory map
	 */
#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
	rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
	rte_memseg_contig_walk(memory_iter_cb, NULL);
#else
	struct rte_mem_config *mcfg;
	size_t seg_idx;

	mcfg = rte_eal_get_configuration()->mem_config;
	for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) {
		struct rte_memseg *seg = &mcfg->memseg[seg_idx];

		if (seg->addr == NULL) {
			break;
		}

		spdk_mem_register(seg->addr, seg->len);
	}
#endif
	return 0;
}

bool
spdk_iommu_is_enabled(void)
{
#if VFIO_ENABLED
	return g_vfio.enabled && !g_vfio.noiommu_enabled;
#else
	return false;
#endif
}

struct spdk_vtophys_pci_device {
	struct rte_pci_device *pci_device;
	TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
};

static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
	TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);

static struct spdk_mem_map *g_vtophys_map;
static struct spdk_mem_map *g_phys_ref_map;

#if VFIO_ENABLED
static int
vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	uint64_t refcount;
	int ret;

	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
	assert(refcount < UINT64_MAX);
	if (refcount > 0) {
		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
		return 0;
	}

	dma_map = calloc(1, sizeof(*dma_map));
	if (dma_map == NULL) {
		return -ENOMEM;
	}

	dma_map->map.argsz = sizeof(dma_map->map);
	dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	dma_map->map.vaddr = vaddr;
	dma_map->map.iova = iova;
	dma_map->map.size = size;

	dma_map->unmap.argsz = sizeof(dma_map->unmap);
	dma_map->unmap.flags = 0;
	dma_map->unmap.iova = iova;
	dma_map->unmap.size = size;

	pthread_mutex_lock(&g_vfio.mutex);
	if (g_vfio.device_ref == 0) {
		/* VFIO requires at least one device (IOMMU group) to be added to
		 * a VFIO container before it is possible to perform any IOMMU
		 * operations on that container. This memory will be mapped once
		 * the first device (IOMMU group) is hotplugged.
		 *
		 * Since the vfio container is managed internally by DPDK, it is
		 * also possible that some device is already in that container, but
		 * it's not managed by SPDK - e.g. a NIC attached internally
		 * inside DPDK. We could map the memory straight away in such
		 * scenario, but there's no need to do it. DPDK devices clearly
		 * don't need our mappings and hence we defer the mapping
		 * unconditionally until the first SPDK-managed device is
		 * hotplugged.
		 */
		goto out_insert;
	}

	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
	if (ret) {
		DEBUG_PRINT("Cannot set up DMA mapping, error %d\n", errno);
		pthread_mutex_unlock(&g_vfio.mutex);
		free(dma_map);
		return ret;
	}

out_insert:
	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
	pthread_mutex_unlock(&g_vfio.mutex);
	spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
	return 0;
}

static int
vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	uint64_t refcount;
	int ret;

	pthread_mutex_lock(&g_vfio.mutex);
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		if (dma_map->map.iova == iova) {
			break;
		}
	}

	if (dma_map == NULL) {
		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
		pthread_mutex_unlock(&g_vfio.mutex);
		return -ENXIO;
	}

	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
	assert(refcount < UINT64_MAX);
	if (refcount > 0) {
		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount - 1);
	}

	/* We still have outstanding references, don't clear it.
	 */
	if (refcount > 1) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return 0;
	}

	/** don't support partial or multiple-page unmap for now */
	assert(dma_map->map.size == size);

	if (g_vfio.device_ref == 0) {
		/* Memory is not mapped anymore, just remove its references */
		goto out_remove;
	}

	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
	if (ret) {
		DEBUG_PRINT("Cannot clear DMA mapping, error %d\n", errno);
		pthread_mutex_unlock(&g_vfio.mutex);
		return ret;
	}

out_remove:
	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
	pthread_mutex_unlock(&g_vfio.mutex);
	free(dma_map);
	return 0;
}
#endif

static uint64_t
vtophys_get_paddr_memseg(uint64_t vaddr)
{
	uintptr_t paddr;
	struct rte_memseg *seg;

#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
	seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
	if (seg != NULL) {
		paddr = seg->phys_addr;
		if (paddr == RTE_BAD_IOVA) {
			return SPDK_VTOPHYS_ERROR;
		}
		paddr += (vaddr - (uintptr_t)seg->addr);
		return paddr;
	}
#else
	struct rte_mem_config *mcfg;
	uint32_t seg_idx;

	mcfg = rte_eal_get_configuration()->mem_config;
	for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) {
		seg = &mcfg->memseg[seg_idx];
		if (seg->addr == NULL) {
			break;
		}

		if (vaddr >= (uintptr_t)seg->addr &&
		    vaddr < ((uintptr_t)seg->addr + seg->len)) {
			paddr = seg->phys_addr;
			if (paddr == RTE_BAD_IOVA) {
				return SPDK_VTOPHYS_ERROR;
			}
			paddr += (vaddr - (uintptr_t)seg->addr);
			return paddr;
		}
	}
#endif

	return SPDK_VTOPHYS_ERROR;
}

/* Try to get the paddr from /proc/self/pagemap */
static uint64_t
vtophys_get_paddr_pagemap(uint64_t vaddr)
{
	uintptr_t paddr;

	/* Silence static analyzers */
	assert(vaddr != 0);
	paddr = rte_mem_virt2iova((void *)vaddr);
	if (paddr == RTE_BAD_IOVA) {
		/*
		 * The vaddr may be valid but doesn't have a backing page
		 * assigned yet. Touch the page to ensure a backing page
		 * gets assigned, then try to translate again.
		 */
		rte_atomic64_read((rte_atomic64_t *)vaddr);
		paddr = rte_mem_virt2iova((void *)vaddr);
	}
	if (paddr == RTE_BAD_IOVA) {
		/* Unable to get to the physical address.
		 */
		return SPDK_VTOPHYS_ERROR;
	}

	return paddr;
}

/* Try to get the paddr from pci devices */
static uint64_t
vtophys_get_paddr_pci(uint64_t vaddr)
{
	struct spdk_vtophys_pci_device *vtophys_dev;
	uintptr_t paddr;
	struct rte_pci_device *dev;
	struct rte_mem_resource *res;
	unsigned r;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		dev = vtophys_dev->pci_device;

		for (r = 0; r < PCI_MAX_RESOURCE; r++) {
			res = &dev->mem_resource[r];
			if (res->phys_addr && vaddr >= (uint64_t)res->addr &&
			    vaddr < (uint64_t)res->addr + res->len) {
				paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
				DEBUG_PRINT("%s: %p -> %p\n", __func__, (void *)vaddr,
					    (void *)paddr);
				pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
				return paddr;
			}
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

	return SPDK_VTOPHYS_ERROR;
}

static int
vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
	       enum spdk_mem_map_notify_action action,
	       void *vaddr, size_t len)
{
	int rc = 0, pci_phys = 0;
	uint64_t paddr;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid parameters, vaddr=%p len=%ju\n",
			    vaddr, len);
		return -EINVAL;
	}

	/* Get the physical address from the DPDK memsegs */
	paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);

	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		if (paddr == SPDK_VTOPHYS_ERROR) {
			/* This is not an address that DPDK is managing. */
#if VFIO_ENABLED
			enum rte_iova_mode iova_mode;

#if RTE_VERSION >= RTE_VERSION_NUM(19, 11, 0, 0)
			iova_mode = rte_eal_iova_mode();
#else
			iova_mode = rte_eal_get_configuration()->iova_mode;
#endif

			if (spdk_iommu_is_enabled() && iova_mode == RTE_IOVA_VA) {
				/* We'll use the virtual address as the iova to match DPDK. */
				paddr = (uint64_t)vaddr;
				rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len);
				if (rc) {
					return -EFAULT;
				}
				while (len > 0) {
					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}
					vaddr += VALUE_2MB;
					paddr += VALUE_2MB;
					len -= VALUE_2MB;
				}
			} else
#endif
			{
				/* Get the physical address from /proc/self/pagemap. */
				paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
				if (paddr == SPDK_VTOPHYS_ERROR) {
					/* Get the physical address from PCI devices */
					paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}
					/* The beginning of this address range points to a PCI resource,
					 * so the rest must point to a PCI resource as well.
					 */
					pci_phys = 1;
				}

				/* Get paddr for each 2MB chunk in this address range */
				while (len > 0) {
					/* Get the physical address from /proc/self/pagemap.
					 */
					if (pci_phys) {
						paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
					} else {
						paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
					}

					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}

					/* Since a PCI paddr can break the 2MiB physical alignment,
					 * skip this check in that case. */
					if (!pci_phys && (paddr & MASK_2MB)) {
						DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
						return -EINVAL;
					}
#if VFIO_ENABLED
					/* If the IOMMU is on, but DPDK is using iova-mode=pa, we want to register this memory
					 * with the IOMMU using the physical address to match. */
					if (spdk_iommu_is_enabled()) {
						rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
						if (rc) {
							DEBUG_PRINT("Unable to assign vaddr %p to paddr 0x%" PRIx64 "\n", vaddr, paddr);
							return -EFAULT;
						}
					}
#endif

					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}

					vaddr += VALUE_2MB;
					len -= VALUE_2MB;
				}
			}
		} else {
			/* This is an address managed by DPDK. Just setup the translations. */
			while (len > 0) {
				paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
				if (paddr == SPDK_VTOPHYS_ERROR) {
					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
					return -EFAULT;
				}

				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
				if (rc != 0) {
					return rc;
				}

				vaddr += VALUE_2MB;
				len -= VALUE_2MB;
			}
		}

		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
#if VFIO_ENABLED
		if (paddr == SPDK_VTOPHYS_ERROR) {
			/*
			 * This is not an address that DPDK is managing. If vfio is enabled,
			 * we need to unmap the range from the IOMMU
			 */
			if (spdk_iommu_is_enabled()) {
				uint64_t buffer_len = len;
				uint8_t *va = vaddr;
				enum rte_iova_mode iova_mode;

#if RTE_VERSION >= RTE_VERSION_NUM(19, 11, 0, 0)
				iova_mode = rte_eal_iova_mode();
#else
				iova_mode = rte_eal_get_configuration()->iova_mode;
#endif
				/*
				 * In virtual address mode, the region is contiguous and can be done in
				 * one unmap.
				 */
				if (iova_mode == RTE_IOVA_VA) {
					paddr = spdk_mem_map_translate(map, (uint64_t)va, &buffer_len);
					if (buffer_len != len || paddr != (uintptr_t)va) {
						DEBUG_PRINT("Unmapping %p with length %lu failed because "
							    "translation had address 0x%" PRIx64 " and length %lu\n",
							    va, len, paddr, buffer_len);
						return -EINVAL;
					}
					rc = vtophys_iommu_unmap_dma(paddr, len);
					if (rc) {
						DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
						return -EFAULT;
					}
				} else if (iova_mode == RTE_IOVA_PA) {
					/* Get paddr for each 2MB chunk in this address range */
					while (buffer_len > 0) {
						paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);

						if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
							DEBUG_PRINT("could not get phys addr for %p\n", va);
							return -EFAULT;
						}

						rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
						if (rc) {
							DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
							return -EFAULT;
						}

						va += VALUE_2MB;
						buffer_len -= VALUE_2MB;
					}
				}
			}
		}
#endif
		while (len > 0) {
			rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
			if (rc != 0) {
				return rc;
			}

			vaddr += VALUE_2MB;
			len -= VALUE_2MB;
		}

		break;
	default:
		SPDK_UNREACHABLE();
	}

	return rc;
}

static int
vtophys_check_contiguous_entries(uint64_t paddr1, uint64_t paddr2)
{
	/* This function is always called with paddrs for two subsequent
	 * 2MB chunks in virtual address space, so those chunks will be only
	 * physically contiguous if the physical addresses are 2MB apart
	 * from each other as well.
	 */
	return (paddr2 - paddr1 == VALUE_2MB);
}

#if VFIO_ENABLED

static bool
vfio_enabled(void)
{
	return rte_vfio_is_enabled("vfio_pci");
}

/* Check if IOMMU is enabled on the system */
static bool
has_iommu_groups(void)
{
	struct dirent *d;
	int count = 0;
	DIR *dir = opendir("/sys/kernel/iommu_groups");

	if (dir == NULL) {
		return false;
	}

	while (count < 3 && (d = readdir(dir)) != NULL) {
		count++;
	}

	closedir(dir);
	/* there will always be ./ and ../ entries */
	return count > 2;
}

static bool
vfio_noiommu_enabled(void)
{
	return rte_vfio_noiommu_is_enabled();
}

static void
vtophys_iommu_init(void)
{
	char proc_fd_path[PATH_MAX + 1];
	char link_path[PATH_MAX + 1];
	const char vfio_path[] = "/dev/vfio/vfio";
	DIR *dir;
	struct dirent *d;

	if (!vfio_enabled()) {
		return;
	}

	if (vfio_noiommu_enabled()) {
		g_vfio.noiommu_enabled = true;
	} else if (!has_iommu_groups()) {
		return;
	}

	dir = opendir("/proc/self/fd");
	if (!dir) {
		DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
		return;
	}

	while ((d = readdir(dir)) != NULL) {
		if (d->d_type != DT_LNK) {
			continue;
		}

		snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
		if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
			continue;
		}

		if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
			sscanf(d->d_name, "%d", &g_vfio.fd);
			break;
		}
	}

	closedir(dir);

	if (g_vfio.fd < 0) {
		DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
		return;
	}

	g_vfio.enabled = true;

	return;
}
#endif

void
vtophys_pci_device_added(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);

	vtophys_dev = calloc(1, sizeof(*vtophys_dev));
	if (vtophys_dev) {
		vtophys_dev->pci_device = pci_device;
		TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
	} else {
		DEBUG_PRINT("Memory allocation error\n");
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	g_vfio.device_ref++;
	if (g_vfio.device_ref > 1) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the first SPDK device using DPDK vfio. This means that the first
	 * IOMMU group might have just been added to the DPDK vfio container.
	 * From this point it is certain that the memory can be mapped now.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
		if (ret) {
			DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}

void
vtophys_pci_device_removed(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		if (vtophys_dev->pci_device == pci_device) {
			TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
			free(vtophys_dev);
			break;
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	assert(g_vfio.device_ref > 0);
	g_vfio.device_ref--;
	if (g_vfio.device_ref > 0) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
	 * any additional devices using its vfio container, all the mappings
	 * will be automatically removed by the Linux vfio driver. We unmap
	 * the memory manually to be able to easily re-map it later regardless
	 * of other, external factors.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
		if (ret) {
			DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}

int
vtophys_init(void)
{
	const struct spdk_mem_map_ops vtophys_map_ops = {
		.notify_cb = vtophys_notify,
		.are_contiguous = vtophys_check_contiguous_entries,
	};

	const struct spdk_mem_map_ops phys_ref_map_ops = {
		.notify_cb = NULL,
		.are_contiguous = NULL,
	};

#if VFIO_ENABLED
	vtophys_iommu_init();
#endif

	g_phys_ref_map = spdk_mem_map_alloc(0, &phys_ref_map_ops, NULL);
	if (g_phys_ref_map == NULL) {
		DEBUG_PRINT("phys_ref map allocation failed.\n");
		return -ENOMEM;
	}

	g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
	if (g_vtophys_map == NULL) {
		DEBUG_PRINT("vtophys map allocation failed\n");
		return -ENOMEM;
	}
	return 0;
}

uint64_t
spdk_vtophys(void *buf, uint64_t *size)
{
	uint64_t vaddr, paddr_2mb;

	vaddr = (uint64_t)buf;
	paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);

	/*
	 * SPDK_VTOPHYS_ERROR has all bits set, so if the lookup returned SPDK_VTOPHYS_ERROR,
	 * we will still bitwise-or it with the buf offset below, but the result will still be
	 * SPDK_VTOPHYS_ERROR. However now that we do + rather than | (due to PCI vtophys being
	 * unaligned) we must now check the return value before addition.
	 */
	SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
	if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
		return SPDK_VTOPHYS_ERROR;
	} else {
		return paddr_2mb + (vaddr & MASK_2MB);
	}
}
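
/* Illustrative sketch (not part of the library): how registration and translation
 * tie together for a caller. The buffer and its origin are hypothetical; the
 * spdk_mem_register, spdk_vtophys, and spdk_mem_unregister calls are the public
 * entry points defined above. Registering the range triggers vtophys_notify,
 * which populates g_vtophys_map so the later lookup succeeds.
 *
 *	void *buf = ...;                // 2MB-aligned, length a multiple of 2MB
 *	if (spdk_mem_register(buf, VALUE_2MB) == 0) {
 *		uint64_t len = VALUE_2MB;
 *		uint64_t phys = spdk_vtophys(buf, &len);   // SPDK_VTOPHYS_ERROR on failure
 *		...
 *		spdk_mem_unregister(buf, VALUE_2MB);
 *	}
 */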