/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "env_internal.h"

#include <rte_config.h>
#include <rte_memory.h>
#include <rte_eal_memconfig.h>

#include "spdk_internal/assert.h"

#include "spdk/assert.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/util.h"
#include "spdk/memory.h"
#include "spdk/env_dpdk.h"
#include "spdk/log.h"

#ifndef __linux__
#define VFIO_ENABLED 0
#else
#include <linux/version.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
#define VFIO_ENABLED 1
#include <linux/vfio.h>
#include <rte_vfio.h>

struct spdk_vfio_dma_map {
	struct vfio_iommu_type1_dma_map map;
	TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
};

struct vfio_cfg {
	int fd;
	bool enabled;
	bool noiommu_enabled;
	unsigned device_ref;
	TAILQ_HEAD(, spdk_vfio_dma_map) maps;
	pthread_mutex_t mutex;
};

static struct vfio_cfg g_vfio = {
	.fd = -1,
	.enabled = false,
	.noiommu_enabled = false,
	.device_ref = 0,
	.maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
	.mutex = PTHREAD_MUTEX_INITIALIZER
};

#else
#define VFIO_ENABLED 0
#endif
#endif

#if DEBUG
#define DEBUG_PRINT(...) SPDK_ERRLOG(__VA_ARGS__)
#else
#define DEBUG_PRINT(...)
#endif

#define FN_2MB_TO_4KB(fn)	(fn << (SHIFT_2MB - SHIFT_4KB))
#define FN_4KB_TO_2MB(fn)	(fn >> (SHIFT_2MB - SHIFT_4KB))

#define MAP_256TB_IDX(vfn_2mb)	((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
#define MAP_1GB_IDX(vfn_2mb)	((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))
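/* Illustrative sketch only (not part of the translation path): for some
 * hypothetical virtual address, the 2MB virtual frame number is the address
 * shifted by SHIFT_2MB, MAP_256TB_IDX() then selects the first-level slot
 * from bits [30..47] of the address, and MAP_1GB_IDX() selects the 2MB entry
 * within that 1GB region from bits [21..29]:
 *
 *	uint64_t vaddr = 0x7f8012345678;			// hypothetical address
 *	uint64_t vfn_2mb = vaddr >> SHIFT_2MB;
 *	size_t idx_256tb = MAP_256TB_IDX(vfn_2mb);	// index into map_256tb.map[]
 *	uint64_t idx_1gb = MAP_1GB_IDX(vfn_2mb);	// index into map_1gb->map[]
 */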
/* Page is registered */
#define REG_MAP_REGISTERED	(1ULL << 62)

/* A notification region barrier. The 2MB translation entry that's marked
 * with this flag must be unregistered separately. This allows contiguous
 * regions to be unregistered in the same chunks they were registered.
 */
#define REG_MAP_NOTIFY_START	(1ULL << 63)

/* Translation of a single 2MB page. */
struct map_2mb {
	uint64_t translation_2mb;
};

/* Second-level map table indexed by bits [21..29] of the virtual address.
 * Each entry holds the address translation, or the map's default translation
 * for entries that haven't been set yet.
 */
struct map_1gb {
	struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
};

/* Top-level map table indexed by bits [30..47] of the virtual address.
 * Each entry points to a second-level map table or NULL.
 */
struct map_256tb {
	struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
};

/* Page-granularity memory address translation */
struct spdk_mem_map {
	struct map_256tb map_256tb;
	pthread_mutex_t mutex;
	uint64_t default_translation;
	struct spdk_mem_map_ops ops;
	void *cb_ctx;
	TAILQ_ENTRY(spdk_mem_map) tailq;
};

/* Registrations map. The 64 bit translations are bit fields with the
 * following layout (starting with the low bits):
 *    0 - 61 : reserved
 *   62 - 63 : flags
 */
static struct spdk_mem_map *g_mem_reg_map;
static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps =
	TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;

static bool g_legacy_mem;
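/* For example (illustrative values only): after spdk_mem_register() of a 4MB
 * region, the registration map translation for the first 2MB page is
 * (REG_MAP_REGISTERED | REG_MAP_NOTIFY_START) and for the second 2MB page it
 * is just REG_MAP_REGISTERED, which is how spdk_mem_unregister() later
 * recognizes the boundaries of the originally registered chunk.
 */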
/*
 * Walk the currently registered memory via the main memory registration map
 * and call the new map's notify callback for each virtually contiguous region.
 */
static int
mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
{
	size_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t contig_start = UINT64_MAX;
	uint64_t contig_end = UINT64_MAX;
	struct map_1gb *map_1gb;
	int rc;

	if (!g_mem_reg_map) {
		return -EINVAL;
	}

	/* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
	pthread_mutex_lock(&g_mem_reg_map->mutex);

	for (idx_256tb = 0;
	     idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
	     idx_256tb++) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_start != UINT64_MAX) {
				/* End of a virtually contiguous range */
				rc = map->ops.notify_cb(map->cb_ctx, map, action,
							(void *)contig_start,
							contig_end - contig_start + VALUE_2MB);
				/* Don't bother handling unregister failures. It can't be any worse */
				if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
					goto err_unregister;
				}
			}
			contig_start = UINT64_MAX;
			continue;
		}

		for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_start == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_start == UINT64_MAX) {
					contig_start = vaddr;
				}

				contig_end = vaddr;
			} else {
				if (contig_start != UINT64_MAX) {
					/* End of a virtually contiguous range */
					rc = map->ops.notify_cb(map->cb_ctx, map, action,
								(void *)contig_start,
								contig_end - contig_start + VALUE_2MB);
					/* Don't bother handling unregister failures. It can't be any worse */
					if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
						goto err_unregister;
					}

					/* This page might be a part of a neighbour region, so process
					 * it again. The idx_1gb will be incremented immediately.
					 */
					idx_1gb--;
				}
				contig_start = UINT64_MAX;
			}
		}
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return 0;

err_unregister:
	/* Unwind to the first empty translation so we don't unregister
	 * a region that just failed to register.
	 */
	idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
	idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
	contig_start = UINT64_MAX;
	contig_end = UINT64_MAX;

	/* Unregister any memory we managed to register before the failure */
	for (; idx_256tb < SIZE_MAX; idx_256tb--) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_end != UINT64_MAX) {
				/* End of a virtually contiguous range */
				map->ops.notify_cb(map->cb_ctx, map,
						   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
						   (void *)contig_start,
						   contig_end - contig_start + VALUE_2MB);
			}
			contig_end = UINT64_MAX;
			continue;
		}

		for (; idx_1gb < UINT64_MAX; idx_1gb--) {
			/* Rebuild the virtual address from the indexes */
			uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_end == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {

				if (contig_end == UINT64_MAX) {
					contig_end = vaddr;
				}
				contig_start = vaddr;
			} else {
				if (contig_end != UINT64_MAX) {
					if (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) {
						contig_start = vaddr;
					}
					/* End of a virtually contiguous range */
					map->ops.notify_cb(map->cb_ctx, map,
							   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
							   (void *)contig_start,
							   contig_end - contig_start + VALUE_2MB);
				}
				contig_end = UINT64_MAX;
			}
		}
		idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return rc;
}

struct spdk_mem_map *
spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
{
	struct spdk_mem_map *map;
	int rc;
	size_t i;

	map = calloc(1, sizeof(*map));
	if (map == NULL) {
		return NULL;
	}

	if (pthread_mutex_init(&map->mutex, NULL)) {
		free(map);
		return NULL;
	}

	map->default_translation = default_translation;
	map->cb_ctx = cb_ctx;
	if (ops) {
		map->ops = *ops;
	}

	if (ops && ops->notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		rc = mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			DEBUG_PRINT("Initial mem_map notify failed\n");
			pthread_mutex_destroy(&map->mutex);
			for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
				free(map->map_256tb.map[i]);
			}
			free(map);
			return NULL;
		}
		TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	return map;
}

void
spdk_mem_map_free(struct spdk_mem_map **pmap)
{
	struct spdk_mem_map *map;
	size_t i;

	if (!pmap) {
		return;
	}

	map = *pmap;

	if (!map) {
		return;
	}

	if (map->ops.notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
		TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
		free(map->map_256tb.map[i]);
	}

	pthread_mutex_destroy(&map->mutex);

	free(map);
	*pmap = NULL;
}
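/* A minimal usage sketch (illustrative only, my_notify/my_ops/my_map are
 * hypothetical names): a consumer such as a DMA-capable driver typically
 * creates its own map with a notify callback and keeps per-2MB translations
 * in it:
 *
 *	static int
 *	my_notify(void *cb_ctx, struct spdk_mem_map *map,
 *		  enum spdk_mem_map_notify_action action, void *vaddr, size_t size)
 *	{
 *		// SPDK_MEM_MAP_NOTIFY_REGISTER: set up translations for [vaddr, vaddr + size)
 *		// SPDK_MEM_MAP_NOTIFY_UNREGISTER: tear them down again
 *		return 0;
 *	}
 *
 *	const struct spdk_mem_map_ops my_ops = { .notify_cb = my_notify, .are_contiguous = NULL };
 *	struct spdk_mem_map *my_map = spdk_mem_map_alloc(0, &my_ops, NULL);
 *	...
 *	spdk_mem_map_free(&my_map);
 *
 * spdk_mem_map_alloc() replays all existing registrations into the new map,
 * and spdk_mem_map_free() walks them once more with UNREGISTER before freeing.
 */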
int
spdk_mem_register(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	seg_vaddr = vaddr;
	seg_len = 0;
	while (len > 0) {
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
					     seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
		seg_len += VALUE_2MB;
		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return rc;
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

int
spdk_mem_unregister(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg, newreg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* The first page must be a start of a region. Also check if it's
	 * registered to make sure we don't return -ERANGE for non-registered
	 * regions.
	 */
	reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
	if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if ((reg & REG_MAP_REGISTERED) == 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EINVAL;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
	/* If the next page is registered, it must be a start of a region as well,
	 * otherwise we'd be unregistering only a part of a region.
	 */
	if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}
	seg_vaddr = vaddr;
	seg_len = 0;

	while (len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);

		if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
			TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
				rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
				if (rc != 0) {
					pthread_mutex_unlock(&g_spdk_mem_map_mutex);
					return rc;
				}
			}

			seg_vaddr = vaddr;
			seg_len = VALUE_2MB;
		} else {
			seg_len += VALUE_2MB;
		}

		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	if (seg_len > 0) {
		TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
			rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
			if (rc != 0) {
				pthread_mutex_unlock(&g_spdk_mem_map_mutex);
				return rc;
			}
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}
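/* A minimal usage sketch (illustrative only): memory obtained outside of the
 * DPDK allocator must be registered before it can be used for I/O, and the
 * region must be 2MB aligned in both address and length:
 *
 *	size_t len = 4 * VALUE_2MB;
 *	void *buf = NULL;
 *
 *	if (posix_memalign(&buf, VALUE_2MB, len) == 0 &&
 *	    spdk_mem_register(buf, len) == 0) {
 *		// ... use buf for I/O ...
 *		spdk_mem_unregister(buf, len);
 *	}
 *	free(buf);
 *
 * Unregistration must use the same chunks that were registered; unregistering
 * only part of a previously registered region returns -ERANGE.
 */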
int
spdk_mem_reserve(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* Check if any part of this range is already registered */
	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	/* Simply set the translation to the memory map's default. This allocates the space in the
	 * map but does not provide a valid translation. */
	spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, len,
				     g_mem_reg_map->default_translation);

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, map->default_translation);
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}
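/* Illustrative note: spdk_mem_reserve() is useful when a range of virtual
 * address space is known ahead of time but not yet registered. Setting the
 * default translation pre-allocates the second-level map tables in every map,
 * so later per-2MB spdk_mem_map_set_translation() calls for that range don't
 * need to allocate, e.g. (sketch only, `base` is a hypothetical pointer):
 *
 *	spdk_mem_reserve(base, 64 * VALUE_2MB);
 */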
static struct map_1gb *
mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
{
	struct map_1gb *map_1gb;
	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
	size_t i;

	if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
		return NULL;
	}

	map_1gb = map->map_256tb.map[idx_256tb];

	if (!map_1gb) {
		pthread_mutex_lock(&map->mutex);

		/* Recheck to make sure nobody else got the mutex first. */
		map_1gb = map->map_256tb.map[idx_256tb];
		if (!map_1gb) {
			map_1gb = malloc(sizeof(struct map_1gb));
			if (map_1gb) {
				/* initialize all entries to default translation */
				for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
					map_1gb->map[i].translation_2mb = map->default_translation;
				}
				map->map_256tb.map[idx_256tb] = map_1gb;
			}
		}

		pthread_mutex_unlock(&map->mutex);

		if (!map_1gb) {
			DEBUG_PRINT("allocation failed\n");
			return NULL;
		}
	}

	return map_1gb;
}

int
spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
			     uint64_t translation)
{
	uint64_t vfn_2mb;
	struct map_1gb *map_1gb;
	uint64_t idx_1gb;
	struct map_2mb *map_2mb;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %" PRIu64 "\n", vaddr);
		return -EINVAL;
	}

	/* For now, only 2 MB-aligned registrations are supported */
	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%" PRIu64 " len=%" PRIu64 "\n",
			    __func__, vaddr, size);
		return -EINVAL;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;

	while (size) {
		map_1gb = mem_map_get_map_1gb(map, vfn_2mb);
		if (!map_1gb) {
			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
			return -ENOMEM;
		}

		idx_1gb = MAP_1GB_IDX(vfn_2mb);
		map_2mb = &map_1gb->map[idx_1gb];
		map_2mb->translation_2mb = translation;

		size -= VALUE_2MB;
		vfn_2mb++;
	}

	return 0;
}

int
spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
{
	return spdk_mem_map_set_translation(map, vaddr, size, map->default_translation);
}

inline uint64_t
spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
{
	const struct map_1gb *map_1gb;
	const struct map_2mb *map_2mb;
	uint64_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t vfn_2mb;
	uint64_t cur_size;
	uint64_t prev_translation;
	uint64_t orig_translation;

	if (spdk_unlikely(vaddr & ~MASK_256TB)) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
		return map->default_translation;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;
	idx_256tb = MAP_256TB_IDX(vfn_2mb);
	idx_1gb = MAP_1GB_IDX(vfn_2mb);

	map_1gb = map->map_256tb.map[idx_256tb];
	if (spdk_unlikely(!map_1gb)) {
		return map->default_translation;
	}

	cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
	map_2mb = &map_1gb->map[idx_1gb];
	if (size == NULL || map->ops.are_contiguous == NULL ||
	    map_2mb->translation_2mb == map->default_translation) {
		if (size != NULL) {
			*size = spdk_min(*size, cur_size);
		}
		return map_2mb->translation_2mb;
	}

	orig_translation = map_2mb->translation_2mb;
	prev_translation = orig_translation;
	while (cur_size < *size) {
		vfn_2mb++;
		idx_256tb = MAP_256TB_IDX(vfn_2mb);
		idx_1gb = MAP_1GB_IDX(vfn_2mb);

		map_1gb = map->map_256tb.map[idx_256tb];
		if (spdk_unlikely(!map_1gb)) {
			break;
		}

		map_2mb = &map_1gb->map[idx_1gb];
		if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
			break;
		}

		cur_size += VALUE_2MB;
		prev_translation = map_2mb->translation_2mb;
	}

	*size = spdk_min(*size, cur_size);
	return orig_translation;
}
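/* A minimal usage sketch (illustrative only, my_map/my_default_translation/
 * io_len/buf are hypothetical names): the caller passes in the size it would
 * like to translate and gets back how much of it is covered by one contiguous
 * translation:
 *
 *	uint64_t size = io_len;
 *	uint64_t translation = spdk_mem_map_translate(my_map, (uint64_t)buf, &size);
 *
 *	if (translation == my_default_translation) {
 *		// no translation installed for buf
 *	} else if (size < io_len) {
 *		// only the first `size` bytes are contiguous; split the operation
 *	}
 *
 * The contiguity extension beyond the first 2MB page is only attempted when
 * the map was allocated with an are_contiguous() callback.
 */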
static void
memory_hotplug_cb(enum rte_mem_event event_type,
		  const void *addr, size_t len, void *arg)
{
	if (event_type == RTE_MEM_EVENT_ALLOC) {
		spdk_mem_register((void *)addr, len);

		if (!spdk_env_dpdk_external_init()) {
			return;
		}

		/* When the user initialized DPDK separately, we can't
		 * be sure that the --match-allocations RTE flag was specified.
		 * Without this flag, DPDK can free memory in different units
		 * than it was allocated. It doesn't work with things like RDMA MRs.
		 *
		 * For such cases, we mark segments so they aren't freed.
		 */
		while (len > 0) {
			struct rte_memseg *seg;

			seg = rte_mem_virt2memseg(addr, NULL);
			assert(seg != NULL);
			seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
			addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
			len -= seg->hugepage_sz;
		}
	} else if (event_type == RTE_MEM_EVENT_FREE) {
		spdk_mem_unregister((void *)addr, len);
	}
}
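/* Illustrative note: when SPDK initializes DPDK itself, it can rely on the
 * --match-allocations EAL behavior, so hugepage memory is freed back in the
 * same units it was allocated in and the DO_NOT_FREE workaround above isn't
 * needed. An externally initialized DPDK application could achieve the same
 * by adding that flag to its own EAL arguments, e.g. (sketch only, argument
 * list abbreviated):
 *
 *	char *eal_argv[] = { "app", "--match-allocations", ... };
 *	rte_eal_init(SPDK_COUNTOF(eal_argv), eal_argv);
 */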
static int
memory_iter_cb(const struct rte_memseg_list *msl,
	       const struct rte_memseg *ms, size_t len, void *arg)
{
	return spdk_mem_register(ms->addr, len);
}

int
mem_map_init(bool legacy_mem)
{
	g_legacy_mem = legacy_mem;

	g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
	if (g_mem_reg_map == NULL) {
		DEBUG_PRINT("memory registration map allocation failed\n");
		return -ENOMEM;
	}

	/*
	 * Walk all DPDK memory segments and register them
	 * with the main memory map
	 */
	rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
	rte_memseg_contig_walk(memory_iter_cb, NULL);
	return 0;
}

bool
spdk_iommu_is_enabled(void)
{
#if VFIO_ENABLED
	return g_vfio.enabled && !g_vfio.noiommu_enabled;
#else
	return false;
#endif
}

struct spdk_vtophys_pci_device {
	struct rte_pci_device *pci_device;
	TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
};

static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
	TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);

static struct spdk_mem_map *g_vtophys_map;
static struct spdk_mem_map *g_phys_ref_map;

#if VFIO_ENABLED
static int
vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	uint64_t refcount;
	int ret;

	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
	assert(refcount < UINT64_MAX);
	if (refcount > 0) {
		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
		return 0;
	}

	dma_map = calloc(1, sizeof(*dma_map));
	if (dma_map == NULL) {
		return -ENOMEM;
	}

	dma_map->map.argsz = sizeof(dma_map->map);
	dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	dma_map->map.vaddr = vaddr;
	dma_map->map.iova = iova;
	dma_map->map.size = size;

	pthread_mutex_lock(&g_vfio.mutex);
	if (g_vfio.device_ref == 0) {
		/* VFIO requires at least one device (IOMMU group) to be added to
		 * a VFIO container before it is possible to perform any IOMMU
		 * operations on that container. This memory will be mapped once
		 * the first device (IOMMU group) is hotplugged.
		 *
		 * Since the vfio container is managed internally by DPDK, it is
		 * also possible that some device is already in that container, but
		 * it's not managed by SPDK - e.g. a NIC attached internally
		 * inside DPDK. We could map the memory straight away in such a
		 * scenario, but there's no need to do it. DPDK devices clearly
		 * don't need our mappings and hence we defer the mapping
		 * unconditionally until the first SPDK-managed device is
		 * hotplugged.
		 */
		goto out_insert;
	}

	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
	if (ret) {
		/* In some cases the vfio container doesn't have an IOMMU group;
		 * it's safe to ignore the error then.
		 */
		SPDK_NOTICELOG("Cannot set up DMA mapping, error %d, ignored\n", errno);
	}

out_insert:
	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
	pthread_mutex_unlock(&g_vfio.mutex);
	spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
	return 0;
}

static int
vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	uint64_t refcount;
	int ret;
	struct vfio_iommu_type1_dma_unmap unmap = {};

	pthread_mutex_lock(&g_vfio.mutex);
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		if (dma_map->map.iova == iova) {
			break;
		}
	}

	if (dma_map == NULL) {
		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
		pthread_mutex_unlock(&g_vfio.mutex);
		return -ENXIO;
	}

	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
	assert(refcount < UINT64_MAX);
	if (refcount > 0) {
		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount - 1);
	}

	/* We still have outstanding references, don't clear it. */
	if (refcount > 1) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return 0;
	}

	/* don't support partial or multiple-page unmap for now */
	assert(dma_map->map.size == size);

	if (g_vfio.device_ref == 0) {
		/* Memory is not mapped anymore, just remove its references */
		goto out_remove;
	}

	unmap.argsz = sizeof(unmap);
	unmap.flags = 0;
	unmap.iova = dma_map->map.iova;
	unmap.size = dma_map->map.size;
	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
	if (ret) {
		SPDK_NOTICELOG("Cannot clear DMA mapping, error %d, ignored\n", errno);
	}

out_remove:
	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
	pthread_mutex_unlock(&g_vfio.mutex);
	free(dma_map);
	return 0;
}
#endif
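/* Illustrative note on the reference counting above: g_phys_ref_map stores a
 * per-IOVA use count, so mapping the same IOVA twice issues only one
 * VFIO_IOMMU_MAP_DMA ioctl and the mapping is torn down only when the count
 * drops back to zero, e.g. (sketch only):
 *
 *	vtophys_iommu_map_dma(va, iova, size);    // refcount 0 -> 1, ioctl issued
 *	vtophys_iommu_map_dma(va, iova, size);    // refcount 1 -> 2, no ioctl
 *	vtophys_iommu_unmap_dma(iova, size);      // refcount 2 -> 1, mapping kept
 *	vtophys_iommu_unmap_dma(iova, size);      // refcount 1 -> 0, ioctl issued
 */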
static uint64_t
vtophys_get_paddr_memseg(uint64_t vaddr)
{
	uintptr_t paddr;
	struct rte_memseg *seg;

	seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
	if (seg != NULL) {
		paddr = seg->iova;
		if (paddr == RTE_BAD_IOVA) {
			return SPDK_VTOPHYS_ERROR;
		}
		paddr += (vaddr - (uintptr_t)seg->addr);
		return paddr;
	}

	return SPDK_VTOPHYS_ERROR;
}

/* Try to get the paddr from /proc/self/pagemap */
static uint64_t
vtophys_get_paddr_pagemap(uint64_t vaddr)
{
	uintptr_t paddr;

	/* Silence static analyzers */
	assert(vaddr != 0);
	paddr = rte_mem_virt2iova((void *)vaddr);
	if (paddr == RTE_BAD_IOVA) {
		/*
		 * The vaddr may be valid but doesn't have a backing page
		 * assigned yet. Touch the page to ensure a backing page
		 * gets assigned, then try to translate again.
		 */
		rte_atomic64_read((rte_atomic64_t *)vaddr);
		paddr = rte_mem_virt2iova((void *)vaddr);
	}
	if (paddr == RTE_BAD_IOVA) {
		/* Unable to get to the physical address. */
		return SPDK_VTOPHYS_ERROR;
	}

	return paddr;
}

/* Try to get the paddr from pci devices */
static uint64_t
vtophys_get_paddr_pci(uint64_t vaddr)
{
	struct spdk_vtophys_pci_device *vtophys_dev;
	uintptr_t paddr;
	struct rte_pci_device *dev;
	struct rte_mem_resource *res;
	unsigned r;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		dev = vtophys_dev->pci_device;

		for (r = 0; r < PCI_MAX_RESOURCE; r++) {
			res = &dev->mem_resource[r];
			if (res->phys_addr && vaddr >= (uint64_t)res->addr &&
			    vaddr < (uint64_t)res->addr + res->len) {
				paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
				DEBUG_PRINT("%s: %p -> %p\n", __func__, (void *)vaddr,
					    (void *)paddr);
				pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
				return paddr;
			}
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

	return SPDK_VTOPHYS_ERROR;
}

static int
vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
	       enum spdk_mem_map_notify_action action,
	       void *vaddr, size_t len)
{
	int rc = 0, pci_phys = 0;
	uint64_t paddr;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid parameters, vaddr=%p len=%ju\n",
			    vaddr, len);
		return -EINVAL;
	}

	/* Get the physical address from the DPDK memsegs */
	paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);

	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		if (paddr == SPDK_VTOPHYS_ERROR) {
			/* This is not an address that DPDK is managing. */
#if VFIO_ENABLED
			enum rte_iova_mode iova_mode;

			iova_mode = rte_eal_iova_mode();

			if (spdk_iommu_is_enabled() && iova_mode == RTE_IOVA_VA) {
				/* We'll use the virtual address as the iova to match DPDK. */
				paddr = (uint64_t)vaddr;
				rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len);
				if (rc) {
					return -EFAULT;
				}
				while (len > 0) {
					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}
					vaddr += VALUE_2MB;
					paddr += VALUE_2MB;
					len -= VALUE_2MB;
				}
			} else
#endif
			{
				/* Get the physical address from /proc/self/pagemap. */
				paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
				if (paddr == SPDK_VTOPHYS_ERROR) {
					/* Get the physical address from PCI devices */
					paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}
					/* The beginning of this address range points to a PCI resource,
					 * so the rest must point to a PCI resource as well.
					 */
					pci_phys = 1;
				}

				/* Get paddr for each 2MB chunk in this address range */
				while (len > 0) {
					/* Get the physical address from /proc/self/pagemap. */
					if (pci_phys) {
						paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
					} else {
						paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
					}

					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}

					/* Since a PCI paddr can break the 2MB physical alignment,
					 * skip this check for PCI-backed addresses.
					 */
					if (!pci_phys && (paddr & MASK_2MB)) {
						DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
						return -EINVAL;
					}
#if VFIO_ENABLED
					/* If the IOMMU is on, but DPDK is using iova-mode=pa, we want to register
					 * this memory with the IOMMU using the physical address to match. */
					if (spdk_iommu_is_enabled()) {
						rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
						if (rc) {
							DEBUG_PRINT("Unable to assign vaddr %p to paddr 0x%" PRIx64 "\n", vaddr, paddr);
							return -EFAULT;
						}
					}
#endif

					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}

					vaddr += VALUE_2MB;
					len -= VALUE_2MB;
				}
			}
		} else {
			/* This is an address managed by DPDK. Just setup the translations. */
			while (len > 0) {
				paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
				if (paddr == SPDK_VTOPHYS_ERROR) {
					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
					return -EFAULT;
				}

				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
				if (rc != 0) {
					return rc;
				}

				vaddr += VALUE_2MB;
				len -= VALUE_2MB;
			}
		}

		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
#if VFIO_ENABLED
		if (paddr == SPDK_VTOPHYS_ERROR) {
			/*
			 * This is not an address that DPDK is managing. If vfio is enabled,
			 * we need to unmap the range from the IOMMU
			 */
			if (spdk_iommu_is_enabled()) {
				uint64_t buffer_len = len;
				uint8_t *va = vaddr;
				enum rte_iova_mode iova_mode;

				iova_mode = rte_eal_iova_mode();
				/*
				 * In virtual address mode, the region is contiguous and can be done in
				 * one unmap.
				 */
				if (iova_mode == RTE_IOVA_VA) {
					paddr = spdk_mem_map_translate(map, (uint64_t)va, &buffer_len);
					if (buffer_len != len || paddr != (uintptr_t)va) {
						DEBUG_PRINT("Unmapping %p with length %lu failed because "
							    "translation had address 0x%" PRIx64 " and length %lu\n",
							    va, len, paddr, buffer_len);
						return -EINVAL;
					}
					rc = vtophys_iommu_unmap_dma(paddr, len);
					if (rc) {
						DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
						return -EFAULT;
					}
				} else if (iova_mode == RTE_IOVA_PA) {
					/* Get paddr for each 2MB chunk in this address range */
					while (buffer_len > 0) {
						paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);

						if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
							DEBUG_PRINT("could not get phys addr for %p\n", va);
							return -EFAULT;
						}

						rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
						if (rc) {
							DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
							return -EFAULT;
						}

						va += VALUE_2MB;
						buffer_len -= VALUE_2MB;
					}
				}
			}
		}
#endif
		while (len > 0) {
			rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
			if (rc != 0) {
				return rc;
			}

			vaddr += VALUE_2MB;
			len -= VALUE_2MB;
		}

		break;
	default:
		SPDK_UNREACHABLE();
	}

	return rc;
}

static int
vtophys_check_contiguous_entries(uint64_t paddr1, uint64_t paddr2)
{
	/* This function is always called with paddrs for two subsequent
	 * 2MB chunks in virtual address space, so those chunks will only be
	 * physically contiguous if the physical addresses are exactly 2MB
	 * apart from each other as well.
	 */
	return (paddr2 - paddr1 == VALUE_2MB);
}
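/* For example (illustrative values only): with are_contiguous set to the
 * function above, two neighbouring 2MB virtual pages translated to
 * 0x200000000 and 0x200200000 are merged by spdk_mem_map_translate(), while
 * 0x200000000 and 0x200400000 are not, so a caller asking for a 4MB
 * translation would get *size reduced to 2MB in the latter case.
 */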
#if VFIO_ENABLED

static bool
vfio_enabled(void)
{
	return rte_vfio_is_enabled("vfio_pci");
}

/* Check if IOMMU is enabled on the system */
static bool
has_iommu_groups(void)
{
	int count = 0;
	DIR *dir = opendir("/sys/kernel/iommu_groups");

	if (dir == NULL) {
		return false;
	}

	while (count < 3 && readdir(dir) != NULL) {
		count++;
	}

	closedir(dir);
	/* there will always be ./ and ../ entries */
	return count > 2;
}

static bool
vfio_noiommu_enabled(void)
{
	return rte_vfio_noiommu_is_enabled();
}

static void
vtophys_iommu_init(void)
{
	char proc_fd_path[PATH_MAX + 1];
	char link_path[PATH_MAX + 1];
	const char vfio_path[] = "/dev/vfio/vfio";
	DIR *dir;
	struct dirent *d;

	if (!vfio_enabled()) {
		return;
	}

	if (vfio_noiommu_enabled()) {
		g_vfio.noiommu_enabled = true;
	} else if (!has_iommu_groups()) {
		return;
	}

	dir = opendir("/proc/self/fd");
	if (!dir) {
		DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
		return;
	}

	while ((d = readdir(dir)) != NULL) {
		if (d->d_type != DT_LNK) {
			continue;
		}

		snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
		if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
			continue;
		}

		if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
			sscanf(d->d_name, "%d", &g_vfio.fd);
			break;
		}
	}

	closedir(dir);

	if (g_vfio.fd < 0) {
		DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
		return;
	}

	g_vfio.enabled = true;

	return;
}

#endif

void
vtophys_pci_device_added(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);

	vtophys_dev = calloc(1, sizeof(*vtophys_dev));
	if (vtophys_dev) {
		vtophys_dev->pci_device = pci_device;
		TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
	} else {
		DEBUG_PRINT("Memory allocation error\n");
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	g_vfio.device_ref++;
	if (g_vfio.device_ref > 1) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the first SPDK device using DPDK vfio. This means that the first
	 * IOMMU group might have just been added to the DPDK vfio container.
	 * From this point on, it is certain that the memory can be mapped.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
		if (ret) {
			DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}

void
vtophys_pci_device_removed(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		if (vtophys_dev->pci_device == pci_device) {
			TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
			free(vtophys_dev);
			break;
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	assert(g_vfio.device_ref > 0);
	g_vfio.device_ref--;
	if (g_vfio.device_ref > 0) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
	 * any additional devices using its vfio container, all the mappings
	 * will be automatically removed by the Linux vfio driver. We unmap
	 * the memory manually to be able to easily re-map it later regardless
	 * of other, external factors.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		struct vfio_iommu_type1_dma_unmap unmap = {};

		unmap.argsz = sizeof(unmap);
		unmap.flags = 0;
		unmap.iova = dma_map->map.iova;
		unmap.size = dma_map->map.size;
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
		if (ret) {
			DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}

int
vtophys_init(void)
{
	const struct spdk_mem_map_ops vtophys_map_ops = {
		.notify_cb = vtophys_notify,
		.are_contiguous = vtophys_check_contiguous_entries,
	};

	const struct spdk_mem_map_ops phys_ref_map_ops = {
		.notify_cb = NULL,
		.are_contiguous = NULL,
	};

#if VFIO_ENABLED
	vtophys_iommu_init();
#endif

	g_phys_ref_map = spdk_mem_map_alloc(0, &phys_ref_map_ops, NULL);
	if (g_phys_ref_map == NULL) {
		DEBUG_PRINT("phys_ref map allocation failed.\n");
		return -ENOMEM;
	}

	g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
	if (g_vtophys_map == NULL) {
		DEBUG_PRINT("vtophys map allocation failed\n");
		spdk_mem_map_free(&g_phys_ref_map);
		return -ENOMEM;
	}
	return 0;
}

uint64_t
spdk_vtophys(const void *buf, uint64_t *size)
{
	uint64_t vaddr, paddr_2mb;

	vaddr = (uint64_t)buf;
	paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);

	/*
	 * SPDK_VTOPHYS_ERROR has all bits set. The 2MB translation used to be
	 * combined with the page offset of buf using bitwise-or, which left
	 * SPDK_VTOPHYS_ERROR intact. Now that we add the offset instead (PCI
	 * vtophys results may not be 2MB aligned), we must check for
	 * SPDK_VTOPHYS_ERROR before performing the addition.
	 */
	SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
	if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
		return SPDK_VTOPHYS_ERROR;
	} else {
		return paddr_2mb + (vaddr & MASK_2MB);
	}
}
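/* A minimal usage sketch (illustrative only, io_len/buf are hypothetical):
 * translating a registered buffer before handing it to a DMA engine, and
 * checking how much of it is physically contiguous:
 *
 *	uint64_t size = io_len;
 *	uint64_t phys = spdk_vtophys(buf, &size);
 *
 *	if (phys == SPDK_VTOPHYS_ERROR) {
 *		// buf was never registered (or has no usable translation)
 *	} else if (size < io_len) {
 *		// physically fragmented; issue the transfer in pieces
 *	}
 */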
int
spdk_mem_get_fd_and_offset(void *vaddr, uint64_t *offset)
{
	struct rte_memseg *seg;
	int ret, fd;

	seg = rte_mem_virt2memseg(vaddr, NULL);
	if (!seg) {
		SPDK_ERRLOG("memory %p doesn't exist\n", vaddr);
		return -ENOENT;
	}

	fd = rte_memseg_get_fd_thread_unsafe(seg);
	if (fd < 0) {
		return fd;
	}

	ret = rte_memseg_get_fd_offset_thread_unsafe(seg, offset);
	if (ret < 0) {
		return ret;
	}

	return fd;
}
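/* A minimal usage sketch (illustrative only): the returned fd and offset
 * identify the hugepage file backing vaddr, so the same memory could, for
 * example, be mapped a second time by the caller:
 *
 *	uint64_t offset = 0;
 *	int fd = spdk_mem_get_fd_and_offset(vaddr, &offset);
 *
 *	if (fd >= 0) {
 *		void *second_view = mmap(NULL, VALUE_2MB, PROT_READ | PROT_WRITE,
 *					 MAP_SHARED, fd, offset);
 *		// ... munmap(second_view, VALUE_2MB) when done ...
 *	}
 */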