/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "env_internal.h"

#include <rte_config.h>
#include <rte_memory.h>
#include <rte_eal_memconfig.h>

#include "spdk_internal/assert.h"
#include "spdk_internal/memory.h"

#include "spdk/assert.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/util.h"
#include "spdk/env_dpdk.h"

#ifdef __FreeBSD__
#define SPDK_VFIO_ENABLED 0
#else
#include <linux/version.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
#define SPDK_VFIO_ENABLED 1
#include <linux/vfio.h>
#include <rte_vfio.h>

struct spdk_vfio_dma_map {
	struct vfio_iommu_type1_dma_map map;
	struct vfio_iommu_type1_dma_unmap unmap;
	TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
};

struct vfio_cfg {
	int fd;
	bool enabled;
	bool noiommu_enabled;
	unsigned device_ref;
	TAILQ_HEAD(, spdk_vfio_dma_map) maps;
	pthread_mutex_t mutex;
};

static struct vfio_cfg g_vfio = {
	.fd = -1,
	.enabled = false,
	.noiommu_enabled = false,
	.device_ref = 0,
	.maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
	.mutex = PTHREAD_MUTEX_INITIALIZER
};

#else
#define SPDK_VFIO_ENABLED 0
#endif
#endif

#if DEBUG
#define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__)
#else
#define DEBUG_PRINT(...)
#endif

#define FN_2MB_TO_4KB(fn)	(fn << (SHIFT_2MB - SHIFT_4KB))
#define FN_4KB_TO_2MB(fn)	(fn >> (SHIFT_2MB - SHIFT_4KB))

#define MAP_256TB_IDX(vfn_2mb)	((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
#define MAP_1GB_IDX(vfn_2mb)	((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))

/* Page is registered */
#define REG_MAP_REGISTERED	(1ULL << 62)

/* A notification region barrier. The 2MB translation entry that's marked
 * with this flag must be unregistered separately.
 * This allows contiguous regions to be unregistered in the same chunks
 * they were registered.
 */
#define REG_MAP_NOTIFY_START	(1ULL << 63)

/* Translation of a single 2MB page. */
struct map_2mb {
	uint64_t translation_2mb;
};

/* Second-level map table indexed by bits [21..29] of the virtual address.
 * Each entry contains the address translation or error for entries that haven't
 * been retrieved yet.
 */
struct map_1gb {
	struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
};

/* Top-level map table indexed by bits [30..47] of the virtual address.
 * Each entry points to a second-level map table or NULL.
 */
struct map_256tb {
	struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
};

/* Page-granularity memory address translation */
struct spdk_mem_map {
	struct map_256tb map_256tb;
	pthread_mutex_t mutex;
	uint64_t default_translation;
	struct spdk_mem_map_ops ops;
	void *cb_ctx;
	TAILQ_ENTRY(spdk_mem_map) tailq;
};

/* Registrations map. The 64 bit translations are bit fields with the
 * following layout (starting with the low bits):
 *    0 - 61 : reserved
 *   62 - 63 : flags
 */
static struct spdk_mem_map *g_mem_reg_map;
static TAILQ_HEAD(, spdk_mem_map) g_spdk_mem_maps = TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;

/*
 * Walk the currently registered memory via the main memory registration map
 * and call the new map's notify callback for each virtually contiguous region.
 */
static int
spdk_mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
{
	size_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t contig_start = UINT64_MAX;
	uint64_t contig_end = UINT64_MAX;
	struct map_1gb *map_1gb;
	int rc;

	if (!g_mem_reg_map) {
		return -EINVAL;
	}

	/* Hold the memory registration map mutex so no new registrations
	 * can be added while we are looping.
	 */
	pthread_mutex_lock(&g_mem_reg_map->mutex);

	for (idx_256tb = 0;
	     idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
	     idx_256tb++) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_start != UINT64_MAX) {
				/* End of a virtually contiguous range */
				rc = map->ops.notify_cb(map->cb_ctx, map, action,
							(void *)contig_start,
							contig_end - contig_start + VALUE_2MB);
				/* Don't bother handling unregister failures.
				 * It can't be any worse.
				 */
				if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
					goto err_unregister;
				}
			}
			contig_start = UINT64_MAX;
			continue;
		}

		for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_start == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_start == UINT64_MAX) {
					contig_start = vaddr;
				}

				contig_end = vaddr;
			} else {
				if (contig_start != UINT64_MAX) {
					/* End of a virtually contiguous range */
					rc = map->ops.notify_cb(map->cb_ctx, map, action,
								(void *)contig_start,
								contig_end - contig_start + VALUE_2MB);
					/* Don't bother handling unregister failures. It can't be any worse */
					if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
						goto err_unregister;
					}

					/* This page might be a part of a neighbour region, so process
					 * it again. The idx_1gb will be incremented immediately.
					 */
					idx_1gb--;
				}
				contig_start = UINT64_MAX;
			}
		}
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return 0;

err_unregister:
	/* Unwind to the first empty translation so we don't unregister
	 * a region that just failed to register.
	 */
	idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
	idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
	contig_start = UINT64_MAX;
	contig_end = UINT64_MAX;

	/* Unregister any memory we managed to register before the failure */
	for (; idx_256tb < SIZE_MAX; idx_256tb--) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_end != UINT64_MAX) {
				/* End of a virtually contiguous range */
				map->ops.notify_cb(map->cb_ctx, map,
						   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
						   (void *)contig_start,
						   contig_end - contig_start + VALUE_2MB);
			}
			contig_end = UINT64_MAX;
			continue;
		}

		for (; idx_1gb < UINT64_MAX; idx_1gb--) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_end == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_end == UINT64_MAX) {
					contig_end = vaddr;
				}
				contig_start = vaddr;
			} else {
				if (contig_end != UINT64_MAX) {
					/* End of a virtually contiguous range */
					map->ops.notify_cb(map->cb_ctx, map,
							   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
							   (void *)contig_start,
							   contig_end - contig_start + VALUE_2MB);
					idx_1gb++;
				}
				contig_end = UINT64_MAX;
			}
		}
		idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return rc;
}

struct spdk_mem_map *
spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
{
	struct spdk_mem_map *map;
	int rc;

	map = calloc(1, sizeof(*map));
	if (map == NULL) {
		return NULL;
	}

	if (pthread_mutex_init(&map->mutex, NULL)) {
		free(map);
		return NULL;
	}

	map->default_translation = default_translation;
	map->cb_ctx = cb_ctx;
	if (ops) {
		map->ops = *ops;
	}
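
	/* If the caller provided a notify callback, replay all currently
	 * registered memory into the new map before adding it to the global
	 * list of maps, so that the new map starts out consistent with the
	 * registration map.
	 */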
	if (ops && ops->notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		rc = spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			DEBUG_PRINT("Initial mem_map notify failed\n");
			pthread_mutex_destroy(&map->mutex);
			free(map);
			return NULL;
		}
		TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	return map;
}

void
spdk_mem_map_free(struct spdk_mem_map **pmap)
{
	struct spdk_mem_map *map;
	size_t i;

	if (!pmap) {
		return;
	}

	map = *pmap;

	if (!map) {
		return;
	}

	if (map->ops.notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
		TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
		free(map->map_256tb.map[i]);
	}

	pthread_mutex_destroy(&map->mutex);

	free(map);
	*pmap = NULL;
}

int
spdk_mem_register(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	seg_vaddr = vaddr;
	seg_len = 0;
	while (len > 0) {
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
					     seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
		seg_len += VALUE_2MB;
		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return rc;
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

int
spdk_mem_unregister(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg, newreg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* The first page must be a start of a region. Also check if it's
	 * registered to make sure we don't return -ERANGE for non-registered
	 * regions.
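	 * An unregistered first page is caught by the loop below instead and
	 * reported as -EINVAL.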
	 */
	reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
	if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if ((reg & REG_MAP_REGISTERED) == 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EINVAL;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
	/* If the next page is registered, it must be a start of a region as well,
	 * otherwise we'd be unregistering only a part of a region.
	 */
	if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}
	seg_vaddr = vaddr;
	seg_len = 0;

	while (len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);

		if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
			TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
				rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
				if (rc != 0) {
					pthread_mutex_unlock(&g_spdk_mem_map_mutex);
					return rc;
				}
			}

			seg_vaddr = vaddr;
			seg_len = VALUE_2MB;
		} else {
			seg_len += VALUE_2MB;
		}

		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	if (seg_len > 0) {
		TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
			rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
			if (rc != 0) {
				pthread_mutex_unlock(&g_spdk_mem_map_mutex);
				return rc;
			}
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

static struct map_1gb *
spdk_mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
{
	struct map_1gb *map_1gb;
	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
	size_t i;

	if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
		return NULL;
	}

	map_1gb = map->map_256tb.map[idx_256tb];

	if (!map_1gb) {
		pthread_mutex_lock(&map->mutex);

		/* Recheck to make sure nobody else got the mutex first.
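		 * This is the second half of a double-checked allocation: the
		 * first lookup above was done without holding the map mutex.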
		 */
		map_1gb = map->map_256tb.map[idx_256tb];
		if (!map_1gb) {
			map_1gb = malloc(sizeof(struct map_1gb));
			if (map_1gb) {
				/* initialize all entries to default translation */
				for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
					map_1gb->map[i].translation_2mb = map->default_translation;
				}
				map->map_256tb.map[idx_256tb] = map_1gb;
			}
		}

		pthread_mutex_unlock(&map->mutex);

		if (!map_1gb) {
			DEBUG_PRINT("allocation failed\n");
			return NULL;
		}
	}

	return map_1gb;
}

int
spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
			     uint64_t translation)
{
	uint64_t vfn_2mb;
	struct map_1gb *map_1gb;
	uint64_t idx_1gb;
	struct map_2mb *map_2mb;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr);
		return -EINVAL;
	}

	/* For now, only 2 MB-aligned registrations are supported */
	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n",
			    __func__, vaddr, size);
		return -EINVAL;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;

	while (size) {
		map_1gb = spdk_mem_map_get_map_1gb(map, vfn_2mb);
		if (!map_1gb) {
			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
			return -ENOMEM;
		}

		idx_1gb = MAP_1GB_IDX(vfn_2mb);
		map_2mb = &map_1gb->map[idx_1gb];
		map_2mb->translation_2mb = translation;

		size -= VALUE_2MB;
		vfn_2mb++;
	}

	return 0;
}

int
spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
{
	uint64_t vfn_2mb;
	struct map_1gb *map_1gb;
	uint64_t idx_1gb;
	struct map_2mb *map_2mb;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr);
		return -EINVAL;
	}

	/* For now, only 2 MB-aligned registrations are supported */
	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n",
			    __func__, vaddr, size);
		return -EINVAL;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;

	while (size) {
		map_1gb = spdk_mem_map_get_map_1gb(map, vfn_2mb);
		if (!map_1gb) {
			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
			return -ENOMEM;
		}

		idx_1gb = MAP_1GB_IDX(vfn_2mb);
		map_2mb = &map_1gb->map[idx_1gb];
		map_2mb->translation_2mb = map->default_translation;

		size -= VALUE_2MB;
		vfn_2mb++;
	}

	return 0;
}

inline uint64_t
spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
{
	const struct map_1gb *map_1gb;
	const struct map_2mb *map_2mb;
	uint64_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t vfn_2mb;
	uint64_t cur_size;
	uint64_t prev_translation;
	uint64_t orig_translation;

	if (spdk_unlikely(vaddr & ~MASK_256TB)) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
		return map->default_translation;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;
	idx_256tb = MAP_256TB_IDX(vfn_2mb);
	idx_1gb = MAP_1GB_IDX(vfn_2mb);

	map_1gb = map->map_256tb.map[idx_256tb];
	if (spdk_unlikely(!map_1gb)) {
		return map->default_translation;
	}

	cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
	map_2mb = &map_1gb->map[idx_1gb];
	if (size == NULL || map->ops.are_contiguous == NULL ||
	    map_2mb->translation_2mb == map->default_translation) {
		if (size != NULL) {
			*size = spdk_min(*size, cur_size);
		}
		return map_2mb->translation_2mb;
	}

	orig_translation = map_2mb->translation_2mb;
	prev_translation = orig_translation;
	while (cur_size < *size) {
		vfn_2mb++;
		idx_256tb = MAP_256TB_IDX(vfn_2mb);
		idx_1gb = MAP_1GB_IDX(vfn_2mb);

		map_1gb = map->map_256tb.map[idx_256tb];
		if (spdk_unlikely(!map_1gb)) {
			break;
		}

		map_2mb = &map_1gb->map[idx_1gb];
		if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
			break;
		}

		cur_size += VALUE_2MB;
		prev_translation = map_2mb->translation_2mb;
	}

	*size = spdk_min(*size, cur_size);
	return orig_translation;
}

#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
static void
memory_hotplug_cb(enum rte_mem_event event_type,
		  const void *addr, size_t len, void *arg)
{
	if (event_type == RTE_MEM_EVENT_ALLOC) {
		spdk_mem_register((void *)addr, len);

#if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0)
		if (!spdk_env_dpdk_external_init()) {
			return;
		}
#endif

		/* Prior to DPDK 19.02, we have to worry about DPDK
		 * freeing memory in different units than it was allocated.
		 * That doesn't work with things like RDMA MRs. So for
		 * those versions of DPDK, mark each segment so that DPDK
		 * won't later free it. That ensures we don't have to deal
		 * with that scenario.
		 *
		 * DPDK 19.02 added the --match-allocations RTE flag to
		 * avoid this condition.
		 *
		 * Note: if the user initialized DPDK separately, we can't
		 * be sure that --match-allocations was specified, so we still
		 * need to mark the segments so they aren't freed.
		 */
		while (len > 0) {
			struct rte_memseg *seg;

			seg = rte_mem_virt2memseg(addr, NULL);
			assert(seg != NULL);
			seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
			addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
			len -= seg->hugepage_sz;
		}
	} else if (event_type == RTE_MEM_EVENT_FREE) {
		spdk_mem_unregister((void *)addr, len);
	}
}

static int
memory_iter_cb(const struct rte_memseg_list *msl,
	       const struct rte_memseg *ms, size_t len, void *arg)
{
	return spdk_mem_register(ms->addr, len);
}
#endif

int
spdk_mem_map_init(void)
{
	g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
	if (g_mem_reg_map == NULL) {
		DEBUG_PRINT("memory registration map allocation failed\n");
		return -ENOMEM;
	}

	/*
	 * Walk all DPDK memory segments and register them
	 * with the master memory map
	 */
#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
	rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
	rte_memseg_contig_walk(memory_iter_cb, NULL);
#else
	struct rte_mem_config *mcfg;
	size_t seg_idx;

	mcfg = rte_eal_get_configuration()->mem_config;
	for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) {
		struct rte_memseg *seg = &mcfg->memseg[seg_idx];

		if (seg->addr == NULL) {
			break;
		}

		spdk_mem_register(seg->addr, seg->len);
	}
#endif
	return 0;
}

bool
spdk_iommu_is_enabled(void)
{
#if SPDK_VFIO_ENABLED
	return g_vfio.enabled && !g_vfio.noiommu_enabled;
#else
	return false;
#endif
}

struct spdk_vtophys_pci_device {
	struct rte_pci_device *pci_device;
	TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
};

static pthread_mutex_t g_vtophys_pci_devices_mutex =
	PTHREAD_MUTEX_INITIALIZER;
static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
	TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);

static struct spdk_mem_map *g_vtophys_map;

#if SPDK_VFIO_ENABLED
static int
vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	dma_map = calloc(1, sizeof(*dma_map));
	if (dma_map == NULL) {
		return -ENOMEM;
	}

	dma_map->map.argsz = sizeof(dma_map->map);
	dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	dma_map->map.vaddr = vaddr;
	dma_map->map.iova = iova;
	dma_map->map.size = size;

	dma_map->unmap.argsz = sizeof(dma_map->unmap);
	dma_map->unmap.flags = 0;
	dma_map->unmap.iova = iova;
	dma_map->unmap.size = size;

	pthread_mutex_lock(&g_vfio.mutex);
	if (g_vfio.device_ref == 0) {
		/* VFIO requires at least one device (IOMMU group) to be added to
		 * a VFIO container before it is possible to perform any IOMMU
		 * operations on that container. This memory will be mapped once
		 * the first device (IOMMU group) is hotplugged.
		 *
		 * Since the vfio container is managed internally by DPDK, it is
		 * also possible that some device is already in that container, but
		 * it's not managed by SPDK - e.g. a NIC attached internally
		 * inside DPDK. We could map the memory straight away in such
		 * scenario, but there's no need to do it. DPDK devices clearly
		 * don't need our mappings and hence we defer the mapping
		 * unconditionally until the first SPDK-managed device is
		 * hotplugged.
		 */
		goto out_insert;
	}

	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
	if (ret) {
		DEBUG_PRINT("Cannot set up DMA mapping, error %d\n", errno);
		pthread_mutex_unlock(&g_vfio.mutex);
		free(dma_map);
		return ret;
	}

out_insert:
	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
	pthread_mutex_unlock(&g_vfio.mutex);
	return 0;
}

static int
vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	pthread_mutex_lock(&g_vfio.mutex);
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		if (dma_map->map.iova == iova) {
			break;
		}
	}

	if (dma_map == NULL) {
		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
		pthread_mutex_unlock(&g_vfio.mutex);
		return -ENXIO;
	}

	/** don't support partial or multiple-page unmap for now */
	assert(dma_map->map.size == size);

	if (g_vfio.device_ref == 0) {
		/* Memory is not mapped anymore, just remove its references */
		goto out_remove;
	}

	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
	if (ret) {
		DEBUG_PRINT("Cannot clear DMA mapping, error %d\n", errno);
		pthread_mutex_unlock(&g_vfio.mutex);
		return ret;
	}

out_remove:
	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
	pthread_mutex_unlock(&g_vfio.mutex);
	free(dma_map);
	return 0;
}
#endif

static uint64_t
vtophys_get_paddr_memseg(uint64_t vaddr)
{
	uintptr_t paddr;
	struct rte_memseg *seg;

#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
	seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
	if (seg != NULL) {
		paddr = seg->phys_addr;
		if (paddr == RTE_BAD_IOVA) {
			return SPDK_VTOPHYS_ERROR;
		}
		paddr += (vaddr -
			  (uintptr_t)seg->addr);
		return paddr;
	}
#else
	struct rte_mem_config *mcfg;
	uint32_t seg_idx;

	mcfg = rte_eal_get_configuration()->mem_config;
	for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) {
		seg = &mcfg->memseg[seg_idx];
		if (seg->addr == NULL) {
			break;
		}

		if (vaddr >= (uintptr_t)seg->addr &&
		    vaddr < ((uintptr_t)seg->addr + seg->len)) {
			paddr = seg->phys_addr;
			if (paddr == RTE_BAD_IOVA) {
				return SPDK_VTOPHYS_ERROR;
			}
			paddr += (vaddr - (uintptr_t)seg->addr);
			return paddr;
		}
	}
#endif

	return SPDK_VTOPHYS_ERROR;
}

/* Try to get the paddr from /proc/self/pagemap */
static uint64_t
vtophys_get_paddr_pagemap(uint64_t vaddr)
{
	uintptr_t paddr;

	/* Silence static analyzers */
	assert(vaddr != 0);
	paddr = rte_mem_virt2iova((void *)vaddr);
	if (paddr == RTE_BAD_IOVA) {
		/*
		 * The vaddr may be valid but doesn't have a backing page
		 * assigned yet. Touch the page to ensure a backing page
		 * gets assigned, then try to translate again.
		 */
		rte_atomic64_read((rte_atomic64_t *)vaddr);
		paddr = rte_mem_virt2iova((void *)vaddr);
	}
	if (paddr == RTE_BAD_IOVA) {
		/* Unable to get to the physical address. */
		return SPDK_VTOPHYS_ERROR;
	}

	return paddr;
}

/* Try to get the paddr from pci devices */
static uint64_t
vtophys_get_paddr_pci(uint64_t vaddr)
{
	struct spdk_vtophys_pci_device *vtophys_dev;
	uintptr_t paddr;
	struct rte_pci_device *dev;
	struct rte_mem_resource *res;
	unsigned r;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		dev = vtophys_dev->pci_device;

		for (r = 0; r < PCI_MAX_RESOURCE; r++) {
			res = &dev->mem_resource[r];
			if (res->phys_addr && vaddr >= (uint64_t)res->addr &&
			    vaddr < (uint64_t)res->addr + res->len) {
				paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
				DEBUG_PRINT("%s: %p -> %p\n", __func__, (void *)vaddr,
					    (void *)paddr);
				pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
				return paddr;
			}
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

	return SPDK_VTOPHYS_ERROR;
}

static int
spdk_vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
		    enum spdk_mem_map_notify_action action,
		    void *vaddr, size_t len)
{
	int rc = 0, pci_phys = 0;
	uint64_t paddr;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid parameters, vaddr=%p len=%ju\n",
			    vaddr, len);
		return -EINVAL;
	}

	/* Get the physical address from the DPDK memsegs */
	paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);

	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		if (paddr == SPDK_VTOPHYS_ERROR) {
			/* This is not an address that DPDK is managing. */
#if SPDK_VFIO_ENABLED
			if (spdk_iommu_is_enabled()) {
				/* We'll use the virtual address as the iova. DPDK
				 * currently uses physical addresses as the iovas (or counts
				 * up from 0 if it can't get physical addresses), so
				 * the range of user space virtual addresses and physical
				 * addresses will never overlap.
				 */
				paddr = (uint64_t)vaddr;
				rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len);
				if (rc) {
					return -EFAULT;
				}
				while (len > 0) {
					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}
					vaddr += VALUE_2MB;
					paddr += VALUE_2MB;
					len -= VALUE_2MB;
				}
			} else
#endif
			{
				/* Get the physical address from /proc/self/pagemap. */
				paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
				if (paddr == SPDK_VTOPHYS_ERROR) {
					/* Get the physical address from PCI devices */
					paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}
					/* The beginning of this address range points to a PCI resource,
					 * so the rest must point to a PCI resource as well.
					 */
					pci_phys = 1;
				}

				/* Get paddr for each 2MB chunk in this address range */
				while (len > 0) {
					/* Get the physical address from /proc/self/pagemap. */
					if (pci_phys) {
						paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
					} else {
						paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
					}

					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}

					/* Since a PCI paddr can break the 2MB physical alignment,
					 * skip this check in that case.
					 */
					if (!pci_phys && (paddr & MASK_2MB)) {
						DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
						return -EINVAL;
					}

					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}

					vaddr += VALUE_2MB;
					len -= VALUE_2MB;
				}
			}
		} else {
			/* This is an address managed by DPDK. Just set up the translations. */
			while (len > 0) {
				paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
				if (paddr == SPDK_VTOPHYS_ERROR) {
					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
					return -EFAULT;
				}

				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
				if (rc != 0) {
					return rc;
				}

				vaddr += VALUE_2MB;
				len -= VALUE_2MB;
			}
		}

		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
#if SPDK_VFIO_ENABLED
		if (paddr == SPDK_VTOPHYS_ERROR) {
			/*
			 * This is not an address that DPDK is managing. If vfio is enabled,
			 * we need to unmap the range from the IOMMU
			 */
			if (spdk_iommu_is_enabled()) {
				uint64_t buffer_len = len;
				paddr = spdk_mem_map_translate(map, (uint64_t)vaddr, &buffer_len);
				if (buffer_len != len) {
					return -EINVAL;
				}
				rc = vtophys_iommu_unmap_dma(paddr, len);
				if (rc) {
					return -EFAULT;
				}
			}
		}
#endif
		while (len > 0) {
			rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
			if (rc != 0) {
				return rc;
			}

			vaddr += VALUE_2MB;
			len -= VALUE_2MB;
		}

		break;
	default:
		SPDK_UNREACHABLE();
	}

	return rc;
}

static int
vtophys_check_contiguous_entries(uint64_t paddr1, uint64_t paddr2)
{
	/* This function is always called with paddrs for two subsequent
	 * 2MB chunks in virtual address space, so those chunks will be only
	 * physically contiguous if the physical addresses are 2MB apart
	 * from each other as well.
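	 * For example, if the first chunk translates to physical address P,
	 * the two chunks are merged only when the second chunk translates to
	 * exactly P + 2MB.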
	 */
	return (paddr2 - paddr1 == VALUE_2MB);
}

#if SPDK_VFIO_ENABLED

static bool
spdk_vfio_enabled(void)
{
	return rte_vfio_is_enabled("vfio_pci");
}

/* Check if IOMMU is enabled on the system */
static bool
has_iommu_groups(void)
{
	struct dirent *d;
	int count = 0;
	DIR *dir = opendir("/sys/kernel/iommu_groups");

	if (dir == NULL) {
		return false;
	}

	while (count < 3 && (d = readdir(dir)) != NULL) {
		count++;
	}

	closedir(dir);
	/* there will always be ./ and ../ entries */
	return count > 2;
}

static bool
spdk_vfio_noiommu_enabled(void)
{
	return rte_vfio_noiommu_is_enabled();
}

static void
spdk_vtophys_iommu_init(void)
{
	char proc_fd_path[PATH_MAX + 1];
	char link_path[PATH_MAX + 1];
	const char vfio_path[] = "/dev/vfio/vfio";
	DIR *dir;
	struct dirent *d;

	if (!spdk_vfio_enabled()) {
		return;
	}

	if (spdk_vfio_noiommu_enabled()) {
		g_vfio.noiommu_enabled = true;
	} else if (!has_iommu_groups()) {
		return;
	}

	dir = opendir("/proc/self/fd");
	if (!dir) {
		DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
		return;
	}

	while ((d = readdir(dir)) != NULL) {
		if (d->d_type != DT_LNK) {
			continue;
		}

		snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
		if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
			continue;
		}

		if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
			sscanf(d->d_name, "%d", &g_vfio.fd);
			break;
		}
	}

	closedir(dir);

	if (g_vfio.fd < 0) {
		DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
		return;
	}

	g_vfio.enabled = true;

	return;
}
#endif

void
spdk_vtophys_pci_device_added(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);

	vtophys_dev = calloc(1, sizeof(*vtophys_dev));
	if (vtophys_dev) {
		vtophys_dev->pci_device = pci_device;
		TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
	} else {
		DEBUG_PRINT("Memory allocation error\n");
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if SPDK_VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	g_vfio.device_ref++;
	if (g_vfio.device_ref > 1) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the first SPDK device using DPDK vfio. This means that the first
	 * IOMMU group might have just been added to the DPDK vfio container.
	 * From this point it is certain that the memory can be mapped now.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
		if (ret) {
			DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}

void
spdk_vtophys_pci_device_removed(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		if (vtophys_dev->pci_device == pci_device) {
			TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
			free(vtophys_dev);
			break;
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if SPDK_VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	assert(g_vfio.device_ref > 0);
	g_vfio.device_ref--;
	if (g_vfio.device_ref > 0) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
	 * any additional devices using its vfio container, all the mappings
	 * will be automatically removed by the Linux vfio driver. We unmap
	 * the memory manually to be able to easily re-map it later regardless
	 * of other, external factors.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
		if (ret) {
			DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}

int
spdk_vtophys_init(void)
{
	const struct spdk_mem_map_ops vtophys_map_ops = {
		.notify_cb = spdk_vtophys_notify,
		.are_contiguous = vtophys_check_contiguous_entries,
	};

#if SPDK_VFIO_ENABLED
	spdk_vtophys_iommu_init();
#endif

	g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
	if (g_vtophys_map == NULL) {
		DEBUG_PRINT("vtophys map allocation failed\n");
		return -ENOMEM;
	}
	return 0;
}

uint64_t
spdk_vtophys(void *buf, uint64_t *size)
{
	uint64_t vaddr, paddr_2mb;

	vaddr = (uint64_t)buf;
	paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);

	/*
	 * SPDK_VTOPHYS_ERROR has all bits set, so if the lookup returned SPDK_VTOPHYS_ERROR,
	 * we will still bitwise-or it with the buf offset below, but the result will still be
	 * SPDK_VTOPHYS_ERROR. However now that we do + rather than | (due to PCI vtophys being
	 * unaligned) we must now check the return value before addition.
	 */
	SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
	if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
		return SPDK_VTOPHYS_ERROR;
	} else {
		return paddr_2mb + (vaddr & MASK_2MB);
	}
}