/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "env_internal.h"

#include <rte_config.h>
#include <rte_malloc.h>
#include <rte_memory.h>
#include <rte_eal_memconfig.h>

#include "spdk_internal/assert.h"
#include "spdk_internal/memory.h"

#include "spdk/assert.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/util.h"
#include "spdk/env_dpdk.h"

#ifdef __FreeBSD__
#define SPDK_VFIO_ENABLED 0
#else
#include <linux/version.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
#define SPDK_VFIO_ENABLED 1
#include <linux/vfio.h>
#include <rte_vfio.h>

struct spdk_vfio_dma_map {
	struct vfio_iommu_type1_dma_map map;
	struct vfio_iommu_type1_dma_unmap unmap;
	TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
};

struct vfio_cfg {
	int fd;
	bool enabled;
	bool noiommu_enabled;
	unsigned device_ref;
	TAILQ_HEAD(, spdk_vfio_dma_map) maps;
	pthread_mutex_t mutex;
};

static struct vfio_cfg g_vfio = {
	.fd = -1,
	.enabled = false,
	.noiommu_enabled = false,
	.device_ref = 0,
	.maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
	.mutex = PTHREAD_MUTEX_INITIALIZER
};

#else
#define SPDK_VFIO_ENABLED 0
#endif
#endif

#if DEBUG
#define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__)
#else
#define DEBUG_PRINT(...)
#endif

#define FN_2MB_TO_4KB(fn)	((fn) << (SHIFT_2MB - SHIFT_4KB))
#define FN_4KB_TO_2MB(fn)	((fn) >> (SHIFT_2MB - SHIFT_4KB))

#define MAP_256TB_IDX(vfn_2mb)	((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
#define MAP_1GB_IDX(vfn_2mb)	((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))

/* Page is registered */
#define REG_MAP_REGISTERED	(1ULL << 62)

/* A notification region barrier. The 2MB translation entry that's marked
 * with this flag must be unregistered separately. This allows contiguous
 * regions to be unregistered in the same chunks in which they were registered.
 */
#define REG_MAP_NOTIFY_START	(1ULL << 63)
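
/* Illustrative note (not from the original sources), assuming the usual
 * values SHIFT_2MB = 21, SHIFT_1GB = 30 and SHIFT_256TB = 48 from
 * "spdk_internal/memory.h": for a 2MB-aligned vaddr of 0x200A00000
 * (8GB + 5 * 2MB),
 *
 *	vfn_2mb       = 0x200A00000 >> SHIFT_2MB = 0x1005
 *	MAP_256TB_IDX = 0x1005 >> 9              = 8   (which 1GB region)
 *	MAP_1GB_IDX   = 0x1005 & 0x1FF           = 5   (which 2MB page in it)
 *
 * so each struct map_1gb below holds 512 8-byte entries (4KB) covering 1GB
 * of virtual address space, and the top-level struct map_256tb holds 2^18
 * pointers (2MB) covering the full 256TB usermode address space.
 *
 * In the registration map, registering two adjacent ranges separately, e.g.
 * spdk_mem_register(A, 4MB) followed by spdk_mem_register(A + 4MB, 2MB),
 * sets REG_MAP_NOTIFY_START on the first 2MB entry of each range, so walks
 * and unregistrations keep treating them as two distinct regions.
 */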

/* Translation of a single 2MB page. */
struct map_2mb {
	uint64_t translation_2mb;
};

/* Second-level map table indexed by bits [21..29] of the virtual address.
 * Each entry contains the address translation, or the map's default
 * translation for entries that haven't been set yet.
 */
struct map_1gb {
	struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
};

/* Top-level map table indexed by bits [30..47] of the virtual address.
 * Each entry points to a second-level map table or NULL.
 */
struct map_256tb {
	struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
};

/* Page-granularity memory address translation */
struct spdk_mem_map {
	struct map_256tb map_256tb;
	pthread_mutex_t mutex;
	uint64_t default_translation;
	struct spdk_mem_map_ops ops;
	void *cb_ctx;
	TAILQ_ENTRY(spdk_mem_map) tailq;
};

/* Registrations map. The 64-bit translations are bit fields with the
 * following layout (starting with the low bits):
 *	 0 - 61 : reserved
 *	62 - 63 : flags
 */
static struct spdk_mem_map *g_mem_reg_map;
static TAILQ_HEAD(, spdk_mem_map) g_spdk_mem_maps = TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;

static bool g_legacy_mem;

/*
 * Walk the currently registered memory via the main memory registration map
 * and call the new map's notify callback for each virtually contiguous region.
 */
static int
spdk_mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
{
	size_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t contig_start = UINT64_MAX;
	uint64_t contig_end = UINT64_MAX;
	struct map_1gb *map_1gb;
	int rc;

	if (!g_mem_reg_map) {
		return -EINVAL;
	}

	/* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
	pthread_mutex_lock(&g_mem_reg_map->mutex);

	for (idx_256tb = 0;
	     idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
	     idx_256tb++) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_start != UINT64_MAX) {
				/* End of a virtually contiguous range */
				rc = map->ops.notify_cb(map->cb_ctx, map, action,
							(void *)contig_start,
							contig_end - contig_start + VALUE_2MB);
				/* Don't bother handling unregister failures; things can't get any worse. */
				if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
					goto err_unregister;
				}
			}
			contig_start = UINT64_MAX;
			continue;
		}

		for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_start == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_start == UINT64_MAX) {
					contig_start = vaddr;
				}

				contig_end = vaddr;
			} else {
				if (contig_start != UINT64_MAX) {
					/* End of a virtually contiguous range */
					rc = map->ops.notify_cb(map->cb_ctx, map, action,
								(void *)contig_start,
								contig_end - contig_start + VALUE_2MB);
					/* Don't bother handling unregister failures; things can't get any worse. */
					if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
						goto err_unregister;
					}

					/* This page might be a part of a neighboring region, so process
					 * it again. The idx_1gb will be incremented immediately.
					 */
					idx_1gb--;
				}
				contig_start = UINT64_MAX;
			}
		}
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return 0;

err_unregister:
	/* Unwind to the first empty translation so we don't unregister
	 * a region that just failed to register.
	 */
	idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
	idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
	contig_start = UINT64_MAX;
	contig_end = UINT64_MAX;

	/* Unregister any memory we managed to register before the failure */
	for (; idx_256tb < SIZE_MAX; idx_256tb--) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_end != UINT64_MAX) {
				/* End of a virtually contiguous range */
				map->ops.notify_cb(map->cb_ctx, map,
						   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
						   (void *)contig_start,
						   contig_end - contig_start + VALUE_2MB);
			}
			contig_end = UINT64_MAX;
			continue;
		}

		for (; idx_1gb < UINT64_MAX; idx_1gb--) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_end == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_end == UINT64_MAX) {
					contig_end = vaddr;
				}
				contig_start = vaddr;
			} else {
				if (contig_end != UINT64_MAX) {
					/* End of a virtually contiguous range */
					map->ops.notify_cb(map->cb_ctx, map,
							   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
							   (void *)contig_start,
							   contig_end - contig_start + VALUE_2MB);
					idx_1gb++;
				}
				contig_end = UINT64_MAX;
			}
		}
		idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return rc;
}

struct spdk_mem_map *
spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
{
	struct spdk_mem_map *map;
	int rc;

	map = calloc(1, sizeof(*map));
	if (map == NULL) {
		return NULL;
	}

	if (pthread_mutex_init(&map->mutex, NULL)) {
		free(map);
		return NULL;
	}

	map->default_translation = default_translation;
	map->cb_ctx = cb_ctx;
	if (ops) {
		map->ops = *ops;
	}

	if (ops && ops->notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		rc = spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			DEBUG_PRINT("Initial mem_map notify failed\n");
			pthread_mutex_destroy(&map->mutex);
			free(map);
			return NULL;
		}
		TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	return map;
}
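
/* Illustrative usage sketch (not taken from this file): a subsystem that
 * needs its own per-2MB translations would typically do something like the
 * following, where my_notify() and my_ctx are hypothetical:
 *
 *	struct spdk_mem_map_ops ops = { .notify_cb = my_notify, .are_contiguous = NULL };
 *	struct spdk_mem_map *m = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &ops, my_ctx);
 *
 *	// inside my_notify(), for each newly registered region:
 *	spdk_mem_map_set_translation(m, (uint64_t)vaddr, len, some_translation);
 *
 *	// later, on the I/O path:
 *	uint64_t t = spdk_mem_map_translate(m, (uint64_t)buf, NULL);
 *
 * Note that spdk_mem_map_alloc() immediately replays all existing
 * registrations into the new map via spdk_mem_map_notify_walk(), so maps
 * created after memory was registered still see that memory.
 */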

void
spdk_mem_map_free(struct spdk_mem_map **pmap)
{
	struct spdk_mem_map *map;
	size_t i;

	if (!pmap) {
		return;
	}

	map = *pmap;

	if (!map) {
		return;
	}

	if (map->ops.notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
		TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
		if (g_legacy_mem) {
			rte_free(map->map_256tb.map[i]);
		} else {
			free(map->map_256tb.map[i]);
		}
	}

	pthread_mutex_destroy(&map->mutex);

	free(map);
	*pmap = NULL;
}

int
spdk_mem_register(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	seg_vaddr = vaddr;
	seg_len = 0;
	while (len > 0) {
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
					     seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
		seg_len += VALUE_2MB;
		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return rc;
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}
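
/* Illustrative note (not from the original sources): registrations are
 * tracked at region granularity via REG_MAP_NOTIFY_START. For example, after
 *
 *	spdk_mem_register(buf, 4 * VALUE_2MB);
 *
 * the range can only be removed as a whole (possibly together with adjacent,
 * fully covered regions):
 *
 *	spdk_mem_unregister(buf, 4 * VALUE_2MB);
 *
 * Unregistering only a prefix or the middle of the region, or a span that
 * ends partway into a separately registered neighbor, fails with -ERANGE in
 * spdk_mem_unregister() below, and unregistering pages that were never
 * registered fails with -EINVAL.
 */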

int
spdk_mem_unregister(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg, newreg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* The first page must be a start of a region. Also check if it's
	 * registered to make sure we don't return -ERANGE for non-registered
	 * regions.
	 */
	reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
	if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if ((reg & REG_MAP_REGISTERED) == 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EINVAL;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
	/* If the next page is registered, it must be a start of a region as well,
	 * otherwise we'd be unregistering only a part of a region.
	 */
	if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}
	seg_vaddr = vaddr;
	seg_len = 0;

	while (len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);

		if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
			TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
				rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
				if (rc != 0) {
					pthread_mutex_unlock(&g_spdk_mem_map_mutex);
					return rc;
				}
			}

			seg_vaddr = vaddr;
			seg_len = VALUE_2MB;
		} else {
			seg_len += VALUE_2MB;
		}

		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	if (seg_len > 0) {
		TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
			rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
			if (rc != 0) {
				pthread_mutex_unlock(&g_spdk_mem_map_mutex);
				return rc;
			}
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

static struct map_1gb *
spdk_mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
{
	struct map_1gb *map_1gb;
	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
	size_t i;

	if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
		return NULL;
	}

	map_1gb = map->map_256tb.map[idx_256tb];

	if (!map_1gb) {
		pthread_mutex_lock(&map->mutex);

		/* Recheck to make sure nobody else got the mutex first. */
		map_1gb = map->map_256tb.map[idx_256tb];
		if (!map_1gb) {
			/* Some of the existing apps use the TCMalloc hugepage
			 * allocator and register that TCMalloc-allocated
			 * hugepage memory with SPDK in the mmap hook. Since
			 * this function is called in the spdk_mem_register
			 * code path, we can't call malloc() here - that
			 * would cause a livelock. So we use the DPDK-provided
			 * allocator instead, which avoids this cyclic
			 * dependency. Note this is only guaranteed to work when
			 * DPDK dynamic memory allocation is disabled (--legacy-mem),
			 * which then is a requirement for anyone using TCMalloc in
			 * this way.
			 */
			if (g_legacy_mem) {
				map_1gb = rte_malloc(NULL, sizeof(struct map_1gb), 0);
			} else {
				map_1gb = malloc(sizeof(struct map_1gb));
			}
			if (map_1gb) {
				/* initialize all entries to default translation */
				for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
					map_1gb->map[i].translation_2mb = map->default_translation;
				}
				map->map_256tb.map[idx_256tb] = map_1gb;
			}
		}

		pthread_mutex_unlock(&map->mutex);

		if (!map_1gb) {
			DEBUG_PRINT("allocation failed\n");
			return NULL;
		}
	}

	return map_1gb;
}

int
spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
			     uint64_t translation)
{
	uint64_t vfn_2mb;
	struct map_1gb *map_1gb;
	uint64_t idx_1gb;
	struct map_2mb *map_2mb;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr);
		return -EINVAL;
	}

	/* For now, only 2 MB-aligned registrations are supported */
	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n",
			    __func__, vaddr, size);
		return -EINVAL;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;

	while (size) {
		map_1gb = spdk_mem_map_get_map_1gb(map, vfn_2mb);
		if (!map_1gb) {
			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
			return -ENOMEM;
		}

		idx_1gb = MAP_1GB_IDX(vfn_2mb);
		map_2mb = &map_1gb->map[idx_1gb];
		map_2mb->translation_2mb = translation;

		size -= VALUE_2MB;
		vfn_2mb++;
	}

	return 0;
}

int
spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
{
	return spdk_mem_map_set_translation(map, vaddr, size, map->default_translation);
}

inline uint64_t
spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
{
	const struct map_1gb *map_1gb;
	const struct map_2mb *map_2mb;
	uint64_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t vfn_2mb;
	uint64_t cur_size;
	uint64_t prev_translation;
	uint64_t orig_translation;

	if (spdk_unlikely(vaddr & ~MASK_256TB)) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
		return map->default_translation;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;
	idx_256tb = MAP_256TB_IDX(vfn_2mb);
	idx_1gb = MAP_1GB_IDX(vfn_2mb);

	map_1gb = map->map_256tb.map[idx_256tb];
	if (spdk_unlikely(!map_1gb)) {
		return map->default_translation;
	}

	cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
	map_2mb = &map_1gb->map[idx_1gb];
	if (size == NULL || map->ops.are_contiguous == NULL ||
	    map_2mb->translation_2mb == map->default_translation) {
		if (size != NULL) {
			*size = spdk_min(*size, cur_size);
		}
		return map_2mb->translation_2mb;
	}

	orig_translation = map_2mb->translation_2mb;
	prev_translation = orig_translation;
	while (cur_size < *size) {
		vfn_2mb++;
		idx_256tb = MAP_256TB_IDX(vfn_2mb);
		idx_1gb = MAP_1GB_IDX(vfn_2mb);

		map_1gb = map->map_256tb.map[idx_256tb];
		if (spdk_unlikely(!map_1gb)) {
			break;
		}

		map_2mb = &map_1gb->map[idx_1gb];
		if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
			break;
		}

		cur_size += VALUE_2MB;
		prev_translation = map_2mb->translation_2mb;
	}

	*size = spdk_min(*size, cur_size);
	return orig_translation;
}
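
/* Illustrative note (not from the original sources): when a map provides an
 * are_contiguous() callback, spdk_mem_map_translate() above extends *size
 * across consecutive 2MB entries whose translations the callback accepts.
 * For example, with the vtophys map defined later in this file, a
 * hypothetical 2MB-aligned, 6MB buffer backed by physically contiguous
 * hugepages would translate like this (this is what spdk_vtophys() at the
 * end of the file does):
 *
 *	uint64_t len = 6 * 1024 * 1024;
 *	uint64_t paddr = spdk_mem_map_translate(g_vtophys_map, (uint64_t)buf, &len);
 *	// paddr covers the whole buffer and len comes back still 6MB.
 *
 * If the third 2MB chunk were not physically adjacent, len would come back
 * clamped to 4MB and the caller would have to translate the rest separately.
 */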

#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
static void
memory_hotplug_cb(enum rte_mem_event event_type,
		  const void *addr, size_t len, void *arg)
{
	if (event_type == RTE_MEM_EVENT_ALLOC) {
		spdk_mem_register((void *)addr, len);

#if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0)
		if (!spdk_env_dpdk_external_init()) {
			return;
		}
#endif

		/* Prior to DPDK 19.02, we have to worry about DPDK
		 * freeing memory in different units than it was allocated.
		 * That doesn't work with things like RDMA MRs. So for
		 * those versions of DPDK, mark each segment so that DPDK
		 * won't later free it. That ensures we don't have to deal
		 * with that scenario.
		 *
		 * DPDK 19.02 added the --match-allocations RTE flag to
		 * avoid this condition.
		 *
		 * Note: if the user initialized DPDK separately, we can't
		 * be sure that --match-allocations was specified, so we
		 * still need to mark the segments so they aren't freed.
		 */
		while (len > 0) {
			struct rte_memseg *seg;

			seg = rte_mem_virt2memseg(addr, NULL);
			assert(seg != NULL);
			seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
			addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
			len -= seg->hugepage_sz;
		}
	} else if (event_type == RTE_MEM_EVENT_FREE) {
		spdk_mem_unregister((void *)addr, len);
	}
}

static int
memory_iter_cb(const struct rte_memseg_list *msl,
	       const struct rte_memseg *ms, size_t len, void *arg)
{
	return spdk_mem_register(ms->addr, len);
}
#endif

int
spdk_mem_map_init(bool legacy_mem)
{
	g_legacy_mem = legacy_mem;

	g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
	if (g_mem_reg_map == NULL) {
		DEBUG_PRINT("memory registration map allocation failed\n");
		return -ENOMEM;
	}

	/*
	 * Walk all DPDK memory segments and register them
	 * with the master memory map
	 */
#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
	rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
	rte_memseg_contig_walk(memory_iter_cb, NULL);
#else
	struct rte_mem_config *mcfg;
	size_t seg_idx;

	mcfg = rte_eal_get_configuration()->mem_config;
	for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) {
		struct rte_memseg *seg = &mcfg->memseg[seg_idx];

		if (seg->addr == NULL) {
			break;
		}

		spdk_mem_register(seg->addr, seg->len);
	}
#endif
	return 0;
}

bool
spdk_iommu_is_enabled(void)
{
#if SPDK_VFIO_ENABLED
	return g_vfio.enabled && !g_vfio.noiommu_enabled;
#else
	return false;
#endif
}

struct spdk_vtophys_pci_device {
	struct rte_pci_device *pci_device;
	TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
};

static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
	TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);

static struct spdk_mem_map *g_vtophys_map;
static struct spdk_mem_map *g_phys_ref_map;
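
/* Illustrative note (not from the original sources): g_phys_ref_map doesn't
 * hold translations at all - its per-2MB "translation" value is used as a
 * reference count of how many active registrations cover a given IOVA range.
 * For example, if the same hugepage ends up registered under two different
 * virtual addresses that resolve to the same IOVA, the first
 * vtophys_iommu_map_dma() call below issues the VFIO_IOMMU_MAP_DMA ioctl and
 * sets the count to 1, while the second call just bumps it to 2. The matching
 * unmaps count back down, and only the last one issues VFIO_IOMMU_UNMAP_DMA.
 */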

#if SPDK_VFIO_ENABLED
static int
vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	uint64_t refcount;
	int ret;

	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
	assert(refcount < UINT64_MAX);
	if (refcount > 0) {
		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
		return 0;
	}

	dma_map = calloc(1, sizeof(*dma_map));
	if (dma_map == NULL) {
		return -ENOMEM;
	}

	dma_map->map.argsz = sizeof(dma_map->map);
	dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	dma_map->map.vaddr = vaddr;
	dma_map->map.iova = iova;
	dma_map->map.size = size;

	dma_map->unmap.argsz = sizeof(dma_map->unmap);
	dma_map->unmap.flags = 0;
	dma_map->unmap.iova = iova;
	dma_map->unmap.size = size;

	pthread_mutex_lock(&g_vfio.mutex);
	if (g_vfio.device_ref == 0) {
		/* VFIO requires at least one device (IOMMU group) to be added to
		 * a VFIO container before it is possible to perform any IOMMU
		 * operations on that container. This memory will be mapped once
		 * the first device (IOMMU group) is hotplugged.
		 *
		 * Since the vfio container is managed internally by DPDK, it is
		 * also possible that some device is already in that container, but
		 * it's not managed by SPDK - e.g. a NIC attached internally
		 * inside DPDK. We could map the memory straight away in such a
		 * scenario, but there's no need to do it. DPDK devices clearly
		 * don't need our mappings and hence we defer the mapping
		 * unconditionally until the first SPDK-managed device is
		 * hotplugged.
		 */
		goto out_insert;
	}

	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
	if (ret) {
		DEBUG_PRINT("Cannot set up DMA mapping, error %d\n", errno);
		pthread_mutex_unlock(&g_vfio.mutex);
		free(dma_map);
		return ret;
	}

out_insert:
	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
	pthread_mutex_unlock(&g_vfio.mutex);
	spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
	return 0;
}

static int
vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	uint64_t refcount;
	int ret;

	pthread_mutex_lock(&g_vfio.mutex);
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		if (dma_map->map.iova == iova) {
			break;
		}
	}

	if (dma_map == NULL) {
		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
		pthread_mutex_unlock(&g_vfio.mutex);
		return -ENXIO;
	}

	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
	assert(refcount < UINT64_MAX);
	if (refcount > 0) {
		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount - 1);
	}

	/* We still have outstanding references, so don't clear the mapping. */
	if (refcount > 1) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return 0;
	}

	/* We don't support partial or multiple-page unmap for now */
	assert(dma_map->map.size == size);

	if (g_vfio.device_ref == 0) {
		/* Memory is not mapped anymore, just remove its references */
		goto out_remove;
	}

	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
	if (ret) {
		DEBUG_PRINT("Cannot clear DMA mapping, error %d\n", errno);
		pthread_mutex_unlock(&g_vfio.mutex);
		return ret;
	}

out_remove:
	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
	pthread_mutex_unlock(&g_vfio.mutex);
	free(dma_map);
	return 0;
}
#endif

static uint64_t
vtophys_get_paddr_memseg(uint64_t vaddr)
{
	uintptr_t paddr;
	struct rte_memseg *seg;

#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
	seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
	if (seg != NULL) {
		paddr = seg->phys_addr;
		if (paddr == RTE_BAD_IOVA) {
			return SPDK_VTOPHYS_ERROR;
		}
		paddr += (vaddr - (uintptr_t)seg->addr);
		return paddr;
	}
#else
	struct rte_mem_config *mcfg;
	uint32_t seg_idx;

	mcfg = rte_eal_get_configuration()->mem_config;
	for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) {
		seg = &mcfg->memseg[seg_idx];
		if (seg->addr == NULL) {
			break;
		}

		if (vaddr >= (uintptr_t)seg->addr &&
		    vaddr < ((uintptr_t)seg->addr + seg->len)) {
			paddr = seg->phys_addr;
			if (paddr == RTE_BAD_IOVA) {
				return SPDK_VTOPHYS_ERROR;
			}
			paddr += (vaddr - (uintptr_t)seg->addr);
			return paddr;
		}
	}
#endif

	return SPDK_VTOPHYS_ERROR;
}

/* Try to get the paddr from /proc/self/pagemap */
static uint64_t
vtophys_get_paddr_pagemap(uint64_t vaddr)
{
	uintptr_t paddr;

	/* Silence static analyzers */
	assert(vaddr != 0);
	paddr = rte_mem_virt2iova((void *)vaddr);
	if (paddr == RTE_BAD_IOVA) {
		/*
		 * The vaddr may be valid but doesn't have a backing page
		 * assigned yet. Touch the page to ensure a backing page
		 * gets assigned, then try to translate again.
		 */
		rte_atomic64_read((rte_atomic64_t *)vaddr);
		paddr = rte_mem_virt2iova((void *)vaddr);
	}
	if (paddr == RTE_BAD_IOVA) {
		/* Unable to get the physical address. */
		return SPDK_VTOPHYS_ERROR;
	}

	return paddr;
}

/* Try to get the paddr from PCI devices */
static uint64_t
vtophys_get_paddr_pci(uint64_t vaddr)
{
	struct spdk_vtophys_pci_device *vtophys_dev;
	uintptr_t paddr;
	struct rte_pci_device *dev;
	struct rte_mem_resource *res;
	unsigned r;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		dev = vtophys_dev->pci_device;

		for (r = 0; r < PCI_MAX_RESOURCE; r++) {
			res = &dev->mem_resource[r];
			if (res->phys_addr && vaddr >= (uint64_t)res->addr &&
			    vaddr < (uint64_t)res->addr + res->len) {
				paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
				DEBUG_PRINT("%s: %p -> %p\n", __func__, (void *)vaddr,
					    (void *)paddr);
				pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
				return paddr;
			}
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

	return SPDK_VTOPHYS_ERROR;
}

static int
spdk_vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
		    enum spdk_mem_map_notify_action action,
		    void *vaddr, size_t len)
{
	int rc = 0, pci_phys = 0;
	uint64_t paddr;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid parameters, vaddr=%p len=%ju\n",
			    vaddr, len);
		return -EINVAL;
	}

	/* Get the physical address from the DPDK memsegs */
	paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);

	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		if (paddr == SPDK_VTOPHYS_ERROR) {
			/* This is not an address that DPDK is managing. */
#if SPDK_VFIO_ENABLED
			enum rte_iova_mode iova_mode;

#if RTE_VERSION >= RTE_VERSION_NUM(19, 11, 0, 0)
			iova_mode = rte_eal_iova_mode();
#else
			iova_mode = rte_eal_get_configuration()->iova_mode;
#endif

			if (spdk_iommu_is_enabled() && iova_mode == RTE_IOVA_VA) {
				/* We'll use the virtual address as the iova to match DPDK. */
				paddr = (uint64_t)vaddr;
				rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len);
				if (rc) {
					return -EFAULT;
				}
				while (len > 0) {
					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}
					vaddr += VALUE_2MB;
					paddr += VALUE_2MB;
					len -= VALUE_2MB;
				}
			} else
#endif
			{
				/* Get the physical address from /proc/self/pagemap. */
				paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
				if (paddr == SPDK_VTOPHYS_ERROR) {
					/* Get the physical address from PCI devices */
					paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}
					/* The beginning of this address range points to a PCI resource,
					 * so the rest must point to a PCI resource as well.
					 */
					pci_phys = 1;
				}

				/* Get paddr for each 2MB chunk in this address range */
				while (len > 0) {
					/* Get the physical address from the PCI BAR or from
					 * /proc/self/pagemap, as determined above.
					 */
					if (pci_phys) {
						paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
					} else {
						paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
					}

					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}

					/* Since a PCI paddr can break the 2MB physical alignment,
					 * skip this check for PCI-backed memory.
					 */
					if (!pci_phys && (paddr & MASK_2MB)) {
						DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
						return -EINVAL;
					}
#if SPDK_VFIO_ENABLED
					/* If the IOMMU is on, but DPDK is using iova-mode=pa, we want to
					 * register this memory with the IOMMU using the physical address
					 * to match.
					 */
					if (spdk_iommu_is_enabled()) {
						rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
						if (rc) {
							DEBUG_PRINT("Unable to assign vaddr %p to paddr 0x%" PRIx64 "\n", vaddr, paddr);
							return -EFAULT;
						}
					}
#endif

					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}

					vaddr += VALUE_2MB;
					len -= VALUE_2MB;
				}
			}
		} else {
			/* This is an address managed by DPDK. Just set up the translations. */
			while (len > 0) {
				paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
				if (paddr == SPDK_VTOPHYS_ERROR) {
					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
					return -EFAULT;
				}

				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
				if (rc != 0) {
					return rc;
				}

				vaddr += VALUE_2MB;
				len -= VALUE_2MB;
			}
		}

		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
#if SPDK_VFIO_ENABLED
		if (paddr == SPDK_VTOPHYS_ERROR) {
			/*
			 * This is not an address that DPDK is managing. If vfio is enabled,
			 * we need to unmap the range from the IOMMU.
			 */
			if (spdk_iommu_is_enabled()) {
				uint64_t buffer_len = len;
				uint8_t *va = vaddr;
				enum rte_iova_mode iova_mode;

#if RTE_VERSION >= RTE_VERSION_NUM(19, 11, 0, 0)
				iova_mode = rte_eal_iova_mode();
#else
				iova_mode = rte_eal_get_configuration()->iova_mode;
#endif
				/*
				 * In virtual address mode, the region is contiguous and can be done in
				 * one unmap.
				 */
				if (iova_mode == RTE_IOVA_VA) {
					paddr = spdk_mem_map_translate(map, (uint64_t)va, &buffer_len);
					if (buffer_len != len || paddr != (uintptr_t)va) {
						DEBUG_PRINT("Unmapping %p with length %lu failed because "
							    "translation had address 0x%" PRIx64 " and length %lu\n",
							    va, len, paddr, buffer_len);
						return -EINVAL;
					}
					rc = vtophys_iommu_unmap_dma(paddr, len);
					if (rc) {
						DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
						return -EFAULT;
					}
				} else if (iova_mode == RTE_IOVA_PA) {
					/* Get paddr for each 2MB chunk in this address range */
					while (buffer_len > 0) {
						paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);

						if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
							DEBUG_PRINT("could not get phys addr for %p\n", va);
							return -EFAULT;
						}

						rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
						if (rc) {
							DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
							return -EFAULT;
						}

						va += VALUE_2MB;
						buffer_len -= VALUE_2MB;
					}
				}
			}
		}
#endif
		while (len > 0) {
			rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
			if (rc != 0) {
				return rc;
			}

			vaddr += VALUE_2MB;
			len -= VALUE_2MB;
		}

		break;
	default:
		SPDK_UNREACHABLE();
	}

	return rc;
}

static int
vtophys_check_contiguous_entries(uint64_t paddr1, uint64_t paddr2)
{
	/* This function is always called with paddrs for two consecutive
	 * 2MB chunks in virtual address space, so those chunks will only be
	 * physically contiguous if the physical addresses are exactly 2MB
	 * apart from each other as well.
	 */
	return (paddr2 - paddr1 == VALUE_2MB);
}
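
/* Illustration of the check above (example values, not from the original
 * sources): vtophys_check_contiguous_entries(0x3c0200000, 0x3c0400000)
 * returns 1 because the second 2MB chunk starts exactly where the first one
 * ends, while vtophys_check_contiguous_entries(0x3c0200000, 0x7f0000000)
 * returns 0, so spdk_mem_map_translate() stops extending the reported size
 * at that point.
 */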

#if SPDK_VFIO_ENABLED

static bool
spdk_vfio_enabled(void)
{
	return rte_vfio_is_enabled("vfio_pci");
}

/* Check if IOMMU is enabled on the system */
static bool
has_iommu_groups(void)
{
	struct dirent *d;
	int count = 0;
	DIR *dir = opendir("/sys/kernel/iommu_groups");

	if (dir == NULL) {
		return false;
	}

	while (count < 3 && (d = readdir(dir)) != NULL) {
		count++;
	}

	closedir(dir);
	/* there will always be ./ and ../ entries */
	return count > 2;
}

static bool
spdk_vfio_noiommu_enabled(void)
{
	return rte_vfio_noiommu_is_enabled();
}

static void
spdk_vtophys_iommu_init(void)
{
	char proc_fd_path[PATH_MAX + 1];
	char link_path[PATH_MAX + 1];
	const char vfio_path[] = "/dev/vfio/vfio";
	DIR *dir;
	struct dirent *d;

	if (!spdk_vfio_enabled()) {
		return;
	}

	if (spdk_vfio_noiommu_enabled()) {
		g_vfio.noiommu_enabled = true;
	} else if (!has_iommu_groups()) {
		return;
	}

	dir = opendir("/proc/self/fd");
	if (!dir) {
		DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
		return;
	}

	while ((d = readdir(dir)) != NULL) {
		if (d->d_type != DT_LNK) {
			continue;
		}

		snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
		if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
			continue;
		}

		if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
			sscanf(d->d_name, "%d", &g_vfio.fd);
			break;
		}
	}

	closedir(dir);

	if (g_vfio.fd < 0) {
		DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
		return;
	}

	g_vfio.enabled = true;

	return;
}
#endif

void
spdk_vtophys_pci_device_added(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);

	vtophys_dev = calloc(1, sizeof(*vtophys_dev));
	if (vtophys_dev) {
		vtophys_dev->pci_device = pci_device;
		TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
	} else {
		DEBUG_PRINT("Memory allocation error\n");
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if SPDK_VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	g_vfio.device_ref++;
	if (g_vfio.device_ref > 1) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the first SPDK device using DPDK vfio. This means that the first
	 * IOMMU group might have just been added to the DPDK vfio container.
	 * From this point on it is certain that the memory can be mapped.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
		if (ret) {
			DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}

void
spdk_vtophys_pci_device_removed(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		if (vtophys_dev->pci_device == pci_device) {
			TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
			free(vtophys_dev);
			break;
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if SPDK_VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	assert(g_vfio.device_ref > 0);
	g_vfio.device_ref--;
	if (g_vfio.device_ref > 0) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
	 * any additional devices using its vfio container, all the mappings
	 * will be automatically removed by the Linux vfio driver. We unmap
	 * the memory manually to be able to easily re-map it later regardless
	 * of other, external factors.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
		if (ret) {
			DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}

int
spdk_vtophys_init(void)
{
	const struct spdk_mem_map_ops vtophys_map_ops = {
		.notify_cb = spdk_vtophys_notify,
		.are_contiguous = vtophys_check_contiguous_entries,
	};

	const struct spdk_mem_map_ops phys_ref_map_ops = {
		.notify_cb = NULL,
		.are_contiguous = NULL,
	};

#if SPDK_VFIO_ENABLED
	spdk_vtophys_iommu_init();
#endif

	g_phys_ref_map = spdk_mem_map_alloc(0, &phys_ref_map_ops, NULL);
	if (g_phys_ref_map == NULL) {
		DEBUG_PRINT("phys_ref map allocation failed.\n");
		return -ENOMEM;
	}

	g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
	if (g_vtophys_map == NULL) {
		DEBUG_PRINT("vtophys map allocation failed\n");
		return -ENOMEM;
	}
	return 0;
}

uint64_t
spdk_vtophys(void *buf, uint64_t *size)
{
	uint64_t vaddr, paddr_2mb;

	vaddr = (uint64_t)buf;
	paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);

	/*
	 * SPDK_VTOPHYS_ERROR has all bits set, so if the lookup returned SPDK_VTOPHYS_ERROR,
	 * bitwise-or'ing it with the buf offset would still yield SPDK_VTOPHYS_ERROR. However,
	 * now that we add the offset rather than or it in (because PCI vtophys results can be
	 * unaligned), we must check the return value before the addition.
	 */
	SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
	if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
		return SPDK_VTOPHYS_ERROR;
	} else {
		return paddr_2mb + (vaddr & MASK_2MB);
	}
}