/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "env_internal.h"

#include <rte_config.h>
#include <rte_malloc.h>
#include <rte_memory.h>
#include <rte_eal_memconfig.h>

#include "spdk_internal/assert.h"

#include "spdk/assert.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/util.h"
#include "spdk/memory.h"
#include "spdk/env_dpdk.h"

#ifdef __FreeBSD__
#define SPDK_VFIO_ENABLED 0
#else
#include <linux/version.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
#define SPDK_VFIO_ENABLED 1
#include <linux/vfio.h>
#include <rte_vfio.h>

struct spdk_vfio_dma_map {
	struct vfio_iommu_type1_dma_map map;
	struct vfio_iommu_type1_dma_unmap unmap;
	TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
};

struct vfio_cfg {
	int fd;
	bool enabled;
	bool noiommu_enabled;
	unsigned device_ref;
	TAILQ_HEAD(, spdk_vfio_dma_map) maps;
	pthread_mutex_t mutex;
};

static struct vfio_cfg g_vfio = {
	.fd = -1,
	.enabled = false,
	.noiommu_enabled = false,
	.device_ref = 0,
	.maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
	.mutex = PTHREAD_MUTEX_INITIALIZER
};

#else
#define SPDK_VFIO_ENABLED 0
#endif
#endif

#if DEBUG
#define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__)
#else
#define DEBUG_PRINT(...)
#endif

#define FN_2MB_TO_4KB(fn)	(fn << (SHIFT_2MB - SHIFT_4KB))
#define FN_4KB_TO_2MB(fn)	(fn >> (SHIFT_2MB - SHIFT_4KB))

#define MAP_256TB_IDX(vfn_2mb)	((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
#define MAP_1GB_IDX(vfn_2mb)	((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))
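
/*
 * Illustrative sketch (not part of the build): how a virtual address is
 * decomposed by the macros above. For a hypothetical 2MB-aligned address
 * vaddr:
 *
 *	uint64_t vfn_2mb = vaddr >> SHIFT_2MB;        // 2MB virtual frame number
 *	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);  // selects a second-level table
 *	uint64_t idx_1gb = MAP_1GB_IDX(vfn_2mb);      // selects a 2MB entry within it
 *
 * i.e. bits [30..47] of the address pick the top-level slot and
 * bits [21..29] pick the 2MB entry inside that 1GB table.
 */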

/* Page is registered */
#define REG_MAP_REGISTERED	(1ULL << 62)

/* A notification region barrier. The 2MB translation entry that's marked
 * with this flag must be unregistered separately. This allows contiguous
 * regions to be unregistered in the same chunks they were registered.
 */
#define REG_MAP_NOTIFY_START	(1ULL << 63)

/* Translation of a single 2MB page. */
struct map_2mb {
	uint64_t translation_2mb;
};

/* Second-level map table indexed by bits [21..29] of the virtual address.
 * Each entry contains the address translation, or the map's default
 * translation for entries that haven't been set yet.
 */
struct map_1gb {
	struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
};

/* Top-level map table indexed by bits [30..47] of the virtual address.
 * Each entry points to a second-level map table or NULL.
 */
struct map_256tb {
	struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
};

/* Page-granularity memory address translation */
struct spdk_mem_map {
	struct map_256tb map_256tb;
	pthread_mutex_t mutex;
	uint64_t default_translation;
	struct spdk_mem_map_ops ops;
	void *cb_ctx;
	TAILQ_ENTRY(spdk_mem_map) tailq;
};

/* Registrations map. The 64-bit translations are bit fields with the
 * following layout (starting with the low bits):
 *    0 - 61 : reserved
 *   62 - 63 : flags
 */
static struct spdk_mem_map *g_mem_reg_map;
static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps =
	TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;

static bool g_legacy_mem;
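
/*
 * Illustrative sketch (not part of the build): how a registration is encoded
 * in g_mem_reg_map. Registering a hypothetical 6MB region at vaddr V stores
 * three 2MB translations:
 *
 *	V + 0MB: REG_MAP_REGISTERED | REG_MAP_NOTIFY_START
 *	V + 2MB: REG_MAP_REGISTERED
 *	V + 4MB: REG_MAP_REGISTERED
 *
 * Only the first page carries REG_MAP_NOTIFY_START, which is what lets
 * spdk_mem_unregister() refuse to split a region that was registered as a
 * single chunk.
 */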

/*
 * Walk the currently registered memory via the main memory registration map
 * and call the new map's notify callback for each virtually contiguous region.
 */
static int
spdk_mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
{
	size_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t contig_start = UINT64_MAX;
	uint64_t contig_end = UINT64_MAX;
	struct map_1gb *map_1gb;
	int rc;

	if (!g_mem_reg_map) {
		return -EINVAL;
	}

	/* Hold the memory registration map mutex so no new registrations can be
	 * added while we are looping.
	 */
	pthread_mutex_lock(&g_mem_reg_map->mutex);

	for (idx_256tb = 0;
	     idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
	     idx_256tb++) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_start != UINT64_MAX) {
				/* End of a virtually contiguous range */
				rc = map->ops.notify_cb(map->cb_ctx, map, action,
							(void *)contig_start,
							contig_end - contig_start + VALUE_2MB);
				/* Don't bother handling unregister failures. It can't get any worse. */
				if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
					goto err_unregister;
				}
			}
			contig_start = UINT64_MAX;
			continue;
		}

		for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_start == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_start == UINT64_MAX) {
					contig_start = vaddr;
				}

				contig_end = vaddr;
			} else {
				if (contig_start != UINT64_MAX) {
					/* End of a virtually contiguous range */
					rc = map->ops.notify_cb(map->cb_ctx, map, action,
								(void *)contig_start,
								contig_end - contig_start + VALUE_2MB);
					/* Don't bother handling unregister failures. It can't get any worse. */
					if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
						goto err_unregister;
					}

					/* This page might be part of a neighbouring region, so process
					 * it again. The idx_1gb will be incremented immediately.
					 */
					idx_1gb--;
				}
				contig_start = UINT64_MAX;
			}
		}
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return 0;

err_unregister:
	/* Unwind to the first empty translation so we don't unregister
	 * a region that just failed to register.
	 */
	idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
	idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
	contig_start = UINT64_MAX;
	contig_end = UINT64_MAX;

	/* Unregister any memory we managed to register before the failure */
	for (; idx_256tb < SIZE_MAX; idx_256tb--) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_end != UINT64_MAX) {
				/* End of a virtually contiguous range */
				map->ops.notify_cb(map->cb_ctx, map,
						   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
						   (void *)contig_start,
						   contig_end - contig_start + VALUE_2MB);
			}
			contig_end = UINT64_MAX;
			continue;
		}

		for (; idx_1gb < UINT64_MAX; idx_1gb--) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_end == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_end == UINT64_MAX) {
					contig_end = vaddr;
				}
				contig_start = vaddr;
			} else {
				if (contig_end != UINT64_MAX) {
					/* End of a virtually contiguous range */
					map->ops.notify_cb(map->cb_ctx, map,
							   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
							   (void *)contig_start,
							   contig_end - contig_start + VALUE_2MB);
					idx_1gb++;
				}
				contig_end = UINT64_MAX;
			}
		}
		idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return rc;
}

struct spdk_mem_map *
spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
{
	struct spdk_mem_map *map;
	int rc;

	map = calloc(1, sizeof(*map));
	if (map == NULL) {
		return NULL;
	}

	if (pthread_mutex_init(&map->mutex, NULL)) {
		free(map);
		return NULL;
	}

	map->default_translation = default_translation;
	map->cb_ctx = cb_ctx;
	if (ops) {
		map->ops = *ops;
	}

	if (ops && ops->notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		rc = spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			DEBUG_PRINT("Initial mem_map notify failed\n");
			pthread_mutex_destroy(&map->mutex);
			free(map);
			return NULL;
		}
		TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	return map;
}

void
spdk_mem_map_free(struct spdk_mem_map **pmap)
{
	struct spdk_mem_map *map;
	size_t i;

	if (!pmap) {
		return;
	}

	map = *pmap;

	if (!map) {
		return;
	}

	if (map->ops.notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
		TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
		if (g_legacy_mem) {
			rte_free(map->map_256tb.map[i]);
		} else {
			free(map->map_256tb.map[i]);
		}
	}

	pthread_mutex_destroy(&map->mutex);

	free(map);
	*pmap = NULL;
}

int
spdk_mem_register(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	seg_vaddr = vaddr;
	seg_len = 0;
	while (len > 0) {
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
					     seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
		seg_len += VALUE_2MB;
		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return rc;
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

int
spdk_mem_unregister(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg, newreg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* The first page must be a start of a region. Also check if it's
	 * registered to make sure we don't return -ERANGE for non-registered
	 * regions.
	 */
	reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
	if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if ((reg & REG_MAP_REGISTERED) == 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EINVAL;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
	/* If the next page is registered, it must be a start of a region as well,
	 * otherwise we'd be unregistering only a part of a region.
	 */
	if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}
	seg_vaddr = vaddr;
	seg_len = 0;

	while (len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);

		if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
			TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
				rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
				if (rc != 0) {
					pthread_mutex_unlock(&g_spdk_mem_map_mutex);
					return rc;
				}
			}

			seg_vaddr = vaddr;
			seg_len = VALUE_2MB;
		} else {
			seg_len += VALUE_2MB;
		}

		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	if (seg_len > 0) {
		TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
			rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
			if (rc != 0) {
				pthread_mutex_unlock(&g_spdk_mem_map_mutex);
				return rc;
			}
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}
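
/*
 * Illustrative sketch (not part of the build): registering a hypothetical,
 * externally allocated 2MB-aligned buffer with SPDK. Both the address and
 * the length must be multiples of VALUE_2MB, and the region can only be
 * unregistered along the boundaries it was registered with (a partial
 * unregistration is rejected with -ERANGE).
 *
 *	void *buf = aligned_alloc(VALUE_2MB, 4 * VALUE_2MB);
 *
 *	if (buf && spdk_mem_register(buf, 4 * VALUE_2MB) == 0) {
 *		... use buf, e.g. translate it with spdk_vtophys() ...
 *		spdk_mem_unregister(buf, 4 * VALUE_2MB);
 *	}
 */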

static struct map_1gb *
spdk_mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
{
	struct map_1gb *map_1gb;
	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
	size_t i;

	if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
		return NULL;
	}

	map_1gb = map->map_256tb.map[idx_256tb];

	if (!map_1gb) {
		pthread_mutex_lock(&map->mutex);

		/* Recheck to make sure nobody else got the mutex first. */
		map_1gb = map->map_256tb.map[idx_256tb];
		if (!map_1gb) {
			/* Some of the existing apps use the TCMalloc hugepage
			 * allocator and register that tcmalloc-allocated
			 * hugepage memory with SPDK in the mmap hook. Since
			 * this function is called in the spdk_mem_register
			 * code path, we can't do a malloc here, otherwise that
			 * would cause a livelock. So we use the DPDK-provided
			 * allocator instead, which avoids this cyclic
			 * dependency. Note this is only guaranteed to work when
			 * DPDK dynamic memory allocation is disabled (--legacy-mem),
			 * which is therefore a requirement for anyone using TCMalloc
			 * in this way.
			 */
			if (g_legacy_mem) {
				map_1gb = rte_malloc(NULL, sizeof(struct map_1gb), 0);
			} else {
				map_1gb = malloc(sizeof(struct map_1gb));
			}
			if (map_1gb) {
				/* initialize all entries to default translation */
				for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
					map_1gb->map[i].translation_2mb = map->default_translation;
				}
				map->map_256tb.map[idx_256tb] = map_1gb;
			}
		}

		pthread_mutex_unlock(&map->mutex);

		if (!map_1gb) {
			DEBUG_PRINT("allocation failed\n");
			return NULL;
		}
	}

	return map_1gb;
}

int
spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
			     uint64_t translation)
{
	uint64_t vfn_2mb;
	struct map_1gb *map_1gb;
	uint64_t idx_1gb;
	struct map_2mb *map_2mb;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr);
		return -EINVAL;
	}

	/* For now, only 2 MB-aligned registrations are supported */
	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n",
			    __func__, vaddr, size);
		return -EINVAL;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;

	while (size) {
		map_1gb = spdk_mem_map_get_map_1gb(map, vfn_2mb);
		if (!map_1gb) {
			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
			return -ENOMEM;
		}

		idx_1gb = MAP_1GB_IDX(vfn_2mb);
		map_2mb = &map_1gb->map[idx_1gb];
		map_2mb->translation_2mb = translation;

		size -= VALUE_2MB;
		vfn_2mb++;
	}

	return 0;
}

int
spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
{
	return spdk_mem_map_set_translation(map, vaddr, size, map->default_translation);
}

inline uint64_t
spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
{
	const struct map_1gb *map_1gb;
	const struct map_2mb *map_2mb;
	uint64_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t vfn_2mb;
	uint64_t cur_size;
	uint64_t prev_translation;
	uint64_t orig_translation;

	if (spdk_unlikely(vaddr & ~MASK_256TB)) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
		return map->default_translation;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;
	idx_256tb = MAP_256TB_IDX(vfn_2mb);
	idx_1gb = MAP_1GB_IDX(vfn_2mb);

	map_1gb = map->map_256tb.map[idx_256tb];
	if (spdk_unlikely(!map_1gb)) {
		return map->default_translation;
	}

	cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
	map_2mb = &map_1gb->map[idx_1gb];
	if (size == NULL || map->ops.are_contiguous == NULL ||
	    map_2mb->translation_2mb == map->default_translation) {
		if (size != NULL) {
			*size = spdk_min(*size, cur_size);
		}
		return map_2mb->translation_2mb;
	}

	orig_translation = map_2mb->translation_2mb;
	prev_translation = orig_translation;
	while (cur_size < *size) {
		vfn_2mb++;
		idx_256tb = MAP_256TB_IDX(vfn_2mb);
		idx_1gb = MAP_1GB_IDX(vfn_2mb);

		map_1gb = map->map_256tb.map[idx_256tb];
		if (spdk_unlikely(!map_1gb)) {
			break;
		}

		map_2mb = &map_1gb->map[idx_1gb];
		if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
			break;
		}

		cur_size += VALUE_2MB;
		prev_translation = map_2mb->translation_2mb;
	}

	*size = spdk_min(*size, cur_size);
	return orig_translation;
}
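
/*
 * Illustrative sketch (not part of the build): a hypothetical consumer of the
 * mem_map API above. A map is created with a default translation and optional
 * callbacks, 2MB-granularity translations are installed, and lookups return
 * either the installed value or the default. The names my_ops, my_vaddr and
 * my_translation are placeholders.
 *
 *	static const struct spdk_mem_map_ops my_ops = {
 *		.notify_cb = NULL,
 *		.are_contiguous = NULL,
 *	};
 *
 *	struct spdk_mem_map *m = spdk_mem_map_alloc(UINT64_MAX, &my_ops, NULL);
 *
 *	spdk_mem_map_set_translation(m, my_vaddr, VALUE_2MB, my_translation);
 *	uint64_t t = spdk_mem_map_translate(m, my_vaddr, NULL); // == my_translation
 *
 *	spdk_mem_map_free(&m);
 */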

#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
static void
memory_hotplug_cb(enum rte_mem_event event_type,
		  const void *addr, size_t len, void *arg)
{
	if (event_type == RTE_MEM_EVENT_ALLOC) {
		spdk_mem_register((void *)addr, len);

#if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0)
		if (!spdk_env_dpdk_external_init()) {
			return;
		}
#endif

		/* Prior to DPDK 19.02, we have to worry about DPDK
		 * freeing memory in different units than it was allocated.
		 * That doesn't work with things like RDMA MRs. So for
		 * those versions of DPDK, mark each segment so that DPDK
		 * won't later free it. That ensures we don't have to deal
		 * with that scenario.
		 *
		 * DPDK 19.02 added the --match-allocations RTE flag to
		 * avoid this condition.
		 *
		 * Note: if the user initialized DPDK separately, we can't
		 * be sure that --match-allocations was specified, so we still
		 * need to mark the segments so they aren't freed.
		 */
		while (len > 0) {
			struct rte_memseg *seg;

			seg = rte_mem_virt2memseg(addr, NULL);
			assert(seg != NULL);
			seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
			addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
			len -= seg->hugepage_sz;
		}
	} else if (event_type == RTE_MEM_EVENT_FREE) {
		spdk_mem_unregister((void *)addr, len);
	}
}

static int
memory_iter_cb(const struct rte_memseg_list *msl,
	       const struct rte_memseg *ms, size_t len, void *arg)
{
	return spdk_mem_register(ms->addr, len);
}
#endif

int
spdk_mem_map_init(bool legacy_mem)
{
	g_legacy_mem = legacy_mem;

	g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
	if (g_mem_reg_map == NULL) {
		DEBUG_PRINT("memory registration map allocation failed\n");
		return -ENOMEM;
	}

	/*
	 * Walk all DPDK memory segments and register them
	 * with the master memory map
	 */
#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
	rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
	rte_memseg_contig_walk(memory_iter_cb, NULL);
#else
	struct rte_mem_config *mcfg;
	size_t seg_idx;

	mcfg = rte_eal_get_configuration()->mem_config;
	for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) {
		struct rte_memseg *seg = &mcfg->memseg[seg_idx];

		if (seg->addr == NULL) {
			break;
		}

		spdk_mem_register(seg->addr, seg->len);
	}
#endif
	return 0;
}

bool
spdk_iommu_is_enabled(void)
{
#if SPDK_VFIO_ENABLED
	return g_vfio.enabled && !g_vfio.noiommu_enabled;
#else
	return false;
#endif
}

struct spdk_vtophys_pci_device {
	struct rte_pci_device *pci_device;
	TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
};

static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
	TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);

static struct spdk_mem_map *g_vtophys_map;
static struct spdk_mem_map *g_phys_ref_map;

#if SPDK_VFIO_ENABLED
static int
vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	uint64_t refcount;
	int ret;

	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
	assert(refcount < UINT64_MAX);
	if (refcount > 0) {
		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
		return 0;
	}

	dma_map = calloc(1, sizeof(*dma_map));
	if (dma_map == NULL) {
		return -ENOMEM;
	}

	dma_map->map.argsz = sizeof(dma_map->map);
	dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	dma_map->map.vaddr = vaddr;
	dma_map->map.iova = iova;
	dma_map->map.size = size;

	dma_map->unmap.argsz = sizeof(dma_map->unmap);
	dma_map->unmap.flags = 0;
	dma_map->unmap.iova = iova;
	dma_map->unmap.size = size;

	pthread_mutex_lock(&g_vfio.mutex);
	if (g_vfio.device_ref == 0) {
		/* VFIO requires at least one device (IOMMU group) to be added to
		 * a VFIO container before it is possible to perform any IOMMU
		 * operations on that container. This memory will be mapped once
		 * the first device (IOMMU group) is hotplugged.
		 *
		 * Since the vfio container is managed internally by DPDK, it is
		 * also possible that some device is already in that container, but
		 * it's not managed by SPDK - e.g. a NIC attached internally
		 * inside DPDK. We could map the memory straight away in such a
		 * scenario, but there's no need to do it. DPDK devices clearly
		 * don't need our mappings and hence we defer the mapping
		 * unconditionally until the first SPDK-managed device is
		 * hotplugged.
		 */
		goto out_insert;
	}

	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
	if (ret) {
		DEBUG_PRINT("Cannot set up DMA mapping, error %d\n", errno);
		pthread_mutex_unlock(&g_vfio.mutex);
		free(dma_map);
		return ret;
	}

out_insert:
	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
	pthread_mutex_unlock(&g_vfio.mutex);
	spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
	return 0;
}
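
/*
 * Illustrative sketch (not part of the build): the reference counting in
 * g_phys_ref_map means that mapping the same IOVA twice only issues one
 * VFIO_IOMMU_MAP_DMA ioctl, and the mapping stays in place until the final
 * unmap. For a hypothetical vaddr V and IOVA I of size VALUE_2MB (assuming an
 * SPDK-managed device is already attached):
 *
 *	vtophys_iommu_map_dma(V, I, VALUE_2MB);    // refcount 0 -> 1, ioctl issued
 *	vtophys_iommu_map_dma(V, I, VALUE_2MB);    // refcount 1 -> 2, no ioctl
 *	vtophys_iommu_unmap_dma(I, VALUE_2MB);     // refcount 2 -> 1, no ioctl
 *	vtophys_iommu_unmap_dma(I, VALUE_2MB);     // refcount 1 -> 0, ioctl issued
 */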

static int
vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	uint64_t refcount;
	int ret;

	pthread_mutex_lock(&g_vfio.mutex);
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		if (dma_map->map.iova == iova) {
			break;
		}
	}

	if (dma_map == NULL) {
		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
		pthread_mutex_unlock(&g_vfio.mutex);
		return -ENXIO;
	}

	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
	assert(refcount < UINT64_MAX);
	if (refcount > 0) {
		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount - 1);
	}

	/* We still have outstanding references, don't clear it. */
	if (refcount > 1) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return 0;
	}

	/** don't support partial or multiple-page unmap for now */
	assert(dma_map->map.size == size);

	if (g_vfio.device_ref == 0) {
		/* Memory is not mapped anymore, just remove its references */
		goto out_remove;
	}

	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
	if (ret) {
		DEBUG_PRINT("Cannot clear DMA mapping, error %d\n", errno);
		pthread_mutex_unlock(&g_vfio.mutex);
		return ret;
	}

out_remove:
	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
	pthread_mutex_unlock(&g_vfio.mutex);
	free(dma_map);
	return 0;
}
#endif

static uint64_t
vtophys_get_paddr_memseg(uint64_t vaddr)
{
	uintptr_t paddr;
	struct rte_memseg *seg;

#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
	seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
	if (seg != NULL) {
		paddr = seg->phys_addr;
		if (paddr == RTE_BAD_IOVA) {
			return SPDK_VTOPHYS_ERROR;
		}
		paddr += (vaddr - (uintptr_t)seg->addr);
		return paddr;
	}
#else
	struct rte_mem_config *mcfg;
	uint32_t seg_idx;

	mcfg = rte_eal_get_configuration()->mem_config;
	for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) {
		seg = &mcfg->memseg[seg_idx];
		if (seg->addr == NULL) {
			break;
		}

		if (vaddr >= (uintptr_t)seg->addr &&
		    vaddr < ((uintptr_t)seg->addr + seg->len)) {
			paddr = seg->phys_addr;
			if (paddr == RTE_BAD_IOVA) {
				return SPDK_VTOPHYS_ERROR;
			}
			paddr += (vaddr - (uintptr_t)seg->addr);
			return paddr;
		}
	}
#endif

	return SPDK_VTOPHYS_ERROR;
}

/* Try to get the paddr from /proc/self/pagemap */
static uint64_t
vtophys_get_paddr_pagemap(uint64_t vaddr)
{
	uintptr_t paddr;

	/* Silence static analyzers */
	assert(vaddr != 0);
	paddr = rte_mem_virt2iova((void *)vaddr);
	if (paddr == RTE_BAD_IOVA) {
		/*
		 * The vaddr may be valid but doesn't have a backing page
		 * assigned yet. Touch the page to ensure a backing page
		 * gets assigned, then try to translate again.
		 */
		rte_atomic64_read((rte_atomic64_t *)vaddr);
		paddr = rte_mem_virt2iova((void *)vaddr);
	}
	if (paddr == RTE_BAD_IOVA) {
		/* Unable to get to the physical address. */
		return SPDK_VTOPHYS_ERROR;
	}

	return paddr;
}

/* Try to get the paddr from pci devices */
static uint64_t
vtophys_get_paddr_pci(uint64_t vaddr)
{
	struct spdk_vtophys_pci_device *vtophys_dev;
	uintptr_t paddr;
	struct rte_pci_device *dev;
	struct rte_mem_resource *res;
	unsigned r;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		dev = vtophys_dev->pci_device;

		for (r = 0; r < PCI_MAX_RESOURCE; r++) {
			res = &dev->mem_resource[r];
			if (res->phys_addr && vaddr >= (uint64_t)res->addr &&
			    vaddr < (uint64_t)res->addr + res->len) {
				paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
				DEBUG_PRINT("%s: %p -> %p\n", __func__, (void *)vaddr,
					    (void *)paddr);
				pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
				return paddr;
			}
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

	return SPDK_VTOPHYS_ERROR;
}

static int
spdk_vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
		    enum spdk_mem_map_notify_action action,
		    void *vaddr, size_t len)
{
	int rc = 0, pci_phys = 0;
	uint64_t paddr;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid parameters, vaddr=%p len=%ju\n",
			    vaddr, len);
		return -EINVAL;
	}

	/* Get the physical address from the DPDK memsegs */
	paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);

	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		if (paddr == SPDK_VTOPHYS_ERROR) {
			/* This is not an address that DPDK is managing. */
#if SPDK_VFIO_ENABLED
			enum rte_iova_mode iova_mode;

#if RTE_VERSION >= RTE_VERSION_NUM(19, 11, 0, 0)
			iova_mode = rte_eal_iova_mode();
#else
			iova_mode = rte_eal_get_configuration()->iova_mode;
#endif

			if (spdk_iommu_is_enabled() && iova_mode == RTE_IOVA_VA) {
				/* We'll use the virtual address as the iova to match DPDK. */
				paddr = (uint64_t)vaddr;
				rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len);
				if (rc) {
					return -EFAULT;
				}
				while (len > 0) {
					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}
					vaddr += VALUE_2MB;
					paddr += VALUE_2MB;
					len -= VALUE_2MB;
				}
			} else
#endif
			{
				/* Get the physical address from /proc/self/pagemap. */
				paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
				if (paddr == SPDK_VTOPHYS_ERROR) {
					/* Get the physical address from PCI devices */
					paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}
					/* The beginning of this address range points to a PCI resource,
					 * so the rest must point to a PCI resource as well.
					 */
					pci_phys = 1;
				}

				/* Get paddr for each 2MB chunk in this address range */
				while (len > 0) {
					/* Get the physical address from /proc/self/pagemap. */
					if (pci_phys) {
						paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
					} else {
						paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
					}

					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}

					/* Since a PCI paddr may break the 2MB physical alignment,
					 * skip this check for PCI memory.
					 */
					if (!pci_phys && (paddr & MASK_2MB)) {
						DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
						return -EINVAL;
					}
#if SPDK_VFIO_ENABLED
					/* If the IOMMU is on, but DPDK is using iova-mode=pa, we want to
					 * register this memory with the IOMMU using the physical address
					 * to match. */
					if (spdk_iommu_is_enabled()) {
						rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
						if (rc) {
							DEBUG_PRINT("Unable to assign vaddr %p to paddr 0x%" PRIx64 "\n", vaddr, paddr);
							return -EFAULT;
						}
					}
#endif

					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}

					vaddr += VALUE_2MB;
					len -= VALUE_2MB;
				}
			}
		} else {
			/* This is an address managed by DPDK. Just set up the translations. */
			while (len > 0) {
				paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
				if (paddr == SPDK_VTOPHYS_ERROR) {
					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
					return -EFAULT;
				}

				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
				if (rc != 0) {
					return rc;
				}

				vaddr += VALUE_2MB;
				len -= VALUE_2MB;
			}
		}

		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
#if SPDK_VFIO_ENABLED
		if (paddr == SPDK_VTOPHYS_ERROR) {
			/*
			 * This is not an address that DPDK is managing. If vfio is enabled,
			 * we need to unmap the range from the IOMMU.
			 */
			if (spdk_iommu_is_enabled()) {
				uint64_t buffer_len = len;
				uint8_t *va = vaddr;
				enum rte_iova_mode iova_mode;

#if RTE_VERSION >= RTE_VERSION_NUM(19, 11, 0, 0)
				iova_mode = rte_eal_iova_mode();
#else
				iova_mode = rte_eal_get_configuration()->iova_mode;
#endif
				/*
				 * In virtual address mode, the region is contiguous and can be
				 * unmapped in a single operation.
				 */
				if (iova_mode == RTE_IOVA_VA) {
					paddr = spdk_mem_map_translate(map, (uint64_t)va, &buffer_len);
					if (buffer_len != len || paddr != (uintptr_t)va) {
						DEBUG_PRINT("Unmapping %p with length %lu failed because "
							    "translation had address 0x%" PRIx64 " and length %lu\n",
							    va, len, paddr, buffer_len);
						return -EINVAL;
					}
					rc = vtophys_iommu_unmap_dma(paddr, len);
					if (rc) {
						DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
						return -EFAULT;
					}
				} else if (iova_mode == RTE_IOVA_PA) {
					/* Get paddr for each 2MB chunk in this address range */
					while (buffer_len > 0) {
						paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);

						if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
							DEBUG_PRINT("could not get phys addr for %p\n", va);
							return -EFAULT;
						}

						rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
						if (rc) {
							DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
							return -EFAULT;
						}

						va += VALUE_2MB;
						buffer_len -= VALUE_2MB;
					}
				}
			}
		}
#endif
		while (len > 0) {
			rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
			if (rc != 0) {
				return rc;
			}

			vaddr += VALUE_2MB;
			len -= VALUE_2MB;
		}

		break;
	default:
		SPDK_UNREACHABLE();
	}

	return rc;
}

static int
vtophys_check_contiguous_entries(uint64_t paddr1, uint64_t paddr2)
{
	/* This function is always called with paddrs for two consecutive
	 * 2MB chunks in virtual address space, so those chunks will only be
	 * physically contiguous if the physical addresses are exactly 2MB
	 * apart from each other as well.
	 */
	return (paddr2 - paddr1 == VALUE_2MB);
}
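
/*
 * Illustrative sketch (not part of the build): with hypothetical physical
 * addresses, the check above behaves as follows.
 *
 *	vtophys_check_contiguous_entries(0x200000, 0x400000);   // 1: exactly 2MB apart
 *	vtophys_check_contiguous_entries(0x200000, 0x600000);   // 0: gap in physical memory
 *
 * spdk_mem_map_translate() uses this callback to coalesce adjacent 2MB
 * translations into one larger contiguous region when the caller passes a
 * size hint.
 */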

#if SPDK_VFIO_ENABLED

static bool
spdk_vfio_enabled(void)
{
	return rte_vfio_is_enabled("vfio_pci");
}

/* Check if IOMMU is enabled on the system */
static bool
has_iommu_groups(void)
{
	struct dirent *d;
	int count = 0;
	DIR *dir = opendir("/sys/kernel/iommu_groups");

	if (dir == NULL) {
		return false;
	}

	while (count < 3 && (d = readdir(dir)) != NULL) {
		count++;
	}

	closedir(dir);
	/* there will always be ./ and ../ entries */
	return count > 2;
}

static bool
spdk_vfio_noiommu_enabled(void)
{
	return rte_vfio_noiommu_is_enabled();
}

static void
spdk_vtophys_iommu_init(void)
{
	char proc_fd_path[PATH_MAX + 1];
	char link_path[PATH_MAX + 1];
	const char vfio_path[] = "/dev/vfio/vfio";
	DIR *dir;
	struct dirent *d;

	if (!spdk_vfio_enabled()) {
		return;
	}

	if (spdk_vfio_noiommu_enabled()) {
		g_vfio.noiommu_enabled = true;
	} else if (!has_iommu_groups()) {
		return;
	}

	dir = opendir("/proc/self/fd");
	if (!dir) {
		DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
		return;
	}

	while ((d = readdir(dir)) != NULL) {
		if (d->d_type != DT_LNK) {
			continue;
		}

		snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
		if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
			continue;
		}

		if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
			sscanf(d->d_name, "%d", &g_vfio.fd);
			break;
		}
	}

	closedir(dir);

	if (g_vfio.fd < 0) {
		DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
		return;
	}

	g_vfio.enabled = true;

	return;
}
#endif

void
spdk_vtophys_pci_device_added(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);

	vtophys_dev = calloc(1, sizeof(*vtophys_dev));
	if (vtophys_dev) {
		vtophys_dev->pci_device = pci_device;
		TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
	} else {
		DEBUG_PRINT("Memory allocation error\n");
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if SPDK_VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	g_vfio.device_ref++;
	if (g_vfio.device_ref > 1) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the first SPDK device using DPDK vfio. This means that the first
	 * IOMMU group might have just been added to the DPDK vfio container.
	 * From this point it is certain that the memory can be mapped now.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
		if (ret) {
			DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}

void
spdk_vtophys_pci_device_removed(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		if (vtophys_dev->pci_device == pci_device) {
			TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
			free(vtophys_dev);
			break;
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if SPDK_VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	assert(g_vfio.device_ref > 0);
	g_vfio.device_ref--;
	if (g_vfio.device_ref > 0) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
	 * any additional devices using its vfio container, all the mappings
	 * will be automatically removed by the Linux vfio driver. We unmap
	 * the memory manually to be able to easily re-map it later regardless
	 * of other, external factors.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
		if (ret) {
			DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}

int
spdk_vtophys_init(void)
{
	const struct spdk_mem_map_ops vtophys_map_ops = {
		.notify_cb = spdk_vtophys_notify,
		.are_contiguous = vtophys_check_contiguous_entries,
	};

	const struct spdk_mem_map_ops phys_ref_map_ops = {
		.notify_cb = NULL,
		.are_contiguous = NULL,
	};

#if SPDK_VFIO_ENABLED
	spdk_vtophys_iommu_init();
#endif

	g_phys_ref_map = spdk_mem_map_alloc(0, &phys_ref_map_ops, NULL);
	if (g_phys_ref_map == NULL) {
		DEBUG_PRINT("phys_ref map allocation failed.\n");
		return -ENOMEM;
	}

	g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
	if (g_vtophys_map == NULL) {
		DEBUG_PRINT("vtophys map allocation failed\n");
		return -ENOMEM;
	}
	return 0;
}

uint64_t
spdk_vtophys(void *buf, uint64_t *size)
{
	uint64_t vaddr, paddr_2mb;

	vaddr = (uint64_t)buf;
	paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);

	/*
	 * SPDK_VTOPHYS_ERROR has all bits set, so if the lookup returned SPDK_VTOPHYS_ERROR,
	 * we could still bitwise-or it with the buf offset below and the result would still be
	 * SPDK_VTOPHYS_ERROR. However, now that we do + rather than | (due to PCI vtophys being
	 * unaligned), we must check the return value before the addition.
	 */
	SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
	if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
		return SPDK_VTOPHYS_ERROR;
	} else {
		return paddr_2mb + (vaddr & MASK_2MB);
	}
}