/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "env_internal.h"

#include <rte_config.h>
#include <rte_eal_memconfig.h>

#include "spdk_internal/assert.h"
#include "spdk_internal/memory.h"

#include "spdk/assert.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/util.h"
#include "spdk/env_dpdk.h"

#ifdef __FreeBSD__
#define SPDK_VFIO_ENABLED 0
#else
#include <linux/version.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
#define SPDK_VFIO_ENABLED 1
#include <linux/vfio.h>
#include <rte_vfio.h>

struct spdk_vfio_dma_map {
	struct vfio_iommu_type1_dma_map map;
	struct vfio_iommu_type1_dma_unmap unmap;
	TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
};

struct vfio_cfg {
	int fd;
	bool enabled;
	bool noiommu_enabled;
	unsigned device_ref;
	TAILQ_HEAD(, spdk_vfio_dma_map) maps;
	pthread_mutex_t mutex;
};

static struct vfio_cfg g_vfio = {
	.fd = -1,
	.enabled = false,
	.noiommu_enabled = false,
	.device_ref = 0,
	.maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
	.mutex = PTHREAD_MUTEX_INITIALIZER
};

#else
#define SPDK_VFIO_ENABLED 0
#endif
#endif

#if DEBUG
#define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__)
#else
#define DEBUG_PRINT(...)
#endif

#define FN_2MB_TO_4KB(fn)	(fn << (SHIFT_2MB - SHIFT_4KB))
#define FN_4KB_TO_2MB(fn)	(fn >> (SHIFT_2MB - SHIFT_4KB))

#define MAP_256TB_IDX(vfn_2mb)	((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
#define MAP_1GB_IDX(vfn_2mb)	((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))
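
/* Worked example of the two-level lookup (illustrative only; assumes
 * SHIFT_2MB == 21 and SHIFT_1GB == 30 as provided by spdk_internal/memory.h):
 *
 *   vaddr     = 0x400200000                 (16 GiB + 2 MiB)
 *   vfn_2mb   = vaddr >> SHIFT_2MB          = 0x2001
 *   idx_256tb = MAP_256TB_IDX(vfn_2mb)      = 0x2001 >> 9    = 16
 *   idx_1gb   = MAP_1GB_IDX(vfn_2mb)        = 0x2001 & 0x1ff = 1
 *
 * i.e. the address falls into first-level slot 16 (the 1 GB region starting
 * at 16 GiB) and second-level entry 1 (the second 2 MB page in that region).
 */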

/* Page is registered */
#define REG_MAP_REGISTERED	(1ULL << 62)

/* A notification region barrier. The 2MB translation entry that's marked
 * with this flag must be unregistered separately. This allows contiguous
 * regions to be unregistered in the same chunks they were registered.
 */
#define REG_MAP_NOTIFY_START	(1ULL << 63)

/* Translation of a single 2MB page. */
struct map_2mb {
	uint64_t translation_2mb;
};

/* Second-level map table indexed by bits [21..29] of the virtual address.
 * Each entry contains the address translation, or the map's default
 * translation for entries that haven't been set yet.
 */
struct map_1gb {
	struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
};

/* Top-level map table indexed by bits [30..47] of the virtual address.
 * Each entry points to a second-level map table or NULL.
 */
struct map_256tb {
	struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
};

/* Page-granularity memory address translation */
struct spdk_mem_map {
	struct map_256tb map_256tb;
	pthread_mutex_t mutex;
	uint64_t default_translation;
	struct spdk_mem_map_ops ops;
	void *cb_ctx;
	TAILQ_ENTRY(spdk_mem_map) tailq;
};

/* Registrations map. The 64 bit translations are bit fields with the
 * following layout (starting with the low bits):
 *    0 - 61 : reserved
 *   62 - 63 : flags
 */
static struct spdk_mem_map *g_mem_reg_map;
static TAILQ_HEAD(, spdk_mem_map) g_spdk_mem_maps = TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;
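
/* In the registration map, the first 2 MB page of each region passed to
 * spdk_mem_register() is tagged REG_MAP_REGISTERED | REG_MAP_NOTIFY_START and
 * the remaining pages are tagged REG_MAP_REGISTERED only. The walk below and
 * spdk_mem_unregister() rely on this to split the registered address space
 * back into the same chunks that were originally registered.
 */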

/*
 * Walk the currently registered memory via the main memory registration map
 * and call the new map's notify callback for each virtually contiguous region.
 */
static int
spdk_mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
{
	size_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t contig_start = UINT64_MAX;
	uint64_t contig_end = UINT64_MAX;
	struct map_1gb *map_1gb;
	int rc;

	if (!g_mem_reg_map) {
		return -EINVAL;
	}

	/* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
	pthread_mutex_lock(&g_mem_reg_map->mutex);

	for (idx_256tb = 0;
	     idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
	     idx_256tb++) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_start != UINT64_MAX) {
				/* End of a virtually contiguous range */
				rc = map->ops.notify_cb(map->cb_ctx, map, action,
							(void *)contig_start,
							contig_end - contig_start + VALUE_2MB);
				/* Don't bother handling unregister failures. It can't be any worse */
				if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
					goto err_unregister;
				}
			}
			contig_start = UINT64_MAX;
			continue;
		}

		for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_start == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_start == UINT64_MAX) {
					contig_start = vaddr;
				}

				contig_end = vaddr;
			} else {
				if (contig_start != UINT64_MAX) {
					/* End of a virtually contiguous range */
					rc = map->ops.notify_cb(map->cb_ctx, map, action,
								(void *)contig_start,
								contig_end - contig_start + VALUE_2MB);
					/* Don't bother handling unregister failures. It can't be any worse */
					if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
						goto err_unregister;
					}

					/* This page might be a part of a neighbour region, so process
					 * it again. The idx_1gb will be incremented immediately.
					 */
					idx_1gb--;
				}
				contig_start = UINT64_MAX;
			}
		}
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return 0;

err_unregister:
	/* Unwind to the first empty translation so we don't unregister
	 * a region that just failed to register.
	 */
	idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
	idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
	contig_start = UINT64_MAX;
	contig_end = UINT64_MAX;

	/* Unregister any memory we managed to register before the failure */
	for (; idx_256tb < SIZE_MAX; idx_256tb--) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_end != UINT64_MAX) {
				/* End of a virtually contiguous range */
				map->ops.notify_cb(map->cb_ctx, map,
						   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
						   (void *)contig_start,
						   contig_end - contig_start + VALUE_2MB);
			}
			contig_end = UINT64_MAX;
			continue;
		}

		for (; idx_1gb < UINT64_MAX; idx_1gb--) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_end == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_end == UINT64_MAX) {
					contig_end = vaddr;
				}
				contig_start = vaddr;
			} else {
				if (contig_end != UINT64_MAX) {
					/* End of a virtually contiguous range */
					map->ops.notify_cb(map->cb_ctx, map,
							   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
							   (void *)contig_start,
							   contig_end - contig_start + VALUE_2MB);
					idx_1gb++;
				}
				contig_end = UINT64_MAX;
			}
		}
		idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return rc;
}

struct spdk_mem_map *
spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
{
	struct spdk_mem_map *map;
	int rc;

	map = calloc(1, sizeof(*map));
	if (map == NULL) {
		return NULL;
	}

	if (pthread_mutex_init(&map->mutex, NULL)) {
		free(map);
		return NULL;
	}

	map->default_translation = default_translation;
	map->cb_ctx = cb_ctx;
	if (ops) {
		map->ops = *ops;
	}

	if (ops && ops->notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		rc = spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			DEBUG_PRINT("Initial mem_map notify failed\n");
			pthread_mutex_destroy(&map->mutex);
			free(map);
			return NULL;
		}
		TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	return map;
}
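
/* Illustrative sketch of how a consumer typically creates a map (hypothetical
 * callback names, not part of this file). The notify callback runs for every
 * currently registered region when the map is allocated, and again for every
 * later spdk_mem_register()/spdk_mem_unregister() call:
 *
 *   static int
 *   my_notify(void *cb_ctx, struct spdk_mem_map *map,
 *             enum spdk_mem_map_notify_action action, void *vaddr, size_t len)
 *   {
 *           if (action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
 *                   return spdk_mem_map_set_translation(map, (uint64_t)vaddr, len,
 *                                                       (uint64_t)vaddr);
 *           }
 *           return spdk_mem_map_clear_translation(map, (uint64_t)vaddr, len);
 *   }
 *
 *   static const struct spdk_mem_map_ops my_ops = {
 *           .notify_cb = my_notify,
 *           .are_contiguous = NULL,
 *   };
 *
 *   struct spdk_mem_map *m = spdk_mem_map_alloc(0, &my_ops, NULL);
 */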

void
spdk_mem_map_free(struct spdk_mem_map **pmap)
{
	struct spdk_mem_map *map;
	size_t i;

	if (!pmap) {
		return;
	}

	map = *pmap;

	if (!map) {
		return;
	}

	if (map->ops.notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
		TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
		free(map->map_256tb.map[i]);
	}

	pthread_mutex_destroy(&map->mutex);

	free(map);
	*pmap = NULL;
}

int
spdk_mem_register(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	seg_vaddr = vaddr;
	seg_len = 0;
	while (len > 0) {
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
					     seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
		seg_len += VALUE_2MB;
		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return rc;
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}
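
/* Illustrative usage sketch (not part of this file): memory handed to
 * spdk_mem_register() must be 2 MB aligned and a multiple of 2 MB in size,
 * e.g. a hugepage-backed anonymous mapping:
 *
 *   size_t len = 4 * VALUE_2MB;
 *   void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *                    MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
 *   if (buf != MAP_FAILED && spdk_mem_register(buf, len) == 0) {
 *           ...                    use buf; it is visible to all mem maps
 *           spdk_mem_unregister(buf, len);
 *   }
 *
 * Unregistration must cover whole regions exactly as they were registered;
 * unregistering only part of a region returns -ERANGE (see below).
 */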

int
spdk_mem_unregister(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg, newreg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* The first page must be a start of a region. Also check if it's
	 * registered to make sure we don't return -ERANGE for non-registered
	 * regions.
	 */
	reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
	if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if ((reg & REG_MAP_REGISTERED) == 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EINVAL;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
	/* If the next page is registered, it must be a start of a region as well,
	 * otherwise we'd be unregistering only a part of a region.
	 */
	if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}
	seg_vaddr = vaddr;
	seg_len = 0;

	while (len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);

		if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
			TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
				rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
				if (rc != 0) {
					pthread_mutex_unlock(&g_spdk_mem_map_mutex);
					return rc;
				}
			}

			seg_vaddr = vaddr;
			seg_len = VALUE_2MB;
		} else {
			seg_len += VALUE_2MB;
		}

		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	if (seg_len > 0) {
		TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
			rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
			if (rc != 0) {
				pthread_mutex_unlock(&g_spdk_mem_map_mutex);
				return rc;
			}
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

static struct map_1gb *
spdk_mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
{
	struct map_1gb *map_1gb;
	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
	size_t i;

	if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
		return NULL;
	}

	map_1gb = map->map_256tb.map[idx_256tb];

	if (!map_1gb) {
		pthread_mutex_lock(&map->mutex);

		/* Recheck to make sure nobody else got the mutex first. */
		map_1gb = map->map_256tb.map[idx_256tb];
		if (!map_1gb) {
			map_1gb = malloc(sizeof(struct map_1gb));
			if (map_1gb) {
				/* initialize all entries to default translation */
				for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
					map_1gb->map[i].translation_2mb = map->default_translation;
				}
				map->map_256tb.map[idx_256tb] = map_1gb;
			}
		}

		pthread_mutex_unlock(&map->mutex);

		if (!map_1gb) {
			DEBUG_PRINT("allocation failed\n");
			return NULL;
		}
	}

	return map_1gb;
}

int
spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
			     uint64_t translation)
{
	uint64_t vfn_2mb;
	struct map_1gb *map_1gb;
	uint64_t idx_1gb;
	struct map_2mb *map_2mb;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr);
		return -EINVAL;
	}

	/* For now, only 2 MB-aligned registrations are supported */
	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n",
			    __func__, vaddr, size);
		return -EINVAL;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;

	while (size) {
		map_1gb = spdk_mem_map_get_map_1gb(map, vfn_2mb);
		if (!map_1gb) {
			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
			return -ENOMEM;
		}

		idx_1gb = MAP_1GB_IDX(vfn_2mb);
		map_2mb = &map_1gb->map[idx_1gb];
		map_2mb->translation_2mb = translation;

		size -= VALUE_2MB;
		vfn_2mb++;
	}

	return 0;
}

int
spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
{
	uint64_t vfn_2mb;
	struct map_1gb *map_1gb;
	uint64_t idx_1gb;
	struct map_2mb *map_2mb;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr);
		return -EINVAL;
	}

	/* For now, only 2 MB-aligned registrations are supported */
	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n",
			    __func__, vaddr, size);
		return -EINVAL;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;

	while (size) {
		map_1gb = spdk_mem_map_get_map_1gb(map, vfn_2mb);
		if (!map_1gb) {
			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
			return -ENOMEM;
		}

		idx_1gb = MAP_1GB_IDX(vfn_2mb);
		map_2mb = &map_1gb->map[idx_1gb];
		map_2mb->translation_2mb = map->default_translation;

		size -= VALUE_2MB;
		vfn_2mb++;
	}

	return 0;
}

inline uint64_t
spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
{
	const struct map_1gb *map_1gb;
	const struct map_2mb *map_2mb;
	uint64_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t vfn_2mb;
	uint64_t cur_size;
	uint64_t prev_translation;
	uint64_t orig_translation;

	if (spdk_unlikely(vaddr & ~MASK_256TB)) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
		return map->default_translation;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;
	idx_256tb = MAP_256TB_IDX(vfn_2mb);
	idx_1gb = MAP_1GB_IDX(vfn_2mb);

	map_1gb = map->map_256tb.map[idx_256tb];
	if (spdk_unlikely(!map_1gb)) {
		return map->default_translation;
	}

	cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
	map_2mb = &map_1gb->map[idx_1gb];
	if (size == NULL || map->ops.are_contiguous == NULL ||
	    map_2mb->translation_2mb == map->default_translation) {
		if (size != NULL) {
			*size = spdk_min(*size, cur_size);
		}
		return map_2mb->translation_2mb;
	}

	orig_translation = map_2mb->translation_2mb;
	prev_translation = orig_translation;
	while (cur_size < *size) {
		vfn_2mb++;
		idx_256tb = MAP_256TB_IDX(vfn_2mb);
		idx_1gb = MAP_1GB_IDX(vfn_2mb);

		map_1gb = map->map_256tb.map[idx_256tb];
		if (spdk_unlikely(!map_1gb)) {
			break;
		}

		map_2mb = &map_1gb->map[idx_1gb];
		if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
			break;
		}

		cur_size += VALUE_2MB;
		prev_translation = map_2mb->translation_2mb;
	}

	*size = spdk_min(*size, cur_size);
	return orig_translation;
}
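
/* Usage sketch for the size parameter above (illustrative; `map` and `buf`
 * are placeholders): *size is an in/out value. The caller passes the number
 * of bytes it wants to access starting at vaddr, and on return *size is
 * clamped to the length the returned translation actually covers - at most up
 * to the next 2 MB boundary, unless the map provides an are_contiguous()
 * callback that lets the lookup extend across neighbouring 2 MB entries.
 *
 *   uint64_t len = 4096;
 *   uint64_t translation = spdk_mem_map_translate(map, (uint64_t)buf, &len);
 *   use at most `len` bytes starting at buf with this translation
 */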

#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
static void
memory_hotplug_cb(enum rte_mem_event event_type,
		  const void *addr, size_t len, void *arg)
{
	if (event_type == RTE_MEM_EVENT_ALLOC) {
		spdk_mem_register((void *)addr, len);

#if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0)
		if (!spdk_env_dpdk_external_init()) {
			return;
		}
#endif

		/* Prior to DPDK 19.02, we have to worry about DPDK
		 * freeing memory in different units than it was allocated.
		 * That doesn't work with things like RDMA MRs. So for
		 * those versions of DPDK, mark each segment so that DPDK
		 * won't later free it. That ensures we don't have to deal
		 * with that scenario.
		 *
		 * DPDK 19.02 added the --match-allocations RTE flag to
		 * avoid this condition.
		 *
		 * Note: if the user initialized DPDK separately, we can't
		 * be sure that --match-allocations was specified, so need
		 * to still mark the segments so they aren't freed.
		 */
		while (len > 0) {
			struct rte_memseg *seg;

			seg = rte_mem_virt2memseg(addr, NULL);
			assert(seg != NULL);
			seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
			addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
			len -= seg->hugepage_sz;
		}
	} else if (event_type == RTE_MEM_EVENT_FREE) {
		spdk_mem_unregister((void *)addr, len);
	}
}

static int
memory_iter_cb(const struct rte_memseg_list *msl,
	       const struct rte_memseg *ms, size_t len, void *arg)
{
	return spdk_mem_register(ms->addr, len);
}
#endif

int
spdk_mem_map_init(void)
{
	g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
	if (g_mem_reg_map == NULL) {
		DEBUG_PRINT("memory registration map allocation failed\n");
		return -1;
	}

	/*
	 * Walk all DPDK memory segments and register them
	 * with the master memory map
	 */
#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
	rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
	rte_memseg_contig_walk(memory_iter_cb, NULL);
#else
	struct rte_mem_config *mcfg;
	size_t seg_idx;

	mcfg = rte_eal_get_configuration()->mem_config;
	for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) {
		struct rte_memseg *seg = &mcfg->memseg[seg_idx];

		if (seg->addr == NULL) {
			break;
		}

		spdk_mem_register(seg->addr, seg->len);
	}
#endif
	return 0;
}

bool
spdk_iommu_is_enabled(void)
{
#if SPDK_VFIO_ENABLED
	return g_vfio.enabled && !g_vfio.noiommu_enabled;
#else
	return false;
#endif
}

struct spdk_vtophys_pci_device {
	struct rte_pci_device *pci_device;
	TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
};

static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
	TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);

static struct spdk_mem_map *g_vtophys_map;

#if SPDK_VFIO_ENABLED
static int
vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	dma_map = calloc(1, sizeof(*dma_map));
	if (dma_map == NULL) {
		return -ENOMEM;
	}

	dma_map->map.argsz = sizeof(dma_map->map);
	dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	dma_map->map.vaddr = vaddr;
	dma_map->map.iova = iova;
	dma_map->map.size = size;

	dma_map->unmap.argsz = sizeof(dma_map->unmap);
	dma_map->unmap.flags = 0;
	dma_map->unmap.iova = iova;
	dma_map->unmap.size = size;

	pthread_mutex_lock(&g_vfio.mutex);
	if (g_vfio.device_ref == 0) {
		/* VFIO requires at least one device (IOMMU group) to be added to
		 * a VFIO container before it is possible to perform any IOMMU
		 * operations on that container. This memory will be mapped once
		 * the first device (IOMMU group) is hotplugged.
		 *
		 * Since the vfio container is managed internally by DPDK, it is
		 * also possible that some device is already in that container, but
		 * it's not managed by SPDK - e.g. a NIC attached internally
		 * inside DPDK. We could map the memory straight away in such
		 * scenario, but there's no need to do it. DPDK devices clearly
		 * don't need our mappings and hence we defer the mapping
		 * unconditionally until the first SPDK-managed device is
		 * hotplugged.
		 */
		goto out_insert;
	}

	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
	if (ret) {
		DEBUG_PRINT("Cannot set up DMA mapping, error %d\n", errno);
		pthread_mutex_unlock(&g_vfio.mutex);
		free(dma_map);
		return ret;
	}

out_insert:
	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
	pthread_mutex_unlock(&g_vfio.mutex);
	return 0;
}

static int
vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	pthread_mutex_lock(&g_vfio.mutex);
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		if (dma_map->map.iova == iova) {
			break;
		}
	}

	if (dma_map == NULL) {
		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
		pthread_mutex_unlock(&g_vfio.mutex);
		return -ENXIO;
	}

	/* don't support partial or multiple-page unmap for now */
	assert(dma_map->map.size == size);

	if (g_vfio.device_ref == 0) {
		/* Memory is not mapped anymore, just remove its references */
		goto out_remove;
	}

	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
	if (ret) {
		DEBUG_PRINT("Cannot clear DMA mapping, error %d\n", errno);
		pthread_mutex_unlock(&g_vfio.mutex);
		return ret;
	}

out_remove:
	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
	pthread_mutex_unlock(&g_vfio.mutex);
	free(dma_map);
	return 0;
}
#endif

static uint64_t
vtophys_get_paddr_memseg(uint64_t vaddr)
{
	uintptr_t paddr;
	struct rte_memseg *seg;

#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
	seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
	if (seg != NULL) {
		paddr = seg->phys_addr;
		if (paddr == RTE_BAD_IOVA) {
			return SPDK_VTOPHYS_ERROR;
		}
		paddr += (vaddr - (uintptr_t)seg->addr);
		return paddr;
	}
#else
	struct rte_mem_config *mcfg;
	uint32_t seg_idx;

	mcfg = rte_eal_get_configuration()->mem_config;
	for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) {
		seg = &mcfg->memseg[seg_idx];
		if (seg->addr == NULL) {
			break;
		}

		if (vaddr >= (uintptr_t)seg->addr &&
		    vaddr < ((uintptr_t)seg->addr + seg->len)) {
			paddr = seg->phys_addr;
			if (paddr == RTE_BAD_IOVA) {
				return SPDK_VTOPHYS_ERROR;
			}
			paddr += (vaddr - (uintptr_t)seg->addr);
			return paddr;
		}
	}
#endif

	return SPDK_VTOPHYS_ERROR;
}

/* Try to get the paddr from /proc/self/pagemap */
static uint64_t
vtophys_get_paddr_pagemap(uint64_t vaddr)
{
	uintptr_t paddr;

	paddr = rte_mem_virt2iova((void *)vaddr);
	if (paddr == RTE_BAD_IOVA) {
		/*
		 * The vaddr may be valid but doesn't have a backing page
		 * assigned yet. Touch the page to ensure a backing page
		 * gets assigned, then try to translate again.
		 */
		rte_atomic64_read((rte_atomic64_t *)vaddr);
		paddr = rte_mem_virt2iova((void *)vaddr);
	}
	if (paddr == RTE_BAD_IOVA) {
		/* Unable to get to the physical address. */
		return SPDK_VTOPHYS_ERROR;
	}

	return paddr;
}

/* Try to get the paddr from pci devices */
static uint64_t
vtophys_get_paddr_pci(uint64_t vaddr)
{
	struct spdk_vtophys_pci_device *vtophys_dev;
	uintptr_t paddr;
	struct rte_pci_device *dev;
	struct rte_mem_resource *res;
	unsigned r;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		dev = vtophys_dev->pci_device;

		for (r = 0; r < PCI_MAX_RESOURCE; r++) {
			res = &dev->mem_resource[r];
			if (res->phys_addr && vaddr >= (uint64_t)res->addr &&
			    vaddr < (uint64_t)res->addr + res->len) {
				paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
				DEBUG_PRINT("%s: %p -> %p\n", __func__, (void *)vaddr,
					    (void *)paddr);
				pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
				return paddr;
			}
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

	return SPDK_VTOPHYS_ERROR;
}
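
/* The register path below resolves a physical address (or IOVA) for each 2 MB
 * page in the following order: first the DPDK memseg lists
 * (vtophys_get_paddr_memseg); then, for memory that DPDK does not manage,
 * either an IOMMU mapping with the vaddr used as the IOVA (when vfio with an
 * IOMMU is active) or /proc/self/pagemap (vtophys_get_paddr_pagemap); and
 * finally the PCI BAR ranges of attached devices (vtophys_get_paddr_pci).
 */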

static int
spdk_vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
		    enum spdk_mem_map_notify_action action,
		    void *vaddr, size_t len)
{
	int rc = 0, pci_phys = 0;
	uint64_t paddr;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	while (len > 0) {
		/* Get the physical address from the DPDK memsegs */
		paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);

		switch (action) {
		case SPDK_MEM_MAP_NOTIFY_REGISTER:
			if (paddr == SPDK_VTOPHYS_ERROR) {
				/* This is not an address that DPDK is managing. */
#if SPDK_VFIO_ENABLED
				if (spdk_iommu_is_enabled()) {
					/* We'll use the virtual address as the iova. DPDK
					 * currently uses physical addresses as the iovas (or counts
					 * up from 0 if it can't get physical addresses), so
					 * the range of user space virtual addresses and physical
					 * addresses will never overlap.
					 */
					paddr = (uint64_t)vaddr;
					rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
					if (rc) {
						return -EFAULT;
					}
				} else
#endif
				{
					/* Get the physical address from /proc/self/pagemap. */
					paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
					if (paddr == SPDK_VTOPHYS_ERROR) {
						/* Get the physical address from PCI devices */
						paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
						if (paddr == SPDK_VTOPHYS_ERROR) {
							DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
							return -EFAULT;
						}
						pci_phys = 1;
					}
				}
			}
			/* Since a PCI BAR paddr may not be 2 MB aligned, skip the
			 * alignment check for PCI-resolved addresses.
			 */
			if (!pci_phys && (paddr & MASK_2MB)) {
				DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
				return -EINVAL;
			}

			rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
			break;
		case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
#if SPDK_VFIO_ENABLED
			if (paddr == SPDK_VTOPHYS_ERROR) {
				/*
				 * This is not an address that DPDK is managing. If vfio is enabled,
				 * we need to unmap the range from the IOMMU
				 */
				if (spdk_iommu_is_enabled()) {
					uint64_t buffer_len = VALUE_2MB;
					paddr = spdk_mem_map_translate(map, (uint64_t)vaddr, &buffer_len);
					if (buffer_len != VALUE_2MB) {
						return -EINVAL;
					}
					rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
					if (rc) {
						return -EFAULT;
					}
				}
			}
#endif
			rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
			break;
		default:
			SPDK_UNREACHABLE();
		}

		if (rc != 0) {
			return rc;
		}
		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	return rc;
}

#if SPDK_VFIO_ENABLED

static bool
spdk_vfio_enabled(void)
{
	return rte_vfio_is_enabled("vfio_pci");
}

/* Check if IOMMU is enabled on the system */
static bool
has_iommu_groups(void)
{
	struct dirent *d;
	int count = 0;
	DIR *dir = opendir("/sys/kernel/iommu_groups");

	if (dir == NULL) {
		return false;
	}

	while (count < 3 && (d = readdir(dir)) != NULL) {
		count++;
	}

	closedir(dir);
	/* there will always be ./ and ../ entries */
	return count > 2;
}

static bool
spdk_vfio_noiommu_enabled(void)
{
	return rte_vfio_noiommu_is_enabled();
}

static void
spdk_vtophys_iommu_init(void)
{
	char proc_fd_path[PATH_MAX + 1];
	char link_path[PATH_MAX + 1];
	const char vfio_path[] = "/dev/vfio/vfio";
	DIR *dir;
	struct dirent *d;

	if (!spdk_vfio_enabled()) {
		return;
	}

	if (spdk_vfio_noiommu_enabled()) {
		g_vfio.noiommu_enabled = true;
	} else if (!has_iommu_groups()) {
		return;
	}

	dir = opendir("/proc/self/fd");
	if (!dir) {
		DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
		return;
	}

	while ((d = readdir(dir)) != NULL) {
		if (d->d_type != DT_LNK) {
			continue;
		}

		snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
		if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
			continue;
		}

		if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
			sscanf(d->d_name, "%d", &g_vfio.fd);
			break;
		}
	}

	closedir(dir);

	if (g_vfio.fd < 0) {
		DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
		return;
	}

	g_vfio.enabled = true;

	return;
}
#endif

void
spdk_vtophys_pci_device_added(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);

	vtophys_dev = calloc(1, sizeof(*vtophys_dev));
	if (vtophys_dev) {
		vtophys_dev->pci_device = pci_device;
		TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
	} else {
		DEBUG_PRINT("Memory allocation error\n");
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if SPDK_VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	g_vfio.device_ref++;
	if (g_vfio.device_ref > 1) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the first SPDK device using DPDK vfio. This means that the first
	 * IOMMU group might have just been added to the DPDK vfio container.
	 * From this point it is certain that the memory can be mapped now.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
		if (ret) {
			DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}

void
spdk_vtophys_pci_device_removed(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		if (vtophys_dev->pci_device == pci_device) {
			TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
			free(vtophys_dev);
			break;
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if SPDK_VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	assert(g_vfio.device_ref > 0);
	g_vfio.device_ref--;
	if (g_vfio.device_ref > 0) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
	 * any additional devices using its vfio container, all the mappings
	 * will be automatically removed by the Linux vfio driver. We unmap
	 * the memory manually to be able to easily re-map it later regardless
	 * of other, external factors.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
		if (ret) {
			DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}

int
spdk_vtophys_init(void)
{
	const struct spdk_mem_map_ops vtophys_map_ops = {
		.notify_cb = spdk_vtophys_notify,
		.are_contiguous = NULL
	};

#if SPDK_VFIO_ENABLED
	spdk_vtophys_iommu_init();
#endif

	g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
	if (g_vtophys_map == NULL) {
		DEBUG_PRINT("vtophys map allocation failed\n");
		return -1;
	}
	return 0;
}

uint64_t
spdk_vtophys(void *buf, uint64_t *size)
{
	uint64_t vaddr, paddr_2mb;

	vaddr = (uint64_t)buf;
	paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);

	/*
	 * SPDK_VTOPHYS_ERROR has all bits set. When the translation was
	 * combined with the buffer offset using bitwise-or, a failed lookup
	 * still produced SPDK_VTOPHYS_ERROR. Now that we use + rather than |
	 * (because PCI vtophys results can be unaligned), the return value
	 * must be checked before the addition.
	 */
	SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
	if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
		return SPDK_VTOPHYS_ERROR;
	} else {
		return paddr_2mb + (vaddr & MASK_2MB);
	}
}