/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2017 Intel Corporation.
 * All rights reserved.
 */

#include "spdk/stdinc.h"

#include "env_internal.h"
#include "pci_dpdk.h"

#include <rte_config.h>
#include <rte_memory.h>
#include <rte_eal_memconfig.h>
#include <rte_dev.h>
#include <rte_pci.h>

#include "spdk_internal/assert.h"

#include "spdk/assert.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/util.h"
#include "spdk/memory.h"
#include "spdk/env_dpdk.h"
#include "spdk/log.h"

#ifdef __linux__
#include <linux/version.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
#include <linux/vfio.h>
#include <rte_vfio.h>

struct spdk_vfio_dma_map {
	struct vfio_iommu_type1_dma_map map;
	TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
};

struct vfio_cfg {
	int fd;
	bool enabled;
	bool noiommu_enabled;
	unsigned device_ref;
	TAILQ_HEAD(, spdk_vfio_dma_map) maps;
	pthread_mutex_t mutex;
};

static struct vfio_cfg g_vfio = {
	.fd = -1,
	.enabled = false,
	.noiommu_enabled = false,
	.device_ref = 0,
	.maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
	.mutex = PTHREAD_MUTEX_INITIALIZER
};
#endif
#endif

#if DEBUG
#define DEBUG_PRINT(...) SPDK_ERRLOG(__VA_ARGS__)
#else
#define DEBUG_PRINT(...)
#endif

#define FN_2MB_TO_4KB(fn)	(fn << (SHIFT_2MB - SHIFT_4KB))
#define FN_4KB_TO_2MB(fn)	(fn >> (SHIFT_2MB - SHIFT_4KB))

#define MAP_256TB_IDX(vfn_2mb)	((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
#define MAP_1GB_IDX(vfn_2mb)	((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))

/* Page is registered */
#define REG_MAP_REGISTERED	(1ULL << 62)

/* A notification region barrier. The 2MB translation entry that's marked
 * with this flag must be unregistered separately. This allows contiguous
 * regions to be unregistered in the same chunks they were registered.
 */
#define REG_MAP_NOTIFY_START	(1ULL << 63)
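/* Worked example of the index math (illustrative only; the address below is
 * hypothetical). With SHIFT_2MB = 21 and SHIFT_1GB = 30, for
 * vaddr = 0x7f0040200000:
 *
 *	vfn_2mb = vaddr >> SHIFT_2MB             = 0x3f80201
 *	MAP_256TB_IDX(vfn_2mb) = vfn_2mb >> 9    = 0x1fc01
 *	MAP_1GB_IDX(vfn_2mb)   = vfn_2mb & 0x1ff = 0x1
 *
 * i.e. this 2MB page is entry 1 of the second-level table stored in top-level
 * slot 0x1fc01. In the registration map, the translation value itself only
 * carries the REG_MAP_* flags above.
 */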
766bcd9295SDarek Stojaczyk */ 776bcd9295SDarek Stojaczyk #define REG_MAP_NOTIFY_START (1ULL << 63) 786bcd9295SDarek Stojaczyk 791d24e67dSBen Walker /* Translation of a single 2MB page. */ 801d24e67dSBen Walker struct map_2mb { 811d24e67dSBen Walker uint64_t translation_2mb; 821d24e67dSBen Walker }; 831d24e67dSBen Walker 841d24e67dSBen Walker /* Second-level map table indexed by bits [21..29] of the virtual address. 851d24e67dSBen Walker * Each entry contains the address translation or error for entries that haven't 861d24e67dSBen Walker * been retrieved yet. 871d24e67dSBen Walker */ 881d24e67dSBen Walker struct map_1gb { 897ac8b609SDaniel Verkamp struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)]; 901d24e67dSBen Walker }; 911d24e67dSBen Walker 9218f80e3aSJim Harris /* Top-level map table indexed by bits [30..47] of the virtual address. 931d24e67dSBen Walker * Each entry points to a second-level map table or NULL. 941d24e67dSBen Walker */ 9533e88d79SJim Harris struct map_256tb { 967ac8b609SDaniel Verkamp struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)]; 971d24e67dSBen Walker }; 981d24e67dSBen Walker 991d24e67dSBen Walker /* Page-granularity memory address translation */ 1001d24e67dSBen Walker struct spdk_mem_map { 10133e88d79SJim Harris struct map_256tb map_256tb; 1021d24e67dSBen Walker pthread_mutex_t mutex; 1031d24e67dSBen Walker uint64_t default_translation; 1041ee27f79SSeth Howell struct spdk_mem_map_ops ops; 1051d24e67dSBen Walker void *cb_ctx; 1061d24e67dSBen Walker TAILQ_ENTRY(spdk_mem_map) tailq; 1071d24e67dSBen Walker }; 1081d24e67dSBen Walker 1096bcd9295SDarek Stojaczyk /* Registrations map. The 64 bit translations are bit fields with the 1106bcd9295SDarek Stojaczyk * following layout (starting with the low bits): 111a33e0943SDarek Stojaczyk * 0 - 61 : reserved 112a33e0943SDarek Stojaczyk * 62 - 63 : flags 1136bcd9295SDarek Stojaczyk */ 1141d24e67dSBen Walker static struct spdk_mem_map *g_mem_reg_map; 115dcac8e97SDarek Stojaczyk static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps = 116dcac8e97SDarek Stojaczyk TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps); 1171d24e67dSBen Walker static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER; 1181d24e67dSBen Walker 119396c445cSJim Harris static bool g_legacy_mem; 120a6658c54SSarvesh Lanke static bool g_huge_pages = true; 121396c445cSJim Harris 1221d24e67dSBen Walker /* 1231d24e67dSBen Walker * Walk the currently registered memory via the main memory registration map 1241d24e67dSBen Walker * and call the new map's notify callback for each virtually contiguous region. 1251d24e67dSBen Walker */ 126803449caSDarek Stojaczyk static int 1273456377bSSeth Howell mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action) 1281d24e67dSBen Walker { 12933e88d79SJim Harris size_t idx_256tb; 130803449caSDarek Stojaczyk uint64_t idx_1gb; 131cb9f0f33SDarek Stojaczyk uint64_t contig_start = UINT64_MAX; 132cb9f0f33SDarek Stojaczyk uint64_t contig_end = UINT64_MAX; 133803449caSDarek Stojaczyk struct map_1gb *map_1gb; 134803449caSDarek Stojaczyk int rc; 1351d24e67dSBen Walker 1361d24e67dSBen Walker if (!g_mem_reg_map) { 137803449caSDarek Stojaczyk return -EINVAL; 1381d24e67dSBen Walker } 1391d24e67dSBen Walker 1401d24e67dSBen Walker /* Hold the memory registration map mutex so no new registrations can be added while we are looping. 
/* Registrations map. The 64 bit translations are bit fields with the
 * following layout (starting with the low bits):
 * 0 - 61 : reserved
 * 62 - 63 : flags
 */
static struct spdk_mem_map *g_mem_reg_map;
static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps =
	TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;

static bool g_legacy_mem;
static bool g_huge_pages = true;

/*
 * Walk the currently registered memory via the main memory registration map
 * and call the new map's notify callback for each virtually contiguous region.
 */
static int
mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
{
	size_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t contig_start = UINT64_MAX;
	uint64_t contig_end = UINT64_MAX;
	struct map_1gb *map_1gb;
	int rc;

	if (!g_mem_reg_map) {
		return -EINVAL;
	}

	/* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
	pthread_mutex_lock(&g_mem_reg_map->mutex);

	for (idx_256tb = 0;
	     idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
	     idx_256tb++) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_start != UINT64_MAX) {
				/* End of a virtually contiguous range */
				rc = map->ops.notify_cb(map->cb_ctx, map, action,
							(void *)contig_start,
							contig_end - contig_start + VALUE_2MB);
				/* Don't bother handling unregister failures. It can't be any worse */
				if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
					goto err_unregister;
				}
			}
			contig_start = UINT64_MAX;
			continue;
		}

		for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_start == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_start == UINT64_MAX) {
					contig_start = vaddr;
				}

				contig_end = vaddr;
			} else {
				if (contig_start != UINT64_MAX) {
					/* End of a virtually contiguous range */
					rc = map->ops.notify_cb(map->cb_ctx, map, action,
								(void *)contig_start,
								contig_end - contig_start + VALUE_2MB);
					/* Don't bother handling unregister failures. It can't be any worse */
					if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
						goto err_unregister;
					}

					/* This page might be a part of a neighbouring region, so process
					 * it again. The idx_1gb will be incremented immediately.
					 */
					idx_1gb--;
				}
				contig_start = UINT64_MAX;
			}
		}
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return 0;

err_unregister:
	/* Unwind to the first empty translation so we don't unregister
	 * a region that just failed to register.
	 */
	idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
	idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
	contig_start = UINT64_MAX;
	contig_end = UINT64_MAX;

	/* Unregister any memory we managed to register before the failure */
	for (; idx_256tb < SIZE_MAX; idx_256tb--) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_end != UINT64_MAX) {
				/* End of a virtually contiguous range */
				map->ops.notify_cb(map->cb_ctx, map,
						   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
						   (void *)contig_start,
						   contig_end - contig_start + VALUE_2MB);
			}
			contig_end = UINT64_MAX;
			continue;
		}

		for (; idx_1gb < UINT64_MAX; idx_1gb--) {
			/* Rebuild the virtual address from the indexes */
			uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {

				if (contig_end == UINT64_MAX) {
					contig_end = vaddr;
				}
				contig_start = vaddr;
			} else {
				if (contig_end != UINT64_MAX) {
					if (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) {
						contig_start = vaddr;
					}
					/* End of a virtually contiguous range */
					map->ops.notify_cb(map->cb_ctx, map,
							   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
							   (void *)contig_start,
							   contig_end - contig_start + VALUE_2MB);
				}
				contig_end = UINT64_MAX;
			}
		}
		idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return rc;
}
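/* Illustration of the walk above (hypothetical layout): if pages
 * 0x200000-0x5fffff were registered in one spdk_mem_register() call and
 * 0x600000-0x7fffff in another, the second region's first page carries
 * REG_MAP_NOTIFY_START, so a new map's notify_cb is invoked twice on
 * REGISTER - once with (0x200000, 4MB) and once with (0x600000, 2MB) -
 * mirroring the original registration chunks.
 */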
202803449caSDarek Stojaczyk */ 203803449caSDarek Stojaczyk idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1); 204803449caSDarek Stojaczyk idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1); 205cb9f0f33SDarek Stojaczyk contig_start = UINT64_MAX; 206cb9f0f33SDarek Stojaczyk contig_end = UINT64_MAX; 207803449caSDarek Stojaczyk 208803449caSDarek Stojaczyk /* Unregister any memory we managed to register before the failure */ 209803449caSDarek Stojaczyk for (; idx_256tb < SIZE_MAX; idx_256tb--) { 210803449caSDarek Stojaczyk map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb]; 211803449caSDarek Stojaczyk 212803449caSDarek Stojaczyk if (!map_1gb) { 213cb9f0f33SDarek Stojaczyk if (contig_end != UINT64_MAX) { 214803449caSDarek Stojaczyk /* End of of a virtually contiguous range */ 215803449caSDarek Stojaczyk map->ops.notify_cb(map->cb_ctx, map, 216803449caSDarek Stojaczyk SPDK_MEM_MAP_NOTIFY_UNREGISTER, 217803449caSDarek Stojaczyk (void *)contig_start, 218803449caSDarek Stojaczyk contig_end - contig_start + VALUE_2MB); 219803449caSDarek Stojaczyk } 220cb9f0f33SDarek Stojaczyk contig_end = UINT64_MAX; 221803449caSDarek Stojaczyk continue; 222803449caSDarek Stojaczyk } 223803449caSDarek Stojaczyk 224803449caSDarek Stojaczyk for (; idx_1gb < UINT64_MAX; idx_1gb--) { 225803449caSDarek Stojaczyk /* Rebuild the virtual address from the indexes */ 226803449caSDarek Stojaczyk uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB); 227c8837711SAviv Ben-David if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) && 228c8837711SAviv Ben-David (contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) { 229803449caSDarek Stojaczyk 230cb9f0f33SDarek Stojaczyk if (contig_end == UINT64_MAX) { 231803449caSDarek Stojaczyk contig_end = vaddr; 232803449caSDarek Stojaczyk } 233803449caSDarek Stojaczyk contig_start = vaddr; 234803449caSDarek Stojaczyk } else { 235cb9f0f33SDarek Stojaczyk if (contig_end != UINT64_MAX) { 236c8837711SAviv Ben-David if (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) { 237c8837711SAviv Ben-David contig_start = vaddr; 238c8837711SAviv Ben-David } 239803449caSDarek Stojaczyk /* End of of a virtually contiguous range */ 240803449caSDarek Stojaczyk map->ops.notify_cb(map->cb_ctx, map, 241803449caSDarek Stojaczyk SPDK_MEM_MAP_NOTIFY_UNREGISTER, 242803449caSDarek Stojaczyk (void *)contig_start, 243803449caSDarek Stojaczyk contig_end - contig_start + VALUE_2MB); 244803449caSDarek Stojaczyk } 245cb9f0f33SDarek Stojaczyk contig_end = UINT64_MAX; 246803449caSDarek Stojaczyk } 247803449caSDarek Stojaczyk } 248803449caSDarek Stojaczyk idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1; 249803449caSDarek Stojaczyk } 250803449caSDarek Stojaczyk 251803449caSDarek Stojaczyk pthread_mutex_unlock(&g_mem_reg_map->mutex); 252803449caSDarek Stojaczyk return rc; 2531d24e67dSBen Walker } 2541d24e67dSBen Walker 2551d24e67dSBen Walker struct spdk_mem_map * 2564e06bb5eSSeth Howell spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx) 2571d24e67dSBen Walker { 2581d24e67dSBen Walker struct spdk_mem_map *map; 259803449caSDarek Stojaczyk int rc; 260c8837711SAviv Ben-David size_t i; 2611d24e67dSBen Walker 2621d24e67dSBen Walker map = calloc(1, sizeof(*map)); 2631d24e67dSBen Walker if (map == NULL) { 2641d24e67dSBen Walker return NULL; 2651d24e67dSBen Walker } 2661d24e67dSBen Walker 2671d24e67dSBen Walker if (pthread_mutex_init(&map->mutex, NULL)) { 2681d24e67dSBen Walker 
void
spdk_mem_map_free(struct spdk_mem_map **pmap)
{
	struct spdk_mem_map *map;
	size_t i;

	if (!pmap) {
		return;
	}

	map = *pmap;

	if (!map) {
		return;
	}

	if (map->ops.notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
		TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
		free(map->map_256tb.map[i]);
	}

	pthread_mutex_destroy(&map->mutex);

	free(map);
	*pmap = NULL;
}
int
spdk_mem_register(void *_vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	uint64_t vaddr = (uintptr_t)_vaddr;
	uint64_t seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %jx\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%jx len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	seg_vaddr = vaddr;
	seg_len = 0;
	while (len > 0) {
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
					     seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
		seg_len += VALUE_2MB;
		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER,
					(void *)seg_vaddr, seg_len);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return rc;
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}
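/* For example (hypothetical buffer), spdk_mem_register(buf, 6MB) stamps the
 * three 2MB pages of buf in g_mem_reg_map as:
 *
 *	page 0: REG_MAP_REGISTERED | REG_MAP_NOTIFY_START
 *	page 1: REG_MAP_REGISTERED
 *	page 2: REG_MAP_REGISTERED
 *
 * The NOTIFY_START flag on the first page is what later forces
 * spdk_mem_unregister() to be called on the same 6MB boundary.
 */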
41908039550SDarek Stojaczyk */ 42008039550SDarek Stojaczyk reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL); 42108039550SDarek Stojaczyk if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) { 42208039550SDarek Stojaczyk pthread_mutex_unlock(&g_spdk_mem_map_mutex); 42308039550SDarek Stojaczyk return -ERANGE; 42408039550SDarek Stojaczyk } 42508039550SDarek Stojaczyk 4261d24e67dSBen Walker seg_vaddr = vaddr; 4271d24e67dSBen Walker seg_len = len; 4281d24e67dSBen Walker while (seg_len > 0) { 4296bcd9295SDarek Stojaczyk reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL); 430a33e0943SDarek Stojaczyk if ((reg & REG_MAP_REGISTERED) == 0) { 4311d24e67dSBen Walker pthread_mutex_unlock(&g_spdk_mem_map_mutex); 4321d24e67dSBen Walker return -EINVAL; 4331d24e67dSBen Walker } 4341d24e67dSBen Walker seg_vaddr += VALUE_2MB; 4351d24e67dSBen Walker seg_len -= VALUE_2MB; 4361d24e67dSBen Walker } 4371d24e67dSBen Walker 43808039550SDarek Stojaczyk newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL); 43908039550SDarek Stojaczyk /* If the next page is registered, it must be a start of a region as well, 44008039550SDarek Stojaczyk * otherwise we'd be unregistering only a part of a region. 44108039550SDarek Stojaczyk */ 44208039550SDarek Stojaczyk if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) { 44308039550SDarek Stojaczyk pthread_mutex_unlock(&g_spdk_mem_map_mutex); 44408039550SDarek Stojaczyk return -ERANGE; 44508039550SDarek Stojaczyk } 4461d24e67dSBen Walker seg_vaddr = vaddr; 4471d24e67dSBen Walker seg_len = 0; 44808039550SDarek Stojaczyk 4491d24e67dSBen Walker while (len > 0) { 4506bcd9295SDarek Stojaczyk reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL); 451a33e0943SDarek Stojaczyk spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0); 4521d24e67dSBen Walker 453a33e0943SDarek Stojaczyk if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) { 454dcac8e97SDarek Stojaczyk TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) { 45580b71d70SJim Harris rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, 45680b71d70SJim Harris (void *)seg_vaddr, seg_len); 4571d24e67dSBen Walker if (rc != 0) { 4581d24e67dSBen Walker pthread_mutex_unlock(&g_spdk_mem_map_mutex); 4591d24e67dSBen Walker return rc; 4601d24e67dSBen Walker } 4611d24e67dSBen Walker } 4621d24e67dSBen Walker 4633372a72cSDarek Stojaczyk seg_vaddr = vaddr; 4643372a72cSDarek Stojaczyk seg_len = VALUE_2MB; 4651d24e67dSBen Walker } else { 4661d24e67dSBen Walker seg_len += VALUE_2MB; 4671d24e67dSBen Walker } 4681d24e67dSBen Walker 4691d24e67dSBen Walker vaddr += VALUE_2MB; 4701d24e67dSBen Walker len -= VALUE_2MB; 4711d24e67dSBen Walker } 4721d24e67dSBen Walker 4731d24e67dSBen Walker if (seg_len > 0) { 474dcac8e97SDarek Stojaczyk TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) { 47580b71d70SJim Harris rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, 47680b71d70SJim Harris (void *)seg_vaddr, seg_len); 4771d24e67dSBen Walker if (rc != 0) { 4781d24e67dSBen Walker pthread_mutex_unlock(&g_spdk_mem_map_mutex); 4791d24e67dSBen Walker return rc; 4801d24e67dSBen Walker } 4811d24e67dSBen Walker } 4821d24e67dSBen Walker } 4831d24e67dSBen Walker 4841d24e67dSBen Walker pthread_mutex_unlock(&g_spdk_mem_map_mutex); 4851d24e67dSBen Walker return 0; 4861d24e67dSBen Walker } 4871d24e67dSBen Walker 488cf450c0dSBen Walker int 489cf450c0dSBen Walker 
int
spdk_mem_reserve(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* Check if any part of this range is already registered */
	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	/* Simply set the translation to the memory map's default. This allocates the space in the
	 * map but does not provide a valid translation. */
	spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, len,
				     g_mem_reg_map->default_translation);

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, map->default_translation);
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}
static struct map_1gb *
mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
{
	struct map_1gb *map_1gb;
	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
	size_t i;

	if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
		return NULL;
	}

	map_1gb = map->map_256tb.map[idx_256tb];

	if (!map_1gb) {
		pthread_mutex_lock(&map->mutex);

		/* Recheck to make sure nobody else got the mutex first. */
		map_1gb = map->map_256tb.map[idx_256tb];
		if (!map_1gb) {
			map_1gb = malloc(sizeof(struct map_1gb));
			if (map_1gb) {
				/* initialize all entries to default translation */
				for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
					map_1gb->map[i].translation_2mb = map->default_translation;
				}
				map->map_256tb.map[idx_256tb] = map_1gb;
			}
		}

		pthread_mutex_unlock(&map->mutex);

		if (!map_1gb) {
			DEBUG_PRINT("allocation failed\n");
			return NULL;
		}
	}

	return map_1gb;
}
int
spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
			     uint64_t translation)
{
	uint64_t vfn_2mb;
	struct map_1gb *map_1gb;
	uint64_t idx_1gb;
	struct map_2mb *map_2mb;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %" PRIu64 "\n", vaddr);
		return -EINVAL;
	}

	/* For now, only 2 MB-aligned registrations are supported */
	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%" PRIu64 " len=%" PRIu64 "\n",
			    __func__, vaddr, size);
		return -EINVAL;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;

	while (size) {
		map_1gb = mem_map_get_map_1gb(map, vfn_2mb);
		if (!map_1gb) {
			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
			return -ENOMEM;
		}

		idx_1gb = MAP_1GB_IDX(vfn_2mb);
		map_2mb = &map_1gb->map[idx_1gb];
		map_2mb->translation_2mb = translation;

		size -= VALUE_2MB;
		vfn_2mb++;
	}

	return 0;
}

int
spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
{
	return spdk_mem_map_set_translation(map, vaddr, size, map->default_translation);
}
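/* Usage sketch (illustrative; vaddr and paddr are hypothetical, both
 * 2MB-aligned). A vtophys-style map records one translation per 2MB page and
 * later resets it to the map's default:
 *
 *	spdk_mem_map_set_translation(map, vaddr, VALUE_2MB, paddr);
 *	...
 *	spdk_mem_map_clear_translation(map, vaddr, VALUE_2MB);
 *
 * Note that the same call with size = 4MB would store the *same* translation
 * value in both 2MB entries; callers that need per-page values (as
 * vtophys_notify below does) must loop in VALUE_2MB steps themselves.
 */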
inline uint64_t
spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
{
	const struct map_1gb *map_1gb;
	const struct map_2mb *map_2mb;
	uint64_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t vfn_2mb;
	uint64_t cur_size;
	uint64_t prev_translation;
	uint64_t orig_translation;

	if (spdk_unlikely(vaddr & ~MASK_256TB)) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
		return map->default_translation;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;
	idx_256tb = MAP_256TB_IDX(vfn_2mb);
	idx_1gb = MAP_1GB_IDX(vfn_2mb);

	map_1gb = map->map_256tb.map[idx_256tb];
	if (spdk_unlikely(!map_1gb)) {
		return map->default_translation;
	}

	cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
	map_2mb = &map_1gb->map[idx_1gb];
	if (size == NULL || map->ops.are_contiguous == NULL ||
	    map_2mb->translation_2mb == map->default_translation) {
		if (size != NULL) {
			*size = spdk_min(*size, cur_size);
		}
		return map_2mb->translation_2mb;
	}

	orig_translation = map_2mb->translation_2mb;
	prev_translation = orig_translation;
	while (cur_size < *size) {
		vfn_2mb++;
		idx_256tb = MAP_256TB_IDX(vfn_2mb);
		idx_1gb = MAP_1GB_IDX(vfn_2mb);

		map_1gb = map->map_256tb.map[idx_256tb];
		if (spdk_unlikely(!map_1gb)) {
			break;
		}

		map_2mb = &map_1gb->map[idx_1gb];
		if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
			break;
		}

		cur_size += VALUE_2MB;
		prev_translation = map_2mb->translation_2mb;
	}

	*size = spdk_min(*size, cur_size);
	return orig_translation;
}
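/* Example of the in/out size parameter (hypothetical translations): with an
 * are_contiguous callback installed and three consecutive 2MB pages mapped to
 * physically contiguous addresses, a caller asking
 *
 *	uint64_t size = 8 * 1024 * 1024;
 *	paddr = spdk_mem_map_translate(map, vaddr, &size);
 *
 * gets the first page's translation back and size clamped to 6MB (assuming
 * vaddr is 2MB-aligned and the fourth page is not contiguous), i.e. the
 * longest safely usable run starting at vaddr.
 */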
static void
memory_hotplug_cb(enum rte_mem_event event_type,
		  const void *addr, size_t len, void *arg)
{
	if (event_type == RTE_MEM_EVENT_ALLOC) {
		spdk_mem_register((void *)addr, len);

		if (!spdk_env_dpdk_external_init()) {
			return;
		}

		/* When the user initialized DPDK separately, we can't
		 * be sure that the --match-allocations RTE flag was specified.
		 * Without this flag, DPDK can free memory in different units
		 * than it was allocated. It doesn't work with things like RDMA MRs.
		 *
		 * For such cases, we mark segments so they aren't freed.
		 */
		while (len > 0) {
			struct rte_memseg *seg;

			seg = rte_mem_virt2memseg(addr, NULL);
			assert(seg != NULL);
			seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
			addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
			len -= seg->hugepage_sz;
		}
	} else if (event_type == RTE_MEM_EVENT_FREE) {
		spdk_mem_unregister((void *)addr, len);
	}
}

static int
memory_iter_cb(const struct rte_memseg_list *msl,
	       const struct rte_memseg *ms, size_t len, void *arg)
{
	return spdk_mem_register(ms->addr, len);
}
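/* For instance (hypothetical event), a 6MB RTE_MEM_EVENT_ALLOC backed by 2MB
 * hugepages registers the whole range once and then, when DPDK was
 * initialized externally, walks it in hugepage_sz (2MB) steps marking all
 * three memsegs RTE_MEMSEG_FLAG_DO_NOT_FREE, so DPDK never returns them to
 * the kernel in units that would invalidate existing registrations.
 */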
int
mem_map_init(bool legacy_mem)
{
	g_legacy_mem = legacy_mem;

	g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
	if (g_mem_reg_map == NULL) {
		DEBUG_PRINT("memory registration map allocation failed\n");
		return -ENOMEM;
	}

	/*
	 * Walk all DPDK memory segments and register them
	 * with the main memory map
	 */
	if (g_huge_pages) {
		rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
		rte_memseg_contig_walk(memory_iter_cb, NULL);
	}
	return 0;
}

bool
spdk_iommu_is_enabled(void)
{
#if VFIO_ENABLED
	return g_vfio.enabled && !g_vfio.noiommu_enabled;
#else
	return false;
#endif
}
struct spdk_vtophys_pci_device {
	struct rte_pci_device *pci_device;
	TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
};

static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
	TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);

static struct spdk_mem_map *g_vtophys_map;
static struct spdk_mem_map *g_phys_ref_map;
static struct spdk_mem_map *g_numa_map;
#if VFIO_ENABLED
static int
_vfio_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	dma_map = calloc(1, sizeof(*dma_map));
	if (dma_map == NULL) {
		return -ENOMEM;
	}

	dma_map->map.argsz = sizeof(dma_map->map);
	dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	dma_map->map.vaddr = vaddr;
	dma_map->map.iova = iova;
	dma_map->map.size = size;

	if (g_vfio.device_ref == 0) {
		/* VFIO requires at least one device (IOMMU group) to be added to
		 * a VFIO container before it is possible to perform any IOMMU
		 * operations on that container. This memory will be mapped once
		 * the first device (IOMMU group) is hotplugged.
		 *
		 * Since the vfio container is managed internally by DPDK, it is
		 * also possible that some device is already in that container, but
		 * it's not managed by SPDK - e.g. a NIC attached internally
		 * inside DPDK. We could map the memory straight away in such
		 * scenario, but there's no need to do it. DPDK devices clearly
		 * don't need our mappings and hence we defer the mapping
		 * unconditionally until the first SPDK-managed device is
		 * hotplugged.
		 */
		goto out_insert;
	}

	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
	if (ret) {
		/* There are cases where the vfio container doesn't have an IOMMU
		 * group; it's safe to ignore the error in that case. */
		SPDK_NOTICELOG("Cannot set up DMA mapping, error %d, ignored\n", errno);
	}

out_insert:
	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
	return 0;
}


static int
vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	uint64_t refcount;
	int ret;

	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
	assert(refcount < UINT64_MAX);
	if (refcount > 0) {
		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
		return 0;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	ret = _vfio_iommu_map_dma(vaddr, iova, size);
	pthread_mutex_unlock(&g_vfio.mutex);
	if (ret) {
		return ret;
	}

	spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
	return 0;
}

int
vtophys_iommu_map_dma_bar(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	int ret;

	pthread_mutex_lock(&g_vfio.mutex);
	ret = _vfio_iommu_map_dma(vaddr, iova, size);
	pthread_mutex_unlock(&g_vfio.mutex);

	return ret;
}
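/* Refcounting example (hypothetical IOVA): if two independent 2MB vaddr
 * ranges resolve to the same IOVA, the first vtophys_iommu_map_dma() call
 * issues the VFIO_IOMMU_MAP_DMA ioctl and sets the g_phys_ref_map entry to 1;
 * the second only bumps it to 2. The matching unmap path below decrements and
 * defers the VFIO_IOMMU_UNMAP_DMA ioctl until the count drops back to zero.
 */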
static int
_vfio_iommu_unmap_dma(struct spdk_vfio_dma_map *dma_map)
{
	struct vfio_iommu_type1_dma_unmap unmap = {};
	int ret;

	if (g_vfio.device_ref == 0) {
		/* Memory is not mapped anymore, just remove its references */
		goto out_remove;
	}

	unmap.argsz = sizeof(unmap);
	unmap.flags = 0;
	unmap.iova = dma_map->map.iova;
	unmap.size = dma_map->map.size;
	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
	if (ret) {
		SPDK_NOTICELOG("Cannot clear DMA mapping, error %d, ignored\n", errno);
	}

out_remove:
	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
	free(dma_map);
	return 0;
}

static int
vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	uint64_t refcount;
	int ret;

	pthread_mutex_lock(&g_vfio.mutex);
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		if (dma_map->map.iova == iova) {
			break;
		}
	}

	if (dma_map == NULL) {
		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
		pthread_mutex_unlock(&g_vfio.mutex);
		return -ENXIO;
	}

	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
	assert(refcount < UINT64_MAX);
	if (refcount > 0) {
		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount - 1);
	}

	/* We still have outstanding references, don't clear it. */
	if (refcount > 1) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return 0;
	}

	/* don't support partial or multiple-page unmap for now */
	assert(dma_map->map.size == size);

	ret = _vfio_iommu_unmap_dma(dma_map);
	pthread_mutex_unlock(&g_vfio.mutex);

	return ret;
}

int
vtophys_iommu_unmap_dma_bar(uint64_t vaddr)
{
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	pthread_mutex_lock(&g_vfio.mutex);
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		if (dma_map->map.vaddr == vaddr) {
			break;
		}
	}

	if (dma_map == NULL) {
		DEBUG_PRINT("Cannot clear DMA mapping for address %"PRIx64" - it's not mapped\n", vaddr);
		pthread_mutex_unlock(&g_vfio.mutex);
		return -ENXIO;
	}

	ret = _vfio_iommu_unmap_dma(dma_map);
	pthread_mutex_unlock(&g_vfio.mutex);
	return ret;
}
#endif
static uint64_t
vtophys_get_paddr_memseg(uint64_t vaddr)
{
	uintptr_t paddr;
	struct rte_memseg *seg;

	seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
	if (seg != NULL) {
		paddr = seg->iova;
		if (paddr == RTE_BAD_IOVA) {
			return SPDK_VTOPHYS_ERROR;
		}
		paddr += (vaddr - (uintptr_t)seg->addr);
		return paddr;
	}

	return SPDK_VTOPHYS_ERROR;
}

/* Try to get the paddr from /proc/self/pagemap */
static uint64_t
vtophys_get_paddr_pagemap(uint64_t vaddr)
{
	uintptr_t paddr;

	/* Silence static analyzers */
	assert(vaddr != 0);
	paddr = rte_mem_virt2iova((void *)vaddr);
	if (paddr == RTE_BAD_IOVA) {
		/*
		 * The vaddr may be valid but doesn't have a backing page
		 * assigned yet. Touch the page to ensure a backing page
		 * gets assigned, then try to translate again.
		 */
		rte_atomic64_read((rte_atomic64_t *)vaddr);
		paddr = rte_mem_virt2iova((void *)vaddr);
	}
	if (paddr == RTE_BAD_IOVA) {
		/* Unable to get to the physical address. */
		return SPDK_VTOPHYS_ERROR;
	}

	return paddr;
}

static uint64_t
pci_device_vtophys(struct rte_pci_device *dev, uint64_t vaddr, size_t len)
{
	struct rte_mem_resource *res;
	uint64_t paddr;
	unsigned r;

	for (r = 0; r < PCI_MAX_RESOURCE; r++) {
		res = dpdk_pci_device_get_mem_resource(dev, r);

		if (res->phys_addr == 0 || vaddr < (uint64_t)res->addr ||
		    (vaddr + len) >= (uint64_t)res->addr + res->len) {
			continue;
		}

#if VFIO_ENABLED
		if (spdk_iommu_is_enabled() && rte_eal_iova_mode() == RTE_IOVA_VA) {
			/*
			 * The IOMMU is on and we're using IOVA == VA. The BAR was
			 * automatically registered when it was mapped, so just return
			 * the virtual address here.
			 */
			return vaddr;
		}
#endif
		paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
		return paddr;
	}

	return SPDK_VTOPHYS_ERROR;
}

/* Try to get the paddr from pci devices */
static uint64_t
vtophys_get_paddr_pci(uint64_t vaddr, size_t len)
{
	struct spdk_vtophys_pci_device *vtophys_dev;
	uintptr_t paddr;
	struct rte_pci_device *dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		dev = vtophys_dev->pci_device;
		paddr = pci_device_vtophys(dev, vaddr, len);
		if (paddr != SPDK_VTOPHYS_ERROR) {
			pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
			return paddr;
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

	return SPDK_VTOPHYS_ERROR;
}
1190a36bc251SBen Walker paddr = vtophys_get_paddr_pci((uint64_t)vaddr, len);
1191a36bc251SBen Walker if (paddr != SPDK_VTOPHYS_ERROR) {
1192a36bc251SBen Walker /* Get paddr for each 2MB chunk in this address range */
1193a36bc251SBen Walker while (len > 0) {
1194a36bc251SBen Walker paddr = vtophys_get_paddr_pci((uint64_t)vaddr, VALUE_2MB);
1195a36bc251SBen Walker if (paddr == SPDK_VTOPHYS_ERROR) {
1196a36bc251SBen Walker DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1197a36bc251SBen Walker return -EFAULT;
1198a36bc251SBen Walker }
1199a36bc251SBen Walker
1200a36bc251SBen Walker rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
1201a36bc251SBen Walker if (rc != 0) {
1202a36bc251SBen Walker return rc;
1203a36bc251SBen Walker }
1204a36bc251SBen Walker
1205a36bc251SBen Walker vaddr += VALUE_2MB;
1206a36bc251SBen Walker len -= VALUE_2MB;
1207a36bc251SBen Walker }
1208a36bc251SBen Walker
1209a36bc251SBen Walker return 0;
1210a36bc251SBen Walker }
1211a36bc251SBen Walker
1212a36bc251SBen Walker /* If vfio is enabled,
1213ba9853b9SJim Harris * we need to unmap the range from the IOMMU
1214ba9853b9SJim Harris */
121588179a65SSeth Howell if (spdk_iommu_is_enabled()) {
1216be04cfc3SDarek Stojaczyk uint64_t buffer_len = len;
1217dd7cd80cSBen Walker uint8_t *va = vaddr;
1218dd7cd80cSBen Walker enum rte_iova_mode iova_mode;
1219dd7cd80cSBen Walker
1220dd7cd80cSBen Walker iova_mode = rte_eal_iova_mode();
1221dd7cd80cSBen Walker /*
1222dd7cd80cSBen Walker * In virtual address mode, the region is contiguous and can be done in
1223dd7cd80cSBen Walker * one unmap.
1224dd7cd80cSBen Walker */
1225dd7cd80cSBen Walker if (iova_mode == RTE_IOVA_VA) {
1226dd7cd80cSBen Walker paddr = spdk_mem_map_translate(map, (uint64_t)va, &buffer_len);
1227dd7cd80cSBen Walker if (buffer_len != len || paddr != (uintptr_t)va) {
1228dd7cd80cSBen Walker DEBUG_PRINT("Unmapping %p with length %lu failed because "
1229dd7cd80cSBen Walker "translation had address 0x%" PRIx64 " and length %lu\n",
1230dd7cd80cSBen Walker va, len, paddr, buffer_len);
1231f74d069eSJim Harris return -EINVAL;
1232f74d069eSJim Harris }
1233ba9853b9SJim Harris rc = vtophys_iommu_unmap_dma(paddr, len);
1234ba9853b9SJim Harris if (rc) {
1235ba9853b9SJim Harris DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
1236ba9853b9SJim Harris return -EFAULT;
1237f74d069eSJim Harris }
1238dd7cd80cSBen Walker } else if (iova_mode == RTE_IOVA_PA) {
1239dd7cd80cSBen Walker /* Get paddr for each 2MB chunk in this address range */
1240ba9853b9SJim Harris while (buffer_len > 0) {
1241dd7cd80cSBen Walker paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);
1242dd7cd80cSBen Walker
1243dd7cd80cSBen Walker if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
1244dd7cd80cSBen Walker DEBUG_PRINT("could not get phys addr for %p\n", va);
1245dd7cd80cSBen Walker return -EFAULT;
1246dd7cd80cSBen Walker }
1247dd7cd80cSBen Walker
1248ba9853b9SJim Harris rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
1249ba9853b9SJim Harris if (rc) {
1250ba9853b9SJim Harris DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
1251ba9853b9SJim Harris return -EFAULT;
1252dd7cd80cSBen Walker }
1253dd7cd80cSBen Walker
1254ba9853b9SJim Harris va += VALUE_2MB;
1255ba9853b9SJim Harris buffer_len -= VALUE_2MB;
1256dd7cd80cSBen Walker }
1257dd7cd80cSBen Walker }
1258f74d069eSJim Harris }
1259f74d069eSJim Harris }
1260f74d069eSJim Harris #endif
1261be04cfc3SDarek Stojaczyk while (len > 0) {
1262f74d069eSJim Harris rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
1263f74d069eSJim Harris if (rc != 0) {
1264f74d069eSJim Harris return rc;
1265f74d069eSJim Harris }
1266be04cfc3SDarek Stojaczyk
1267f74d069eSJim Harris vaddr += VALUE_2MB;
1268f74d069eSJim Harris len -= VALUE_2MB;
1269f74d069eSJim Harris }
1270f74d069eSJim Harris
1271be04cfc3SDarek Stojaczyk break;
1272be04cfc3SDarek Stojaczyk default:
1273be04cfc3SDarek Stojaczyk SPDK_UNREACHABLE();
1274be04cfc3SDarek Stojaczyk }
1275be04cfc3SDarek Stojaczyk
1276f74d069eSJim Harris return rc;
1277f74d069eSJim Harris }
1278f74d069eSJim Harris
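/*
 * Illustrative sketch, not part of the upstream file: roughly what a
 * /proc/self/pagemap lookup such as vtophys_get_paddr_pagemap() above boils
 * down to. Entry layout per Linux's Documentation/admin-guide/mm/pagemap.rst:
 * bit 63 = page present, bits 0-54 = page frame number. Reading PFNs needs
 * CAP_SYS_ADMIN on recent kernels, and the result is only stable for pinned
 * (e.g. hugepage-backed) memory. open()/pread() come via spdk/stdinc.h,
 * which this file already includes.
 */
static uint64_t
pagemap_lookup_sketch(uint64_t vaddr)
{
	uint64_t page_size = (uint64_t)sysconf(_SC_PAGESIZE);
	uint64_t entry;
	off_t offset = (off_t)(vaddr / page_size) * sizeof(entry);
	int fd = open("/proc/self/pagemap", O_RDONLY);

	if (fd < 0) {
		return UINT64_MAX; /* same bit pattern as SPDK_VTOPHYS_ERROR */
	}
	if (pread(fd, &entry, sizeof(entry), offset) != (ssize_t)sizeof(entry)) {
		close(fd);
		return UINT64_MAX;
	}
	close(fd);

	if (!(entry & (1ULL << 63))) {
		return UINT64_MAX; /* page not present */
	}

	/* PFN lives in bits 0-54; add back the offset within the page. */
	return (entry & ((1ULL << 55) - 1)) * page_size + (vaddr & (page_size - 1));
}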
127943f4e393SDarek Stojaczyk static int
1280*40c9acf6SJim Harris numa_notify(void *cb_ctx, struct spdk_mem_map *map,
1281*40c9acf6SJim Harris enum spdk_mem_map_notify_action action,
1282*40c9acf6SJim Harris void *vaddr, size_t len)
1283*40c9acf6SJim Harris {
1284*40c9acf6SJim Harris struct rte_memseg *seg;
1285*40c9acf6SJim Harris
1286*40c9acf6SJim Harris /* We always return 0 from here, even if we aren't able to get a
1287*40c9acf6SJim Harris * memseg for the address. This can happen in non-DPDK memory
1288*40c9acf6SJim Harris * registration paths, for example vhost or vfio-user. That is OK,
1289*40c9acf6SJim Harris * spdk_mem_get_numa_id() just returns SPDK_ENV_NUMA_ID_ANY for
1290*40c9acf6SJim Harris * that kind of memory. If we returned an error here, the
1291*40c9acf6SJim Harris * spdk_mem_register() from vhost or vfio-user would fail, which is
1292*40c9acf6SJim Harris * not what we want.
1293*40c9acf6SJim Harris */
1294*40c9acf6SJim Harris seg = rte_mem_virt2memseg(vaddr, NULL);
1295*40c9acf6SJim Harris if (seg == NULL) {
1296*40c9acf6SJim Harris return 0;
1297*40c9acf6SJim Harris }
1298*40c9acf6SJim Harris
1299*40c9acf6SJim Harris switch (action) {
1300*40c9acf6SJim Harris case SPDK_MEM_MAP_NOTIFY_REGISTER:
1301*40c9acf6SJim Harris spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, seg->socket_id);
1302*40c9acf6SJim Harris break;
1303*40c9acf6SJim Harris case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
1304*40c9acf6SJim Harris spdk_mem_map_clear_translation(map, (uint64_t)vaddr, len);
1305*40c9acf6SJim Harris break;
1306*40c9acf6SJim Harris default:
1307*40c9acf6SJim Harris break;
1308*40c9acf6SJim Harris }
1309*40c9acf6SJim Harris
1310*40c9acf6SJim Harris return 0;
1311*40c9acf6SJim Harris }
1312*40c9acf6SJim Harris
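/*
 * Illustrative sketch, not part of the upstream file: how the map that
 * numa_notify() populates gets consumed. spdk_mem_get_numa_id() is defined
 * near the bottom of this file; the 2MB buffer here is hypothetical.
 */
static void
numa_lookup_sketch(void)
{
	void *buf = spdk_dma_zmalloc(VALUE_2MB, VALUE_2MB, NULL);
	int32_t numa_id;

	if (buf == NULL) {
		return;
	}

	numa_id = spdk_mem_get_numa_id(buf, NULL);
	if (numa_id == SPDK_ENV_NUMA_ID_ANY) {
		/* Not backed by a DPDK memseg (e.g. a vhost or vfio-user region). */
	}

	spdk_dma_free(buf);
}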
1313*40c9acf6SJim Harris static int
131443f4e393SDarek Stojaczyk vtophys_check_contiguous_entries(uint64_t paddr1, uint64_t paddr2)
131543f4e393SDarek Stojaczyk {
131643f4e393SDarek Stojaczyk /* This function is always called with paddrs for two subsequent
131743f4e393SDarek Stojaczyk * 2MB chunks in virtual address space, so those chunks will only be
131843f4e393SDarek Stojaczyk * physically contiguous if the physical addresses are 2MB apart
131943f4e393SDarek Stojaczyk * from each other as well.
132043f4e393SDarek Stojaczyk */
132143f4e393SDarek Stojaczyk return (paddr2 - paddr1 == VALUE_2MB);
132243f4e393SDarek Stojaczyk }
132343f4e393SDarek Stojaczyk
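/*
 * Illustrative sketch, not part of the upstream file: the effect of the
 * are_contiguous callback above. When two adjacent 2MB virtual chunks
 * translate to physical addresses exactly VALUE_2MB apart,
 * spdk_mem_map_translate() can report one merged region through its size
 * output instead of stopping at the 2MB entry boundary. The map and buffer
 * arguments are hypothetical.
 */
static bool
spans_one_dma_segment_sketch(struct spdk_mem_map *vtophys_map, const void *buf)
{
	uint64_t len = 2 * VALUE_2MB;
	uint64_t paddr = spdk_mem_map_translate(vtophys_map, (uint64_t)buf, &len);

	/* True only if both 2MB chunks turned out to be physically contiguous. */
	return paddr != SPDK_VTOPHYS_ERROR && len == 2 * VALUE_2MB;
}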
13243456377bSSeth Howell #if VFIO_ENABLED
1325f74d069eSJim Harris
1326f74d069eSJim Harris static bool
13273456377bSSeth Howell vfio_enabled(void)
1328f74d069eSJim Harris {
1329f74d069eSJim Harris return rte_vfio_is_enabled("vfio_pci");
1330f74d069eSJim Harris }
1331f74d069eSJim Harris
1332f74d069eSJim Harris /* Check if IOMMU is enabled on the system */
1333f74d069eSJim Harris static bool
1334f74d069eSJim Harris has_iommu_groups(void)
1335f74d069eSJim Harris {
1336f74d069eSJim Harris int count = 0;
1337f74d069eSJim Harris DIR *dir = opendir("/sys/kernel/iommu_groups");
1338f74d069eSJim Harris
1339f74d069eSJim Harris if (dir == NULL) {
1340f74d069eSJim Harris return false;
1341f74d069eSJim Harris }
1342f74d069eSJim Harris
13432a53883aSyidong0635 while (count < 3 && readdir(dir) != NULL) {
1344f74d069eSJim Harris count++;
1345f74d069eSJim Harris }
1346f74d069eSJim Harris
1347f74d069eSJim Harris closedir(dir);
1348f74d069eSJim Harris /* there will always be ./ and ../ entries */
1349f74d069eSJim Harris return count > 2;
1350f74d069eSJim Harris }
1351f74d069eSJim Harris
1352f74d069eSJim Harris static bool
13533456377bSSeth Howell vfio_noiommu_enabled(void)
1354f74d069eSJim Harris {
1355f74d069eSJim Harris return rte_vfio_noiommu_is_enabled();
1356f74d069eSJim Harris }
1357f74d069eSJim Harris
1358f74d069eSJim Harris static void
13593456377bSSeth Howell vtophys_iommu_init(void)
1360f74d069eSJim Harris {
1361f74d069eSJim Harris char proc_fd_path[PATH_MAX + 1];
1362f74d069eSJim Harris char link_path[PATH_MAX + 1];
1363f74d069eSJim Harris const char vfio_path[] = "/dev/vfio/vfio";
1364f74d069eSJim Harris DIR *dir;
1365f74d069eSJim Harris struct dirent *d;
1366f74d069eSJim Harris
13673456377bSSeth Howell if (!vfio_enabled()) {
1368f74d069eSJim Harris return;
1369f74d069eSJim Harris }
1370f74d069eSJim Harris
13713456377bSSeth Howell if (vfio_noiommu_enabled()) {
1372f74d069eSJim Harris g_vfio.noiommu_enabled = true;
1373f74d069eSJim Harris } else if (!has_iommu_groups()) {
1374f74d069eSJim Harris return;
1375f74d069eSJim Harris }
1376f74d069eSJim Harris
1377f74d069eSJim Harris dir = opendir("/proc/self/fd");
1378f74d069eSJim Harris if (!dir) {
1379f74d069eSJim Harris DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
1380f74d069eSJim Harris return;
1381f74d069eSJim Harris }
1382f74d069eSJim Harris
1383f74d069eSJim Harris while ((d = readdir(dir)) != NULL) {
1384f74d069eSJim Harris if (d->d_type != DT_LNK) {
1385f74d069eSJim Harris continue;
1386f74d069eSJim Harris }
1387f74d069eSJim Harris
1388f74d069eSJim Harris snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
1389f74d069eSJim Harris if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
1390f74d069eSJim Harris continue;
1391f74d069eSJim Harris }
1392f74d069eSJim Harris
1393f74d069eSJim Harris if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
1394f74d069eSJim Harris sscanf(d->d_name, "%d", &g_vfio.fd);
1395f74d069eSJim Harris break;
1396f74d069eSJim Harris }
1397f74d069eSJim Harris }
1398f74d069eSJim Harris
1399f74d069eSJim Harris closedir(dir);
1400f74d069eSJim Harris
1401f74d069eSJim Harris if (g_vfio.fd < 0) {
1402f74d069eSJim Harris DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
1403f74d069eSJim Harris return;
1404f74d069eSJim Harris }
1405f74d069eSJim Harris
1406f74d069eSJim Harris g_vfio.enabled = true;
1407f74d069eSJim Harris
1408f74d069eSJim Harris return;
1409f74d069eSJim Harris }
14108f7d9ec2SBen Walker
1411f74d069eSJim Harris #endif
1412f74d069eSJim Harris
1413f74d069eSJim Harris void
141415d0ae62SSeth Howell vtophys_pci_device_added(struct rte_pci_device *pci_device)
1415f74d069eSJim Harris {
1416f74d069eSJim Harris struct spdk_vtophys_pci_device *vtophys_dev;
1417f74d069eSJim Harris
1418f74d069eSJim Harris pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1419f74d069eSJim Harris
1420f74d069eSJim Harris vtophys_dev = calloc(1, sizeof(*vtophys_dev));
1421f74d069eSJim Harris if (vtophys_dev) {
1422f74d069eSJim Harris vtophys_dev->pci_device = pci_device;
1423f74d069eSJim Harris TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
1424f74d069eSJim Harris } else {
1425f74d069eSJim Harris DEBUG_PRINT("Memory allocation error\n");
1426f74d069eSJim Harris }
1427f74d069eSJim Harris pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
14287fed70f1SChangpeng Liu
14297fed70f1SChangpeng Liu #if VFIO_ENABLED
14307fed70f1SChangpeng Liu struct spdk_vfio_dma_map *dma_map;
14317fed70f1SChangpeng Liu int ret;
14327fed70f1SChangpeng Liu
14337fed70f1SChangpeng Liu if (!g_vfio.enabled) {
14347fed70f1SChangpeng Liu return;
14357fed70f1SChangpeng Liu }
14367fed70f1SChangpeng Liu
14377fed70f1SChangpeng Liu pthread_mutex_lock(&g_vfio.mutex);
14387fed70f1SChangpeng Liu g_vfio.device_ref++;
14397fed70f1SChangpeng Liu if (g_vfio.device_ref > 1) {
14407fed70f1SChangpeng Liu pthread_mutex_unlock(&g_vfio.mutex);
14417fed70f1SChangpeng Liu return;
14427fed70f1SChangpeng Liu }
14437fed70f1SChangpeng Liu
14447fed70f1SChangpeng Liu /* This is the first SPDK device using DPDK vfio. This means that the first
14457fed70f1SChangpeng Liu * IOMMU group might have just been added to the DPDK vfio container.
14467fed70f1SChangpeng Liu * From this point it is certain that the memory can be mapped now.
14477fed70f1SChangpeng Liu */
14487fed70f1SChangpeng Liu TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
14497fed70f1SChangpeng Liu ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
14507fed70f1SChangpeng Liu if (ret) {
14517fed70f1SChangpeng Liu DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
14527fed70f1SChangpeng Liu break;
14537fed70f1SChangpeng Liu }
14547fed70f1SChangpeng Liu }
14557fed70f1SChangpeng Liu pthread_mutex_unlock(&g_vfio.mutex);
14567fed70f1SChangpeng Liu #endif
1457f74d069eSJim Harris }
1458f74d069eSJim Harris
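#if VFIO_ENABLED
/*
 * Illustrative sketch, not part of the upstream file: the shape of the
 * vfio_iommu_type1_dma_map argument that the replay loop above hands to
 * VFIO_IOMMU_MAP_DMA. The entries queued on g_vfio.maps are built the same
 * way (see vtophys_iommu_map_dma() earlier in this file).
 */
static int
vfio_dma_map_sketch(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	struct vfio_iommu_type1_dma_map dma_map = {
		.argsz = sizeof(dma_map),
		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
		.vaddr = vaddr,
		.iova = iova,
		.size = size,
	};

	/* Ask the kernel to program this translation into the IOMMU. */
	return ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map) ? -errno : 0;
}
#endif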
1459f74d069eSJim Harris void
146015d0ae62SSeth Howell vtophys_pci_device_removed(struct rte_pci_device *pci_device)
1461f74d069eSJim Harris {
1462f74d069eSJim Harris struct spdk_vtophys_pci_device *vtophys_dev;
1463f74d069eSJim Harris
1464f74d069eSJim Harris pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1465f74d069eSJim Harris TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
1466f74d069eSJim Harris if (vtophys_dev->pci_device == pci_device) {
1467f74d069eSJim Harris TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
1468f74d069eSJim Harris free(vtophys_dev);
1469f74d069eSJim Harris break;
1470f74d069eSJim Harris }
1471f74d069eSJim Harris }
1472f74d069eSJim Harris pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
14737fed70f1SChangpeng Liu
14747fed70f1SChangpeng Liu #if VFIO_ENABLED
14757fed70f1SChangpeng Liu struct spdk_vfio_dma_map *dma_map;
14767fed70f1SChangpeng Liu int ret;
14777fed70f1SChangpeng Liu
14787fed70f1SChangpeng Liu if (!g_vfio.enabled) {
14797fed70f1SChangpeng Liu return;
14807fed70f1SChangpeng Liu }
14817fed70f1SChangpeng Liu
14827fed70f1SChangpeng Liu pthread_mutex_lock(&g_vfio.mutex);
14837fed70f1SChangpeng Liu assert(g_vfio.device_ref > 0);
14847fed70f1SChangpeng Liu g_vfio.device_ref--;
14857fed70f1SChangpeng Liu if (g_vfio.device_ref > 0) {
14867fed70f1SChangpeng Liu pthread_mutex_unlock(&g_vfio.mutex);
14877fed70f1SChangpeng Liu return;
14887fed70f1SChangpeng Liu }
14897fed70f1SChangpeng Liu
14907fed70f1SChangpeng Liu /* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
14917fed70f1SChangpeng Liu * any additional devices using its vfio container, all the mappings
14927fed70f1SChangpeng Liu * will be automatically removed by the Linux vfio driver. We unmap
14937fed70f1SChangpeng Liu * the memory manually to be able to easily re-map it later regardless
14947fed70f1SChangpeng Liu * of other, external factors.
14957fed70f1SChangpeng Liu */
14967fed70f1SChangpeng Liu TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
14977fed70f1SChangpeng Liu struct vfio_iommu_type1_dma_unmap unmap = {};
14987fed70f1SChangpeng Liu unmap.argsz = sizeof(unmap);
14997fed70f1SChangpeng Liu unmap.flags = 0;
15007fed70f1SChangpeng Liu unmap.iova = dma_map->map.iova;
15017fed70f1SChangpeng Liu unmap.size = dma_map->map.size;
15027fed70f1SChangpeng Liu ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
15037fed70f1SChangpeng Liu if (ret) {
15047fed70f1SChangpeng Liu DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
15057fed70f1SChangpeng Liu break;
15067fed70f1SChangpeng Liu }
15077fed70f1SChangpeng Liu }
15087fed70f1SChangpeng Liu pthread_mutex_unlock(&g_vfio.mutex);
15097fed70f1SChangpeng Liu #endif
1510f74d069eSJim Harris }
1511f74d069eSJim Harris
1512f74d069eSJim Harris int
151315d0ae62SSeth Howell vtophys_init(void)
1514f74d069eSJim Harris {
1515f74d069eSJim Harris const struct spdk_mem_map_ops vtophys_map_ops = {
15163456377bSSeth Howell .notify_cb = vtophys_notify,
151743f4e393SDarek Stojaczyk .are_contiguous = vtophys_check_contiguous_entries,
1518f74d069eSJim Harris };
1519f74d069eSJim Harris
1520f4a63bb8SSeth Howell const struct spdk_mem_map_ops phys_ref_map_ops = {
1521f4a63bb8SSeth Howell .notify_cb = NULL,
1522f4a63bb8SSeth Howell .are_contiguous = NULL,
1523f4a63bb8SSeth Howell };
1524f4a63bb8SSeth Howell
1525*40c9acf6SJim Harris const struct spdk_mem_map_ops numa_map_ops = {
1526*40c9acf6SJim Harris .notify_cb = numa_notify,
1527*40c9acf6SJim Harris .are_contiguous = NULL,
1528*40c9acf6SJim Harris };
1529*40c9acf6SJim Harris
15303456377bSSeth Howell #if VFIO_ENABLED
15313456377bSSeth Howell vtophys_iommu_init();
1532f74d069eSJim Harris #endif
1533f74d069eSJim Harris
1534f4a63bb8SSeth Howell g_phys_ref_map = spdk_mem_map_alloc(0, &phys_ref_map_ops, NULL);
1535f4a63bb8SSeth Howell if (g_phys_ref_map == NULL) {
1536f4a63bb8SSeth Howell DEBUG_PRINT("phys_ref map allocation failed.\n");
1537f4a63bb8SSeth Howell return -ENOMEM;
1538f4a63bb8SSeth Howell }
1539f4a63bb8SSeth Howell
1540*40c9acf6SJim Harris g_numa_map = spdk_mem_map_alloc(SPDK_ENV_NUMA_ID_ANY, &numa_map_ops, NULL);
1541*40c9acf6SJim Harris if (g_numa_map == NULL) {
1542*40c9acf6SJim Harris DEBUG_PRINT("numa map allocation failed.\n");
1543*40c9acf6SJim Harris spdk_mem_map_free(&g_phys_ref_map);
1544*40c9acf6SJim Harris return -ENOMEM;
1545*40c9acf6SJim Harris }
1546*40c9acf6SJim Harris
1547a6658c54SSarvesh Lanke if (g_huge_pages) {
1548f74d069eSJim Harris g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
1549f74d069eSJim Harris if (g_vtophys_map == NULL) {
1550f74d069eSJim Harris DEBUG_PRINT("vtophys map allocation failed\n");
1551*40c9acf6SJim Harris spdk_mem_map_free(&g_numa_map);
155235dfd3eaSyidong0635 spdk_mem_map_free(&g_phys_ref_map);
155337c0a02eSJim Harris return -ENOMEM;
1554f74d069eSJim Harris }
1555a6658c54SSarvesh Lanke }
1556f74d069eSJim Harris return 0;
1557f74d069eSJim Harris }
1558f74d069eSJim Harris
1559f74d069eSJim Harris uint64_t
1560d48a7b26SBen Walker spdk_vtophys(const void *buf, uint64_t *size)
1561f74d069eSJim Harris {
1562f74d069eSJim Harris uint64_t vaddr, paddr_2mb;
1563f74d069eSJim Harris
1564a6658c54SSarvesh Lanke if (!g_huge_pages) {
1565a6658c54SSarvesh Lanke return SPDK_VTOPHYS_ERROR;
1566a6658c54SSarvesh Lanke }
1567a6658c54SSarvesh Lanke
1568f74d069eSJim Harris vaddr = (uint64_t)buf;
1569f74d069eSJim Harris paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);
1570f74d069eSJim Harris
1571f74d069eSJim Harris /*
1572f74d069eSJim Harris * SPDK_VTOPHYS_ERROR has all bits set, so bitwise-or'ing the buf offset into it
1573f74d069eSJim Harris * would still yield SPDK_VTOPHYS_ERROR. However, since we now use + rather than |
1574f74d069eSJim Harris * (because PCI vtophys translations can be unaligned), we must check the return
1575f74d069eSJim Harris * value before doing the addition.
1576f74d069eSJim Harris */
1577f74d069eSJim Harris SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
1578f74d069eSJim Harris if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
1579f74d069eSJim Harris return SPDK_VTOPHYS_ERROR;
1580f74d069eSJim Harris } else {
1581f74d069eSJim Harris return paddr_2mb + (vaddr & MASK_2MB);
1582f74d069eSJim Harris }
1583f74d069eSJim Harris }
1584c0bf9314SChangpeng Liu
1585*40c9acf6SJim Harris int32_t
1586*40c9acf6SJim Harris spdk_mem_get_numa_id(const void *buf, uint64_t *size)
1587*40c9acf6SJim Harris {
1588*40c9acf6SJim Harris return spdk_mem_map_translate(g_numa_map, (uint64_t)buf, size);
1589*40c9acf6SJim Harris }
1590*40c9acf6SJim Harris
1591c0bf9314SChangpeng Liu int
1592c0bf9314SChangpeng Liu spdk_mem_get_fd_and_offset(void *vaddr, uint64_t *offset)
1593c0bf9314SChangpeng Liu {
1594c0bf9314SChangpeng Liu struct rte_memseg *seg;
1595c0bf9314SChangpeng Liu int ret, fd;
1596c0bf9314SChangpeng Liu
1597c0bf9314SChangpeng Liu seg = rte_mem_virt2memseg(vaddr, NULL);
1598c0bf9314SChangpeng Liu if (!seg) {
1599c0bf9314SChangpeng Liu SPDK_ERRLOG("memory %p doesn't exist\n", vaddr);
1600c0bf9314SChangpeng Liu return -ENOENT;
1601c0bf9314SChangpeng Liu }
1602c0bf9314SChangpeng Liu
1603c0bf9314SChangpeng Liu fd = rte_memseg_get_fd_thread_unsafe(seg);
1604c0bf9314SChangpeng Liu if (fd < 0) {
1605c0bf9314SChangpeng Liu return fd;
1606c0bf9314SChangpeng Liu }
1607c0bf9314SChangpeng Liu
1608c0bf9314SChangpeng Liu ret = rte_memseg_get_fd_offset_thread_unsafe(seg, offset);
1609c0bf9314SChangpeng Liu if (ret < 0) {
1610c0bf9314SChangpeng Liu return ret;
1611c0bf9314SChangpeng Liu }
1612c0bf9314SChangpeng Liu
1613c0bf9314SChangpeng Liu return fd;
1614c0bf9314SChangpeng Liu }
1615a6658c54SSarvesh Lanke
1616a6658c54SSarvesh Lanke void
1617a6658c54SSarvesh Lanke mem_disable_huge_pages(void)
1618a6658c54SSarvesh Lanke {
1619a6658c54SSarvesh Lanke g_huge_pages = false;
1620a6658c54SSarvesh Lanke }
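/*
 * Illustrative sketch, not part of the upstream file: typical spdk_vtophys()
 * usage when filling in a DMA descriptor. The buffer and the 4KB size are
 * hypothetical; spdk_dma_zmalloc() returns pinned, registered memory.
 */
static int
vtophys_usage_sketch(void)
{
	uint64_t size = 0x1000;
	uint64_t paddr;
	void *buf = spdk_dma_zmalloc(size, 0x1000, NULL);

	if (buf == NULL) {
		return -ENOMEM;
	}

	paddr = spdk_vtophys(buf, &size);
	if (paddr == SPDK_VTOPHYS_ERROR) {
		/* Hugepages disabled, or buf was never registered. */
		spdk_dma_free(buf);
		return -EFAULT;
	}

	/* paddr (a physical address or IOVA) can now be handed to a device. */
	spdk_dma_free(buf);
	return 0;
}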