/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2018 Intel Corporation
 */

#include <inttypes.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#include <rte_errno.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_eal_memconfig.h>
#include <rte_vfio.h>

#include "eal_filesystem.h"
#include "eal_memcfg.h"
#include "eal_vfio.h"
#include "eal_private.h"
#include "eal_internal_cfg.h"

#define VFIO_MEM_EVENT_CLB_NAME "vfio_mem_event_clb"

/* hot plug/unplug of VFIO groups may cause all DMA maps to be dropped. we can
 * recreate the mappings for DPDK segments, but we cannot do so for memory that
 * was registered by the user themselves, so we need to store the user mappings
 * somewhere, to recreate them later.
 */
#define VFIO_MAX_USER_MEM_MAPS 256
struct user_mem_map {
	uint64_t addr;  /**< start VA */
	uint64_t iova;  /**< start IOVA */
	uint64_t len;   /**< total length of the mapping */
	uint64_t chunk; /**< this mapping can be split in chunks of this size */
};

struct user_mem_maps {
	rte_spinlock_recursive_t lock;
	int n_maps;
	struct user_mem_map maps[VFIO_MAX_USER_MEM_MAPS];
};

struct vfio_config {
	int vfio_enabled;
	int vfio_container_fd;
	int vfio_active_groups;
	const struct vfio_iommu_type *vfio_iommu_type;
	struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
	struct user_mem_maps mem_maps;
};

/* per-process VFIO config */
static struct vfio_config vfio_cfgs[VFIO_MAX_CONTAINERS];
static struct vfio_config *default_vfio_cfg = &vfio_cfgs[0];

static int vfio_type1_dma_map(int);
static int vfio_type1_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
static int vfio_spapr_dma_map(int);
static int vfio_spapr_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
static int vfio_noiommu_dma_map(int);
static int vfio_noiommu_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
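/* generic map/unmap entry point: dispatches to the per-IOMMU-type callbacks
 * above, based on the IOMMU type selected for the given container
 */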
static int vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr,
		uint64_t iova, uint64_t len, int do_map);

/* IOMMU types we support */
static const struct vfio_iommu_type iommu_types[] = {
	/* x86 IOMMU, otherwise known as type 1 */
	{
		.type_id = RTE_VFIO_TYPE1,
		.name = "Type 1",
		.partial_unmap = false,
		.dma_map_func = &vfio_type1_dma_map,
		.dma_user_map_func = &vfio_type1_dma_mem_map
	},
	/* ppc64 IOMMU, otherwise known as spapr */
	{
		.type_id = RTE_VFIO_SPAPR,
		.name = "sPAPR",
		.partial_unmap = true,
		.dma_map_func = &vfio_spapr_dma_map,
		.dma_user_map_func = &vfio_spapr_dma_mem_map
	},
	/* IOMMU-less mode */
	{
		.type_id = RTE_VFIO_NOIOMMU,
		.name = "No-IOMMU",
		.partial_unmap = true,
		.dma_map_func = &vfio_noiommu_dma_map,
		.dma_user_map_func = &vfio_noiommu_dma_mem_map
	},
};

static int
is_null_map(const struct user_mem_map *map)
{
	return map->addr == 0 && map->iova == 0 &&
			map->len == 0 && map->chunk == 0;
}

/* we may need to merge user mem maps together in case of user mapping/unmapping
 * chunks of memory, so we'll need a comparator function to sort segments.
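 * Sort order: null entries go last; live entries are ordered by IOVA, then
 * VA, then length, then chunk size, matching the comparisons below.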
 */
static int
user_mem_map_cmp(const void *a, const void *b)
{
	const struct user_mem_map *umm_a = a;
	const struct user_mem_map *umm_b = b;

	/* move null entries to end */
	if (is_null_map(umm_a))
		return 1;
	if (is_null_map(umm_b))
		return -1;

	/* sort by iova first */
	if (umm_a->iova < umm_b->iova)
		return -1;
	if (umm_a->iova > umm_b->iova)
		return 1;

	if (umm_a->addr < umm_b->addr)
		return -1;
	if (umm_a->addr > umm_b->addr)
		return 1;

	if (umm_a->len < umm_b->len)
		return -1;
	if (umm_a->len > umm_b->len)
		return 1;

	if (umm_a->chunk < umm_b->chunk)
		return -1;
	if (umm_a->chunk > umm_b->chunk)
		return 1;

	return 0;
}

/*
 * Take in an address range and list of current mappings, and produce a list of
 * mappings that will be kept.
 */
static int
process_maps(struct user_mem_map *src, size_t src_len,
		struct user_mem_map newmap[2], uint64_t vaddr, uint64_t len)
{
	struct user_mem_map *src_first = &src[0];
	struct user_mem_map *src_last = &src[src_len - 1];
	struct user_mem_map *dst_first = &newmap[0];
	/* we can get at most two new segments */
	struct user_mem_map *dst_last = &newmap[1];
	uint64_t first_off = vaddr - src_first->addr;
	uint64_t last_off = (src_last->addr + src_last->len) - (vaddr + len);
	int newmap_len = 0;

	if (first_off != 0) {
		dst_first->addr = src_first->addr;
		dst_first->iova = src_first->iova;
		dst_first->len = first_off;
		dst_first->chunk = src_first->chunk;

		newmap_len++;
	}
	if (last_off != 0) {
		/* if we had start offset, we have two segments */
		struct user_mem_map *last =
				first_off == 0 ? dst_first : dst_last;
		last->addr = (src_last->addr + src_last->len) - last_off;
		last->iova = (src_last->iova + src_last->len) - last_off;
		last->len = last_off;
		last->chunk = src_last->chunk;

		newmap_len++;
	}
	return newmap_len;
}

/* erase certain maps from the list */
static void
delete_maps(struct user_mem_maps *user_mem_maps, struct user_mem_map *del_maps,
		size_t n_del)
{
	int i;
	size_t j;

	for (i = 0, j = 0; i < VFIO_MAX_USER_MEM_MAPS && j < n_del; i++) {
		struct user_mem_map *left = &user_mem_maps->maps[i];
		struct user_mem_map *right = &del_maps[j];

		if (user_mem_map_cmp(left, right) == 0) {
			memset(left, 0, sizeof(*left));
			j++;
			user_mem_maps->n_maps--;
		}
	}
}

static void
copy_maps(struct user_mem_maps *user_mem_maps, struct user_mem_map *add_maps,
		size_t n_add)
{
	int i;
	size_t j;

	for (i = 0, j = 0; i < VFIO_MAX_USER_MEM_MAPS && j < n_add; i++) {
		struct user_mem_map *left = &user_mem_maps->maps[i];
		struct user_mem_map *right = &add_maps[j];

		/* insert into empty space */
		if (is_null_map(left)) {
			memcpy(left, right, sizeof(*left));
			j++;
			user_mem_maps->n_maps++;
		}
	}
}

/* try merging two maps into one, return 1 if succeeded */
static int
merge_map(struct user_mem_map *left, struct user_mem_map *right)
{
	/* merge the same maps into one */
	if (memcmp(left, right, sizeof(struct user_mem_map)) == 0)
		goto out;

	if (left->addr + left->len != right->addr)
		return 0;
	if (left->iova + left->len != right->iova)
		return 0;
	if (left->chunk != right->chunk)
		return 0;
	left->len += right->len;

out:
	memset(right, 0, sizeof(*right));

	return 1;
}

static bool
addr_is_chunk_aligned(struct user_mem_map *maps, size_t n_maps,
		uint64_t vaddr, uint64_t iova)
{
	unsigned int i;

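	/* scan all mappings: the (vaddr, iova) pair is chunk-aligned if it
	 * falls inside one of them and both offsets from that mapping's start
	 * are a multiple of its chunk size
	 */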
	for (i = 0; i < n_maps; i++) {
		struct user_mem_map *map = &maps[i];
		uint64_t map_va_end = map->addr + map->len;
		uint64_t map_iova_end = map->iova + map->len;
		uint64_t map_va_off = vaddr - map->addr;
		uint64_t map_iova_off = iova - map->iova;

		/* we include end of the segment in comparison as well */
		bool addr_in_map = (vaddr >= map->addr) && (vaddr <= map_va_end);
		bool iova_in_map = (iova >= map->iova) && (iova <= map_iova_end);
		/* chunk may not be power of two, so use modulo */
		bool addr_is_aligned = (map_va_off % map->chunk) == 0;
		bool iova_is_aligned = (map_iova_off % map->chunk) == 0;

		if (addr_in_map && iova_in_map &&
				addr_is_aligned && iova_is_aligned)
			return true;
	}
	return false;
}

static int
find_user_mem_maps(struct user_mem_maps *user_mem_maps, uint64_t addr,
		uint64_t iova, uint64_t len, struct user_mem_map *dst,
		size_t dst_len)
{
	uint64_t va_end = addr + len;
	uint64_t iova_end = iova + len;
	bool found = false;
	size_t j;
	int i, ret;

	for (i = 0, j = 0; i < user_mem_maps->n_maps; i++) {
		struct user_mem_map *map = &user_mem_maps->maps[i];
		uint64_t map_va_end = map->addr + map->len;
		uint64_t map_iova_end = map->iova + map->len;

		bool start_addr_in_map = (addr >= map->addr) &&
				(addr < map_va_end);
		bool end_addr_in_map = (va_end > map->addr) &&
				(va_end <= map_va_end);
		bool start_iova_in_map = (iova >= map->iova) &&
				(iova < map_iova_end);
		bool end_iova_in_map = (iova_end > map->iova) &&
				(iova_end <= map_iova_end);

		/* do we have space in temporary map? */
		if (j == dst_len) {
			ret = -ENOSPC;
			goto err;
		}
		/* check if current map is start of our segment */
		if (!found && start_addr_in_map && start_iova_in_map)
			found = true;
		/* if we have previously found a segment, add it to the map */
		if (found) {
			/* copy the segment into our temporary map */
			memcpy(&dst[j++], map, sizeof(*map));

			/* if we match end of segment, quit */
			if (end_addr_in_map && end_iova_in_map)
				return j;
		}
	}
	/* we didn't find anything */
	ret = -ENOENT;
err:
	memset(dst, 0, sizeof(*dst) * dst_len);
	return ret;
}

/* this will sort all user maps, and merge/compact any adjacent maps */
static void
compact_user_maps(struct user_mem_maps *user_mem_maps)
{
	int i;

	qsort(user_mem_maps->maps, VFIO_MAX_USER_MEM_MAPS,
			sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);

	/* we'll go over the list backwards when merging */
	for (i = VFIO_MAX_USER_MEM_MAPS - 2; i >= 0; i--) {
		struct user_mem_map *l, *r;

		l = &user_mem_maps->maps[i];
		r = &user_mem_maps->maps[i + 1];

		if (is_null_map(l) || is_null_map(r))
			continue;

		/* try and merge the maps */
		if (merge_map(l, r))
			user_mem_maps->n_maps--;
	}

	/* the entries are still sorted, but now they have holes in them, so
	 * sort the list again.
	 */
	qsort(user_mem_maps->maps, VFIO_MAX_USER_MEM_MAPS,
			sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);
}

static int
vfio_open_group_fd(int iommu_group_num)
{
	int vfio_group_fd;
	char filename[PATH_MAX];
	struct rte_mp_msg mp_req, *mp_rep;
	struct rte_mp_reply mp_reply = {0};
	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
	struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* if primary, try to open the group */
	if (internal_conf->process_type == RTE_PROC_PRIMARY) {
		/* try regular group format */
		snprintf(filename, sizeof(filename),
				VFIO_GROUP_FMT, iommu_group_num);
		vfio_group_fd = open(filename, O_RDWR);
		if (vfio_group_fd < 0) {
			/* if file not found, it's not an error */
			if (errno != ENOENT) {
				EAL_LOG(ERR, "Cannot open %s: %s",
					filename, strerror(errno));
				return -1;
			}

			/* special case: try no-IOMMU path as well */
			snprintf(filename, sizeof(filename),
					VFIO_NOIOMMU_GROUP_FMT,
					iommu_group_num);
			vfio_group_fd = open(filename, O_RDWR);
			if (vfio_group_fd < 0) {
				if (errno != ENOENT) {
					EAL_LOG(ERR, "Cannot open %s: %s",
						filename, strerror(errno));
					return -1;
				}
				return -ENOENT;
			}
			/* noiommu group found */
		}

		return vfio_group_fd;
	}
	/* if we're in a secondary process, request group fd from the primary
	 * process via mp channel.
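	 * The primary owns the group fd; rte_mp_request_sync() below hands us
	 * a duplicate of it over the EAL multi-process socket.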
	 */
	p->req = SOCKET_REQ_GROUP;
	p->group_num = iommu_group_num;
	strcpy(mp_req.name, EAL_VFIO_MP);
	mp_req.len_param = sizeof(*p);
	mp_req.num_fds = 0;

	vfio_group_fd = -1;
	if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
	    mp_reply.nb_received == 1) {
		mp_rep = &mp_reply.msgs[0];
		p = (struct vfio_mp_param *)mp_rep->param;
		if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
			vfio_group_fd = mp_rep->fds[0];
		} else if (p->result == SOCKET_NO_FD) {
			EAL_LOG(ERR, "Bad VFIO group fd");
			vfio_group_fd = -ENOENT;
		}
	}

	free(mp_reply.msgs);
	if (vfio_group_fd < 0 && vfio_group_fd != -ENOENT)
		EAL_LOG(ERR, "Cannot request VFIO group fd");
	return vfio_group_fd;
}

static struct vfio_config *
get_vfio_cfg_by_group_num(int iommu_group_num)
{
	struct vfio_config *vfio_cfg;
	int i, j;

	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
		vfio_cfg = &vfio_cfgs[i];
		for (j = 0; j < VFIO_MAX_GROUPS; j++) {
			if (vfio_cfg->vfio_groups[j].group_num ==
					iommu_group_num)
				return vfio_cfg;
		}
	}

	return NULL;
}

static int
vfio_get_group_fd(struct vfio_config *vfio_cfg,
		int iommu_group_num)
{
	int i;
	int vfio_group_fd;
	struct vfio_group *cur_grp;

	/* check if we already have the group descriptor open */
	for (i = 0; i < VFIO_MAX_GROUPS; i++)
		if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num)
			return vfio_cfg->vfio_groups[i].fd;

	/* Let's see first if there is room for a new group */
	if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) {
		EAL_LOG(ERR, "Maximum number of VFIO groups reached!");
		return -1;
	}

	/* Now let's get an index for the new group */
	for (i = 0; i < VFIO_MAX_GROUPS; i++)
		if (vfio_cfg->vfio_groups[i].group_num == -1) {
			cur_grp = &vfio_cfg->vfio_groups[i];
			break;
		}

	/* This should not happen */
	if (i == VFIO_MAX_GROUPS) {
		EAL_LOG(ERR, "No VFIO group free slot found");
		return -1;
	}

	vfio_group_fd = vfio_open_group_fd(iommu_group_num);
	if (vfio_group_fd < 0) {
		EAL_LOG(ERR, "Failed to open VFIO group %d",
			iommu_group_num);
		return vfio_group_fd;
	}

	cur_grp->group_num = iommu_group_num;
	cur_grp->fd = vfio_group_fd;
	vfio_cfg->vfio_active_groups++;

	return vfio_group_fd;
}

static struct vfio_config *
get_vfio_cfg_by_group_fd(int vfio_group_fd)
{
	struct vfio_config *vfio_cfg;
	int i, j;

	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
		vfio_cfg = &vfio_cfgs[i];
		for (j = 0; j < VFIO_MAX_GROUPS; j++)
			if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
				return vfio_cfg;
	}

	return NULL;
}

static struct vfio_config *
get_vfio_cfg_by_container_fd(int container_fd)
{
	int i;

	if (container_fd == RTE_VFIO_DEFAULT_CONTAINER_FD)
		return default_vfio_cfg;

	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
		if (vfio_cfgs[i].vfio_container_fd == container_fd)
			return &vfio_cfgs[i];
	}

	return NULL;
}

int
rte_vfio_get_group_fd(int iommu_group_num)
{
	struct vfio_config *vfio_cfg;

	/* get the vfio_config it belongs to */
	vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
	vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;

	return vfio_get_group_fd(vfio_cfg, iommu_group_num);
}

static int
get_vfio_group_idx(int vfio_group_fd)
{
	struct vfio_config *vfio_cfg;
	int i, j;

	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
		vfio_cfg = &vfio_cfgs[i];
		for (j = 0; j < VFIO_MAX_GROUPS; j++)
			if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
				return j;
	}

	return -1;
}

static void
vfio_group_device_get(int vfio_group_fd)
{
	struct vfio_config *vfio_cfg;
	int i;

	vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
	if (vfio_cfg == NULL) {
		EAL_LOG(ERR, "Invalid VFIO group fd!");
		return;
	}

	i = get_vfio_group_idx(vfio_group_fd);
	if (i < 0 || i > (VFIO_MAX_GROUPS - 1))
		EAL_LOG(ERR, "Wrong VFIO group index (%d)", i);
	else
		vfio_cfg->vfio_groups[i].devices++;
}

static void
vfio_group_device_put(int vfio_group_fd)
{
	struct vfio_config *vfio_cfg;
	int i;

	vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
	if (vfio_cfg == NULL) {
		EAL_LOG(ERR, "Invalid VFIO group fd!");
		return;
	}

	i = get_vfio_group_idx(vfio_group_fd);
	if (i < 0 || i > (VFIO_MAX_GROUPS - 1))
		EAL_LOG(ERR, "Wrong VFIO group index (%d)", i);
	else
		vfio_cfg->vfio_groups[i].devices--;
}

static int
vfio_group_device_count(int vfio_group_fd)
{
	struct vfio_config *vfio_cfg;
	int i;

	vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
	if (vfio_cfg == NULL) {
		EAL_LOG(ERR, "Invalid VFIO group fd!");
		return -1;
	}

	i = get_vfio_group_idx(vfio_group_fd);
	if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) {
"Wrong VFIO group index (%d)", i); 59999a2dd95SBruce Richardson return -1; 60099a2dd95SBruce Richardson } 60199a2dd95SBruce Richardson 60299a2dd95SBruce Richardson return vfio_cfg->vfio_groups[i].devices; 60399a2dd95SBruce Richardson } 60499a2dd95SBruce Richardson 60599a2dd95SBruce Richardson static void 60699a2dd95SBruce Richardson vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len, 60799a2dd95SBruce Richardson void *arg __rte_unused) 60899a2dd95SBruce Richardson { 60999a2dd95SBruce Richardson struct rte_memseg_list *msl; 61099a2dd95SBruce Richardson struct rte_memseg *ms; 61199a2dd95SBruce Richardson size_t cur_len = 0; 61299a2dd95SBruce Richardson 61399a2dd95SBruce Richardson msl = rte_mem_virt2memseg_list(addr); 61499a2dd95SBruce Richardson 61599a2dd95SBruce Richardson /* for IOVA as VA mode, no need to care for IOVA addresses */ 61699a2dd95SBruce Richardson if (rte_eal_iova_mode() == RTE_IOVA_VA && msl->external == 0) { 61799a2dd95SBruce Richardson uint64_t vfio_va = (uint64_t)(uintptr_t)addr; 61899a2dd95SBruce Richardson uint64_t page_sz = msl->page_sz; 61999a2dd95SBruce Richardson 62099a2dd95SBruce Richardson /* Maintain granularity of DMA map/unmap to memseg size */ 62199a2dd95SBruce Richardson for (; cur_len < len; cur_len += page_sz) { 62299a2dd95SBruce Richardson if (type == RTE_MEM_EVENT_ALLOC) 62399a2dd95SBruce Richardson vfio_dma_mem_map(default_vfio_cfg, vfio_va, 62499a2dd95SBruce Richardson vfio_va, page_sz, 1); 62599a2dd95SBruce Richardson else 62699a2dd95SBruce Richardson vfio_dma_mem_map(default_vfio_cfg, vfio_va, 62799a2dd95SBruce Richardson vfio_va, page_sz, 0); 62899a2dd95SBruce Richardson vfio_va += page_sz; 62999a2dd95SBruce Richardson } 63099a2dd95SBruce Richardson 63199a2dd95SBruce Richardson return; 63299a2dd95SBruce Richardson } 63399a2dd95SBruce Richardson 63499a2dd95SBruce Richardson /* memsegs are contiguous in memory */ 63599a2dd95SBruce Richardson ms = rte_mem_virt2memseg(addr, msl); 63699a2dd95SBruce Richardson while (cur_len < len) { 63799a2dd95SBruce Richardson /* some memory segments may have invalid IOVA */ 63899a2dd95SBruce Richardson if (ms->iova == RTE_BAD_IOVA) { 639ae67895bSDavid Marchand EAL_LOG(DEBUG, 640ae67895bSDavid Marchand "Memory segment at %p has bad IOVA, skipping", 64199a2dd95SBruce Richardson ms->addr); 64299a2dd95SBruce Richardson goto next; 64399a2dd95SBruce Richardson } 64499a2dd95SBruce Richardson if (type == RTE_MEM_EVENT_ALLOC) 64599a2dd95SBruce Richardson vfio_dma_mem_map(default_vfio_cfg, ms->addr_64, 64699a2dd95SBruce Richardson ms->iova, ms->len, 1); 64799a2dd95SBruce Richardson else 64899a2dd95SBruce Richardson vfio_dma_mem_map(default_vfio_cfg, ms->addr_64, 64999a2dd95SBruce Richardson ms->iova, ms->len, 0); 65099a2dd95SBruce Richardson next: 65199a2dd95SBruce Richardson cur_len += ms->len; 65299a2dd95SBruce Richardson ++ms; 65399a2dd95SBruce Richardson } 65499a2dd95SBruce Richardson } 65599a2dd95SBruce Richardson 65699a2dd95SBruce Richardson static int 65799a2dd95SBruce Richardson vfio_sync_default_container(void) 65899a2dd95SBruce Richardson { 65999a2dd95SBruce Richardson struct rte_mp_msg mp_req, *mp_rep; 66099a2dd95SBruce Richardson struct rte_mp_reply mp_reply = {0}; 66199a2dd95SBruce Richardson struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; 66299a2dd95SBruce Richardson struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; 66399a2dd95SBruce Richardson int iommu_type_id; 66499a2dd95SBruce Richardson unsigned int i; 66599a2dd95SBruce Richardson 66699a2dd95SBruce Richardson /* 
	/* cannot be called from primary */
	if (rte_eal_process_type() != RTE_PROC_SECONDARY)
		return -1;

	/* default container fd should have been opened in rte_vfio_enable() */
	if (!default_vfio_cfg->vfio_enabled ||
			default_vfio_cfg->vfio_container_fd < 0) {
		EAL_LOG(ERR, "VFIO support is not initialized");
		return -1;
	}

	/* find default container's IOMMU type */
	p->req = SOCKET_REQ_IOMMU_TYPE;
	strcpy(mp_req.name, EAL_VFIO_MP);
	mp_req.len_param = sizeof(*p);
	mp_req.num_fds = 0;

	iommu_type_id = -1;
	if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
	    mp_reply.nb_received == 1) {
		mp_rep = &mp_reply.msgs[0];
		p = (struct vfio_mp_param *)mp_rep->param;
		if (p->result == SOCKET_OK)
			iommu_type_id = p->iommu_type_id;
	}
	free(mp_reply.msgs);
	if (iommu_type_id < 0) {
		EAL_LOG(ERR,
			"Could not get IOMMU type for default container");
		return -1;
	}

	/* we now have an fd for default container, as well as its IOMMU type.
	 * now, set up default VFIO container config to match.
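	 * All that remains is pointing vfio_iommu_type at the matching entry
	 * in iommu_types[]; the container fd itself was already inherited.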
	 */
	for (i = 0; i < RTE_DIM(iommu_types); i++) {
		const struct vfio_iommu_type *t = &iommu_types[i];
		if (t->type_id != iommu_type_id)
			continue;

		/* we found our IOMMU type */
		default_vfio_cfg->vfio_iommu_type = t;

		return 0;
	}
	EAL_LOG(ERR, "Could not find IOMMU type id (%i)",
		iommu_type_id);
	return -1;
}

int
rte_vfio_clear_group(int vfio_group_fd)
{
	int i;
	struct vfio_config *vfio_cfg;

	vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
	if (vfio_cfg == NULL) {
		EAL_LOG(ERR, "Invalid VFIO group fd!");
		return -1;
	}

	i = get_vfio_group_idx(vfio_group_fd);
	if (i < 0)
		return -1;
	vfio_cfg->vfio_groups[i].group_num = -1;
	vfio_cfg->vfio_groups[i].fd = -1;
	vfio_cfg->vfio_groups[i].devices = 0;
	vfio_cfg->vfio_active_groups--;

	return 0;
}

int
rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
		int *vfio_dev_fd, struct vfio_device_info *device_info)
{
	struct vfio_group_status group_status = {
			.argsz = sizeof(group_status)
	};
	struct vfio_config *vfio_cfg;
	struct user_mem_maps *user_mem_maps;
	int vfio_container_fd;
	int vfio_group_fd;
	int iommu_group_num;
	rte_uuid_t vf_token;
	int i, ret;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* get group number */
	ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num);
	if (ret == 0) {
		EAL_LOG(NOTICE,
			"%s not managed by VFIO driver, skipping",
			dev_addr);
		return 1;
	}

	/* if negative, something failed */
	if (ret < 0)
		return -1;

	/* get the actual group fd */
	vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num);
	if (vfio_group_fd < 0 && vfio_group_fd != -ENOENT)
		return -1;

	/*
	 * if vfio_group_fd == -ENOENT, that means the device
	 * isn't managed by VFIO
	 */
	if (vfio_group_fd == -ENOENT) {
		EAL_LOG(NOTICE,
			"%s not managed by VFIO driver, skipping",
			dev_addr);
		return 1;
	}

	/* check that the group is viable (meaning, all devices in the IOMMU
	 * group are either bound to VFIO or not bound to anything)
	 */
	ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status);
	if (ret) {
		EAL_LOG(ERR, "%s cannot get VFIO group status, "
			"error %i (%s)", dev_addr, errno, strerror(errno));
		close(vfio_group_fd);
		rte_vfio_clear_group(vfio_group_fd);
		return -1;
	} else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
		EAL_LOG(ERR, "%s VFIO group is not viable! "
			"Not all devices in IOMMU group bound to VFIO or unbound",
			dev_addr);
		close(vfio_group_fd);
		rte_vfio_clear_group(vfio_group_fd);
		return -1;
	}

	/* get the vfio_config it belongs to */
	vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
	vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
	vfio_container_fd = vfio_cfg->vfio_container_fd;
	user_mem_maps = &vfio_cfg->mem_maps;

	/* check if group does not have a container yet */
	if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) {

		/* add group to a container */
		ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER,
				&vfio_container_fd);
		if (ret) {
			EAL_LOG(ERR,
				"%s cannot add VFIO group to container, error "
				"%i (%s)", dev_addr, errno, strerror(errno));
			close(vfio_group_fd);
			rte_vfio_clear_group(vfio_group_fd);
			return -1;
		}

		/*
		 * pick an IOMMU type and set up DMA mappings for container
		 *
		 * needs to be done only once, only when first group is
		 * assigned to a container and only in primary process.
		 * Note this can happen several times with the hotplug
		 * functionality.
		 */
		if (internal_conf->process_type == RTE_PROC_PRIMARY &&
				vfio_cfg->vfio_active_groups == 1 &&
				vfio_group_device_count(vfio_group_fd) == 0) {
			const struct vfio_iommu_type *t;

			/* select an IOMMU type which we will be using */
			t = vfio_set_iommu_type(vfio_container_fd);
			if (!t) {
				EAL_LOG(ERR,
					"%s failed to select IOMMU type",
					dev_addr);
				close(vfio_group_fd);
				rte_vfio_clear_group(vfio_group_fd);
				return -1;
			}
			/* lock memory hotplug before mapping and release it
			 * after registering callback, to prevent races
			 */
			rte_mcfg_mem_read_lock();
			if (vfio_cfg == default_vfio_cfg)
				ret = t->dma_map_func(vfio_container_fd);
			else
				ret = 0;
			if (ret) {
				EAL_LOG(ERR,
					"%s DMA remapping failed, error "
					"%i (%s)",
					dev_addr, errno, strerror(errno));
				close(vfio_group_fd);
				rte_vfio_clear_group(vfio_group_fd);
				rte_mcfg_mem_read_unlock();
				return -1;
			}

			vfio_cfg->vfio_iommu_type = t;

			/* re-map all user-mapped segments */
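			/* hold the recursive lock so a concurrent user DMA
			 * map/unmap request cannot modify the list while we
			 * replay it into this container
			 */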
			rte_spinlock_recursive_lock(&user_mem_maps->lock);

			/* this IOMMU type may not support DMA mapping, but
			 * if we have mappings in the list - that means we have
			 * previously mapped something successfully, so we can
			 * be sure that DMA mapping is supported.
			 */
			for (i = 0; i < user_mem_maps->n_maps; i++) {
				struct user_mem_map *map;
				map = &user_mem_maps->maps[i];

				ret = t->dma_user_map_func(
						vfio_container_fd,
						map->addr, map->iova, map->len,
						1);
				if (ret) {
					EAL_LOG(ERR, "Couldn't map user memory for DMA: "
						"va: 0x%" PRIx64 " "
						"iova: 0x%" PRIx64 " "
						"len: 0x%" PRIx64,
						map->addr, map->iova,
						map->len);
					rte_spinlock_recursive_unlock(
							&user_mem_maps->lock);
					rte_mcfg_mem_read_unlock();
					return -1;
				}
			}
			rte_spinlock_recursive_unlock(&user_mem_maps->lock);

			/* register callback for mem events */
			if (vfio_cfg == default_vfio_cfg)
				ret = rte_mem_event_callback_register(
					VFIO_MEM_EVENT_CLB_NAME,
					vfio_mem_event_callback, NULL);
			else
				ret = 0;
			/* unlock memory hotplug */
			rte_mcfg_mem_read_unlock();

			if (ret && rte_errno != ENOTSUP) {
				EAL_LOG(ERR, "Could not install memory event callback for VFIO");
				return -1;
			}
			if (ret)
				EAL_LOG(DEBUG, "Memory event callbacks not supported");
			else
				EAL_LOG(DEBUG, "Installed memory event callback for VFIO");
		}
	} else if (rte_eal_process_type() != RTE_PROC_PRIMARY &&
			vfio_cfg == default_vfio_cfg &&
			vfio_cfg->vfio_iommu_type == NULL) {
		/* if we're not a primary process, we do not set up the VFIO
		 * container because it's already been set up by the primary
		 * process. instead, we simply ask the primary about VFIO type
		 * we are using, and set the VFIO config up appropriately.
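		 * (this is the mp request implemented by
		 * vfio_sync_default_container() above)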
		 */
		ret = vfio_sync_default_container();
		if (ret < 0) {
			EAL_LOG(ERR, "Could not sync default VFIO container");
			close(vfio_group_fd);
			rte_vfio_clear_group(vfio_group_fd);
			return -1;
		}
		/* we have successfully initialized VFIO, notify user */
		const struct vfio_iommu_type *t =
				default_vfio_cfg->vfio_iommu_type;
		EAL_LOG(INFO, "Using IOMMU type %d (%s)",
			t->type_id, t->name);
	}

	rte_eal_vfio_get_vf_token(vf_token);

	/* get a file descriptor for the device with VF token firstly */
	if (!rte_uuid_is_null(vf_token)) {
		char vf_token_str[RTE_UUID_STRLEN];
		char dev[PATH_MAX];

		rte_uuid_unparse(vf_token, vf_token_str, sizeof(vf_token_str));
		snprintf(dev, sizeof(dev),
			 "%s vf_token=%s", dev_addr, vf_token_str);

		*vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD,
				     dev);
		if (*vfio_dev_fd >= 0)
			goto dev_get_info;
	}

	/* get a file descriptor for the device */
	*vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr);
	if (*vfio_dev_fd < 0) {
		/* if we cannot get a device fd, this implies a problem with
		 * the VFIO group or the container not having IOMMU configured.
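		 * (when the kernel driver requires a VF token, the plain
		 * request above, without "vf_token=", may fail as well)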
		 */

		EAL_LOG(WARNING, "Getting a vfio_dev_fd for %s failed",
			dev_addr);
		close(vfio_group_fd);
		rte_vfio_clear_group(vfio_group_fd);
		return -1;
	}

	/* test and setup the device */
dev_get_info:
	ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
	if (ret) {
		EAL_LOG(ERR, "%s cannot get device info, "
			"error %i (%s)", dev_addr, errno,
			strerror(errno));
		close(*vfio_dev_fd);
		close(vfio_group_fd);
		rte_vfio_clear_group(vfio_group_fd);
		return -1;
	}
	vfio_group_device_get(vfio_group_fd);

	return 0;
}

int
rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
		int vfio_dev_fd)
{
	struct vfio_config *vfio_cfg;
	int vfio_group_fd;
	int iommu_group_num;
	int ret;

	/* we don't want any DMA mapping messages to come while we're detaching
	 * VFIO device, because this might be the last device and we might need
	 * to unregister the callback.
	 */
	rte_mcfg_mem_read_lock();

	/* get group number */
	ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num);
	if (ret <= 0) {
		EAL_LOG(WARNING, "%s not managed by VFIO driver",
			dev_addr);
		/* This is an error at this point. */
		ret = -1;
		goto out;
	}

	/* get the actual group fd */
	vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num);
	if (vfio_group_fd < 0) {
		EAL_LOG(INFO, "rte_vfio_get_group_fd failed for %s",
			dev_addr);
		ret = vfio_group_fd;
		goto out;
	}

	/* get the vfio_config it belongs to */
	vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
	vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;

	/* At this point we have an active group. Closing it will trigger
	 * container detachment.
	 * If this is the last active group, the VFIO
	 * kernel code will unset the container and the IOMMU mappings.
	 */

	/* Closing a device */
	if (close(vfio_dev_fd) < 0) {
		EAL_LOG(INFO, "Error when closing vfio_dev_fd for %s",
			dev_addr);
		ret = -1;
		goto out;
	}

	/* A VFIO group can have several devices attached. The group should
	 * only be closed when no devices remain.
	 */
	vfio_group_device_put(vfio_group_fd);
	if (!vfio_group_device_count(vfio_group_fd)) {

		if (close(vfio_group_fd) < 0) {
			EAL_LOG(INFO, "Error when closing vfio_group_fd for %s",
				dev_addr);
			ret = -1;
			goto out;
		}

		if (rte_vfio_clear_group(vfio_group_fd) < 0) {
			EAL_LOG(INFO, "Error when clearing group for %s",
				dev_addr);
			ret = -1;
			goto out;
		}
	}

	/* if there are no active device groups, unregister the callback to
	 * avoid spurious attempts to map/unmap memory from VFIO.
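	 * (the callback was registered in rte_vfio_setup_device() when the
	 * first group was attached to the default container)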
	 */
	if (vfio_cfg == default_vfio_cfg && vfio_cfg->vfio_active_groups == 0 &&
			rte_eal_process_type() != RTE_PROC_SECONDARY)
		rte_mem_event_callback_unregister(VFIO_MEM_EVENT_CLB_NAME,
				NULL);

	/* success */
	ret = 0;

out:
	rte_mcfg_mem_read_unlock();
	return ret;
}

int
rte_vfio_enable(const char *modname)
{
	/* initialize group list */
	int i, j;
	int vfio_available;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	rte_spinlock_recursive_t lock = RTE_SPINLOCK_RECURSIVE_INITIALIZER;

	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
		vfio_cfgs[i].vfio_container_fd = -1;
		vfio_cfgs[i].vfio_active_groups = 0;
		vfio_cfgs[i].vfio_iommu_type = NULL;
		vfio_cfgs[i].mem_maps.lock = lock;

		for (j = 0; j < VFIO_MAX_GROUPS; j++) {
			vfio_cfgs[i].vfio_groups[j].fd = -1;
			vfio_cfgs[i].vfio_groups[j].group_num = -1;
			vfio_cfgs[i].vfio_groups[j].devices = 0;
		}
	}

	EAL_LOG(DEBUG, "Probing VFIO support...");

	/* check if vfio module is loaded */
	vfio_available = rte_eal_check_module(modname);

	/* return error directly */
	if (vfio_available == -1) {
		EAL_LOG(INFO, "Could not get loaded module details!");
		return -1;
	}

	/* return 0 if VFIO modules not loaded */
	if (vfio_available == 0) {
		EAL_LOG(DEBUG,
			"VFIO modules not loaded, skipping VFIO support...");
		return 0;
	}

	if (internal_conf->process_type == RTE_PROC_PRIMARY) {
		if (vfio_mp_sync_setup() == -1) {
			default_vfio_cfg->vfio_container_fd = -1;
		} else {
			/* open a new container */
			default_vfio_cfg->vfio_container_fd =
					rte_vfio_get_container_fd();
		}
	} else {
		/* get the default container from the primary process */
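		/* vfio_get_default_container_fd() requests the fd from the
		 * primary over the mp channel; see its definition below
		 */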
default_vfio_cfg->vfio_container_fd = 113299a2dd95SBruce Richardson vfio_get_default_container_fd(); 113399a2dd95SBruce Richardson } 113499a2dd95SBruce Richardson 113599a2dd95SBruce Richardson /* check if we have VFIO driver enabled */ 113699a2dd95SBruce Richardson if (default_vfio_cfg->vfio_container_fd != -1) { 1137ae67895bSDavid Marchand EAL_LOG(INFO, "VFIO support initialized"); 113899a2dd95SBruce Richardson default_vfio_cfg->vfio_enabled = 1; 113999a2dd95SBruce Richardson } else { 1140ae67895bSDavid Marchand EAL_LOG(NOTICE, "VFIO support could not be initialized"); 114199a2dd95SBruce Richardson } 114299a2dd95SBruce Richardson 114399a2dd95SBruce Richardson return 0; 114499a2dd95SBruce Richardson } 114599a2dd95SBruce Richardson 114699a2dd95SBruce Richardson int 114799a2dd95SBruce Richardson rte_vfio_is_enabled(const char *modname) 114899a2dd95SBruce Richardson { 114999a2dd95SBruce Richardson const int mod_available = rte_eal_check_module(modname) > 0; 115099a2dd95SBruce Richardson return default_vfio_cfg->vfio_enabled && mod_available; 115199a2dd95SBruce Richardson } 115299a2dd95SBruce Richardson 115399a2dd95SBruce Richardson int 115499a2dd95SBruce Richardson vfio_get_default_container_fd(void) 115599a2dd95SBruce Richardson { 115699a2dd95SBruce Richardson struct rte_mp_msg mp_req, *mp_rep; 115799a2dd95SBruce Richardson struct rte_mp_reply mp_reply = {0}; 115899a2dd95SBruce Richardson struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; 115999a2dd95SBruce Richardson struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; 116099a2dd95SBruce Richardson int container_fd; 116199a2dd95SBruce Richardson const struct internal_config *internal_conf = 116299a2dd95SBruce Richardson eal_get_internal_configuration(); 116399a2dd95SBruce Richardson 116499a2dd95SBruce Richardson if (default_vfio_cfg->vfio_enabled) 116599a2dd95SBruce Richardson return default_vfio_cfg->vfio_container_fd; 116699a2dd95SBruce Richardson 116799a2dd95SBruce Richardson if (internal_conf->process_type == RTE_PROC_PRIMARY) { 116899a2dd95SBruce Richardson /* if we were secondary process we would try requesting 116999a2dd95SBruce Richardson * container fd from the primary, but we're the primary 117099a2dd95SBruce Richardson * process so just exit here 117199a2dd95SBruce Richardson */ 117299a2dd95SBruce Richardson return -1; 117399a2dd95SBruce Richardson } 117499a2dd95SBruce Richardson 117599a2dd95SBruce Richardson p->req = SOCKET_REQ_DEFAULT_CONTAINER; 117699a2dd95SBruce Richardson strcpy(mp_req.name, EAL_VFIO_MP); 117799a2dd95SBruce Richardson mp_req.len_param = sizeof(*p); 117899a2dd95SBruce Richardson mp_req.num_fds = 0; 117999a2dd95SBruce Richardson 118099a2dd95SBruce Richardson if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && 118199a2dd95SBruce Richardson mp_reply.nb_received == 1) { 118299a2dd95SBruce Richardson mp_rep = &mp_reply.msgs[0]; 118399a2dd95SBruce Richardson p = (struct vfio_mp_param *)mp_rep->param; 118499a2dd95SBruce Richardson if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { 118599a2dd95SBruce Richardson container_fd = mp_rep->fds[0]; 118699a2dd95SBruce Richardson free(mp_reply.msgs); 118799a2dd95SBruce Richardson return container_fd; 118899a2dd95SBruce Richardson } 118999a2dd95SBruce Richardson } 119099a2dd95SBruce Richardson 119199a2dd95SBruce Richardson free(mp_reply.msgs); 1192ae67895bSDavid Marchand EAL_LOG(ERR, "Cannot request default VFIO container fd"); 119399a2dd95SBruce Richardson return -1; 119499a2dd95SBruce Richardson } 119599a2dd95SBruce Richardson 119699a2dd95SBruce Richardson 
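/*
 * Illustrative sketch (not part of the original file): how the default
 * container fd reaches a secondary process. rte_mp passes the fd as
 * ancillary data over a UNIX domain socket, so the value received is a
 * process-local duplicate of the fd the primary holds open:
 *
 *	rte_vfio_enable("vfio");		// during EAL init, secondary
 *	  -> vfio_get_default_container_fd()	// defined below
 *	    -> rte_mp_request_sync(...)		// fd arrives in mp_rep->fds[0]
 */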
int 119799a2dd95SBruce Richardson vfio_get_iommu_type(void) 119899a2dd95SBruce Richardson { 119999a2dd95SBruce Richardson if (default_vfio_cfg->vfio_iommu_type == NULL) 120099a2dd95SBruce Richardson return -1; 120199a2dd95SBruce Richardson 120299a2dd95SBruce Richardson return default_vfio_cfg->vfio_iommu_type->type_id; 120399a2dd95SBruce Richardson } 120499a2dd95SBruce Richardson 120599a2dd95SBruce Richardson const struct vfio_iommu_type * 120699a2dd95SBruce Richardson vfio_set_iommu_type(int vfio_container_fd) 120799a2dd95SBruce Richardson { 120899a2dd95SBruce Richardson unsigned idx; 120999a2dd95SBruce Richardson for (idx = 0; idx < RTE_DIM(iommu_types); idx++) { 121099a2dd95SBruce Richardson const struct vfio_iommu_type *t = &iommu_types[idx]; 121199a2dd95SBruce Richardson 121299a2dd95SBruce Richardson int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU, 121399a2dd95SBruce Richardson t->type_id); 121499a2dd95SBruce Richardson if (!ret) { 1215ae67895bSDavid Marchand EAL_LOG(INFO, "Using IOMMU type %d (%s)", 121699a2dd95SBruce Richardson t->type_id, t->name); 121799a2dd95SBruce Richardson return t; 121899a2dd95SBruce Richardson } 121999a2dd95SBruce Richardson /* not an error, there may be more supported IOMMU types */ 1220ae67895bSDavid Marchand EAL_LOG(DEBUG, "Set IOMMU type %d (%s) failed, error " 1221ae67895bSDavid Marchand "%i (%s)", t->type_id, t->name, errno, 122299a2dd95SBruce Richardson strerror(errno)); 122399a2dd95SBruce Richardson } 122499a2dd95SBruce Richardson /* if we didn't find a suitable IOMMU type, fail */ 122599a2dd95SBruce Richardson return NULL; 122699a2dd95SBruce Richardson } 122799a2dd95SBruce Richardson 122899a2dd95SBruce Richardson int 1229de2d364fSMingjin Ye rte_vfio_get_device_info(const char *sysfs_base, const char *dev_addr, 1230de2d364fSMingjin Ye int *vfio_dev_fd, struct vfio_device_info *device_info) 1231de2d364fSMingjin Ye { 1232de2d364fSMingjin Ye int ret; 1233de2d364fSMingjin Ye 1234de2d364fSMingjin Ye if (device_info == NULL || *vfio_dev_fd < 0) 1235de2d364fSMingjin Ye return -1; 1236de2d364fSMingjin Ye 1237de2d364fSMingjin Ye if (*vfio_dev_fd == 0) { 1238de2d364fSMingjin Ye ret = rte_vfio_setup_device(sysfs_base, dev_addr, 1239de2d364fSMingjin Ye vfio_dev_fd, device_info); 1240de2d364fSMingjin Ye if (ret) 1241de2d364fSMingjin Ye return -1; 1242de2d364fSMingjin Ye } else { 1243de2d364fSMingjin Ye ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info); 1244de2d364fSMingjin Ye if (ret) { 1245de2d364fSMingjin Ye EAL_LOG(ERR, "%s cannot get device info, error %i (%s)", 1246de2d364fSMingjin Ye dev_addr, errno, strerror(errno)); 1247de2d364fSMingjin Ye return -1; 1248de2d364fSMingjin Ye } 1249de2d364fSMingjin Ye } 1250de2d364fSMingjin Ye 1251de2d364fSMingjin Ye return 0; 1252de2d364fSMingjin Ye } 1253de2d364fSMingjin Ye 1254de2d364fSMingjin Ye int 125599a2dd95SBruce Richardson vfio_has_supported_extensions(int vfio_container_fd) 125699a2dd95SBruce Richardson { 125799a2dd95SBruce Richardson int ret; 125899a2dd95SBruce Richardson unsigned idx, n_extensions = 0; 125999a2dd95SBruce Richardson for (idx = 0; idx < RTE_DIM(iommu_types); idx++) { 126099a2dd95SBruce Richardson const struct vfio_iommu_type *t = &iommu_types[idx]; 126199a2dd95SBruce Richardson 126299a2dd95SBruce Richardson ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION, 126399a2dd95SBruce Richardson t->type_id); 126499a2dd95SBruce Richardson if (ret < 0) { 1265ae67895bSDavid Marchand EAL_LOG(ERR, "Could not get IOMMU type, error " 1266ae67895bSDavid Marchand "%i (%s)", errno, strerror(errno)); 
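			/* an ioctl error here (as opposed to a clean "not
			 * supported" result) leaves the container in an
			 * unknown state, so drop it instead of probing the
			 * remaining IOMMU types.
			 */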
126799a2dd95SBruce Richardson close(vfio_container_fd); 126899a2dd95SBruce Richardson return -1; 126999a2dd95SBruce Richardson } else if (ret == 1) { 127099a2dd95SBruce Richardson /* we found a supported extension */ 127199a2dd95SBruce Richardson n_extensions++; 127299a2dd95SBruce Richardson } 1273ae67895bSDavid Marchand EAL_LOG(DEBUG, "IOMMU type %d (%s) is %s", 127499a2dd95SBruce Richardson t->type_id, t->name, 127599a2dd95SBruce Richardson ret ? "supported" : "not supported"); 127699a2dd95SBruce Richardson } 127799a2dd95SBruce Richardson 127899a2dd95SBruce Richardson /* if we didn't find any supported IOMMU types, fail */ 127999a2dd95SBruce Richardson if (!n_extensions) { 128099a2dd95SBruce Richardson close(vfio_container_fd); 128199a2dd95SBruce Richardson return -1; 128299a2dd95SBruce Richardson } 128399a2dd95SBruce Richardson 128499a2dd95SBruce Richardson return 0; 128599a2dd95SBruce Richardson } 128699a2dd95SBruce Richardson 128799a2dd95SBruce Richardson int 128899a2dd95SBruce Richardson rte_vfio_get_container_fd(void) 128999a2dd95SBruce Richardson { 129099a2dd95SBruce Richardson int ret, vfio_container_fd; 129199a2dd95SBruce Richardson struct rte_mp_msg mp_req, *mp_rep; 129299a2dd95SBruce Richardson struct rte_mp_reply mp_reply = {0}; 129399a2dd95SBruce Richardson struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; 129499a2dd95SBruce Richardson struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; 129599a2dd95SBruce Richardson const struct internal_config *internal_conf = 129699a2dd95SBruce Richardson eal_get_internal_configuration(); 129799a2dd95SBruce Richardson 129899a2dd95SBruce Richardson 129999a2dd95SBruce Richardson /* if we're in a primary process, try to open the container */ 130099a2dd95SBruce Richardson if (internal_conf->process_type == RTE_PROC_PRIMARY) { 130199a2dd95SBruce Richardson vfio_container_fd = open(VFIO_CONTAINER_PATH, O_RDWR); 130299a2dd95SBruce Richardson if (vfio_container_fd < 0) { 1303ae67895bSDavid Marchand EAL_LOG(ERR, 130499a2dd95SBruce Richardson "Cannot open VFIO container %s, error " 1305ae67895bSDavid Marchand "%i (%s)", VFIO_CONTAINER_PATH, 130699a2dd95SBruce Richardson errno, strerror(errno)); 130799a2dd95SBruce Richardson return -1; 130899a2dd95SBruce Richardson } 130999a2dd95SBruce Richardson 131099a2dd95SBruce Richardson /* check VFIO API version */ 131199a2dd95SBruce Richardson ret = ioctl(vfio_container_fd, VFIO_GET_API_VERSION); 131299a2dd95SBruce Richardson if (ret != VFIO_API_VERSION) { 131399a2dd95SBruce Richardson if (ret < 0) 1314ae67895bSDavid Marchand EAL_LOG(ERR, 131599a2dd95SBruce Richardson "Could not get VFIO API version, error " 1316ae67895bSDavid Marchand "%i (%s)", errno, strerror(errno)); 131799a2dd95SBruce Richardson else 1318ae67895bSDavid Marchand EAL_LOG(ERR, "Unsupported VFIO API version!"); 131999a2dd95SBruce Richardson close(vfio_container_fd); 132099a2dd95SBruce Richardson return -1; 132199a2dd95SBruce Richardson } 132299a2dd95SBruce Richardson 132399a2dd95SBruce Richardson ret = vfio_has_supported_extensions(vfio_container_fd); 132499a2dd95SBruce Richardson if (ret) { 1325ae67895bSDavid Marchand EAL_LOG(ERR, 1326ae67895bSDavid Marchand "No supported IOMMU extensions found!"); 132799a2dd95SBruce Richardson return -1; 132899a2dd95SBruce Richardson } 132999a2dd95SBruce Richardson 133099a2dd95SBruce Richardson return vfio_container_fd; 133199a2dd95SBruce Richardson } 133299a2dd95SBruce Richardson /* 133399a2dd95SBruce Richardson * if we're in a secondary process, request container fd from the 133499a2dd95SBruce 
Richardson * primary process via mp channel 133599a2dd95SBruce Richardson */ 133699a2dd95SBruce Richardson p->req = SOCKET_REQ_CONTAINER; 133799a2dd95SBruce Richardson strcpy(mp_req.name, EAL_VFIO_MP); 133899a2dd95SBruce Richardson mp_req.len_param = sizeof(*p); 133999a2dd95SBruce Richardson mp_req.num_fds = 0; 134099a2dd95SBruce Richardson 134199a2dd95SBruce Richardson vfio_container_fd = -1; 134299a2dd95SBruce Richardson if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && 134399a2dd95SBruce Richardson mp_reply.nb_received == 1) { 134499a2dd95SBruce Richardson mp_rep = &mp_reply.msgs[0]; 134599a2dd95SBruce Richardson p = (struct vfio_mp_param *)mp_rep->param; 134699a2dd95SBruce Richardson if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { 134799a2dd95SBruce Richardson vfio_container_fd = mp_rep->fds[0]; 134899a2dd95SBruce Richardson free(mp_reply.msgs); 134999a2dd95SBruce Richardson return vfio_container_fd; 135099a2dd95SBruce Richardson } 135199a2dd95SBruce Richardson } 135299a2dd95SBruce Richardson 135399a2dd95SBruce Richardson free(mp_reply.msgs); 1354ae67895bSDavid Marchand EAL_LOG(ERR, "Cannot request VFIO container fd"); 135599a2dd95SBruce Richardson return -1; 135699a2dd95SBruce Richardson } 135799a2dd95SBruce Richardson 135899a2dd95SBruce Richardson int 135999a2dd95SBruce Richardson rte_vfio_get_group_num(const char *sysfs_base, 136099a2dd95SBruce Richardson const char *dev_addr, int *iommu_group_num) 136199a2dd95SBruce Richardson { 136299a2dd95SBruce Richardson char linkname[PATH_MAX]; 136399a2dd95SBruce Richardson char filename[PATH_MAX]; 136499a2dd95SBruce Richardson char *tok[16], *group_tok, *end; 136599a2dd95SBruce Richardson int ret; 136699a2dd95SBruce Richardson 136799a2dd95SBruce Richardson memset(linkname, 0, sizeof(linkname)); 136899a2dd95SBruce Richardson memset(filename, 0, sizeof(filename)); 136999a2dd95SBruce Richardson 137099a2dd95SBruce Richardson /* try to find out IOMMU group for this device */ 137199a2dd95SBruce Richardson snprintf(linkname, sizeof(linkname), 137299a2dd95SBruce Richardson "%s/%s/iommu_group", sysfs_base, dev_addr); 137399a2dd95SBruce Richardson 137499a2dd95SBruce Richardson ret = readlink(linkname, filename, sizeof(filename)); 137599a2dd95SBruce Richardson 137699a2dd95SBruce Richardson /* if the link doesn't exist, no VFIO for us */ 137799a2dd95SBruce Richardson if (ret < 0) 137899a2dd95SBruce Richardson return 0; 137999a2dd95SBruce Richardson 138099a2dd95SBruce Richardson ret = rte_strsplit(filename, sizeof(filename), 138199a2dd95SBruce Richardson tok, RTE_DIM(tok), '/'); 138299a2dd95SBruce Richardson 138399a2dd95SBruce Richardson if (ret <= 0) { 1384ae67895bSDavid Marchand EAL_LOG(ERR, "%s cannot get IOMMU group", dev_addr); 138599a2dd95SBruce Richardson return -1; 138699a2dd95SBruce Richardson } 138799a2dd95SBruce Richardson 138899a2dd95SBruce Richardson /* IOMMU group is always the last token */ 138999a2dd95SBruce Richardson errno = 0; 139099a2dd95SBruce Richardson group_tok = tok[ret - 1]; 139199a2dd95SBruce Richardson end = group_tok; 139299a2dd95SBruce Richardson *iommu_group_num = strtol(group_tok, &end, 10); 139399a2dd95SBruce Richardson if ((end != group_tok && *end != '\0') || errno != 0) { 1394ae67895bSDavid Marchand EAL_LOG(ERR, "%s error parsing IOMMU number!", dev_addr); 139599a2dd95SBruce Richardson return -1; 139699a2dd95SBruce Richardson } 139799a2dd95SBruce Richardson 139899a2dd95SBruce Richardson return 1; 139999a2dd95SBruce Richardson } 140099a2dd95SBruce Richardson 140199a2dd95SBruce Richardson static int 
140299a2dd95SBruce Richardson type1_map(const struct rte_memseg_list *msl, const struct rte_memseg *ms, 140399a2dd95SBruce Richardson void *arg) 140499a2dd95SBruce Richardson { 140599a2dd95SBruce Richardson int *vfio_container_fd = arg; 140699a2dd95SBruce Richardson 140799a2dd95SBruce Richardson /* skip external memory that isn't a heap */ 140899a2dd95SBruce Richardson if (msl->external && !msl->heap) 140999a2dd95SBruce Richardson return 0; 141099a2dd95SBruce Richardson 141199a2dd95SBruce Richardson /* skip any segments with invalid IOVA addresses */ 141299a2dd95SBruce Richardson if (ms->iova == RTE_BAD_IOVA) 141399a2dd95SBruce Richardson return 0; 141499a2dd95SBruce Richardson 141599a2dd95SBruce Richardson return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova, 141699a2dd95SBruce Richardson ms->len, 1); 141799a2dd95SBruce Richardson } 141899a2dd95SBruce Richardson 141999a2dd95SBruce Richardson static int 142099a2dd95SBruce Richardson vfio_type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, 142199a2dd95SBruce Richardson uint64_t len, int do_map) 142299a2dd95SBruce Richardson { 142399a2dd95SBruce Richardson struct vfio_iommu_type1_dma_map dma_map; 142499a2dd95SBruce Richardson struct vfio_iommu_type1_dma_unmap dma_unmap; 142599a2dd95SBruce Richardson int ret; 142699a2dd95SBruce Richardson 142799a2dd95SBruce Richardson if (do_map != 0) { 142899a2dd95SBruce Richardson memset(&dma_map, 0, sizeof(dma_map)); 142999a2dd95SBruce Richardson dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); 143099a2dd95SBruce Richardson dma_map.vaddr = vaddr; 143199a2dd95SBruce Richardson dma_map.size = len; 143299a2dd95SBruce Richardson dma_map.iova = iova; 143399a2dd95SBruce Richardson dma_map.flags = VFIO_DMA_MAP_FLAG_READ | 143499a2dd95SBruce Richardson VFIO_DMA_MAP_FLAG_WRITE; 143599a2dd95SBruce Richardson 143699a2dd95SBruce Richardson ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); 143799a2dd95SBruce Richardson if (ret) { 143899a2dd95SBruce Richardson /** 143999a2dd95SBruce Richardson * In case the mapping was already done EEXIST will be 144099a2dd95SBruce Richardson * returned from kernel. 
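			 * This is not fatal: it can happen legitimately,
			 * e.g. if segments already mapped on behalf of one
			 * device are walked again for another device sharing
			 * the same container.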
144199a2dd95SBruce Richardson */ 144299a2dd95SBruce Richardson if (errno == EEXIST) { 1443ae67895bSDavid Marchand EAL_LOG(DEBUG, 144499a2dd95SBruce Richardson "Memory segment is already mapped, skipping"); 144599a2dd95SBruce Richardson } else { 1446ae67895bSDavid Marchand EAL_LOG(ERR, 144799a2dd95SBruce Richardson "Cannot set up DMA remapping, error " 1448ae67895bSDavid Marchand "%i (%s)", errno, strerror(errno)); 144999a2dd95SBruce Richardson return -1; 145099a2dd95SBruce Richardson } 145199a2dd95SBruce Richardson } 145299a2dd95SBruce Richardson } else { 145399a2dd95SBruce Richardson memset(&dma_unmap, 0, sizeof(dma_unmap)); 145499a2dd95SBruce Richardson dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap); 145599a2dd95SBruce Richardson dma_unmap.size = len; 145699a2dd95SBruce Richardson dma_unmap.iova = iova; 145799a2dd95SBruce Richardson 145899a2dd95SBruce Richardson ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA, 145999a2dd95SBruce Richardson &dma_unmap); 146099a2dd95SBruce Richardson if (ret) { 1461ae67895bSDavid Marchand EAL_LOG(ERR, "Cannot clear DMA remapping, error " 1462ae67895bSDavid Marchand "%i (%s)", errno, strerror(errno)); 146399a2dd95SBruce Richardson return -1; 146499a2dd95SBruce Richardson } else if (dma_unmap.size != len) { 1465ae67895bSDavid Marchand EAL_LOG(ERR, "Unexpected size %"PRIu64 1466ae67895bSDavid Marchand " of DMA remapping cleared instead of %"PRIu64, 146799a2dd95SBruce Richardson (uint64_t)dma_unmap.size, len); 146899a2dd95SBruce Richardson rte_errno = EIO; 146999a2dd95SBruce Richardson return -1; 147099a2dd95SBruce Richardson } 147199a2dd95SBruce Richardson } 147299a2dd95SBruce Richardson 147399a2dd95SBruce Richardson return 0; 147499a2dd95SBruce Richardson } 147599a2dd95SBruce Richardson 147699a2dd95SBruce Richardson static int 147799a2dd95SBruce Richardson vfio_type1_dma_map(int vfio_container_fd) 147899a2dd95SBruce Richardson { 147999a2dd95SBruce Richardson return rte_memseg_walk(type1_map, &vfio_container_fd); 148099a2dd95SBruce Richardson } 148199a2dd95SBruce Richardson 148299a2dd95SBruce Richardson /* Track the size of the statically allocated DMA window for SPAPR */ 148399a2dd95SBruce Richardson uint64_t spapr_dma_win_len; 148499a2dd95SBruce Richardson uint64_t spapr_dma_win_page_sz; 148599a2dd95SBruce Richardson 148699a2dd95SBruce Richardson static int 148799a2dd95SBruce Richardson vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, 148899a2dd95SBruce Richardson uint64_t len, int do_map) 148999a2dd95SBruce Richardson { 149099a2dd95SBruce Richardson struct vfio_iommu_spapr_register_memory reg = { 149199a2dd95SBruce Richardson .argsz = sizeof(reg), 149299a2dd95SBruce Richardson .vaddr = (uintptr_t) vaddr, 149399a2dd95SBruce Richardson .size = len, 149499a2dd95SBruce Richardson .flags = 0 149599a2dd95SBruce Richardson }; 149699a2dd95SBruce Richardson int ret; 149799a2dd95SBruce Richardson 149899a2dd95SBruce Richardson if (do_map != 0) { 149999a2dd95SBruce Richardson struct vfio_iommu_type1_dma_map dma_map; 150099a2dd95SBruce Richardson 150199a2dd95SBruce Richardson if (iova + len > spapr_dma_win_len) { 1502ae67895bSDavid Marchand EAL_LOG(ERR, "DMA map attempt outside DMA window"); 150399a2dd95SBruce Richardson return -1; 150499a2dd95SBruce Richardson } 150599a2dd95SBruce Richardson 150699a2dd95SBruce Richardson ret = ioctl(vfio_container_fd, 150799a2dd95SBruce Richardson VFIO_IOMMU_SPAPR_REGISTER_MEMORY, ®); 150899a2dd95SBruce Richardson if (ret) { 1509ae67895bSDavid Marchand EAL_LOG(ERR, 151099a2dd95SBruce 
Richardson "Cannot register vaddr for IOMMU, error " 1511ae67895bSDavid Marchand "%i (%s)", errno, strerror(errno)); 151299a2dd95SBruce Richardson return -1; 151399a2dd95SBruce Richardson } 151499a2dd95SBruce Richardson 151599a2dd95SBruce Richardson memset(&dma_map, 0, sizeof(dma_map)); 151699a2dd95SBruce Richardson dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); 151799a2dd95SBruce Richardson dma_map.vaddr = vaddr; 151899a2dd95SBruce Richardson dma_map.size = len; 151999a2dd95SBruce Richardson dma_map.iova = iova; 152099a2dd95SBruce Richardson dma_map.flags = VFIO_DMA_MAP_FLAG_READ | 152199a2dd95SBruce Richardson VFIO_DMA_MAP_FLAG_WRITE; 152299a2dd95SBruce Richardson 152399a2dd95SBruce Richardson ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); 152499a2dd95SBruce Richardson if (ret) { 1525ae67895bSDavid Marchand EAL_LOG(ERR, "Cannot map vaddr for IOMMU, error " 1526ae67895bSDavid Marchand "%i (%s)", errno, strerror(errno)); 152799a2dd95SBruce Richardson return -1; 152899a2dd95SBruce Richardson } 152999a2dd95SBruce Richardson 153099a2dd95SBruce Richardson } else { 153199a2dd95SBruce Richardson struct vfio_iommu_type1_dma_map dma_unmap; 153299a2dd95SBruce Richardson 153399a2dd95SBruce Richardson memset(&dma_unmap, 0, sizeof(dma_unmap)); 153499a2dd95SBruce Richardson dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap); 153599a2dd95SBruce Richardson dma_unmap.size = len; 153699a2dd95SBruce Richardson dma_unmap.iova = iova; 153799a2dd95SBruce Richardson 153899a2dd95SBruce Richardson ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA, 153999a2dd95SBruce Richardson &dma_unmap); 154099a2dd95SBruce Richardson if (ret) { 1541ae67895bSDavid Marchand EAL_LOG(ERR, "Cannot unmap vaddr for IOMMU, error " 1542ae67895bSDavid Marchand "%i (%s)", errno, strerror(errno)); 154399a2dd95SBruce Richardson return -1; 154499a2dd95SBruce Richardson } 154599a2dd95SBruce Richardson 154699a2dd95SBruce Richardson ret = ioctl(vfio_container_fd, 154799a2dd95SBruce Richardson VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, ®); 154899a2dd95SBruce Richardson if (ret) { 1549ae67895bSDavid Marchand EAL_LOG(ERR, 155099a2dd95SBruce Richardson "Cannot unregister vaddr for IOMMU, error " 1551ae67895bSDavid Marchand "%i (%s)", errno, strerror(errno)); 155299a2dd95SBruce Richardson return -1; 155399a2dd95SBruce Richardson } 155499a2dd95SBruce Richardson } 155599a2dd95SBruce Richardson 155699a2dd95SBruce Richardson return ret; 155799a2dd95SBruce Richardson } 155899a2dd95SBruce Richardson 155999a2dd95SBruce Richardson static int 156099a2dd95SBruce Richardson vfio_spapr_map_walk(const struct rte_memseg_list *msl, 156199a2dd95SBruce Richardson const struct rte_memseg *ms, void *arg) 156299a2dd95SBruce Richardson { 156399a2dd95SBruce Richardson int *vfio_container_fd = arg; 156499a2dd95SBruce Richardson 156599a2dd95SBruce Richardson /* skip external memory that isn't a heap */ 156699a2dd95SBruce Richardson if (msl->external && !msl->heap) 156799a2dd95SBruce Richardson return 0; 156899a2dd95SBruce Richardson 156999a2dd95SBruce Richardson /* skip any segments with invalid IOVA addresses */ 157099a2dd95SBruce Richardson if (ms->iova == RTE_BAD_IOVA) 157199a2dd95SBruce Richardson return 0; 157299a2dd95SBruce Richardson 157399a2dd95SBruce Richardson return vfio_spapr_dma_do_map(*vfio_container_fd, 157499a2dd95SBruce Richardson ms->addr_64, ms->iova, ms->len, 1); 157599a2dd95SBruce Richardson } 157699a2dd95SBruce Richardson 157799a2dd95SBruce Richardson struct spapr_size_walk_param { 157899a2dd95SBruce Richardson uint64_t 
max_va; 157999a2dd95SBruce Richardson uint64_t page_sz; 158099a2dd95SBruce Richardson bool is_user_managed; 158199a2dd95SBruce Richardson }; 158299a2dd95SBruce Richardson 158399a2dd95SBruce Richardson /* 158499a2dd95SBruce Richardson * In order to set the DMA window size required for the SPAPR IOMMU 158599a2dd95SBruce Richardson * we need to walk the existing virtual memory allocations as well as 158699a2dd95SBruce Richardson * find the hugepage size used. 158799a2dd95SBruce Richardson */ 158899a2dd95SBruce Richardson static int 158999a2dd95SBruce Richardson vfio_spapr_size_walk(const struct rte_memseg_list *msl, void *arg) 159099a2dd95SBruce Richardson { 159199a2dd95SBruce Richardson struct spapr_size_walk_param *param = arg; 159299a2dd95SBruce Richardson uint64_t max = (uint64_t) msl->base_va + (uint64_t) msl->len; 159399a2dd95SBruce Richardson 159499a2dd95SBruce Richardson if (msl->external && !msl->heap) { 159599a2dd95SBruce Richardson /* ignore user managed external memory */ 159699a2dd95SBruce Richardson param->is_user_managed = true; 159799a2dd95SBruce Richardson return 0; 159899a2dd95SBruce Richardson } 159999a2dd95SBruce Richardson 160099a2dd95SBruce Richardson if (max > param->max_va) { 160199a2dd95SBruce Richardson param->page_sz = msl->page_sz; 160299a2dd95SBruce Richardson param->max_va = max; 160399a2dd95SBruce Richardson } 160499a2dd95SBruce Richardson 160599a2dd95SBruce Richardson return 0; 160699a2dd95SBruce Richardson } 160799a2dd95SBruce Richardson 160899a2dd95SBruce Richardson /* 160999a2dd95SBruce Richardson * Find the highest memory address used in physical or virtual address 161099a2dd95SBruce Richardson * space and use that as the top of the DMA window. 161199a2dd95SBruce Richardson */ 161299a2dd95SBruce Richardson static int 161399a2dd95SBruce Richardson find_highest_mem_addr(struct spapr_size_walk_param *param) 161499a2dd95SBruce Richardson { 161599a2dd95SBruce Richardson /* find the maximum IOVA address for setting the DMA window size */ 161699a2dd95SBruce Richardson if (rte_eal_iova_mode() == RTE_IOVA_PA) { 161799a2dd95SBruce Richardson static const char proc_iomem[] = "/proc/iomem"; 161899a2dd95SBruce Richardson static const char str_sysram[] = "System RAM"; 161999a2dd95SBruce Richardson uint64_t start, end, max = 0; 162099a2dd95SBruce Richardson char *line = NULL; 162199a2dd95SBruce Richardson char *dash, *space; 162299a2dd95SBruce Richardson size_t line_len; 162399a2dd95SBruce Richardson 162499a2dd95SBruce Richardson /* 162599a2dd95SBruce Richardson * Example "System RAM" in /proc/iomem: 162699a2dd95SBruce Richardson * 00000000-1fffffffff : System RAM 162799a2dd95SBruce Richardson * 200000000000-201fffffffff : System RAM 162899a2dd95SBruce Richardson */ 162999a2dd95SBruce Richardson FILE *fd = fopen(proc_iomem, "r"); 163099a2dd95SBruce Richardson if (fd == NULL) { 1631ae67895bSDavid Marchand EAL_LOG(ERR, "Cannot open %s", proc_iomem); 163299a2dd95SBruce Richardson return -1; 163399a2dd95SBruce Richardson } 163499a2dd95SBruce Richardson /* Scan /proc/iomem for the highest PA in the system */ 163599a2dd95SBruce Richardson while (getline(&line, &line_len, fd) != -1) { 163699a2dd95SBruce Richardson if (strstr(line, str_sysram) == NULL) 163799a2dd95SBruce Richardson continue; 163899a2dd95SBruce Richardson 163999a2dd95SBruce Richardson space = strstr(line, " "); 164099a2dd95SBruce Richardson dash = strstr(line, "-"); 164199a2dd95SBruce Richardson 164299a2dd95SBruce Richardson /* Validate the format of the memory string */ 164399a2dd95SBruce Richardson if (space == 
NULL || dash == NULL || space < dash) { 1644ae67895bSDavid Marchand EAL_LOG(ERR, "Can't parse line \"%s\" in file %s", 164599a2dd95SBruce Richardson line, proc_iomem); 164699a2dd95SBruce Richardson continue; 164799a2dd95SBruce Richardson } 164899a2dd95SBruce Richardson 164999a2dd95SBruce Richardson start = strtoull(line, NULL, 16); 165099a2dd95SBruce Richardson end = strtoull(dash + 1, NULL, 16); 1651ae67895bSDavid Marchand EAL_LOG(DEBUG, "Found system RAM from 0x%" PRIx64 1652ae67895bSDavid Marchand " to 0x%" PRIx64, start, end); 165399a2dd95SBruce Richardson if (end > max) 165499a2dd95SBruce Richardson max = end; 165599a2dd95SBruce Richardson } 165699a2dd95SBruce Richardson free(line); 165799a2dd95SBruce Richardson fclose(fd); 165899a2dd95SBruce Richardson 165999a2dd95SBruce Richardson if (max == 0) { 1660ae67895bSDavid Marchand EAL_LOG(ERR, "Failed to find valid \"System RAM\" " 1661ae67895bSDavid Marchand "entry in file %s", proc_iomem); 166299a2dd95SBruce Richardson return -1; 166399a2dd95SBruce Richardson } 166499a2dd95SBruce Richardson 166599a2dd95SBruce Richardson spapr_dma_win_len = rte_align64pow2(max + 1); 166699a2dd95SBruce Richardson return 0; 166799a2dd95SBruce Richardson } else if (rte_eal_iova_mode() == RTE_IOVA_VA) { 1668ae67895bSDavid Marchand EAL_LOG(DEBUG, "Highest VA address in memseg list is 0x%" 1669ae67895bSDavid Marchand PRIx64, param->max_va); 167099a2dd95SBruce Richardson spapr_dma_win_len = rte_align64pow2(param->max_va); 167199a2dd95SBruce Richardson return 0; 167299a2dd95SBruce Richardson } 167399a2dd95SBruce Richardson 167499a2dd95SBruce Richardson spapr_dma_win_len = 0; 1675ae67895bSDavid Marchand EAL_LOG(ERR, "Unsupported IOVA mode"); 167699a2dd95SBruce Richardson return -1; 167799a2dd95SBruce Richardson } 167899a2dd95SBruce Richardson 167999a2dd95SBruce Richardson 168099a2dd95SBruce Richardson /* 168199a2dd95SBruce Richardson * The SPAPRv2 IOMMU supports 2 DMA windows with starting 168299a2dd95SBruce Richardson * address at 0 or 1<<59. By default, a DMA window is set 168399a2dd95SBruce Richardson * at address 0, 2GB long, with a 4KB page. For DPDK we 168499a2dd95SBruce Richardson * must remove the default window and setup a new DMA window 168599a2dd95SBruce Richardson * based on the hugepage size and memory requirements of 168699a2dd95SBruce Richardson * the application before we can map memory for DMA. 
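 * For example (illustrative numbers only): with 2 MB hugepages and a
 * highest address of 0x980000000, the window length is rounded up to the
 * next power of two, 0x1000000000, and the window is later created with
 * page_shift 21 (rte_ctz64 of the 2 MB page size).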
168799a2dd95SBruce Richardson */ 168899a2dd95SBruce Richardson static int 168999a2dd95SBruce Richardson spapr_dma_win_size(void) 169099a2dd95SBruce Richardson { 169199a2dd95SBruce Richardson struct spapr_size_walk_param param; 169299a2dd95SBruce Richardson 169399a2dd95SBruce Richardson /* only create DMA window once */ 169499a2dd95SBruce Richardson if (spapr_dma_win_len > 0) 169599a2dd95SBruce Richardson return 0; 169699a2dd95SBruce Richardson 169799a2dd95SBruce Richardson /* walk the memseg list to find the page size/max VA address */ 169899a2dd95SBruce Richardson memset(¶m, 0, sizeof(param)); 169999a2dd95SBruce Richardson if (rte_memseg_list_walk(vfio_spapr_size_walk, ¶m) < 0) { 1700ae67895bSDavid Marchand EAL_LOG(ERR, "Failed to walk memseg list for DMA window size"); 170199a2dd95SBruce Richardson return -1; 170299a2dd95SBruce Richardson } 170399a2dd95SBruce Richardson 170499a2dd95SBruce Richardson /* we can't be sure if DMA window covers external memory */ 170599a2dd95SBruce Richardson if (param.is_user_managed) 1706ae67895bSDavid Marchand EAL_LOG(WARNING, "Detected user managed external memory which may not be managed by the IOMMU"); 170799a2dd95SBruce Richardson 170899a2dd95SBruce Richardson /* check physical/virtual memory size */ 170999a2dd95SBruce Richardson if (find_highest_mem_addr(¶m) < 0) 171099a2dd95SBruce Richardson return -1; 1711ae67895bSDavid Marchand EAL_LOG(DEBUG, "Setting DMA window size to 0x%" PRIx64, 171299a2dd95SBruce Richardson spapr_dma_win_len); 171399a2dd95SBruce Richardson spapr_dma_win_page_sz = param.page_sz; 17143d4e27fdSDavid Marchand rte_mem_set_dma_mask(rte_ctz64(spapr_dma_win_len)); 171599a2dd95SBruce Richardson return 0; 171699a2dd95SBruce Richardson } 171799a2dd95SBruce Richardson 171899a2dd95SBruce Richardson static int 171999a2dd95SBruce Richardson vfio_spapr_create_dma_window(int vfio_container_fd) 172099a2dd95SBruce Richardson { 172199a2dd95SBruce Richardson struct vfio_iommu_spapr_tce_create create = { 172299a2dd95SBruce Richardson .argsz = sizeof(create), }; 172399a2dd95SBruce Richardson struct vfio_iommu_spapr_tce_remove remove = { 172499a2dd95SBruce Richardson .argsz = sizeof(remove), }; 172599a2dd95SBruce Richardson struct vfio_iommu_spapr_tce_info info = { 172699a2dd95SBruce Richardson .argsz = sizeof(info), }; 172799a2dd95SBruce Richardson int ret; 172899a2dd95SBruce Richardson 172999a2dd95SBruce Richardson ret = spapr_dma_win_size(); 173099a2dd95SBruce Richardson if (ret < 0) 173199a2dd95SBruce Richardson return ret; 173299a2dd95SBruce Richardson 173399a2dd95SBruce Richardson ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info); 173499a2dd95SBruce Richardson if (ret) { 1735ae67895bSDavid Marchand EAL_LOG(ERR, "Cannot get IOMMU info, error %i (%s)", 173699a2dd95SBruce Richardson errno, strerror(errno)); 173799a2dd95SBruce Richardson return -1; 173899a2dd95SBruce Richardson } 173999a2dd95SBruce Richardson 174099a2dd95SBruce Richardson /* 174199a2dd95SBruce Richardson * sPAPR v1/v2 IOMMU always has a default 1G DMA window set. The window 174299a2dd95SBruce Richardson * can't be changed for v1 but it can be changed for v2. Since DPDK only 174399a2dd95SBruce Richardson * supports v2, remove the default DMA window so it can be resized. 
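	 * (The remove request below targets the window start reported by
	 * VFIO_IOMMU_SPAPR_TCE_GET_INFO; the replacement window is sized by
	 * spapr_dma_win_size() above.)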
174499a2dd95SBruce Richardson */ 174599a2dd95SBruce Richardson remove.start_addr = info.dma32_window_start; 174699a2dd95SBruce Richardson ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); 174799a2dd95SBruce Richardson if (ret) 174899a2dd95SBruce Richardson return -1; 174999a2dd95SBruce Richardson 175099a2dd95SBruce Richardson /* create a new DMA window (start address is not selectable) */ 175199a2dd95SBruce Richardson create.window_size = spapr_dma_win_len; 17523d4e27fdSDavid Marchand create.page_shift = rte_ctz64(spapr_dma_win_page_sz); 175399a2dd95SBruce Richardson create.levels = 1; 175499a2dd95SBruce Richardson ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create); 175599a2dd95SBruce Richardson #ifdef VFIO_IOMMU_SPAPR_INFO_DDW 175699a2dd95SBruce Richardson /* 175799a2dd95SBruce Richardson * The vfio_iommu_spapr_tce_info structure was modified in 175899a2dd95SBruce Richardson * Linux kernel 4.2.0 to add support for the 175999a2dd95SBruce Richardson * vfio_iommu_spapr_tce_ddw_info structure needed to try 176099a2dd95SBruce Richardson * multiple table levels. Skip the attempt if running with 176199a2dd95SBruce Richardson * an older kernel. 176299a2dd95SBruce Richardson */ 176399a2dd95SBruce Richardson if (ret) { 176499a2dd95SBruce Richardson /* if at first we don't succeed, try more levels */ 176599a2dd95SBruce Richardson uint32_t levels; 176699a2dd95SBruce Richardson 176799a2dd95SBruce Richardson for (levels = create.levels + 1; 176899a2dd95SBruce Richardson ret && levels <= info.ddw.levels; levels++) { 176999a2dd95SBruce Richardson create.levels = levels; 177099a2dd95SBruce Richardson ret = ioctl(vfio_container_fd, 177199a2dd95SBruce Richardson VFIO_IOMMU_SPAPR_TCE_CREATE, &create); 177299a2dd95SBruce Richardson } 177399a2dd95SBruce Richardson } 177499a2dd95SBruce Richardson #endif /* VFIO_IOMMU_SPAPR_INFO_DDW */ 177599a2dd95SBruce Richardson if (ret) { 1776ae67895bSDavid Marchand EAL_LOG(ERR, "Cannot create new DMA window, error " 1777ae67895bSDavid Marchand "%i (%s)", errno, strerror(errno)); 1778ae67895bSDavid Marchand EAL_LOG(ERR, 1779ae67895bSDavid Marchand "Consider using a larger hugepage size if supported by the system"); 178099a2dd95SBruce Richardson return -1; 178199a2dd95SBruce Richardson } 178299a2dd95SBruce Richardson 178399a2dd95SBruce Richardson /* verify the start address */ 178499a2dd95SBruce Richardson if (create.start_addr != 0) { 1785ae67895bSDavid Marchand EAL_LOG(ERR, "Received unsupported start address 0x%" 1786ae67895bSDavid Marchand PRIx64, (uint64_t)create.start_addr); 178799a2dd95SBruce Richardson return -1; 178899a2dd95SBruce Richardson } 178999a2dd95SBruce Richardson return ret; 179099a2dd95SBruce Richardson } 179199a2dd95SBruce Richardson 179299a2dd95SBruce Richardson static int 179399a2dd95SBruce Richardson vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, 179499a2dd95SBruce Richardson uint64_t iova, uint64_t len, int do_map) 179599a2dd95SBruce Richardson { 179699a2dd95SBruce Richardson int ret = 0; 179799a2dd95SBruce Richardson 179899a2dd95SBruce Richardson if (do_map) { 179999a2dd95SBruce Richardson if (vfio_spapr_dma_do_map(vfio_container_fd, 180099a2dd95SBruce Richardson vaddr, iova, len, 1)) { 1801ae67895bSDavid Marchand EAL_LOG(ERR, "Failed to map DMA"); 180299a2dd95SBruce Richardson ret = -1; 180399a2dd95SBruce Richardson } 180499a2dd95SBruce Richardson } else { 180599a2dd95SBruce Richardson if (vfio_spapr_dma_do_map(vfio_container_fd, 180699a2dd95SBruce Richardson vaddr, iova, len, 0)) { 
1807ae67895bSDavid Marchand EAL_LOG(ERR, "Failed to unmap DMA"); 180899a2dd95SBruce Richardson ret = -1; 180999a2dd95SBruce Richardson } 181099a2dd95SBruce Richardson } 181199a2dd95SBruce Richardson 181299a2dd95SBruce Richardson return ret; 181399a2dd95SBruce Richardson } 181499a2dd95SBruce Richardson 181599a2dd95SBruce Richardson static int 181699a2dd95SBruce Richardson vfio_spapr_dma_map(int vfio_container_fd) 181799a2dd95SBruce Richardson { 181899a2dd95SBruce Richardson if (vfio_spapr_create_dma_window(vfio_container_fd) < 0) { 1819ae67895bSDavid Marchand EAL_LOG(ERR, "Could not create new DMA window!"); 182099a2dd95SBruce Richardson return -1; 182199a2dd95SBruce Richardson } 182299a2dd95SBruce Richardson 182399a2dd95SBruce Richardson /* map all existing DPDK segments for DMA */ 182499a2dd95SBruce Richardson if (rte_memseg_walk(vfio_spapr_map_walk, &vfio_container_fd) < 0) 182599a2dd95SBruce Richardson return -1; 182699a2dd95SBruce Richardson 182799a2dd95SBruce Richardson return 0; 182899a2dd95SBruce Richardson } 182999a2dd95SBruce Richardson 183099a2dd95SBruce Richardson static int 183199a2dd95SBruce Richardson vfio_noiommu_dma_map(int __rte_unused vfio_container_fd) 183299a2dd95SBruce Richardson { 183399a2dd95SBruce Richardson /* No-IOMMU mode does not need DMA mapping */ 183499a2dd95SBruce Richardson return 0; 183599a2dd95SBruce Richardson } 183699a2dd95SBruce Richardson 183799a2dd95SBruce Richardson static int 183899a2dd95SBruce Richardson vfio_noiommu_dma_mem_map(int __rte_unused vfio_container_fd, 183999a2dd95SBruce Richardson uint64_t __rte_unused vaddr, 184099a2dd95SBruce Richardson uint64_t __rte_unused iova, uint64_t __rte_unused len, 184199a2dd95SBruce Richardson int __rte_unused do_map) 184299a2dd95SBruce Richardson { 184399a2dd95SBruce Richardson /* No-IOMMU mode does not need DMA mapping */ 184499a2dd95SBruce Richardson return 0; 184599a2dd95SBruce Richardson } 184699a2dd95SBruce Richardson 184799a2dd95SBruce Richardson static int 184899a2dd95SBruce Richardson vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova, 184999a2dd95SBruce Richardson uint64_t len, int do_map) 185099a2dd95SBruce Richardson { 185199a2dd95SBruce Richardson const struct vfio_iommu_type *t = vfio_cfg->vfio_iommu_type; 185299a2dd95SBruce Richardson 185399a2dd95SBruce Richardson if (!t) { 1854ae67895bSDavid Marchand EAL_LOG(ERR, "VFIO support not initialized"); 185599a2dd95SBruce Richardson rte_errno = ENODEV; 185699a2dd95SBruce Richardson return -1; 185799a2dd95SBruce Richardson } 185899a2dd95SBruce Richardson 185999a2dd95SBruce Richardson if (!t->dma_user_map_func) { 1860ae67895bSDavid Marchand EAL_LOG(ERR, 1861ae67895bSDavid Marchand "VFIO custom DMA region mapping not supported by IOMMU %s", 186299a2dd95SBruce Richardson t->name); 186399a2dd95SBruce Richardson rte_errno = ENOTSUP; 186499a2dd95SBruce Richardson return -1; 186599a2dd95SBruce Richardson } 186699a2dd95SBruce Richardson 186799a2dd95SBruce Richardson return t->dma_user_map_func(vfio_cfg->vfio_container_fd, vaddr, iova, 186899a2dd95SBruce Richardson len, do_map); 186999a2dd95SBruce Richardson } 187099a2dd95SBruce Richardson 187199a2dd95SBruce Richardson static int 187299a2dd95SBruce Richardson container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova, 187399a2dd95SBruce Richardson uint64_t len) 187499a2dd95SBruce Richardson { 187599a2dd95SBruce Richardson struct user_mem_map *new_map; 187699a2dd95SBruce Richardson struct user_mem_maps *user_mem_maps; 187756259f7fSXuan Ding bool has_partial_unmap; 
187899a2dd95SBruce Richardson int ret = 0; 187999a2dd95SBruce Richardson 188099a2dd95SBruce Richardson user_mem_maps = &vfio_cfg->mem_maps; 188199a2dd95SBruce Richardson rte_spinlock_recursive_lock(&user_mem_maps->lock); 188299a2dd95SBruce Richardson if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) { 1883ae67895bSDavid Marchand EAL_LOG(ERR, "No more space for user mem maps"); 188499a2dd95SBruce Richardson rte_errno = ENOMEM; 188599a2dd95SBruce Richardson ret = -1; 188699a2dd95SBruce Richardson goto out; 188799a2dd95SBruce Richardson } 188899a2dd95SBruce Richardson /* map the entry */ 188999a2dd95SBruce Richardson if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 1)) { 189099a2dd95SBruce Richardson /* technically, this will fail if there are currently no devices 189199a2dd95SBruce Richardson * plugged in, even though this mapping might have succeeded had a 189299a2dd95SBruce Richardson * device been added later. however, since we cannot verify if this 189399a2dd95SBruce Richardson * is a valid mapping without having a device attached, consider 189499a2dd95SBruce Richardson * this to be unsupported, because we can't just store any old 189599a2dd95SBruce Richardson * mapping and pollute the list of active mappings willy-nilly. 189699a2dd95SBruce Richardson */ 1897ae67895bSDavid Marchand EAL_LOG(ERR, "Couldn't map new region for DMA"); 189899a2dd95SBruce Richardson ret = -1; 189999a2dd95SBruce Richardson goto out; 190099a2dd95SBruce Richardson } 190156259f7fSXuan Ding /* do we have partial unmap support? */ 190256259f7fSXuan Ding has_partial_unmap = vfio_cfg->vfio_iommu_type->partial_unmap; 190356259f7fSXuan Ding 190499a2dd95SBruce Richardson /* create new user mem map entry */ 190599a2dd95SBruce Richardson new_map = &user_mem_maps->maps[user_mem_maps->n_maps++]; 190699a2dd95SBruce Richardson new_map->addr = vaddr; 190799a2dd95SBruce Richardson new_map->iova = iova; 190899a2dd95SBruce Richardson new_map->len = len; 190956259f7fSXuan Ding /* for IOMMU types supporting partial unmap, we don't need chunking */ 191056259f7fSXuan Ding new_map->chunk = has_partial_unmap ? 0 : len; 191199a2dd95SBruce Richardson 191299a2dd95SBruce Richardson compact_user_maps(user_mem_maps); 191399a2dd95SBruce Richardson out: 191499a2dd95SBruce Richardson rte_spinlock_recursive_unlock(&user_mem_maps->lock); 191599a2dd95SBruce Richardson return ret; 191699a2dd95SBruce Richardson } 191799a2dd95SBruce Richardson 191899a2dd95SBruce Richardson static int 191999a2dd95SBruce Richardson container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova, 192099a2dd95SBruce Richardson uint64_t len) 192199a2dd95SBruce Richardson { 192256259f7fSXuan Ding struct user_mem_map orig_maps[VFIO_MAX_USER_MEM_MAPS]; 192356259f7fSXuan Ding struct user_mem_map new_maps[2]; /* can be at most 2 */ 192499a2dd95SBruce Richardson struct user_mem_maps *user_mem_maps; 192556259f7fSXuan Ding int n_orig, n_new, newlen, ret = 0; 192656259f7fSXuan Ding bool has_partial_unmap; 192799a2dd95SBruce Richardson 192899a2dd95SBruce Richardson user_mem_maps = &vfio_cfg->mem_maps; 192999a2dd95SBruce Richardson rte_spinlock_recursive_lock(&user_mem_maps->lock); 193099a2dd95SBruce Richardson 193156259f7fSXuan Ding /* 193256259f7fSXuan Ding * Previously, we had adjacent mappings entirely contained within one 193356259f7fSXuan Ding * mapping entry.
Since we now store original mapping length in some 193456259f7fSXuan Ding * cases, this is no longer the case, so unmapping can potentially go 193556259f7fSXuan Ding * over multiple segments and split them in any number of ways. 193656259f7fSXuan Ding * 193756259f7fSXuan Ding * To complicate things further, some IOMMU types support arbitrary 193856259f7fSXuan Ding * partial unmapping, while others will only support unmapping along the 193956259f7fSXuan Ding * chunk size, so there are a lot of cases we need to handle. To make 194056259f7fSXuan Ding * things easier code wise, instead of trying to adjust existing 194156259f7fSXuan Ding * mappings, let's just rebuild them using information we have. 194256259f7fSXuan Ding */ 194356259f7fSXuan Ding 194456259f7fSXuan Ding /* 194556259f7fSXuan Ding * first thing to do is check if there exists a mapping that includes 194656259f7fSXuan Ding * the start and the end of our requested unmap. We need to collect all 194756259f7fSXuan Ding * maps that include our unmapped region. 194856259f7fSXuan Ding */ 194956259f7fSXuan Ding n_orig = find_user_mem_maps(user_mem_maps, vaddr, iova, len, 195056259f7fSXuan Ding orig_maps, RTE_DIM(orig_maps)); 195156259f7fSXuan Ding /* did we find anything? */ 195256259f7fSXuan Ding if (n_orig < 0) { 1953ae67895bSDavid Marchand EAL_LOG(ERR, "Couldn't find previously mapped region"); 195499a2dd95SBruce Richardson rte_errno = EINVAL; 195599a2dd95SBruce Richardson ret = -1; 195699a2dd95SBruce Richardson goto out; 195799a2dd95SBruce Richardson } 195856259f7fSXuan Ding 1959ab910a80SAnatoly Burakov /* do we have partial unmap capability? */ 1960ab910a80SAnatoly Burakov has_partial_unmap = vfio_cfg->vfio_iommu_type->partial_unmap; 1961ab910a80SAnatoly Burakov 196256259f7fSXuan Ding /* 196356259f7fSXuan Ding * if we don't support partial unmap, we must check if start and end of 196456259f7fSXuan Ding * current unmap region are chunk-aligned. 196599a2dd95SBruce Richardson */ 196656259f7fSXuan Ding if (!has_partial_unmap) { 196756259f7fSXuan Ding bool start_aligned, end_aligned; 196856259f7fSXuan Ding 196956259f7fSXuan Ding start_aligned = addr_is_chunk_aligned(orig_maps, n_orig, 197056259f7fSXuan Ding vaddr, iova); 197156259f7fSXuan Ding end_aligned = addr_is_chunk_aligned(orig_maps, n_orig, 197256259f7fSXuan Ding vaddr + len, iova + len); 197356259f7fSXuan Ding 197456259f7fSXuan Ding if (!start_aligned || !end_aligned) { 1975ae67895bSDavid Marchand EAL_LOG(DEBUG, "DMA partial unmap unsupported"); 197699a2dd95SBruce Richardson rte_errno = ENOTSUP; 197799a2dd95SBruce Richardson ret = -1; 197899a2dd95SBruce Richardson goto out; 197999a2dd95SBruce Richardson } 198056259f7fSXuan Ding } 198156259f7fSXuan Ding 198256259f7fSXuan Ding /* 198356259f7fSXuan Ding * now we know we can potentially unmap the region, but we still have to 198456259f7fSXuan Ding * figure out if there is enough space in our list to store remaining 198556259f7fSXuan Ding * maps. for this, we will figure out how many segments we are going to 198656259f7fSXuan Ding * remove, and how many new segments we are going to create. 198756259f7fSXuan Ding */ 198856259f7fSXuan Ding n_new = process_maps(orig_maps, n_orig, new_maps, vaddr, len); 198956259f7fSXuan Ding 199056259f7fSXuan Ding /* can we store the new maps in our list? 
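	 * (process_maps() yields at most two new entries, the remainders,
	 * if any, on either side of the unmapped region, while the n_orig
	 * originals are removed)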
*/ 199156259f7fSXuan Ding newlen = (user_mem_maps->n_maps - n_orig) + n_new; 199256259f7fSXuan Ding if (newlen >= VFIO_MAX_USER_MEM_MAPS) { 1993ae67895bSDavid Marchand EAL_LOG(ERR, "Not enough space to store partial mapping"); 199499a2dd95SBruce Richardson rte_errno = ENOMEM; 199599a2dd95SBruce Richardson ret = -1; 199699a2dd95SBruce Richardson goto out; 199799a2dd95SBruce Richardson } 199899a2dd95SBruce Richardson 199999a2dd95SBruce Richardson /* unmap the entry */ 200099a2dd95SBruce Richardson if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 0)) { 200199a2dd95SBruce Richardson /* there may not be any devices plugged in, so unmapping will 200299a2dd95SBruce Richardson * fail with ENODEV/ENOTSUP rte_errno values, but that doesn't 200399a2dd95SBruce Richardson * stop us from removing the mapping, as the assumption is we 200499a2dd95SBruce Richardson * won't be needing this memory any more and thus will want to 200599a2dd95SBruce Richardson * prevent it from being remapped again on hotplug. So, only 200699a2dd95SBruce Richardson * fail if we indeed failed to unmap (e.g. if the mapping was 200799a2dd95SBruce Richardson * within our mapped range but had invalid alignment). 200899a2dd95SBruce Richardson */ 200999a2dd95SBruce Richardson if (rte_errno != ENODEV && rte_errno != ENOTSUP) { 2010ae67895bSDavid Marchand EAL_LOG(ERR, "Couldn't unmap region for DMA"); 201199a2dd95SBruce Richardson ret = -1; 201299a2dd95SBruce Richardson goto out; 201399a2dd95SBruce Richardson } else { 2014ae67895bSDavid Marchand EAL_LOG(DEBUG, "DMA unmapping failed, but removing mappings anyway"); 201599a2dd95SBruce Richardson } 201699a2dd95SBruce Richardson } 201799a2dd95SBruce Richardson 201856259f7fSXuan Ding /* we have unmapped the region, so now update the maps */ 201956259f7fSXuan Ding delete_maps(user_mem_maps, orig_maps, n_orig); 202056259f7fSXuan Ding copy_maps(user_mem_maps, new_maps, n_new); 202199a2dd95SBruce Richardson compact_user_maps(user_mem_maps); 202299a2dd95SBruce Richardson out: 202399a2dd95SBruce Richardson rte_spinlock_recursive_unlock(&user_mem_maps->lock); 202499a2dd95SBruce Richardson return ret; 202599a2dd95SBruce Richardson } 202699a2dd95SBruce Richardson 202799a2dd95SBruce Richardson int 202899a2dd95SBruce Richardson rte_vfio_noiommu_is_enabled(void) 202999a2dd95SBruce Richardson { 203099a2dd95SBruce Richardson int fd; 203199a2dd95SBruce Richardson ssize_t cnt; 203299a2dd95SBruce Richardson char c; 203399a2dd95SBruce Richardson 203499a2dd95SBruce Richardson fd = open(VFIO_NOIOMMU_MODE, O_RDONLY); 203599a2dd95SBruce Richardson if (fd < 0) { 203699a2dd95SBruce Richardson if (errno != ENOENT) { 2037ae67895bSDavid Marchand EAL_LOG(ERR, "Cannot open VFIO noiommu file " 2038ae67895bSDavid Marchand "%i (%s)", errno, strerror(errno)); 203999a2dd95SBruce Richardson return -1; 204099a2dd95SBruce Richardson } 204199a2dd95SBruce Richardson /* 204299a2dd95SBruce Richardson * else the file does not exist 204399a2dd95SBruce Richardson * i.e.
noiommu is not enabled 204499a2dd95SBruce Richardson */ 204599a2dd95SBruce Richardson return 0; 204699a2dd95SBruce Richardson } 204799a2dd95SBruce Richardson 204899a2dd95SBruce Richardson cnt = read(fd, &c, 1); 204999a2dd95SBruce Richardson close(fd); 205099a2dd95SBruce Richardson if (cnt != 1) { 2051ae67895bSDavid Marchand EAL_LOG(ERR, "Unable to read from VFIO noiommu file " 2052ae67895bSDavid Marchand "%i (%s)", errno, strerror(errno)); 205399a2dd95SBruce Richardson return -1; 205499a2dd95SBruce Richardson } 205599a2dd95SBruce Richardson 205699a2dd95SBruce Richardson return c == 'Y'; 205799a2dd95SBruce Richardson } 205899a2dd95SBruce Richardson 205999a2dd95SBruce Richardson int 206099a2dd95SBruce Richardson rte_vfio_container_create(void) 206199a2dd95SBruce Richardson { 206299a2dd95SBruce Richardson int i; 206399a2dd95SBruce Richardson 206499a2dd95SBruce Richardson /* Find an empty slot to store new vfio config */ 206599a2dd95SBruce Richardson for (i = 1; i < VFIO_MAX_CONTAINERS; i++) { 206699a2dd95SBruce Richardson if (vfio_cfgs[i].vfio_container_fd == -1) 206799a2dd95SBruce Richardson break; 206899a2dd95SBruce Richardson } 206999a2dd95SBruce Richardson 207099a2dd95SBruce Richardson if (i == VFIO_MAX_CONTAINERS) { 2071ae67895bSDavid Marchand EAL_LOG(ERR, "Exceeded max VFIO container limit"); 207299a2dd95SBruce Richardson return -1; 207399a2dd95SBruce Richardson } 207499a2dd95SBruce Richardson 207599a2dd95SBruce Richardson vfio_cfgs[i].vfio_container_fd = rte_vfio_get_container_fd(); 207699a2dd95SBruce Richardson if (vfio_cfgs[i].vfio_container_fd < 0) { 2077ae67895bSDavid Marchand EAL_LOG(NOTICE, "Failed to create a new VFIO container"); 207899a2dd95SBruce Richardson return -1; 207999a2dd95SBruce Richardson } 208099a2dd95SBruce Richardson 208199a2dd95SBruce Richardson return vfio_cfgs[i].vfio_container_fd; 208299a2dd95SBruce Richardson } 208399a2dd95SBruce Richardson 208499a2dd95SBruce Richardson int 208599a2dd95SBruce Richardson rte_vfio_container_destroy(int container_fd) 208699a2dd95SBruce Richardson { 208799a2dd95SBruce Richardson struct vfio_config *vfio_cfg; 208899a2dd95SBruce Richardson int i; 208999a2dd95SBruce Richardson 209099a2dd95SBruce Richardson vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); 209199a2dd95SBruce Richardson if (vfio_cfg == NULL) { 2092ae67895bSDavid Marchand EAL_LOG(ERR, "Invalid VFIO container fd"); 209399a2dd95SBruce Richardson return -1; 209499a2dd95SBruce Richardson } 209599a2dd95SBruce Richardson 209699a2dd95SBruce Richardson for (i = 0; i < VFIO_MAX_GROUPS; i++) 209799a2dd95SBruce Richardson if (vfio_cfg->vfio_groups[i].group_num != -1) 209899a2dd95SBruce Richardson rte_vfio_container_group_unbind(container_fd, 209999a2dd95SBruce Richardson vfio_cfg->vfio_groups[i].group_num); 210099a2dd95SBruce Richardson 210199a2dd95SBruce Richardson close(container_fd); 210299a2dd95SBruce Richardson vfio_cfg->vfio_container_fd = -1; 210399a2dd95SBruce Richardson vfio_cfg->vfio_active_groups = 0; 210499a2dd95SBruce Richardson vfio_cfg->vfio_iommu_type = NULL; 210599a2dd95SBruce Richardson 210699a2dd95SBruce Richardson return 0; 210799a2dd95SBruce Richardson } 210899a2dd95SBruce Richardson 210999a2dd95SBruce Richardson int 211099a2dd95SBruce Richardson rte_vfio_container_group_bind(int container_fd, int iommu_group_num) 211199a2dd95SBruce Richardson { 211299a2dd95SBruce Richardson struct vfio_config *vfio_cfg; 211399a2dd95SBruce Richardson 211499a2dd95SBruce Richardson vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); 211599a2dd95SBruce Richardson if
(vfio_cfg == NULL) { 2116ae67895bSDavid Marchand EAL_LOG(ERR, "Invalid VFIO container fd"); 211799a2dd95SBruce Richardson return -1; 211899a2dd95SBruce Richardson } 211999a2dd95SBruce Richardson 212099a2dd95SBruce Richardson return vfio_get_group_fd(vfio_cfg, iommu_group_num); 212199a2dd95SBruce Richardson } 212299a2dd95SBruce Richardson 212399a2dd95SBruce Richardson int 212499a2dd95SBruce Richardson rte_vfio_container_group_unbind(int container_fd, int iommu_group_num) 212599a2dd95SBruce Richardson { 212699a2dd95SBruce Richardson struct vfio_config *vfio_cfg; 212799a2dd95SBruce Richardson struct vfio_group *cur_grp = NULL; 212899a2dd95SBruce Richardson int i; 212999a2dd95SBruce Richardson 213099a2dd95SBruce Richardson vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); 213199a2dd95SBruce Richardson if (vfio_cfg == NULL) { 2132ae67895bSDavid Marchand EAL_LOG(ERR, "Invalid VFIO container fd"); 213399a2dd95SBruce Richardson return -1; 213499a2dd95SBruce Richardson } 213599a2dd95SBruce Richardson 213699a2dd95SBruce Richardson for (i = 0; i < VFIO_MAX_GROUPS; i++) { 213799a2dd95SBruce Richardson if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num) { 213899a2dd95SBruce Richardson cur_grp = &vfio_cfg->vfio_groups[i]; 213999a2dd95SBruce Richardson break; 214099a2dd95SBruce Richardson } 214199a2dd95SBruce Richardson } 214299a2dd95SBruce Richardson 214399a2dd95SBruce Richardson /* This should not happen */ 214499a2dd95SBruce Richardson if (i == VFIO_MAX_GROUPS || cur_grp == NULL) { 2145ae67895bSDavid Marchand EAL_LOG(ERR, "Specified VFIO group number not found"); 214699a2dd95SBruce Richardson return -1; 214799a2dd95SBruce Richardson } 214899a2dd95SBruce Richardson 214999a2dd95SBruce Richardson if (cur_grp->fd >= 0 && close(cur_grp->fd) < 0) { 2150ae67895bSDavid Marchand EAL_LOG(ERR, 215199a2dd95SBruce Richardson "Error when closing vfio_group_fd for iommu_group_num " 2152ae67895bSDavid Marchand "%d", iommu_group_num); 215399a2dd95SBruce Richardson return -1; 215499a2dd95SBruce Richardson } 215599a2dd95SBruce Richardson cur_grp->group_num = -1; 215699a2dd95SBruce Richardson cur_grp->fd = -1; 215799a2dd95SBruce Richardson cur_grp->devices = 0; 215899a2dd95SBruce Richardson vfio_cfg->vfio_active_groups--; 215999a2dd95SBruce Richardson 216099a2dd95SBruce Richardson return 0; 216199a2dd95SBruce Richardson } 216299a2dd95SBruce Richardson 216399a2dd95SBruce Richardson int 216499a2dd95SBruce Richardson rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova, 216599a2dd95SBruce Richardson uint64_t len) 216699a2dd95SBruce Richardson { 216799a2dd95SBruce Richardson struct vfio_config *vfio_cfg; 216899a2dd95SBruce Richardson 216999a2dd95SBruce Richardson if (len == 0) { 217099a2dd95SBruce Richardson rte_errno = EINVAL; 217199a2dd95SBruce Richardson return -1; 217299a2dd95SBruce Richardson } 217399a2dd95SBruce Richardson 217499a2dd95SBruce Richardson vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); 217599a2dd95SBruce Richardson if (vfio_cfg == NULL) { 2176ae67895bSDavid Marchand EAL_LOG(ERR, "Invalid VFIO container fd"); 217799a2dd95SBruce Richardson return -1; 217899a2dd95SBruce Richardson } 217999a2dd95SBruce Richardson 218099a2dd95SBruce Richardson return container_dma_map(vfio_cfg, vaddr, iova, len); 218199a2dd95SBruce Richardson } 218299a2dd95SBruce Richardson 218399a2dd95SBruce Richardson int 218499a2dd95SBruce Richardson rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova, 218599a2dd95SBruce Richardson uint64_t len) 218699a2dd95SBruce Richardson 
{ 218799a2dd95SBruce Richardson struct vfio_config *vfio_cfg; 218899a2dd95SBruce Richardson 218999a2dd95SBruce Richardson if (len == 0) { 219099a2dd95SBruce Richardson rte_errno = EINVAL; 219199a2dd95SBruce Richardson return -1; 219299a2dd95SBruce Richardson } 219399a2dd95SBruce Richardson 219499a2dd95SBruce Richardson vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); 219599a2dd95SBruce Richardson if (vfio_cfg == NULL) { 2196ae67895bSDavid Marchand EAL_LOG(ERR, "Invalid VFIO container fd"); 219799a2dd95SBruce Richardson return -1; 219899a2dd95SBruce Richardson } 219999a2dd95SBruce Richardson 220099a2dd95SBruce Richardson return container_dma_unmap(vfio_cfg, vaddr, iova, len); 220199a2dd95SBruce Richardson } 2202
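/*
 * Illustrative end-to-end use of the container API above (hypothetical
 * driver code; error handling and real addresses elided):
 *
 *	int cfd = rte_vfio_container_create();
 *	rte_vfio_container_group_bind(cfd, iommu_group_num);
 *	rte_vfio_container_dma_map(cfd, vaddr, iova, len);
 *	...device DMA through the mapped region...
 *	rte_vfio_container_dma_unmap(cfd, vaddr, iova, len);
 *	rte_vfio_container_destroy(cfd);
 */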