1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright(c) 2010-2018 Intel Corporation 3 */ 4 5 #include <inttypes.h> 6 #include <string.h> 7 #include <fcntl.h> 8 #include <unistd.h> 9 #include <sys/ioctl.h> 10 11 #include <rte_errno.h> 12 #include <rte_log.h> 13 #include <rte_memory.h> 14 #include <rte_eal_memconfig.h> 15 #include <rte_vfio.h> 16 17 #include "eal_filesystem.h" 18 #include "eal_memcfg.h" 19 #include "eal_vfio.h" 20 #include "eal_private.h" 21 #include "eal_internal_cfg.h" 22 23 #define VFIO_MEM_EVENT_CLB_NAME "vfio_mem_event_clb" 24 25 /* hot plug/unplug of VFIO groups may cause all DMA maps to be dropped. we can 26 * recreate the mappings for DPDK segments, but we cannot do so for memory that 27 * was registered by the user themselves, so we need to store the user mappings 28 * somewhere, to recreate them later. 29 */ 30 #define VFIO_MAX_USER_MEM_MAPS 256 31 struct user_mem_map { 32 uint64_t addr; /**< start VA */ 33 uint64_t iova; /**< start IOVA */ 34 uint64_t len; /**< total length of the mapping */ 35 uint64_t chunk; /**< this mapping can be split in chunks of this size */ 36 }; 37 38 struct user_mem_maps { 39 rte_spinlock_recursive_t lock; 40 int n_maps; 41 struct user_mem_map maps[VFIO_MAX_USER_MEM_MAPS]; 42 }; 43 44 struct vfio_config { 45 int vfio_enabled; 46 int vfio_container_fd; 47 int vfio_active_groups; 48 const struct vfio_iommu_type *vfio_iommu_type; 49 struct vfio_group vfio_groups[VFIO_MAX_GROUPS]; 50 struct user_mem_maps mem_maps; 51 }; 52 53 /* per-process VFIO config */ 54 static struct vfio_config vfio_cfgs[VFIO_MAX_CONTAINERS]; 55 static struct vfio_config *default_vfio_cfg = &vfio_cfgs[0]; 56 57 static int vfio_type1_dma_map(int); 58 static int vfio_type1_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int); 59 static int vfio_spapr_dma_map(int); 60 static int vfio_spapr_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int); 61 static int vfio_noiommu_dma_map(int); 62 static int vfio_noiommu_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int); 63 static int vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr, 64 uint64_t iova, uint64_t len, int do_map); 65 66 /* IOMMU types we support */ 67 static const struct vfio_iommu_type iommu_types[] = { 68 /* x86 IOMMU, otherwise known as type 1 */ 69 { 70 .type_id = RTE_VFIO_TYPE1, 71 .name = "Type 1", 72 .partial_unmap = false, 73 .dma_map_func = &vfio_type1_dma_map, 74 .dma_user_map_func = &vfio_type1_dma_mem_map 75 }, 76 /* ppc64 IOMMU, otherwise known as spapr */ 77 { 78 .type_id = RTE_VFIO_SPAPR, 79 .name = "sPAPR", 80 .partial_unmap = true, 81 .dma_map_func = &vfio_spapr_dma_map, 82 .dma_user_map_func = &vfio_spapr_dma_mem_map 83 }, 84 /* IOMMU-less mode */ 85 { 86 .type_id = RTE_VFIO_NOIOMMU, 87 .name = "No-IOMMU", 88 .partial_unmap = true, 89 .dma_map_func = &vfio_noiommu_dma_map, 90 .dma_user_map_func = &vfio_noiommu_dma_mem_map 91 }, 92 }; 93 94 static int 95 is_null_map(const struct user_mem_map *map) 96 { 97 return map->addr == 0 && map->iova == 0 && 98 map->len == 0 && map->chunk == 0; 99 } 100 101 /* we may need to merge user mem maps together in case of user mapping/unmapping 102 * chunks of memory, so we'll need a comparator function to sort segments. 
103 */ 104 static int 105 user_mem_map_cmp(const void *a, const void *b) 106 { 107 const struct user_mem_map *umm_a = a; 108 const struct user_mem_map *umm_b = b; 109 110 /* move null entries to end */ 111 if (is_null_map(umm_a)) 112 return 1; 113 if (is_null_map(umm_b)) 114 return -1; 115 116 /* sort by iova first */ 117 if (umm_a->iova < umm_b->iova) 118 return -1; 119 if (umm_a->iova > umm_b->iova) 120 return 1; 121 122 if (umm_a->addr < umm_b->addr) 123 return -1; 124 if (umm_a->addr > umm_b->addr) 125 return 1; 126 127 if (umm_a->len < umm_b->len) 128 return -1; 129 if (umm_a->len > umm_b->len) 130 return 1; 131 132 if (umm_a->chunk < umm_b->chunk) 133 return -1; 134 if (umm_a->chunk > umm_b->chunk) 135 return 1; 136 137 return 0; 138 } 139 140 /* 141 * Take in an address range and list of current mappings, and produce a list of 142 * mappings that will be kept. 143 */ 144 static int 145 process_maps(struct user_mem_map *src, size_t src_len, 146 struct user_mem_map newmap[2], uint64_t vaddr, uint64_t len) 147 { 148 struct user_mem_map *src_first = &src[0]; 149 struct user_mem_map *src_last = &src[src_len - 1]; 150 struct user_mem_map *dst_first = &newmap[0]; 151 /* we can get at most two new segments */ 152 struct user_mem_map *dst_last = &newmap[1]; 153 uint64_t first_off = vaddr - src_first->addr; 154 uint64_t last_off = (src_last->addr + src_last->len) - (vaddr + len); 155 int newmap_len = 0; 156 157 if (first_off != 0) { 158 dst_first->addr = src_first->addr; 159 dst_first->iova = src_first->iova; 160 dst_first->len = first_off; 161 dst_first->chunk = src_first->chunk; 162 163 newmap_len++; 164 } 165 if (last_off != 0) { 166 /* if we had start offset, we have two segments */ 167 struct user_mem_map *last = 168 first_off == 0 ? dst_first : dst_last; 169 last->addr = (src_last->addr + src_last->len) - last_off; 170 last->iova = (src_last->iova + src_last->len) - last_off; 171 last->len = last_off; 172 last->chunk = src_last->chunk; 173 174 newmap_len++; 175 } 176 return newmap_len; 177 } 178 179 /* erase certain maps from the list */ 180 static void 181 delete_maps(struct user_mem_maps *user_mem_maps, struct user_mem_map *del_maps, 182 size_t n_del) 183 { 184 int i; 185 size_t j; 186 187 for (i = 0, j = 0; i < VFIO_MAX_USER_MEM_MAPS && j < n_del; i++) { 188 struct user_mem_map *left = &user_mem_maps->maps[i]; 189 struct user_mem_map *right = &del_maps[j]; 190 191 if (user_mem_map_cmp(left, right) == 0) { 192 memset(left, 0, sizeof(*left)); 193 j++; 194 user_mem_maps->n_maps--; 195 } 196 } 197 } 198 199 static void 200 copy_maps(struct user_mem_maps *user_mem_maps, struct user_mem_map *add_maps, 201 size_t n_add) 202 { 203 int i; 204 size_t j; 205 206 for (i = 0, j = 0; i < VFIO_MAX_USER_MEM_MAPS && j < n_add; i++) { 207 struct user_mem_map *left = &user_mem_maps->maps[i]; 208 struct user_mem_map *right = &add_maps[j]; 209 210 /* insert into empty space */ 211 if (is_null_map(left)) { 212 memcpy(left, right, sizeof(*left)); 213 j++; 214 user_mem_maps->n_maps++; 215 } 216 } 217 } 218 219 /* try merging two maps into one, return 1 if succeeded */ 220 static int 221 merge_map(struct user_mem_map *left, struct user_mem_map *right) 222 { 223 /* merge the same maps into one */ 224 if (memcmp(left, right, sizeof(struct user_mem_map)) == 0) 225 goto out; 226 227 if (left->addr + left->len != right->addr) 228 return 0; 229 if (left->iova + left->len != right->iova) 230 return 0; 231 if (left->chunk != right->chunk) 232 return 0; 233 left->len += right->len; 234 235 out: 236 memset(right, 0, 
sizeof(*right)); 237 238 return 1; 239 } 240 241 static bool 242 addr_is_chunk_aligned(struct user_mem_map *maps, size_t n_maps, 243 uint64_t vaddr, uint64_t iova) 244 { 245 unsigned int i; 246 247 for (i = 0; i < n_maps; i++) { 248 struct user_mem_map *map = &maps[i]; 249 uint64_t map_va_end = map->addr + map->len; 250 uint64_t map_iova_end = map->iova + map->len; 251 uint64_t map_va_off = vaddr - map->addr; 252 uint64_t map_iova_off = iova - map->iova; 253 254 /* we include end of the segment in comparison as well */ 255 bool addr_in_map = (vaddr >= map->addr) && (vaddr <= map_va_end); 256 bool iova_in_map = (iova >= map->iova) && (iova <= map_iova_end); 257 /* chunk may not be power of two, so use modulo */ 258 bool addr_is_aligned = (map_va_off % map->chunk) == 0; 259 bool iova_is_aligned = (map_iova_off % map->chunk) == 0; 260 261 if (addr_in_map && iova_in_map && 262 addr_is_aligned && iova_is_aligned) 263 return true; 264 } 265 return false; 266 } 267 268 static int 269 find_user_mem_maps(struct user_mem_maps *user_mem_maps, uint64_t addr, 270 uint64_t iova, uint64_t len, struct user_mem_map *dst, 271 size_t dst_len) 272 { 273 uint64_t va_end = addr + len; 274 uint64_t iova_end = iova + len; 275 bool found = false; 276 size_t j; 277 int i, ret; 278 279 for (i = 0, j = 0; i < user_mem_maps->n_maps; i++) { 280 struct user_mem_map *map = &user_mem_maps->maps[i]; 281 uint64_t map_va_end = map->addr + map->len; 282 uint64_t map_iova_end = map->iova + map->len; 283 284 bool start_addr_in_map = (addr >= map->addr) && 285 (addr < map_va_end); 286 bool end_addr_in_map = (va_end > map->addr) && 287 (va_end <= map_va_end); 288 bool start_iova_in_map = (iova >= map->iova) && 289 (iova < map_iova_end); 290 bool end_iova_in_map = (iova_end > map->iova) && 291 (iova_end <= map_iova_end); 292 293 /* do we have space in temporary map? */ 294 if (j == dst_len) { 295 ret = -ENOSPC; 296 goto err; 297 } 298 /* check if current map is start of our segment */ 299 if (!found && start_addr_in_map && start_iova_in_map) 300 found = true; 301 /* if we have previously found a segment, add it to the map */ 302 if (found) { 303 /* copy the segment into our temporary map */ 304 memcpy(&dst[j++], map, sizeof(*map)); 305 306 /* if we match end of segment, quit */ 307 if (end_addr_in_map && end_iova_in_map) 308 return j; 309 } 310 } 311 /* we didn't find anything */ 312 ret = -ENOENT; 313 err: 314 memset(dst, 0, sizeof(*dst) * dst_len); 315 return ret; 316 } 317 318 /* this will sort all user maps, and merge/compact any adjacent maps */ 319 static void 320 compact_user_maps(struct user_mem_maps *user_mem_maps) 321 { 322 int i; 323 324 qsort(user_mem_maps->maps, VFIO_MAX_USER_MEM_MAPS, 325 sizeof(user_mem_maps->maps[0]), user_mem_map_cmp); 326 327 /* we'll go over the list backwards when merging */ 328 for (i = VFIO_MAX_USER_MEM_MAPS - 2; i >= 0; i--) { 329 struct user_mem_map *l, *r; 330 331 l = &user_mem_maps->maps[i]; 332 r = &user_mem_maps->maps[i + 1]; 333 334 if (is_null_map(l) || is_null_map(r)) 335 continue; 336 337 /* try and merge the maps */ 338 if (merge_map(l, r)) 339 user_mem_maps->n_maps--; 340 } 341 342 /* the entries are still sorted, but now they have holes in them, so 343 * sort the list again. 
344 */ 345 qsort(user_mem_maps->maps, VFIO_MAX_USER_MEM_MAPS, 346 sizeof(user_mem_maps->maps[0]), user_mem_map_cmp); 347 } 348 349 static int 350 vfio_open_group_fd(int iommu_group_num) 351 { 352 int vfio_group_fd; 353 char filename[PATH_MAX]; 354 struct rte_mp_msg mp_req, *mp_rep; 355 struct rte_mp_reply mp_reply = {0}; 356 struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; 357 struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; 358 const struct internal_config *internal_conf = 359 eal_get_internal_configuration(); 360 361 /* if primary, try to open the group */ 362 if (internal_conf->process_type == RTE_PROC_PRIMARY) { 363 /* try regular group format */ 364 snprintf(filename, sizeof(filename), 365 VFIO_GROUP_FMT, iommu_group_num); 366 vfio_group_fd = open(filename, O_RDWR); 367 if (vfio_group_fd < 0) { 368 /* if file not found, it's not an error */ 369 if (errno != ENOENT) { 370 EAL_LOG(ERR, "Cannot open %s: %s", 371 filename, strerror(errno)); 372 return -1; 373 } 374 375 /* special case: try no-IOMMU path as well */ 376 snprintf(filename, sizeof(filename), 377 VFIO_NOIOMMU_GROUP_FMT, 378 iommu_group_num); 379 vfio_group_fd = open(filename, O_RDWR); 380 if (vfio_group_fd < 0) { 381 if (errno != ENOENT) { 382 EAL_LOG(ERR, 383 "Cannot open %s: %s", 384 filename, strerror(errno)); 385 return -1; 386 } 387 return -ENOENT; 388 } 389 /* noiommu group found */ 390 } 391 392 return vfio_group_fd; 393 } 394 /* if we're in a secondary process, request group fd from the primary 395 * process via mp channel. 396 */ 397 p->req = SOCKET_REQ_GROUP; 398 p->group_num = iommu_group_num; 399 strcpy(mp_req.name, EAL_VFIO_MP); 400 mp_req.len_param = sizeof(*p); 401 mp_req.num_fds = 0; 402 403 vfio_group_fd = -1; 404 if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && 405 mp_reply.nb_received == 1) { 406 mp_rep = &mp_reply.msgs[0]; 407 p = (struct vfio_mp_param *)mp_rep->param; 408 if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { 409 vfio_group_fd = mp_rep->fds[0]; 410 } else if (p->result == SOCKET_NO_FD) { 411 EAL_LOG(ERR, "Bad VFIO group fd"); 412 vfio_group_fd = -ENOENT; 413 } 414 } 415 416 free(mp_reply.msgs); 417 if (vfio_group_fd < 0 && vfio_group_fd != -ENOENT) 418 EAL_LOG(ERR, "Cannot request VFIO group fd"); 419 return vfio_group_fd; 420 } 421 422 static struct vfio_config * 423 get_vfio_cfg_by_group_num(int iommu_group_num) 424 { 425 struct vfio_config *vfio_cfg; 426 int i, j; 427 428 for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { 429 vfio_cfg = &vfio_cfgs[i]; 430 for (j = 0; j < VFIO_MAX_GROUPS; j++) { 431 if (vfio_cfg->vfio_groups[j].group_num == 432 iommu_group_num) 433 return vfio_cfg; 434 } 435 } 436 437 return NULL; 438 } 439 440 static int 441 vfio_get_group_fd(struct vfio_config *vfio_cfg, 442 int iommu_group_num) 443 { 444 int i; 445 int vfio_group_fd; 446 struct vfio_group *cur_grp; 447 448 /* check if we already have the group descriptor open */ 449 for (i = 0; i < VFIO_MAX_GROUPS; i++) 450 if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num) 451 return vfio_cfg->vfio_groups[i].fd; 452 453 /* Lets see first if there is room for a new group */ 454 if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) { 455 EAL_LOG(ERR, "Maximum number of VFIO groups reached!"); 456 return -1; 457 } 458 459 /* Now lets get an index for the new group */ 460 for (i = 0; i < VFIO_MAX_GROUPS; i++) 461 if (vfio_cfg->vfio_groups[i].group_num == -1) { 462 cur_grp = &vfio_cfg->vfio_groups[i]; 463 break; 464 } 465 466 /* This should not happen */ 467 if (i == VFIO_MAX_GROUPS) { 468 
EAL_LOG(ERR, "No VFIO group free slot found"); 469 return -1; 470 } 471 472 vfio_group_fd = vfio_open_group_fd(iommu_group_num); 473 if (vfio_group_fd < 0) { 474 EAL_LOG(ERR, "Failed to open VFIO group %d", 475 iommu_group_num); 476 return vfio_group_fd; 477 } 478 479 cur_grp->group_num = iommu_group_num; 480 cur_grp->fd = vfio_group_fd; 481 vfio_cfg->vfio_active_groups++; 482 483 return vfio_group_fd; 484 } 485 486 static struct vfio_config * 487 get_vfio_cfg_by_group_fd(int vfio_group_fd) 488 { 489 struct vfio_config *vfio_cfg; 490 int i, j; 491 492 for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { 493 vfio_cfg = &vfio_cfgs[i]; 494 for (j = 0; j < VFIO_MAX_GROUPS; j++) 495 if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd) 496 return vfio_cfg; 497 } 498 499 return NULL; 500 } 501 502 static struct vfio_config * 503 get_vfio_cfg_by_container_fd(int container_fd) 504 { 505 int i; 506 507 if (container_fd == RTE_VFIO_DEFAULT_CONTAINER_FD) 508 return default_vfio_cfg; 509 510 for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { 511 if (vfio_cfgs[i].vfio_container_fd == container_fd) 512 return &vfio_cfgs[i]; 513 } 514 515 return NULL; 516 } 517 518 int 519 rte_vfio_get_group_fd(int iommu_group_num) 520 { 521 struct vfio_config *vfio_cfg; 522 523 /* get the vfio_config it belongs to */ 524 vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num); 525 vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg; 526 527 return vfio_get_group_fd(vfio_cfg, iommu_group_num); 528 } 529 530 static int 531 get_vfio_group_idx(int vfio_group_fd) 532 { 533 struct vfio_config *vfio_cfg; 534 int i, j; 535 536 for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { 537 vfio_cfg = &vfio_cfgs[i]; 538 for (j = 0; j < VFIO_MAX_GROUPS; j++) 539 if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd) 540 return j; 541 } 542 543 return -1; 544 } 545 546 static void 547 vfio_group_device_get(int vfio_group_fd) 548 { 549 struct vfio_config *vfio_cfg; 550 int i; 551 552 vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); 553 if (vfio_cfg == NULL) { 554 EAL_LOG(ERR, "Invalid VFIO group fd!"); 555 return; 556 } 557 558 i = get_vfio_group_idx(vfio_group_fd); 559 if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) 560 EAL_LOG(ERR, "Wrong VFIO group index (%d)", i); 561 else 562 vfio_cfg->vfio_groups[i].devices++; 563 } 564 565 static void 566 vfio_group_device_put(int vfio_group_fd) 567 { 568 struct vfio_config *vfio_cfg; 569 int i; 570 571 vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); 572 if (vfio_cfg == NULL) { 573 EAL_LOG(ERR, "Invalid VFIO group fd!"); 574 return; 575 } 576 577 i = get_vfio_group_idx(vfio_group_fd); 578 if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) 579 EAL_LOG(ERR, "Wrong VFIO group index (%d)", i); 580 else 581 vfio_cfg->vfio_groups[i].devices--; 582 } 583 584 static int 585 vfio_group_device_count(int vfio_group_fd) 586 { 587 struct vfio_config *vfio_cfg; 588 int i; 589 590 vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); 591 if (vfio_cfg == NULL) { 592 EAL_LOG(ERR, "Invalid VFIO group fd!"); 593 return -1; 594 } 595 596 i = get_vfio_group_idx(vfio_group_fd); 597 if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) { 598 EAL_LOG(ERR, "Wrong VFIO group index (%d)", i); 599 return -1; 600 } 601 602 return vfio_cfg->vfio_groups[i].devices; 603 } 604 605 static void 606 vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len, 607 void *arg __rte_unused) 608 { 609 struct rte_memseg_list *msl; 610 struct rte_memseg *ms; 611 size_t cur_len = 0; 612 613 msl = rte_mem_virt2memseg_list(addr); 614 615 /* for IOVA as VA mode, no need to care for IOVA 
addresses */ 616 if (rte_eal_iova_mode() == RTE_IOVA_VA && msl->external == 0) { 617 uint64_t vfio_va = (uint64_t)(uintptr_t)addr; 618 uint64_t page_sz = msl->page_sz; 619 620 /* Maintain granularity of DMA map/unmap to memseg size */ 621 for (; cur_len < len; cur_len += page_sz) { 622 if (type == RTE_MEM_EVENT_ALLOC) 623 vfio_dma_mem_map(default_vfio_cfg, vfio_va, 624 vfio_va, page_sz, 1); 625 else 626 vfio_dma_mem_map(default_vfio_cfg, vfio_va, 627 vfio_va, page_sz, 0); 628 vfio_va += page_sz; 629 } 630 631 return; 632 } 633 634 /* memsegs are contiguous in memory */ 635 ms = rte_mem_virt2memseg(addr, msl); 636 while (cur_len < len) { 637 /* some memory segments may have invalid IOVA */ 638 if (ms->iova == RTE_BAD_IOVA) { 639 EAL_LOG(DEBUG, 640 "Memory segment at %p has bad IOVA, skipping", 641 ms->addr); 642 goto next; 643 } 644 if (type == RTE_MEM_EVENT_ALLOC) 645 vfio_dma_mem_map(default_vfio_cfg, ms->addr_64, 646 ms->iova, ms->len, 1); 647 else 648 vfio_dma_mem_map(default_vfio_cfg, ms->addr_64, 649 ms->iova, ms->len, 0); 650 next: 651 cur_len += ms->len; 652 ++ms; 653 } 654 } 655 656 static int 657 vfio_sync_default_container(void) 658 { 659 struct rte_mp_msg mp_req, *mp_rep; 660 struct rte_mp_reply mp_reply = {0}; 661 struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; 662 struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; 663 int iommu_type_id; 664 unsigned int i; 665 666 /* cannot be called from primary */ 667 if (rte_eal_process_type() != RTE_PROC_SECONDARY) 668 return -1; 669 670 /* default container fd should have been opened in rte_vfio_enable() */ 671 if (!default_vfio_cfg->vfio_enabled || 672 default_vfio_cfg->vfio_container_fd < 0) { 673 EAL_LOG(ERR, "VFIO support is not initialized"); 674 return -1; 675 } 676 677 /* find default container's IOMMU type */ 678 p->req = SOCKET_REQ_IOMMU_TYPE; 679 strcpy(mp_req.name, EAL_VFIO_MP); 680 mp_req.len_param = sizeof(*p); 681 mp_req.num_fds = 0; 682 683 iommu_type_id = -1; 684 if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && 685 mp_reply.nb_received == 1) { 686 mp_rep = &mp_reply.msgs[0]; 687 p = (struct vfio_mp_param *)mp_rep->param; 688 if (p->result == SOCKET_OK) 689 iommu_type_id = p->iommu_type_id; 690 } 691 free(mp_reply.msgs); 692 if (iommu_type_id < 0) { 693 EAL_LOG(ERR, 694 "Could not get IOMMU type for default container"); 695 return -1; 696 } 697 698 /* we now have an fd for default container, as well as its IOMMU type. 699 * now, set up default VFIO container config to match. 
 */
	for (i = 0; i < RTE_DIM(iommu_types); i++) {
		const struct vfio_iommu_type *t = &iommu_types[i];
		if (t->type_id != iommu_type_id)
			continue;

		/* we found our IOMMU type */
		default_vfio_cfg->vfio_iommu_type = t;

		return 0;
	}
	EAL_LOG(ERR, "Could not find IOMMU type id (%i)",
			iommu_type_id);
	return -1;
}

int
rte_vfio_clear_group(int vfio_group_fd)
{
	int i;
	struct vfio_config *vfio_cfg;

	vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
	if (vfio_cfg == NULL) {
		EAL_LOG(ERR, "Invalid VFIO group fd!");
		return -1;
	}

	i = get_vfio_group_idx(vfio_group_fd);
	if (i < 0)
		return -1;
	vfio_cfg->vfio_groups[i].group_num = -1;
	vfio_cfg->vfio_groups[i].fd = -1;
	vfio_cfg->vfio_groups[i].devices = 0;
	vfio_cfg->vfio_active_groups--;

	return 0;
}

int
rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
		int *vfio_dev_fd, struct vfio_device_info *device_info)
{
	struct vfio_group_status group_status = {
			.argsz = sizeof(group_status)
	};
	struct vfio_config *vfio_cfg;
	struct user_mem_maps *user_mem_maps;
	int vfio_container_fd;
	int vfio_group_fd;
	int iommu_group_num;
	rte_uuid_t vf_token;
	int i, ret;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* get group number */
	ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num);
	if (ret == 0) {
		EAL_LOG(NOTICE,
				"%s not managed by VFIO driver, skipping",
				dev_addr);
		return 1;
	}

	/* if negative, something failed */
	if (ret < 0)
		return -1;

	/* get the actual group fd */
	vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num);
	if (vfio_group_fd < 0 && vfio_group_fd != -ENOENT)
		return -1;

	/*
	 * if vfio_group_fd == -ENOENT, that means the device
	 * isn't managed by VFIO
	 */
	if (vfio_group_fd == -ENOENT) {
		EAL_LOG(NOTICE,
				"%s not managed by VFIO driver, skipping",
				dev_addr);
		return 1;
	}

	/*
	 * check if the group is viable. a group is viable only if all devices
	 * in it are either bound to VFIO or not bound to anything.
	 */
	ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status);
	if (ret) {
		EAL_LOG(ERR, "%s cannot get VFIO group status, "
				"error %i (%s)", dev_addr, errno, strerror(errno));
		close(vfio_group_fd);
		rte_vfio_clear_group(vfio_group_fd);
		return -1;
	} else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
		EAL_LOG(ERR, "%s VFIO group is not viable! "
				"Not all devices in IOMMU group bound to VFIO or unbound",
				dev_addr);
		close(vfio_group_fd);
		rte_vfio_clear_group(vfio_group_fd);
		return -1;
	}

	/* get the vfio_config it belongs to */
	vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
	vfio_cfg = vfio_cfg ?
vfio_cfg : default_vfio_cfg; 810 vfio_container_fd = vfio_cfg->vfio_container_fd; 811 user_mem_maps = &vfio_cfg->mem_maps; 812 813 /* check if group does not have a container yet */ 814 if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) { 815 816 /* add group to a container */ 817 ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER, 818 &vfio_container_fd); 819 if (ret) { 820 EAL_LOG(ERR, 821 "%s cannot add VFIO group to container, error " 822 "%i (%s)", dev_addr, errno, strerror(errno)); 823 close(vfio_group_fd); 824 rte_vfio_clear_group(vfio_group_fd); 825 return -1; 826 } 827 828 /* 829 * pick an IOMMU type and set up DMA mappings for container 830 * 831 * needs to be done only once, only when first group is 832 * assigned to a container and only in primary process. 833 * Note this can happen several times with the hotplug 834 * functionality. 835 */ 836 if (internal_conf->process_type == RTE_PROC_PRIMARY && 837 vfio_cfg->vfio_active_groups == 1 && 838 vfio_group_device_count(vfio_group_fd) == 0) { 839 const struct vfio_iommu_type *t; 840 841 /* select an IOMMU type which we will be using */ 842 t = vfio_set_iommu_type(vfio_container_fd); 843 if (!t) { 844 EAL_LOG(ERR, 845 "%s failed to select IOMMU type", 846 dev_addr); 847 close(vfio_group_fd); 848 rte_vfio_clear_group(vfio_group_fd); 849 return -1; 850 } 851 /* lock memory hotplug before mapping and release it 852 * after registering callback, to prevent races 853 */ 854 rte_mcfg_mem_read_lock(); 855 if (vfio_cfg == default_vfio_cfg) 856 ret = t->dma_map_func(vfio_container_fd); 857 else 858 ret = 0; 859 if (ret) { 860 EAL_LOG(ERR, 861 "%s DMA remapping failed, error " 862 "%i (%s)", 863 dev_addr, errno, strerror(errno)); 864 close(vfio_group_fd); 865 rte_vfio_clear_group(vfio_group_fd); 866 rte_mcfg_mem_read_unlock(); 867 return -1; 868 } 869 870 vfio_cfg->vfio_iommu_type = t; 871 872 /* re-map all user-mapped segments */ 873 rte_spinlock_recursive_lock(&user_mem_maps->lock); 874 875 /* this IOMMU type may not support DMA mapping, but 876 * if we have mappings in the list - that means we have 877 * previously mapped something successfully, so we can 878 * be sure that DMA mapping is supported. 
			 */
			for (i = 0; i < user_mem_maps->n_maps; i++) {
				struct user_mem_map *map;
				map = &user_mem_maps->maps[i];

				ret = t->dma_user_map_func(
						vfio_container_fd,
						map->addr, map->iova, map->len,
						1);
				if (ret) {
					EAL_LOG(ERR, "Couldn't map user memory for DMA: "
							"va: 0x%" PRIx64 " "
							"iova: 0x%" PRIx64 " "
							"len: 0x%" PRIx64,
							map->addr, map->iova,
							map->len);
					rte_spinlock_recursive_unlock(
							&user_mem_maps->lock);
					rte_mcfg_mem_read_unlock();
					return -1;
				}
			}
			rte_spinlock_recursive_unlock(&user_mem_maps->lock);

			/* register callback for mem events */
			if (vfio_cfg == default_vfio_cfg)
				ret = rte_mem_event_callback_register(
					VFIO_MEM_EVENT_CLB_NAME,
					vfio_mem_event_callback, NULL);
			else
				ret = 0;
			/* unlock memory hotplug */
			rte_mcfg_mem_read_unlock();

			if (ret && rte_errno != ENOTSUP) {
				EAL_LOG(ERR, "Could not install memory event callback for VFIO");
				return -1;
			}
			if (ret)
				EAL_LOG(DEBUG, "Memory event callbacks not supported");
			else
				EAL_LOG(DEBUG, "Installed memory event callback for VFIO");
		}
	} else if (rte_eal_process_type() != RTE_PROC_PRIMARY &&
			vfio_cfg == default_vfio_cfg &&
			vfio_cfg->vfio_iommu_type == NULL) {
		/* if we're not a primary process, we do not set up the VFIO
		 * container because it's already been set up by the primary
		 * process. instead, we simply ask the primary about the VFIO
		 * type we are using, and set the VFIO config up appropriately.
		 */
		ret = vfio_sync_default_container();
		if (ret < 0) {
			EAL_LOG(ERR, "Could not sync default VFIO container");
			close(vfio_group_fd);
			rte_vfio_clear_group(vfio_group_fd);
			return -1;
		}
		/* we have successfully initialized VFIO, notify user */
		const struct vfio_iommu_type *t =
			default_vfio_cfg->vfio_iommu_type;
		EAL_LOG(INFO, "Using IOMMU type %d (%s)",
			t->type_id, t->name);
	}

	rte_eal_vfio_get_vf_token(vf_token);

	/* get a file descriptor for the device with VF token firstly */
	if (!rte_uuid_is_null(vf_token)) {
		char vf_token_str[RTE_UUID_STRLEN];
		char dev[PATH_MAX];

		rte_uuid_unparse(vf_token, vf_token_str, sizeof(vf_token_str));
		snprintf(dev, sizeof(dev),
			 "%s vf_token=%s", dev_addr, vf_token_str);

		*vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD,
				dev);
		if (*vfio_dev_fd >= 0)
			goto dev_get_info;
	}

	/* get a file descriptor for the device */
	*vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr);
	if (*vfio_dev_fd < 0) {
		/* if we cannot get a device fd, this implies a problem with
		 * the VFIO group or the container not having IOMMU configured.
		 */

		EAL_LOG(WARNING, "Getting a vfio_dev_fd for %s failed",
				dev_addr);
		close(vfio_group_fd);
		rte_vfio_clear_group(vfio_group_fd);
		return -1;
	}

	/* test and setup the device */
dev_get_info:
	ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
	if (ret) {
		EAL_LOG(ERR, "%s cannot get device info, "
				"error %i (%s)", dev_addr, errno,
				strerror(errno));
		close(*vfio_dev_fd);
		close(vfio_group_fd);
		rte_vfio_clear_group(vfio_group_fd);
		return -1;
	}
	vfio_group_device_get(vfio_group_fd);

	return 0;
}

int
rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
		int vfio_dev_fd)
{
	struct vfio_config *vfio_cfg;
	int vfio_group_fd;
	int iommu_group_num;
	int ret;

	/* we don't want any DMA mapping messages to come while we're detaching
	 * the VFIO device, because this might be the last device and we might
	 * need to unregister the callback.
	 */
	rte_mcfg_mem_read_lock();

	/* get group number */
	ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num);
	if (ret <= 0) {
		EAL_LOG(WARNING, "%s not managed by VFIO driver",
			dev_addr);
		/* This is an error at this point. */
		ret = -1;
		goto out;
	}

	/* get the actual group fd */
	vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num);
	if (vfio_group_fd < 0) {
		EAL_LOG(INFO, "rte_vfio_get_group_fd failed for %s",
			dev_addr);
		ret = vfio_group_fd;
		goto out;
	}

	/* get the vfio_config it belongs to */
	vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
	vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;

	/* At this point we have an active group. Closing it will detach it
	 * from the container. If this is the last active group, the VFIO
	 * kernel code will unset the container and drop the IOMMU mappings.
	 */

	/* Closing a device */
	if (close(vfio_dev_fd) < 0) {
		EAL_LOG(INFO, "Error when closing vfio_dev_fd for %s",
			dev_addr);
		ret = -1;
		goto out;
	}

	/* A VFIO group can have several devices attached. Only when there are
	 * no devices remaining should the group be closed.
	 */
	vfio_group_device_put(vfio_group_fd);
	if (!vfio_group_device_count(vfio_group_fd)) {

		if (close(vfio_group_fd) < 0) {
			EAL_LOG(INFO, "Error when closing vfio_group_fd for %s",
				dev_addr);
			ret = -1;
			goto out;
		}

		if (rte_vfio_clear_group(vfio_group_fd) < 0) {
			EAL_LOG(INFO, "Error when clearing group for %s",
				dev_addr);
			ret = -1;
			goto out;
		}
	}

	/* if there are no active device groups, unregister the callback to
	 * avoid spurious attempts to map/unmap memory from VFIO.
1066 */ 1067 if (vfio_cfg == default_vfio_cfg && vfio_cfg->vfio_active_groups == 0 && 1068 rte_eal_process_type() != RTE_PROC_SECONDARY) 1069 rte_mem_event_callback_unregister(VFIO_MEM_EVENT_CLB_NAME, 1070 NULL); 1071 1072 /* success */ 1073 ret = 0; 1074 1075 out: 1076 rte_mcfg_mem_read_unlock(); 1077 return ret; 1078 } 1079 1080 int 1081 rte_vfio_enable(const char *modname) 1082 { 1083 /* initialize group list */ 1084 int i, j; 1085 int vfio_available; 1086 const struct internal_config *internal_conf = 1087 eal_get_internal_configuration(); 1088 1089 rte_spinlock_recursive_t lock = RTE_SPINLOCK_RECURSIVE_INITIALIZER; 1090 1091 for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { 1092 vfio_cfgs[i].vfio_container_fd = -1; 1093 vfio_cfgs[i].vfio_active_groups = 0; 1094 vfio_cfgs[i].vfio_iommu_type = NULL; 1095 vfio_cfgs[i].mem_maps.lock = lock; 1096 1097 for (j = 0; j < VFIO_MAX_GROUPS; j++) { 1098 vfio_cfgs[i].vfio_groups[j].fd = -1; 1099 vfio_cfgs[i].vfio_groups[j].group_num = -1; 1100 vfio_cfgs[i].vfio_groups[j].devices = 0; 1101 } 1102 } 1103 1104 EAL_LOG(DEBUG, "Probing VFIO support..."); 1105 1106 /* check if vfio module is loaded */ 1107 vfio_available = rte_eal_check_module(modname); 1108 1109 /* return error directly */ 1110 if (vfio_available == -1) { 1111 EAL_LOG(INFO, "Could not get loaded module details!"); 1112 return -1; 1113 } 1114 1115 /* return 0 if VFIO modules not loaded */ 1116 if (vfio_available == 0) { 1117 EAL_LOG(DEBUG, 1118 "VFIO modules not loaded, skipping VFIO support..."); 1119 return 0; 1120 } 1121 1122 if (internal_conf->process_type == RTE_PROC_PRIMARY) { 1123 /* open a new container */ 1124 default_vfio_cfg->vfio_container_fd = 1125 rte_vfio_get_container_fd(); 1126 } else { 1127 /* get the default container from the primary process */ 1128 default_vfio_cfg->vfio_container_fd = 1129 vfio_get_default_container_fd(); 1130 } 1131 1132 /* check if we have VFIO driver enabled */ 1133 if (default_vfio_cfg->vfio_container_fd != -1) { 1134 EAL_LOG(INFO, "VFIO support initialized"); 1135 default_vfio_cfg->vfio_enabled = 1; 1136 } else { 1137 EAL_LOG(NOTICE, "VFIO support could not be initialized"); 1138 } 1139 1140 return 0; 1141 } 1142 1143 int 1144 rte_vfio_is_enabled(const char *modname) 1145 { 1146 const int mod_available = rte_eal_check_module(modname) > 0; 1147 return default_vfio_cfg->vfio_enabled && mod_available; 1148 } 1149 1150 int 1151 vfio_get_default_container_fd(void) 1152 { 1153 struct rte_mp_msg mp_req, *mp_rep; 1154 struct rte_mp_reply mp_reply = {0}; 1155 struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; 1156 struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; 1157 int container_fd; 1158 const struct internal_config *internal_conf = 1159 eal_get_internal_configuration(); 1160 1161 if (default_vfio_cfg->vfio_enabled) 1162 return default_vfio_cfg->vfio_container_fd; 1163 1164 if (internal_conf->process_type == RTE_PROC_PRIMARY) { 1165 /* if we were secondary process we would try requesting 1166 * container fd from the primary, but we're the primary 1167 * process so just exit here 1168 */ 1169 return -1; 1170 } 1171 1172 p->req = SOCKET_REQ_DEFAULT_CONTAINER; 1173 strcpy(mp_req.name, EAL_VFIO_MP); 1174 mp_req.len_param = sizeof(*p); 1175 mp_req.num_fds = 0; 1176 1177 if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && 1178 mp_reply.nb_received == 1) { 1179 mp_rep = &mp_reply.msgs[0]; 1180 p = (struct vfio_mp_param *)mp_rep->param; 1181 if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { 1182 container_fd = mp_rep->fds[0]; 1183 
free(mp_reply.msgs); 1184 return container_fd; 1185 } 1186 } 1187 1188 free(mp_reply.msgs); 1189 EAL_LOG(ERR, "Cannot request default VFIO container fd"); 1190 return -1; 1191 } 1192 1193 int 1194 vfio_get_iommu_type(void) 1195 { 1196 if (default_vfio_cfg->vfio_iommu_type == NULL) 1197 return -1; 1198 1199 return default_vfio_cfg->vfio_iommu_type->type_id; 1200 } 1201 1202 const struct vfio_iommu_type * 1203 vfio_set_iommu_type(int vfio_container_fd) 1204 { 1205 unsigned idx; 1206 for (idx = 0; idx < RTE_DIM(iommu_types); idx++) { 1207 const struct vfio_iommu_type *t = &iommu_types[idx]; 1208 1209 int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU, 1210 t->type_id); 1211 if (!ret) { 1212 EAL_LOG(INFO, "Using IOMMU type %d (%s)", 1213 t->type_id, t->name); 1214 return t; 1215 } 1216 /* not an error, there may be more supported IOMMU types */ 1217 EAL_LOG(DEBUG, "Set IOMMU type %d (%s) failed, error " 1218 "%i (%s)", t->type_id, t->name, errno, 1219 strerror(errno)); 1220 } 1221 /* if we didn't find a suitable IOMMU type, fail */ 1222 return NULL; 1223 } 1224 1225 int 1226 vfio_has_supported_extensions(int vfio_container_fd) 1227 { 1228 int ret; 1229 unsigned idx, n_extensions = 0; 1230 for (idx = 0; idx < RTE_DIM(iommu_types); idx++) { 1231 const struct vfio_iommu_type *t = &iommu_types[idx]; 1232 1233 ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION, 1234 t->type_id); 1235 if (ret < 0) { 1236 EAL_LOG(ERR, "Could not get IOMMU type, error " 1237 "%i (%s)", errno, strerror(errno)); 1238 close(vfio_container_fd); 1239 return -1; 1240 } else if (ret == 1) { 1241 /* we found a supported extension */ 1242 n_extensions++; 1243 } 1244 EAL_LOG(DEBUG, "IOMMU type %d (%s) is %s", 1245 t->type_id, t->name, 1246 ret ? "supported" : "not supported"); 1247 } 1248 1249 /* if we didn't find any supported IOMMU types, fail */ 1250 if (!n_extensions) { 1251 close(vfio_container_fd); 1252 return -1; 1253 } 1254 1255 return 0; 1256 } 1257 1258 int 1259 rte_vfio_get_container_fd(void) 1260 { 1261 int ret, vfio_container_fd; 1262 struct rte_mp_msg mp_req, *mp_rep; 1263 struct rte_mp_reply mp_reply = {0}; 1264 struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; 1265 struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; 1266 const struct internal_config *internal_conf = 1267 eal_get_internal_configuration(); 1268 1269 1270 /* if we're in a primary process, try to open the container */ 1271 if (internal_conf->process_type == RTE_PROC_PRIMARY) { 1272 vfio_container_fd = open(VFIO_CONTAINER_PATH, O_RDWR); 1273 if (vfio_container_fd < 0) { 1274 EAL_LOG(ERR, 1275 "Cannot open VFIO container %s, error " 1276 "%i (%s)", VFIO_CONTAINER_PATH, 1277 errno, strerror(errno)); 1278 return -1; 1279 } 1280 1281 /* check VFIO API version */ 1282 ret = ioctl(vfio_container_fd, VFIO_GET_API_VERSION); 1283 if (ret != VFIO_API_VERSION) { 1284 if (ret < 0) 1285 EAL_LOG(ERR, 1286 "Could not get VFIO API version, error " 1287 "%i (%s)", errno, strerror(errno)); 1288 else 1289 EAL_LOG(ERR, "Unsupported VFIO API version!"); 1290 close(vfio_container_fd); 1291 return -1; 1292 } 1293 1294 ret = vfio_has_supported_extensions(vfio_container_fd); 1295 if (ret) { 1296 EAL_LOG(ERR, 1297 "No supported IOMMU extensions found!"); 1298 return -1; 1299 } 1300 1301 return vfio_container_fd; 1302 } 1303 /* 1304 * if we're in a secondary process, request container fd from the 1305 * primary process via mp channel 1306 */ 1307 p->req = SOCKET_REQ_CONTAINER; 1308 strcpy(mp_req.name, EAL_VFIO_MP); 1309 mp_req.len_param = sizeof(*p); 1310 
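	/* the request itself carries no fds; the primary is expected to
	 * attach the container fd to its reply.
	 */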
mp_req.num_fds = 0; 1311 1312 vfio_container_fd = -1; 1313 if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && 1314 mp_reply.nb_received == 1) { 1315 mp_rep = &mp_reply.msgs[0]; 1316 p = (struct vfio_mp_param *)mp_rep->param; 1317 if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { 1318 vfio_container_fd = mp_rep->fds[0]; 1319 free(mp_reply.msgs); 1320 return vfio_container_fd; 1321 } 1322 } 1323 1324 free(mp_reply.msgs); 1325 EAL_LOG(ERR, "Cannot request VFIO container fd"); 1326 return -1; 1327 } 1328 1329 int 1330 rte_vfio_get_group_num(const char *sysfs_base, 1331 const char *dev_addr, int *iommu_group_num) 1332 { 1333 char linkname[PATH_MAX]; 1334 char filename[PATH_MAX]; 1335 char *tok[16], *group_tok, *end; 1336 int ret; 1337 1338 memset(linkname, 0, sizeof(linkname)); 1339 memset(filename, 0, sizeof(filename)); 1340 1341 /* try to find out IOMMU group for this device */ 1342 snprintf(linkname, sizeof(linkname), 1343 "%s/%s/iommu_group", sysfs_base, dev_addr); 1344 1345 ret = readlink(linkname, filename, sizeof(filename)); 1346 1347 /* if the link doesn't exist, no VFIO for us */ 1348 if (ret < 0) 1349 return 0; 1350 1351 ret = rte_strsplit(filename, sizeof(filename), 1352 tok, RTE_DIM(tok), '/'); 1353 1354 if (ret <= 0) { 1355 EAL_LOG(ERR, "%s cannot get IOMMU group", dev_addr); 1356 return -1; 1357 } 1358 1359 /* IOMMU group is always the last token */ 1360 errno = 0; 1361 group_tok = tok[ret - 1]; 1362 end = group_tok; 1363 *iommu_group_num = strtol(group_tok, &end, 10); 1364 if ((end != group_tok && *end != '\0') || errno != 0) { 1365 EAL_LOG(ERR, "%s error parsing IOMMU number!", dev_addr); 1366 return -1; 1367 } 1368 1369 return 1; 1370 } 1371 1372 static int 1373 type1_map(const struct rte_memseg_list *msl, const struct rte_memseg *ms, 1374 void *arg) 1375 { 1376 int *vfio_container_fd = arg; 1377 1378 /* skip external memory that isn't a heap */ 1379 if (msl->external && !msl->heap) 1380 return 0; 1381 1382 /* skip any segments with invalid IOVA addresses */ 1383 if (ms->iova == RTE_BAD_IOVA) 1384 return 0; 1385 1386 return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova, 1387 ms->len, 1); 1388 } 1389 1390 static int 1391 vfio_type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, 1392 uint64_t len, int do_map) 1393 { 1394 struct vfio_iommu_type1_dma_map dma_map; 1395 struct vfio_iommu_type1_dma_unmap dma_unmap; 1396 int ret; 1397 1398 if (do_map != 0) { 1399 memset(&dma_map, 0, sizeof(dma_map)); 1400 dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); 1401 dma_map.vaddr = vaddr; 1402 dma_map.size = len; 1403 dma_map.iova = iova; 1404 dma_map.flags = VFIO_DMA_MAP_FLAG_READ | 1405 VFIO_DMA_MAP_FLAG_WRITE; 1406 1407 ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); 1408 if (ret) { 1409 /** 1410 * In case the mapping was already done EEXIST will be 1411 * returned from kernel. 
			 */
			if (errno == EEXIST) {
				EAL_LOG(DEBUG,
					"Memory segment is already mapped, skipping");
			} else {
				EAL_LOG(ERR,
					"Cannot set up DMA remapping, error "
					"%i (%s)", errno, strerror(errno));
				return -1;
			}
		}
	} else {
		memset(&dma_unmap, 0, sizeof(dma_unmap));
		dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
		dma_unmap.size = len;
		dma_unmap.iova = iova;

		ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
				&dma_unmap);
		if (ret) {
			EAL_LOG(ERR, "Cannot clear DMA remapping, error "
					"%i (%s)", errno, strerror(errno));
			return -1;
		} else if (dma_unmap.size != len) {
			EAL_LOG(ERR, "Unexpected size %"PRIu64
				" of DMA remapping cleared instead of %"PRIu64,
				(uint64_t)dma_unmap.size, len);
			rte_errno = EIO;
			return -1;
		}
	}

	return 0;
}

static int
vfio_type1_dma_map(int vfio_container_fd)
{
	return rte_memseg_walk(type1_map, &vfio_container_fd);
}

/* Track the size of the statically allocated DMA window for SPAPR */
uint64_t spapr_dma_win_len;
uint64_t spapr_dma_win_page_sz;

static int
vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
		uint64_t len, int do_map)
{
	struct vfio_iommu_spapr_register_memory reg = {
		.argsz = sizeof(reg),
		.vaddr = (uintptr_t) vaddr,
		.size = len,
		.flags = 0
	};
	int ret;

	if (do_map != 0) {
		struct vfio_iommu_type1_dma_map dma_map;

		if (iova + len > spapr_dma_win_len) {
			EAL_LOG(ERR, "DMA map attempt outside DMA window");
			return -1;
		}

		ret = ioctl(vfio_container_fd,
				VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
		if (ret) {
			EAL_LOG(ERR,
				"Cannot register vaddr for IOMMU, error "
				"%i (%s)", errno, strerror(errno));
			return -1;
		}

		memset(&dma_map, 0, sizeof(dma_map));
		dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
		dma_map.vaddr = vaddr;
		dma_map.size = len;
		dma_map.iova = iova;
		dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE;

		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
		if (ret) {
			EAL_LOG(ERR, "Cannot map vaddr for IOMMU, error "
					"%i (%s)", errno, strerror(errno));
			return -1;
		}

	} else {
		struct vfio_iommu_type1_dma_unmap dma_unmap;

		memset(&dma_unmap, 0, sizeof(dma_unmap));
		dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
		dma_unmap.size = len;
		dma_unmap.iova = iova;

		ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
				&dma_unmap);
		if (ret) {
			EAL_LOG(ERR, "Cannot unmap vaddr for IOMMU, error "
					"%i (%s)", errno, strerror(errno));
			return -1;
		}

		ret = ioctl(vfio_container_fd,
				VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
		if (ret) {
			EAL_LOG(ERR,
				"Cannot unregister vaddr for IOMMU, error "
				"%i (%s)", errno, strerror(errno));
			return -1;
		}
	}

	return ret;
}

static int
vfio_spapr_map_walk(const struct rte_memseg_list *msl,
		const struct rte_memseg *ms, void *arg)
{
	int *vfio_container_fd = arg;

	/* skip external memory that isn't a heap */
	if (msl->external && !msl->heap)
		return 0;

	/* skip any segments with invalid IOVA addresses */
	if (ms->iova == RTE_BAD_IOVA)
		return 0;

	return
vfio_spapr_dma_do_map(*vfio_container_fd, 1545 ms->addr_64, ms->iova, ms->len, 1); 1546 } 1547 1548 struct spapr_size_walk_param { 1549 uint64_t max_va; 1550 uint64_t page_sz; 1551 bool is_user_managed; 1552 }; 1553 1554 /* 1555 * In order to set the DMA window size required for the SPAPR IOMMU 1556 * we need to walk the existing virtual memory allocations as well as 1557 * find the hugepage size used. 1558 */ 1559 static int 1560 vfio_spapr_size_walk(const struct rte_memseg_list *msl, void *arg) 1561 { 1562 struct spapr_size_walk_param *param = arg; 1563 uint64_t max = (uint64_t) msl->base_va + (uint64_t) msl->len; 1564 1565 if (msl->external && !msl->heap) { 1566 /* ignore user managed external memory */ 1567 param->is_user_managed = true; 1568 return 0; 1569 } 1570 1571 if (max > param->max_va) { 1572 param->page_sz = msl->page_sz; 1573 param->max_va = max; 1574 } 1575 1576 return 0; 1577 } 1578 1579 /* 1580 * Find the highest memory address used in physical or virtual address 1581 * space and use that as the top of the DMA window. 1582 */ 1583 static int 1584 find_highest_mem_addr(struct spapr_size_walk_param *param) 1585 { 1586 /* find the maximum IOVA address for setting the DMA window size */ 1587 if (rte_eal_iova_mode() == RTE_IOVA_PA) { 1588 static const char proc_iomem[] = "/proc/iomem"; 1589 static const char str_sysram[] = "System RAM"; 1590 uint64_t start, end, max = 0; 1591 char *line = NULL; 1592 char *dash, *space; 1593 size_t line_len; 1594 1595 /* 1596 * Example "System RAM" in /proc/iomem: 1597 * 00000000-1fffffffff : System RAM 1598 * 200000000000-201fffffffff : System RAM 1599 */ 1600 FILE *fd = fopen(proc_iomem, "r"); 1601 if (fd == NULL) { 1602 EAL_LOG(ERR, "Cannot open %s", proc_iomem); 1603 return -1; 1604 } 1605 /* Scan /proc/iomem for the highest PA in the system */ 1606 while (getline(&line, &line_len, fd) != -1) { 1607 if (strstr(line, str_sysram) == NULL) 1608 continue; 1609 1610 space = strstr(line, " "); 1611 dash = strstr(line, "-"); 1612 1613 /* Validate the format of the memory string */ 1614 if (space == NULL || dash == NULL || space < dash) { 1615 EAL_LOG(ERR, "Can't parse line \"%s\" in file %s", 1616 line, proc_iomem); 1617 continue; 1618 } 1619 1620 start = strtoull(line, NULL, 16); 1621 end = strtoull(dash + 1, NULL, 16); 1622 EAL_LOG(DEBUG, "Found system RAM from 0x%" PRIx64 1623 " to 0x%" PRIx64, start, end); 1624 if (end > max) 1625 max = end; 1626 } 1627 free(line); 1628 fclose(fd); 1629 1630 if (max == 0) { 1631 EAL_LOG(ERR, "Failed to find valid \"System RAM\" " 1632 "entry in file %s", proc_iomem); 1633 return -1; 1634 } 1635 1636 spapr_dma_win_len = rte_align64pow2(max + 1); 1637 return 0; 1638 } else if (rte_eal_iova_mode() == RTE_IOVA_VA) { 1639 EAL_LOG(DEBUG, "Highest VA address in memseg list is 0x%" 1640 PRIx64, param->max_va); 1641 spapr_dma_win_len = rte_align64pow2(param->max_va); 1642 return 0; 1643 } 1644 1645 spapr_dma_win_len = 0; 1646 EAL_LOG(ERR, "Unsupported IOVA mode"); 1647 return -1; 1648 } 1649 1650 1651 /* 1652 * The SPAPRv2 IOMMU supports 2 DMA windows with starting 1653 * address at 0 or 1<<59. By default, a DMA window is set 1654 * at address 0, 2GB long, with a 4KB page. For DPDK we 1655 * must remove the default window and setup a new DMA window 1656 * based on the hugepage size and memory requirements of 1657 * the application before we can map memory for DMA. 
 */
static int
spapr_dma_win_size(void)
{
	struct spapr_size_walk_param param;

	/* only create DMA window once */
	if (spapr_dma_win_len > 0)
		return 0;

	/* walk the memseg list to find the page size/max VA address */
	memset(&param, 0, sizeof(param));
	if (rte_memseg_list_walk(vfio_spapr_size_walk, &param) < 0) {
		EAL_LOG(ERR, "Failed to walk memseg list for DMA window size");
		return -1;
	}

	/* we can't be sure if DMA window covers external memory */
	if (param.is_user_managed)
		EAL_LOG(WARNING, "Detected user managed external memory which may not be managed by the IOMMU");

	/* check physical/virtual memory size */
	if (find_highest_mem_addr(&param) < 0)
		return -1;
	EAL_LOG(DEBUG, "Setting DMA window size to 0x%" PRIx64,
		spapr_dma_win_len);
	spapr_dma_win_page_sz = param.page_sz;
	rte_mem_set_dma_mask(rte_ctz64(spapr_dma_win_len));
	return 0;
}

static int
vfio_spapr_create_dma_window(int vfio_container_fd)
{
	struct vfio_iommu_spapr_tce_create create = {
		.argsz = sizeof(create), };
	struct vfio_iommu_spapr_tce_remove remove = {
		.argsz = sizeof(remove), };
	struct vfio_iommu_spapr_tce_info info = {
		.argsz = sizeof(info), };
	int ret;

	ret = spapr_dma_win_size();
	if (ret < 0)
		return ret;

	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
	if (ret) {
		EAL_LOG(ERR, "Cannot get IOMMU info, error %i (%s)",
			errno, strerror(errno));
		return -1;
	}

	/*
	 * sPAPR v1/v2 IOMMU always has a default 1G DMA window set. The window
	 * can't be changed for v1 but it can be changed for v2. Since DPDK only
	 * supports v2, remove the default DMA window so it can be resized.
	 */
	remove.start_addr = info.dma32_window_start;
	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
	if (ret)
		return -1;

	/* create a new DMA window (start address is not selectable) */
	create.window_size = spapr_dma_win_len;
	create.page_shift = rte_ctz64(spapr_dma_win_page_sz);
	create.levels = 1;
	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
#ifdef VFIO_IOMMU_SPAPR_INFO_DDW
	/*
	 * The vfio_iommu_spapr_tce_info structure was modified in
	 * Linux kernel 4.2.0 to add support for the
	 * vfio_iommu_spapr_tce_ddw_info structure needed to try
	 * multiple table levels. Skip the attempt if running with
	 * an older kernel.
1733 */ 1734 if (ret) { 1735 /* if at first we don't succeed, try more levels */ 1736 uint32_t levels; 1737 1738 for (levels = create.levels + 1; 1739 ret && levels <= info.ddw.levels; levels++) { 1740 create.levels = levels; 1741 ret = ioctl(vfio_container_fd, 1742 VFIO_IOMMU_SPAPR_TCE_CREATE, &create); 1743 } 1744 } 1745 #endif /* VFIO_IOMMU_SPAPR_INFO_DDW */ 1746 if (ret) { 1747 EAL_LOG(ERR, "Cannot create new DMA window, error " 1748 "%i (%s)", errno, strerror(errno)); 1749 EAL_LOG(ERR, 1750 "Consider using a larger hugepage size if supported by the system"); 1751 return -1; 1752 } 1753 1754 /* verify the start address */ 1755 if (create.start_addr != 0) { 1756 EAL_LOG(ERR, "Received unsupported start address 0x%" 1757 PRIx64, (uint64_t)create.start_addr); 1758 return -1; 1759 } 1760 return ret; 1761 } 1762 1763 static int 1764 vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, 1765 uint64_t iova, uint64_t len, int do_map) 1766 { 1767 int ret = 0; 1768 1769 if (do_map) { 1770 if (vfio_spapr_dma_do_map(vfio_container_fd, 1771 vaddr, iova, len, 1)) { 1772 EAL_LOG(ERR, "Failed to map DMA"); 1773 ret = -1; 1774 } 1775 } else { 1776 if (vfio_spapr_dma_do_map(vfio_container_fd, 1777 vaddr, iova, len, 0)) { 1778 EAL_LOG(ERR, "Failed to unmap DMA"); 1779 ret = -1; 1780 } 1781 } 1782 1783 return ret; 1784 } 1785 1786 static int 1787 vfio_spapr_dma_map(int vfio_container_fd) 1788 { 1789 if (vfio_spapr_create_dma_window(vfio_container_fd) < 0) { 1790 EAL_LOG(ERR, "Could not create new DMA window!"); 1791 return -1; 1792 } 1793 1794 /* map all existing DPDK segments for DMA */ 1795 if (rte_memseg_walk(vfio_spapr_map_walk, &vfio_container_fd) < 0) 1796 return -1; 1797 1798 return 0; 1799 } 1800 1801 static int 1802 vfio_noiommu_dma_map(int __rte_unused vfio_container_fd) 1803 { 1804 /* No-IOMMU mode does not need DMA mapping */ 1805 return 0; 1806 } 1807 1808 static int 1809 vfio_noiommu_dma_mem_map(int __rte_unused vfio_container_fd, 1810 uint64_t __rte_unused vaddr, 1811 uint64_t __rte_unused iova, uint64_t __rte_unused len, 1812 int __rte_unused do_map) 1813 { 1814 /* No-IOMMU mode does not need DMA mapping */ 1815 return 0; 1816 } 1817 1818 static int 1819 vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova, 1820 uint64_t len, int do_map) 1821 { 1822 const struct vfio_iommu_type *t = vfio_cfg->vfio_iommu_type; 1823 1824 if (!t) { 1825 EAL_LOG(ERR, "VFIO support not initialized"); 1826 rte_errno = ENODEV; 1827 return -1; 1828 } 1829 1830 if (!t->dma_user_map_func) { 1831 EAL_LOG(ERR, 1832 "VFIO custom DMA region mapping not supported by IOMMU %s", 1833 t->name); 1834 rte_errno = ENOTSUP; 1835 return -1; 1836 } 1837 1838 return t->dma_user_map_func(vfio_cfg->vfio_container_fd, vaddr, iova, 1839 len, do_map); 1840 } 1841 1842 static int 1843 container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova, 1844 uint64_t len) 1845 { 1846 struct user_mem_map *new_map; 1847 struct user_mem_maps *user_mem_maps; 1848 bool has_partial_unmap; 1849 int ret = 0; 1850 1851 user_mem_maps = &vfio_cfg->mem_maps; 1852 rte_spinlock_recursive_lock(&user_mem_maps->lock); 1853 if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) { 1854 EAL_LOG(ERR, "No more space for user mem maps"); 1855 rte_errno = ENOMEM; 1856 ret = -1; 1857 goto out; 1858 } 1859 /* map the entry */ 1860 if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 1)) { 1861 /* technically, this will fail if there are currently no devices 1862 * plugged in, even if a device were added later, this mapping 1863 
* might have succeeded. however, since we cannot verify if this 1864 * is a valid mapping without having a device attached, consider 1865 * this to be unsupported, because we can't just store any old 1866 * mapping and pollute list of active mappings willy-nilly. 1867 */ 1868 EAL_LOG(ERR, "Couldn't map new region for DMA"); 1869 ret = -1; 1870 goto out; 1871 } 1872 /* do we have partial unmap support? */ 1873 has_partial_unmap = vfio_cfg->vfio_iommu_type->partial_unmap; 1874 1875 /* create new user mem map entry */ 1876 new_map = &user_mem_maps->maps[user_mem_maps->n_maps++]; 1877 new_map->addr = vaddr; 1878 new_map->iova = iova; 1879 new_map->len = len; 1880 /* for IOMMU types supporting partial unmap, we don't need chunking */ 1881 new_map->chunk = has_partial_unmap ? 0 : len; 1882 1883 compact_user_maps(user_mem_maps); 1884 out: 1885 rte_spinlock_recursive_unlock(&user_mem_maps->lock); 1886 return ret; 1887 } 1888 1889 static int 1890 container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova, 1891 uint64_t len) 1892 { 1893 struct user_mem_map orig_maps[VFIO_MAX_USER_MEM_MAPS]; 1894 struct user_mem_map new_maps[2]; /* can be at most 2 */ 1895 struct user_mem_maps *user_mem_maps; 1896 int n_orig, n_new, newlen, ret = 0; 1897 bool has_partial_unmap; 1898 1899 user_mem_maps = &vfio_cfg->mem_maps; 1900 rte_spinlock_recursive_lock(&user_mem_maps->lock); 1901 1902 /* 1903 * Previously, we had adjacent mappings entirely contained within one 1904 * mapping entry. Since we now store original mapping length in some 1905 * cases, this is no longer the case, so unmapping can potentially go 1906 * over multiple segments and split them in any number of ways. 1907 * 1908 * To complicate things further, some IOMMU types support arbitrary 1909 * partial unmapping, while others will only support unmapping along the 1910 * chunk size, so there are a lot of cases we need to handle. To make 1911 * things easier code wise, instead of trying to adjust existing 1912 * mappings, let's just rebuild them using information we have. 1913 */ 1914 1915 /* 1916 * first thing to do is check if there exists a mapping that includes 1917 * the start and the end of our requested unmap. We need to collect all 1918 * maps that include our unmapped region. 1919 */ 1920 n_orig = find_user_mem_maps(user_mem_maps, vaddr, iova, len, 1921 orig_maps, RTE_DIM(orig_maps)); 1922 /* did we find anything? */ 1923 if (n_orig < 0) { 1924 EAL_LOG(ERR, "Couldn't find previously mapped region"); 1925 rte_errno = EINVAL; 1926 ret = -1; 1927 goto out; 1928 } 1929 1930 /* do we have partial unmap capability? */ 1931 has_partial_unmap = vfio_cfg->vfio_iommu_type->partial_unmap; 1932 1933 /* 1934 * if we don't support partial unmap, we must check if start and end of 1935 * current unmap region are chunk-aligned. 1936 */ 1937 if (!has_partial_unmap) { 1938 bool start_aligned, end_aligned; 1939 1940 start_aligned = addr_is_chunk_aligned(orig_maps, n_orig, 1941 vaddr, iova); 1942 end_aligned = addr_is_chunk_aligned(orig_maps, n_orig, 1943 vaddr + len, iova + len); 1944 1945 if (!start_aligned || !end_aligned) { 1946 EAL_LOG(DEBUG, "DMA partial unmap unsupported"); 1947 rte_errno = ENOTSUP; 1948 ret = -1; 1949 goto out; 1950 } 1951 } 1952 1953 /* 1954 * now we know we can potentially unmap the region, but we still have to 1955 * figure out if there is enough space in our list to store remaining 1956 * maps. 
for this, we will figure out how many segments we are going to 1957 * remove, and how many new segments we are going to create. 1958 */ 1959 n_new = process_maps(orig_maps, n_orig, new_maps, vaddr, len); 1960 1961 /* can we store the new maps in our list? */ 1962 newlen = (user_mem_maps->n_maps - n_orig) + n_new; 1963 if (newlen >= VFIO_MAX_USER_MEM_MAPS) { 1964 EAL_LOG(ERR, "Not enough space to store partial mapping"); 1965 rte_errno = ENOMEM; 1966 ret = -1; 1967 goto out; 1968 } 1969 1970 /* unmap the entry */ 1971 if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 0)) { 1972 /* there may not be any devices plugged in, so unmapping will 1973 * fail with ENODEV/ENOTSUP rte_errno values, but that doesn't 1974 * stop us from removing the mapping, as the assumption is we 1975 * won't be needing this memory any more and thus will want to 1976 * prevent it from being remapped again on hotplug. so, only 1977 * fail if we indeed failed to unmap (e.g. if the mapping was 1978 * within our mapped range but had invalid alignment). 1979 */ 1980 if (rte_errno != ENODEV && rte_errno != ENOTSUP) { 1981 EAL_LOG(ERR, "Couldn't unmap region for DMA"); 1982 ret = -1; 1983 goto out; 1984 } else { 1985 EAL_LOG(DEBUG, "DMA unmapping failed, but removing mappings anyway"); 1986 } 1987 } 1988 1989 /* we have unmapped the region, so now update the maps */ 1990 delete_maps(user_mem_maps, orig_maps, n_orig); 1991 copy_maps(user_mem_maps, new_maps, n_new); 1992 compact_user_maps(user_mem_maps); 1993 out: 1994 rte_spinlock_recursive_unlock(&user_mem_maps->lock); 1995 return ret; 1996 } 1997 1998 int 1999 rte_vfio_noiommu_is_enabled(void) 2000 { 2001 int fd; 2002 ssize_t cnt; 2003 char c; 2004 2005 fd = open(VFIO_NOIOMMU_MODE, O_RDONLY); 2006 if (fd < 0) { 2007 if (errno != ENOENT) { 2008 EAL_LOG(ERR, "Cannot open VFIO noiommu file " 2009 "%i (%s)", errno, strerror(errno)); 2010 return -1; 2011 } 2012 /* 2013 * else the file does not exists 2014 * i.e. 
noiommu is not enabled 2015 */ 2016 return 0; 2017 } 2018 2019 cnt = read(fd, &c, 1); 2020 close(fd); 2021 if (cnt != 1) { 2022 EAL_LOG(ERR, "Unable to read from VFIO noiommu file " 2023 "%i (%s)", errno, strerror(errno)); 2024 return -1; 2025 } 2026 2027 return c == 'Y'; 2028 } 2029 2030 int 2031 rte_vfio_container_create(void) 2032 { 2033 int i; 2034 2035 /* Find an empty slot to store new vfio config */ 2036 for (i = 1; i < VFIO_MAX_CONTAINERS; i++) { 2037 if (vfio_cfgs[i].vfio_container_fd == -1) 2038 break; 2039 } 2040 2041 if (i == VFIO_MAX_CONTAINERS) { 2042 EAL_LOG(ERR, "Exceed max VFIO container limit"); 2043 return -1; 2044 } 2045 2046 vfio_cfgs[i].vfio_container_fd = rte_vfio_get_container_fd(); 2047 if (vfio_cfgs[i].vfio_container_fd < 0) { 2048 EAL_LOG(NOTICE, "Fail to create a new VFIO container"); 2049 return -1; 2050 } 2051 2052 return vfio_cfgs[i].vfio_container_fd; 2053 } 2054 2055 int 2056 rte_vfio_container_destroy(int container_fd) 2057 { 2058 struct vfio_config *vfio_cfg; 2059 int i; 2060 2061 vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); 2062 if (vfio_cfg == NULL) { 2063 EAL_LOG(ERR, "Invalid VFIO container fd"); 2064 return -1; 2065 } 2066 2067 for (i = 0; i < VFIO_MAX_GROUPS; i++) 2068 if (vfio_cfg->vfio_groups[i].group_num != -1) 2069 rte_vfio_container_group_unbind(container_fd, 2070 vfio_cfg->vfio_groups[i].group_num); 2071 2072 close(container_fd); 2073 vfio_cfg->vfio_container_fd = -1; 2074 vfio_cfg->vfio_active_groups = 0; 2075 vfio_cfg->vfio_iommu_type = NULL; 2076 2077 return 0; 2078 } 2079 2080 int 2081 rte_vfio_container_group_bind(int container_fd, int iommu_group_num) 2082 { 2083 struct vfio_config *vfio_cfg; 2084 2085 vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); 2086 if (vfio_cfg == NULL) { 2087 EAL_LOG(ERR, "Invalid VFIO container fd"); 2088 return -1; 2089 } 2090 2091 return vfio_get_group_fd(vfio_cfg, iommu_group_num); 2092 } 2093 2094 int 2095 rte_vfio_container_group_unbind(int container_fd, int iommu_group_num) 2096 { 2097 struct vfio_config *vfio_cfg; 2098 struct vfio_group *cur_grp = NULL; 2099 int i; 2100 2101 vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); 2102 if (vfio_cfg == NULL) { 2103 EAL_LOG(ERR, "Invalid VFIO container fd"); 2104 return -1; 2105 } 2106 2107 for (i = 0; i < VFIO_MAX_GROUPS; i++) { 2108 if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num) { 2109 cur_grp = &vfio_cfg->vfio_groups[i]; 2110 break; 2111 } 2112 } 2113 2114 /* This should not happen */ 2115 if (i == VFIO_MAX_GROUPS || cur_grp == NULL) { 2116 EAL_LOG(ERR, "Specified VFIO group number not found"); 2117 return -1; 2118 } 2119 2120 if (cur_grp->fd >= 0 && close(cur_grp->fd) < 0) { 2121 EAL_LOG(ERR, 2122 "Error when closing vfio_group_fd for iommu_group_num " 2123 "%d", iommu_group_num); 2124 return -1; 2125 } 2126 cur_grp->group_num = -1; 2127 cur_grp->fd = -1; 2128 cur_grp->devices = 0; 2129 vfio_cfg->vfio_active_groups--; 2130 2131 return 0; 2132 } 2133 2134 int 2135 rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova, 2136 uint64_t len) 2137 { 2138 struct vfio_config *vfio_cfg; 2139 2140 if (len == 0) { 2141 rte_errno = EINVAL; 2142 return -1; 2143 } 2144 2145 vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); 2146 if (vfio_cfg == NULL) { 2147 EAL_LOG(ERR, "Invalid VFIO container fd"); 2148 return -1; 2149 } 2150 2151 return container_dma_map(vfio_cfg, vaddr, iova, len); 2152 } 2153 2154 int 2155 rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova, 2156 uint64_t len) 2157 { 2158 
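	/* Note: if the container's IOMMU type has no partial unmap support
	 * (e.g. VFIO type1), the start and end of the range being unmapped
	 * must be aligned to the chunk size of the original mapping(s);
	 * see container_dma_unmap() for the alignment checks.
	 */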
struct vfio_config *vfio_cfg; 2159 2160 if (len == 0) { 2161 rte_errno = EINVAL; 2162 return -1; 2163 } 2164 2165 vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); 2166 if (vfio_cfg == NULL) { 2167 EAL_LOG(ERR, "Invalid VFIO container fd"); 2168 return -1; 2169 } 2170 2171 return container_dma_unmap(vfio_cfg, vaddr, iova, len); 2172 } 2173
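/*
 * Illustrative usage sketch (not compiled as part of EAL): how an application
 * might drive the container API implemented above. The sysfs base, device
 * address, IOVA and buffer below are assumptions made for this example only;
 * a real application would use memory it owns (e.g. hugepage-backed) and an
 * IOVA layout of its choosing.
 *
 *	int container_fd, iommu_group_num;
 *	uint64_t iova = 0x100000000ULL;
 *	void *va = ...;	(hugepage-backed buffer of RTE_PGSIZE_2M bytes)
 *
 *	container_fd = rte_vfio_container_create();
 *	if (container_fd < 0)
 *		return -1;
 *	if (rte_vfio_get_group_num("/sys/bus/pci/devices", "0000:00:01.0",
 *			&iommu_group_num) <= 0)
 *		return -1;
 *	if (rte_vfio_container_group_bind(container_fd, iommu_group_num) < 0)
 *		return -1;
 *
 *	(a device from that group must then be set up, e.g. by the PCI bus
 *	calling rte_vfio_setup_device() during probe, before the container's
 *	IOMMU type is known and the DMA mapping below can succeed)
 *
 *	if (rte_vfio_container_dma_map(container_fd,
 *			(uint64_t)(uintptr_t)va, iova, RTE_PGSIZE_2M) < 0)
 *		return -1;
 *	...
 *	rte_vfio_container_dma_unmap(container_fd,
 *			(uint64_t)(uintptr_t)va, iova, RTE_PGSIZE_2M);
 *	rte_vfio_container_destroy(container_fd);
 */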