/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2018 Intel Corporation
 */

#include <inttypes.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#include <rte_errno.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_eal_memconfig.h>
#include <rte_vfio.h>

#include "eal_filesystem.h"
#include "eal_memcfg.h"
#include "eal_vfio.h"
#include "eal_private.h"
#include "eal_internal_cfg.h"

#define VFIO_MEM_EVENT_CLB_NAME "vfio_mem_event_clb"

/* hot plug/unplug of VFIO groups may cause all DMA maps to be dropped. we can
 * recreate the mappings for DPDK segments, but we cannot do so for memory that
 * was registered by the user themselves, so we need to store the user mappings
 * somewhere, to recreate them later.
 */
#define VFIO_MAX_USER_MEM_MAPS 256
struct user_mem_map {
	uint64_t addr;  /**< start VA */
	uint64_t iova;  /**< start IOVA */
	uint64_t len;   /**< total length of the mapping */
	uint64_t chunk; /**< this mapping can be split in chunks of this size */
};

struct user_mem_maps {
	rte_spinlock_recursive_t lock;
	int n_maps;
	struct user_mem_map maps[VFIO_MAX_USER_MEM_MAPS];
};

struct vfio_config {
	int vfio_enabled;
	int vfio_container_fd;
	int vfio_active_groups;
	const struct vfio_iommu_type *vfio_iommu_type;
	struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
	struct user_mem_maps mem_maps;
};

/* per-process VFIO config */
static struct vfio_config vfio_cfgs[VFIO_MAX_CONTAINERS];
static struct vfio_config *default_vfio_cfg = &vfio_cfgs[0];

static int vfio_type1_dma_map(int);
static int vfio_type1_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
static int vfio_spapr_dma_map(int);
static int vfio_spapr_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
static int vfio_noiommu_dma_map(int);
static int vfio_noiommu_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
static int vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr,
		uint64_t iova, uint64_t len, int do_map);

/* IOMMU types we support */
static const struct vfio_iommu_type iommu_types[] = {
	/* x86 IOMMU, otherwise known as type 1 */
	{
		.type_id = RTE_VFIO_TYPE1,
		.name = "Type 1",
		.partial_unmap = false,
		.dma_map_func = &vfio_type1_dma_map,
		.dma_user_map_func = &vfio_type1_dma_mem_map
	},
	/* ppc64 IOMMU, otherwise known as spapr */
	{
		.type_id = RTE_VFIO_SPAPR,
		.name = "sPAPR",
		.partial_unmap = true,
		.dma_map_func = &vfio_spapr_dma_map,
		.dma_user_map_func = &vfio_spapr_dma_mem_map
	},
	/* IOMMU-less mode */
	{
		.type_id = RTE_VFIO_NOIOMMU,
		.name = "No-IOMMU",
		.partial_unmap = true,
		.dma_map_func = &vfio_noiommu_dma_map,
		.dma_user_map_func = &vfio_noiommu_dma_mem_map
	},
};

static int
is_null_map(const struct user_mem_map *map)
{
	return map->addr == 0 && map->iova == 0 &&
			map->len == 0 && map->chunk == 0;
}
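
/*
 * Illustrative sketch (not part of the EAL): what a stored user_mem_map entry
 * might look like after a successful user DMA mapping. The concrete values
 * are hypothetical and only show how the fields relate: for IOMMU types
 * without partial-unmap support, chunk equals len, so the region can only be
 * unmapped as a whole; for types that do support partial unmap, chunk is 0.
 *
 *	struct user_mem_map example = {
 *		.addr  = 0x100000000,	// VA supplied by the user
 *		.iova  = 0x100000000,	// IOVA it was mapped at
 *		.len   = 0x200000,	// 2 MB region
 *		.chunk = 0x200000,	// Type1: whole-region unmap only
 *	};
 */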

/* we may need to merge user mem maps together in case of user mapping/unmapping
 * chunks of memory, so we'll need a comparator function to sort segments.
 */
static int
user_mem_map_cmp(const void *a, const void *b)
{
	const struct user_mem_map *umm_a = a;
	const struct user_mem_map *umm_b = b;

	/* move null entries to end */
	if (is_null_map(umm_a))
		return 1;
	if (is_null_map(umm_b))
		return -1;

	/* sort by iova first */
	if (umm_a->iova < umm_b->iova)
		return -1;
	if (umm_a->iova > umm_b->iova)
		return 1;

	if (umm_a->addr < umm_b->addr)
		return -1;
	if (umm_a->addr > umm_b->addr)
		return 1;

	if (umm_a->len < umm_b->len)
		return -1;
	if (umm_a->len > umm_b->len)
		return 1;

	if (umm_a->chunk < umm_b->chunk)
		return -1;
	if (umm_a->chunk > umm_b->chunk)
		return 1;

	return 0;
}

/*
 * Take in an address range and list of current mappings, and produce a list of
 * mappings that will be kept.
 */
static int
process_maps(struct user_mem_map *src, size_t src_len,
		struct user_mem_map newmap[2], uint64_t vaddr, uint64_t len)
{
	struct user_mem_map *src_first = &src[0];
	struct user_mem_map *src_last = &src[src_len - 1];
	struct user_mem_map *dst_first = &newmap[0];
	/* we can get at most two new segments */
	struct user_mem_map *dst_last = &newmap[1];
	uint64_t first_off = vaddr - src_first->addr;
	uint64_t last_off = (src_last->addr + src_last->len) - (vaddr + len);
	int newmap_len = 0;

	if (first_off != 0) {
		dst_first->addr = src_first->addr;
		dst_first->iova = src_first->iova;
		dst_first->len = first_off;
		dst_first->chunk = src_first->chunk;

		newmap_len++;
	}
	if (last_off != 0) {
		/* if we had start offset, we have two segments */
		struct user_mem_map *last =
				first_off == 0 ? dst_first : dst_last;
		last->addr = (src_last->addr + src_last->len) - last_off;
		last->iova = (src_last->iova + src_last->len) - last_off;
		last->len = last_off;
		last->chunk = src_last->chunk;

		newmap_len++;
	}
	return newmap_len;
}

/* erase certain maps from the list */
static void
delete_maps(struct user_mem_maps *user_mem_maps, struct user_mem_map *del_maps,
		size_t n_del)
{
	int i;
	size_t j;

	for (i = 0, j = 0; i < VFIO_MAX_USER_MEM_MAPS && j < n_del; i++) {
		struct user_mem_map *left = &user_mem_maps->maps[i];
		struct user_mem_map *right = &del_maps[j];

		if (user_mem_map_cmp(left, right) == 0) {
			memset(left, 0, sizeof(*left));
			j++;
			user_mem_maps->n_maps--;
		}
	}
}

static void
copy_maps(struct user_mem_maps *user_mem_maps, struct user_mem_map *add_maps,
		size_t n_add)
{
	int i;
	size_t j;

	for (i = 0, j = 0; i < VFIO_MAX_USER_MEM_MAPS && j < n_add; i++) {
		struct user_mem_map *left = &user_mem_maps->maps[i];
		struct user_mem_map *right = &add_maps[j];

		/* insert into empty space */
		if (is_null_map(left)) {
			memcpy(left, right, sizeof(*left));
			j++;
			user_mem_maps->n_maps++;
		}
	}
}

/* try merging two maps into one, return 1 if succeeded */
static int
merge_map(struct user_mem_map *left, struct user_mem_map *right)
{
	/* merge the same maps into one */
	if (memcmp(left, right, sizeof(struct user_mem_map)) == 0)
		goto out;

	if (left->addr + left->len != right->addr)
		return 0;
	if (left->iova + left->len != right->iova)
		return 0;
	if (left->chunk != right->chunk)
		return 0;
	left->len += right->len;

out:
	memset(right, 0, sizeof(*right));

	return 1;
}

static bool
addr_is_chunk_aligned(struct user_mem_map *maps, size_t n_maps,
		uint64_t vaddr, uint64_t iova)
{
	unsigned int i;

	for (i = 0; i < n_maps; i++) {
		struct user_mem_map *map = &maps[i];
		uint64_t map_va_end = map->addr + map->len;
		uint64_t map_iova_end = map->iova + map->len;
		uint64_t map_va_off = vaddr - map->addr;
		uint64_t map_iova_off = iova - map->iova;

		/* we include end of the segment in comparison as well */
		bool addr_in_map = (vaddr >= map->addr) && (vaddr <= map_va_end);
		bool iova_in_map = (iova >= map->iova) && (iova <= map_iova_end);
		/* chunk may not be power of two, so use modulo */
		bool addr_is_aligned = (map_va_off % map->chunk) == 0;
		bool iova_is_aligned = (map_iova_off % map->chunk) == 0;

		if (addr_in_map && iova_in_map &&
				addr_is_aligned && iova_is_aligned)
			return true;
	}
	return false;
}

static int
find_user_mem_maps(struct user_mem_maps *user_mem_maps, uint64_t addr,
		uint64_t iova, uint64_t len, struct user_mem_map *dst,
		size_t dst_len)
{
	uint64_t va_end = addr + len;
	uint64_t iova_end = iova + len;
	bool found = false;
	size_t j;
	int i, ret;

	for (i = 0, j = 0; i < user_mem_maps->n_maps; i++) {
		struct user_mem_map *map = &user_mem_maps->maps[i];
		uint64_t map_va_end = map->addr + map->len;
		uint64_t map_iova_end = map->iova + map->len;

		bool start_addr_in_map = (addr >= map->addr) &&
				(addr < map_va_end);
		bool end_addr_in_map = (va_end > map->addr) &&
				(va_end <= map_va_end);
		bool start_iova_in_map = (iova >= map->iova) &&
				(iova < map_iova_end);
		bool end_iova_in_map = (iova_end > map->iova) &&
				(iova_end <= map_iova_end);

		/* do we have space in temporary map? */
		if (j == dst_len) {
			ret = -ENOSPC;
			goto err;
		}
		/* check if current map is start of our segment */
		if (!found && start_addr_in_map && start_iova_in_map)
			found = true;
		/* if we have previously found a segment, add it to the map */
		if (found) {
			/* copy the segment into our temporary map */
			memcpy(&dst[j++], map, sizeof(*map));

			/* if we match end of segment, quit */
			if (end_addr_in_map && end_iova_in_map)
				return j;
		}
	}
	/* we didn't find anything */
	ret = -ENOENT;
err:
	memset(dst, 0, sizeof(*dst) * dst_len);
	return ret;
}

/* this will sort all user maps, and merge/compact any adjacent maps */
static void
compact_user_maps(struct user_mem_maps *user_mem_maps)
{
	int i;

	qsort(user_mem_maps->maps, VFIO_MAX_USER_MEM_MAPS,
			sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);

	/* we'll go over the list backwards when merging */
	for (i = VFIO_MAX_USER_MEM_MAPS - 2; i >= 0; i--) {
		struct user_mem_map *l, *r;

		l = &user_mem_maps->maps[i];
		r = &user_mem_maps->maps[i + 1];

		if (is_null_map(l) || is_null_map(r))
			continue;

		/* try and merge the maps */
		if (merge_map(l, r))
			user_mem_maps->n_maps--;
	}

	/* the entries are still sorted, but now they have holes in them, so
	 * sort the list again.
	 */
	qsort(user_mem_maps->maps, VFIO_MAX_USER_MEM_MAPS,
			sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);
}
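
/*
 * Illustrative sketch (not part of the EAL): how compact_user_maps() collapses
 * adjacent entries. The addresses below are hypothetical. Two entries that are
 * contiguous in both VA and IOVA space and share the same chunk size:
 *
 *	{ .addr = 0x1000, .iova = 0x1000, .len = 0x1000, .chunk = 0x1000 }
 *	{ .addr = 0x2000, .iova = 0x2000, .len = 0x1000, .chunk = 0x1000 }
 *
 * become a single entry after sorting and merging:
 *
 *	{ .addr = 0x1000, .iova = 0x1000, .len = 0x2000, .chunk = 0x1000 }
 *
 * Entries that differ in chunk size, or that are not contiguous in either
 * address space, are left untouched by merge_map().
 */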

static int
vfio_open_group_fd(int iommu_group_num)
{
	int vfio_group_fd;
	char filename[PATH_MAX];
	struct rte_mp_msg mp_req, *mp_rep;
	struct rte_mp_reply mp_reply = {0};
	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
	struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* if primary, try to open the group */
	if (internal_conf->process_type == RTE_PROC_PRIMARY) {
		/* try regular group format */
		snprintf(filename, sizeof(filename),
				VFIO_GROUP_FMT, iommu_group_num);
		vfio_group_fd = open(filename, O_RDWR);
		if (vfio_group_fd < 0) {
			/* if file not found, it's not an error */
			if (errno != ENOENT) {
				EAL_LOG(ERR, "Cannot open %s: %s",
						filename, strerror(errno));
				return -1;
			}

			/* special case: try no-IOMMU path as well */
			snprintf(filename, sizeof(filename),
					VFIO_NOIOMMU_GROUP_FMT,
					iommu_group_num);
			vfio_group_fd = open(filename, O_RDWR);
			if (vfio_group_fd < 0) {
				if (errno != ENOENT) {
					EAL_LOG(ERR,
						"Cannot open %s: %s",
						filename, strerror(errno));
					return -1;
				}
				return -ENOENT;
			}
			/* noiommu group found */
		}

		return vfio_group_fd;
	}
	/* if we're in a secondary process, request group fd from the primary
	 * process via mp channel.
	 */
	p->req = SOCKET_REQ_GROUP;
	p->group_num = iommu_group_num;
	strcpy(mp_req.name, EAL_VFIO_MP);
	mp_req.len_param = sizeof(*p);
	mp_req.num_fds = 0;

	vfio_group_fd = -1;
	if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
			mp_reply.nb_received == 1) {
		mp_rep = &mp_reply.msgs[0];
		p = (struct vfio_mp_param *)mp_rep->param;
		if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
			vfio_group_fd = mp_rep->fds[0];
		} else if (p->result == SOCKET_NO_FD) {
			EAL_LOG(ERR, "Bad VFIO group fd");
			vfio_group_fd = -ENOENT;
		}
	}

	free(mp_reply.msgs);
	if (vfio_group_fd < 0 && vfio_group_fd != -ENOENT)
		EAL_LOG(ERR, "Cannot request VFIO group fd");
	return vfio_group_fd;
}

static struct vfio_config *
get_vfio_cfg_by_group_num(int iommu_group_num)
{
	struct vfio_config *vfio_cfg;
	int i, j;

	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
		vfio_cfg = &vfio_cfgs[i];
		for (j = 0; j < VFIO_MAX_GROUPS; j++) {
			if (vfio_cfg->vfio_groups[j].group_num ==
					iommu_group_num)
				return vfio_cfg;
		}
	}

	return NULL;
}

static int
vfio_get_group_fd(struct vfio_config *vfio_cfg,
		int iommu_group_num)
{
	int i;
	int vfio_group_fd;
	struct vfio_group *cur_grp;

	/* check if we already have the group descriptor open */
	for (i = 0; i < VFIO_MAX_GROUPS; i++)
		if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num)
			return vfio_cfg->vfio_groups[i].fd;

	/* Let's see first if there is room for a new group */
	if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) {
		EAL_LOG(ERR, "Maximum number of VFIO groups reached!");
		return -1;
	}

	/* Now let's get an index for the new group */
	for (i = 0; i < VFIO_MAX_GROUPS; i++)
		if (vfio_cfg->vfio_groups[i].group_num == -1) {
			cur_grp = &vfio_cfg->vfio_groups[i];
			break;
		}

	/* This should not happen */
	if (i == VFIO_MAX_GROUPS) {
		EAL_LOG(ERR, "No VFIO group free slot found");
		return -1;
	}

	vfio_group_fd = vfio_open_group_fd(iommu_group_num);
	if (vfio_group_fd < 0) {
		EAL_LOG(ERR, "Failed to open VFIO group %d",
				iommu_group_num);
		return vfio_group_fd;
	}

	cur_grp->group_num = iommu_group_num;
	cur_grp->fd = vfio_group_fd;
	vfio_cfg->vfio_active_groups++;

	return vfio_group_fd;
}

static struct vfio_config *
get_vfio_cfg_by_group_fd(int vfio_group_fd)
{
	struct vfio_config *vfio_cfg;
	int i, j;

	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
		vfio_cfg = &vfio_cfgs[i];
		for (j = 0; j < VFIO_MAX_GROUPS; j++)
			if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
				return vfio_cfg;
	}

	return NULL;
}

static struct vfio_config *
get_vfio_cfg_by_container_fd(int container_fd)
{
	int i;

	if (container_fd == RTE_VFIO_DEFAULT_CONTAINER_FD)
		return default_vfio_cfg;

	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
		if (vfio_cfgs[i].vfio_container_fd == container_fd)
			return &vfio_cfgs[i];
	}

	return NULL;
}

int
rte_vfio_get_group_fd(int iommu_group_num)
{
	struct vfio_config *vfio_cfg;

	/* get the vfio_config it belongs to */
	vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
	vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;

	return vfio_get_group_fd(vfio_cfg, iommu_group_num);
}

static int
get_vfio_group_idx(int vfio_group_fd)
{
	struct vfio_config *vfio_cfg;
	int i, j;

	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
		vfio_cfg = &vfio_cfgs[i];
		for (j = 0; j < VFIO_MAX_GROUPS; j++)
			if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
				return j;
	}

	return -1;
}

static void
vfio_group_device_get(int vfio_group_fd)
{
	struct vfio_config *vfio_cfg;
	int i;

	vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
	if (vfio_cfg == NULL) {
		EAL_LOG(ERR, "Invalid VFIO group fd!");
		return;
	}

	i = get_vfio_group_idx(vfio_group_fd);
	if (i < 0 || i > (VFIO_MAX_GROUPS - 1))
		EAL_LOG(ERR, "Wrong VFIO group index (%d)", i);
	else
		vfio_cfg->vfio_groups[i].devices++;
}

static void
vfio_group_device_put(int vfio_group_fd)
{
	struct vfio_config *vfio_cfg;
	int i;

	vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
	if (vfio_cfg == NULL) {
		EAL_LOG(ERR, "Invalid VFIO group fd!");
		return;
	}

	i = get_vfio_group_idx(vfio_group_fd);
	if (i < 0 || i > (VFIO_MAX_GROUPS - 1))
		EAL_LOG(ERR, "Wrong VFIO group index (%d)", i);
	else
		vfio_cfg->vfio_groups[i].devices--;
}

static int
vfio_group_device_count(int vfio_group_fd)
{
	struct vfio_config *vfio_cfg;
	int i;

	vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
	if (vfio_cfg == NULL) {
		EAL_LOG(ERR, "Invalid VFIO group fd!");
		return -1;
	}

	i = get_vfio_group_idx(vfio_group_fd);
	if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) {
		EAL_LOG(ERR, "Wrong VFIO group index (%d)", i);
		return -1;
	}

	return vfio_cfg->vfio_groups[i].devices;
}

static void
vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len,
		void *arg __rte_unused)
{
	struct rte_memseg_list *msl;
	struct rte_memseg *ms;
	size_t cur_len = 0;

	msl = rte_mem_virt2memseg_list(addr);

	/* for IOVA as VA mode, no need to care for IOVA addresses */
	if (rte_eal_iova_mode() == RTE_IOVA_VA && msl->external == 0) {
		uint64_t vfio_va = (uint64_t)(uintptr_t)addr;
		uint64_t page_sz = msl->page_sz;

		/* Maintain granularity of DMA map/unmap to memseg size */
		for (; cur_len < len; cur_len += page_sz) {
			if (type == RTE_MEM_EVENT_ALLOC)
				vfio_dma_mem_map(default_vfio_cfg, vfio_va,
						vfio_va, page_sz, 1);
			else
				vfio_dma_mem_map(default_vfio_cfg, vfio_va,
						vfio_va, page_sz, 0);
			vfio_va += page_sz;
		}

		return;
	}

	/* memsegs are contiguous in memory */
	ms = rte_mem_virt2memseg(addr, msl);
	while (cur_len < len) {
		/* some memory segments may have invalid IOVA */
		if (ms->iova == RTE_BAD_IOVA) {
			EAL_LOG(DEBUG,
				"Memory segment at %p has bad IOVA, skipping",
				ms->addr);
			goto next;
		}
		if (type == RTE_MEM_EVENT_ALLOC)
			vfio_dma_mem_map(default_vfio_cfg, ms->addr_64,
					ms->iova, ms->len, 1);
		else
			vfio_dma_mem_map(default_vfio_cfg, ms->addr_64,
					ms->iova, ms->len, 0);
next:
		cur_len += ms->len;
		++ms;
	}
}

static int
vfio_sync_default_container(void)
{
	struct rte_mp_msg mp_req, *mp_rep;
	struct rte_mp_reply mp_reply = {0};
	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
	struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
	int iommu_type_id;
	unsigned int i;

	/* cannot be called from primary */
	if (rte_eal_process_type() != RTE_PROC_SECONDARY)
		return -1;

	/* default container fd should have been opened in rte_vfio_enable() */
	if (!default_vfio_cfg->vfio_enabled ||
			default_vfio_cfg->vfio_container_fd < 0) {
		EAL_LOG(ERR, "VFIO support is not initialized");
		return -1;
	}

	/* find default container's IOMMU type */
	p->req = SOCKET_REQ_IOMMU_TYPE;
	strcpy(mp_req.name, EAL_VFIO_MP);
	mp_req.len_param = sizeof(*p);
	mp_req.num_fds = 0;

	iommu_type_id = -1;
	if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
			mp_reply.nb_received == 1) {
		mp_rep = &mp_reply.msgs[0];
		p = (struct vfio_mp_param *)mp_rep->param;
		if (p->result == SOCKET_OK)
			iommu_type_id = p->iommu_type_id;
	}
	free(mp_reply.msgs);
	if (iommu_type_id < 0) {
		EAL_LOG(ERR,
			"Could not get IOMMU type for default container");
		return -1;
	}

	/* we now have an fd for default container, as well as its IOMMU type.
	 * now, set up default VFIO container config to match.
	 */
	for (i = 0; i < RTE_DIM(iommu_types); i++) {
		const struct vfio_iommu_type *t = &iommu_types[i];
		if (t->type_id != iommu_type_id)
			continue;

		/* we found our IOMMU type */
		default_vfio_cfg->vfio_iommu_type = t;

		return 0;
	}
	EAL_LOG(ERR, "Could not find IOMMU type id (%i)",
			iommu_type_id);
	return -1;
}

int
rte_vfio_clear_group(int vfio_group_fd)
{
	int i;
	struct vfio_config *vfio_cfg;

	vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
	if (vfio_cfg == NULL) {
		EAL_LOG(ERR, "Invalid VFIO group fd!");
		return -1;
	}

	i = get_vfio_group_idx(vfio_group_fd);
	if (i < 0)
		return -1;
	vfio_cfg->vfio_groups[i].group_num = -1;
	vfio_cfg->vfio_groups[i].fd = -1;
	vfio_cfg->vfio_groups[i].devices = 0;
	vfio_cfg->vfio_active_groups--;

	return 0;
}

int
rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
		int *vfio_dev_fd, struct vfio_device_info *device_info)
{
	struct vfio_group_status group_status = {
			.argsz = sizeof(group_status)
	};
	struct vfio_config *vfio_cfg;
	struct user_mem_maps *user_mem_maps;
	int vfio_container_fd;
	int vfio_group_fd;
	int iommu_group_num;
	rte_uuid_t vf_token;
	int i, ret;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* get group number */
	ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num);
	if (ret == 0) {
		EAL_LOG(NOTICE,
				"%s not managed by VFIO driver, skipping",
				dev_addr);
		return 1;
	}

	/* if negative, something failed */
	if (ret < 0)
		return -1;

	/* get the actual group fd */
	vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num);
	if (vfio_group_fd < 0 && vfio_group_fd != -ENOENT)
		return -1;

	/*
	 * if vfio_group_fd == -ENOENT, that means the device
	 * isn't managed by VFIO
	 */
	if (vfio_group_fd == -ENOENT) {
		EAL_LOG(NOTICE,
				"%s not managed by VFIO driver, skipping",
				dev_addr);
		return 1;
	}

	/*
	 * check if the group is viable (meaning that all devices in the IOMMU
	 * group are either bound to VFIO or not bound to anything)
	 */
	ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status);
	if (ret) {
		EAL_LOG(ERR, "%s cannot get VFIO group status, "
				"error %i (%s)", dev_addr, errno, strerror(errno));
		close(vfio_group_fd);
		rte_vfio_clear_group(vfio_group_fd);
		return -1;
	} else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
		EAL_LOG(ERR, "%s VFIO group is not viable! "
				"Not all devices in IOMMU group bound to VFIO or unbound",
				dev_addr);
		close(vfio_group_fd);
		rte_vfio_clear_group(vfio_group_fd);
		return -1;
	}

	/* get the vfio_config it belongs to */
	vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
	vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
	vfio_container_fd = vfio_cfg->vfio_container_fd;
	user_mem_maps = &vfio_cfg->mem_maps;

	/* check if group does not have a container yet */
	if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) {

		/* add group to a container */
		ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER,
				&vfio_container_fd);
		if (ret) {
			EAL_LOG(ERR,
				"%s cannot add VFIO group to container, error "
				"%i (%s)", dev_addr, errno, strerror(errno));
			close(vfio_group_fd);
			rte_vfio_clear_group(vfio_group_fd);
			return -1;
		}

		/*
		 * pick an IOMMU type and set up DMA mappings for container
		 *
		 * needs to be done only once, only when first group is
		 * assigned to a container and only in primary process.
		 * Note this can happen several times with the hotplug
		 * functionality.
		 */
		if (internal_conf->process_type == RTE_PROC_PRIMARY &&
				vfio_cfg->vfio_active_groups == 1 &&
				vfio_group_device_count(vfio_group_fd) == 0) {
			const struct vfio_iommu_type *t;

			/* select an IOMMU type which we will be using */
			t = vfio_set_iommu_type(vfio_container_fd);
			if (!t) {
				EAL_LOG(ERR,
					"%s failed to select IOMMU type",
					dev_addr);
				close(vfio_group_fd);
				rte_vfio_clear_group(vfio_group_fd);
				return -1;
			}
			/* lock memory hotplug before mapping and release it
			 * after registering callback, to prevent races
			 */
			rte_mcfg_mem_read_lock();
			if (vfio_cfg == default_vfio_cfg)
				ret = t->dma_map_func(vfio_container_fd);
			else
				ret = 0;
			if (ret) {
				EAL_LOG(ERR,
					"%s DMA remapping failed, error "
					"%i (%s)",
					dev_addr, errno, strerror(errno));
				close(vfio_group_fd);
				rte_vfio_clear_group(vfio_group_fd);
				rte_mcfg_mem_read_unlock();
				return -1;
			}

			vfio_cfg->vfio_iommu_type = t;

			/* re-map all user-mapped segments */
			rte_spinlock_recursive_lock(&user_mem_maps->lock);

			/* this IOMMU type may not support DMA mapping, but
			 * if we have mappings in the list - that means we have
			 * previously mapped something successfully, so we can
			 * be sure that DMA mapping is supported.
			 */
			for (i = 0; i < user_mem_maps->n_maps; i++) {
				struct user_mem_map *map;
				map = &user_mem_maps->maps[i];

				ret = t->dma_user_map_func(
						vfio_container_fd,
						map->addr, map->iova, map->len,
						1);
				if (ret) {
					EAL_LOG(ERR, "Couldn't map user memory for DMA: "
							"va: 0x%" PRIx64 " "
							"iova: 0x%" PRIx64 " "
							"len: 0x%" PRIx64,
							map->addr, map->iova,
							map->len);
					rte_spinlock_recursive_unlock(
							&user_mem_maps->lock);
					rte_mcfg_mem_read_unlock();
					return -1;
				}
			}
			rte_spinlock_recursive_unlock(&user_mem_maps->lock);

			/* register callback for mem events */
			if (vfio_cfg == default_vfio_cfg)
				ret = rte_mem_event_callback_register(
					VFIO_MEM_EVENT_CLB_NAME,
					vfio_mem_event_callback, NULL);
			else
				ret = 0;
			/* unlock memory hotplug */
			rte_mcfg_mem_read_unlock();

			if (ret && rte_errno != ENOTSUP) {
				EAL_LOG(ERR, "Could not install memory event callback for VFIO");
				return -1;
			}
			if (ret)
				EAL_LOG(DEBUG, "Memory event callbacks not supported");
			else
				EAL_LOG(DEBUG, "Installed memory event callback for VFIO");
		}
	} else if (rte_eal_process_type() != RTE_PROC_PRIMARY &&
			vfio_cfg == default_vfio_cfg &&
			vfio_cfg->vfio_iommu_type == NULL) {
		/* if we're not a primary process, we do not set up the VFIO
		 * container because it's already been set up by the primary
		 * process. instead, we simply ask the primary about VFIO type
		 * we are using, and set the VFIO config up appropriately.
		 */
		ret = vfio_sync_default_container();
		if (ret < 0) {
			EAL_LOG(ERR, "Could not sync default VFIO container");
			close(vfio_group_fd);
			rte_vfio_clear_group(vfio_group_fd);
			return -1;
		}
		/* we have successfully initialized VFIO, notify user */
		const struct vfio_iommu_type *t =
				default_vfio_cfg->vfio_iommu_type;
		EAL_LOG(INFO, "Using IOMMU type %d (%s)",
				t->type_id, t->name);
	}

	rte_eal_vfio_get_vf_token(vf_token);

	/* get a file descriptor for the device with VF token firstly */
	if (!rte_uuid_is_null(vf_token)) {
		char vf_token_str[RTE_UUID_STRLEN];
		char dev[PATH_MAX];

		rte_uuid_unparse(vf_token, vf_token_str, sizeof(vf_token_str));
		snprintf(dev, sizeof(dev),
				"%s vf_token=%s", dev_addr, vf_token_str);

		*vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD,
				dev);
		if (*vfio_dev_fd >= 0)
			goto dev_get_info;
	}

	/* get a file descriptor for the device */
	*vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr);
	if (*vfio_dev_fd < 0) {
		/* if we cannot get a device fd, this implies a problem with
		 * the VFIO group or the container not having IOMMU configured.
		 */

		EAL_LOG(WARNING, "Getting a vfio_dev_fd for %s failed",
				dev_addr);
		close(vfio_group_fd);
		rte_vfio_clear_group(vfio_group_fd);
		return -1;
	}

	/* test and setup the device */
dev_get_info:
	ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
	if (ret) {
		EAL_LOG(ERR, "%s cannot get device info, "
				"error %i (%s)", dev_addr, errno,
				strerror(errno));
		close(*vfio_dev_fd);
		close(vfio_group_fd);
		rte_vfio_clear_group(vfio_group_fd);
		return -1;
	}
	vfio_group_device_get(vfio_group_fd);

	return 0;
}

int
rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
		int vfio_dev_fd)
{
	struct vfio_config *vfio_cfg;
	int vfio_group_fd;
	int iommu_group_num;
	int ret;

	/* we don't want any DMA mapping messages to come while we're detaching
	 * VFIO device, because this might be the last device and we might need
	 * to unregister the callback.
	 */
	rte_mcfg_mem_read_lock();

	/* get group number */
	ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num);
	if (ret <= 0) {
		EAL_LOG(WARNING, "%s not managed by VFIO driver",
				dev_addr);
		/* This is an error at this point. */
		ret = -1;
		goto out;
	}

	/* get the actual group fd */
	vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num);
	if (vfio_group_fd < 0) {
		EAL_LOG(INFO, "rte_vfio_get_group_fd failed for %s",
				dev_addr);
		ret = vfio_group_fd;
		goto out;
	}

	/* get the vfio_config it belongs to */
	vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
	vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;

	/* At this point we got an active group. Closing it will trigger the
	 * container detachment. If this is the last active group, VFIO kernel
	 * code will unset the container and the IOMMU mappings.
	 */

	/* Closing a device */
	if (close(vfio_dev_fd) < 0) {
		EAL_LOG(INFO, "Error when closing vfio_dev_fd for %s",
				dev_addr);
		ret = -1;
		goto out;
	}

	/* A VFIO group can have several devices attached. Only when there are
	 * no devices remaining should the group be closed.
	 */
	vfio_group_device_put(vfio_group_fd);
	if (!vfio_group_device_count(vfio_group_fd)) {

		if (close(vfio_group_fd) < 0) {
			EAL_LOG(INFO, "Error when closing vfio_group_fd for %s",
					dev_addr);
			ret = -1;
			goto out;
		}

		if (rte_vfio_clear_group(vfio_group_fd) < 0) {
			EAL_LOG(INFO, "Error when clearing group for %s",
					dev_addr);
			ret = -1;
			goto out;
		}
	}

	/* if there are no active device groups, unregister the callback to
	 * avoid spurious attempts to map/unmap memory from VFIO.
	 */
	if (vfio_cfg == default_vfio_cfg && vfio_cfg->vfio_active_groups == 0 &&
			rte_eal_process_type() != RTE_PROC_SECONDARY)
		rte_mem_event_callback_unregister(VFIO_MEM_EVENT_CLB_NAME,
				NULL);

	/* success */
	ret = 0;

out:
	rte_mcfg_mem_read_unlock();
	return ret;
}
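
/*
 * Illustrative sketch (not part of the EAL): how a bus driver might use the
 * setup/release pair above. Error handling is trimmed and the sysfs base and
 * device address are hypothetical placeholders.
 *
 *	struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
 *	int dev_fd;
 *	int ret;
 *
 *	ret = rte_vfio_setup_device("/sys/bus/pci/devices", "0000:01:00.0",
 *			&dev_fd, &dev_info);
 *	if (ret < 0)
 *		return -1;	// VFIO error
 *	if (ret > 0)
 *		return 0;	// device not managed by VFIO, skip it
 *
 *	// ... program the device through dev_fd (region/irq ioctls) ...
 *
 *	rte_vfio_release_device("/sys/bus/pci/devices", "0000:01:00.0", dev_fd);
 */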

int
rte_vfio_enable(const char *modname)
{
	/* initialize group list */
	int i, j;
	int vfio_available;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	rte_spinlock_recursive_t lock = RTE_SPINLOCK_RECURSIVE_INITIALIZER;

	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
		vfio_cfgs[i].vfio_container_fd = -1;
		vfio_cfgs[i].vfio_active_groups = 0;
		vfio_cfgs[i].vfio_iommu_type = NULL;
		vfio_cfgs[i].mem_maps.lock = lock;

		for (j = 0; j < VFIO_MAX_GROUPS; j++) {
			vfio_cfgs[i].vfio_groups[j].fd = -1;
			vfio_cfgs[i].vfio_groups[j].group_num = -1;
			vfio_cfgs[i].vfio_groups[j].devices = 0;
		}
	}

	EAL_LOG(DEBUG, "Probing VFIO support...");

	/* check if vfio module is loaded */
	vfio_available = rte_eal_check_module(modname);

	/* return error directly */
	if (vfio_available == -1) {
		EAL_LOG(INFO, "Could not get loaded module details!");
		return -1;
	}

	/* return 0 if VFIO modules not loaded */
	if (vfio_available == 0) {
		EAL_LOG(DEBUG,
			"VFIO modules not loaded, skipping VFIO support...");
		return 0;
	}

	if (internal_conf->process_type == RTE_PROC_PRIMARY) {
		if (vfio_mp_sync_setup() == -1) {
			default_vfio_cfg->vfio_container_fd = -1;
		} else {
			/* open a new container */
			default_vfio_cfg->vfio_container_fd =
					rte_vfio_get_container_fd();
		}
	} else {
		/* get the default container from the primary process */
		default_vfio_cfg->vfio_container_fd =
				vfio_get_default_container_fd();
	}

	/* check if we have VFIO driver enabled */
	if (default_vfio_cfg->vfio_container_fd != -1) {
		EAL_LOG(INFO, "VFIO support initialized");
		default_vfio_cfg->vfio_enabled = 1;
	} else {
		EAL_LOG(NOTICE, "VFIO support could not be initialized");
	}

	return 0;
}

int
rte_vfio_is_enabled(const char *modname)
{
	const int mod_available = rte_eal_check_module(modname) > 0;
	return default_vfio_cfg->vfio_enabled && mod_available;
}

int
vfio_get_default_container_fd(void)
{
	struct rte_mp_msg mp_req, *mp_rep;
	struct rte_mp_reply mp_reply = {0};
	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
	struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
	int container_fd;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (default_vfio_cfg->vfio_enabled)
		return default_vfio_cfg->vfio_container_fd;

	if (internal_conf->process_type == RTE_PROC_PRIMARY) {
		/* if we were a secondary process we would try requesting
		 * container fd from the primary, but we're the primary
		 * process so just exit here
		 */
		return -1;
	}

	p->req = SOCKET_REQ_DEFAULT_CONTAINER;
	strcpy(mp_req.name, EAL_VFIO_MP);
	mp_req.len_param = sizeof(*p);
	mp_req.num_fds = 0;

	if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
			mp_reply.nb_received == 1) {
		mp_rep = &mp_reply.msgs[0];
		p = (struct vfio_mp_param *)mp_rep->param;
		if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
			container_fd = mp_rep->fds[0];
			free(mp_reply.msgs);
			return container_fd;
		}
	}

	free(mp_reply.msgs);
	EAL_LOG(ERR, "Cannot request default VFIO container fd");
	return -1;
}
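
/*
 * Illustrative sketch (not part of the EAL): the expected probe order for the
 * functions above. rte_vfio_enable() is normally invoked once during EAL
 * initialization; afterwards, bus drivers only need the query helpers.
 *
 *	if (rte_vfio_enable("vfio") < 0)
 *		return -1;	// module listing could not be read
 *
 *	if (!rte_vfio_is_enabled("vfio"))
 *		return 0;	// vfio module not loaded, use other kernel drivers
 *
 *	// from here on, rte_vfio_setup_device() and friends may be used
 */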

int
vfio_get_iommu_type(void)
{
	if (default_vfio_cfg->vfio_iommu_type == NULL)
		return -1;

	return default_vfio_cfg->vfio_iommu_type->type_id;
}

const struct vfio_iommu_type *
vfio_set_iommu_type(int vfio_container_fd)
{
	unsigned idx;
	for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
		const struct vfio_iommu_type *t = &iommu_types[idx];

		int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
				t->type_id);
		if (!ret) {
			EAL_LOG(INFO, "Using IOMMU type %d (%s)",
					t->type_id, t->name);
			return t;
		}
		/* not an error, there may be more supported IOMMU types */
		EAL_LOG(DEBUG, "Set IOMMU type %d (%s) failed, error "
				"%i (%s)", t->type_id, t->name, errno,
				strerror(errno));
	}
	/* if we didn't find a suitable IOMMU type, fail */
	return NULL;
}

int
rte_vfio_get_device_info(const char *sysfs_base, const char *dev_addr,
		int *vfio_dev_fd, struct vfio_device_info *device_info)
{
	int ret;

	if (device_info == NULL || *vfio_dev_fd < 0)
		return -1;

	if (*vfio_dev_fd == 0) {
		ret = rte_vfio_setup_device(sysfs_base, dev_addr,
				vfio_dev_fd, device_info);
		if (ret)
			return -1;
	} else {
		ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
		if (ret) {
			EAL_LOG(ERR, "%s cannot get device info, error %i (%s)",
					dev_addr, errno, strerror(errno));
			return -1;
		}
	}

	return 0;
}

int
vfio_has_supported_extensions(int vfio_container_fd)
{
	int ret;
	unsigned idx, n_extensions = 0;
	for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
		const struct vfio_iommu_type *t = &iommu_types[idx];

		ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION,
				t->type_id);
		if (ret < 0) {
			EAL_LOG(ERR, "Could not get IOMMU type, error "
					"%i (%s)", errno, strerror(errno));
			close(vfio_container_fd);
			return -1;
		} else if (ret == 1) {
			/* we found a supported extension */
			n_extensions++;
		}
		EAL_LOG(DEBUG, "IOMMU type %d (%s) is %s",
				t->type_id, t->name,
				ret ? "supported" : "not supported");
	}

	/* if we didn't find any supported IOMMU types, fail */
	if (!n_extensions) {
		close(vfio_container_fd);
		return -1;
	}

	return 0;
}

int
rte_vfio_get_container_fd(void)
{
	int ret, vfio_container_fd;
	struct rte_mp_msg mp_req, *mp_rep;
	struct rte_mp_reply mp_reply = {0};
	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
	struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* if we're in a primary process, try to open the container */
	if (internal_conf->process_type == RTE_PROC_PRIMARY) {
		vfio_container_fd = open(VFIO_CONTAINER_PATH, O_RDWR);
		if (vfio_container_fd < 0) {
			EAL_LOG(ERR,
					"Cannot open VFIO container %s, error "
					"%i (%s)", VFIO_CONTAINER_PATH,
					errno, strerror(errno));
			return -1;
		}

		/* check VFIO API version */
		ret = ioctl(vfio_container_fd, VFIO_GET_API_VERSION);
		if (ret != VFIO_API_VERSION) {
			if (ret < 0)
				EAL_LOG(ERR,
					"Could not get VFIO API version, error "
					"%i (%s)", errno, strerror(errno));
			else
				EAL_LOG(ERR, "Unsupported VFIO API version!");
			close(vfio_container_fd);
			return -1;
		}

		ret = vfio_has_supported_extensions(vfio_container_fd);
		if (ret) {
			EAL_LOG(ERR,
				"No supported IOMMU extensions found!");
			return -1;
		}

		return vfio_container_fd;
	}
	/*
	 * if we're in a secondary process, request container fd from the
	 * primary process via mp channel
	 */
	p->req = SOCKET_REQ_CONTAINER;
	strcpy(mp_req.name, EAL_VFIO_MP);
	mp_req.len_param = sizeof(*p);
	mp_req.num_fds = 0;

	vfio_container_fd = -1;
	if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
			mp_reply.nb_received == 1) {
		mp_rep = &mp_reply.msgs[0];
		p = (struct vfio_mp_param *)mp_rep->param;
		if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
			vfio_container_fd = mp_rep->fds[0];
			free(mp_reply.msgs);
			return vfio_container_fd;
		}
	}

	free(mp_reply.msgs);
	EAL_LOG(ERR, "Cannot request VFIO container fd");
	return -1;
}

int
rte_vfio_get_group_num(const char *sysfs_base,
		const char *dev_addr, int *iommu_group_num)
{
	char linkname[PATH_MAX];
	char filename[PATH_MAX];
	char *tok[16], *group_tok, *end;
	int ret;

	memset(linkname, 0, sizeof(linkname));
	memset(filename, 0, sizeof(filename));

	/* try to find out IOMMU group for this device */
	snprintf(linkname, sizeof(linkname),
			"%s/%s/iommu_group", sysfs_base, dev_addr);

	ret = readlink(linkname, filename, sizeof(filename));

	/* if the link doesn't exist, no VFIO for us */
	if (ret < 0)
		return 0;

	ret = rte_strsplit(filename, sizeof(filename),
			tok, RTE_DIM(tok), '/');

	if (ret <= 0) {
		EAL_LOG(ERR, "%s cannot get IOMMU group", dev_addr);
		return -1;
	}

	/* IOMMU group is always the last token */
	errno = 0;
	group_tok = tok[ret - 1];
	end = group_tok;
	*iommu_group_num = strtol(group_tok, &end, 10);
	if ((end != group_tok && *end != '\0') || errno != 0) {
		EAL_LOG(ERR, "%s error parsing IOMMU number!", dev_addr);
		return -1;
	}

	return 1;
}

static int
type1_map(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
		void *arg)
{
	int *vfio_container_fd = arg;

	/* skip external memory that isn't a heap */
	if (msl->external && !msl->heap)
		return 0;

	/* skip any segments with invalid IOVA addresses */
	if (ms->iova == RTE_BAD_IOVA)
		return 0;

	return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
			ms->len, 1);
}

static int
vfio_type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
		uint64_t len, int do_map)
{
	struct vfio_iommu_type1_dma_map dma_map;
	struct vfio_iommu_type1_dma_unmap dma_unmap;
	int ret;

	if (do_map != 0) {
		memset(&dma_map, 0, sizeof(dma_map));
		dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
		dma_map.vaddr = vaddr;
		dma_map.size = len;
		dma_map.iova = iova;
		dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE;

		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
		if (ret) {
			/**
			 * In case the mapping was already done EEXIST will be
			 * returned from kernel.
			 */
			if (errno == EEXIST) {
				EAL_LOG(DEBUG,
					"Memory segment is already mapped, skipping");
			} else {
				EAL_LOG(ERR,
					"Cannot set up DMA remapping, error "
					"%i (%s)", errno, strerror(errno));
				return -1;
			}
		}
	} else {
		memset(&dma_unmap, 0, sizeof(dma_unmap));
		dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
		dma_unmap.size = len;
		dma_unmap.iova = iova;

		ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
				&dma_unmap);
		if (ret) {
			EAL_LOG(ERR, "Cannot clear DMA remapping, error "
					"%i (%s)", errno, strerror(errno));
			return -1;
		} else if (dma_unmap.size != len) {
			EAL_LOG(ERR, "Unexpected size %"PRIu64
				" of DMA remapping cleared instead of %"PRIu64,
				(uint64_t)dma_unmap.size, len);
			rte_errno = EIO;
			return -1;
		}
	}

	return 0;
}

static int
vfio_type1_dma_map(int vfio_container_fd)
{
	return rte_memseg_walk(type1_map, &vfio_container_fd);
}

/* Track the size of the statically allocated DMA window for SPAPR */
uint64_t spapr_dma_win_len;
uint64_t spapr_dma_win_page_sz;

static int
vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
		uint64_t len, int do_map)
{
	struct vfio_iommu_spapr_register_memory reg = {
		.argsz = sizeof(reg),
		.vaddr = (uintptr_t) vaddr,
		.size = len,
		.flags = 0
	};
	int ret;

	if (do_map != 0) {
		struct vfio_iommu_type1_dma_map dma_map;

		if (iova + len > spapr_dma_win_len) {
			EAL_LOG(ERR, "DMA map attempt outside DMA window");
			return -1;
		}

		ret = ioctl(vfio_container_fd,
				VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
		if (ret) {
			EAL_LOG(ERR,
				"Cannot register vaddr for IOMMU, error "
				"%i (%s)", errno, strerror(errno));
			return -1;
		}

		memset(&dma_map, 0, sizeof(dma_map));
		dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
		dma_map.vaddr = vaddr;
		dma_map.size = len;
		dma_map.iova = iova;
		dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE;

		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
		if (ret) {
			EAL_LOG(ERR, "Cannot map vaddr for IOMMU, error "
					"%i (%s)", errno, strerror(errno));
			return -1;
		}

	} else {
		struct vfio_iommu_type1_dma_map dma_unmap;

		memset(&dma_unmap, 0, sizeof(dma_unmap));
		dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
		dma_unmap.size = len;
		dma_unmap.iova = iova;

		ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
				&dma_unmap);
		if (ret) {
			EAL_LOG(ERR, "Cannot unmap vaddr for IOMMU, error "
					"%i (%s)", errno, strerror(errno));
			return -1;
		}

		ret = ioctl(vfio_container_fd,
				VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
		if (ret) {
			EAL_LOG(ERR,
				"Cannot unregister vaddr for IOMMU, error "
				"%i (%s)", errno, strerror(errno));
			return -1;
		}
	}

	return ret;
}

static int
vfio_spapr_map_walk(const struct rte_memseg_list *msl,
		const struct rte_memseg *ms, void *arg)
{
	int *vfio_container_fd = arg;

	/* skip external memory that isn't a heap */
	if (msl->external && !msl->heap)
		return 0;

	/* skip any segments with invalid IOVA addresses */
	if (ms->iova == RTE_BAD_IOVA)
		return 0;

	return vfio_spapr_dma_do_map(*vfio_container_fd,
			ms->addr_64, ms->iova, ms->len, 1);
}

struct spapr_size_walk_param {
	uint64_t max_va;
	uint64_t page_sz;
	bool is_user_managed;
};

/*
 * In order to set the DMA window size required for the SPAPR IOMMU
 * we need to walk the existing virtual memory allocations as well as
 * find the hugepage size used.
 */
static int
vfio_spapr_size_walk(const struct rte_memseg_list *msl, void *arg)
{
	struct spapr_size_walk_param *param = arg;
	uint64_t max = (uint64_t) msl->base_va + (uint64_t) msl->len;

	if (msl->external && !msl->heap) {
		/* ignore user managed external memory */
		param->is_user_managed = true;
		return 0;
	}

	if (max > param->max_va) {
		param->page_sz = msl->page_sz;
		param->max_va = max;
	}

	return 0;
}

/*
 * Find the highest memory address used in physical or virtual address
 * space and use that as the top of the DMA window.
 */
static int
find_highest_mem_addr(struct spapr_size_walk_param *param)
{
	/* find the maximum IOVA address for setting the DMA window size */
	if (rte_eal_iova_mode() == RTE_IOVA_PA) {
		static const char proc_iomem[] = "/proc/iomem";
		static const char str_sysram[] = "System RAM";
		uint64_t start, end, max = 0;
		char *line = NULL;
		char *dash, *space;
		size_t line_len;

		/*
		 * Example "System RAM" in /proc/iomem:
		 * 00000000-1fffffffff : System RAM
		 * 200000000000-201fffffffff : System RAM
		 */
		FILE *fd = fopen(proc_iomem, "r");
		if (fd == NULL) {
			EAL_LOG(ERR, "Cannot open %s", proc_iomem);
			return -1;
		}
		/* Scan /proc/iomem for the highest PA in the system */
		while (getline(&line, &line_len, fd) != -1) {
			if (strstr(line, str_sysram) == NULL)
				continue;

			space = strstr(line, " ");
			dash = strstr(line, "-");

			/* Validate the format of the memory string */
			if (space == NULL || dash == NULL || space < dash) {
				EAL_LOG(ERR, "Can't parse line \"%s\" in file %s",
						line, proc_iomem);
				continue;
			}

			start = strtoull(line, NULL, 16);
			end = strtoull(dash + 1, NULL, 16);
			EAL_LOG(DEBUG, "Found system RAM from 0x%" PRIx64
					" to 0x%" PRIx64, start, end);
			if (end > max)
				max = end;
		}
		free(line);
		fclose(fd);

		if (max == 0) {
			EAL_LOG(ERR, "Failed to find valid \"System RAM\" "
					"entry in file %s", proc_iomem);
			return -1;
		}

		spapr_dma_win_len = rte_align64pow2(max + 1);
		return 0;
	} else if (rte_eal_iova_mode() == RTE_IOVA_VA) {
		EAL_LOG(DEBUG, "Highest VA address in memseg list is 0x%"
				PRIx64, param->max_va);
		spapr_dma_win_len = rte_align64pow2(param->max_va);
		return 0;
	}

	spapr_dma_win_len = 0;
	EAL_LOG(ERR, "Unsupported IOVA mode");
	return -1;
}

/*
 * The SPAPRv2 IOMMU supports 2 DMA windows with starting
 * address at 0 or 1<<59. By default, a DMA window is set
 * at address 0, 2GB long, with a 4KB page. For DPDK we
 * must remove the default window and setup a new DMA window
 * based on the hugepage size and memory requirements of
 * the application before we can map memory for DMA.
 */
static int
spapr_dma_win_size(void)
{
	struct spapr_size_walk_param param;

	/* only create DMA window once */
	if (spapr_dma_win_len > 0)
		return 0;

	/* walk the memseg list to find the page size/max VA address */
	memset(&param, 0, sizeof(param));
	if (rte_memseg_list_walk(vfio_spapr_size_walk, &param) < 0) {
		EAL_LOG(ERR, "Failed to walk memseg list for DMA window size");
		return -1;
	}

	/* we can't be sure if DMA window covers external memory */
	if (param.is_user_managed)
		EAL_LOG(WARNING, "Detected user managed external memory which may not be managed by the IOMMU");

	/* check physical/virtual memory size */
	if (find_highest_mem_addr(&param) < 0)
		return -1;
	EAL_LOG(DEBUG, "Setting DMA window size to 0x%" PRIx64,
			spapr_dma_win_len);
	spapr_dma_win_page_sz = param.page_sz;
	rte_mem_set_dma_mask(rte_ctz64(spapr_dma_win_len));
	return 0;
}
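
/*
 * Illustrative example (not part of the EAL) of the window sizing above, with
 * hypothetical numbers: if the highest VA found in the memseg lists is
 * 0x1400000000 (80 GB), rte_align64pow2() rounds the window length up to the
 * next power of two, 0x2000000000 (128 GB), and the page shift later passed to
 * the TCE create ioctl is rte_ctz64() of the hugepage size, e.g. 30 for 1 GB
 * hugepages.
 */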

static int
vfio_spapr_create_dma_window(int vfio_container_fd)
{
	struct vfio_iommu_spapr_tce_create create = {
		.argsz = sizeof(create), };
	struct vfio_iommu_spapr_tce_remove remove = {
		.argsz = sizeof(remove), };
	struct vfio_iommu_spapr_tce_info info = {
		.argsz = sizeof(info), };
	int ret;

	ret = spapr_dma_win_size();
	if (ret < 0)
		return ret;

	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
	if (ret) {
		EAL_LOG(ERR, "Cannot get IOMMU info, error %i (%s)",
				errno, strerror(errno));
		return -1;
	}

	/*
	 * sPAPR v1/v2 IOMMU always has a default 1G DMA window set. The window
	 * can't be changed for v1 but it can be changed for v2. Since DPDK only
	 * supports v2, remove the default DMA window so it can be resized.
	 */
	remove.start_addr = info.dma32_window_start;
	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
	if (ret)
		return -1;

	/* create a new DMA window (start address is not selectable) */
	create.window_size = spapr_dma_win_len;
	create.page_shift = rte_ctz64(spapr_dma_win_page_sz);
	create.levels = 1;
	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
#ifdef VFIO_IOMMU_SPAPR_INFO_DDW
	/*
	 * The vfio_iommu_spapr_tce_info structure was modified in
	 * Linux kernel 4.2.0 to add support for the
	 * vfio_iommu_spapr_tce_ddw_info structure needed to try
	 * multiple table levels. Skip the attempt if running with
	 * an older kernel.
	 */
	if (ret) {
		/* if at first we don't succeed, try more levels */
		uint32_t levels;

		for (levels = create.levels + 1;
			ret && levels <= info.ddw.levels; levels++) {
			create.levels = levels;
			ret = ioctl(vfio_container_fd,
				VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
		}
	}
#endif /* VFIO_IOMMU_SPAPR_INFO_DDW */
	if (ret) {
		EAL_LOG(ERR, "Cannot create new DMA window, error "
				"%i (%s)", errno, strerror(errno));
		EAL_LOG(ERR,
			"Consider using a larger hugepage size if supported by the system");
		return -1;
	}

	/* verify the start address */
	if (create.start_addr != 0) {
		EAL_LOG(ERR, "Received unsupported start address 0x%"
				PRIx64, (uint64_t)create.start_addr);
		return -1;
	}
	return ret;
}

static int
vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr,
		uint64_t iova, uint64_t len, int do_map)
{
	int ret = 0;

	if (do_map) {
		if (vfio_spapr_dma_do_map(vfio_container_fd,
				vaddr, iova, len, 1)) {
			EAL_LOG(ERR, "Failed to map DMA");
			ret = -1;
		}
	} else {
		if (vfio_spapr_dma_do_map(vfio_container_fd,
				vaddr, iova, len, 0)) {
			EAL_LOG(ERR, "Failed to unmap DMA");
			ret = -1;
		}
	}

	return ret;
}

static int
vfio_spapr_dma_map(int vfio_container_fd)
{
	if (vfio_spapr_create_dma_window(vfio_container_fd) < 0) {
		EAL_LOG(ERR, "Could not create new DMA window!");
		return -1;
	}

	/* map all existing DPDK segments for DMA */
	if (rte_memseg_walk(vfio_spapr_map_walk, &vfio_container_fd) < 0)
		return -1;

	return 0;
}

static int
vfio_noiommu_dma_map(int __rte_unused vfio_container_fd)
{
	/* No-IOMMU mode does not need DMA mapping */
	return 0;
}

static int
vfio_noiommu_dma_mem_map(int __rte_unused vfio_container_fd,
		uint64_t __rte_unused vaddr,
		uint64_t __rte_unused iova, uint64_t __rte_unused len,
		int __rte_unused do_map)
{
	/* No-IOMMU mode does not need DMA mapping */
	return 0;
}

static int
vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
		uint64_t len, int do_map)
{
	const struct vfio_iommu_type *t = vfio_cfg->vfio_iommu_type;

	if (!t) {
		EAL_LOG(ERR, "VFIO support not initialized");
		rte_errno = ENODEV;
		return -1;
	}

	if (!t->dma_user_map_func) {
		EAL_LOG(ERR,
			"VFIO custom DMA region mapping not supported by IOMMU %s",
			t->name);
		rte_errno = ENOTSUP;
		return -1;
	}

	return t->dma_user_map_func(vfio_cfg->vfio_container_fd, vaddr, iova,
			len, do_map);
}

static int
container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
		uint64_t len)
{
	struct user_mem_map *new_map;
	struct user_mem_maps *user_mem_maps;
	bool has_partial_unmap;
	int ret = 0;

	user_mem_maps = &vfio_cfg->mem_maps;
	rte_spinlock_recursive_lock(&user_mem_maps->lock);
	if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
		EAL_LOG(ERR, "No more space for user mem maps");
		rte_errno = ENOMEM;
		ret = -1;
		goto out;
	}
	/* map the entry */
	if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 1)) {
		/* technically, this will fail if there are currently no
		 * devices plugged in, even though the mapping might have
		 * succeeded had a device been added later. however, since we
		 * cannot verify if this is a valid mapping without having a
		 * device attached, consider this to be unsupported, because we
		 * can't just store any old mapping and pollute the list of
		 * active mappings willy-nilly.
		 */
		EAL_LOG(ERR, "Couldn't map new region for DMA");
		ret = -1;
		goto out;
	}
	/* do we have partial unmap support? */
	has_partial_unmap = vfio_cfg->vfio_iommu_type->partial_unmap;

	/* create new user mem map entry */
	new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
	new_map->addr = vaddr;
	new_map->iova = iova;
	new_map->len = len;
	/* for IOMMU types supporting partial unmap, we don't need chunking */
	new_map->chunk = has_partial_unmap ? 0 : len;

	compact_user_maps(user_mem_maps);
out:
	rte_spinlock_recursive_unlock(&user_mem_maps->lock);
	return ret;
}

static int
container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
		uint64_t len)
{
	struct user_mem_map orig_maps[VFIO_MAX_USER_MEM_MAPS];
	struct user_mem_map new_maps[2]; /* can be at most 2 */
	struct user_mem_maps *user_mem_maps;
	int n_orig, n_new, newlen, ret = 0;
	bool has_partial_unmap;

	user_mem_maps = &vfio_cfg->mem_maps;
	rte_spinlock_recursive_lock(&user_mem_maps->lock);

	/*
	 * Previously, we had adjacent mappings entirely contained within one
	 * mapping entry. Since we now store original mapping length in some
	 * cases, this is no longer the case, so unmapping can potentially go
	 * over multiple segments and split them in any number of ways.
	 *
	 * To complicate things further, some IOMMU types support arbitrary
	 * partial unmapping, while others will only support unmapping along the
	 * chunk size, so there are a lot of cases we need to handle. To make
	 * things easier code wise, instead of trying to adjust existing
	 * mappings, let's just rebuild them using information we have.
	 */

	/*
	 * first thing to do is check if there exists a mapping that includes
	 * the start and the end of our requested unmap. We need to collect all
	 * maps that include our unmapped region.
	 */
	n_orig = find_user_mem_maps(user_mem_maps, vaddr, iova, len,
			orig_maps, RTE_DIM(orig_maps));
	/* did we find anything? */
	if (n_orig < 0) {
		EAL_LOG(ERR, "Couldn't find previously mapped region");
		rte_errno = EINVAL;
		ret = -1;
		goto out;
	}

	/* do we have partial unmap capability? */
	has_partial_unmap = vfio_cfg->vfio_iommu_type->partial_unmap;

	/*
	 * if we don't support partial unmap, we must check if start and end of
	 * current unmap region are chunk-aligned.
	 */
	if (!has_partial_unmap) {
		bool start_aligned, end_aligned;

		start_aligned = addr_is_chunk_aligned(orig_maps, n_orig,
				vaddr, iova);
		end_aligned = addr_is_chunk_aligned(orig_maps, n_orig,
				vaddr + len, iova + len);

		if (!start_aligned || !end_aligned) {
			EAL_LOG(DEBUG, "DMA partial unmap unsupported");
			rte_errno = ENOTSUP;
			ret = -1;
			goto out;
		}
	}

	/*
	 * now we know we can potentially unmap the region, but we still have to
	 * figure out if there is enough space in our list to store remaining
	 * maps. for this, we will figure out how many segments we are going to
	 * remove, and how many new segments we are going to create.
	 */
	n_new = process_maps(orig_maps, n_orig, new_maps, vaddr, len);

	/* can we store the new maps in our list? */
	newlen = (user_mem_maps->n_maps - n_orig) + n_new;
	if (newlen >= VFIO_MAX_USER_MEM_MAPS) {
		EAL_LOG(ERR, "Not enough space to store partial mapping");
		rte_errno = ENOMEM;
		ret = -1;
		goto out;
	}

	/* unmap the entry */
	if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 0)) {
		/* there may not be any devices plugged in, so unmapping will
		 * fail with ENODEV/ENOTSUP rte_errno values, but that doesn't
		 * stop us from removing the mapping, as the assumption is we
		 * won't be needing this memory any more and thus will want to
		 * prevent it from being remapped again on hotplug. so, only
		 * fail if we indeed failed to unmap (e.g. if the mapping was
		 * within our mapped range but had invalid alignment).
		 */
		if (rte_errno != ENODEV && rte_errno != ENOTSUP) {
			EAL_LOG(ERR, "Couldn't unmap region for DMA");
			ret = -1;
			goto out;
		} else {
			EAL_LOG(DEBUG, "DMA unmapping failed, but removing mappings anyway");
		}
	}

	/* we have unmapped the region, so now update the maps */
	delete_maps(user_mem_maps, orig_maps, n_orig);
	copy_maps(user_mem_maps, new_maps, n_new);
	compact_user_maps(user_mem_maps);
out:
	rte_spinlock_recursive_unlock(&user_mem_maps->lock);
	return ret;
}
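
/*
 * Illustrative sketch (not part of the EAL): mapping externally allocated
 * memory for DMA through a custom container using the public wrappers defined
 * further below. The buffer, its length and the group number are hypothetical;
 * real callers must keep the virtual area valid while the mapping exists.
 *
 *	int container_fd = rte_vfio_container_create();
 *	if (container_fd < 0)
 *		return -1;
 *	if (rte_vfio_container_group_bind(container_fd, iommu_group_num) < 0)
 *		return -1;
 *
 *	// map a buffer 1:1 (VA used as IOVA), later unmap the whole range
 *	if (rte_vfio_container_dma_map(container_fd,
 *			(uint64_t)(uintptr_t)buf,
 *			(uint64_t)(uintptr_t)buf, buf_len) < 0)
 *		return -1;
 *	// ...
 *	rte_vfio_container_dma_unmap(container_fd,
 *			(uint64_t)(uintptr_t)buf,
 *			(uint64_t)(uintptr_t)buf, buf_len);
 */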

int
rte_vfio_noiommu_is_enabled(void)
{
	int fd;
	ssize_t cnt;
	char c;

	fd = open(VFIO_NOIOMMU_MODE, O_RDONLY);
	if (fd < 0) {
		if (errno != ENOENT) {
			EAL_LOG(ERR, "Cannot open VFIO noiommu file "
					"%i (%s)", errno, strerror(errno));
			return -1;
		}
		/*
		 * else the file does not exist,
		 * i.e. noiommu is not enabled
		 */
		return 0;
	}

	cnt = read(fd, &c, 1);
	close(fd);
	if (cnt != 1) {
		EAL_LOG(ERR, "Unable to read from VFIO noiommu file "
				"%i (%s)", errno, strerror(errno));
		return -1;
	}

	return c == 'Y';
}

int
rte_vfio_container_create(void)
{
	int i;

	/* Find an empty slot to store new vfio config */
	for (i = 1; i < VFIO_MAX_CONTAINERS; i++) {
		if (vfio_cfgs[i].vfio_container_fd == -1)
			break;
	}

	if (i == VFIO_MAX_CONTAINERS) {
		EAL_LOG(ERR, "Exceed max VFIO container limit");
		return -1;
	}

	vfio_cfgs[i].vfio_container_fd = rte_vfio_get_container_fd();
	if (vfio_cfgs[i].vfio_container_fd < 0) {
		EAL_LOG(NOTICE, "Fail to create a new VFIO container");
		return -1;
	}

	return vfio_cfgs[i].vfio_container_fd;
}

int
rte_vfio_container_destroy(int container_fd)
{
	struct vfio_config *vfio_cfg;
	int i;

	vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
	if (vfio_cfg == NULL) {
		EAL_LOG(ERR, "Invalid VFIO container fd");
		return -1;
	}

	for (i = 0; i < VFIO_MAX_GROUPS; i++)
		if (vfio_cfg->vfio_groups[i].group_num != -1)
			rte_vfio_container_group_unbind(container_fd,
					vfio_cfg->vfio_groups[i].group_num);

	close(container_fd);
	vfio_cfg->vfio_container_fd = -1;
	vfio_cfg->vfio_active_groups = 0;
	vfio_cfg->vfio_iommu_type = NULL;

	return 0;
}

int
rte_vfio_container_group_bind(int container_fd, int iommu_group_num)
{
	struct vfio_config *vfio_cfg;

	vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
	if (vfio_cfg == NULL) {
		EAL_LOG(ERR, "Invalid VFIO container fd");
		return -1;
	}

	return vfio_get_group_fd(vfio_cfg, iommu_group_num);
}

int
rte_vfio_container_group_unbind(int container_fd, int iommu_group_num)
{
	struct vfio_config *vfio_cfg;
	struct vfio_group *cur_grp = NULL;
	int i;

	vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
	if (vfio_cfg == NULL) {
		EAL_LOG(ERR, "Invalid VFIO container fd");
		return -1;
	}

	for (i = 0; i < VFIO_MAX_GROUPS; i++) {
		if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num) {
			cur_grp = &vfio_cfg->vfio_groups[i];
			break;
		}
	}

	/* This should not happen */
	if (i == VFIO_MAX_GROUPS || cur_grp == NULL) {
		EAL_LOG(ERR, "Specified VFIO group number not found");
		return -1;
	}

	if (cur_grp->fd >= 0 && close(cur_grp->fd) < 0) {
		EAL_LOG(ERR,
			"Error when closing vfio_group_fd for iommu_group_num "
			"%d", iommu_group_num);
		return -1;
	}
	cur_grp->group_num = -1;
	cur_grp->fd = -1;
	cur_grp->devices = 0;
	vfio_cfg->vfio_active_groups--;

	return 0;
}

int
rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova,
		uint64_t len)
{
	struct vfio_config *vfio_cfg;

	if (len == 0) {
		rte_errno = EINVAL;
		return -1;
	}

	vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
	if (vfio_cfg == NULL) {
		EAL_LOG(ERR, "Invalid VFIO container fd");
		return -1;
	}

	return container_dma_map(vfio_cfg, vaddr, iova, len);
}

int
rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova,
		uint64_t len)
{
	struct vfio_config *vfio_cfg;

	if (len == 0) {
		rte_errno = EINVAL;
		return -1;
	}

	vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
	if (vfio_cfg == NULL) {
		EAL_LOG(ERR, "Invalid VFIO container fd");
		return -1;
	}

	return container_dma_unmap(vfio_cfg, vaddr, iova, len);
}