1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright(c) 2018 Intel Corporation 3 */ 4 5 #include <unistd.h> 6 #include <pthread.h> 7 #include <fcntl.h> 8 #include <string.h> 9 #include <sys/ioctl.h> 10 #include <sys/epoll.h> 11 #include <linux/virtio_net.h> 12 #include <stdbool.h> 13 14 #include <rte_eal_paging.h> 15 #include <rte_malloc.h> 16 #include <rte_memory.h> 17 #include <bus_pci_driver.h> 18 #include <rte_vhost.h> 19 #include <rte_vdpa.h> 20 #include <vdpa_driver.h> 21 #include <rte_vfio.h> 22 #include <rte_spinlock.h> 23 #include <rte_log.h> 24 #include <rte_kvargs.h> 25 #include <rte_devargs.h> 26 27 #include "base/ifcvf.h" 28 29 /* 30 * RTE_MIN() cannot be used since braced-group within expression allowed 31 * only inside a function. 32 */ 33 #define MIN(v1, v2) ((v1) < (v2) ? (v1) : (v2)) 34 35 RTE_LOG_REGISTER(ifcvf_vdpa_logtype, pmd.vdpa.ifcvf, NOTICE); 36 #define RTE_LOGTYPE_IFCVF_VDPA ifcvf_vdpa_logtype 37 #define DRV_LOG(level, ...) \ 38 RTE_LOG_LINE_PREFIX(level, IFCVF_VDPA, "%s(): ", __func__, __VA_ARGS__) 39 40 #define IFCVF_USED_RING_LEN(size) \ 41 ((size) * sizeof(struct vring_used_elem) + sizeof(uint16_t) * 3) 42 43 #define IFCVF_VDPA_MODE "vdpa" 44 #define IFCVF_SW_FALLBACK_LM "sw-live-migration" 45 46 #define THREAD_NAME_LEN 16 47 48 static const char * const ifcvf_valid_arguments[] = { 49 IFCVF_VDPA_MODE, 50 IFCVF_SW_FALLBACK_LM, 51 NULL 52 }; 53 54 struct ifcvf_internal { 55 struct rte_pci_device *pdev; 56 struct ifcvf_hw hw; 57 int configured; 58 int vfio_container_fd; 59 int vfio_group_fd; 60 int vfio_dev_fd; 61 rte_thread_t tid; /* thread for notify relay */ 62 rte_thread_t intr_tid; /* thread for config space change interrupt relay */ 63 int epfd; 64 int csc_epfd; 65 int vid; 66 struct rte_vdpa_device *vdev; 67 uint16_t max_queues; 68 uint64_t features; 69 rte_atomic32_t started; 70 rte_atomic32_t dev_attached; 71 rte_atomic32_t running; 72 rte_spinlock_t lock; 73 bool sw_lm; 74 bool sw_fallback_running; 75 /* mediated vring for 
sw fallback */ 76 struct vring m_vring[IFCVF_MAX_QUEUES * 2]; 77 /* eventfd for used ring interrupt */ 78 int intr_fd[IFCVF_MAX_QUEUES * 2]; 79 }; 80 81 struct internal_list { 82 TAILQ_ENTRY(internal_list) next; 83 struct ifcvf_internal *internal; 84 }; 85 86 /* vdpa device info includes device features and devcic operation. */ 87 struct rte_vdpa_dev_info { 88 uint64_t features; 89 struct rte_vdpa_dev_ops *ops; 90 }; 91 92 TAILQ_HEAD(internal_list_head, internal_list); 93 static struct internal_list_head internal_list = 94 TAILQ_HEAD_INITIALIZER(internal_list); 95 96 static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER; 97 98 static void update_used_ring(struct ifcvf_internal *internal, uint16_t qid); 99 100 static struct internal_list * 101 find_internal_resource_by_vdev(struct rte_vdpa_device *vdev) 102 { 103 int found = 0; 104 struct internal_list *list; 105 106 pthread_mutex_lock(&internal_list_lock); 107 108 TAILQ_FOREACH(list, &internal_list, next) { 109 if (vdev == list->internal->vdev) { 110 found = 1; 111 break; 112 } 113 } 114 115 pthread_mutex_unlock(&internal_list_lock); 116 117 if (!found) 118 return NULL; 119 120 return list; 121 } 122 123 static struct internal_list * 124 find_internal_resource_by_pci_dev(struct rte_pci_device *pdev) 125 { 126 int found = 0; 127 struct internal_list *list; 128 129 pthread_mutex_lock(&internal_list_lock); 130 131 TAILQ_FOREACH(list, &internal_list, next) { 132 if (!rte_pci_addr_cmp(&pdev->addr, 133 &list->internal->pdev->addr)) { 134 found = 1; 135 break; 136 } 137 } 138 139 pthread_mutex_unlock(&internal_list_lock); 140 141 if (!found) 142 return NULL; 143 144 return list; 145 } 146 147 static struct internal_list * 148 find_internal_resource_by_rte_dev(struct rte_device *rte_dev) 149 { 150 int found = 0; 151 struct internal_list *list; 152 153 pthread_mutex_lock(&internal_list_lock); 154 155 TAILQ_FOREACH(list, &internal_list, next) { 156 if (rte_dev == &list->internal->pdev->device) { 157 found = 
1; 158 break; 159 } 160 } 161 162 pthread_mutex_unlock(&internal_list_lock); 163 164 if (!found) 165 return NULL; 166 167 return list; 168 } 169 170 static int 171 ifcvf_vfio_setup(struct ifcvf_internal *internal) 172 { 173 struct rte_pci_device *dev = internal->pdev; 174 char devname[RTE_DEV_NAME_MAX_LEN] = {0}; 175 int iommu_group_num; 176 int i, ret; 177 178 internal->vfio_dev_fd = -1; 179 internal->vfio_group_fd = -1; 180 internal->vfio_container_fd = -1; 181 182 rte_pci_device_name(&dev->addr, devname, RTE_DEV_NAME_MAX_LEN); 183 ret = rte_vfio_get_group_num(rte_pci_get_sysfs_path(), devname, 184 &iommu_group_num); 185 if (ret <= 0) { 186 DRV_LOG(ERR, "%s failed to get IOMMU group", devname); 187 return -1; 188 } 189 190 internal->vfio_container_fd = rte_vfio_container_create(); 191 if (internal->vfio_container_fd < 0) 192 return -1; 193 194 internal->vfio_group_fd = rte_vfio_container_group_bind( 195 internal->vfio_container_fd, iommu_group_num); 196 if (internal->vfio_group_fd < 0) 197 goto err; 198 199 if (rte_pci_map_device(dev)) 200 goto err; 201 202 internal->vfio_dev_fd = rte_intr_dev_fd_get(dev->intr_handle); 203 204 for (i = 0; i < RTE_MIN(PCI_MAX_RESOURCE, IFCVF_PCI_MAX_RESOURCE); 205 i++) { 206 internal->hw.mem_resource[i].addr = 207 internal->pdev->mem_resource[i].addr; 208 internal->hw.mem_resource[i].phys_addr = 209 internal->pdev->mem_resource[i].phys_addr; 210 internal->hw.mem_resource[i].len = 211 internal->pdev->mem_resource[i].len; 212 } 213 214 return 0; 215 216 err: 217 rte_vfio_container_destroy(internal->vfio_container_fd); 218 return -1; 219 } 220 221 static int 222 ifcvf_dma_map(struct ifcvf_internal *internal, bool do_map) 223 { 224 uint32_t i; 225 int ret; 226 struct rte_vhost_memory *mem = NULL; 227 int vfio_container_fd; 228 229 ret = rte_vhost_get_mem_table(internal->vid, &mem); 230 if (ret < 0) { 231 DRV_LOG(ERR, "failed to get VM memory layout."); 232 goto exit; 233 } 234 235 vfio_container_fd = internal->vfio_container_fd; 236 
237 for (i = 0; i < mem->nregions; i++) { 238 struct rte_vhost_mem_region *reg; 239 240 reg = &mem->regions[i]; 241 DRV_LOG(INFO, "%s, region %u: HVA 0x%" PRIx64 ", " 242 "GPA 0x%" PRIx64 ", size 0x%" PRIx64 ".", 243 do_map ? "DMA map" : "DMA unmap", i, 244 reg->host_user_addr, reg->guest_phys_addr, reg->size); 245 246 if (do_map) { 247 ret = rte_vfio_container_dma_map(vfio_container_fd, 248 reg->host_user_addr, reg->guest_phys_addr, 249 reg->size); 250 if (ret < 0) { 251 DRV_LOG(ERR, "DMA map failed."); 252 goto exit; 253 } 254 } else { 255 ret = rte_vfio_container_dma_unmap(vfio_container_fd, 256 reg->host_user_addr, reg->guest_phys_addr, 257 reg->size); 258 if (ret < 0) { 259 DRV_LOG(ERR, "DMA unmap failed."); 260 goto exit; 261 } 262 } 263 } 264 265 exit: 266 free(mem); 267 return ret; 268 } 269 270 static uint64_t 271 hva_to_gpa(int vid, uint64_t hva) 272 { 273 struct rte_vhost_memory *mem = NULL; 274 struct rte_vhost_mem_region *reg; 275 uint32_t i; 276 uint64_t gpa = 0; 277 278 if (rte_vhost_get_mem_table(vid, &mem) < 0) 279 goto exit; 280 281 for (i = 0; i < mem->nregions; i++) { 282 reg = &mem->regions[i]; 283 284 if (hva >= reg->host_user_addr && 285 hva < reg->host_user_addr + reg->size) { 286 gpa = hva - reg->host_user_addr + reg->guest_phys_addr; 287 break; 288 } 289 } 290 291 exit: 292 free(mem); 293 return gpa; 294 } 295 296 static int 297 vdpa_ifcvf_start(struct ifcvf_internal *internal) 298 { 299 struct ifcvf_hw *hw = &internal->hw; 300 int i, nr_vring; 301 int vid; 302 struct rte_vhost_vring vq; 303 uint64_t gpa; 304 305 vid = internal->vid; 306 nr_vring = rte_vhost_get_vring_num(vid); 307 rte_vhost_get_negotiated_features(vid, &hw->req_features); 308 309 for (i = 0; i < nr_vring; i++) { 310 if (!hw->vring[i].enable) 311 continue; 312 rte_vhost_get_vhost_vring(vid, i, &vq); 313 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc); 314 if (gpa == 0) { 315 DRV_LOG(ERR, "Fail to get GPA for descriptor ring."); 316 return -1; 317 } 318 
hw->vring[i].desc = gpa; 319 320 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail); 321 if (gpa == 0) { 322 DRV_LOG(ERR, "Fail to get GPA for available ring."); 323 return -1; 324 } 325 hw->vring[i].avail = gpa; 326 327 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used); 328 if (gpa == 0) { 329 DRV_LOG(ERR, "Fail to get GPA for used ring."); 330 return -1; 331 } 332 hw->vring[i].used = gpa; 333 334 hw->vring[i].size = vq.size; 335 rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx, 336 &hw->vring[i].last_used_idx); 337 } 338 hw->nr_vring = i; 339 340 return ifcvf_start_hw(&internal->hw); 341 } 342 343 static void 344 vdpa_ifcvf_stop(struct ifcvf_internal *internal) 345 { 346 struct ifcvf_hw *hw = &internal->hw; 347 uint32_t i; 348 int vid; 349 uint64_t features = 0; 350 uint64_t log_base = 0, log_size = 0; 351 uint64_t len; 352 u32 ring_state = 0; 353 354 vid = internal->vid; 355 356 /* to make sure no packet is lost for blk device 357 * do not stop until last_avail_idx == last_used_idx 358 */ 359 if (internal->hw.device_type == IFCVF_BLK) { 360 for (i = 0; i < hw->nr_vring; i++) { 361 do { 362 if (hw->lm_cfg != NULL) 363 ring_state = *(u32 *)(hw->lm_cfg + 364 IFCVF_LM_RING_STATE_OFFSET + 365 i * IFCVF_LM_CFG_SIZE); 366 hw->vring[i].last_avail_idx = 367 (u16)(ring_state & IFCVF_16_BIT_MASK); 368 hw->vring[i].last_used_idx = 369 (u16)(ring_state >> 16); 370 usleep(10); 371 } while (hw->vring[i].last_avail_idx != 372 hw->vring[i].last_used_idx); 373 } 374 } 375 376 ifcvf_stop_hw(hw); 377 378 for (i = 0; i < hw->nr_vring; i++) 379 rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx, 380 hw->vring[i].last_used_idx); 381 382 if (internal->sw_lm) 383 return; 384 385 rte_vhost_get_negotiated_features(vid, &features); 386 if (RTE_VHOST_NEED_LOG(features)) { 387 ifcvf_disable_logging(hw); 388 rte_vhost_get_log_base(internal->vid, &log_base, &log_size); 389 rte_vfio_container_dma_unmap(internal->vfio_container_fd, 390 log_base, IFCVF_LOG_BASE, 
log_size); 391 /* 392 * IFCVF marks dirty memory pages for only packet buffer, 393 * SW helps to mark the used ring as dirty after device stops. 394 */ 395 for (i = 0; i < hw->nr_vring; i++) { 396 len = IFCVF_USED_RING_LEN(hw->vring[i].size); 397 rte_vhost_log_used_vring(vid, i, 0, len); 398 } 399 } 400 } 401 402 #define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \ 403 sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1)) 404 static int 405 vdpa_enable_vfio_intr(struct ifcvf_internal *internal, bool m_rx) 406 { 407 int ret; 408 uint32_t i, nr_vring; 409 char irq_set_buf[MSIX_IRQ_SET_BUF_LEN]; 410 struct vfio_irq_set *irq_set; 411 int *fd_ptr; 412 struct rte_vhost_vring vring; 413 int fd; 414 415 vring.callfd = -1; 416 417 nr_vring = rte_vhost_get_vring_num(internal->vid); 418 if (nr_vring > IFCVF_MAX_QUEUES * 2) 419 return -1; 420 421 irq_set = (struct vfio_irq_set *)irq_set_buf; 422 irq_set->argsz = sizeof(irq_set_buf); 423 irq_set->count = nr_vring + 1; 424 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | 425 VFIO_IRQ_SET_ACTION_TRIGGER; 426 irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; 427 irq_set->start = 0; 428 fd_ptr = (int *)&irq_set->data; 429 /* The first interrupt is for the configure space change notification */ 430 fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = 431 rte_intr_fd_get(internal->pdev->intr_handle); 432 433 for (i = 0; i < nr_vring; i++) 434 internal->intr_fd[i] = -1; 435 436 for (i = 0; i < nr_vring; i++) { 437 rte_vhost_get_vhost_vring(internal->vid, i, &vring); 438 fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd; 439 if (m_rx == true && 440 ((i & 1) == 0 || internal->hw.device_type == IFCVF_BLK)) { 441 /* For the net we only need to relay rx queue, 442 * which will change the mem of VM. 
443 * For the blk we need to relay all the read cmd 444 * of each queue 445 */ 446 fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); 447 if (fd < 0) { 448 DRV_LOG(ERR, "can't setup eventfd: %s", 449 strerror(errno)); 450 return -1; 451 } 452 internal->intr_fd[i] = fd; 453 fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = fd; 454 } 455 } 456 457 ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); 458 if (ret) { 459 DRV_LOG(ERR, "Error enabling MSI-X interrupts: %s", 460 strerror(errno)); 461 return -1; 462 } 463 464 return 0; 465 } 466 467 static int 468 vdpa_disable_vfio_intr(struct ifcvf_internal *internal) 469 { 470 int ret; 471 uint32_t i, nr_vring; 472 char irq_set_buf[MSIX_IRQ_SET_BUF_LEN]; 473 struct vfio_irq_set *irq_set; 474 475 irq_set = (struct vfio_irq_set *)irq_set_buf; 476 irq_set->argsz = sizeof(irq_set_buf); 477 irq_set->count = 0; 478 irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; 479 irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; 480 irq_set->start = 0; 481 482 nr_vring = rte_vhost_get_vring_num(internal->vid); 483 for (i = 0; i < nr_vring; i++) { 484 if (internal->intr_fd[i] >= 0) 485 close(internal->intr_fd[i]); 486 internal->intr_fd[i] = -1; 487 } 488 489 ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); 490 if (ret) { 491 DRV_LOG(ERR, "Error disabling MSI-X interrupts: %s", 492 strerror(errno)); 493 return -1; 494 } 495 496 return 0; 497 } 498 499 static uint32_t 500 notify_relay(void *arg) 501 { 502 int i, kickfd, epfd, nfds = 0; 503 uint32_t qid, q_num; 504 struct epoll_event events[IFCVF_MAX_QUEUES * 2]; 505 struct epoll_event ev; 506 uint64_t buf; 507 int nbytes; 508 struct rte_vhost_vring vring; 509 struct ifcvf_internal *internal = (struct ifcvf_internal *)arg; 510 struct ifcvf_hw *hw = &internal->hw; 511 512 q_num = rte_vhost_get_vring_num(internal->vid); 513 514 epfd = epoll_create(IFCVF_MAX_QUEUES * 2); 515 if (epfd < 0) { 516 DRV_LOG(ERR, "failed to create epoll instance."); 517 return 1; 518 } 519 
internal->epfd = epfd; 520 521 vring.kickfd = -1; 522 for (qid = 0; qid < q_num; qid++) { 523 if (!hw->vring[qid].enable) 524 continue; 525 ev.events = EPOLLIN | EPOLLPRI; 526 rte_vhost_get_vhost_vring(internal->vid, qid, &vring); 527 ev.data.u64 = qid | (uint64_t)vring.kickfd << 32; 528 if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) { 529 DRV_LOG(ERR, "epoll add error: %s", strerror(errno)); 530 return 1; 531 } 532 } 533 534 for (;;) { 535 nfds = epoll_wait(epfd, events, q_num, -1); 536 if (nfds < 0) { 537 if (errno == EINTR) 538 continue; 539 DRV_LOG(ERR, "epoll_wait return fail"); 540 return 1; 541 } 542 543 for (i = 0; i < nfds; i++) { 544 qid = events[i].data.u32; 545 kickfd = (uint32_t)(events[i].data.u64 >> 32); 546 do { 547 nbytes = read(kickfd, &buf, 8); 548 if (nbytes < 0) { 549 if (errno == EINTR || 550 errno == EWOULDBLOCK || 551 errno == EAGAIN) 552 continue; 553 DRV_LOG(INFO, "Error reading " 554 "kickfd: %s", 555 strerror(errno)); 556 } 557 break; 558 } while (1); 559 560 ifcvf_notify_queue(hw, qid); 561 } 562 } 563 564 return 0; 565 } 566 567 static int 568 setup_notify_relay(struct ifcvf_internal *internal) 569 { 570 char name[RTE_THREAD_INTERNAL_NAME_SIZE]; 571 int ret; 572 573 snprintf(name, sizeof(name), "ifc-noti%d", internal->vid); 574 ret = rte_thread_create_internal_control(&internal->tid, name, 575 notify_relay, internal); 576 if (ret != 0) { 577 DRV_LOG(ERR, "failed to create notify relay pthread."); 578 return -1; 579 } 580 581 return 0; 582 } 583 584 static int 585 unset_notify_relay(struct ifcvf_internal *internal) 586 { 587 if (internal->tid.opaque_id != 0) { 588 pthread_cancel((pthread_t)internal->tid.opaque_id); 589 rte_thread_join(internal->tid, NULL); 590 } 591 internal->tid.opaque_id = 0; 592 593 if (internal->epfd >= 0) 594 close(internal->epfd); 595 internal->epfd = -1; 596 597 return 0; 598 } 599 600 static void 601 virtio_interrupt_handler(struct ifcvf_internal *internal) 602 { 603 int vid = internal->vid; 604 int 
ret; 605 606 ret = rte_vhost_backend_config_change(vid, 1); 607 if (ret) 608 DRV_LOG(ERR, "failed to notify the guest about configuration space change."); 609 } 610 611 static uint32_t 612 intr_relay(void *arg) 613 { 614 struct ifcvf_internal *internal = (struct ifcvf_internal *)arg; 615 struct epoll_event csc_event; 616 struct epoll_event ev; 617 uint64_t buf; 618 int nbytes; 619 int csc_epfd, csc_val = 0; 620 621 csc_epfd = epoll_create(1); 622 if (csc_epfd < 0) { 623 DRV_LOG(ERR, "failed to create epoll for config space change."); 624 return 1; 625 } 626 627 ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP; 628 ev.data.fd = rte_intr_fd_get(internal->pdev->intr_handle); 629 if (epoll_ctl(csc_epfd, EPOLL_CTL_ADD, 630 rte_intr_fd_get(internal->pdev->intr_handle), &ev) < 0) { 631 DRV_LOG(ERR, "epoll add error: %s", strerror(errno)); 632 goto out; 633 } 634 635 internal->csc_epfd = csc_epfd; 636 637 for (;;) { 638 csc_val = epoll_wait(csc_epfd, &csc_event, 1, -1); 639 if (csc_val < 0) { 640 if (errno == EINTR) 641 continue; 642 DRV_LOG(ERR, "epoll_wait return fail."); 643 goto out; 644 } else if (csc_val == 0) { 645 continue; 646 } else { 647 /* csc_val > 0 */ 648 nbytes = read(csc_event.data.fd, &buf, 8); 649 if (nbytes < 0) { 650 if (errno == EINTR || 651 errno == EWOULDBLOCK || 652 errno == EAGAIN) 653 continue; 654 DRV_LOG(ERR, "Error reading from file descriptor %d: %s", 655 csc_event.data.fd, 656 strerror(errno)); 657 goto out; 658 } else if (nbytes == 0) { 659 DRV_LOG(ERR, "Read nothing from file descriptor %d", 660 csc_event.data.fd); 661 continue; 662 } else { 663 virtio_interrupt_handler(internal); 664 } 665 } 666 } 667 668 out: 669 if (csc_epfd >= 0) 670 close(csc_epfd); 671 internal->csc_epfd = -1; 672 673 return 0; 674 } 675 676 static int 677 setup_intr_relay(struct ifcvf_internal *internal) 678 { 679 char name[RTE_THREAD_INTERNAL_NAME_SIZE]; 680 int ret; 681 682 snprintf(name, sizeof(name), "ifc-int%d", internal->vid); 683 ret = 
rte_thread_create_internal_control(&internal->intr_tid, name, 684 intr_relay, (void *)internal); 685 if (ret) { 686 DRV_LOG(ERR, "failed to create notify relay pthread."); 687 return -1; 688 } 689 return 0; 690 } 691 692 static void 693 unset_intr_relay(struct ifcvf_internal *internal) 694 { 695 if (internal->intr_tid.opaque_id != 0) { 696 pthread_cancel((pthread_t)internal->intr_tid.opaque_id); 697 rte_thread_join(internal->intr_tid, NULL); 698 } 699 internal->intr_tid.opaque_id = 0; 700 701 if (internal->csc_epfd >= 0) 702 close(internal->csc_epfd); 703 internal->csc_epfd = -1; 704 } 705 706 static int 707 update_datapath(struct ifcvf_internal *internal) 708 { 709 int ret; 710 711 rte_spinlock_lock(&internal->lock); 712 713 if (!rte_atomic32_read(&internal->running) && 714 (rte_atomic32_read(&internal->started) && 715 rte_atomic32_read(&internal->dev_attached))) { 716 ret = ifcvf_dma_map(internal, true); 717 if (ret) 718 goto err; 719 720 ret = vdpa_enable_vfio_intr(internal, false); 721 if (ret) 722 goto err; 723 724 ret = vdpa_ifcvf_start(internal); 725 if (ret) 726 goto err; 727 728 ret = setup_notify_relay(internal); 729 if (ret) 730 goto err; 731 732 ret = setup_intr_relay(internal); 733 if (ret) 734 goto err; 735 736 rte_atomic32_set(&internal->running, 1); 737 } else if (rte_atomic32_read(&internal->running) && 738 (!rte_atomic32_read(&internal->started) || 739 !rte_atomic32_read(&internal->dev_attached))) { 740 unset_intr_relay(internal); 741 742 ret = unset_notify_relay(internal); 743 if (ret) 744 goto err; 745 746 vdpa_ifcvf_stop(internal); 747 748 ret = vdpa_disable_vfio_intr(internal); 749 if (ret) 750 goto err; 751 752 ret = ifcvf_dma_map(internal, false); 753 if (ret) 754 goto err; 755 756 rte_atomic32_set(&internal->running, 0); 757 } 758 759 rte_spinlock_unlock(&internal->lock); 760 return 0; 761 err: 762 rte_spinlock_unlock(&internal->lock); 763 return ret; 764 } 765 766 static int 767 m_ifcvf_start(struct ifcvf_internal *internal) 768 { 769 
struct ifcvf_hw *hw = &internal->hw; 770 uint32_t i, nr_vring; 771 int vid, ret; 772 struct rte_vhost_vring vq; 773 void *vring_buf; 774 uint64_t m_vring_iova = IFCVF_MEDIATED_VRING; 775 uint64_t size; 776 uint64_t gpa; 777 778 memset(&vq, 0, sizeof(vq)); 779 vid = internal->vid; 780 nr_vring = rte_vhost_get_vring_num(vid); 781 rte_vhost_get_negotiated_features(vid, &hw->req_features); 782 783 for (i = 0; i < nr_vring; i++) { 784 rte_vhost_get_vhost_vring(vid, i, &vq); 785 786 size = RTE_ALIGN_CEIL(vring_size(vq.size, rte_mem_page_size()), 787 rte_mem_page_size()); 788 vring_buf = rte_zmalloc("ifcvf", size, rte_mem_page_size()); 789 vring_init(&internal->m_vring[i], vq.size, vring_buf, 790 rte_mem_page_size()); 791 792 ret = rte_vfio_container_dma_map(internal->vfio_container_fd, 793 (uint64_t)(uintptr_t)vring_buf, m_vring_iova, size); 794 if (ret < 0) { 795 DRV_LOG(ERR, "mediated vring DMA map failed."); 796 goto error; 797 } 798 799 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc); 800 if (gpa == 0) { 801 DRV_LOG(ERR, "Fail to get GPA for descriptor ring."); 802 return -1; 803 } 804 hw->vring[i].desc = gpa; 805 806 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail); 807 if (gpa == 0) { 808 DRV_LOG(ERR, "Fail to get GPA for available ring."); 809 return -1; 810 } 811 hw->vring[i].avail = gpa; 812 813 /* NET: Direct I/O for Tx queue, relay for Rx queue 814 * BLK: relay every queue 815 */ 816 if ((internal->hw.device_type == IFCVF_NET) && (i & 1)) { 817 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used); 818 if (gpa == 0) { 819 DRV_LOG(ERR, "Fail to get GPA for used ring."); 820 return -1; 821 } 822 hw->vring[i].used = gpa; 823 } else { 824 hw->vring[i].used = m_vring_iova + 825 (char *)internal->m_vring[i].used - 826 (char *)internal->m_vring[i].desc; 827 } 828 829 hw->vring[i].size = vq.size; 830 831 rte_vhost_get_vring_base(vid, i, 832 &internal->m_vring[i].avail->idx, 833 &internal->m_vring[i].used->idx); 834 835 rte_vhost_get_vring_base(vid, i, 
&hw->vring[i].last_avail_idx, 836 &hw->vring[i].last_used_idx); 837 838 m_vring_iova += size; 839 } 840 hw->nr_vring = nr_vring; 841 842 return ifcvf_start_hw(&internal->hw); 843 844 error: 845 for (i = 0; i < nr_vring; i++) 846 rte_free(internal->m_vring[i].desc); 847 848 return -1; 849 } 850 851 static int 852 m_ifcvf_stop(struct ifcvf_internal *internal) 853 { 854 int vid; 855 uint32_t i; 856 struct rte_vhost_vring vq; 857 struct ifcvf_hw *hw = &internal->hw; 858 uint64_t m_vring_iova = IFCVF_MEDIATED_VRING; 859 uint64_t size, len; 860 u32 ring_state = 0; 861 862 vid = internal->vid; 863 864 /* to make sure no packet is lost for blk device 865 * do not stop until last_avail_idx == last_used_idx 866 */ 867 if (internal->hw.device_type == IFCVF_BLK) { 868 for (i = 0; i < hw->nr_vring; i++) { 869 do { 870 if (hw->lm_cfg != NULL) 871 ring_state = *(u32 *)(hw->lm_cfg + 872 IFCVF_LM_RING_STATE_OFFSET + 873 i * IFCVF_LM_CFG_SIZE); 874 hw->vring[i].last_avail_idx = 875 (u16)(ring_state & IFCVF_16_BIT_MASK); 876 hw->vring[i].last_used_idx = 877 (u16)(ring_state >> 16); 878 usleep(10); 879 } while (hw->vring[i].last_avail_idx != 880 hw->vring[i].last_used_idx); 881 } 882 } 883 884 ifcvf_stop_hw(hw); 885 886 for (i = 0; i < hw->nr_vring; i++) { 887 /* synchronize remaining new used entries if any */ 888 if (internal->hw.device_type == IFCVF_NET) { 889 if ((i & 1) == 0) 890 update_used_ring(internal, i); 891 } else if (internal->hw.device_type == IFCVF_BLK) { 892 update_used_ring(internal, i); 893 } 894 895 rte_vhost_get_vhost_vring(vid, i, &vq); 896 len = IFCVF_USED_RING_LEN(vq.size); 897 rte_vhost_log_used_vring(vid, i, 0, len); 898 899 size = RTE_ALIGN_CEIL(vring_size(vq.size, rte_mem_page_size()), 900 rte_mem_page_size()); 901 rte_vfio_container_dma_unmap(internal->vfio_container_fd, 902 (uint64_t)(uintptr_t)internal->m_vring[i].desc, 903 m_vring_iova, size); 904 905 rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx, 906 hw->vring[i].last_used_idx); 907 
rte_free(internal->m_vring[i].desc); 908 m_vring_iova += size; 909 } 910 911 return 0; 912 } 913 914 static void 915 update_used_ring(struct ifcvf_internal *internal, uint16_t qid) 916 { 917 rte_vdpa_relay_vring_used(internal->vid, qid, &internal->m_vring[qid]); 918 rte_vhost_vring_call(internal->vid, qid); 919 } 920 921 static uint32_t 922 vring_relay(void *arg) 923 { 924 int i, vid, epfd, fd, nfds; 925 struct ifcvf_internal *internal = (struct ifcvf_internal *)arg; 926 struct rte_vhost_vring vring; 927 uint16_t qid, q_num; 928 struct epoll_event events[IFCVF_MAX_QUEUES * 4]; 929 struct epoll_event ev; 930 int nbytes; 931 uint64_t buf; 932 933 vid = internal->vid; 934 q_num = rte_vhost_get_vring_num(vid); 935 936 /* add notify fd and interrupt fd to epoll */ 937 epfd = epoll_create(IFCVF_MAX_QUEUES * 2); 938 if (epfd < 0) { 939 DRV_LOG(ERR, "failed to create epoll instance."); 940 return 1; 941 } 942 internal->epfd = epfd; 943 944 vring.kickfd = -1; 945 for (qid = 0; qid < q_num; qid++) { 946 ev.events = EPOLLIN | EPOLLPRI; 947 rte_vhost_get_vhost_vring(vid, qid, &vring); 948 ev.data.u64 = qid << 1 | (uint64_t)vring.kickfd << 32; 949 if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) { 950 DRV_LOG(ERR, "epoll add error: %s", strerror(errno)); 951 return 1; 952 } 953 } 954 955 for (qid = 0; qid < q_num; qid += 1) { 956 if ((internal->hw.device_type == IFCVF_NET) && (qid & 1)) 957 continue; 958 ev.events = EPOLLIN | EPOLLPRI; 959 /* leave a flag to mark it's for interrupt */ 960 ev.data.u64 = 1 | qid << 1 | 961 (uint64_t)internal->intr_fd[qid] << 32; 962 if (epoll_ctl(epfd, EPOLL_CTL_ADD, internal->intr_fd[qid], &ev) 963 < 0) { 964 DRV_LOG(ERR, "epoll add error: %s", strerror(errno)); 965 return 1; 966 } 967 update_used_ring(internal, qid); 968 } 969 970 /* start relay with a first kick */ 971 for (qid = 0; qid < q_num; qid++) 972 ifcvf_notify_queue(&internal->hw, qid); 973 974 /* listen to the events and react accordingly */ 975 for (;;) { 976 nfds = 
epoll_wait(epfd, events, q_num * 2, -1); 977 if (nfds < 0) { 978 if (errno == EINTR) 979 continue; 980 DRV_LOG(ERR, "epoll_wait return fail."); 981 return 1; 982 } 983 984 for (i = 0; i < nfds; i++) { 985 fd = (uint32_t)(events[i].data.u64 >> 32); 986 do { 987 nbytes = read(fd, &buf, 8); 988 if (nbytes < 0) { 989 if (errno == EINTR || 990 errno == EWOULDBLOCK || 991 errno == EAGAIN) 992 continue; 993 DRV_LOG(INFO, "Error reading " 994 "kickfd: %s", 995 strerror(errno)); 996 } 997 break; 998 } while (1); 999 1000 qid = events[i].data.u32 >> 1; 1001 1002 if (events[i].data.u32 & 1) 1003 update_used_ring(internal, qid); 1004 else 1005 ifcvf_notify_queue(&internal->hw, qid); 1006 } 1007 } 1008 1009 return 0; 1010 } 1011 1012 static int 1013 setup_vring_relay(struct ifcvf_internal *internal) 1014 { 1015 char name[RTE_THREAD_INTERNAL_NAME_SIZE]; 1016 int ret; 1017 1018 snprintf(name, sizeof(name), "ifc-ring%d", internal->vid); 1019 ret = rte_thread_create_internal_control(&internal->tid, name, 1020 vring_relay, internal); 1021 if (ret != 0) { 1022 DRV_LOG(ERR, "failed to create ring relay pthread."); 1023 return -1; 1024 } 1025 1026 return 0; 1027 } 1028 1029 static int 1030 unset_vring_relay(struct ifcvf_internal *internal) 1031 { 1032 if (internal->tid.opaque_id != 0) { 1033 pthread_cancel((pthread_t)internal->tid.opaque_id); 1034 rte_thread_join(internal->tid, NULL); 1035 } 1036 internal->tid.opaque_id = 0; 1037 1038 if (internal->epfd >= 0) 1039 close(internal->epfd); 1040 internal->epfd = -1; 1041 1042 return 0; 1043 } 1044 1045 static int 1046 ifcvf_sw_fallback_switchover(struct ifcvf_internal *internal) 1047 { 1048 int ret; 1049 int vid = internal->vid; 1050 1051 /* stop the direct IO data path */ 1052 unset_notify_relay(internal); 1053 vdpa_ifcvf_stop(internal); 1054 1055 unset_intr_relay(internal); 1056 1057 vdpa_disable_vfio_intr(internal); 1058 1059 rte_atomic32_set(&internal->running, 0); 1060 1061 ret = rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, 
false); 1062 if (ret && ret != -ENOTSUP) 1063 goto error; 1064 1065 /* set up interrupt for interrupt relay */ 1066 ret = vdpa_enable_vfio_intr(internal, true); 1067 if (ret) 1068 goto unmap; 1069 1070 /* config the VF */ 1071 ret = m_ifcvf_start(internal); 1072 if (ret) 1073 goto unset_intr; 1074 1075 /* set up vring relay thread */ 1076 ret = setup_vring_relay(internal); 1077 if (ret) 1078 goto stop_vf; 1079 1080 rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, true); 1081 1082 internal->sw_fallback_running = true; 1083 1084 return 0; 1085 1086 stop_vf: 1087 m_ifcvf_stop(internal); 1088 unset_intr: 1089 vdpa_disable_vfio_intr(internal); 1090 unmap: 1091 ifcvf_dma_map(internal, false); 1092 error: 1093 return -1; 1094 } 1095 1096 static int 1097 ifcvf_dev_config(int vid) 1098 { 1099 struct rte_vdpa_device *vdev; 1100 struct internal_list *list; 1101 struct ifcvf_internal *internal; 1102 struct ifcvf_hw *hw; 1103 uint16_t i; 1104 1105 vdev = rte_vhost_get_vdpa_device(vid); 1106 list = find_internal_resource_by_vdev(vdev); 1107 if (list == NULL) { 1108 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev); 1109 return -1; 1110 } 1111 1112 internal = list->internal; 1113 internal->vid = vid; 1114 rte_atomic32_set(&internal->dev_attached, 1); 1115 if (update_datapath(internal) < 0) { 1116 DRV_LOG(ERR, "failed to update datapath for vDPA device %s", 1117 vdev->device->name); 1118 rte_atomic32_set(&internal->dev_attached, 0); 1119 return -1; 1120 } 1121 1122 hw = &internal->hw; 1123 for (i = 0; i < hw->nr_vring; i++) { 1124 if (!hw->vring[i].enable) 1125 continue; 1126 if (rte_vhost_host_notifier_ctrl(vid, i, true) != 0) 1127 DRV_LOG(NOTICE, "vDPA (%s): software relay is used.", 1128 vdev->device->name); 1129 } 1130 1131 internal->configured = 1; 1132 DRV_LOG(INFO, "vDPA device %s is configured", vdev->device->name); 1133 return 0; 1134 } 1135 1136 static int 1137 ifcvf_dev_close(int vid) 1138 { 1139 struct rte_vdpa_device *vdev; 1140 struct internal_list *list; 1141 
struct ifcvf_internal *internal;

	vdev = rte_vhost_get_vdpa_device(vid);
	list = find_internal_resource_by_vdev(vdev);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
		return -1;
	}

	internal = list->internal;

	if (internal->sw_fallback_running) {
		/* unset ring relay */
		unset_vring_relay(internal);

		/* reset VF */
		m_ifcvf_stop(internal);

		/* remove interrupt setting */
		vdpa_disable_vfio_intr(internal);

		/* unset DMA map for guest memory */
		ifcvf_dma_map(internal, false);

		internal->sw_fallback_running = false;
	} else {
		rte_atomic32_set(&internal->dev_attached, 0);
		if (update_datapath(internal) < 0) {
			DRV_LOG(ERR, "failed to update datapath for vDPA device %s",
				vdev->device->name);
			internal->configured = 0;
			return -1;
		}
	}

	internal->configured = 0;
	return 0;
}

/*
 * vhost-user set_features callback.  Only acts when the negotiated
 * features require dirty-page logging (live migration): either switch
 * over to the software fallback datapath, or DMA-map the vhost log
 * buffer for the device and enable HW-assisted logging.
 */
static int
ifcvf_set_features(int vid)
{
	uint64_t features = 0;
	struct rte_vdpa_device *vdev;
	struct internal_list *list;
	struct ifcvf_internal *internal;
	uint64_t log_base = 0, log_size = 0;

	vdev = rte_vhost_get_vdpa_device(vid);
	list = find_internal_resource_by_vdev(vdev);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
		return -1;
	}

	internal = list->internal;
	rte_vhost_get_negotiated_features(vid, &features);

	if (!RTE_VHOST_NEED_LOG(features))
		return 0;

	if (internal->sw_lm) {
		ifcvf_sw_fallback_switchover(internal);
	} else {
		/* map the vhost log region at device address IFCVF_LOG_BASE */
		rte_vhost_get_log_base(vid, &log_base, &log_size);
		rte_vfio_container_dma_map(internal->vfio_container_fd,
				log_base, IFCVF_LOG_BASE, log_size);
		ifcvf_enable_logging(&internal->hw, IFCVF_LOG_BASE, log_size);
	}

	return 0;
}

/* Return the VFIO group fd of the device that vid is attached to. */
static int
ifcvf_get_vfio_group_fd(int vid)
{
	struct rte_vdpa_device
*vdev; 1218 struct internal_list *list; 1219 1220 vdev = rte_vhost_get_vdpa_device(vid); 1221 list = find_internal_resource_by_vdev(vdev); 1222 if (list == NULL) { 1223 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev); 1224 return -1; 1225 } 1226 1227 return list->internal->vfio_group_fd; 1228 } 1229 1230 static int 1231 ifcvf_get_vfio_device_fd(int vid) 1232 { 1233 struct rte_vdpa_device *vdev; 1234 struct internal_list *list; 1235 1236 vdev = rte_vhost_get_vdpa_device(vid); 1237 list = find_internal_resource_by_vdev(vdev); 1238 if (list == NULL) { 1239 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev); 1240 return -1; 1241 } 1242 1243 return list->internal->vfio_dev_fd; 1244 } 1245 1246 static int 1247 ifcvf_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size) 1248 { 1249 struct rte_vdpa_device *vdev; 1250 struct internal_list *list; 1251 struct ifcvf_internal *internal; 1252 struct vfio_region_info reg = { .argsz = sizeof(reg) }; 1253 int ret; 1254 1255 vdev = rte_vhost_get_vdpa_device(vid); 1256 list = find_internal_resource_by_vdev(vdev); 1257 if (list == NULL) { 1258 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev); 1259 return -1; 1260 } 1261 1262 internal = list->internal; 1263 1264 reg.index = ifcvf_get_notify_region(&internal->hw); 1265 ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ®); 1266 if (ret) { 1267 DRV_LOG(ERR, "Get not get device region info: %s", 1268 strerror(errno)); 1269 return -1; 1270 } 1271 1272 *offset = ifcvf_get_queue_notify_off(&internal->hw, qid) + reg.offset; 1273 *size = 0x1000; 1274 1275 return 0; 1276 } 1277 1278 static int 1279 ifcvf_get_queue_num(struct rte_vdpa_device *vdev, uint32_t *queue_num) 1280 { 1281 struct internal_list *list; 1282 1283 list = find_internal_resource_by_vdev(vdev); 1284 if (list == NULL) { 1285 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev); 1286 return -1; 1287 } 1288 1289 *queue_num = list->internal->max_queues; 1290 1291 return 0; 1292 } 1293 1294 static int 1295 
ifcvf_get_vdpa_features(struct rte_vdpa_device *vdev, uint64_t *features)
{
	struct internal_list *list;

	list = find_internal_resource_by_vdev(vdev);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
		return -1;
	}

	*features = list->internal->features;

	return 0;
}

/* vhost-user protocol features common to the net and blk personalities. */
#define VDPA_SUPPORTED_PROTOCOL_FEATURES \
	(1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | \
	1ULL << VHOST_USER_PROTOCOL_F_BACKEND_REQ | \
	1ULL << VHOST_USER_PROTOCOL_F_BACKEND_SEND_FD | \
	1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | \
	1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD | \
	1ULL << VHOST_USER_PROTOCOL_F_MQ | \
	1ULL << VHOST_USER_PROTOCOL_F_STATUS)

/* Extra protocol feature for the blk personality (config space access). */
#define VDPA_BLK_PROTOCOL_FEATURES \
	(1ULL << VHOST_USER_PROTOCOL_F_CONFIG)

/* vhost-user get_protocol_features callback (net personality). */
static int
ifcvf_get_protocol_features(struct rte_vdpa_device *vdev, uint64_t *features)
{
	RTE_SET_USED(vdev);

	*features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
	return 0;
}

/*
 * Program one vring of the VF from the vhost vring state.  For an
 * enabled ring: translate the desc/avail/used HVAs into guest physical
 * addresses, record size and last avail/used indexes, then enable the
 * ring in HW.  For a disabled ring: stop it in HW and report the
 * current indexes back to vhost (continued below).
 */
static int
ifcvf_config_vring(struct ifcvf_internal *internal, int vring)
{
	struct ifcvf_hw *hw = &internal->hw;
	int vid = internal->vid;
	struct rte_vhost_vring vq;
	uint64_t gpa;

	if (hw->vring[vring].enable) {
		rte_vhost_get_vhost_vring(vid, vring, &vq);
		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
		if (gpa == 0) {
			DRV_LOG(ERR, "Fail to get GPA for descriptor ring.");
			return -1;
		}
		hw->vring[vring].desc = gpa;

		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
		if (gpa == 0) {
			DRV_LOG(ERR, "Fail to get GPA for available ring.");
			return -1;
		}
		hw->vring[vring].avail = gpa;

		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
		if (gpa == 0) {
			DRV_LOG(ERR, "Fail to get GPA for used ring.");
			return -1;
		}
		hw->vring[vring].used = gpa;

		hw->vring[vring].size = vq.size;

rte_vhost_get_vring_base(vid, vring, 1364 &hw->vring[vring].last_avail_idx, 1365 &hw->vring[vring].last_used_idx); 1366 ifcvf_enable_vring_hw(&internal->hw, vring); 1367 } else { 1368 ifcvf_disable_vring_hw(&internal->hw, vring); 1369 rte_vhost_set_vring_base(vid, vring, 1370 hw->vring[vring].last_avail_idx, 1371 hw->vring[vring].last_used_idx); 1372 } 1373 1374 return 0; 1375 } 1376 1377 static int 1378 ifcvf_set_vring_state(int vid, int vring, int state) 1379 { 1380 struct rte_vdpa_device *vdev; 1381 struct internal_list *list; 1382 struct ifcvf_internal *internal; 1383 struct ifcvf_hw *hw; 1384 bool enable = !!state; 1385 int ret = 0; 1386 1387 vdev = rte_vhost_get_vdpa_device(vid); 1388 list = find_internal_resource_by_vdev(vdev); 1389 if (list == NULL) { 1390 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev); 1391 return -1; 1392 } 1393 1394 DRV_LOG(INFO, "%s queue %d of vDPA device %s", 1395 enable ? "enable" : "disable", vring, vdev->device->name); 1396 1397 internal = list->internal; 1398 if (vring < 0 || vring >= internal->max_queues * 2) { 1399 DRV_LOG(ERR, "Vring index %d not correct", vring); 1400 return -1; 1401 } 1402 1403 hw = &internal->hw; 1404 hw->vring[vring].enable = enable; 1405 1406 if (!internal->configured) 1407 return 0; 1408 1409 unset_notify_relay(internal); 1410 1411 ret = vdpa_enable_vfio_intr(internal, false); 1412 if (ret) { 1413 DRV_LOG(ERR, "failed to set vfio interrupt of vDPA device %s", 1414 vdev->device->name); 1415 return ret; 1416 } 1417 1418 ret = ifcvf_config_vring(internal, vring); 1419 if (ret) { 1420 DRV_LOG(ERR, "failed to configure queue %d of vDPA device %s", 1421 vring, vdev->device->name); 1422 return ret; 1423 } 1424 1425 ret = setup_notify_relay(internal); 1426 if (ret) { 1427 DRV_LOG(ERR, "failed to setup notify relay of vDPA device %s", 1428 vdev->device->name); 1429 return ret; 1430 } 1431 1432 ret = rte_vhost_host_notifier_ctrl(vid, vring, enable); 1433 if (ret) { 1434 DRV_LOG(ERR, "vDPA device %s queue %d host 
notifier ctrl fail", 1435 vdev->device->name, vring); 1436 return ret; 1437 } 1438 1439 return 0; 1440 } 1441 1442 static int 1443 ifcvf_get_device_type(struct rte_vdpa_device *vdev, 1444 uint32_t *type) 1445 { 1446 struct ifcvf_internal *internal; 1447 struct internal_list *list; 1448 struct rte_device *rte_dev = vdev->device; 1449 1450 list = find_internal_resource_by_rte_dev(rte_dev); 1451 if (list == NULL) { 1452 DRV_LOG(ERR, "Invalid rte device: %p", rte_dev); 1453 return -1; 1454 } 1455 1456 internal = list->internal; 1457 1458 if (internal->hw.device_type == IFCVF_BLK) 1459 *type = RTE_VHOST_VDPA_DEVICE_TYPE_BLK; 1460 else 1461 *type = RTE_VHOST_VDPA_DEVICE_TYPE_NET; 1462 1463 return 0; 1464 } 1465 1466 static struct rte_vdpa_dev_ops ifcvf_net_ops = { 1467 .get_queue_num = ifcvf_get_queue_num, 1468 .get_features = ifcvf_get_vdpa_features, 1469 .get_protocol_features = ifcvf_get_protocol_features, 1470 .dev_conf = ifcvf_dev_config, 1471 .dev_close = ifcvf_dev_close, 1472 .set_vring_state = ifcvf_set_vring_state, 1473 .set_features = ifcvf_set_features, 1474 .migration_done = NULL, 1475 .get_vfio_group_fd = ifcvf_get_vfio_group_fd, 1476 .get_vfio_device_fd = ifcvf_get_vfio_device_fd, 1477 .get_notify_area = ifcvf_get_notify_area, 1478 .get_dev_type = ifcvf_get_device_type, 1479 }; 1480 1481 static inline int 1482 open_int(const char *key __rte_unused, const char *value, void *extra_args) 1483 { 1484 uint16_t *n = extra_args; 1485 1486 if (value == NULL || extra_args == NULL) 1487 return -EINVAL; 1488 1489 *n = (uint16_t)strtoul(value, NULL, 0); 1490 if (*n == USHRT_MAX && errno == ERANGE) 1491 return -1; 1492 1493 return 0; 1494 } 1495 1496 static int16_t 1497 ifcvf_pci_get_device_type(struct rte_pci_device *pci_dev) 1498 { 1499 uint16_t pci_device_id = pci_dev->id.device_id; 1500 uint16_t device_id; 1501 1502 if (pci_device_id < 0x1000 || pci_device_id > 0x107f) { 1503 DRV_LOG(ERR, "Probe device is not a virtio device"); 1504 return -1; 1505 } 1506 1507 if 
(pci_device_id < 0x1040) {
		/* Transitional devices: use the PCI subsystem device id as
		 * virtio device id, same as legacy driver always did.
		 */
		device_id = pci_dev->id.subsystem_device_id;
	} else {
		/* Modern devices: simply use PCI device id,
		 * but start from 0x1040.
		 */
		device_id = pci_device_id - 0x1040;
	}

	return device_id;
}

/*
 * vhost-user get_config callback for the blk personality: copy the
 * device's virtio-blk config space into 'config' and dump the fields at
 * DEBUG level.  'size' must be at least sizeof(struct virtio_blk_config).
 */
static int
ifcvf_blk_get_config(int vid, uint8_t *config, uint32_t size)
{
	struct virtio_blk_config *dev_cfg;
	struct ifcvf_internal *internal;
	struct rte_vdpa_device *vdev;
	struct internal_list *list;
	uint32_t i;
	uint64_t capacity = 0;
	uint8_t *byte;

	if (size < sizeof(struct virtio_blk_config)) {
		DRV_LOG(ERR, "Invalid len: %u, required: %u",
			size, (uint32_t)sizeof(struct virtio_blk_config));
		return -1;
	}

	vdev = rte_vhost_get_vdpa_device(vid);
	if (vdev == NULL) {
		DRV_LOG(ERR, "Invalid vDPA device vid: %d", vid);
		return -1;
	}

	list = find_internal_resource_by_vdev(vdev);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
		return -1;
	}

	internal = list->internal;

	/* copy the device config space out byte by byte */
	for (i = 0; i < sizeof(struct virtio_blk_config); i++)
		config[i] = *((u8 *)internal->hw.blk_cfg + i);

	dev_cfg = (struct virtio_blk_config *)internal->hw.blk_cfg;

	/* cannot read 64-bit register in one attempt, so read byte by byte. */
	for (i = 0; i < sizeof(internal->hw.blk_cfg->capacity); i++) {
		byte = (uint8_t *)&internal->hw.blk_cfg->capacity + i;
		capacity |= (uint64_t)*byte << (i * 8);
	}
	/* The capacity is number of sectors in 512-byte.
	 * So right shift 1 bit we get in K,
	 * another right shift 10 bits we get in M,
	 * right shift 10 more bits, we get in G.
	 * To show capacity in G, we right shift 21 bits in total.
	 */
	DRV_LOG(DEBUG, "capacity : %"PRIu64"G", capacity >> 21);

	DRV_LOG(DEBUG, "size_max : 0x%08x", dev_cfg->size_max);
	DRV_LOG(DEBUG, "seg_max : 0x%08x", dev_cfg->seg_max);
	DRV_LOG(DEBUG, "blk_size : 0x%08x", dev_cfg->blk_size);
	DRV_LOG(DEBUG, "geometry");
	DRV_LOG(DEBUG, " cylinders: %u", dev_cfg->geometry.cylinders);
	DRV_LOG(DEBUG, " heads : %u", dev_cfg->geometry.heads);
	DRV_LOG(DEBUG, " sectors : %u", dev_cfg->geometry.sectors);
	DRV_LOG(DEBUG, "num_queues: 0x%08x", dev_cfg->num_queues);

	DRV_LOG(DEBUG, "config: [%x] [%x] [%x] [%x] [%x] [%x] [%x] [%x]",
		config[0], config[1], config[2], config[3], config[4],
		config[5], config[6], config[7]);
	return 0;
}

/*
 * Protocol features for the blk personality: the common set plus
 * VHOST_USER_PROTOCOL_F_CONFIG for config space access.
 */
static int
ifcvf_blk_get_protocol_features(struct rte_vdpa_device *vdev,
	uint64_t *features)
{
	RTE_SET_USED(vdev);

	*features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
	*features |= VDPA_BLK_PROTOCOL_FEATURES;
	return 0;
}

/* vDPA callbacks for the virtio-blk personality of the device. */
static struct rte_vdpa_dev_ops ifcvf_blk_ops = {
	.get_queue_num = ifcvf_get_queue_num,
	.get_features = ifcvf_get_vdpa_features,
	.set_features = ifcvf_set_features,
	.get_protocol_features = ifcvf_blk_get_protocol_features,
	.dev_conf = ifcvf_dev_config,
	.dev_close = ifcvf_dev_close,
	.set_vring_state = ifcvf_set_vring_state,
	.migration_done = NULL,
	.get_vfio_group_fd = ifcvf_get_vfio_group_fd,
	.get_vfio_device_fd = ifcvf_get_vfio_device_fd,
	.get_notify_area = ifcvf_get_notify_area,
	.get_config = ifcvf_blk_get_config,
	.get_dev_type = ifcvf_get_device_type,
};

/* Per-type default features and ops, indexed by IFCVF_NET / IFCVF_BLK. */
struct rte_vdpa_dev_info dev_info[] = {
	{
		.features = (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
			(1ULL << VIRTIO_NET_F_CTRL_VQ) |
			(1ULL << VIRTIO_NET_F_STATUS) |
			(1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
			(1ULL << VHOST_F_LOG_ALL),
		.ops = &ifcvf_net_ops,
	},
	{
		.features = (1ULL <<
VHOST_USER_F_PROTOCOL_FEATURES) |
			(1ULL << VHOST_F_LOG_ALL),
		.ops = &ifcvf_blk_ops,
	},
};

/*
 * PCI probe: runs only in the primary process and only when the 'vdpa'
 * devarg is set.  Sets up VFIO, initializes the HW layer, determines
 * the net/blk device type and its feature set, then registers the vDPA
 * device and starts the datapath.  Returns 1 to decline the device
 * (not in vdpa mode), 0 on success, -1 on error.
 */
static int
ifcvf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
		struct rte_pci_device *pci_dev)
{
	uint64_t features;
	struct ifcvf_internal *internal = NULL;
	struct internal_list *list = NULL;
	int vdpa_mode = 0;
	int sw_fallback_lm = 0;
	struct rte_kvargs *kvlist = NULL;
	int ret = 0;
	int16_t device_id;
	uint64_t capacity = 0;
	uint8_t *byte;
	uint32_t i;
	uint16_t queue_pairs;

	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	if (!pci_dev->device.devargs)
		return 1;

	kvlist = rte_kvargs_parse(pci_dev->device.devargs->args,
			ifcvf_valid_arguments);
	if (kvlist == NULL)
		return 1;

	/* probe only when vdpa mode is specified */
	if (rte_kvargs_count(kvlist, IFCVF_VDPA_MODE) == 0) {
		rte_kvargs_free(kvlist);
		return 1;
	}

	ret = rte_kvargs_process(kvlist, IFCVF_VDPA_MODE, &open_int,
			&vdpa_mode);
	if (ret < 0 || vdpa_mode == 0) {
		rte_kvargs_free(kvlist);
		return 1;
	}

	list = rte_zmalloc("ifcvf", sizeof(*list), 0);
	if (list == NULL)
		goto error;

	internal = rte_zmalloc("ifcvf", sizeof(*internal), 0);
	if (internal == NULL)
		goto error;

	internal->pdev = pci_dev;
	rte_spinlock_init(&internal->lock);

	if (ifcvf_vfio_setup(internal) < 0) {
		DRV_LOG(ERR, "failed to setup device %s", pci_dev->name);
		goto error;
	}

	if (ifcvf_init_hw(&internal->hw, internal->pdev) < 0) {
		DRV_LOG(ERR, "failed to init device %s", pci_dev->name);
		goto error;
	}

	internal->configured = 0;
	features = ifcvf_get_features(&internal->hw);

	device_id = ifcvf_pci_get_device_type(pci_dev);
	if (device_id < 0) {
		DRV_LOG(ERR, "failed to get device %s type", pci_dev->name);
goto error;
	}

	if (device_id == VIRTIO_ID_NET) {
		internal->hw.device_type = IFCVF_NET;
		/*
		 * ifc device always has CTRL_VQ,
		 * and supports VIRTIO_NET_F_CTRL_VQ feature.
		 */
		queue_pairs = (internal->hw.common_cfg->num_queues - 1) / 2;
		DRV_LOG(INFO, "%s support %u queue pairs", pci_dev->name,
			queue_pairs);
		internal->max_queues = MIN(IFCVF_MAX_QUEUES, queue_pairs);
		internal->features = features &
			~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
		internal->features |= dev_info[IFCVF_NET].features;
	} else if (device_id == VIRTIO_ID_BLOCK) {
		internal->hw.device_type = IFCVF_BLK;
		internal->features = features &
			~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
		internal->features |= dev_info[IFCVF_BLK].features;

		/* cannot read 64-bit register in one attempt,
		 * so read byte by byte.
		 */
		for (i = 0; i < sizeof(internal->hw.blk_cfg->capacity); i++) {
			byte = (uint8_t *)&internal->hw.blk_cfg->capacity + i;
			capacity |= (uint64_t)*byte << (i * 8);
		}
		/* The capacity is number of sectors in 512-byte.
		 * So right shift 1 bit we get in K,
		 * another right shift 10 bits we get in M,
		 * right shift 10 more bits, we get in G.
		 * To show capacity in G, we right shift 21 bits in total.
		 */
		DRV_LOG(DEBUG, "capacity : %"PRIu64"G", capacity >> 21);

		DRV_LOG(DEBUG, "size_max : 0x%08x",
			internal->hw.blk_cfg->size_max);
		DRV_LOG(DEBUG, "seg_max : 0x%08x",
			internal->hw.blk_cfg->seg_max);
		DRV_LOG(DEBUG, "blk_size : 0x%08x",
			internal->hw.blk_cfg->blk_size);
		DRV_LOG(DEBUG, "geometry");
		DRV_LOG(DEBUG, " cylinders: %u",
			internal->hw.blk_cfg->geometry.cylinders);
		DRV_LOG(DEBUG, " heads : %u",
			internal->hw.blk_cfg->geometry.heads);
		DRV_LOG(DEBUG, " sectors : %u",
			internal->hw.blk_cfg->geometry.sectors);
		DRV_LOG(DEBUG, "num_queues: 0x%08x",
			internal->hw.blk_cfg->num_queues);

		internal->max_queues = MIN(IFCVF_MAX_QUEUES,
			internal->hw.blk_cfg->num_queues);
	}

	list->internal = internal;

	if (rte_kvargs_count(kvlist, IFCVF_SW_FALLBACK_LM)) {
		ret = rte_kvargs_process(kvlist, IFCVF_SW_FALLBACK_LM,
				&open_int, &sw_fallback_lm);
		if (ret < 0)
			goto error;
	}
	internal->sw_lm = sw_fallback_lm;
	/* HW-assisted LM needs the lm_cfg region; otherwise sw-lm is required */
	if (!internal->sw_lm && !internal->hw.lm_cfg) {
		DRV_LOG(ERR, "Device %s does not support HW assist live migration, please enable sw-live-migration!",
			pci_dev->name);
		goto error;
	}

	pthread_mutex_lock(&internal_list_lock);
	TAILQ_INSERT_TAIL(&internal_list, list, next);
	pthread_mutex_unlock(&internal_list_lock);

	internal->vdev = rte_vdpa_register_device(&pci_dev->device,
		dev_info[internal->hw.device_type].ops);
	if (internal->vdev == NULL) {
		DRV_LOG(ERR, "failed to register device %s", pci_dev->name);
		pthread_mutex_lock(&internal_list_lock);
		TAILQ_REMOVE(&internal_list, list, next);
		pthread_mutex_unlock(&internal_list_lock);
		goto error;
	}

	rte_atomic32_set(&internal->started, 1);
	if (update_datapath(internal) < 0) {
		DRV_LOG(ERR, "failed to update datapath %s", pci_dev->name);
		rte_atomic32_set(&internal->started, 0);
		rte_vdpa_unregister_device(internal->vdev);
		pthread_mutex_lock(&internal_list_lock);
		TAILQ_REMOVE(&internal_list, list, next);
		pthread_mutex_unlock(&internal_list_lock);
		goto error;
	}

	rte_kvargs_free(kvlist);
	return 0;

error:
	rte_kvargs_free(kvlist);
	rte_free(list);
	rte_free(internal);
	return -1;
}

/*
 * PCI remove: stop the datapath, unmap the device, destroy the VFIO
 * container, unregister the vDPA device and free the per-device state.
 * Runs only in the primary process.
 */
static int
ifcvf_pci_remove(struct rte_pci_device *pci_dev)
{
	struct ifcvf_internal *internal;
	struct internal_list *list;

	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	list = find_internal_resource_by_pci_dev(pci_dev);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device: %s", pci_dev->name);
		return -1;
	}

	internal = list->internal;
	rte_atomic32_set(&internal->started, 0);
	if (update_datapath(internal) < 0)
		DRV_LOG(ERR, "failed to update datapath %s", pci_dev->name);

	rte_pci_unmap_device(internal->pdev);
	rte_vfio_container_destroy(internal->vfio_container_fd);
	rte_vdpa_unregister_device(internal->vdev);

	pthread_mutex_lock(&internal_list_lock);
	TAILQ_REMOVE(&internal_list, list, next);
	pthread_mutex_unlock(&internal_list_lock);

	rte_free(list);
	rte_free(internal);

	return 0;
}

/*
 * IFCVF has the same vendor ID and device ID as virtio net PCI
 * device, with its specific subsystem vendor ID and device ID.
 */
static const struct rte_pci_id pci_id_ifcvf_map[] = {
	{ .class_id = RTE_CLASS_ANY_ID,
	  .vendor_id = IFCVF_VENDOR_ID,
	  .device_id = IFCVF_NET_MODERN_DEVICE_ID,
	  .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
	  .subsystem_device_id = IFCVF_SUBSYS_DEVICE_ID,
	},

	{ .class_id = RTE_CLASS_ANY_ID,
	  .vendor_id = IFCVF_VENDOR_ID,
	  .device_id = IFCVF_NET_TRANSITIONAL_DEVICE_ID,
	  .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
	  .subsystem_device_id = IFCVF_SUBSYS_NET_DEVICE_ID,
	},

	{ .class_id = RTE_CLASS_ANY_ID,
	  .vendor_id = IFCVF_VENDOR_ID,
	  .device_id = IFCVF_BLK_TRANSITIONAL_DEVICE_ID,
	  .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
	  .subsystem_device_id = IFCVF_SUBSYS_BLK_DEVICE_ID,
	},

	{ .class_id = RTE_CLASS_ANY_ID,
	  .vendor_id = IFCVF_VENDOR_ID,
	  .device_id = IFCVF_BLK_MODERN_DEVICE_ID,
	  .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
	  .subsystem_device_id = IFCVF_SUBSYS_BLK_DEVICE_ID,
	},

	{ .class_id = RTE_CLASS_ANY_ID,
	  .vendor_id = IFCVF_VENDOR_ID,
	  .device_id = IFCVF_BLK_MODERN_DEVICE_ID,
	  .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
	  .subsystem_device_id = IFCVF_SUBSYS_DEFAULT_DEVICE_ID,
	}, /* virtio-blk devices with default subsystem IDs */

	{ .vendor_id = 0, /* sentinel */
	},
};

/* PCI driver glue: matches the id table above to probe/remove hooks. */
static struct rte_pci_driver rte_ifcvf_vdpa = {
	.id_table = pci_id_ifcvf_map,
	.drv_flags = 0,
	.probe = ifcvf_pci_probe,
	.remove = ifcvf_pci_remove,
};

RTE_PMD_REGISTER_PCI(net_ifcvf, rte_ifcvf_vdpa);
RTE_PMD_REGISTER_PCI_TABLE(net_ifcvf, pci_id_ifcvf_map);
RTE_PMD_REGISTER_KMOD_DEP(net_ifcvf, "* vfio-pci");