/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) 2023 Corigine, Inc.
 * All rights reserved.
 */

#include <errno.h>
#include <pthread.h>
#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <unistd.h>

#include <nfp_common_pci.h>
#include <nfp_dev.h>
#include <rte_vfio.h>
#include <rte_eal_paging.h>
#include <rte_malloc.h>
#include <vdpa_driver.h>

#include "nfp_vdpa_core.h"
#include "nfp_vdpa_log.h"

#define NFP_VDPA_DRIVER_NAME nfp_vdpa

#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
		sizeof(int) * (NFP_VDPA_MAX_QUEUES * 2 + 1))

#define NFP_VDPA_USED_RING_LEN(size) \
		((size) * sizeof(struct vring_used_elem) + sizeof(struct vring_used))

#define EPOLL_DATA_INTR 1

struct nfp_vdpa_dev {
	struct rte_pci_device *pci_dev;
	struct rte_vdpa_device *vdev;
	struct nfp_vdpa_hw hw;

	int vfio_container_fd;
	int vfio_group_fd;
	int vfio_dev_fd;
	int iommu_group;

	rte_thread_t tid;	/**< Thread for notify relay */
	int epoll_fd;

	int vid;
	uint16_t max_queues;
	RTE_ATOMIC(uint32_t) started;
	RTE_ATOMIC(uint32_t) dev_attached;
	RTE_ATOMIC(uint32_t) running;
	rte_spinlock_t lock;

	/** Eventfd for used ring interrupt */
	int intr_fd[NFP_VDPA_MAX_QUEUES * 2];
};

struct nfp_vdpa_dev_node {
	TAILQ_ENTRY(nfp_vdpa_dev_node) next;
	struct nfp_vdpa_dev *device;
};

TAILQ_HEAD(vdpa_dev_list_head, nfp_vdpa_dev_node);

static struct vdpa_dev_list_head vdpa_dev_list =
	TAILQ_HEAD_INITIALIZER(vdpa_dev_list);

static pthread_mutex_t vdpa_list_lock = PTHREAD_MUTEX_INITIALIZER;

static struct nfp_vdpa_dev_node *
nfp_vdpa_find_node_by_vdev(struct rte_vdpa_device *vdev)
{
	bool found = false;
	struct nfp_vdpa_dev_node *node;

	pthread_mutex_lock(&vdpa_list_lock);

	TAILQ_FOREACH(node, &vdpa_dev_list, next) {
		if (vdev == node->device->vdev) {
			found = true;
			break;
		}
	}

	pthread_mutex_unlock(&vdpa_list_lock);

	if (found)
		return node;

	return NULL;
}

static struct nfp_vdpa_dev_node *
nfp_vdpa_find_node_by_pdev(struct rte_pci_device *pdev)
{
	bool found = false;
	struct nfp_vdpa_dev_node *node;

	pthread_mutex_lock(&vdpa_list_lock);

	TAILQ_FOREACH(node, &vdpa_dev_list, next) {
		if (pdev == node->device->pci_dev) {
			found = true;
			break;
		}
	}

	pthread_mutex_unlock(&vdpa_list_lock);

	if (found)
		return node;

	return NULL;
}

static int
nfp_vdpa_vfio_setup(struct nfp_vdpa_dev *device)
{
	int ret;
	char dev_name[RTE_DEV_NAME_MAX_LEN] = {0};
	struct rte_pci_device *pci_dev = device->pci_dev;

	rte_pci_unmap_device(pci_dev);

	rte_pci_device_name(&pci_dev->addr, dev_name, RTE_DEV_NAME_MAX_LEN);
	ret = rte_vfio_get_group_num(rte_pci_get_sysfs_path(), dev_name,
			&device->iommu_group);
	if (ret <= 0)
		return -1;

	device->vfio_container_fd = rte_vfio_container_create();
	if (device->vfio_container_fd < 0)
		return -1;

	device->vfio_group_fd = rte_vfio_container_group_bind(
			device->vfio_container_fd, device->iommu_group);
	if (device->vfio_group_fd < 0)
		goto container_destroy;

	DRV_VDPA_LOG(DEBUG, "The container_fd=%d, group_fd=%d.",
			device->vfio_container_fd, device->vfio_group_fd);

	ret = rte_pci_map_device(pci_dev);
	if (ret != 0)
		goto group_unbind;

	device->vfio_dev_fd = rte_intr_dev_fd_get(pci_dev->intr_handle);

	return 0;
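
/* Error path: unwind the VFIO setup in the reverse order of creation. */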
group_unbind:
	rte_vfio_container_group_unbind(device->vfio_container_fd, device->iommu_group);
container_destroy:
	rte_vfio_container_destroy(device->vfio_container_fd);

	return -1;
}

static void
nfp_vdpa_vfio_teardown(struct nfp_vdpa_dev *device)
{
	rte_pci_unmap_device(device->pci_dev);
	rte_vfio_container_group_unbind(device->vfio_container_fd, device->iommu_group);
	rte_vfio_container_destroy(device->vfio_container_fd);
}

static int
nfp_vdpa_dma_do_unmap(struct rte_vhost_memory *mem,
		uint32_t times,
		int vfio_container_fd)
{
	uint32_t i;
	int ret = 0;
	struct rte_vhost_mem_region *region;

	for (i = 0; i < times; i++) {
		region = &mem->regions[i];

		ret = rte_vfio_container_dma_unmap(vfio_container_fd,
				region->host_user_addr, region->guest_phys_addr,
				region->size);
		if (ret < 0) {
			/* Do not return here, keep unmapping the remaining regions. */
			DRV_VDPA_LOG(ERR, "DMA unmap failed for region %u.", i);
		}
	}

	return ret;
}

static int
nfp_vdpa_dma_do_map(struct rte_vhost_memory *mem,
		uint32_t times,
		int vfio_container_fd)
{
	int ret;
	uint32_t i;
	struct rte_vhost_mem_region *region;

	for (i = 0; i < times; i++) {
		region = &mem->regions[i];

		ret = rte_vfio_container_dma_map(vfio_container_fd,
				region->host_user_addr, region->guest_phys_addr,
				region->size);
		if (ret < 0) {
			DRV_VDPA_LOG(ERR, "DMA map failed.");
			nfp_vdpa_dma_do_unmap(mem, i, vfio_container_fd);
			return ret;
		}
	}

	return 0;
}

static int
nfp_vdpa_dma_map(struct nfp_vdpa_dev *device,
		bool do_map)
{
	int ret;
	int vfio_container_fd;
	struct rte_vhost_memory *mem = NULL;

	ret = rte_vhost_get_mem_table(device->vid, &mem);
	if (ret < 0) {
		DRV_VDPA_LOG(ERR, "Failed to get memory layout.");
		return ret;
	}

	vfio_container_fd = device->vfio_container_fd;
	DRV_VDPA_LOG(DEBUG, "The vfio_container_fd %d.", vfio_container_fd);

	if (do_map)
		ret = nfp_vdpa_dma_do_map(mem, mem->nregions, vfio_container_fd);
	else
		ret = nfp_vdpa_dma_do_unmap(mem, mem->nregions, vfio_container_fd);

	free(mem);

	return ret;
}

static uint64_t
nfp_vdpa_qva_to_gpa(int vid,
		uint64_t qva)
{
	int ret;
	uint32_t i;
	uint64_t gpa = 0;
	struct rte_vhost_memory *mem = NULL;
	struct rte_vhost_mem_region *region;

	ret = rte_vhost_get_mem_table(vid, &mem);
	if (ret < 0) {
		DRV_VDPA_LOG(ERR, "Failed to get memory layout.");
		return gpa;
	}

	for (i = 0; i < mem->nregions; i++) {
		region = &mem->regions[i];

		if (qva >= region->host_user_addr &&
				qva < region->host_user_addr + region->size) {
			gpa = qva - region->host_user_addr + region->guest_phys_addr;
			break;
		}
	}

	free(mem);

	return gpa;
}

static void
nfp_vdpa_relay_vring_free(struct nfp_vdpa_dev *device,
		uint16_t vring_index)
{
	uint16_t i;
	uint64_t size;
	struct rte_vhost_vring vring;
	uint64_t m_vring_iova = NFP_VDPA_RELAY_VRING;

	for (i = 0; i < vring_index; i++) {
		rte_vhost_get_vhost_vring(device->vid, i, &vring);

		size = RTE_ALIGN_CEIL(vring_size(vring.size, rte_mem_page_size()),
				rte_mem_page_size());
		rte_vfio_container_dma_unmap(device->vfio_container_fd,
				(uint64_t)(uintptr_t)device->hw.m_vring[i].desc,
				m_vring_iova, size);
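
		/* The relay ring memory can be freed once it is unmapped from the container. */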
		rte_free(device->hw.m_vring[i].desc);
		m_vring_iova += size;
	}
}

static int
nfp_vdpa_relay_vring_alloc(struct nfp_vdpa_dev *device)
{
	int ret;
	uint16_t i;
	uint64_t size;
	void *vring_buf;
	uint64_t page_size;
	struct rte_vhost_vring vring;
	struct nfp_vdpa_hw *vdpa_hw = &device->hw;
	uint64_t m_vring_iova = NFP_VDPA_RELAY_VRING;

	page_size = rte_mem_page_size();

	for (i = 0; i < vdpa_hw->nr_vring; i++) {
		rte_vhost_get_vhost_vring(device->vid, i, &vring);

		size = RTE_ALIGN_CEIL(vring_size(vring.size, page_size), page_size);
		vring_buf = rte_zmalloc("nfp_vdpa_relay", size, page_size);
		if (vring_buf == NULL)
			goto vring_free_all;

		vring_init(&vdpa_hw->m_vring[i], vring.size, vring_buf, page_size);

		ret = rte_vfio_container_dma_map(device->vfio_container_fd,
				(uint64_t)(uintptr_t)vring_buf, m_vring_iova, size);
		if (ret != 0) {
			DRV_VDPA_LOG(ERR, "vDPA vring relay dma map failed.");
			goto vring_free_one;
		}

		m_vring_iova += size;
	}

	return 0;

vring_free_one:
	rte_free(device->hw.m_vring[i].desc);
vring_free_all:
	nfp_vdpa_relay_vring_free(device, i);

	return -ENOSPC;
}

static int
nfp_vdpa_start(struct nfp_vdpa_dev *device,
		bool relay)
{
	int ret;
	int vid;
	uint16_t i;
	uint64_t gpa;
	uint16_t size;
	struct rte_vhost_vring vring;
	struct nfp_vdpa_hw *vdpa_hw = &device->hw;
	uint64_t m_vring_iova = NFP_VDPA_RELAY_VRING;

	vid = device->vid;
	vdpa_hw->nr_vring = rte_vhost_get_vring_num(vid);

	ret = rte_vhost_get_negotiated_features(vid, &vdpa_hw->req_features);
	if (ret != 0)
		return ret;

	if (relay) {
		ret = nfp_vdpa_relay_vring_alloc(device);
		if (ret != 0)
			return ret;
	}

	for (i = 0; i < vdpa_hw->nr_vring; i++) {
		ret = rte_vhost_get_vhost_vring(vid, i, &vring);
		if (ret != 0)
			goto relay_vring_free;

		gpa = nfp_vdpa_qva_to_gpa(vid, (uint64_t)(uintptr_t)vring.desc);
		if (gpa == 0) {
			DRV_VDPA_LOG(ERR, "Failed to get GPA for descriptor ring.");
			goto relay_vring_free;
		}

		vdpa_hw->vring[i].desc = gpa;

		gpa = nfp_vdpa_qva_to_gpa(vid, (uint64_t)(uintptr_t)vring.avail);
		if (gpa == 0) {
			DRV_VDPA_LOG(ERR, "Failed to get GPA for available ring.");
			goto relay_vring_free;
		}

		vdpa_hw->vring[i].avail = gpa;

		/* Direct I/O for Tx queue, relay for Rx queue */
		if (relay && ((i & 1) == 0)) {
			vdpa_hw->vring[i].used = m_vring_iova +
					(char *)vdpa_hw->m_vring[i].used -
					(char *)vdpa_hw->m_vring[i].desc;

			ret = rte_vhost_get_vring_base(vid, i,
					&vdpa_hw->m_vring[i].avail->idx,
					&vdpa_hw->m_vring[i].used->idx);
			if (ret != 0)
				goto relay_vring_free;
		} else {
			gpa = nfp_vdpa_qva_to_gpa(vid, (uint64_t)(uintptr_t)vring.used);
			if (gpa == 0) {
				DRV_VDPA_LOG(ERR, "Failed to get GPA for used ring.");
				goto relay_vring_free;
			}

			vdpa_hw->vring[i].used = gpa;
		}

		vdpa_hw->vring[i].size = vring.size;

		if (relay) {
			size = RTE_ALIGN_CEIL(vring_size(vring.size,
					rte_mem_page_size()), rte_mem_page_size());
			m_vring_iova += size;
		}

		ret = rte_vhost_get_vring_base(vid, i,
				&vdpa_hw->vring[i].last_avail_idx,
				&vdpa_hw->vring[i].last_used_idx);
		if (ret != 0)
			goto relay_vring_free;
	}

	if (relay)
		return nfp_vdpa_relay_hw_start(&device->hw, vid);
	else
		return nfp_vdpa_hw_start(&device->hw, vid);

relay_vring_free:
	if (relay)
		nfp_vdpa_relay_vring_free(device, vdpa_hw->nr_vring);

	return -EFAULT;
}

static void
nfp_vdpa_update_used_ring(struct nfp_vdpa_dev *dev,
		uint16_t qid)
{
	rte_vdpa_relay_vring_used(dev->vid, qid, &dev->hw.m_vring[qid]);
	rte_vhost_vring_call(dev->vid, qid);
}

static void
nfp_vdpa_relay_stop(struct nfp_vdpa_dev *device)
{
	int vid;
	uint32_t i;
	uint64_t len;
	struct rte_vhost_vring vring;
	struct nfp_vdpa_hw *vdpa_hw = &device->hw;

	nfp_vdpa_hw_stop(vdpa_hw);

	vid = device->vid;
	for (i = 0; i < vdpa_hw->nr_vring; i++) {
		/* Synchronize remaining new used entries if any */
		if ((i & 1) == 0)
			nfp_vdpa_update_used_ring(device, i);

		rte_vhost_get_vhost_vring(vid, i, &vring);
		len = NFP_VDPA_USED_RING_LEN(vring.size);
		vdpa_hw->vring[i].last_avail_idx = vring.avail->idx;
		vdpa_hw->vring[i].last_used_idx = vring.used->idx;

		rte_vhost_set_vring_base(vid, i,
				vdpa_hw->vring[i].last_avail_idx,
				vdpa_hw->vring[i].last_used_idx);

		rte_vhost_log_used_vring(vid, i, 0, len);

		if (vring.used->idx != vring.avail->idx)
			rte_atomic_store_explicit(
					(unsigned short __rte_atomic *)&vring.used->idx,
					vring.avail->idx, rte_memory_order_release);
	}

	nfp_vdpa_relay_vring_free(device, vdpa_hw->nr_vring);
}

static void
nfp_vdpa_stop(struct nfp_vdpa_dev *device,
		bool relay)
{
	int vid;
	uint32_t i;
	struct nfp_vdpa_hw *vdpa_hw = &device->hw;

	nfp_vdpa_hw_stop(vdpa_hw);

	vid = device->vid;
	if (relay)
		nfp_vdpa_relay_stop(device);
	else
		for (i = 0; i < vdpa_hw->nr_vring; i++)
			rte_vhost_set_vring_base(vid, i,
					vdpa_hw->vring[i].last_avail_idx,
					vdpa_hw->vring[i].last_used_idx);
}

static int
nfp_vdpa_enable_vfio_intr(struct nfp_vdpa_dev *device,
		bool relay)
{
	int fd;
	int ret;
	uint16_t i;
	int *fd_ptr;
	uint16_t nr_vring;
	struct vfio_irq_set *irq_set;
	struct rte_vhost_vring vring;
	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];

	nr_vring = rte_vhost_get_vring_num(device->vid);

	irq_set = (struct vfio_irq_set *)irq_set_buf;
	irq_set->argsz = sizeof(irq_set_buf);
	irq_set->count = nr_vring + 1;
	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
	irq_set->start = 0;

	fd_ptr = (int *)&irq_set->data;
	fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = rte_intr_fd_get(device->pci_dev->intr_handle);

	for (i = 0; i < nr_vring; i++)
		device->intr_fd[i] = -1;

	for (i = 0; i < nr_vring; i++) {
		rte_vhost_get_vhost_vring(device->vid, i, &vring);
		fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
	}

	if (relay) {
		for (i = 0; i < nr_vring; i += 2) {
			fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
			if (fd < 0) {
				DRV_VDPA_LOG(ERR, "Can't setup eventfd.");
				return -EINVAL;
			}

			device->intr_fd[i] = fd;
			fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = fd;
		}
	}

	ret = ioctl(device->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
	if (ret != 0) {
		DRV_VDPA_LOG(ERR, "Error enabling MSI-X interrupts.");
		return -EIO;
	}

	return 0;
}

static int
nfp_vdpa_disable_vfio_intr(struct nfp_vdpa_dev *device)
{
	int ret;
	struct vfio_irq_set *irq_set;
	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
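
	/* A trigger request with zero count and DATA_NONE disables every MSI-X vector. */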
	irq_set = (struct vfio_irq_set *)irq_set_buf;
	irq_set->argsz = sizeof(irq_set_buf);
	irq_set->count = 0;
	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
	irq_set->start = 0;

	ret = ioctl(device->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
	if (ret != 0) {
		DRV_VDPA_LOG(ERR, "Error disabling MSI-X interrupts.");
		return -EIO;
	}

	return 0;
}

static void
nfp_vdpa_read_kickfd(int kickfd)
{
	int bytes;
	uint64_t buf;

	for (;;) {
		bytes = read(kickfd, &buf, 8);
		if (bytes >= 0)
			break;

		if (errno != EINTR && errno != EWOULDBLOCK &&
				errno != EAGAIN) {
			DRV_VDPA_LOG(ERR, "Error reading kickfd.");
			break;
		}
	}
}

static int
nfp_vdpa_notify_epoll_ctl(uint32_t queue_num,
		struct nfp_vdpa_dev *device)
{
	int ret;
	uint32_t qid;

	for (qid = 0; qid < queue_num; qid++) {
		struct epoll_event ev;
		struct rte_vhost_vring vring;

		ev.events = EPOLLIN | EPOLLPRI;
		rte_vhost_get_vhost_vring(device->vid, qid, &vring);
		ev.data.u64 = qid | (uint64_t)vring.kickfd << 32;
		ret = epoll_ctl(device->epoll_fd, EPOLL_CTL_ADD, vring.kickfd, &ev);
		if (ret < 0) {
			DRV_VDPA_LOG(ERR, "Epoll add error for queue %u.", qid);
			return ret;
		}
	}

	return 0;
}

static int
nfp_vdpa_notify_epoll_wait(uint32_t queue_num,
		struct nfp_vdpa_dev *device)
{
	int i;
	int fds;
	int kickfd;
	uint32_t qid;
	struct epoll_event events[NFP_VDPA_MAX_QUEUES * 2];

	for (;;) {
		fds = epoll_wait(device->epoll_fd, events, queue_num, -1);
		if (fds < 0) {
			if (errno == EINTR)
				continue;

			DRV_VDPA_LOG(ERR, "Epoll wait failed.");
			return -EACCES;
		}

		for (i = 0; i < fds; i++) {
			qid = events[i].data.u32;
			kickfd = (uint32_t)(events[i].data.u64 >> 32);

			nfp_vdpa_read_kickfd(kickfd);
			nfp_vdpa_notify_queue(&device->hw, qid);
		}
	}

	return 0;
}

static uint32_t
nfp_vdpa_notify_relay(void *arg)
{
	int ret;
	int epoll_fd;
	uint32_t queue_num;
	struct nfp_vdpa_dev *device = arg;

	epoll_fd = epoll_create(NFP_VDPA_MAX_QUEUES * 2);
	if (epoll_fd < 0) {
		DRV_VDPA_LOG(ERR, "Failed to create epoll instance.");
		return 1;
	}

	device->epoll_fd = epoll_fd;

	queue_num = rte_vhost_get_vring_num(device->vid);

	ret = nfp_vdpa_notify_epoll_ctl(queue_num, device);
	if (ret != 0)
		goto notify_exit;

	ret = nfp_vdpa_notify_epoll_wait(queue_num, device);
	if (ret != 0)
		goto notify_exit;

	return 0;

notify_exit:
	close(device->epoll_fd);
	device->epoll_fd = -1;

	return 1;
}

static int
nfp_vdpa_setup_notify_relay(struct nfp_vdpa_dev *device)
{
	int ret;
	char name[RTE_THREAD_INTERNAL_NAME_SIZE];

	snprintf(name, sizeof(name), "nfp-noti%d", device->vid);
	ret = rte_thread_create_internal_control(&device->tid, name,
			nfp_vdpa_notify_relay, (void *)device);
	if (ret != 0) {
		DRV_VDPA_LOG(ERR, "Failed to create notify relay pthread.");
		return -1;
	}

	return 0;
}

static void
nfp_vdpa_unset_notify_relay(struct nfp_vdpa_dev *device)
{
	if (device->tid.opaque_id != 0) {
		pthread_cancel((pthread_t)device->tid.opaque_id);
		rte_thread_join(device->tid, NULL);
		device->tid.opaque_id = 0;
	}

	if (device->epoll_fd >= 0) {
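		/* Release the epoll instance created by the relay thread. */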
		close(device->epoll_fd);
		device->epoll_fd = -1;
	}
}

static int
update_datapath(struct nfp_vdpa_dev *device)
{
	int ret;

	rte_spinlock_lock(&device->lock);

	/*
	 * Bring up the datapath once the device is started and a vhost device
	 * is attached; tear it down when it is running but has been stopped
	 * or detached.
	 */
	if ((rte_atomic_load_explicit(&device->running, rte_memory_order_relaxed) == 0) &&
			(rte_atomic_load_explicit(&device->started,
					rte_memory_order_relaxed) != 0) &&
			(rte_atomic_load_explicit(&device->dev_attached,
					rte_memory_order_relaxed) != 0)) {
		ret = nfp_vdpa_dma_map(device, true);
		if (ret != 0)
			goto unlock_exit;

		ret = nfp_vdpa_enable_vfio_intr(device, false);
		if (ret != 0)
			goto dma_map_rollback;

		ret = nfp_vdpa_start(device, false);
		if (ret != 0)
			goto disable_vfio_intr;

		ret = nfp_vdpa_setup_notify_relay(device);
		if (ret != 0)
			goto vdpa_stop;

		rte_atomic_store_explicit(&device->running, 1, rte_memory_order_relaxed);
	} else if ((rte_atomic_load_explicit(&device->running, rte_memory_order_relaxed) != 0) &&
			((rte_atomic_load_explicit(&device->started,
					rte_memory_order_relaxed) == 0) ||
			(rte_atomic_load_explicit(&device->dev_attached,
					rte_memory_order_relaxed) == 0))) {
		nfp_vdpa_unset_notify_relay(device);

		nfp_vdpa_stop(device, false);

		ret = nfp_vdpa_disable_vfio_intr(device);
		if (ret != 0)
			goto unlock_exit;

		ret = nfp_vdpa_dma_map(device, false);
		if (ret != 0)
			goto unlock_exit;

		rte_atomic_store_explicit(&device->running, 0, rte_memory_order_relaxed);
	}

	rte_spinlock_unlock(&device->lock);
	return 0;

vdpa_stop:
	nfp_vdpa_stop(device, false);
disable_vfio_intr:
	nfp_vdpa_disable_vfio_intr(device);
dma_map_rollback:
	nfp_vdpa_dma_map(device, false);
unlock_exit:
	rte_spinlock_unlock(&device->lock);
	return ret;
}

static int
nfp_vdpa_vring_epoll_ctl(uint32_t queue_num,
		struct nfp_vdpa_dev *device)
{
	int ret;
	uint32_t qid;
	struct epoll_event ev;
	struct rte_vhost_vring vring;

	for (qid = 0; qid < queue_num; qid++) {
		ev.events = EPOLLIN | EPOLLPRI;
		rte_vhost_get_vhost_vring(device->vid, qid, &vring);
		ev.data.u64 = qid << 1 | (uint64_t)vring.kickfd << 32;
		ret = epoll_ctl(device->epoll_fd, EPOLL_CTL_ADD, vring.kickfd, &ev);
		if (ret < 0) {
			DRV_VDPA_LOG(ERR, "Epoll add error for queue %u.", qid);
			return ret;
		}
	}

	/* vDPA driver interrupt */
	for (qid = 0; qid < queue_num; qid += 2) {
		ev.events = EPOLLIN | EPOLLPRI;
		/* Leave a flag to mark it's for interrupt */
		ev.data.u64 = EPOLL_DATA_INTR | qid << 1 |
				(uint64_t)device->intr_fd[qid] << 32;
		ret = epoll_ctl(device->epoll_fd, EPOLL_CTL_ADD,
				device->intr_fd[qid], &ev);
		if (ret < 0) {
			DRV_VDPA_LOG(ERR, "Epoll add error for queue %u.", qid);
			return ret;
		}

		nfp_vdpa_update_used_ring(device, qid);
	}

	return 0;
}

static int
nfp_vdpa_vring_epoll_wait(uint32_t queue_num,
		struct nfp_vdpa_dev *device)
{
	int i;
	int fds;
	int kickfd;
	uint32_t qid;
	struct epoll_event events[NFP_VDPA_MAX_QUEUES * 2];

	for (;;) {
		fds = epoll_wait(device->epoll_fd, events, queue_num * 2, -1);
		if (fds < 0) {
			if (errno == EINTR)
				continue;

			DRV_VDPA_LOG(ERR, "Epoll wait failed.");
			return -EACCES;
		}

		for (i = 0; i < fds; i++) {
			qid = events[i].data.u32 >> 1;
			kickfd = (uint32_t)(events[i].data.u64 >> 32);

			nfp_vdpa_read_kickfd(kickfd);
			if ((events[i].data.u32 & EPOLL_DATA_INTR) != 0) {
				nfp_vdpa_update_used_ring(device, qid);
				nfp_vdpa_irq_unmask(&device->hw);
			} else {
				nfp_vdpa_notify_queue(&device->hw, qid);
			}
		}
	}

	return 0;
}

static uint32_t
nfp_vdpa_vring_relay(void *arg)
{
	int ret;
	int epoll_fd;
	uint16_t queue_id;
	uint32_t queue_num;
	struct nfp_vdpa_dev *device = arg;

	epoll_fd = epoll_create(NFP_VDPA_MAX_QUEUES * 2);
	if (epoll_fd < 0) {
		DRV_VDPA_LOG(ERR, "Failed to create epoll instance.");
		return 1;
	}

	device->epoll_fd = epoll_fd;

	queue_num = rte_vhost_get_vring_num(device->vid);

	ret = nfp_vdpa_vring_epoll_ctl(queue_num, device);
	if (ret != 0)
		goto notify_exit;

	/* Start relay with a first kick */
	for (queue_id = 0; queue_id < queue_num; queue_id++)
		nfp_vdpa_notify_queue(&device->hw, queue_id);

	ret = nfp_vdpa_vring_epoll_wait(queue_num, device);
	if (ret != 0)
		goto notify_exit;

	return 0;

notify_exit:
	close(device->epoll_fd);
	device->epoll_fd = -1;

	return 1;
}

static int
nfp_vdpa_setup_vring_relay(struct nfp_vdpa_dev *device)
{
	int ret;
	char name[RTE_THREAD_INTERNAL_NAME_SIZE];

	snprintf(name, sizeof(name), "nfp_vring%d", device->vid);
	ret = rte_thread_create_internal_control(&device->tid, name,
			nfp_vdpa_vring_relay, (void *)device);
	if (ret != 0) {
		DRV_VDPA_LOG(ERR, "Failed to create vring relay pthread.");
		return -EPERM;
	}

	return 0;
}

static int
nfp_vdpa_sw_fallback(struct nfp_vdpa_dev *device)
{
	int ret;
	int vid = device->vid;

	/* Stop the direct IO data path */
	nfp_vdpa_unset_notify_relay(device);
	nfp_vdpa_disable_vfio_intr(device);

	ret = rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, false);
	if ((ret != 0) && (ret != -ENOTSUP)) {
		DRV_VDPA_LOG(ERR, "Unset the host notifier failed.");
		goto error;
	}

	/* Setup interrupt for vring relay */
	ret = nfp_vdpa_enable_vfio_intr(device, true);
	if (ret != 0)
		goto error;

	/* Config the VF */
	ret = nfp_vdpa_start(device, true);
	if (ret != 0)
		goto unset_intr;

	/* Setup vring relay thread */
	ret = nfp_vdpa_setup_vring_relay(device);
	if (ret != 0)
		goto stop_vf;

	device->hw.sw_fallback_running = true;

	return 0;

stop_vf:
	nfp_vdpa_stop(device, true);
unset_intr:
	nfp_vdpa_disable_vfio_intr(device);
error:
	return ret;
}

static int
nfp_vdpa_dev_config(int vid)
{
	int ret;
	struct nfp_vdpa_dev *device;
	struct rte_vdpa_device *vdev;
	struct nfp_vdpa_dev_node *node;

	vdev = rte_vhost_get_vdpa_device(vid);
	node = nfp_vdpa_find_node_by_vdev(vdev);
	if (node == NULL) {
		DRV_VDPA_LOG(ERR, "Invalid vDPA device: %p.", vdev);
		return -ENODEV;
	}

	device = node->device;
	device->vid = vid;
	rte_atomic_store_explicit(&device->dev_attached, 1, rte_memory_order_relaxed);
	update_datapath(device);

	ret = rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, true);
	if (ret != 0)
		DRV_VDPA_LOG(INFO, "vDPA (%s): software relay is used.",
				vdev->device->name);

	return 0;
}

static int
nfp_vdpa_dev_close(int vid)
{
	struct nfp_vdpa_dev *device;
	struct rte_vdpa_device *vdev;
	struct nfp_vdpa_dev_node *node;

	vdev = rte_vhost_get_vdpa_device(vid);
	node = nfp_vdpa_find_node_by_vdev(vdev);
	if (node == NULL) {
		DRV_VDPA_LOG(ERR, "Invalid vDPA device: %p.", vdev);
		return -ENODEV;
	}

	device = node->device;
	if (device->hw.sw_fallback_running) {
		/* Reset VF */
		nfp_vdpa_stop(device, true);

		/* Remove interrupt setting */
		nfp_vdpa_disable_vfio_intr(device);

		/* Unset DMA map for guest memory */
		nfp_vdpa_dma_map(device, false);

		device->hw.sw_fallback_running = false;

		rte_atomic_store_explicit(&device->dev_attached, 0,
				rte_memory_order_relaxed);
		rte_atomic_store_explicit(&device->running, 0,
				rte_memory_order_relaxed);
	} else {
		rte_atomic_store_explicit(&device->dev_attached, 0,
				rte_memory_order_relaxed);
		update_datapath(device);
	}

	return 0;
}

static int
nfp_vdpa_get_vfio_group_fd(int vid)
{
	struct rte_vdpa_device *vdev;
	struct nfp_vdpa_dev_node *node;

	vdev = rte_vhost_get_vdpa_device(vid);
	node = nfp_vdpa_find_node_by_vdev(vdev);
	if (node == NULL) {
		DRV_VDPA_LOG(ERR, "Invalid vDPA device: %p.", vdev);
		return -ENODEV;
	}

	return node->device->vfio_group_fd;
}

static int
nfp_vdpa_get_vfio_device_fd(int vid)
{
	struct rte_vdpa_device *vdev;
	struct nfp_vdpa_dev_node *node;

	vdev = rte_vhost_get_vdpa_device(vid);
	node = nfp_vdpa_find_node_by_vdev(vdev);
	if (node == NULL) {
		DRV_VDPA_LOG(ERR, "Invalid vDPA device: %p.", vdev);
		return -ENODEV;
	}

	return node->device->vfio_dev_fd;
}

static int
nfp_vdpa_get_notify_area(int vid,
		int qid,
		uint64_t *offset,
		uint64_t *size)
{
	int ret;
	struct nfp_vdpa_dev *device;
	struct rte_vdpa_device *vdev;
	struct nfp_vdpa_dev_node *node;
	struct vfio_region_info region = {
		.argsz = sizeof(region)
	};

	vdev = rte_vhost_get_vdpa_device(vid);
	node = nfp_vdpa_find_node_by_vdev(vdev);
	if (node == NULL) {
		DRV_VDPA_LOG(ERR, "Invalid vDPA device: %p.", vdev);
		return -ENODEV;
	}

	device = node->device;
	region.index = device->hw.notify_region;

	ret = ioctl(device->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &region);
	if (ret != 0) {
		DRV_VDPA_LOG(ERR, "Failed to get device region info.");
		return -EIO;
	}

	*offset = nfp_vdpa_get_queue_notify_offset(&device->hw, qid) + region.offset;
	*size = NFP_VDPA_NOTIFY_ADDR_INTERVAL;

	return 0;
}

static int
nfp_vdpa_get_queue_num(struct rte_vdpa_device *vdev,
		uint32_t *queue_num)
{
	struct nfp_vdpa_dev_node *node;

	node = nfp_vdpa_find_node_by_vdev(vdev);
	if (node == NULL) {
		DRV_VDPA_LOG(ERR, "Invalid vDPA device: %p.", vdev);
		return -ENODEV;
	}

	*queue_num = node->device->max_queues;

	return 0;
}

static int
nfp_vdpa_get_vdpa_features(struct rte_vdpa_device *vdev,
		uint64_t *features)
{
	struct nfp_vdpa_dev_node *node;

	node = nfp_vdpa_find_node_by_vdev(vdev);
	if (node == NULL) {
		DRV_VDPA_LOG(ERR, "Invalid vDPA device: %p.", vdev);
		return -ENODEV;
	}

	*features = node->device->hw.features;

	return 0;
}

static int
nfp_vdpa_get_protocol_features(struct rte_vdpa_device *vdev __rte_unused,
		uint64_t *features)
{
	*features = 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD |
			1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK |
			1ULL << VHOST_USER_PROTOCOL_F_BACKEND_REQ |
			1ULL << VHOST_USER_PROTOCOL_F_BACKEND_SEND_FD |
			1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER;

	return 0;
}

static int
nfp_vdpa_set_features(int32_t vid)
{
	int ret;
	uint64_t features = 0;
	struct nfp_vdpa_dev *device;
	struct rte_vdpa_device *vdev;
	struct nfp_vdpa_dev_node *node;

	DRV_VDPA_LOG(DEBUG, "Start vid=%d.", vid);

	vdev = rte_vhost_get_vdpa_device(vid);
	node = nfp_vdpa_find_node_by_vdev(vdev);
	if (node == NULL) {
		DRV_VDPA_LOG(ERR, "Invalid vDPA device: %p.", vdev);
		return -ENODEV;
	}

	rte_vhost_get_negotiated_features(vid, &features);

	if (RTE_VHOST_NEED_LOG(features) == 0)
		return 0;

	device = node->device;
	if (device->hw.sw_lm) {
		ret = nfp_vdpa_sw_fallback(device);
		if (ret != 0) {
			DRV_VDPA_LOG(ERR, "Software fallback start failed.");
			return -1;
		}
	}

	return 0;
}

static int
nfp_vdpa_set_vring_state(int vid,
		int vring,
		int state)
{
	DRV_VDPA_LOG(DEBUG, "Start vid=%d, vring=%d, state=%d.", vid, vring, state);
	return 0;
}

struct rte_vdpa_dev_ops nfp_vdpa_ops = {
	.get_queue_num = nfp_vdpa_get_queue_num,
	.get_features = nfp_vdpa_get_vdpa_features,
	.get_protocol_features = nfp_vdpa_get_protocol_features,
	.dev_conf = nfp_vdpa_dev_config,
	.dev_close = nfp_vdpa_dev_close,
	.set_vring_state = nfp_vdpa_set_vring_state,
	.set_features = nfp_vdpa_set_features,
	.get_vfio_group_fd = nfp_vdpa_get_vfio_group_fd,
	.get_vfio_device_fd = nfp_vdpa_get_vfio_device_fd,
	.get_notify_area = nfp_vdpa_get_notify_area,
};

static int
nfp_vdpa_pci_probe(struct rte_pci_device *pci_dev)
{
	int ret;
	struct nfp_vdpa_dev *device;
	struct nfp_vdpa_dev_node *node;

	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	node = calloc(1, sizeof(*node));
	if (node == NULL)
		return -ENOMEM;

	device = calloc(1, sizeof(*device));
	if (device == NULL)
		goto free_node;

	device->pci_dev = pci_dev;

	ret = nfp_vdpa_vfio_setup(device);
	if (ret != 0)
		goto free_device;

	ret = nfp_vdpa_hw_init(&device->hw, pci_dev);
	if (ret != 0)
		goto vfio_teardown;

	device->max_queues = NFP_VDPA_MAX_QUEUES;

	device->vdev = rte_vdpa_register_device(&pci_dev->device, &nfp_vdpa_ops);
	if (device->vdev == NULL) {
		DRV_VDPA_LOG(ERR, "Failed to register device %s.", pci_dev->name);
		goto vfio_teardown;
	}

	node->device = device;
	pthread_mutex_lock(&vdpa_list_lock);
	TAILQ_INSERT_TAIL(&vdpa_dev_list, node, next);
	pthread_mutex_unlock(&vdpa_list_lock);

	rte_spinlock_init(&device->lock);
	rte_atomic_store_explicit(&device->started, 1, rte_memory_order_relaxed);
	update_datapath(device);

	return 0;

vfio_teardown:
	nfp_vdpa_vfio_teardown(device);
free_device:
	free(device);
free_node:
	free(node);

	return -1;
}

static int
nfp_vdpa_pci_remove(struct rte_pci_device *pci_dev)
{
	struct nfp_vdpa_dev *device;
	struct nfp_vdpa_dev_node *node;

	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	node = nfp_vdpa_find_node_by_pdev(pci_dev);
	if (node == NULL) {
DRV_VDPA_LOG(ERR, "Invalid device: %s.", pci_dev->name); 1267 return -ENODEV; 1268 } 1269 1270 device = node->device; 1271 1272 rte_atomic_store_explicit(&device->started, 0, rte_memory_order_relaxed); 1273 update_datapath(device); 1274 1275 pthread_mutex_lock(&vdpa_list_lock); 1276 TAILQ_REMOVE(&vdpa_dev_list, node, next); 1277 pthread_mutex_unlock(&vdpa_list_lock); 1278 1279 rte_vdpa_unregister_device(device->vdev); 1280 nfp_vdpa_vfio_teardown(device); 1281 1282 free(device); 1283 free(node); 1284 1285 return 0; 1286 } 1287 1288 static const struct rte_pci_id pci_id_nfp_vdpa_map[] = { 1289 { 1290 RTE_PCI_DEVICE(PCI_VENDOR_ID_NETRONOME, 1291 PCI_DEVICE_ID_NFP6000_VF_NIC) 1292 }, 1293 { 1294 .vendor_id = 0, 1295 }, 1296 }; 1297 1298 static struct nfp_class_driver nfp_vdpa = { 1299 .drv_class = NFP_CLASS_VDPA, 1300 .name = RTE_STR(NFP_VDPA_DRIVER_NAME), 1301 .id_table = pci_id_nfp_vdpa_map, 1302 .drv_flags = RTE_PCI_DRV_NEED_MAPPING | RTE_PCI_DRV_INTR_LSC, 1303 .probe = nfp_vdpa_pci_probe, 1304 .remove = nfp_vdpa_pci_remove, 1305 }; 1306 1307 RTE_INIT(nfp_vdpa_init) 1308 { 1309 nfp_class_driver_register(&nfp_vdpa); 1310 } 1311 1312 RTE_PMD_REGISTER_PCI_TABLE(NFP_VDPA_DRIVER_NAME, pci_id_nfp_vdpa_map); 1313 RTE_PMD_REGISTER_KMOD_DEP(NFP_VDPA_DRIVER_NAME, "* vfio-pci"); 1314