/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2018 Intel Corporation
 */

/* Security model
 * --------------
 * The vhost-user protocol connection is an external interface, so it must be
 * robust against invalid inputs.
 *
 * This is important because the vhost-user frontend is only one step removed
 * from the guest. Malicious guests that have escaped will then launch further
 * attacks from the vhost-user frontend.
 *
 * Even in deployments where guests are trusted, a bug in the vhost-user frontend
 * can still cause invalid messages to be sent. Such messages must not
 * compromise the stability of the DPDK application by causing crashes, memory
 * corruption, or other problematic behavior.
 *
 * Do not assume received VhostUserMsg fields contain sensible values!
 */

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#ifdef RTE_LIBRTE_VHOST_NUMA
#include <numaif.h>
#endif
#ifdef RTE_LIBRTE_VHOST_POSTCOPY
#include <linux/userfaultfd.h>
#endif
#ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */
#include <linux/memfd.h>
#define MEMFD_SUPPORTED
#endif

#include <rte_common.h>
#include <rte_malloc.h>
#include <rte_log.h>
#include <rte_vfio.h>
#include <rte_errno.h>

#include "iotlb.h"
#include "vhost.h"
#include "vhost_user.h"

#define VIRTIO_MIN_MTU 68
#define VIRTIO_MAX_MTU 65535

#define INFLIGHT_ALIGNMENT 64
#define INFLIGHT_VERSION 0x1

typedef struct vhost_message_handler {
	const char *description;
	int (*callback)(struct virtio_net **pdev, struct vhu_msg_context *ctx,
		int main_fd);
	bool accepts_fd;
} vhost_message_handler_t;
static vhost_message_handler_t vhost_message_handlers[];

static int send_vhost_reply(struct virtio_net *dev, int sockfd, struct vhu_msg_context *ctx);
static int read_vhost_message(struct virtio_net *dev, int sockfd, struct vhu_msg_context *ctx);

static void
close_msg_fds(struct vhu_msg_context *ctx)
{
	int i;

	for (i = 0; i < ctx->fd_num; i++) {
		int fd = ctx->fds[i];

		if (fd == -1)
			continue;

		ctx->fds[i] = -1;
		close(fd);
	}
}

/*
 * Ensure the expected number of FDs is received,
 * close all FDs and return an error if this is not the case.
 */
static int
validate_msg_fds(struct virtio_net *dev, struct vhu_msg_context *ctx, int expected_fds)
{
	if (ctx->fd_num == expected_fds)
		return 0;

	VHOST_CONFIG_LOG(dev->ifname, ERR,
		"expect %d FDs for request %s, received %d",
		expected_fds, vhost_message_handlers[ctx->msg.request.frontend].description,
		ctx->fd_num);

	close_msg_fds(ctx);

	return -1;
}

static uint64_t
get_blk_size(int fd)
{
	struct stat stat;
	int ret;

	ret = fstat(fd, &stat);
	return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
}
static void
async_dma_map(struct virtio_net *dev, bool do_map)
{
	int ret = 0;
	uint32_t i;
	struct guest_page *page;

	if (do_map) {
		for (i = 0; i < dev->nr_guest_pages; i++) {
			page = &dev->guest_pages[i];
			ret = rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
					page->host_user_addr,
					page->host_iova,
					page->size);
			if (ret) {
				/*
				 * The DMA device may be bound to a kernel driver, in which
				 * case we don't need to program the IOMMU manually. However,
				 * if no device is bound with vfio/uio in DPDK and the vfio
				 * kernel module is loaded, the API will still be called and
				 * return ENODEV.
				 *
				 * DPDK vfio only returns ENODEV in very similar situations
				 * (vfio either unsupported, or supported but no devices found).
				 * Either way, no mappings could be performed. We treat it as
				 * the normal case in the async path. This is a workaround.
				 */
				if (rte_errno == ENODEV)
					return;

				/* DMA mapping errors won't stop VHOST_USER_SET_MEM_TABLE. */
				VHOST_CONFIG_LOG(dev->ifname, ERR, "DMA engine map failed");
			}
		}

	} else {
		for (i = 0; i < dev->nr_guest_pages; i++) {
			page = &dev->guest_pages[i];
			ret = rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
					page->host_user_addr,
					page->host_iova,
					page->size);
			if (ret) {
				/* As with DMA map, ignore the kernel driver case when unmapping. */
				if (rte_errno == EINVAL)
					return;

				VHOST_CONFIG_LOG(dev->ifname, ERR, "DMA engine unmap failed");
			}
		}
	}
}

static void
free_mem_region(struct virtio_net *dev)
{
	uint32_t i;
	struct rte_vhost_mem_region *reg;

	if (!dev || !dev->mem)
		return;

	if (dev->async_copy && rte_vfio_is_enabled("vfio"))
		async_dma_map(dev, false);

	for (i = 0; i < dev->mem->nregions; i++) {
		reg = &dev->mem->regions[i];
		if (reg->host_user_addr) {
			munmap(reg->mmap_addr, reg->mmap_size);
			close(reg->fd);
		}
	}
}

void
vhost_backend_cleanup(struct virtio_net *dev)
{
	struct rte_vdpa_device *vdpa_dev;

	vdpa_dev = dev->vdpa_dev;
	if (vdpa_dev && vdpa_dev->ops->dev_cleanup != NULL)
		vdpa_dev->ops->dev_cleanup(dev->vid);

	if (dev->mem) {
		free_mem_region(dev);
		rte_free(dev->mem);
		dev->mem = NULL;
	}

	rte_free(dev->guest_pages);
	dev->guest_pages = NULL;

	if (dev->log_addr) {
		munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
		dev->log_addr = 0;
	}

	if (dev->inflight_info) {
		if (dev->inflight_info->addr) {
			munmap(dev->inflight_info->addr,
				dev->inflight_info->size);
			dev->inflight_info->addr = NULL;
		}

		if (dev->inflight_info->fd >= 0) {
			close(dev->inflight_info->fd);
			dev->inflight_info->fd = -1;
		}

		rte_free(dev->inflight_info);
		dev->inflight_info = NULL;
	}

	if (dev->backend_req_fd >= 0) {
		close(dev->backend_req_fd);
		dev->backend_req_fd = -1;
	}

	if (dev->postcopy_ufd >= 0) {
		close(dev->postcopy_ufd);
		dev->postcopy_ufd = -1;
	}

	dev->postcopy_listening = 0;

	vhost_user_iotlb_destroy(dev);
}
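
/*
 * Propagate a virtqueue state change: re-apply the guest notification
 * setting on enable, then let the vDPA driver and the application know
 * through their respective callbacks.
 */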
static void
vhost_user_notify_queue_state(struct virtio_net *dev, struct vhost_virtqueue *vq,
		int enable)
{
	struct rte_vdpa_device *vdpa_dev = dev->vdpa_dev;

	/* Configure guest notifications on enable */
	if (enable && vq->notif_enable != VIRTIO_UNINITIALIZED_NOTIF)
		vhost_enable_guest_notification(dev, vq, vq->notif_enable);

	if (vdpa_dev && vdpa_dev->ops->set_vring_state)
		vdpa_dev->ops->set_vring_state(dev->vid, vq->index, enable);

	if (dev->notify_ops->vring_state_changed)
		dev->notify_ops->vring_state_changed(dev->vid, vq->index, enable);
}

/*
 * This function just returns success at the moment unless
 * the device hasn't been initialised.
 */
static int
vhost_user_set_owner(struct virtio_net **pdev __rte_unused,
		struct vhu_msg_context *ctx __rte_unused,
		int main_fd __rte_unused)
{
	return RTE_VHOST_MSG_RESULT_OK;
}

static int
vhost_user_reset_owner(struct virtio_net **pdev,
		struct vhu_msg_context *ctx __rte_unused,
		int main_fd __rte_unused)
{
	struct virtio_net *dev = *pdev;

	vhost_destroy_device_notify(dev);

	cleanup_device(dev, 0);
	reset_device(dev);
	return RTE_VHOST_MSG_RESULT_OK;
}

/*
 * The features that we support are requested.
 */
static int
vhost_user_get_features(struct virtio_net **pdev,
		struct vhu_msg_context *ctx,
		int main_fd __rte_unused)
{
	struct virtio_net *dev = *pdev;
	uint64_t features = 0;

	rte_vhost_driver_get_features(dev->ifname, &features);

	ctx->msg.payload.u64 = features;
	ctx->msg.size = sizeof(ctx->msg.payload.u64);
	ctx->fd_num = 0;

	return RTE_VHOST_MSG_RESULT_REPLY;
}

/*
 * The number of queues that we support is requested.
 */
static int
vhost_user_get_queue_num(struct virtio_net **pdev,
		struct vhu_msg_context *ctx,
		int main_fd __rte_unused)
{
	struct virtio_net *dev = *pdev;
	uint32_t queue_num = 0;

	rte_vhost_driver_get_queue_num(dev->ifname, &queue_num);

	ctx->msg.payload.u64 = (uint64_t)queue_num;
	ctx->msg.size = sizeof(ctx->msg.payload.u64);
	ctx->fd_num = 0;

	return RTE_VHOST_MSG_RESULT_REPLY;
}

/*
 * We receive the negotiated features supported by us and the virtio device.
 */
static int
vhost_user_set_features(struct virtio_net **pdev,
		struct vhu_msg_context *ctx,
		int main_fd __rte_unused)
{
	struct virtio_net *dev = *pdev;
	uint64_t features = ctx->msg.payload.u64;
	uint64_t vhost_features = 0;
	struct rte_vdpa_device *vdpa_dev;

	rte_vhost_driver_get_features(dev->ifname, &vhost_features);
	if (features & ~vhost_features) {
		VHOST_CONFIG_LOG(dev->ifname, ERR, "received invalid negotiated features.");
		dev->flags |= VIRTIO_DEV_FEATURES_FAILED;
		dev->status &= ~VIRTIO_DEVICE_STATUS_FEATURES_OK;

		return RTE_VHOST_MSG_RESULT_ERR;
	}

	if (dev->flags & VIRTIO_DEV_RUNNING) {
		if (dev->features == features)
			return RTE_VHOST_MSG_RESULT_OK;

		/*
		 * Error out if the frontend tries to change features while the
		 * device is in running state. The exception is VHOST_F_LOG_ALL,
		 * which is enabled when live migration starts.
		 */
		if ((dev->features ^ features) & ~(1ULL << VHOST_F_LOG_ALL)) {
			VHOST_CONFIG_LOG(dev->ifname, ERR,
				"features changed while device is running.");
			return RTE_VHOST_MSG_RESULT_ERR;
		}

		if (dev->notify_ops->features_changed)
			dev->notify_ops->features_changed(dev->vid, features);
	}

	dev->features = features;
	if (dev->features &
		((1ULL << VIRTIO_NET_F_MRG_RXBUF) |
		 (1ULL << VIRTIO_F_VERSION_1) |
		 (1ULL << VIRTIO_F_RING_PACKED))) {
		dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	} else {
		dev->vhost_hlen = sizeof(struct virtio_net_hdr);
	}
	VHOST_CONFIG_LOG(dev->ifname, INFO,
		"negotiated Virtio features: 0x%" PRIx64,
		dev->features);
	VHOST_CONFIG_LOG(dev->ifname, DEBUG,
		"mergeable RX buffers %s, virtio 1 %s",
		(dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off",
		(dev->features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off");

	if ((dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET) &&
	    !(dev->features & (1ULL << VIRTIO_NET_F_MQ))) {
		/*
		 * Remove all but first queue pair if MQ hasn't been
		 * negotiated. This is safe because the device is not
		 * running at this stage.
		 */
		while (dev->nr_vring > 2) {
			struct vhost_virtqueue *vq;

			vq = dev->virtqueue[--dev->nr_vring];
			if (!vq)
				continue;

			dev->virtqueue[dev->nr_vring] = NULL;
			cleanup_vq(vq, 1);
			cleanup_vq_inflight(dev, vq);
			/* vhost_user_lock_all_queue_pairs locked all qps */
			vq_assert_lock(dev, vq);
			rte_rwlock_write_unlock(&vq->access_lock);
			free_vq(dev, vq);
		}
	}

	vdpa_dev = dev->vdpa_dev;
	if (vdpa_dev)
		vdpa_dev->ops->set_features(dev->vid);

	dev->flags &= ~VIRTIO_DEV_FEATURES_FAILED;
	return RTE_VHOST_MSG_RESULT_OK;
}

/*
 * The virtio device sends us the size of the descriptor ring.
 */
static int
vhost_user_set_vring_num(struct virtio_net **pdev,
		struct vhu_msg_context *ctx,
		int main_fd __rte_unused)
{
	struct virtio_net *dev = *pdev;
	struct vhost_virtqueue *vq = dev->virtqueue[ctx->msg.payload.state.index];

	if (ctx->msg.payload.state.num > 32768) {
		VHOST_CONFIG_LOG(dev->ifname, ERR,
			"invalid virtqueue size %u",
			ctx->msg.payload.state.num);
		return RTE_VHOST_MSG_RESULT_ERR;
	}

	vq->size = ctx->msg.payload.state.num;

	/* VIRTIO 1.0, 2.4 Virtqueues says:
	 *
	 *   Queue Size value is always a power of 2. The maximum Queue Size
	 *   value is 32768.
	 *
	 * VIRTIO 1.1 2.7 Virtqueues says:
	 *
	 *   Packed virtqueues support up to 2^15 entries each.
	 */
	if (!vq_is_packed(dev)) {
		if (vq->size & (vq->size - 1)) {
			VHOST_CONFIG_LOG(dev->ifname, ERR,
				"invalid virtqueue size %u",
				vq->size);
			return RTE_VHOST_MSG_RESULT_ERR;
		}
	}

	if (vq_is_packed(dev)) {
		rte_free(vq->shadow_used_packed);
		vq->shadow_used_packed = rte_malloc_socket(NULL,
				vq->size *
				sizeof(struct vring_used_elem_packed),
				RTE_CACHE_LINE_SIZE, vq->numa_node);
		if (!vq->shadow_used_packed) {
			VHOST_CONFIG_LOG(dev->ifname, ERR,
				"failed to allocate memory for shadow used ring.");
			return RTE_VHOST_MSG_RESULT_ERR;
		}

	} else {
		rte_free(vq->shadow_used_split);

		vq->shadow_used_split = rte_malloc_socket(NULL,
				vq->size * sizeof(struct vring_used_elem),
				RTE_CACHE_LINE_SIZE, vq->numa_node);

		if (!vq->shadow_used_split) {
			VHOST_CONFIG_LOG(dev->ifname, ERR,
				"failed to allocate memory for vq internal data.");
			return RTE_VHOST_MSG_RESULT_ERR;
		}
	}

	rte_free(vq->batch_copy_elems);
	vq->batch_copy_elems = rte_malloc_socket(NULL,
				vq->size * sizeof(struct batch_copy_elem),
				RTE_CACHE_LINE_SIZE, vq->numa_node);
	if (!vq->batch_copy_elems) {
		VHOST_CONFIG_LOG(dev->ifname, ERR,
			"failed to allocate memory for batching copy.");
		return RTE_VHOST_MSG_RESULT_ERR;
	}

	return RTE_VHOST_MSG_RESULT_OK;
}

/*
 * Reallocate virtio_dev, vhost_virtqueue and related data structures to
 * place them on the same NUMA node as the memory of the vring descriptors.
 */
#ifdef RTE_LIBRTE_VHOST_NUMA
static void
numa_realloc(struct virtio_net **pdev, struct vhost_virtqueue **pvq)
{
	int node, dev_node;
	struct virtio_net *dev;
	struct vhost_virtqueue *vq;
	struct batch_copy_elem *bce;
	struct guest_page *gp;
	struct rte_vhost_memory *mem;
	size_t mem_size;
	int ret;

	dev = *pdev;
	vq = *pvq;

	/*
	 * If the VQ is ready, it is too late to reallocate: it certainly
	 * already happened on VHOST_USER_SET_VRING_ADDR anyway.
	 */
	if (vq->ready)
		return;

	ret = get_mempolicy(&node, NULL, 0, vq->desc, MPOL_F_NODE | MPOL_F_ADDR);
	if (ret) {
		VHOST_CONFIG_LOG(dev->ifname, ERR,
			"unable to get virtqueue %d numa information.",
			vq->index);
		return;
	}

	if (node == vq->numa_node)
		goto out_dev_realloc;

	vq = rte_realloc_socket(*pvq, sizeof(**pvq), 0, node);
	if (!vq) {
		VHOST_CONFIG_LOG(dev->ifname, ERR,
			"failed to realloc virtqueue %d on node %d",
			(*pvq)->index, node);
		return;
	}
	*pvq = vq;

	if (vq != dev->virtqueue[vq->index]) {
		VHOST_CONFIG_LOG(dev->ifname, INFO, "reallocated virtqueue on node %d", node);
		dev->virtqueue[vq->index] = vq;
	}

	if (vq_is_packed(dev)) {
		struct vring_used_elem_packed *sup;

		sup = rte_realloc_socket(vq->shadow_used_packed, vq->size * sizeof(*sup),
				RTE_CACHE_LINE_SIZE, node);
		if (!sup) {
			VHOST_CONFIG_LOG(dev->ifname, ERR,
				"failed to realloc shadow packed on node %d",
				node);
			return;
		}
		vq->shadow_used_packed = sup;
	} else {
		struct vring_used_elem *sus;

		sus = rte_realloc_socket(vq->shadow_used_split, vq->size * sizeof(*sus),
				RTE_CACHE_LINE_SIZE, node);
		if (!sus) {
			VHOST_CONFIG_LOG(dev->ifname, ERR,
				"failed to realloc shadow split on node %d",
				node);
			return;
		}
		vq->shadow_used_split = sus;
	}

	bce = rte_realloc_socket(vq->batch_copy_elems, vq->size * sizeof(*bce),
			RTE_CACHE_LINE_SIZE, node);
	if (!bce) {
		VHOST_CONFIG_LOG(dev->ifname, ERR,
			"failed to realloc batch copy elem on node %d",
			node);
		return;
	}
	vq->batch_copy_elems = bce;

	if (vq->log_cache) {
		struct log_cache_entry *lc;

		lc = rte_realloc_socket(vq->log_cache, sizeof(*lc) * VHOST_LOG_CACHE_NR, 0, node);
		if (!lc) {
			VHOST_CONFIG_LOG(dev->ifname, ERR,
				"failed to realloc log cache on node %d",
				node);
			return;
		}
		vq->log_cache = lc;
	}

	if (vq->resubmit_inflight) {
		struct rte_vhost_resubmit_info *ri;

		ri = rte_realloc_socket(vq->resubmit_inflight, sizeof(*ri), 0, node);
		if (!ri) {
			VHOST_CONFIG_LOG(dev->ifname, ERR,
				"failed to realloc resubmit inflight on node %d",
				node);
			return;
		}
		vq->resubmit_inflight = ri;

		if (ri->resubmit_list) {
			struct rte_vhost_resubmit_desc *rd;

			rd = rte_realloc_socket(ri->resubmit_list, sizeof(*rd) * ri->resubmit_num,
					0, node);
			if (!rd) {
				VHOST_CONFIG_LOG(dev->ifname, ERR,
					"failed to realloc resubmit list on node %d",
					node);
				return;
			}
			ri->resubmit_list = rd;
		}
	}

	vq->numa_node = node;

out_dev_realloc:

	if (dev->flags & VIRTIO_DEV_RUNNING)
		return;

	ret = get_mempolicy(&dev_node, NULL, 0, dev, MPOL_F_NODE | MPOL_F_ADDR);
	if (ret) {
		VHOST_CONFIG_LOG(dev->ifname, ERR, "unable to get numa information.");
		return;
	}

	if (dev_node == node)
		return;

	dev = rte_realloc_socket(*pdev, sizeof(**pdev), 0, node);
	if (!dev) {
		VHOST_CONFIG_LOG((*pdev)->ifname, ERR, "failed to realloc dev on node %d", node);
		return;
	}
	*pdev = dev;

	VHOST_CONFIG_LOG(dev->ifname, INFO, "reallocated device on node %d", node);
	vhost_devices[dev->vid] = dev;

	mem_size = sizeof(struct rte_vhost_memory) +
		sizeof(struct rte_vhost_mem_region) * dev->mem->nregions;
	mem = rte_realloc_socket(dev->mem, mem_size, 0, node);
	if (!mem) {
		VHOST_CONFIG_LOG(dev->ifname, ERR,
			"failed to realloc mem table on node %d",
			node);
		return;
	}
	dev->mem = mem;
ERR, 653 "failed to realloc mem table on node %d", 654 node); 655 return; 656 } 657 dev->mem = mem; 658 659 gp = rte_realloc_socket(dev->guest_pages, dev->max_guest_pages * sizeof(*gp), 660 RTE_CACHE_LINE_SIZE, node); 661 if (!gp) { 662 VHOST_CONFIG_LOG(dev->ifname, ERR, 663 "failed to realloc guest pages on node %d", 664 node); 665 return; 666 } 667 dev->guest_pages = gp; 668 669 vhost_user_iotlb_init(dev); 670 } 671 #else 672 static void 673 numa_realloc(struct virtio_net **pdev, struct vhost_virtqueue **pvq) 674 { 675 RTE_SET_USED(pdev); 676 RTE_SET_USED(pvq); 677 } 678 #endif 679 680 /* Converts QEMU virtual address to Vhost virtual address. */ 681 static uint64_t 682 qva_to_vva(struct virtio_net *dev, uint64_t qva, uint64_t *len) 683 { 684 struct rte_vhost_mem_region *r; 685 uint32_t i; 686 687 if (unlikely(!dev || !dev->mem)) 688 goto out_error; 689 690 /* Find the region where the address lives. */ 691 for (i = 0; i < dev->mem->nregions; i++) { 692 r = &dev->mem->regions[i]; 693 694 if (qva >= r->guest_user_addr && 695 qva < r->guest_user_addr + r->size) { 696 697 if (unlikely(*len > r->guest_user_addr + r->size - qva)) 698 *len = r->guest_user_addr + r->size - qva; 699 700 return qva - r->guest_user_addr + 701 r->host_user_addr; 702 } 703 } 704 out_error: 705 *len = 0; 706 707 return 0; 708 } 709 710 711 /* 712 * Converts ring address to Vhost virtual address. 713 * If IOMMU is enabled, the ring address is a guest IO virtual address, 714 * else it is a QEMU virtual address. 715 */ 716 static uint64_t 717 ring_addr_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq, 718 uint64_t ra, uint64_t *size) 719 { 720 if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) { 721 uint64_t vva; 722 723 vhost_user_iotlb_rd_lock(vq); 724 vva = vhost_iova_to_vva(dev, vq, ra, 725 size, VHOST_ACCESS_RW); 726 vhost_user_iotlb_rd_unlock(vq); 727 728 return vva; 729 } 730 731 return qva_to_vva(dev, ra, size); 732 } 733 734 static uint64_t 735 log_addr_to_gpa(struct virtio_net *dev, struct vhost_virtqueue *vq) 736 { 737 uint64_t log_gpa; 738 739 vhost_user_iotlb_rd_lock(vq); 740 log_gpa = translate_log_addr(dev, vq, vq->ring_addrs.log_guest_addr); 741 vhost_user_iotlb_rd_unlock(vq); 742 743 return log_gpa; 744 } 745 746 static uint64_t 747 hua_to_alignment(struct rte_vhost_memory *mem, void *ptr) 748 { 749 struct rte_vhost_mem_region *r; 750 uint32_t i; 751 uintptr_t hua = (uintptr_t)ptr; 752 753 for (i = 0; i < mem->nregions; i++) { 754 r = &mem->regions[i]; 755 if (hua >= r->host_user_addr && 756 hua < r->host_user_addr + r->size) { 757 return get_blk_size(r->fd); 758 } 759 } 760 761 /* If region isn't found, don't align at all */ 762 return 1; 763 } 764 765 void 766 mem_set_dump(struct virtio_net *dev, void *ptr, size_t size, bool enable, uint64_t pagesz) 767 { 768 #ifdef MADV_DONTDUMP 769 void *start = RTE_PTR_ALIGN_FLOOR(ptr, pagesz); 770 uintptr_t end = RTE_ALIGN_CEIL((uintptr_t)ptr + size, pagesz); 771 size_t len = end - (uintptr_t)start; 772 773 if (madvise(start, len, enable ? 
void
mem_set_dump(struct virtio_net *dev, void *ptr, size_t size, bool enable, uint64_t pagesz)
{
#ifdef MADV_DONTDUMP
	void *start = RTE_PTR_ALIGN_FLOOR(ptr, pagesz);
	uintptr_t end = RTE_ALIGN_CEIL((uintptr_t)ptr + size, pagesz);
	size_t len = end - (uintptr_t)start;

	if (madvise(start, len, enable ? MADV_DODUMP : MADV_DONTDUMP) == -1) {
		VHOST_CONFIG_LOG(dev->ifname, INFO,
			"could not set coredump preference (%s).", strerror(errno));
	}
#endif
}

static void
translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq)
{
	struct vhost_virtqueue *vq;
	struct virtio_net *dev;
	uint64_t len, expected_len;

	dev = *pdev;
	vq = *pvq;

	vq_assert_lock(dev, vq);

	if (vq->ring_addrs.flags & (1 << VHOST_VRING_F_LOG)) {
		vq->log_guest_addr =
			log_addr_to_gpa(dev, vq);
		if (vq->log_guest_addr == 0) {
			VHOST_CONFIG_LOG(dev->ifname, DEBUG, "failed to map log_guest_addr.");
			return;
		}
	}

	if (vq_is_packed(dev)) {
		len = sizeof(struct vring_packed_desc) * vq->size;
		vq->desc_packed = (struct vring_packed_desc *)(uintptr_t)
			ring_addr_to_vva(dev, vq, vq->ring_addrs.desc_user_addr, &len);
		if (vq->desc_packed == NULL ||
				len != sizeof(struct vring_packed_desc) *
				vq->size) {
			VHOST_CONFIG_LOG(dev->ifname, DEBUG, "failed to map desc_packed ring.");
			return;
		}

		mem_set_dump(dev, vq->desc_packed, len, true,
			hua_to_alignment(dev->mem, vq->desc_packed));
		numa_realloc(&dev, &vq);
		*pdev = dev;
		*pvq = vq;

		len = sizeof(struct vring_packed_desc_event);
		vq->driver_event = (struct vring_packed_desc_event *)
					(uintptr_t)ring_addr_to_vva(dev,
					vq, vq->ring_addrs.avail_user_addr, &len);
		if (vq->driver_event == NULL ||
				len != sizeof(struct vring_packed_desc_event)) {
			VHOST_CONFIG_LOG(dev->ifname, DEBUG,
				"failed to find driver area address.");
			return;
		}

		mem_set_dump(dev, vq->driver_event, len, true,
			hua_to_alignment(dev->mem, vq->driver_event));
		len = sizeof(struct vring_packed_desc_event);
		vq->device_event = (struct vring_packed_desc_event *)
					(uintptr_t)ring_addr_to_vva(dev,
					vq, vq->ring_addrs.used_user_addr, &len);
		if (vq->device_event == NULL ||
				len != sizeof(struct vring_packed_desc_event)) {
			VHOST_CONFIG_LOG(dev->ifname, DEBUG,
				"failed to find device area address.");
			return;
		}

		mem_set_dump(dev, vq->device_event, len, true,
			hua_to_alignment(dev->mem, vq->device_event));
		vq->access_ok = true;
		return;
	}

	/* The addresses are converted from QEMU virtual to Vhost virtual. */
	if (vq->desc && vq->avail && vq->used)
		return;

	len = sizeof(struct vring_desc) * vq->size;
	vq->desc = (struct vring_desc *)(uintptr_t)ring_addr_to_vva(dev,
			vq, vq->ring_addrs.desc_user_addr, &len);
	if (vq->desc == 0 || len != sizeof(struct vring_desc) * vq->size) {
		VHOST_CONFIG_LOG(dev->ifname, DEBUG, "failed to map desc ring.");
		return;
	}

	mem_set_dump(dev, vq->desc, len, true, hua_to_alignment(dev->mem, vq->desc));
	numa_realloc(&dev, &vq);
	*pdev = dev;
	*pvq = vq;

	len = sizeof(struct vring_avail) + sizeof(uint16_t) * vq->size;
	if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX))
		len += sizeof(uint16_t);
	expected_len = len;
	vq->avail = (struct vring_avail *)(uintptr_t)ring_addr_to_vva(dev,
			vq, vq->ring_addrs.avail_user_addr, &len);
	if (vq->avail == 0 || len != expected_len) {
		VHOST_CONFIG_LOG(dev->ifname, DEBUG, "failed to map avail ring.");
		return;
	}

	mem_set_dump(dev, vq->avail, len, true, hua_to_alignment(dev->mem, vq->avail));
	len = sizeof(struct vring_used) +
		sizeof(struct vring_used_elem) * vq->size;
	if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX))
		len += sizeof(uint16_t);
	expected_len = len;
	vq->used = (struct vring_used *)(uintptr_t)ring_addr_to_vva(dev,
			vq, vq->ring_addrs.used_user_addr, &len);
	if (vq->used == 0 || len != expected_len) {
		VHOST_CONFIG_LOG(dev->ifname, DEBUG, "failed to map used ring.");
		return;
	}

	mem_set_dump(dev, vq->used, len, true, hua_to_alignment(dev->mem, vq->used));

	if (vq->last_used_idx != vq->used->idx) {
		VHOST_CONFIG_LOG(dev->ifname, WARNING,
			"last_used_idx (%u) and vq->used->idx (%u) mismatch;",
			vq->last_used_idx, vq->used->idx);
		vq->last_used_idx = vq->used->idx;
		vq->last_avail_idx = vq->used->idx;
		VHOST_CONFIG_LOG(dev->ifname, WARNING,
			"some packets may be resent for Tx and dropped for Rx");
	}

	vq->access_ok = true;

	VHOST_CONFIG_LOG(dev->ifname, DEBUG, "mapped address desc: %p", vq->desc);
	VHOST_CONFIG_LOG(dev->ifname, DEBUG, "mapped address avail: %p", vq->avail);
	VHOST_CONFIG_LOG(dev->ifname, DEBUG, "mapped address used: %p", vq->used);
	VHOST_CONFIG_LOG(dev->ifname, DEBUG, "log_guest_addr: %" PRIx64, vq->log_guest_addr);
}

/*
 * The virtio device sends us the desc, used and avail ring addresses.
 * This function then converts these to our address space.
 */
static int
vhost_user_set_vring_addr(struct virtio_net **pdev,
		struct vhu_msg_context *ctx,
		int main_fd __rte_unused)
{
	struct virtio_net *dev = *pdev;
	struct vhost_virtqueue *vq;
	struct vhost_vring_addr *addr = &ctx->msg.payload.addr;
	bool access_ok;

	if (dev->mem == NULL)
		return RTE_VHOST_MSG_RESULT_ERR;

	/* addr->index refers to the queue index. The txq is 1, rxq is 0. */
	vq = dev->virtqueue[ctx->msg.payload.addr.index];

	/* vhost_user_lock_all_queue_pairs locked all qps */
	vq_assert_lock(dev, vq);

	access_ok = vq->access_ok;

	/*
	 * Ring addresses should not be interpreted as long as the ring is not
	 * started and enabled.
	 */
	memcpy(&vq->ring_addrs, addr, sizeof(*addr));

	vring_invalidate(dev, vq);

	if ((vq->enabled && (dev->features &
				(1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) ||
			access_ok) {
		translate_ring_addresses(&dev, &vq);
		*pdev = dev;
	}

	return RTE_VHOST_MSG_RESULT_OK;
}

/*
 * The virtio device sends us the available ring last used index.
 */
static int
vhost_user_set_vring_base(struct virtio_net **pdev,
		struct vhu_msg_context *ctx,
		int main_fd __rte_unused)
{
	struct virtio_net *dev = *pdev;
	struct vhost_virtqueue *vq = dev->virtqueue[ctx->msg.payload.state.index];
	uint64_t val = ctx->msg.payload.state.num;

	if (vq_is_packed(dev)) {
		/*
		 * Bit[0:14]: avail index
		 * Bit[15]: avail wrap counter
		 */
		vq->last_avail_idx = val & 0x7fff;
		vq->avail_wrap_counter = !!(val & (0x1 << 15));
		/*
		 * Set used index to same value as available one, as
		 * their values should be the same since ring processing
		 * was stopped at get time.
		 */
		vq->last_used_idx = vq->last_avail_idx;
		vq->used_wrap_counter = vq->avail_wrap_counter;
	} else {
		vq->last_used_idx = ctx->msg.payload.state.num;
		vq->last_avail_idx = ctx->msg.payload.state.num;
	}

	VHOST_CONFIG_LOG(dev->ifname, INFO,
		"vring base idx:%u last_used_idx:%u last_avail_idx:%u.",
		ctx->msg.payload.state.index, vq->last_used_idx, vq->last_avail_idx);

	return RTE_VHOST_MSG_RESULT_OK;
}

static int
add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
		uint64_t host_iova, uint64_t host_user_addr, uint64_t size)
{
	struct guest_page *page, *last_page;
	struct guest_page *old_pages;

	if (dev->nr_guest_pages == dev->max_guest_pages) {
		dev->max_guest_pages *= 2;
		old_pages = dev->guest_pages;
		dev->guest_pages = rte_realloc(dev->guest_pages,
					dev->max_guest_pages * sizeof(*page),
					RTE_CACHE_LINE_SIZE);
		if (dev->guest_pages == NULL) {
			VHOST_CONFIG_LOG(dev->ifname, ERR, "cannot realloc guest_pages");
			rte_free(old_pages);
			return -1;
		}
	}

	if (dev->nr_guest_pages > 0) {
		last_page = &dev->guest_pages[dev->nr_guest_pages - 1];
		/* merge if the two pages are contiguous */
		if (host_iova == last_page->host_iova + last_page->size &&
		    guest_phys_addr == last_page->guest_phys_addr + last_page->size &&
		    host_user_addr == last_page->host_user_addr + last_page->size) {
			last_page->size += size;
			return 0;
		}
	}

	page = &dev->guest_pages[dev->nr_guest_pages++];
	page->guest_phys_addr = guest_phys_addr;
	page->host_iova = host_iova;
	page->host_user_addr = host_user_addr;
	page->size = size;

	return 0;
}

static int
add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg,
		uint64_t page_size)
{
	uint64_t reg_size = reg->size;
	uint64_t host_user_addr = reg->host_user_addr;
	uint64_t guest_phys_addr = reg->guest_phys_addr;
	uint64_t host_iova;
	uint64_t size;

	host_iova = rte_mem_virt2iova((void *)(uintptr_t)host_user_addr);
	size = page_size - (guest_phys_addr & (page_size - 1));
	size = RTE_MIN(size, reg_size);

	if (add_one_guest_page(dev, guest_phys_addr, host_iova,
			       host_user_addr, size) < 0)
		return -1;

	host_user_addr += size;
	guest_phys_addr += size;
	reg_size -= size;

	while (reg_size > 0) {
		size = RTE_MIN(reg_size, page_size);
		host_iova = rte_mem_virt2iova((void *)(uintptr_t)
						host_user_addr);
		if (add_one_guest_page(dev, guest_phys_addr, host_iova,
				       host_user_addr, size) < 0)
			return -1;

		host_user_addr += size;
		guest_phys_addr += size;
		reg_size -= size;
	}

	/* sort guest page array if over binary search threshold */
	if (dev->nr_guest_pages >= VHOST_BINARY_SEARCH_THRESH) {
		qsort((void *)dev->guest_pages, dev->nr_guest_pages,
			sizeof(struct guest_page), guest_page_addrcmp);
	}

	return 0;
}

#ifdef RTE_LIBRTE_VHOST_DEBUG
/* TODO: enable it only in debug mode? */
static void
dump_guest_pages(struct virtio_net *dev)
{
	uint32_t i;
	struct guest_page *page;

	for (i = 0; i < dev->nr_guest_pages; i++) {
		page = &dev->guest_pages[i];

		VHOST_CONFIG_LOG(dev->ifname, INFO, "guest physical page region %u", i);
		VHOST_CONFIG_LOG(dev->ifname, INFO, "\tguest_phys_addr: %" PRIx64,
				page->guest_phys_addr);
		VHOST_CONFIG_LOG(dev->ifname, INFO, "\thost_iova : %" PRIx64,
				page->host_iova);
		VHOST_CONFIG_LOG(dev->ifname, INFO, "\tsize : %" PRIx64,
				page->size);
	}
}
#else
#define dump_guest_pages(dev)
#endif

static bool
vhost_memory_changed(struct VhostUserMemory *new,
		struct rte_vhost_memory *old)
{
	uint32_t i;

	if (new->nregions != old->nregions)
		return true;

	for (i = 0; i < new->nregions; ++i) {
		VhostUserMemoryRegion *new_r = &new->regions[i];
		struct rte_vhost_mem_region *old_r = &old->regions[i];

		if (new_r->guest_phys_addr != old_r->guest_phys_addr)
			return true;
		if (new_r->memory_size != old_r->size)
			return true;
		if (new_r->userspace_addr != old_r->guest_user_addr)
			return true;
	}

	return false;
}

#ifdef RTE_LIBRTE_VHOST_POSTCOPY
static int
vhost_user_postcopy_region_register(struct virtio_net *dev,
		struct rte_vhost_mem_region *reg)
{
	struct uffdio_register reg_struct;

	/*
	 * Let's register all the mmapped area to ensure
	 * alignment on page boundary.
	 */
	reg_struct.range.start = (uint64_t)(uintptr_t)reg->mmap_addr;
	reg_struct.range.len = reg->mmap_size;
	reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

	if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER,
				&reg_struct)) {
		VHOST_CONFIG_LOG(dev->ifname, ERR,
			"failed to register ufd for region "
			"%" PRIx64 " - %" PRIx64 " (ufd = %d) %s",
			(uint64_t)reg_struct.range.start,
			(uint64_t)reg_struct.range.start +
			(uint64_t)reg_struct.range.len - 1,
			dev->postcopy_ufd,
			strerror(errno));
		return -1;
	}

	VHOST_CONFIG_LOG(dev->ifname, INFO,
		"\t userfaultfd registered for range : %" PRIx64 " - %" PRIx64,
		(uint64_t)reg_struct.range.start,
		(uint64_t)reg_struct.range.start +
		(uint64_t)reg_struct.range.len - 1);

	return 0;
}
#else
static int
vhost_user_postcopy_region_register(struct virtio_net *dev __rte_unused,
		struct rte_vhost_mem_region *reg __rte_unused)
{
	return -1;
}
#endif

static int
vhost_user_postcopy_register(struct virtio_net *dev, int main_fd,
		struct vhu_msg_context *ctx)
{
	struct VhostUserMemory *memory;
	struct rte_vhost_mem_region *reg;
	struct vhu_msg_context ack_ctx;
	uint32_t i;

	if (!dev->postcopy_listening)
		return 0;

	/*
	 * We don't have a better way right now than sharing
	 * DPDK's virtual address with QEMU, so that QEMU can
	 * retrieve the region offset when handling userfaults.
	 */
	memory = &ctx->msg.payload.memory;
	for (i = 0; i < memory->nregions; i++) {
		reg = &dev->mem->regions[i];
		memory->regions[i].userspace_addr = reg->host_user_addr;
	}

	/* Send the addresses back to QEMU */
	ctx->fd_num = 0;
	send_vhost_reply(dev, main_fd, ctx);

	/*
	 * Wait for QEMU to acknowledge that it got the addresses;
	 * we have to wait before we're allowed to generate faults.
	 */
	if (read_vhost_message(dev, main_fd, &ack_ctx) <= 0) {
		VHOST_CONFIG_LOG(dev->ifname, ERR,
			"failed to read qemu ack on postcopy set-mem-table");
		return -1;
	}

	if (validate_msg_fds(dev, &ack_ctx, 0) != 0)
		return -1;

	if (ack_ctx.msg.request.frontend != VHOST_USER_SET_MEM_TABLE) {
		VHOST_CONFIG_LOG(dev->ifname, ERR,
			"bad qemu ack on postcopy set-mem-table (%d)",
			ack_ctx.msg.request.frontend);
		return -1;
	}

	/* Now register with userfaultfd so we can use the memory */
	for (i = 0; i < memory->nregions; i++) {
		reg = &dev->mem->regions[i];
		if (vhost_user_postcopy_region_register(dev, reg) < 0)
			return -1;
	}

	return 0;
}

static int
vhost_user_mmap_region(struct virtio_net *dev,
		struct rte_vhost_mem_region *region,
		uint64_t mmap_offset)
{
	void *mmap_addr;
	uint64_t mmap_size;
	uint64_t alignment;
	int populate;

	/* Check for memory_size + mmap_offset overflow */
	if (mmap_offset >= -region->size) {
		VHOST_CONFIG_LOG(dev->ifname, ERR,
			"mmap_offset (%#"PRIx64") and memory_size (%#"PRIx64") overflow",
			mmap_offset, region->size);
		return -1;
	}

	mmap_size = region->size + mmap_offset;

	/*
	 * Without the MAP_ANONYMOUS flag, mmap() must be called with a length
	 * aligned to the hugepage size on older long-term Linux kernels
	 * (e.g. 2.6.32 and 3.2.72), or it fails with EINVAL.
	 *
	 * To avoid that failure, the caller must keep the length aligned.
	 */
	alignment = get_blk_size(region->fd);
	if (alignment == (uint64_t)-1) {
		VHOST_CONFIG_LOG(dev->ifname, ERR, "couldn't get hugepage size through fstat");
		return -1;
	}
	mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment);
	if (mmap_size == 0) {
		/*
		 * This can happen if the initial mmap_size + alignment
		 * overflows uint64_t, which can happen if either mmap_size or
		 * alignment is wrong.
		 *
		 * The mmap() kernel implementation would return an error, but
		 * better catch it before and provide useful info in the logs.
		 */
		VHOST_CONFIG_LOG(dev->ifname, ERR,
			"mmap size (0x%" PRIx64 ") or alignment (0x%" PRIx64 ") is invalid",
			region->size + mmap_offset, alignment);
		return -1;
	}

	populate = dev->async_copy ? MAP_POPULATE : 0;
	mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
			MAP_SHARED | populate, region->fd, 0);

	if (mmap_addr == MAP_FAILED) {
		VHOST_CONFIG_LOG(dev->ifname, ERR, "mmap failed (%s).", strerror(errno));
		return -1;
	}

	region->mmap_addr = mmap_addr;
	region->mmap_size = mmap_size;
	region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset;
	mem_set_dump(dev, mmap_addr, mmap_size, false, alignment);

	if (dev->async_copy) {
		if (add_guest_pages(dev, region, alignment) < 0) {
			VHOST_CONFIG_LOG(dev->ifname, ERR,
				"adding guest pages to region failed.");
			return -1;
		}
	}

	VHOST_CONFIG_LOG(dev->ifname, INFO,
		"guest memory region size: 0x%" PRIx64,
		region->size);
	VHOST_CONFIG_LOG(dev->ifname, INFO,
		"\t guest physical addr: 0x%" PRIx64,
		region->guest_phys_addr);
	VHOST_CONFIG_LOG(dev->ifname, INFO,
		"\t guest virtual addr: 0x%" PRIx64,
		region->guest_user_addr);
	VHOST_CONFIG_LOG(dev->ifname, INFO,
		"\t host virtual addr: 0x%" PRIx64,
		region->host_user_addr);
	VHOST_CONFIG_LOG(dev->ifname, INFO,
		"\t mmap addr : 0x%" PRIx64,
		(uint64_t)(uintptr_t)mmap_addr);
	VHOST_CONFIG_LOG(dev->ifname, INFO,
		"\t mmap size : 0x%" PRIx64,
		mmap_size);
	VHOST_CONFIG_LOG(dev->ifname, INFO,
		"\t mmap align: 0x%" PRIx64,
		alignment);
	VHOST_CONFIG_LOG(dev->ifname, INFO,
		"\t mmap off : 0x%" PRIx64,
		mmap_offset);

	return 0;
}

static int
vhost_user_set_mem_table(struct virtio_net **pdev,
		struct vhu_msg_context *ctx,
		int main_fd)
{
	struct virtio_net *dev = *pdev;
	struct VhostUserMemory *memory = &ctx->msg.payload.memory;
	struct rte_vhost_mem_region *reg;
	int numa_node = SOCKET_ID_ANY;
	uint64_t mmap_offset;
	uint32_t i;
	bool async_notify = false;

	if (validate_msg_fds(dev, ctx, memory->nregions) != 0)
		return RTE_VHOST_MSG_RESULT_ERR;

	if (memory->nregions > VHOST_MEMORY_MAX_NREGIONS) {
		VHOST_CONFIG_LOG(dev->ifname, ERR,
			"too many memory regions (%u)",
			memory->nregions);
		goto close_msg_fds;
	}

	if (dev->mem && !vhost_memory_changed(memory, dev->mem)) {
		VHOST_CONFIG_LOG(dev->ifname, INFO, "memory regions not changed");

		close_msg_fds(ctx);

		return RTE_VHOST_MSG_RESULT_OK;
	}

	if (dev->mem) {
		if (dev->flags & VIRTIO_DEV_VDPA_CONFIGURED) {
			struct rte_vdpa_device *vdpa_dev = dev->vdpa_dev;

			if (vdpa_dev && vdpa_dev->ops->dev_close)
				vdpa_dev->ops->dev_close(dev->vid);
			dev->flags &= ~VIRTIO_DEV_VDPA_CONFIGURED;
		}

		/* notify the vhost application to stop DMA transfers */
		if (dev->async_copy && dev->notify_ops->vring_state_changed) {
			for (i = 0; i < dev->nr_vring; i++) {
				dev->notify_ops->vring_state_changed(dev->vid,
						i, 0);
			}
			async_notify = true;
		}

		/* Flush IOTLB cache as previous HVAs are now invalid */
		if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
			vhost_user_iotlb_flush_all(dev);

		free_mem_region(dev);
		rte_free(dev->mem);
		dev->mem = NULL;
	}

	/*
	 * If VQ 0 has already been allocated, try to allocate on the same
	 * NUMA node. It can be reallocated later in numa_realloc().
	 */
	if (dev->nr_vring > 0)
		numa_node = dev->virtqueue[0]->numa_node;

	dev->nr_guest_pages = 0;
	if (dev->guest_pages == NULL) {
		dev->max_guest_pages = 8;
		dev->guest_pages = rte_zmalloc_socket(NULL,
					dev->max_guest_pages *
					sizeof(struct guest_page),
					RTE_CACHE_LINE_SIZE,
					numa_node);
		if (dev->guest_pages == NULL) {
			VHOST_CONFIG_LOG(dev->ifname, ERR,
				"failed to allocate memory for dev->guest_pages");
			goto close_msg_fds;
		}
	}

	dev->mem = rte_zmalloc_socket("vhost-mem-table", sizeof(struct rte_vhost_memory) +
		sizeof(struct rte_vhost_mem_region) * memory->nregions, 0, numa_node);
	if (dev->mem == NULL) {
		VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to allocate memory for dev->mem");
		goto free_guest_pages;
	}

	for (i = 0; i < memory->nregions; i++) {
		reg = &dev->mem->regions[i];

		reg->guest_phys_addr = memory->regions[i].guest_phys_addr;
		reg->guest_user_addr = memory->regions[i].userspace_addr;
		reg->size = memory->regions[i].memory_size;
		reg->fd = ctx->fds[i];

		/*
		 * Assign invalid file descriptor value to avoid double
		 * closing on error path.
		 */
		ctx->fds[i] = -1;

		mmap_offset = memory->regions[i].mmap_offset;

		if (vhost_user_mmap_region(dev, reg, mmap_offset) < 0) {
			VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to mmap region %u", i);
			goto free_mem_table;
		}

		dev->mem->nregions++;
	}

	if (dev->async_copy && rte_vfio_is_enabled("vfio"))
		async_dma_map(dev, true);

	if (vhost_user_postcopy_register(dev, main_fd, ctx) < 0)
		goto free_mem_table;

	for (i = 0; i < dev->nr_vring; i++) {
		struct vhost_virtqueue *vq = dev->virtqueue[i];

		if (!vq)
			continue;

		if (vq->desc || vq->avail || vq->used) {
			/* vhost_user_lock_all_queue_pairs locked all qps */
			vq_assert_lock(dev, vq);

			/*
			 * If the memory table got updated, the ring addresses
			 * need to be translated again as virtual addresses have
			 * changed.
			 */
			vring_invalidate(dev, vq);

			translate_ring_addresses(&dev, &vq);
			*pdev = dev;
		}
	}

	dump_guest_pages(dev);

	if (async_notify) {
		for (i = 0; i < dev->nr_vring; i++)
			dev->notify_ops->vring_state_changed(dev->vid, i, 1);
	}

	return RTE_VHOST_MSG_RESULT_OK;

free_mem_table:
	free_mem_region(dev);
	rte_free(dev->mem);
	dev->mem = NULL;

free_guest_pages:
	rte_free(dev->guest_pages);
	dev->guest_pages = NULL;
close_msg_fds:
	close_msg_fds(ctx);
	return RTE_VHOST_MSG_RESULT_ERR;
}

static bool
vq_is_ready(struct virtio_net *dev, struct vhost_virtqueue *vq)
{
	bool rings_ok;

	if (!vq)
		return false;

	if (vq_is_packed(dev))
		rings_ok = vq->desc_packed && vq->driver_event &&
			vq->device_event;
	else
		rings_ok = vq->desc && vq->avail && vq->used;

	return rings_ok &&
	       vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD &&
	       vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD &&
	       vq->enabled;
}

#define VIRTIO_BUILTIN_NUM_VQS_TO_BE_READY 2u
#define VIRTIO_BLK_NUM_VQS_TO_BE_READY 1u

static int
virtio_is_ready(struct virtio_net *dev)
{
	struct rte_vdpa_device *vdpa_dev;
	struct vhost_virtqueue *vq;
	uint32_t vdpa_type;
	uint32_t i, nr_vring = dev->nr_vring;

	if (dev->flags & VIRTIO_DEV_READY)
		return 1;

	if (!dev->nr_vring)
		return 0;

	vdpa_dev = dev->vdpa_dev;
	if (vdpa_dev)
		vdpa_type = vdpa_dev->type;
	else
		vdpa_type = -1;

	if (vdpa_type == RTE_VHOST_VDPA_DEVICE_TYPE_BLK) {
		nr_vring = VIRTIO_BLK_NUM_VQS_TO_BE_READY;
	} else {
		if (dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET)
			nr_vring = VIRTIO_BUILTIN_NUM_VQS_TO_BE_READY;
	}

	if (dev->nr_vring < nr_vring)
		return 0;

	for (i = 0; i < nr_vring; i++) {
		vq = dev->virtqueue[i];

		if (!vq_is_ready(dev, vq))
			return 0;
	}

	/* If supported, ensure the frontend is really done with config */
	if (dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_STATUS))
		if (!(dev->status & VIRTIO_DEVICE_STATUS_DRIVER_OK))
			return 0;

	dev->flags |= VIRTIO_DEV_READY;

	if (!(dev->flags & VIRTIO_DEV_RUNNING))
		VHOST_CONFIG_LOG(dev->ifname, INFO, "virtio is now ready for processing.");
	return 1;
}

static void *
inflight_mem_alloc(struct virtio_net *dev, const char *name, size_t size, int *fd)
{
	void *ptr;
	int mfd = -1;
	uint64_t alignment;
	char fname[20] = "/tmp/memfd-XXXXXX";

	*fd = -1;
#ifdef MEMFD_SUPPORTED
	mfd = memfd_create(name, MFD_CLOEXEC);
#else
	RTE_SET_USED(name);
#endif
	if (mfd == -1) {
		mfd = mkstemp(fname);
		if (mfd == -1) {
			VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to get inflight buffer fd");
			return NULL;
		}

		unlink(fname);
	}

	if (ftruncate(mfd, size) == -1) {
		VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to alloc inflight buffer");
		close(mfd);
		return NULL;
	}

	ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, mfd, 0);
	if (ptr == MAP_FAILED) {
		VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to mmap inflight buffer");
		close(mfd);
		return NULL;
	}

	alignment = get_blk_size(mfd);
	mem_set_dump(dev, ptr, size, false, alignment);
	*fd = mfd;
	return ptr;
}
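
/*
 * Size of the per-virtqueue inflight tracking area shared with the frontend,
 * rounded up to INFLIGHT_ALIGNMENT.
 */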
static uint32_t
get_pervq_shm_size_split(uint16_t queue_size)
{
	return RTE_ALIGN_MUL_CEIL(sizeof(struct rte_vhost_inflight_desc_split) *
				  queue_size + sizeof(uint64_t) +
				  sizeof(uint16_t) * 4, INFLIGHT_ALIGNMENT);
}

static uint32_t
get_pervq_shm_size_packed(uint16_t queue_size)
{
	return RTE_ALIGN_MUL_CEIL(sizeof(struct rte_vhost_inflight_desc_packed)
				  * queue_size + sizeof(uint64_t) +
				  sizeof(uint16_t) * 6 + sizeof(uint8_t) * 9,
				  INFLIGHT_ALIGNMENT);
}

static int
vhost_user_get_inflight_fd(struct virtio_net **pdev,
			struct vhu_msg_context *ctx,
			int main_fd __rte_unused)
{
	struct rte_vhost_inflight_info_packed *inflight_packed;
	uint64_t pervq_inflight_size, mmap_size;
	uint16_t num_queues, queue_size;
	struct virtio_net *dev = *pdev;
	int fd, i, j;
	int numa_node = SOCKET_ID_ANY;
	void *addr;

	if (ctx->msg.size != sizeof(ctx->msg.payload.inflight)) {
		VHOST_CONFIG_LOG(dev->ifname, ERR,
			"invalid get_inflight_fd message size is %d",
			ctx->msg.size);
		return RTE_VHOST_MSG_RESULT_ERR;
	}

	/*
	 * If VQ 0 has already been allocated, try to allocate on the same
	 * NUMA node. It can be reallocated later in numa_realloc().
	 */
	if (dev->nr_vring > 0)
		numa_node = dev->virtqueue[0]->numa_node;

	if (dev->inflight_info == NULL) {
		dev->inflight_info = rte_zmalloc_socket("inflight_info",
				sizeof(struct inflight_mem_info), 0, numa_node);
		if (!dev->inflight_info) {
			VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to alloc dev inflight area");
			return RTE_VHOST_MSG_RESULT_ERR;
		}
		dev->inflight_info->fd = -1;
	}

	num_queues = ctx->msg.payload.inflight.num_queues;
	queue_size = ctx->msg.payload.inflight.queue_size;

	VHOST_CONFIG_LOG(dev->ifname, INFO,
		"get_inflight_fd num_queues: %u",
		ctx->msg.payload.inflight.num_queues);
	VHOST_CONFIG_LOG(dev->ifname, INFO,
		"get_inflight_fd queue_size: %u",
		ctx->msg.payload.inflight.queue_size);

	if (vq_is_packed(dev))
		pervq_inflight_size = get_pervq_shm_size_packed(queue_size);
	else
		pervq_inflight_size = get_pervq_shm_size_split(queue_size);

	mmap_size = num_queues * pervq_inflight_size;
	addr = inflight_mem_alloc(dev, "vhost-inflight", mmap_size, &fd);
	if (!addr) {
		VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to alloc vhost inflight area");
		ctx->msg.payload.inflight.mmap_size = 0;
		return RTE_VHOST_MSG_RESULT_ERR;
	}
	memset(addr, 0, mmap_size);

	if (dev->inflight_info->addr) {
		munmap(dev->inflight_info->addr, dev->inflight_info->size);
		dev->inflight_info->addr = NULL;
	}

	if (dev->inflight_info->fd >= 0) {
		close(dev->inflight_info->fd);
		dev->inflight_info->fd = -1;
	}

	dev->inflight_info->addr = addr;
	dev->inflight_info->size = ctx->msg.payload.inflight.mmap_size = mmap_size;
	dev->inflight_info->fd = ctx->fds[0] = fd;
	ctx->msg.payload.inflight.mmap_offset = 0;
	ctx->fd_num = 1;

	if (vq_is_packed(dev)) {
		for (i = 0; i < num_queues; i++) {
			inflight_packed =
				(struct rte_vhost_inflight_info_packed *)addr;
			inflight_packed->used_wrap_counter = 1;
			inflight_packed->old_used_wrap_counter = 1;
			for (j = 0; j < queue_size; j++)
				inflight_packed->desc[j].next = j + 1;
			addr = (void *)((char *)addr + pervq_inflight_size);
		}
	}

	VHOST_CONFIG_LOG(dev->ifname, INFO,
		"send inflight mmap_size: %"PRIu64,
		ctx->msg.payload.inflight.mmap_size);
	VHOST_CONFIG_LOG(dev->ifname, INFO,
		"send inflight mmap_offset: %"PRIu64,
		ctx->msg.payload.inflight.mmap_offset);
	VHOST_CONFIG_LOG(dev->ifname, INFO,
		"send inflight fd: %d", ctx->fds[0]);

	return RTE_VHOST_MSG_RESULT_REPLY;
}

static int
vhost_user_set_inflight_fd(struct virtio_net **pdev,
			struct vhu_msg_context *ctx,
			int main_fd __rte_unused)
{
	uint64_t mmap_size, mmap_offset;
	uint16_t num_queues, queue_size;
	struct virtio_net *dev = *pdev;
	uint32_t pervq_inflight_size;
	struct vhost_virtqueue *vq;
	void *addr;
	int fd, i;
	int numa_node = SOCKET_ID_ANY;

	if (validate_msg_fds(dev, ctx, 1) != 0)
		return RTE_VHOST_MSG_RESULT_ERR;

	fd = ctx->fds[0];
	if (ctx->msg.size != sizeof(ctx->msg.payload.inflight) || fd < 0) {
		VHOST_CONFIG_LOG(dev->ifname, ERR,
			"invalid set_inflight_fd message size is %d, fd is %d",
			ctx->msg.size, fd);
		return RTE_VHOST_MSG_RESULT_ERR;
	}

	mmap_size = ctx->msg.payload.inflight.mmap_size;
	mmap_offset = ctx->msg.payload.inflight.mmap_offset;
	num_queues = ctx->msg.payload.inflight.num_queues;
	queue_size = ctx->msg.payload.inflight.queue_size;

	if (vq_is_packed(dev))
		pervq_inflight_size = get_pervq_shm_size_packed(queue_size);
	else
		pervq_inflight_size = get_pervq_shm_size_split(queue_size);

	VHOST_CONFIG_LOG(dev->ifname, INFO, "set_inflight_fd mmap_size: %"PRIu64, mmap_size);
	VHOST_CONFIG_LOG(dev->ifname, INFO,
		"set_inflight_fd mmap_offset: %"PRIu64,
		mmap_offset);
	VHOST_CONFIG_LOG(dev->ifname, INFO,
		"set_inflight_fd num_queues: %u",
		num_queues);
	VHOST_CONFIG_LOG(dev->ifname, INFO,
		"set_inflight_fd queue_size: %u",
		queue_size);
	VHOST_CONFIG_LOG(dev->ifname, INFO,
		"set_inflight_fd fd: %d",
		fd);
	VHOST_CONFIG_LOG(dev->ifname, INFO,
		"set_inflight_fd pervq_inflight_size: %d",
		pervq_inflight_size);

	/*
	 * If VQ 0 has already been allocated, try to allocate on the same
	 * NUMA node. It can be reallocated later in numa_realloc().
	 */
	if (dev->nr_vring > 0)
		numa_node = dev->virtqueue[0]->numa_node;

	if (!dev->inflight_info) {
		dev->inflight_info = rte_zmalloc_socket("inflight_info",
				sizeof(struct inflight_mem_info), 0, numa_node);
		if (dev->inflight_info == NULL) {
			VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to alloc dev inflight area");
			return RTE_VHOST_MSG_RESULT_ERR;
		}
		dev->inflight_info->fd = -1;
	}

	if (dev->inflight_info->addr) {
		munmap(dev->inflight_info->addr, dev->inflight_info->size);
		dev->inflight_info->addr = NULL;
	}

	addr = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
		    fd, mmap_offset);
	if (addr == MAP_FAILED) {
		VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to mmap shared memory.");
		return RTE_VHOST_MSG_RESULT_ERR;
	}

	if (dev->inflight_info->fd >= 0) {
		close(dev->inflight_info->fd);
		dev->inflight_info->fd = -1;
	}

	mem_set_dump(dev, addr, mmap_size, false, get_blk_size(fd));
	dev->inflight_info->fd = fd;
	dev->inflight_info->addr = addr;
	dev->inflight_info->size = mmap_size;

	for (i = 0; i < num_queues; i++) {
		vq = dev->virtqueue[i];
		if (!vq)
			continue;

		if (vq_is_packed(dev)) {
			vq->inflight_packed = addr;
			vq->inflight_packed->desc_num = queue_size;
		} else {
			vq->inflight_split = addr;
			vq->inflight_split->desc_num = queue_size;
		}
		addr = (void *)((char *)addr + pervq_inflight_size);
	}

	return RTE_VHOST_MSG_RESULT_OK;
}

static int
vhost_user_set_vring_call(struct virtio_net **pdev,
			struct vhu_msg_context *ctx,
			int main_fd __rte_unused)
{
	struct virtio_net *dev = *pdev;
	struct vhost_vring_file file;
	struct vhost_virtqueue *vq;
	int expected_fds;

	expected_fds = (ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK) ? 0 : 1;
	if (validate_msg_fds(dev, ctx, expected_fds) != 0)
		return RTE_VHOST_MSG_RESULT_ERR;

	file.index = ctx->msg.payload.u64 & VHOST_USER_VRING_IDX_MASK;
	if (ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK)
		file.fd = VIRTIO_INVALID_EVENTFD;
	else
		file.fd = ctx->fds[0];
	VHOST_CONFIG_LOG(dev->ifname, INFO,
		"vring call idx:%d file:%d",
		file.index, file.fd);

	vq = dev->virtqueue[file.index];

	if (vq->ready) {
		vq->ready = false;
		vhost_user_notify_queue_state(dev, vq, 0);
	}

	if (vq->callfd >= 0)
		close(vq->callfd);

	vq->callfd = file.fd;

	return RTE_VHOST_MSG_RESULT_OK;
}

static int vhost_user_set_vring_err(struct virtio_net **pdev,
			struct vhu_msg_context *ctx,
			int main_fd __rte_unused)
{
	struct virtio_net *dev = *pdev;
	int expected_fds;

	expected_fds = (ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK) ? 0 : 1;
	if (validate_msg_fds(dev, ctx, expected_fds) != 0)
		return RTE_VHOST_MSG_RESULT_ERR;

	if (!(ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK))
		close(ctx->fds[0]);
	VHOST_CONFIG_LOG(dev->ifname, DEBUG, "not implemented");

	return RTE_VHOST_MSG_RESULT_OK;
}
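
/* Sort inflight descriptors so that the one with the highest counter comes first. */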
static int
resubmit_desc_compare(const void *a, const void *b)
{
	const struct rte_vhost_resubmit_desc *desc0 = a;
	const struct rte_vhost_resubmit_desc *desc1 = b;

	if (desc1->counter > desc0->counter)
		return 1;

	return -1;
}

static int
vhost_check_queue_inflights_split(struct virtio_net *dev,
		struct vhost_virtqueue *vq)
{
	uint16_t i;
	uint16_t resubmit_num = 0, last_io, num;
	struct vring_used *used = vq->used;
	struct rte_vhost_resubmit_info *resubmit;
	struct rte_vhost_inflight_info_split *inflight_split;

	if (!(dev->protocol_features &
	    (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))
		return RTE_VHOST_MSG_RESULT_OK;

	/*
	 * The frontend may still not support the inflight feature
	 * although we negotiated the protocol feature.
	 */
	if ((!vq->inflight_split))
		return RTE_VHOST_MSG_RESULT_OK;

	if (!vq->inflight_split->version) {
		vq->inflight_split->version = INFLIGHT_VERSION;
		return RTE_VHOST_MSG_RESULT_OK;
	}

	if (vq->resubmit_inflight)
		return RTE_VHOST_MSG_RESULT_OK;

	inflight_split = vq->inflight_split;
	vq->global_counter = 0;
	last_io = inflight_split->last_inflight_io;

	if (inflight_split->used_idx != used->idx) {
		inflight_split->desc[last_io].inflight = 0;
		rte_atomic_thread_fence(rte_memory_order_seq_cst);
		inflight_split->used_idx = used->idx;
	}

	for (i = 0; i < inflight_split->desc_num; i++) {
		if (inflight_split->desc[i].inflight == 1)
			resubmit_num++;
	}

	vq->last_avail_idx += resubmit_num;

	if (resubmit_num) {
		resubmit = rte_zmalloc_socket("resubmit", sizeof(struct rte_vhost_resubmit_info),
				0, vq->numa_node);
		if (!resubmit) {
			VHOST_CONFIG_LOG(dev->ifname, ERR,
				"failed to allocate memory for resubmit info.");
			return RTE_VHOST_MSG_RESULT_ERR;
		}

		resubmit->resubmit_list = rte_zmalloc_socket("resubmit_list",
				resubmit_num * sizeof(struct rte_vhost_resubmit_desc),
				0, vq->numa_node);
		if (!resubmit->resubmit_list) {
			VHOST_CONFIG_LOG(dev->ifname, ERR,
				"failed to allocate memory for inflight desc.");
			rte_free(resubmit);
			return RTE_VHOST_MSG_RESULT_ERR;
		}

		num = 0;
		for (i = 0; i < vq->inflight_split->desc_num; i++) {
			if (vq->inflight_split->desc[i].inflight == 1) {
				resubmit->resubmit_list[num].index = i;
				resubmit->resubmit_list[num].counter =
					inflight_split->desc[i].counter;
				num++;
			}
		}
		resubmit->resubmit_num = num;

		if (resubmit->resubmit_num > 1)
			qsort(resubmit->resubmit_list, resubmit->resubmit_num,
				sizeof(struct rte_vhost_resubmit_desc),
				resubmit_desc_compare);

		vq->global_counter = resubmit->resubmit_list[0].counter + 1;
		vq->resubmit_inflight = resubmit;
	}

	return RTE_VHOST_MSG_RESULT_OK;
}

static int
vhost_check_queue_inflights_packed(struct virtio_net *dev,
		struct vhost_virtqueue *vq)
{
	uint16_t i;
	uint16_t resubmit_num = 0, old_used_idx, num;
	struct rte_vhost_resubmit_info *resubmit;
struct rte_vhost_inflight_info_packed *inflight_packed; 1986 1987 if (!(dev->protocol_features & 1988 (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))) 1989 return RTE_VHOST_MSG_RESULT_OK; 1990 1991 /* The frontend may still not support the inflight feature 1992 * although we negotiate the protocol feature. 1993 */ 1994 if ((!vq->inflight_packed)) 1995 return RTE_VHOST_MSG_RESULT_OK; 1996 1997 if (!vq->inflight_packed->version) { 1998 vq->inflight_packed->version = INFLIGHT_VERSION; 1999 return RTE_VHOST_MSG_RESULT_OK; 2000 } 2001 2002 if (vq->resubmit_inflight) 2003 return RTE_VHOST_MSG_RESULT_OK; 2004 2005 inflight_packed = vq->inflight_packed; 2006 vq->global_counter = 0; 2007 old_used_idx = inflight_packed->old_used_idx; 2008 2009 if (inflight_packed->used_idx != old_used_idx) { 2010 if (inflight_packed->desc[old_used_idx].inflight == 0) { 2011 inflight_packed->old_used_idx = 2012 inflight_packed->used_idx; 2013 inflight_packed->old_used_wrap_counter = 2014 inflight_packed->used_wrap_counter; 2015 inflight_packed->old_free_head = 2016 inflight_packed->free_head; 2017 } else { 2018 inflight_packed->used_idx = 2019 inflight_packed->old_used_idx; 2020 inflight_packed->used_wrap_counter = 2021 inflight_packed->old_used_wrap_counter; 2022 inflight_packed->free_head = 2023 inflight_packed->old_free_head; 2024 } 2025 } 2026 2027 for (i = 0; i < inflight_packed->desc_num; i++) { 2028 if (inflight_packed->desc[i].inflight == 1) 2029 resubmit_num++; 2030 } 2031 2032 if (resubmit_num) { 2033 resubmit = rte_zmalloc_socket("resubmit", sizeof(struct rte_vhost_resubmit_info), 2034 0, vq->numa_node); 2035 if (resubmit == NULL) { 2036 VHOST_CONFIG_LOG(dev->ifname, ERR, 2037 "failed to allocate memory for resubmit info."); 2038 return RTE_VHOST_MSG_RESULT_ERR; 2039 } 2040 2041 resubmit->resubmit_list = rte_zmalloc_socket("resubmit_list", 2042 resubmit_num * sizeof(struct rte_vhost_resubmit_desc), 2043 0, vq->numa_node); 2044 if (resubmit->resubmit_list == NULL) { 2045 VHOST_CONFIG_LOG(dev->ifname, ERR, 2046 "failed to allocate memory for resubmit desc."); 2047 rte_free(resubmit); 2048 return RTE_VHOST_MSG_RESULT_ERR; 2049 } 2050 2051 num = 0; 2052 for (i = 0; i < inflight_packed->desc_num; i++) { 2053 if (vq->inflight_packed->desc[i].inflight == 1) { 2054 resubmit->resubmit_list[num].index = i; 2055 resubmit->resubmit_list[num].counter = 2056 inflight_packed->desc[i].counter; 2057 num++; 2058 } 2059 } 2060 resubmit->resubmit_num = num; 2061 2062 if (resubmit->resubmit_num > 1) 2063 qsort(resubmit->resubmit_list, resubmit->resubmit_num, 2064 sizeof(struct rte_vhost_resubmit_desc), 2065 resubmit_desc_compare); 2066 2067 vq->global_counter = resubmit->resubmit_list[0].counter + 1; 2068 vq->resubmit_inflight = resubmit; 2069 } 2070 2071 return RTE_VHOST_MSG_RESULT_OK; 2072 } 2073 2074 static int 2075 vhost_user_set_vring_kick(struct virtio_net **pdev, 2076 struct vhu_msg_context *ctx, 2077 int main_fd __rte_unused) 2078 { 2079 struct virtio_net *dev = *pdev; 2080 struct vhost_vring_file file; 2081 struct vhost_virtqueue *vq; 2082 int expected_fds; 2083 2084 expected_fds = (ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK) ? 
0 : 1; 2085 if (validate_msg_fds(dev, ctx, expected_fds) != 0) 2086 return RTE_VHOST_MSG_RESULT_ERR; 2087 2088 file.index = ctx->msg.payload.u64 & VHOST_USER_VRING_IDX_MASK; 2089 if (ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK) 2090 file.fd = VIRTIO_INVALID_EVENTFD; 2091 else 2092 file.fd = ctx->fds[0]; 2093 VHOST_CONFIG_LOG(dev->ifname, INFO, 2094 "vring kick idx:%d file:%d", 2095 file.index, file.fd); 2096 2097 /* Interpret ring addresses only when ring is started. */ 2098 vq = dev->virtqueue[file.index]; 2099 translate_ring_addresses(&dev, &vq); 2100 *pdev = dev; 2101 2102 /* 2103 * When VHOST_USER_F_PROTOCOL_FEATURES is not negotiated, 2104 * the ring starts already enabled. Otherwise, it is enabled via 2105 * the SET_VRING_ENABLE message. 2106 */ 2107 if (!(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) { 2108 vq->enabled = true; 2109 } 2110 2111 if (vq->ready) { 2112 vq->ready = false; 2113 vhost_user_notify_queue_state(dev, vq, 0); 2114 } 2115 2116 if (vq->kickfd >= 0) 2117 close(vq->kickfd); 2118 vq->kickfd = file.fd; 2119 2120 if (vq_is_packed(dev)) { 2121 if (vhost_check_queue_inflights_packed(dev, vq)) { 2122 VHOST_CONFIG_LOG(dev->ifname, ERR, 2123 "failed to check inflights for vq: %d", 2124 file.index); 2125 return RTE_VHOST_MSG_RESULT_ERR; 2126 } 2127 } else { 2128 if (vhost_check_queue_inflights_split(dev, vq)) { 2129 VHOST_CONFIG_LOG(dev->ifname, ERR, 2130 "failed to check inflights for vq: %d", 2131 file.index); 2132 return RTE_VHOST_MSG_RESULT_ERR; 2133 } 2134 } 2135 2136 return RTE_VHOST_MSG_RESULT_OK; 2137 } 2138 2139 /* 2140 * when virtio is stopped, qemu will send us the GET_VRING_BASE message. 2141 */ 2142 static int 2143 vhost_user_get_vring_base(struct virtio_net **pdev, 2144 struct vhu_msg_context *ctx, 2145 int main_fd __rte_unused) 2146 { 2147 struct virtio_net *dev = *pdev; 2148 struct vhost_virtqueue *vq = dev->virtqueue[ctx->msg.payload.state.index]; 2149 uint64_t val; 2150 2151 /* We have to stop the queue (virtio) if it is running. */ 2152 vhost_destroy_device_notify(dev); 2153 2154 dev->flags &= ~VIRTIO_DEV_READY; 2155 dev->flags &= ~VIRTIO_DEV_VDPA_CONFIGURED; 2156 2157 /* Here we are safe to get the indexes */ 2158 if (vq_is_packed(dev)) { 2159 /* 2160 * Bit[0:14]: avail index 2161 * Bit[15]: avail wrap counter 2162 */ 2163 val = vq->last_avail_idx & 0x7fff; 2164 val |= vq->avail_wrap_counter << 15; 2165 ctx->msg.payload.state.num = val; 2166 } else { 2167 ctx->msg.payload.state.num = vq->last_avail_idx; 2168 } 2169 2170 VHOST_CONFIG_LOG(dev->ifname, INFO, 2171 "vring base idx:%d num:%d", 2172 ctx->msg.payload.state.index, ctx->msg.payload.state.num); 2173 /* 2174 * Based on the current qemu vhost-user implementation, this message is 2175 * only sent in vhost_vring_stop. 2176 * TODO: clean up the vring; it isn't usable from this point on. 
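 *
 * For packed rings, payload.state.num replied above packs the last avail
 * index and the avail wrap counter into a single value. A minimal decode
 * sketch for the receiving side, mirroring the Bit[0:14]/Bit[15] layout
 * documented at the encoding above (illustrative only, not part of this
 * handler's logic):
 *
 *	unsigned int num = ctx->msg.payload.state.num;
 *	uint16_t last_avail_idx = num & 0x7fff;
 *	bool avail_wrap_counter = !!(num & (1 << 15));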
2177 */ 2178 if (vq->kickfd >= 0) 2179 close(vq->kickfd); 2180 2181 vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD; 2182 2183 if (vq->callfd >= 0) 2184 close(vq->callfd); 2185 2186 vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD; 2187 2188 vq->signalled_used_valid = false; 2189 2190 if (vq_is_packed(dev)) { 2191 rte_free(vq->shadow_used_packed); 2192 vq->shadow_used_packed = NULL; 2193 } else { 2194 rte_free(vq->shadow_used_split); 2195 vq->shadow_used_split = NULL; 2196 } 2197 2198 rte_free(vq->batch_copy_elems); 2199 vq->batch_copy_elems = NULL; 2200 2201 rte_free(vq->log_cache); 2202 vq->log_cache = NULL; 2203 2204 ctx->msg.size = sizeof(ctx->msg.payload.state); 2205 ctx->fd_num = 0; 2206 2207 vhost_user_iotlb_flush_all(dev); 2208 2209 rte_rwlock_write_lock(&vq->access_lock); 2210 vring_invalidate(dev, vq); 2211 rte_rwlock_write_unlock(&vq->access_lock); 2212 2213 return RTE_VHOST_MSG_RESULT_REPLY; 2214 } 2215 2216 /* 2217 * when virtio queues are ready to work, qemu will send us to 2218 * enable the virtio queue pair. 2219 */ 2220 static int 2221 vhost_user_set_vring_enable(struct virtio_net **pdev, 2222 struct vhu_msg_context *ctx, 2223 int main_fd __rte_unused) 2224 { 2225 struct virtio_net *dev = *pdev; 2226 struct vhost_virtqueue *vq; 2227 bool enable = !!ctx->msg.payload.state.num; 2228 int index = (int)ctx->msg.payload.state.index; 2229 2230 VHOST_CONFIG_LOG(dev->ifname, INFO, 2231 "set queue enable: %d to qp idx: %d", 2232 enable, index); 2233 2234 vq = dev->virtqueue[index]; 2235 if (!(dev->flags & VIRTIO_DEV_VDPA_CONFIGURED)) { 2236 /* vhost_user_lock_all_queue_pairs locked all qps */ 2237 vq_assert_lock(dev, vq); 2238 if (enable && vq->async && vq->async->pkts_inflight_n) { 2239 VHOST_CONFIG_LOG(dev->ifname, ERR, 2240 "failed to enable vring. Inflight packets must be completed first"); 2241 return RTE_VHOST_MSG_RESULT_ERR; 2242 } 2243 } 2244 2245 vq->enabled = enable; 2246 2247 return RTE_VHOST_MSG_RESULT_OK; 2248 } 2249 2250 static int 2251 vhost_user_get_protocol_features(struct virtio_net **pdev, 2252 struct vhu_msg_context *ctx, 2253 int main_fd __rte_unused) 2254 { 2255 struct virtio_net *dev = *pdev; 2256 uint64_t features, protocol_features; 2257 2258 rte_vhost_driver_get_features(dev->ifname, &features); 2259 rte_vhost_driver_get_protocol_features(dev->ifname, &protocol_features); 2260 2261 ctx->msg.payload.u64 = protocol_features; 2262 ctx->msg.size = sizeof(ctx->msg.payload.u64); 2263 ctx->fd_num = 0; 2264 2265 return RTE_VHOST_MSG_RESULT_REPLY; 2266 } 2267 2268 static int 2269 vhost_user_set_protocol_features(struct virtio_net **pdev, 2270 struct vhu_msg_context *ctx, 2271 int main_fd __rte_unused) 2272 { 2273 struct virtio_net *dev = *pdev; 2274 uint64_t protocol_features = ctx->msg.payload.u64; 2275 uint64_t backend_protocol_features = 0; 2276 2277 rte_vhost_driver_get_protocol_features(dev->ifname, 2278 &backend_protocol_features); 2279 if (protocol_features & ~backend_protocol_features) { 2280 VHOST_CONFIG_LOG(dev->ifname, ERR, "received invalid protocol features."); 2281 return RTE_VHOST_MSG_RESULT_ERR; 2282 } 2283 2284 dev->protocol_features = protocol_features; 2285 VHOST_CONFIG_LOG(dev->ifname, INFO, 2286 "negotiated Vhost-user protocol features: 0x%" PRIx64, 2287 dev->protocol_features); 2288 2289 return RTE_VHOST_MSG_RESULT_OK; 2290 } 2291 2292 static int 2293 vhost_user_set_log_base(struct virtio_net **pdev, 2294 struct vhu_msg_context *ctx, 2295 int main_fd __rte_unused) 2296 { 2297 struct virtio_net *dev = *pdev; 2298 int fd = ctx->fds[0]; 2299 uint64_t size, 
off; 2300 uint64_t alignment; 2301 void *addr; 2302 uint32_t i; 2303 2304 if (validate_msg_fds(dev, ctx, 1) != 0) 2305 return RTE_VHOST_MSG_RESULT_ERR; 2306 2307 if (fd < 0) { 2308 VHOST_CONFIG_LOG(dev->ifname, ERR, "invalid log fd: %d", fd); 2309 return RTE_VHOST_MSG_RESULT_ERR; 2310 } 2311 2312 if (ctx->msg.size != sizeof(VhostUserLog)) { 2313 VHOST_CONFIG_LOG(dev->ifname, ERR, 2314 "invalid log base msg size: %"PRId32" != %d", 2315 ctx->msg.size, (int)sizeof(VhostUserLog)); 2316 goto close_msg_fds; 2317 } 2318 2319 size = ctx->msg.payload.log.mmap_size; 2320 off = ctx->msg.payload.log.mmap_offset; 2321 2322 /* Check for mmap size and offset overflow. */ 2323 if (off >= -size) { 2324 VHOST_CONFIG_LOG(dev->ifname, ERR, 2325 "log offset %#"PRIx64" and log size %#"PRIx64" overflow", 2326 off, size); 2327 goto close_msg_fds; 2328 } 2329 2330 VHOST_CONFIG_LOG(dev->ifname, INFO, 2331 "log mmap size: %"PRId64", offset: %"PRId64, 2332 size, off); 2333 2334 /* 2335 * mmap from 0 to work around a hugepage mmap bug: mmap will 2336 * fail when the offset is not page size aligned. 2337 */ 2338 addr = mmap(0, size + off, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); 2339 alignment = get_blk_size(fd); 2340 close(fd); 2341 if (addr == MAP_FAILED) { 2342 VHOST_CONFIG_LOG(dev->ifname, ERR, "mmap log base failed!"); 2343 return RTE_VHOST_MSG_RESULT_ERR; 2344 } 2345 2346 /* 2347 * Free the previously mapped log memory in case 2348 * VHOST_USER_SET_LOG_BASE is received multiple times. 2349 */ 2350 if (dev->log_addr) { 2351 munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); 2352 } 2353 dev->log_addr = (uint64_t)(uintptr_t)addr; 2354 dev->log_base = dev->log_addr + off; 2355 dev->log_size = size; 2356 mem_set_dump(dev, addr, size + off, false, alignment); 2357 2358 for (i = 0; i < dev->nr_vring; i++) { 2359 struct vhost_virtqueue *vq = dev->virtqueue[i]; 2360 2361 rte_free(vq->log_cache); 2362 vq->log_cache = NULL; 2363 vq->log_cache_nb_elem = 0; 2364 vq->log_cache = rte_malloc_socket("vq log cache", 2365 sizeof(struct log_cache_entry) * VHOST_LOG_CACHE_NR, 2366 0, vq->numa_node); 2367 /* 2368 * If log cache allocation fails, don't fail migration, but no 2369 * caching will be done, which will impact performance. 2370 */ 2371 if (!vq->log_cache) 2372 VHOST_CONFIG_LOG(dev->ifname, ERR, 2373 "failed to allocate VQ logging cache"); 2374 } 2375 2376 /* 2377 * The spec is not clear about it (yet), but QEMU doesn't expect 2378 * any payload in the reply. 2379 */ 2380 ctx->msg.size = 0; 2381 ctx->fd_num = 0; 2382 2383 return RTE_VHOST_MSG_RESULT_REPLY; 2384 2385 close_msg_fds: 2386 close_msg_fds(ctx); 2387 return RTE_VHOST_MSG_RESULT_ERR; 2388 } 2389 2390 static int vhost_user_set_log_fd(struct virtio_net **pdev, 2391 struct vhu_msg_context *ctx, 2392 int main_fd __rte_unused) 2393 { 2394 struct virtio_net *dev = *pdev; 2395 2396 if (validate_msg_fds(dev, ctx, 1) != 0) 2397 return RTE_VHOST_MSG_RESULT_ERR; 2398 2399 close(ctx->fds[0]); 2400 VHOST_CONFIG_LOG(dev->ifname, DEBUG, "not implemented."); 2401 2402 return RTE_VHOST_MSG_RESULT_OK; 2403 } 2404 2405 /* 2406 * A RARP packet is constructed and broadcast to notify switches about 2407 * the new location of the migrated VM, so that packets from outside will 2408 * not be lost after migration. 2409 * 2410 * However, we don't actually "send" a RARP packet here; instead, we set 2411 * the 'broadcast_rarp' flag to let rte_vhost_dequeue_burst() inject it. 
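 *
 * A minimal sketch of the consuming side (illustrative only; the real code
 * lives in the dequeue path, not in this file, and 'mbuf_pool'/'rarp_mbuf'
 * are placeholders). The acquire load pairs with the release store below so
 * the MAC is observed before the packet is built; assuming a single dequeue
 * thread per queue, the flag can simply be cleared afterwards:
 *
 *	if (rte_atomic_load_explicit(&dev->broadcast_rarp,
 *			rte_memory_order_acquire)) {
 *		rte_atomic_store_explicit(&dev->broadcast_rarp, 0,
 *				rte_memory_order_relaxed);
 *		rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac);
 *	}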
2412 */ 2413 static int 2414 vhost_user_send_rarp(struct virtio_net **pdev, 2415 struct vhu_msg_context *ctx, 2416 int main_fd __rte_unused) 2417 { 2418 struct virtio_net *dev = *pdev; 2419 uint8_t *mac = (uint8_t *)&ctx->msg.payload.u64; 2420 struct rte_vdpa_device *vdpa_dev; 2421 2422 VHOST_CONFIG_LOG(dev->ifname, DEBUG, 2423 "MAC: " RTE_ETHER_ADDR_PRT_FMT, 2424 mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); 2425 memcpy(dev->mac.addr_bytes, mac, 6); 2426 2427 /* 2428 * Set the flag to inject a RARP broadcast packet at 2429 * rte_vhost_dequeue_burst(). 2430 * 2431 * rte_memory_order_release ordering is for making sure the mac is 2432 * copied before the flag is set. 2433 */ 2434 rte_atomic_store_explicit(&dev->broadcast_rarp, 1, rte_memory_order_release); 2435 vdpa_dev = dev->vdpa_dev; 2436 if (vdpa_dev && vdpa_dev->ops->migration_done) 2437 vdpa_dev->ops->migration_done(dev->vid); 2438 2439 return RTE_VHOST_MSG_RESULT_OK; 2440 } 2441 2442 static int 2443 vhost_user_net_set_mtu(struct virtio_net **pdev, 2444 struct vhu_msg_context *ctx, 2445 int main_fd __rte_unused) 2446 { 2447 struct virtio_net *dev = *pdev; 2448 2449 if (ctx->msg.payload.u64 < VIRTIO_MIN_MTU || 2450 ctx->msg.payload.u64 > VIRTIO_MAX_MTU) { 2451 VHOST_CONFIG_LOG(dev->ifname, ERR, 2452 "invalid MTU size (%"PRIu64")", 2453 ctx->msg.payload.u64); 2454 2455 return RTE_VHOST_MSG_RESULT_ERR; 2456 } 2457 2458 dev->mtu = ctx->msg.payload.u64; 2459 2460 return RTE_VHOST_MSG_RESULT_OK; 2461 } 2462 2463 static int 2464 vhost_user_set_req_fd(struct virtio_net **pdev, 2465 struct vhu_msg_context *ctx, 2466 int main_fd __rte_unused) 2467 { 2468 struct virtio_net *dev = *pdev; 2469 int fd = ctx->fds[0]; 2470 2471 if (validate_msg_fds(dev, ctx, 1) != 0) 2472 return RTE_VHOST_MSG_RESULT_ERR; 2473 2474 if (fd < 0) { 2475 VHOST_CONFIG_LOG(dev->ifname, ERR, 2476 "invalid file descriptor for backend channel (%d)", fd); 2477 return RTE_VHOST_MSG_RESULT_ERR; 2478 } 2479 2480 if (dev->backend_req_fd >= 0) 2481 close(dev->backend_req_fd); 2482 2483 dev->backend_req_fd = fd; 2484 2485 return RTE_VHOST_MSG_RESULT_OK; 2486 } 2487 2488 static int 2489 is_vring_iotlb_split(struct vhost_virtqueue *vq, struct vhost_iotlb_msg *imsg) 2490 { 2491 struct vhost_vring_addr *ra; 2492 uint64_t start, end, len; 2493 2494 start = imsg->iova; 2495 end = start + imsg->size; 2496 2497 ra = &vq->ring_addrs; 2498 len = sizeof(struct vring_desc) * vq->size; 2499 if (ra->desc_user_addr < end && (ra->desc_user_addr + len) > start) 2500 return 1; 2501 2502 len = sizeof(struct vring_avail) + sizeof(uint16_t) * vq->size; 2503 if (ra->avail_user_addr < end && (ra->avail_user_addr + len) > start) 2504 return 1; 2505 2506 len = sizeof(struct vring_used) + 2507 sizeof(struct vring_used_elem) * vq->size; 2508 if (ra->used_user_addr < end && (ra->used_user_addr + len) > start) 2509 return 1; 2510 2511 if (ra->flags & (1 << VHOST_VRING_F_LOG)) { 2512 len = sizeof(uint64_t); 2513 if (ra->log_guest_addr < end && 2514 (ra->log_guest_addr + len) > start) 2515 return 1; 2516 } 2517 2518 return 0; 2519 } 2520 2521 static int 2522 is_vring_iotlb_packed(struct vhost_virtqueue *vq, struct vhost_iotlb_msg *imsg) 2523 { 2524 struct vhost_vring_addr *ra; 2525 uint64_t start, end, len; 2526 2527 start = imsg->iova; 2528 end = start + imsg->size; 2529 2530 ra = &vq->ring_addrs; 2531 len = sizeof(struct vring_packed_desc) * vq->size; 2532 if (ra->desc_user_addr < end && (ra->desc_user_addr + len) > start) 2533 return 1; 2534 2535 len = sizeof(struct vring_packed_desc_event); 2536 if 
(ra->avail_user_addr < end && (ra->avail_user_addr + len) > start) 2537 return 1; 2538 2539 len = sizeof(struct vring_packed_desc_event); 2540 if (ra->used_user_addr < end && (ra->used_user_addr + len) > start) 2541 return 1; 2542 2543 if (ra->flags & (1 << VHOST_VRING_F_LOG)) { 2544 len = sizeof(uint64_t); 2545 if (ra->log_guest_addr < end && 2546 (ra->log_guest_addr + len) > start) 2547 return 1; 2548 } 2549 2550 return 0; 2551 } 2552 2553 static int is_vring_iotlb(struct virtio_net *dev, 2554 struct vhost_virtqueue *vq, 2555 struct vhost_iotlb_msg *imsg) 2556 { 2557 if (vq_is_packed(dev)) 2558 return is_vring_iotlb_packed(vq, imsg); 2559 else 2560 return is_vring_iotlb_split(vq, imsg); 2561 } 2562 2563 static int 2564 vhost_user_get_config(struct virtio_net **pdev, 2565 struct vhu_msg_context *ctx, 2566 int main_fd __rte_unused) 2567 { 2568 struct virtio_net *dev = *pdev; 2569 struct rte_vdpa_device *vdpa_dev = dev->vdpa_dev; 2570 int ret = 0; 2571 2572 if (validate_msg_fds(dev, ctx, 0) != 0) 2573 return RTE_VHOST_MSG_RESULT_ERR; 2574 2575 if (!vdpa_dev) { 2576 VHOST_CONFIG_LOG(dev->ifname, ERR, "is not vDPA device!"); 2577 return RTE_VHOST_MSG_RESULT_ERR; 2578 } 2579 2580 if (vdpa_dev->ops->get_config) { 2581 ret = vdpa_dev->ops->get_config(dev->vid, 2582 ctx->msg.payload.cfg.region, 2583 ctx->msg.payload.cfg.size); 2584 if (ret != 0) { 2585 ctx->msg.size = 0; 2586 VHOST_CONFIG_LOG(dev->ifname, ERR, "get_config() return error!"); 2587 } 2588 } else { 2589 VHOST_CONFIG_LOG(dev->ifname, ERR, "get_config() not supported!"); 2590 } 2591 2592 return RTE_VHOST_MSG_RESULT_REPLY; 2593 } 2594 2595 static int 2596 vhost_user_set_config(struct virtio_net **pdev, 2597 struct vhu_msg_context *ctx, 2598 int main_fd __rte_unused) 2599 { 2600 struct virtio_net *dev = *pdev; 2601 struct rte_vdpa_device *vdpa_dev = dev->vdpa_dev; 2602 int ret = 0; 2603 2604 if (validate_msg_fds(dev, ctx, 0) != 0) 2605 return RTE_VHOST_MSG_RESULT_ERR; 2606 2607 if (ctx->msg.payload.cfg.size > VHOST_USER_MAX_CONFIG_SIZE) { 2608 VHOST_CONFIG_LOG(dev->ifname, ERR, 2609 "vhost_user_config size: %"PRIu32", should not be larger than %d", 2610 ctx->msg.payload.cfg.size, VHOST_USER_MAX_CONFIG_SIZE); 2611 goto out; 2612 } 2613 2614 if (!vdpa_dev) { 2615 VHOST_CONFIG_LOG(dev->ifname, ERR, "is not vDPA device!"); 2616 goto out; 2617 } 2618 2619 if (vdpa_dev->ops->set_config) { 2620 ret = vdpa_dev->ops->set_config(dev->vid, 2621 ctx->msg.payload.cfg.region, 2622 ctx->msg.payload.cfg.offset, 2623 ctx->msg.payload.cfg.size, 2624 ctx->msg.payload.cfg.flags); 2625 if (ret) 2626 VHOST_CONFIG_LOG(dev->ifname, ERR, "set_config() return error!"); 2627 } else { 2628 VHOST_CONFIG_LOG(dev->ifname, ERR, "set_config() not supported!"); 2629 } 2630 2631 return RTE_VHOST_MSG_RESULT_OK; 2632 2633 out: 2634 return RTE_VHOST_MSG_RESULT_ERR; 2635 } 2636 2637 static int 2638 vhost_user_iotlb_msg(struct virtio_net **pdev, 2639 struct vhu_msg_context *ctx, 2640 int main_fd __rte_unused) 2641 { 2642 struct virtio_net *dev = *pdev; 2643 struct vhost_iotlb_msg *imsg = &ctx->msg.payload.iotlb; 2644 uint16_t i; 2645 uint64_t vva, len, pg_sz; 2646 2647 switch (imsg->type) { 2648 case VHOST_IOTLB_UPDATE: 2649 len = imsg->size; 2650 vva = qva_to_vva(dev, imsg->uaddr, &len); 2651 if (!vva) 2652 return RTE_VHOST_MSG_RESULT_ERR; 2653 2654 pg_sz = hua_to_alignment(dev->mem, (void *)(uintptr_t)vva); 2655 2656 vhost_user_iotlb_cache_insert(dev, imsg->iova, vva, 0, len, pg_sz, imsg->perm); 2657 2658 for (i = 0; i < dev->nr_vring; i++) { 2659 struct vhost_virtqueue *vq 
= dev->virtqueue[i]; 2660 2661 if (!vq) 2662 continue; 2663 2664 if (is_vring_iotlb(dev, vq, imsg)) { 2665 rte_rwlock_write_lock(&vq->access_lock); 2666 translate_ring_addresses(&dev, &vq); 2667 *pdev = dev; 2668 rte_rwlock_write_unlock(&vq->access_lock); 2669 } 2670 } 2671 break; 2672 case VHOST_IOTLB_INVALIDATE: 2673 vhost_user_iotlb_cache_remove(dev, imsg->iova, imsg->size); 2674 2675 for (i = 0; i < dev->nr_vring; i++) { 2676 struct vhost_virtqueue *vq = dev->virtqueue[i]; 2677 2678 if (!vq) 2679 continue; 2680 2681 if (is_vring_iotlb(dev, vq, imsg)) { 2682 rte_rwlock_write_lock(&vq->access_lock); 2683 vring_invalidate(dev, vq); 2684 rte_rwlock_write_unlock(&vq->access_lock); 2685 } 2686 } 2687 break; 2688 default: 2689 VHOST_CONFIG_LOG(dev->ifname, ERR, "invalid IOTLB message type (%d)", 2690 imsg->type); 2691 return RTE_VHOST_MSG_RESULT_ERR; 2692 } 2693 2694 return RTE_VHOST_MSG_RESULT_OK; 2695 } 2696 2697 static int 2698 vhost_user_set_postcopy_advise(struct virtio_net **pdev, 2699 struct vhu_msg_context *ctx, 2700 int main_fd __rte_unused) 2701 { 2702 struct virtio_net *dev = *pdev; 2703 #ifdef RTE_LIBRTE_VHOST_POSTCOPY 2704 struct uffdio_api api_struct; 2705 2706 dev->postcopy_ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); 2707 2708 if (dev->postcopy_ufd == -1) { 2709 VHOST_CONFIG_LOG(dev->ifname, ERR, 2710 "userfaultfd not available: %s", 2711 strerror(errno)); 2712 return RTE_VHOST_MSG_RESULT_ERR; 2713 } 2714 api_struct.api = UFFD_API; 2715 api_struct.features = 0; 2716 if (ioctl(dev->postcopy_ufd, UFFDIO_API, &api_struct)) { 2717 VHOST_CONFIG_LOG(dev->ifname, ERR, 2718 "UFFDIO_API ioctl failure: %s", 2719 strerror(errno)); 2720 close(dev->postcopy_ufd); 2721 dev->postcopy_ufd = -1; 2722 return RTE_VHOST_MSG_RESULT_ERR; 2723 } 2724 ctx->fds[0] = dev->postcopy_ufd; 2725 ctx->fd_num = 1; 2726 2727 return RTE_VHOST_MSG_RESULT_REPLY; 2728 #else 2729 dev->postcopy_ufd = -1; 2730 ctx->fd_num = 0; 2731 2732 return RTE_VHOST_MSG_RESULT_ERR; 2733 #endif 2734 } 2735 2736 static int 2737 vhost_user_set_postcopy_listen(struct virtio_net **pdev, 2738 struct vhu_msg_context *ctx __rte_unused, 2739 int main_fd __rte_unused) 2740 { 2741 struct virtio_net *dev = *pdev; 2742 2743 if (dev->mem && dev->mem->nregions) { 2744 VHOST_CONFIG_LOG(dev->ifname, ERR, 2745 "regions already registered at postcopy-listen"); 2746 return RTE_VHOST_MSG_RESULT_ERR; 2747 } 2748 dev->postcopy_listening = 1; 2749 2750 return RTE_VHOST_MSG_RESULT_OK; 2751 } 2752 2753 static int 2754 vhost_user_postcopy_end(struct virtio_net **pdev, 2755 struct vhu_msg_context *ctx, 2756 int main_fd __rte_unused) 2757 { 2758 struct virtio_net *dev = *pdev; 2759 2760 dev->postcopy_listening = 0; 2761 if (dev->postcopy_ufd >= 0) { 2762 close(dev->postcopy_ufd); 2763 dev->postcopy_ufd = -1; 2764 } 2765 2766 ctx->msg.payload.u64 = 0; 2767 ctx->msg.size = sizeof(ctx->msg.payload.u64); 2768 ctx->fd_num = 0; 2769 2770 return RTE_VHOST_MSG_RESULT_REPLY; 2771 } 2772 2773 static int 2774 vhost_user_get_status(struct virtio_net **pdev, 2775 struct vhu_msg_context *ctx, 2776 int main_fd __rte_unused) 2777 { 2778 struct virtio_net *dev = *pdev; 2779 2780 ctx->msg.payload.u64 = dev->status; 2781 ctx->msg.size = sizeof(ctx->msg.payload.u64); 2782 ctx->fd_num = 0; 2783 2784 return RTE_VHOST_MSG_RESULT_REPLY; 2785 } 2786 2787 static int 2788 vhost_user_set_status(struct virtio_net **pdev, 2789 struct vhu_msg_context *ctx, 2790 int main_fd __rte_unused) 2791 { 2792 struct virtio_net *dev = *pdev; 2793 2794 /* As per Virtio specification, the 
device status is 8bits long */ 2795 if (ctx->msg.payload.u64 > UINT8_MAX) { 2796 VHOST_CONFIG_LOG(dev->ifname, ERR, 2797 "invalid VHOST_USER_SET_STATUS payload 0x%" PRIx64, 2798 ctx->msg.payload.u64); 2799 return RTE_VHOST_MSG_RESULT_ERR; 2800 } 2801 2802 dev->status = ctx->msg.payload.u64; 2803 2804 if ((dev->status & VIRTIO_DEVICE_STATUS_FEATURES_OK) && 2805 (dev->flags & VIRTIO_DEV_FEATURES_FAILED)) { 2806 VHOST_CONFIG_LOG(dev->ifname, ERR, 2807 "FEATURES_OK bit is set but feature negotiation failed"); 2808 /* 2809 * Clear the bit to let the driver know about the feature 2810 * negotiation failure 2811 */ 2812 dev->status &= ~VIRTIO_DEVICE_STATUS_FEATURES_OK; 2813 } 2814 2815 VHOST_CONFIG_LOG(dev->ifname, INFO, "new device status(0x%08x):", dev->status); 2816 VHOST_CONFIG_LOG(dev->ifname, INFO, 2817 "\t-RESET: %u", 2818 (dev->status == VIRTIO_DEVICE_STATUS_RESET)); 2819 VHOST_CONFIG_LOG(dev->ifname, INFO, 2820 "\t-ACKNOWLEDGE: %u", 2821 !!(dev->status & VIRTIO_DEVICE_STATUS_ACK)); 2822 VHOST_CONFIG_LOG(dev->ifname, INFO, 2823 "\t-DRIVER: %u", 2824 !!(dev->status & VIRTIO_DEVICE_STATUS_DRIVER)); 2825 VHOST_CONFIG_LOG(dev->ifname, INFO, 2826 "\t-FEATURES_OK: %u", 2827 !!(dev->status & VIRTIO_DEVICE_STATUS_FEATURES_OK)); 2828 VHOST_CONFIG_LOG(dev->ifname, INFO, 2829 "\t-DRIVER_OK: %u", 2830 !!(dev->status & VIRTIO_DEVICE_STATUS_DRIVER_OK)); 2831 VHOST_CONFIG_LOG(dev->ifname, INFO, 2832 "\t-DEVICE_NEED_RESET: %u", 2833 !!(dev->status & VIRTIO_DEVICE_STATUS_DEV_NEED_RESET)); 2834 VHOST_CONFIG_LOG(dev->ifname, INFO, 2835 "\t-FAILED: %u", 2836 !!(dev->status & VIRTIO_DEVICE_STATUS_FAILED)); 2837 2838 return RTE_VHOST_MSG_RESULT_OK; 2839 } 2840 2841 #define VHOST_MESSAGE_HANDLERS \ 2842 VHOST_MESSAGE_HANDLER(VHOST_USER_NONE, NULL, false) \ 2843 VHOST_MESSAGE_HANDLER(VHOST_USER_GET_FEATURES, vhost_user_get_features, false) \ 2844 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_FEATURES, vhost_user_set_features, false) \ 2845 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_OWNER, vhost_user_set_owner, false) \ 2846 VHOST_MESSAGE_HANDLER(VHOST_USER_RESET_OWNER, vhost_user_reset_owner, false) \ 2847 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_MEM_TABLE, vhost_user_set_mem_table, true) \ 2848 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_LOG_BASE, vhost_user_set_log_base, true) \ 2849 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_LOG_FD, vhost_user_set_log_fd, true) \ 2850 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_VRING_NUM, vhost_user_set_vring_num, false) \ 2851 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_VRING_ADDR, vhost_user_set_vring_addr, false) \ 2852 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_VRING_BASE, vhost_user_set_vring_base, false) \ 2853 VHOST_MESSAGE_HANDLER(VHOST_USER_GET_VRING_BASE, vhost_user_get_vring_base, false) \ 2854 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_VRING_KICK, vhost_user_set_vring_kick, true) \ 2855 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_VRING_CALL, vhost_user_set_vring_call, true) \ 2856 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_VRING_ERR, vhost_user_set_vring_err, true) \ 2857 VHOST_MESSAGE_HANDLER(VHOST_USER_GET_PROTOCOL_FEATURES, vhost_user_get_protocol_features, false) \ 2858 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_PROTOCOL_FEATURES, vhost_user_set_protocol_features, false) \ 2859 VHOST_MESSAGE_HANDLER(VHOST_USER_GET_QUEUE_NUM, vhost_user_get_queue_num, false) \ 2860 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_VRING_ENABLE, vhost_user_set_vring_enable, false) \ 2861 VHOST_MESSAGE_HANDLER(VHOST_USER_SEND_RARP, vhost_user_send_rarp, false) \ 2862 VHOST_MESSAGE_HANDLER(VHOST_USER_NET_SET_MTU, vhost_user_net_set_mtu, false) \ 2863 
VHOST_MESSAGE_HANDLER(VHOST_USER_SET_BACKEND_REQ_FD, vhost_user_set_req_fd, true) \ 2864 VHOST_MESSAGE_HANDLER(VHOST_USER_IOTLB_MSG, vhost_user_iotlb_msg, false) \ 2865 VHOST_MESSAGE_HANDLER(VHOST_USER_GET_CONFIG, vhost_user_get_config, false) \ 2866 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_CONFIG, vhost_user_set_config, false) \ 2867 VHOST_MESSAGE_HANDLER(VHOST_USER_POSTCOPY_ADVISE, vhost_user_set_postcopy_advise, false) \ 2868 VHOST_MESSAGE_HANDLER(VHOST_USER_POSTCOPY_LISTEN, vhost_user_set_postcopy_listen, false) \ 2869 VHOST_MESSAGE_HANDLER(VHOST_USER_POSTCOPY_END, vhost_user_postcopy_end, false) \ 2870 VHOST_MESSAGE_HANDLER(VHOST_USER_GET_INFLIGHT_FD, vhost_user_get_inflight_fd, false) \ 2871 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_INFLIGHT_FD, vhost_user_set_inflight_fd, true) \ 2872 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_STATUS, vhost_user_set_status, false) \ 2873 VHOST_MESSAGE_HANDLER(VHOST_USER_GET_STATUS, vhost_user_get_status, false) 2874 2875 #define VHOST_MESSAGE_HANDLER(id, handler, accepts_fd) \ 2876 [id] = { #id, handler, accepts_fd }, 2877 static vhost_message_handler_t vhost_message_handlers[] = { 2878 VHOST_MESSAGE_HANDLERS 2879 }; 2880 #undef VHOST_MESSAGE_HANDLER 2881 2882 /* return bytes# of read on success or negative val on failure. */ 2883 static int 2884 read_vhost_message(struct virtio_net *dev, int sockfd, struct vhu_msg_context *ctx) 2885 { 2886 int ret; 2887 2888 ret = read_fd_message(dev->ifname, sockfd, (char *)&ctx->msg, VHOST_USER_HDR_SIZE, 2889 ctx->fds, VHOST_MEMORY_MAX_NREGIONS, &ctx->fd_num); 2890 if (ret <= 0) 2891 goto out; 2892 2893 if (ret != VHOST_USER_HDR_SIZE) { 2894 VHOST_CONFIG_LOG(dev->ifname, ERR, "Unexpected header size read"); 2895 ret = -1; 2896 goto out; 2897 } 2898 2899 if (ctx->msg.size) { 2900 if (ctx->msg.size > sizeof(ctx->msg.payload)) { 2901 VHOST_CONFIG_LOG(dev->ifname, ERR, "invalid msg size: %d", 2902 ctx->msg.size); 2903 ret = -1; 2904 goto out; 2905 } 2906 ret = read(sockfd, &ctx->msg.payload, ctx->msg.size); 2907 if (ret <= 0) 2908 goto out; 2909 if (ret != (int)ctx->msg.size) { 2910 VHOST_CONFIG_LOG(dev->ifname, ERR, "read control message failed"); 2911 ret = -1; 2912 goto out; 2913 } 2914 } 2915 2916 out: 2917 if (ret <= 0) 2918 close_msg_fds(ctx); 2919 2920 return ret; 2921 } 2922 2923 static int 2924 send_vhost_message(struct virtio_net *dev, int sockfd, struct vhu_msg_context *ctx) 2925 { 2926 if (!ctx) 2927 return 0; 2928 2929 return send_fd_message(dev->ifname, sockfd, (char *)&ctx->msg, 2930 VHOST_USER_HDR_SIZE + ctx->msg.size, ctx->fds, ctx->fd_num); 2931 } 2932 2933 static int 2934 send_vhost_reply(struct virtio_net *dev, int sockfd, struct vhu_msg_context *ctx) 2935 { 2936 if (!ctx) 2937 return 0; 2938 2939 ctx->msg.flags &= ~VHOST_USER_VERSION_MASK; 2940 ctx->msg.flags &= ~VHOST_USER_NEED_REPLY; 2941 ctx->msg.flags |= VHOST_USER_VERSION; 2942 ctx->msg.flags |= VHOST_USER_REPLY_MASK; 2943 2944 return send_vhost_message(dev, sockfd, ctx); 2945 } 2946 2947 static int 2948 send_vhost_backend_message(struct virtio_net *dev, struct vhu_msg_context *ctx) 2949 { 2950 return send_vhost_message(dev, dev->backend_req_fd, ctx); 2951 } 2952 2953 static int 2954 send_vhost_backend_message_process_reply(struct virtio_net *dev, struct vhu_msg_context *ctx) 2955 { 2956 struct vhu_msg_context msg_reply; 2957 int ret; 2958 2959 rte_spinlock_lock(&dev->backend_req_lock); 2960 ret = send_vhost_backend_message(dev, ctx); 2961 if (ret < 0) { 2962 VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to send config change (%d)", ret); 2963 goto out; 2964 
} 2965 2966 ret = read_vhost_message(dev, dev->backend_req_fd, &msg_reply); 2967 if (ret <= 0) { 2968 if (ret < 0) 2969 VHOST_CONFIG_LOG(dev->ifname, ERR, 2970 "vhost read backend message reply failed"); 2971 else 2972 VHOST_CONFIG_LOG(dev->ifname, INFO, "vhost peer closed"); 2973 ret = -1; 2974 goto out; 2975 } 2976 2977 if (msg_reply.msg.request.backend != ctx->msg.request.backend) { 2978 VHOST_CONFIG_LOG(dev->ifname, ERR, 2979 "received unexpected msg type (%u), expected %u", 2980 msg_reply.msg.request.backend, ctx->msg.request.backend); 2981 ret = -1; 2982 goto out; 2983 } 2984 2985 ret = msg_reply.msg.payload.u64 ? -1 : 0; 2986 out: 2987 rte_spinlock_unlock(&dev->backend_req_lock); 2988 return ret; 2989 } 2990 2991 /* 2992 * Allocate a queue pair if it hasn't been allocated yet 2993 */ 2994 static int 2995 vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, 2996 struct vhu_msg_context *ctx) 2997 { 2998 uint32_t vring_idx; 2999 3000 switch (ctx->msg.request.frontend) { 3001 case VHOST_USER_SET_VRING_KICK: 3002 case VHOST_USER_SET_VRING_CALL: 3003 case VHOST_USER_SET_VRING_ERR: 3004 vring_idx = ctx->msg.payload.u64 & VHOST_USER_VRING_IDX_MASK; 3005 break; 3006 case VHOST_USER_SET_VRING_NUM: 3007 case VHOST_USER_SET_VRING_BASE: 3008 case VHOST_USER_GET_VRING_BASE: 3009 case VHOST_USER_SET_VRING_ENABLE: 3010 vring_idx = ctx->msg.payload.state.index; 3011 break; 3012 case VHOST_USER_SET_VRING_ADDR: 3013 vring_idx = ctx->msg.payload.addr.index; 3014 break; 3015 case VHOST_USER_SET_INFLIGHT_FD: 3016 vring_idx = ctx->msg.payload.inflight.num_queues - 1; 3017 break; 3018 default: 3019 return 0; 3020 } 3021 3022 if (vring_idx >= VHOST_MAX_VRING) { 3023 VHOST_CONFIG_LOG(dev->ifname, ERR, "invalid vring index: %u", vring_idx); 3024 return -1; 3025 } 3026 3027 if (dev->virtqueue[vring_idx]) 3028 return 0; 3029 3030 return alloc_vring_queue(dev, vring_idx); 3031 } 3032 3033 static void 3034 vhost_user_lock_all_queue_pairs(struct virtio_net *dev) 3035 __rte_no_thread_safety_analysis 3036 { 3037 unsigned int i = 0; 3038 unsigned int vq_num = 0; 3039 3040 while (vq_num < dev->nr_vring) { 3041 struct vhost_virtqueue *vq = dev->virtqueue[i]; 3042 3043 if (vq) { 3044 rte_rwlock_write_lock(&vq->access_lock); 3045 vq_num++; 3046 } 3047 i++; 3048 } 3049 } 3050 3051 static void 3052 vhost_user_unlock_all_queue_pairs(struct virtio_net *dev) 3053 __rte_no_thread_safety_analysis 3054 { 3055 unsigned int i = 0; 3056 unsigned int vq_num = 0; 3057 3058 while (vq_num < dev->nr_vring) { 3059 struct vhost_virtqueue *vq = dev->virtqueue[i]; 3060 3061 if (vq) { 3062 rte_rwlock_write_unlock(&vq->access_lock); 3063 vq_num++; 3064 } 3065 i++; 3066 } 3067 } 3068 3069 int 3070 vhost_user_msg_handler(int vid, int fd) 3071 { 3072 struct virtio_net *dev; 3073 struct vhu_msg_context ctx; 3074 vhost_message_handler_t *msg_handler; 3075 struct rte_vdpa_device *vdpa_dev; 3076 int msg_result = RTE_VHOST_MSG_RESULT_OK; 3077 int ret; 3078 int unlock_required = 0; 3079 bool handled; 3080 uint32_t request; 3081 uint32_t i; 3082 uint16_t blk_call_fd; 3083 3084 dev = get_device(vid); 3085 if (dev == NULL) 3086 return -1; 3087 3088 if (!dev->notify_ops) { 3089 dev->notify_ops = vhost_driver_callback_get(dev->ifname); 3090 if (!dev->notify_ops) { 3091 VHOST_CONFIG_LOG(dev->ifname, ERR, 3092 "failed to get callback ops for driver"); 3093 return -1; 3094 } 3095 } 3096 3097 ctx.msg.request.frontend = VHOST_USER_NONE; 3098 ret = read_vhost_message(dev, fd, &ctx); 3099 if (ret == 0) { 3100 VHOST_CONFIG_LOG(dev->ifname, INFO, "vhost 
peer closed"); 3101 return -1; 3102 } 3103 3104 request = ctx.msg.request.frontend; 3105 if (request > VHOST_USER_NONE && request < RTE_DIM(vhost_message_handlers)) 3106 msg_handler = &vhost_message_handlers[request]; 3107 else 3108 msg_handler = NULL; 3109 3110 if (ret < 0) { 3111 VHOST_CONFIG_LOG(dev->ifname, ERR, "vhost read message %s%s%sfailed", 3112 msg_handler != NULL ? "for " : "", 3113 msg_handler != NULL ? msg_handler->description : "", 3114 msg_handler != NULL ? " " : ""); 3115 return -1; 3116 } 3117 3118 if (msg_handler != NULL && msg_handler->description != NULL) { 3119 if (request != VHOST_USER_IOTLB_MSG) 3120 VHOST_CONFIG_LOG(dev->ifname, INFO, 3121 "read message %s", 3122 msg_handler->description); 3123 else 3124 VHOST_CONFIG_LOG(dev->ifname, DEBUG, 3125 "read message %s", 3126 msg_handler->description); 3127 } else { 3128 VHOST_CONFIG_LOG(dev->ifname, DEBUG, "external request %d", request); 3129 } 3130 3131 ret = vhost_user_check_and_alloc_queue_pair(dev, &ctx); 3132 if (ret < 0) { 3133 VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to alloc queue"); 3134 return -1; 3135 } 3136 3137 /* 3138 * Note: we don't lock all queues on VHOST_USER_GET_VRING_BASE 3139 * and VHOST_USER_RESET_OWNER, since it is sent when virtio stops 3140 * and device is destroyed. destroy_device waits for queues to be 3141 * inactive, so it is safe. Otherwise taking the access_lock 3142 * would cause a dead lock. 3143 */ 3144 switch (request) { 3145 case VHOST_USER_SET_FEATURES: 3146 case VHOST_USER_SET_PROTOCOL_FEATURES: 3147 case VHOST_USER_SET_OWNER: 3148 case VHOST_USER_SET_MEM_TABLE: 3149 case VHOST_USER_SET_LOG_BASE: 3150 case VHOST_USER_SET_LOG_FD: 3151 case VHOST_USER_SET_VRING_NUM: 3152 case VHOST_USER_SET_VRING_ADDR: 3153 case VHOST_USER_SET_VRING_BASE: 3154 case VHOST_USER_SET_VRING_KICK: 3155 case VHOST_USER_SET_VRING_CALL: 3156 case VHOST_USER_SET_VRING_ERR: 3157 case VHOST_USER_SET_VRING_ENABLE: 3158 case VHOST_USER_SEND_RARP: 3159 case VHOST_USER_NET_SET_MTU: 3160 case VHOST_USER_SET_BACKEND_REQ_FD: 3161 if (!(dev->flags & VIRTIO_DEV_VDPA_CONFIGURED)) { 3162 vhost_user_lock_all_queue_pairs(dev); 3163 unlock_required = 1; 3164 } 3165 break; 3166 default: 3167 break; 3168 3169 } 3170 3171 handled = false; 3172 if (dev->extern_ops.pre_msg_handle) { 3173 RTE_BUILD_BUG_ON(offsetof(struct vhu_msg_context, msg) != 0); 3174 msg_result = (*dev->extern_ops.pre_msg_handle)(dev->vid, &ctx); 3175 switch (msg_result) { 3176 case RTE_VHOST_MSG_RESULT_REPLY: 3177 send_vhost_reply(dev, fd, &ctx); 3178 /* Fall-through */ 3179 case RTE_VHOST_MSG_RESULT_ERR: 3180 case RTE_VHOST_MSG_RESULT_OK: 3181 handled = true; 3182 goto skip_to_post_handle; 3183 case RTE_VHOST_MSG_RESULT_NOT_HANDLED: 3184 default: 3185 break; 3186 } 3187 } 3188 3189 if (msg_handler == NULL || msg_handler->callback == NULL) 3190 goto skip_to_post_handle; 3191 3192 if (!msg_handler->accepts_fd && validate_msg_fds(dev, &ctx, 0) != 0) { 3193 msg_result = RTE_VHOST_MSG_RESULT_ERR; 3194 } else { 3195 msg_result = msg_handler->callback(&dev, &ctx, fd); 3196 } 3197 3198 switch (msg_result) { 3199 case RTE_VHOST_MSG_RESULT_ERR: 3200 VHOST_CONFIG_LOG(dev->ifname, ERR, 3201 "processing %s failed.", 3202 msg_handler->description); 3203 handled = true; 3204 break; 3205 case RTE_VHOST_MSG_RESULT_OK: 3206 VHOST_CONFIG_LOG(dev->ifname, DEBUG, 3207 "processing %s succeeded.", 3208 msg_handler->description); 3209 handled = true; 3210 break; 3211 case RTE_VHOST_MSG_RESULT_REPLY: 3212 VHOST_CONFIG_LOG(dev->ifname, DEBUG, 3213 "processing %s succeeded and needs 
reply.", 3214 msg_handler->description); 3215 send_vhost_reply(dev, fd, &ctx); 3216 handled = true; 3217 break; 3218 default: 3219 break; 3220 } 3221 3222 skip_to_post_handle: 3223 if (msg_result != RTE_VHOST_MSG_RESULT_ERR && 3224 dev->extern_ops.post_msg_handle) { 3225 RTE_BUILD_BUG_ON(offsetof(struct vhu_msg_context, msg) != 0); 3226 msg_result = (*dev->extern_ops.post_msg_handle)(dev->vid, &ctx); 3227 switch (msg_result) { 3228 case RTE_VHOST_MSG_RESULT_REPLY: 3229 send_vhost_reply(dev, fd, &ctx); 3230 /* Fall-through */ 3231 case RTE_VHOST_MSG_RESULT_ERR: 3232 case RTE_VHOST_MSG_RESULT_OK: 3233 handled = true; 3234 case RTE_VHOST_MSG_RESULT_NOT_HANDLED: 3235 default: 3236 break; 3237 } 3238 } 3239 3240 /* If message was not handled at this stage, treat it as an error */ 3241 if (!handled) { 3242 VHOST_CONFIG_LOG(dev->ifname, ERR, 3243 "vhost message (req: %d) was not handled.", 3244 request); 3245 close_msg_fds(&ctx); 3246 msg_result = RTE_VHOST_MSG_RESULT_ERR; 3247 } 3248 3249 /* 3250 * If the request required a reply that was already sent, 3251 * this optional reply-ack won't be sent as the 3252 * VHOST_USER_NEED_REPLY was cleared in send_vhost_reply(). 3253 */ 3254 if (ctx.msg.flags & VHOST_USER_NEED_REPLY) { 3255 ctx.msg.payload.u64 = msg_result == RTE_VHOST_MSG_RESULT_ERR; 3256 ctx.msg.size = sizeof(ctx.msg.payload.u64); 3257 ctx.fd_num = 0; 3258 send_vhost_reply(dev, fd, &ctx); 3259 } else if (msg_result == RTE_VHOST_MSG_RESULT_ERR) { 3260 VHOST_CONFIG_LOG(dev->ifname, ERR, "vhost message handling failed."); 3261 ret = -1; 3262 goto unlock; 3263 } 3264 3265 for (i = 0; i < dev->nr_vring; i++) { 3266 struct vhost_virtqueue *vq = dev->virtqueue[i]; 3267 bool cur_ready = vq_is_ready(dev, vq); 3268 3269 if (cur_ready != (vq && vq->ready)) { 3270 vq->ready = cur_ready; 3271 vhost_user_notify_queue_state(dev, vq, cur_ready); 3272 } 3273 } 3274 3275 unlock: 3276 if (unlock_required) 3277 vhost_user_unlock_all_queue_pairs(dev); 3278 3279 if (ret != 0 || !virtio_is_ready(dev)) 3280 goto out; 3281 3282 /* 3283 * Virtio is now ready. If not done already, it is time 3284 * to notify the application it can process the rings and 3285 * configure the vDPA device if present. 
3286 */ 3287 3288 if (!(dev->flags & VIRTIO_DEV_RUNNING)) { 3289 if (dev->notify_ops->new_device(dev->vid) == 0) 3290 dev->flags |= VIRTIO_DEV_RUNNING; 3291 } 3292 3293 vdpa_dev = dev->vdpa_dev; 3294 if (!vdpa_dev) 3295 goto out; 3296 3297 if (vdpa_dev->type == RTE_VHOST_VDPA_DEVICE_TYPE_BLK) { 3298 if (request == VHOST_USER_SET_VRING_CALL) { 3299 blk_call_fd = ctx.msg.payload.u64 & VHOST_USER_VRING_IDX_MASK; 3300 if (blk_call_fd != dev->nr_vring - 1) 3301 goto out; 3302 } else { 3303 goto out; 3304 } 3305 } 3306 3307 if (!(dev->flags & VIRTIO_DEV_VDPA_CONFIGURED)) { 3308 if (vdpa_dev->ops->dev_conf(dev->vid)) 3309 VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to configure vDPA device"); 3310 else 3311 dev->flags |= VIRTIO_DEV_VDPA_CONFIGURED; 3312 } 3313 3314 out: 3315 return ret; 3316 } 3317 3318 static int 3319 vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm) 3320 { 3321 int ret; 3322 struct vhu_msg_context ctx = { 3323 .msg = { 3324 .request.backend = VHOST_USER_BACKEND_IOTLB_MSG, 3325 .flags = VHOST_USER_VERSION, 3326 .size = sizeof(ctx.msg.payload.iotlb), 3327 .payload.iotlb = { 3328 .iova = iova, 3329 .perm = perm, 3330 .type = VHOST_IOTLB_MISS, 3331 }, 3332 }, 3333 }; 3334 3335 ret = send_vhost_message(dev, dev->backend_req_fd, &ctx); 3336 if (ret < 0) { 3337 VHOST_CONFIG_LOG(dev->ifname, ERR, 3338 "failed to send IOTLB miss message (%d)", 3339 ret); 3340 return ret; 3341 } 3342 3343 return 0; 3344 } 3345 3346 int 3347 rte_vhost_backend_config_change(int vid, bool need_reply) 3348 { 3349 struct vhu_msg_context ctx = { 3350 .msg = { 3351 .request.backend = VHOST_USER_BACKEND_CONFIG_CHANGE_MSG, 3352 .flags = VHOST_USER_VERSION, 3353 .size = 0, 3354 } 3355 }; 3356 struct virtio_net *dev; 3357 int ret; 3358 3359 dev = get_device(vid); 3360 if (!dev) 3361 return -ENODEV; 3362 3363 if (!need_reply) { 3364 ret = send_vhost_backend_message(dev, &ctx); 3365 } else { 3366 ctx.msg.flags |= VHOST_USER_NEED_REPLY; 3367 ret = send_vhost_backend_message_process_reply(dev, &ctx); 3368 } 3369 3370 if (ret < 0) 3371 VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to send config change (%d)", ret); 3372 return ret; 3373 } 3374 3375 static int vhost_user_backend_set_vring_host_notifier(struct virtio_net *dev, 3376 int index, int fd, 3377 uint64_t offset, 3378 uint64_t size) 3379 { 3380 int ret; 3381 struct vhu_msg_context ctx = { 3382 .msg = { 3383 .request.backend = VHOST_USER_BACKEND_VRING_HOST_NOTIFIER_MSG, 3384 .flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY, 3385 .size = sizeof(ctx.msg.payload.area), 3386 .payload.area = { 3387 .u64 = index & VHOST_USER_VRING_IDX_MASK, 3388 .size = size, 3389 .offset = offset, 3390 }, 3391 }, 3392 }; 3393 3394 if (fd < 0) 3395 ctx.msg.payload.area.u64 |= VHOST_USER_VRING_NOFD_MASK; 3396 else { 3397 ctx.fds[0] = fd; 3398 ctx.fd_num = 1; 3399 } 3400 3401 ret = send_vhost_backend_message_process_reply(dev, &ctx); 3402 if (ret < 0) 3403 VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to set host notifier (%d)", ret); 3404 3405 return ret; 3406 } 3407 3408 int rte_vhost_host_notifier_ctrl(int vid, uint16_t qid, bool enable) 3409 { 3410 struct virtio_net *dev; 3411 struct rte_vdpa_device *vdpa_dev; 3412 int vfio_device_fd, ret = 0; 3413 uint64_t offset, size; 3414 unsigned int i, q_start, q_last; 3415 3416 dev = get_device(vid); 3417 if (!dev) 3418 return -ENODEV; 3419 3420 vdpa_dev = dev->vdpa_dev; 3421 if (vdpa_dev == NULL) 3422 return -ENODEV; 3423 3424 if (!(dev->features & (1ULL << VIRTIO_F_VERSION_1)) || 3425 !(dev->features & (1ULL << 
VHOST_USER_F_PROTOCOL_FEATURES)) || 3426 !(dev->protocol_features & 3427 (1ULL << VHOST_USER_PROTOCOL_F_BACKEND_REQ)) || 3428 !(dev->protocol_features & 3429 (1ULL << VHOST_USER_PROTOCOL_F_BACKEND_SEND_FD)) || 3430 !(dev->protocol_features & 3431 (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER))) 3432 return -ENOTSUP; 3433 3434 if (qid == RTE_VHOST_QUEUE_ALL) { 3435 q_start = 0; 3436 q_last = dev->nr_vring - 1; 3437 } else { 3438 if (qid >= dev->nr_vring) 3439 return -EINVAL; 3440 q_start = qid; 3441 q_last = qid; 3442 } 3443 3444 if (vdpa_dev->ops->get_vfio_device_fd == NULL) 3445 return -ENOTSUP; 3446 if (vdpa_dev->ops->get_notify_area == NULL) 3447 return -ENOTSUP; 3448 3449 vfio_device_fd = vdpa_dev->ops->get_vfio_device_fd(vid); 3450 if (vfio_device_fd < 0) 3451 return -ENOTSUP; 3452 3453 if (enable) { 3454 for (i = q_start; i <= q_last; i++) { 3455 if (vdpa_dev->ops->get_notify_area(vid, i, &offset, 3456 &size) < 0) { 3457 ret = -ENOTSUP; 3458 goto disable; 3459 } 3460 3461 if (vhost_user_backend_set_vring_host_notifier(dev, i, 3462 vfio_device_fd, offset, size) < 0) { 3463 ret = -EFAULT; 3464 goto disable; 3465 } 3466 } 3467 } else { 3468 disable: 3469 for (i = q_start; i <= q_last; i++) { 3470 vhost_user_backend_set_vring_host_notifier(dev, i, -1, 3471 0, 0); 3472 } 3473 } 3474 3475 return ret; 3476 } 3477 3478 static int 3479 vhost_user_inject_irq(struct virtio_net *dev __rte_unused, struct vhost_virtqueue *vq) 3480 { 3481 if (vq->callfd < 0) 3482 return -1; 3483 3484 return eventfd_write(vq->callfd, (eventfd_t)1); 3485 } 3486 3487 static struct vhost_backend_ops vhost_user_backend_ops = { 3488 .iotlb_miss = vhost_user_iotlb_miss, 3489 .inject_irq = vhost_user_inject_irq, 3490 }; 3491 3492 int 3493 vhost_user_new_device(void) 3494 { 3495 return vhost_new_device(&vhost_user_backend_ops); 3496 } 3497
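/*
 * Usage sketch for rte_vhost_host_notifier_ctrl() (illustrative only, not
 * part of this file): a vDPA driver would typically enable host notifiers
 * for every queue once its dev_conf() callback has configured the hardware,
 * and disable them again when the device is closed:
 *
 *	At configure time:
 *		ret = rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, true);
 *	At close time:
 *		rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, false);
 *
 * A return of -ENOTSUP means the required protocol features (BACKEND_REQ,
 * BACKEND_SEND_FD, HOST_NOTIFIER) or the vDPA get_vfio_device_fd()/
 * get_notify_area() callbacks are missing; the datapath then keeps using
 * eventfd-based kicks.
 */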
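/*
 * Usage sketch for rte_vhost_backend_config_change() (illustrative only):
 * a backend that has updated its device config space, for example a
 * vhost-user-blk backend after a capacity change, notifies the frontend
 * and can wait for the reply-ack:
 *
 *	int ret = rte_vhost_backend_config_change(vid, true);
 *	if (ret < 0)
 *		return ret;
 *
 * With need_reply set, this goes through
 * send_vhost_backend_message_process_reply() above, so the backend request
 * channel must be up: VHOST_USER_PROTOCOL_F_BACKEND_REQ negotiated and the
 * fd provided via VHOST_USER_SET_BACKEND_REQ_FD.
 */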