1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright(c) 2010-2018 Intel Corporation 3 */ 4 5 /* Security model 6 * -------------- 7 * The vhost-user protocol connection is an external interface, so it must be 8 * robust against invalid inputs. 9 * 10 * This is important because the vhost-user master is only one step removed 11 * from the guest. Malicious guests that have escaped will then launch further 12 * attacks from the vhost-user master. 13 * 14 * Even in deployments where guests are trusted, a bug in the vhost-user master 15 * can still cause invalid messages to be sent. Such messages must not 16 * compromise the stability of the DPDK application by causing crashes, memory 17 * corruption, or other problematic behavior. 18 * 19 * Do not assume received VhostUserMsg fields contain sensible values! 20 */ 21 22 #include <stdint.h> 23 #include <stdio.h> 24 #include <stdlib.h> 25 #include <string.h> 26 #include <unistd.h> 27 #include <fcntl.h> 28 #include <sys/ioctl.h> 29 #include <sys/mman.h> 30 #include <sys/stat.h> 31 #include <sys/syscall.h> 32 #ifdef RTE_LIBRTE_VHOST_NUMA 33 #include <numaif.h> 34 #endif 35 #ifdef RTE_LIBRTE_VHOST_POSTCOPY 36 #include <linux/userfaultfd.h> 37 #endif 38 #ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */ 39 #include <linux/memfd.h> 40 #define MEMFD_SUPPORTED 41 #endif 42 43 #include <rte_common.h> 44 #include <rte_malloc.h> 45 #include <rte_log.h> 46 #include <rte_vfio.h> 47 #include <rte_errno.h> 48 49 #include "iotlb.h" 50 #include "vhost.h" 51 #include "vhost_user.h" 52 53 #define VIRTIO_MIN_MTU 68 54 #define VIRTIO_MAX_MTU 65535 55 56 #define INFLIGHT_ALIGNMENT 64 57 #define INFLIGHT_VERSION 0x1 58 59 static const char *vhost_message_str[VHOST_USER_MAX] = { 60 [VHOST_USER_NONE] = "VHOST_USER_NONE", 61 [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES", 62 [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES", 63 [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER", 64 [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER", 65 [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE", 66 [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE", 67 [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD", 68 [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM", 69 [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR", 70 [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE", 71 [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE", 72 [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK", 73 [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL", 74 [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR", 75 [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES", 76 [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES", 77 [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM", 78 [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE", 79 [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP", 80 [VHOST_USER_NET_SET_MTU] = "VHOST_USER_NET_SET_MTU", 81 [VHOST_USER_SET_SLAVE_REQ_FD] = "VHOST_USER_SET_SLAVE_REQ_FD", 82 [VHOST_USER_IOTLB_MSG] = "VHOST_USER_IOTLB_MSG", 83 [VHOST_USER_CRYPTO_CREATE_SESS] = "VHOST_USER_CRYPTO_CREATE_SESS", 84 [VHOST_USER_CRYPTO_CLOSE_SESS] = "VHOST_USER_CRYPTO_CLOSE_SESS", 85 [VHOST_USER_POSTCOPY_ADVISE] = "VHOST_USER_POSTCOPY_ADVISE", 86 [VHOST_USER_POSTCOPY_LISTEN] = "VHOST_USER_POSTCOPY_LISTEN", 87 [VHOST_USER_POSTCOPY_END] = "VHOST_USER_POSTCOPY_END", 88 [VHOST_USER_GET_INFLIGHT_FD] = "VHOST_USER_GET_INFLIGHT_FD", 89 [VHOST_USER_SET_INFLIGHT_FD] = 
"VHOST_USER_SET_INFLIGHT_FD", 90 [VHOST_USER_SET_STATUS] = "VHOST_USER_SET_STATUS", 91 [VHOST_USER_GET_STATUS] = "VHOST_USER_GET_STATUS", 92 }; 93 94 static int send_vhost_reply(struct virtio_net *dev, int sockfd, struct vhu_msg_context *ctx); 95 static int read_vhost_message(struct virtio_net *dev, int sockfd, struct vhu_msg_context *ctx); 96 97 static void 98 close_msg_fds(struct vhu_msg_context *ctx) 99 { 100 int i; 101 102 for (i = 0; i < ctx->fd_num; i++) { 103 int fd = ctx->fds[i]; 104 105 if (fd == -1) 106 continue; 107 108 ctx->fds[i] = -1; 109 close(fd); 110 } 111 } 112 113 /* 114 * Ensure the expected number of FDs is received, 115 * close all FDs and return an error if this is not the case. 116 */ 117 static int 118 validate_msg_fds(struct virtio_net *dev, struct vhu_msg_context *ctx, int expected_fds) 119 { 120 if (ctx->fd_num == expected_fds) 121 return 0; 122 123 VHOST_LOG_CONFIG(ERR, "(%s) expect %d FDs for request %s, received %d\n", 124 dev->ifname, expected_fds, 125 vhost_message_str[ctx->msg.request.master], 126 ctx->fd_num); 127 128 close_msg_fds(ctx); 129 130 return -1; 131 } 132 133 static uint64_t 134 get_blk_size(int fd) 135 { 136 struct stat stat; 137 int ret; 138 139 ret = fstat(fd, &stat); 140 return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize; 141 } 142 143 static void 144 async_dma_map(struct virtio_net *dev, bool do_map) 145 { 146 int ret = 0; 147 uint32_t i; 148 struct guest_page *page; 149 150 if (do_map) { 151 for (i = 0; i < dev->nr_guest_pages; i++) { 152 page = &dev->guest_pages[i]; 153 ret = rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD, 154 page->host_user_addr, 155 page->host_iova, 156 page->size); 157 if (ret) { 158 /* 159 * DMA device may bind with kernel driver, in this case, 160 * we don't need to program IOMMU manually. However, if no 161 * device is bound with vfio/uio in DPDK, and vfio kernel 162 * module is loaded, the API will still be called and return 163 * with ENODEV. 164 * 165 * DPDK vfio only returns ENODEV in very similar situations 166 * (vfio either unsupported, or supported but no devices found). 167 * Either way, no mappings could be performed. We treat it as 168 * normal case in async path. This is a workaround. 169 */ 170 if (rte_errno == ENODEV) 171 return; 172 173 /* DMA mapping errors won't stop VHOST_USER_SET_MEM_TABLE. */ 174 VHOST_LOG_CONFIG(ERR, "DMA engine map failed\n"); 175 } 176 } 177 178 } else { 179 for (i = 0; i < dev->nr_guest_pages; i++) { 180 page = &dev->guest_pages[i]; 181 ret = rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD, 182 page->host_user_addr, 183 page->host_iova, 184 page->size); 185 if (ret) { 186 /* like DMA map, ignore the kernel driver case when unmap. 
*/ 187 if (rte_errno == EINVAL) 188 return; 189 190 VHOST_LOG_CONFIG(ERR, "DMA engine unmap failed\n"); 191 } 192 } 193 } 194 } 195 196 static void 197 free_mem_region(struct virtio_net *dev) 198 { 199 uint32_t i; 200 struct rte_vhost_mem_region *reg; 201 202 if (!dev || !dev->mem) 203 return; 204 205 if (dev->async_copy && rte_vfio_is_enabled("vfio")) 206 async_dma_map(dev, false); 207 208 for (i = 0; i < dev->mem->nregions; i++) { 209 reg = &dev->mem->regions[i]; 210 if (reg->host_user_addr) { 211 munmap(reg->mmap_addr, reg->mmap_size); 212 close(reg->fd); 213 } 214 } 215 } 216 217 void 218 vhost_backend_cleanup(struct virtio_net *dev) 219 { 220 struct rte_vdpa_device *vdpa_dev; 221 222 vdpa_dev = dev->vdpa_dev; 223 if (vdpa_dev && vdpa_dev->ops->dev_cleanup != NULL) 224 vdpa_dev->ops->dev_cleanup(dev->vid); 225 226 if (dev->mem) { 227 free_mem_region(dev); 228 rte_free(dev->mem); 229 dev->mem = NULL; 230 } 231 232 rte_free(dev->guest_pages); 233 dev->guest_pages = NULL; 234 235 if (dev->log_addr) { 236 munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); 237 dev->log_addr = 0; 238 } 239 240 if (dev->inflight_info) { 241 if (dev->inflight_info->addr) { 242 munmap(dev->inflight_info->addr, 243 dev->inflight_info->size); 244 dev->inflight_info->addr = NULL; 245 } 246 247 if (dev->inflight_info->fd >= 0) { 248 close(dev->inflight_info->fd); 249 dev->inflight_info->fd = -1; 250 } 251 252 rte_free(dev->inflight_info); 253 dev->inflight_info = NULL; 254 } 255 256 if (dev->slave_req_fd >= 0) { 257 close(dev->slave_req_fd); 258 dev->slave_req_fd = -1; 259 } 260 261 if (dev->postcopy_ufd >= 0) { 262 close(dev->postcopy_ufd); 263 dev->postcopy_ufd = -1; 264 } 265 266 dev->postcopy_listening = 0; 267 } 268 269 static void 270 vhost_user_notify_queue_state(struct virtio_net *dev, uint16_t index, 271 int enable) 272 { 273 struct rte_vdpa_device *vdpa_dev = dev->vdpa_dev; 274 struct vhost_virtqueue *vq = dev->virtqueue[index]; 275 276 /* Configure guest notifications on enable */ 277 if (enable && vq->notif_enable != VIRTIO_UNINITIALIZED_NOTIF) 278 vhost_enable_guest_notification(dev, vq, vq->notif_enable); 279 280 if (vdpa_dev && vdpa_dev->ops->set_vring_state) 281 vdpa_dev->ops->set_vring_state(dev->vid, index, enable); 282 283 if (dev->notify_ops->vring_state_changed) 284 dev->notify_ops->vring_state_changed(dev->vid, 285 index, enable); 286 } 287 288 /* 289 * This function just returns success at the moment unless 290 * the device hasn't been initialised. 291 */ 292 static int 293 vhost_user_set_owner(struct virtio_net **pdev, 294 struct vhu_msg_context *ctx, 295 int main_fd __rte_unused) 296 { 297 struct virtio_net *dev = *pdev; 298 299 if (validate_msg_fds(dev, ctx, 0) != 0) 300 return RTE_VHOST_MSG_RESULT_ERR; 301 302 return RTE_VHOST_MSG_RESULT_OK; 303 } 304 305 static int 306 vhost_user_reset_owner(struct virtio_net **pdev, 307 struct vhu_msg_context *ctx, 308 int main_fd __rte_unused) 309 { 310 struct virtio_net *dev = *pdev; 311 312 if (validate_msg_fds(dev, ctx, 0) != 0) 313 return RTE_VHOST_MSG_RESULT_ERR; 314 315 vhost_destroy_device_notify(dev); 316 317 cleanup_device(dev, 0); 318 reset_device(dev); 319 return RTE_VHOST_MSG_RESULT_OK; 320 } 321 322 /* 323 * The features that we support are requested. 
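 *
 * The handler below simply mirrors what rte_vhost_driver_get_features()
 * reports for this socket into msg.payload.u64 and replies. As an
 * illustration only (the socket path is made up), an application can
 * query the same value through the public API:
 *
 *	uint64_t feats = 0;
 *
 *	if (rte_vhost_driver_get_features("/tmp/vhost-user.sock", &feats) == 0)
 *		printf("supported features: 0x%" PRIx64 "\n", feats);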
324 */ 325 static int 326 vhost_user_get_features(struct virtio_net **pdev, 327 struct vhu_msg_context *ctx, 328 int main_fd __rte_unused) 329 { 330 struct virtio_net *dev = *pdev; 331 uint64_t features = 0; 332 333 if (validate_msg_fds(dev, ctx, 0) != 0) 334 return RTE_VHOST_MSG_RESULT_ERR; 335 336 rte_vhost_driver_get_features(dev->ifname, &features); 337 338 ctx->msg.payload.u64 = features; 339 ctx->msg.size = sizeof(ctx->msg.payload.u64); 340 ctx->fd_num = 0; 341 342 return RTE_VHOST_MSG_RESULT_REPLY; 343 } 344 345 /* 346 * The queue number that we support are requested. 347 */ 348 static int 349 vhost_user_get_queue_num(struct virtio_net **pdev, 350 struct vhu_msg_context *ctx, 351 int main_fd __rte_unused) 352 { 353 struct virtio_net *dev = *pdev; 354 uint32_t queue_num = 0; 355 356 if (validate_msg_fds(dev, ctx, 0) != 0) 357 return RTE_VHOST_MSG_RESULT_ERR; 358 359 rte_vhost_driver_get_queue_num(dev->ifname, &queue_num); 360 361 ctx->msg.payload.u64 = (uint64_t)queue_num; 362 ctx->msg.size = sizeof(ctx->msg.payload.u64); 363 ctx->fd_num = 0; 364 365 return RTE_VHOST_MSG_RESULT_REPLY; 366 } 367 368 /* 369 * We receive the negotiated features supported by us and the virtio device. 370 */ 371 static int 372 vhost_user_set_features(struct virtio_net **pdev, 373 struct vhu_msg_context *ctx, 374 int main_fd __rte_unused) 375 { 376 struct virtio_net *dev = *pdev; 377 uint64_t features = ctx->msg.payload.u64; 378 uint64_t vhost_features = 0; 379 struct rte_vdpa_device *vdpa_dev; 380 381 if (validate_msg_fds(dev, ctx, 0) != 0) 382 return RTE_VHOST_MSG_RESULT_ERR; 383 384 rte_vhost_driver_get_features(dev->ifname, &vhost_features); 385 if (features & ~vhost_features) { 386 VHOST_LOG_CONFIG(ERR, "(%s) received invalid negotiated features.\n", 387 dev->ifname); 388 dev->flags |= VIRTIO_DEV_FEATURES_FAILED; 389 dev->status &= ~VIRTIO_DEVICE_STATUS_FEATURES_OK; 390 391 return RTE_VHOST_MSG_RESULT_ERR; 392 } 393 394 if (dev->flags & VIRTIO_DEV_RUNNING) { 395 if (dev->features == features) 396 return RTE_VHOST_MSG_RESULT_OK; 397 398 /* 399 * Error out if master tries to change features while device is 400 * in running state. The exception being VHOST_F_LOG_ALL, which 401 * is enabled when the live-migration starts. 402 */ 403 if ((dev->features ^ features) & ~(1ULL << VHOST_F_LOG_ALL)) { 404 VHOST_LOG_CONFIG(ERR, "(%s) features changed while device is running.\n", 405 dev->ifname); 406 return RTE_VHOST_MSG_RESULT_ERR; 407 } 408 409 if (dev->notify_ops->features_changed) 410 dev->notify_ops->features_changed(dev->vid, features); 411 } 412 413 dev->features = features; 414 if (dev->features & 415 ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | 416 (1ULL << VIRTIO_F_VERSION_1) | 417 (1ULL << VIRTIO_F_RING_PACKED))) { 418 dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); 419 } else { 420 dev->vhost_hlen = sizeof(struct virtio_net_hdr); 421 } 422 VHOST_LOG_CONFIG(INFO, "(%s) negotiated Virtio features: 0x%" PRIx64 "\n", 423 dev->ifname, dev->features); 424 VHOST_LOG_CONFIG(DEBUG, "(%s) mergeable RX buffers %s, virtio 1 %s\n", 425 dev->ifname, 426 (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off", 427 (dev->features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off"); 428 429 if ((dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET) && 430 !(dev->features & (1ULL << VIRTIO_NET_F_MQ))) { 431 /* 432 * Remove all but first queue pair if MQ hasn't been 433 * negotiated. This is safe because the device is not 434 * running at this stage. 
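 *
 * For example, with 8 vrings set up but VIRTIO_NET_F_MQ not
 * negotiated, the loop below frees vrings 7 down to 2 and keeps
 * only the first queue pair (vring 0 and vring 1).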
435 */ 436 while (dev->nr_vring > 2) { 437 struct vhost_virtqueue *vq; 438 439 vq = dev->virtqueue[--dev->nr_vring]; 440 if (!vq) 441 continue; 442 443 dev->virtqueue[dev->nr_vring] = NULL; 444 cleanup_vq(vq, 1); 445 cleanup_vq_inflight(dev, vq); 446 free_vq(dev, vq); 447 } 448 } 449 450 vdpa_dev = dev->vdpa_dev; 451 if (vdpa_dev) 452 vdpa_dev->ops->set_features(dev->vid); 453 454 dev->flags &= ~VIRTIO_DEV_FEATURES_FAILED; 455 return RTE_VHOST_MSG_RESULT_OK; 456 } 457 458 /* 459 * The virtio device sends us the size of the descriptor ring. 460 */ 461 static int 462 vhost_user_set_vring_num(struct virtio_net **pdev, 463 struct vhu_msg_context *ctx, 464 int main_fd __rte_unused) 465 { 466 struct virtio_net *dev = *pdev; 467 struct vhost_virtqueue *vq = dev->virtqueue[ctx->msg.payload.state.index]; 468 469 if (validate_msg_fds(dev, ctx, 0) != 0) 470 return RTE_VHOST_MSG_RESULT_ERR; 471 472 if (ctx->msg.payload.state.num > 32768) { 473 VHOST_LOG_CONFIG(ERR, "(%s) invalid virtqueue size %u\n", 474 dev->ifname, ctx->msg.payload.state.num); 475 return RTE_VHOST_MSG_RESULT_ERR; 476 } 477 478 vq->size = ctx->msg.payload.state.num; 479 480 /* VIRTIO 1.0, 2.4 Virtqueues says: 481 * 482 * Queue Size value is always a power of 2. The maximum Queue Size 483 * value is 32768. 484 * 485 * VIRTIO 1.1 2.7 Virtqueues says: 486 * 487 * Packed virtqueues support up to 2^15 entries each. 488 */ 489 if (!vq_is_packed(dev)) { 490 if (vq->size & (vq->size - 1)) { 491 VHOST_LOG_CONFIG(ERR, "(%s) invalid virtqueue size %u\n", 492 dev->ifname, vq->size); 493 return RTE_VHOST_MSG_RESULT_ERR; 494 } 495 } 496 497 if (vq_is_packed(dev)) { 498 rte_free(vq->shadow_used_packed); 499 vq->shadow_used_packed = rte_malloc_socket(NULL, 500 vq->size * 501 sizeof(struct vring_used_elem_packed), 502 RTE_CACHE_LINE_SIZE, vq->numa_node); 503 if (!vq->shadow_used_packed) { 504 VHOST_LOG_CONFIG(ERR, 505 "(%s) failed to allocate memory for shadow used ring.\n", 506 dev->ifname); 507 return RTE_VHOST_MSG_RESULT_ERR; 508 } 509 510 } else { 511 rte_free(vq->shadow_used_split); 512 513 vq->shadow_used_split = rte_malloc_socket(NULL, 514 vq->size * sizeof(struct vring_used_elem), 515 RTE_CACHE_LINE_SIZE, vq->numa_node); 516 517 if (!vq->shadow_used_split) { 518 VHOST_LOG_CONFIG(ERR, 519 "(%s) failed to allocate memory for vq internal data.\n", 520 dev->ifname); 521 return RTE_VHOST_MSG_RESULT_ERR; 522 } 523 } 524 525 rte_free(vq->batch_copy_elems); 526 vq->batch_copy_elems = rte_malloc_socket(NULL, 527 vq->size * sizeof(struct batch_copy_elem), 528 RTE_CACHE_LINE_SIZE, vq->numa_node); 529 if (!vq->batch_copy_elems) { 530 VHOST_LOG_CONFIG(ERR, "(%s) failed to allocate memory for batching copy.\n", 531 dev->ifname); 532 return RTE_VHOST_MSG_RESULT_ERR; 533 } 534 535 return RTE_VHOST_MSG_RESULT_OK; 536 } 537 538 /* 539 * Reallocate virtio_dev, vhost_virtqueue and related data structures to 540 * make them on the same numa node as the memory of vring descriptor. 541 */ 542 #ifdef RTE_LIBRTE_VHOST_NUMA 543 static struct virtio_net* 544 numa_realloc(struct virtio_net *dev, int index) 545 { 546 int node, dev_node; 547 struct virtio_net *old_dev; 548 struct vhost_virtqueue *vq; 549 struct batch_copy_elem *bce; 550 struct guest_page *gp; 551 struct rte_vhost_memory *mem; 552 size_t mem_size; 553 int ret; 554 555 old_dev = dev; 556 vq = dev->virtqueue[index]; 557 558 /* 559 * If VQ is ready, it is too late to reallocate, it certainly already 560 * happened anyway on VHOST_USER_SET_VRING_ADRR. 
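 *
 * The target node is the one backing the guest's descriptor ring: it is
 * queried below with get_mempolicy(MPOL_F_NODE | MPOL_F_ADDR) on
 * vq->desc, and the virtqueue metadata is moved there only if it
 * differs from the current vq->numa_node.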
561 */ 562 if (vq->ready) 563 return dev; 564 565 ret = get_mempolicy(&node, NULL, 0, vq->desc, MPOL_F_NODE | MPOL_F_ADDR); 566 if (ret) { 567 VHOST_LOG_CONFIG(ERR, "(%s) unable to get virtqueue %d numa information.\n", 568 dev->ifname, index); 569 return dev; 570 } 571 572 if (node == vq->numa_node) 573 goto out_dev_realloc; 574 575 vq = rte_realloc_socket(vq, sizeof(*vq), 0, node); 576 if (!vq) { 577 VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc virtqueue %d on node %d\n", 578 dev->ifname, index, node); 579 return dev; 580 } 581 582 if (vq != dev->virtqueue[index]) { 583 VHOST_LOG_CONFIG(INFO, "(%s) reallocated virtqueue on node %d\n", 584 dev->ifname, node); 585 dev->virtqueue[index] = vq; 586 vhost_user_iotlb_init(dev, index); 587 } 588 589 if (vq_is_packed(dev)) { 590 struct vring_used_elem_packed *sup; 591 592 sup = rte_realloc_socket(vq->shadow_used_packed, vq->size * sizeof(*sup), 593 RTE_CACHE_LINE_SIZE, node); 594 if (!sup) { 595 VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc shadow packed on node %d\n", 596 dev->ifname, node); 597 return dev; 598 } 599 vq->shadow_used_packed = sup; 600 } else { 601 struct vring_used_elem *sus; 602 603 sus = rte_realloc_socket(vq->shadow_used_split, vq->size * sizeof(*sus), 604 RTE_CACHE_LINE_SIZE, node); 605 if (!sus) { 606 VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc shadow split on node %d\n", 607 dev->ifname, node); 608 return dev; 609 } 610 vq->shadow_used_split = sus; 611 } 612 613 bce = rte_realloc_socket(vq->batch_copy_elems, vq->size * sizeof(*bce), 614 RTE_CACHE_LINE_SIZE, node); 615 if (!bce) { 616 VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc batch copy elem on node %d\n", 617 dev->ifname, node); 618 return dev; 619 } 620 vq->batch_copy_elems = bce; 621 622 if (vq->log_cache) { 623 struct log_cache_entry *lc; 624 625 lc = rte_realloc_socket(vq->log_cache, sizeof(*lc) * VHOST_LOG_CACHE_NR, 0, node); 626 if (!lc) { 627 VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc log cache on node %d\n", 628 dev->ifname, node); 629 return dev; 630 } 631 vq->log_cache = lc; 632 } 633 634 if (vq->resubmit_inflight) { 635 struct rte_vhost_resubmit_info *ri; 636 637 ri = rte_realloc_socket(vq->resubmit_inflight, sizeof(*ri), 0, node); 638 if (!ri) { 639 VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc resubmit inflight on node %d\n", 640 dev->ifname, node); 641 return dev; 642 } 643 vq->resubmit_inflight = ri; 644 645 if (ri->resubmit_list) { 646 struct rte_vhost_resubmit_desc *rd; 647 648 rd = rte_realloc_socket(ri->resubmit_list, sizeof(*rd) * ri->resubmit_num, 649 0, node); 650 if (!rd) { 651 VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc resubmit list on node %d\n", 652 dev->ifname, node); 653 return dev; 654 } 655 ri->resubmit_list = rd; 656 } 657 } 658 659 vq->numa_node = node; 660 661 out_dev_realloc: 662 663 if (dev->flags & VIRTIO_DEV_RUNNING) 664 return dev; 665 666 ret = get_mempolicy(&dev_node, NULL, 0, dev, MPOL_F_NODE | MPOL_F_ADDR); 667 if (ret) { 668 VHOST_LOG_CONFIG(ERR, "(%s) unable to get numa information.\n", dev->ifname); 669 return dev; 670 } 671 672 if (dev_node == node) 673 return dev; 674 675 dev = rte_realloc_socket(old_dev, sizeof(*dev), 0, node); 676 if (!dev) { 677 VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc dev on node %d\n", 678 old_dev->ifname, node); 679 return old_dev; 680 } 681 682 VHOST_LOG_CONFIG(INFO, "(%s) reallocated device on node %d\n", dev->ifname, node); 683 vhost_devices[dev->vid] = dev; 684 685 mem_size = sizeof(struct rte_vhost_memory) + 686 sizeof(struct rte_vhost_mem_region) * dev->mem->nregions; 687 mem = 
rte_realloc_socket(dev->mem, mem_size, 0, node); 688 if (!mem) { 689 VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc mem table on node %d\n", 690 dev->ifname, node); 691 return dev; 692 } 693 dev->mem = mem; 694 695 gp = rte_realloc_socket(dev->guest_pages, dev->max_guest_pages * sizeof(*gp), 696 RTE_CACHE_LINE_SIZE, node); 697 if (!gp) { 698 VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc guest pages on node %d\n", 699 dev->ifname, node); 700 return dev; 701 } 702 dev->guest_pages = gp; 703 704 return dev; 705 } 706 #else 707 static struct virtio_net* 708 numa_realloc(struct virtio_net *dev, int index __rte_unused) 709 { 710 return dev; 711 } 712 #endif 713 714 /* Converts QEMU virtual address to Vhost virtual address. */ 715 static uint64_t 716 qva_to_vva(struct virtio_net *dev, uint64_t qva, uint64_t *len) 717 { 718 struct rte_vhost_mem_region *r; 719 uint32_t i; 720 721 if (unlikely(!dev || !dev->mem)) 722 goto out_error; 723 724 /* Find the region where the address lives. */ 725 for (i = 0; i < dev->mem->nregions; i++) { 726 r = &dev->mem->regions[i]; 727 728 if (qva >= r->guest_user_addr && 729 qva < r->guest_user_addr + r->size) { 730 731 if (unlikely(*len > r->guest_user_addr + r->size - qva)) 732 *len = r->guest_user_addr + r->size - qva; 733 734 return qva - r->guest_user_addr + 735 r->host_user_addr; 736 } 737 } 738 out_error: 739 *len = 0; 740 741 return 0; 742 } 743 744 745 /* 746 * Converts ring address to Vhost virtual address. 747 * If IOMMU is enabled, the ring address is a guest IO virtual address, 748 * else it is a QEMU virtual address. 749 */ 750 static uint64_t 751 ring_addr_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq, 752 uint64_t ra, uint64_t *size) 753 { 754 if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) { 755 uint64_t vva; 756 757 vhost_user_iotlb_rd_lock(vq); 758 vva = vhost_iova_to_vva(dev, vq, ra, 759 size, VHOST_ACCESS_RW); 760 vhost_user_iotlb_rd_unlock(vq); 761 762 return vva; 763 } 764 765 return qva_to_vva(dev, ra, size); 766 } 767 768 static uint64_t 769 log_addr_to_gpa(struct virtio_net *dev, struct vhost_virtqueue *vq) 770 { 771 uint64_t log_gpa; 772 773 vhost_user_iotlb_rd_lock(vq); 774 log_gpa = translate_log_addr(dev, vq, vq->ring_addrs.log_guest_addr); 775 vhost_user_iotlb_rd_unlock(vq); 776 777 return log_gpa; 778 } 779 780 static struct virtio_net * 781 translate_ring_addresses(struct virtio_net *dev, int vq_index) 782 { 783 struct vhost_virtqueue *vq = dev->virtqueue[vq_index]; 784 struct vhost_vring_addr *addr = &vq->ring_addrs; 785 uint64_t len, expected_len; 786 787 if (addr->flags & (1 << VHOST_VRING_F_LOG)) { 788 vq->log_guest_addr = 789 log_addr_to_gpa(dev, vq); 790 if (vq->log_guest_addr == 0) { 791 VHOST_LOG_CONFIG(DEBUG, "(%s) failed to map log_guest_addr.\n", 792 dev->ifname); 793 return dev; 794 } 795 } 796 797 if (vq_is_packed(dev)) { 798 len = sizeof(struct vring_packed_desc) * vq->size; 799 vq->desc_packed = (struct vring_packed_desc *)(uintptr_t) 800 ring_addr_to_vva(dev, vq, addr->desc_user_addr, &len); 801 if (vq->desc_packed == NULL || 802 len != sizeof(struct vring_packed_desc) * 803 vq->size) { 804 VHOST_LOG_CONFIG(DEBUG, "(%s) failed to map desc_packed ring.\n", 805 dev->ifname); 806 return dev; 807 } 808 809 dev = numa_realloc(dev, vq_index); 810 vq = dev->virtqueue[vq_index]; 811 addr = &vq->ring_addrs; 812 813 len = sizeof(struct vring_packed_desc_event); 814 vq->driver_event = (struct vring_packed_desc_event *) 815 (uintptr_t)ring_addr_to_vva(dev, 816 vq, addr->avail_user_addr, &len); 817 if 
(vq->driver_event == NULL || 818 len != sizeof(struct vring_packed_desc_event)) { 819 VHOST_LOG_CONFIG(DEBUG, "(%s) failed to find driver area address.\n", 820 dev->ifname); 821 return dev; 822 } 823 824 len = sizeof(struct vring_packed_desc_event); 825 vq->device_event = (struct vring_packed_desc_event *) 826 (uintptr_t)ring_addr_to_vva(dev, 827 vq, addr->used_user_addr, &len); 828 if (vq->device_event == NULL || 829 len != sizeof(struct vring_packed_desc_event)) { 830 VHOST_LOG_CONFIG(DEBUG, "(%s) failed to find device area address.\n", 831 dev->ifname); 832 return dev; 833 } 834 835 vq->access_ok = true; 836 return dev; 837 } 838 839 /* The addresses are converted from QEMU virtual to Vhost virtual. */ 840 if (vq->desc && vq->avail && vq->used) 841 return dev; 842 843 len = sizeof(struct vring_desc) * vq->size; 844 vq->desc = (struct vring_desc *)(uintptr_t)ring_addr_to_vva(dev, 845 vq, addr->desc_user_addr, &len); 846 if (vq->desc == 0 || len != sizeof(struct vring_desc) * vq->size) { 847 VHOST_LOG_CONFIG(DEBUG, "(%s) failed to map desc ring.\n", dev->ifname); 848 return dev; 849 } 850 851 dev = numa_realloc(dev, vq_index); 852 vq = dev->virtqueue[vq_index]; 853 addr = &vq->ring_addrs; 854 855 len = sizeof(struct vring_avail) + sizeof(uint16_t) * vq->size; 856 if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) 857 len += sizeof(uint16_t); 858 expected_len = len; 859 vq->avail = (struct vring_avail *)(uintptr_t)ring_addr_to_vva(dev, 860 vq, addr->avail_user_addr, &len); 861 if (vq->avail == 0 || len != expected_len) { 862 VHOST_LOG_CONFIG(DEBUG, "(%s) failed to map avail ring.\n", dev->ifname); 863 return dev; 864 } 865 866 len = sizeof(struct vring_used) + 867 sizeof(struct vring_used_elem) * vq->size; 868 if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) 869 len += sizeof(uint16_t); 870 expected_len = len; 871 vq->used = (struct vring_used *)(uintptr_t)ring_addr_to_vva(dev, 872 vq, addr->used_user_addr, &len); 873 if (vq->used == 0 || len != expected_len) { 874 VHOST_LOG_CONFIG(DEBUG, "(%s) failed to map used ring.\n", dev->ifname); 875 return dev; 876 } 877 878 if (vq->last_used_idx != vq->used->idx) { 879 VHOST_LOG_CONFIG(WARNING, "(%s) last_used_idx (%u) and vq->used->idx (%u) mismatches;\n", 880 dev->ifname, 881 vq->last_used_idx, vq->used->idx); 882 vq->last_used_idx = vq->used->idx; 883 vq->last_avail_idx = vq->used->idx; 884 VHOST_LOG_CONFIG(WARNING, "(%s) some packets maybe resent for Tx and dropped for Rx\n", 885 dev->ifname); 886 } 887 888 vq->access_ok = true; 889 890 VHOST_LOG_CONFIG(DEBUG, "(%s) mapped address desc: %p\n", dev->ifname, vq->desc); 891 VHOST_LOG_CONFIG(DEBUG, "(%s) mapped address avail: %p\n", dev->ifname, vq->avail); 892 VHOST_LOG_CONFIG(DEBUG, "(%s) mapped address used: %p\n", dev->ifname, vq->used); 893 VHOST_LOG_CONFIG(DEBUG, "(%s) log_guest_addr: %" PRIx64 "\n", 894 dev->ifname, vq->log_guest_addr); 895 896 return dev; 897 } 898 899 /* 900 * The virtio device sends us the desc, used and avail ring addresses. 901 * This function then converts these to our address space. 
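 *
 * Note that this handler only records the addresses in vq->ring_addrs;
 * the actual translation in translate_ring_addresses() is deferred
 * until the ring is enabled with VHOST_USER_F_PROTOCOL_FEATURES
 * negotiated, or the ring was already marked accessible.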
902 */ 903 static int 904 vhost_user_set_vring_addr(struct virtio_net **pdev, 905 struct vhu_msg_context *ctx, 906 int main_fd __rte_unused) 907 { 908 struct virtio_net *dev = *pdev; 909 struct vhost_virtqueue *vq; 910 struct vhost_vring_addr *addr = &ctx->msg.payload.addr; 911 bool access_ok; 912 913 if (validate_msg_fds(dev, ctx, 0) != 0) 914 return RTE_VHOST_MSG_RESULT_ERR; 915 916 if (dev->mem == NULL) 917 return RTE_VHOST_MSG_RESULT_ERR; 918 919 /* addr->index refers to the queue index. The txq 1, rxq is 0. */ 920 vq = dev->virtqueue[ctx->msg.payload.addr.index]; 921 922 access_ok = vq->access_ok; 923 924 /* 925 * Rings addresses should not be interpreted as long as the ring is not 926 * started and enabled 927 */ 928 memcpy(&vq->ring_addrs, addr, sizeof(*addr)); 929 930 vring_invalidate(dev, vq); 931 932 if ((vq->enabled && (dev->features & 933 (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) || 934 access_ok) { 935 dev = translate_ring_addresses(dev, ctx->msg.payload.addr.index); 936 if (!dev) 937 return RTE_VHOST_MSG_RESULT_ERR; 938 939 *pdev = dev; 940 } 941 942 return RTE_VHOST_MSG_RESULT_OK; 943 } 944 945 /* 946 * The virtio device sends us the available ring last used index. 947 */ 948 static int 949 vhost_user_set_vring_base(struct virtio_net **pdev, 950 struct vhu_msg_context *ctx, 951 int main_fd __rte_unused) 952 { 953 struct virtio_net *dev = *pdev; 954 struct vhost_virtqueue *vq = dev->virtqueue[ctx->msg.payload.state.index]; 955 uint64_t val = ctx->msg.payload.state.num; 956 957 if (validate_msg_fds(dev, ctx, 0) != 0) 958 return RTE_VHOST_MSG_RESULT_ERR; 959 960 if (vq_is_packed(dev)) { 961 /* 962 * Bit[0:14]: avail index 963 * Bit[15]: avail wrap counter 964 */ 965 vq->last_avail_idx = val & 0x7fff; 966 vq->avail_wrap_counter = !!(val & (0x1 << 15)); 967 /* 968 * Set used index to same value as available one, as 969 * their values should be the same since ring processing 970 * was stopped at get time. 
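 *
 * Worked example: num = 0x8005 decodes to last_avail_idx = 5 with
 * avail_wrap_counter = 1, and last_used_idx/used_wrap_counter are
 * resynchronized to those same values below.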
971 */ 972 vq->last_used_idx = vq->last_avail_idx; 973 vq->used_wrap_counter = vq->avail_wrap_counter; 974 } else { 975 vq->last_used_idx = ctx->msg.payload.state.num; 976 vq->last_avail_idx = ctx->msg.payload.state.num; 977 } 978 979 VHOST_LOG_CONFIG(INFO, 980 "(%s) vring base idx:%u last_used_idx:%u last_avail_idx:%u.\n", 981 dev->ifname, ctx->msg.payload.state.index, vq->last_used_idx, 982 vq->last_avail_idx); 983 984 return RTE_VHOST_MSG_RESULT_OK; 985 } 986 987 static int 988 add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr, 989 uint64_t host_iova, uint64_t host_user_addr, uint64_t size) 990 { 991 struct guest_page *page, *last_page; 992 struct guest_page *old_pages; 993 994 if (dev->nr_guest_pages == dev->max_guest_pages) { 995 dev->max_guest_pages *= 2; 996 old_pages = dev->guest_pages; 997 dev->guest_pages = rte_realloc(dev->guest_pages, 998 dev->max_guest_pages * sizeof(*page), 999 RTE_CACHE_LINE_SIZE); 1000 if (dev->guest_pages == NULL) { 1001 VHOST_LOG_CONFIG(ERR, "cannot realloc guest_pages\n"); 1002 rte_free(old_pages); 1003 return -1; 1004 } 1005 } 1006 1007 if (dev->nr_guest_pages > 0) { 1008 last_page = &dev->guest_pages[dev->nr_guest_pages - 1]; 1009 /* merge if the two pages are continuous */ 1010 if (host_iova == last_page->host_iova + last_page->size && 1011 guest_phys_addr == last_page->guest_phys_addr + last_page->size && 1012 host_user_addr == last_page->host_user_addr + last_page->size) { 1013 last_page->size += size; 1014 return 0; 1015 } 1016 } 1017 1018 page = &dev->guest_pages[dev->nr_guest_pages++]; 1019 page->guest_phys_addr = guest_phys_addr; 1020 page->host_iova = host_iova; 1021 page->host_user_addr = host_user_addr; 1022 page->size = size; 1023 1024 return 0; 1025 } 1026 1027 static int 1028 add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg, 1029 uint64_t page_size) 1030 { 1031 uint64_t reg_size = reg->size; 1032 uint64_t host_user_addr = reg->host_user_addr; 1033 uint64_t guest_phys_addr = reg->guest_phys_addr; 1034 uint64_t host_iova; 1035 uint64_t size; 1036 1037 host_iova = rte_mem_virt2iova((void *)(uintptr_t)host_user_addr); 1038 size = page_size - (guest_phys_addr & (page_size - 1)); 1039 size = RTE_MIN(size, reg_size); 1040 1041 if (add_one_guest_page(dev, guest_phys_addr, host_iova, 1042 host_user_addr, size) < 0) 1043 return -1; 1044 1045 host_user_addr += size; 1046 guest_phys_addr += size; 1047 reg_size -= size; 1048 1049 while (reg_size > 0) { 1050 size = RTE_MIN(reg_size, page_size); 1051 host_iova = rte_mem_virt2iova((void *)(uintptr_t) 1052 host_user_addr); 1053 if (add_one_guest_page(dev, guest_phys_addr, host_iova, 1054 host_user_addr, size) < 0) 1055 return -1; 1056 1057 host_user_addr += size; 1058 guest_phys_addr += size; 1059 reg_size -= size; 1060 } 1061 1062 /* sort guest page array if over binary search threshold */ 1063 if (dev->nr_guest_pages >= VHOST_BINARY_SEARCH_THRESH) { 1064 qsort((void *)dev->guest_pages, dev->nr_guest_pages, 1065 sizeof(struct guest_page), guest_page_addrcmp); 1066 } 1067 1068 return 0; 1069 } 1070 1071 #ifdef RTE_LIBRTE_VHOST_DEBUG 1072 /* TODO: enable it only in debug mode? 
*/ 1073 static void 1074 dump_guest_pages(struct virtio_net *dev) 1075 { 1076 uint32_t i; 1077 struct guest_page *page; 1078 1079 for (i = 0; i < dev->nr_guest_pages; i++) { 1080 page = &dev->guest_pages[i]; 1081 1082 VHOST_LOG_CONFIG(INFO, "(%s) guest physical page region %u\n", 1083 dev->ifname, i); 1084 VHOST_LOG_CONFIG(INFO, "(%s)\tguest_phys_addr: %" PRIx64 "\n", 1085 dev->ifname, page->guest_phys_addr); 1086 VHOST_LOG_CONFIG(INFO, "(%s)\thost_iova : %" PRIx64 "\n", 1087 dev->ifname, page->host_iova); 1088 VHOST_LOG_CONFIG(INFO, "(%s)\tsize : %" PRIx64 "\n", 1089 dev->ifname, page->size); 1090 } 1091 } 1092 #else 1093 #define dump_guest_pages(dev) 1094 #endif 1095 1096 static bool 1097 vhost_memory_changed(struct VhostUserMemory *new, 1098 struct rte_vhost_memory *old) 1099 { 1100 uint32_t i; 1101 1102 if (new->nregions != old->nregions) 1103 return true; 1104 1105 for (i = 0; i < new->nregions; ++i) { 1106 VhostUserMemoryRegion *new_r = &new->regions[i]; 1107 struct rte_vhost_mem_region *old_r = &old->regions[i]; 1108 1109 if (new_r->guest_phys_addr != old_r->guest_phys_addr) 1110 return true; 1111 if (new_r->memory_size != old_r->size) 1112 return true; 1113 if (new_r->userspace_addr != old_r->guest_user_addr) 1114 return true; 1115 } 1116 1117 return false; 1118 } 1119 1120 #ifdef RTE_LIBRTE_VHOST_POSTCOPY 1121 static int 1122 vhost_user_postcopy_region_register(struct virtio_net *dev, 1123 struct rte_vhost_mem_region *reg) 1124 { 1125 struct uffdio_register reg_struct; 1126 1127 /* 1128 * Let's register all the mmapped area to ensure 1129 * alignment on page boundary. 1130 */ 1131 reg_struct.range.start = (uint64_t)(uintptr_t)reg->mmap_addr; 1132 reg_struct.range.len = reg->mmap_size; 1133 reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING; 1134 1135 if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER, 1136 &reg_struct)) { 1137 VHOST_LOG_CONFIG(ERR, "(%s) failed to register ufd for region " 1138 "%" PRIx64 " - %" PRIx64 " (ufd = %d) %s\n", 1139 dev->ifname, 1140 (uint64_t)reg_struct.range.start, 1141 (uint64_t)reg_struct.range.start + 1142 (uint64_t)reg_struct.range.len - 1, 1143 dev->postcopy_ufd, 1144 strerror(errno)); 1145 return -1; 1146 } 1147 1148 VHOST_LOG_CONFIG(INFO, 1149 "(%s)\t userfaultfd registered for range : %" PRIx64 " - %" PRIx64 "\n", 1150 dev->ifname, 1151 (uint64_t)reg_struct.range.start, 1152 (uint64_t)reg_struct.range.start + 1153 (uint64_t)reg_struct.range.len - 1); 1154 1155 return 0; 1156 } 1157 #else 1158 static int 1159 vhost_user_postcopy_region_register(struct virtio_net *dev __rte_unused, 1160 struct rte_vhost_mem_region *reg __rte_unused) 1161 { 1162 return -1; 1163 } 1164 #endif 1165 1166 static int 1167 vhost_user_postcopy_register(struct virtio_net *dev, int main_fd, 1168 struct vhu_msg_context *ctx) 1169 { 1170 struct VhostUserMemory *memory; 1171 struct rte_vhost_mem_region *reg; 1172 struct vhu_msg_context ack_ctx; 1173 uint32_t i; 1174 1175 if (!dev->postcopy_listening) 1176 return 0; 1177 1178 /* 1179 * We don't have a better way right now than sharing 1180 * DPDK's virtual address with Qemu, so that Qemu can 1181 * retrieve the region offset when handling userfaults.
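 *
 * The handshake below is: rewrite each region's userspace_addr to the
 * local mmap()ed address, send the table back on the same socket, wait
 * for the frontend to echo VHOST_USER_SET_MEM_TABLE as an ack, and only
 * then register every region with userfaultfd.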
1182 */ 1183 memory = &ctx->msg.payload.memory; 1184 for (i = 0; i < memory->nregions; i++) { 1185 reg = &dev->mem->regions[i]; 1186 memory->regions[i].userspace_addr = reg->host_user_addr; 1187 } 1188 1189 /* Send the addresses back to qemu */ 1190 ctx->fd_num = 0; 1191 send_vhost_reply(dev, main_fd, ctx); 1192 1193 /* Wait for qemu to acknowledge it got the addresses 1194 * we've got to wait before we're allowed to generate faults. 1195 */ 1196 if (read_vhost_message(dev, main_fd, &ack_ctx) <= 0) { 1197 VHOST_LOG_CONFIG(ERR, "(%s) failed to read qemu ack on postcopy set-mem-table\n", 1198 dev->ifname); 1199 return -1; 1200 } 1201 1202 if (validate_msg_fds(dev, &ack_ctx, 0) != 0) 1203 return -1; 1204 1205 if (ack_ctx.msg.request.master != VHOST_USER_SET_MEM_TABLE) { 1206 VHOST_LOG_CONFIG(ERR, "(%s) bad qemu ack on postcopy set-mem-table (%d)\n", 1207 dev->ifname, ack_ctx.msg.request.master); 1208 return -1; 1209 } 1210 1211 /* Now userfault register and we can use the memory */ 1212 for (i = 0; i < memory->nregions; i++) { 1213 reg = &dev->mem->regions[i]; 1214 if (vhost_user_postcopy_region_register(dev, reg) < 0) 1215 return -1; 1216 } 1217 1218 return 0; 1219 } 1220 1221 static int 1222 vhost_user_mmap_region(struct virtio_net *dev, 1223 struct rte_vhost_mem_region *region, 1224 uint64_t mmap_offset) 1225 { 1226 void *mmap_addr; 1227 uint64_t mmap_size; 1228 uint64_t alignment; 1229 int populate; 1230 1231 /* Check for memory_size + mmap_offset overflow */ 1232 if (mmap_offset >= -region->size) { 1233 VHOST_LOG_CONFIG(ERR, "(%s) mmap_offset (%#"PRIx64") and memory_size (%#"PRIx64") overflow\n", 1234 dev->ifname, mmap_offset, region->size); 1235 return -1; 1236 } 1237 1238 mmap_size = region->size + mmap_offset; 1239 1240 /* mmap() without flag of MAP_ANONYMOUS, should be called with length 1241 * argument aligned with hugepagesz at older longterm version Linux, 1242 * like 2.6.32 and 3.2.72, or mmap() will fail with EINVAL. 1243 * 1244 * To avoid failure, make sure in caller to keep length aligned. 1245 */ 1246 alignment = get_blk_size(region->fd); 1247 if (alignment == (uint64_t)-1) { 1248 VHOST_LOG_CONFIG(ERR, "(%s) couldn't get hugepage size through fstat\n", 1249 dev->ifname); 1250 return -1; 1251 } 1252 mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment); 1253 if (mmap_size == 0) { 1254 /* 1255 * It could happen if initial mmap_size + alignment overflows 1256 * the sizeof uint64, which could happen if either mmap_size or 1257 * alignment value is wrong. 1258 * 1259 * mmap() kernel implementation would return an error, but 1260 * better catch it before and provide useful info in the logs. 1261 */ 1262 VHOST_LOG_CONFIG(ERR, "(%s) mmap size (0x%" PRIx64 ") or alignment (0x%" PRIx64 ") is invalid\n", 1263 dev->ifname, region->size + mmap_offset, alignment); 1264 return -1; 1265 } 1266 1267 populate = dev->async_copy ? 
MAP_POPULATE : 0; 1268 mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, 1269 MAP_SHARED | populate, region->fd, 0); 1270 1271 if (mmap_addr == MAP_FAILED) { 1272 VHOST_LOG_CONFIG(ERR, "(%s) mmap failed (%s).\n", dev->ifname, strerror(errno)); 1273 return -1; 1274 } 1275 1276 region->mmap_addr = mmap_addr; 1277 region->mmap_size = mmap_size; 1278 region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset; 1279 1280 if (dev->async_copy) { 1281 if (add_guest_pages(dev, region, alignment) < 0) { 1282 VHOST_LOG_CONFIG(ERR, "(%s) adding guest pages to region failed.\n", 1283 dev->ifname); 1284 return -1; 1285 } 1286 } 1287 1288 VHOST_LOG_CONFIG(INFO, "(%s) guest memory region size: 0x%" PRIx64 "\n", 1289 dev->ifname, region->size); 1290 VHOST_LOG_CONFIG(INFO, "(%s)\t guest physical addr: 0x%" PRIx64 "\n", 1291 dev->ifname, region->guest_phys_addr); 1292 VHOST_LOG_CONFIG(INFO, "(%s)\t guest virtual addr: 0x%" PRIx64 "\n", 1293 dev->ifname, region->guest_user_addr); 1294 VHOST_LOG_CONFIG(INFO, "(%s)\t host virtual addr: 0x%" PRIx64 "\n", 1295 dev->ifname, region->host_user_addr); 1296 VHOST_LOG_CONFIG(INFO, "(%s)\t mmap addr : 0x%" PRIx64 "\n", 1297 dev->ifname, (uint64_t)(uintptr_t)mmap_addr); 1298 VHOST_LOG_CONFIG(INFO, "(%s)\t mmap size : 0x%" PRIx64 "\n", 1299 dev->ifname, mmap_size); 1300 VHOST_LOG_CONFIG(INFO, "(%s)\t mmap align: 0x%" PRIx64 "\n", 1301 dev->ifname, alignment); 1302 VHOST_LOG_CONFIG(INFO, "(%s)\t mmap off : 0x%" PRIx64 "\n", 1303 dev->ifname, mmap_offset); 1304 1305 return 0; 1306 } 1307 1308 static int 1309 vhost_user_set_mem_table(struct virtio_net **pdev, 1310 struct vhu_msg_context *ctx, 1311 int main_fd) 1312 { 1313 struct virtio_net *dev = *pdev; 1314 struct VhostUserMemory *memory = &ctx->msg.payload.memory; 1315 struct rte_vhost_mem_region *reg; 1316 int numa_node = SOCKET_ID_ANY; 1317 uint64_t mmap_offset; 1318 uint32_t i; 1319 bool async_notify = false; 1320 1321 if (validate_msg_fds(dev, ctx, memory->nregions) != 0) 1322 return RTE_VHOST_MSG_RESULT_ERR; 1323 1324 if (memory->nregions > VHOST_MEMORY_MAX_NREGIONS) { 1325 VHOST_LOG_CONFIG(ERR, "(%s) too many memory regions (%u)\n", 1326 dev->ifname, memory->nregions); 1327 goto close_msg_fds; 1328 } 1329 1330 if (dev->mem && !vhost_memory_changed(memory, dev->mem)) { 1331 VHOST_LOG_CONFIG(INFO, "(%s) memory regions not changed\n", dev->ifname); 1332 1333 close_msg_fds(ctx); 1334 1335 return RTE_VHOST_MSG_RESULT_OK; 1336 } 1337 1338 if (dev->mem) { 1339 if (dev->flags & VIRTIO_DEV_VDPA_CONFIGURED) { 1340 struct rte_vdpa_device *vdpa_dev = dev->vdpa_dev; 1341 1342 if (vdpa_dev && vdpa_dev->ops->dev_close) 1343 vdpa_dev->ops->dev_close(dev->vid); 1344 dev->flags &= ~VIRTIO_DEV_VDPA_CONFIGURED; 1345 } 1346 1347 /* notify the vhost application to stop DMA transfers */ 1348 if (dev->async_copy && dev->notify_ops->vring_state_changed) { 1349 for (i = 0; i < dev->nr_vring; i++) { 1350 dev->notify_ops->vring_state_changed(dev->vid, 1351 i, 0); 1352 } 1353 async_notify = true; 1354 } 1355 1356 free_mem_region(dev); 1357 rte_free(dev->mem); 1358 dev->mem = NULL; 1359 } 1360 1361 /* Flush IOTLB cache as previous HVAs are now invalid */ 1362 if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) 1363 for (i = 0; i < dev->nr_vring; i++) 1364 vhost_user_iotlb_flush_all(dev->virtqueue[i]); 1365 1366 /* 1367 * If VQ 0 has already been allocated, try to allocate on the same 1368 * NUMA node. It can be reallocated later in numa_realloc(). 
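 *
 * Both dev->guest_pages and the dev->mem table allocated below are
 * placed on this node, so the metadata stays local to the virtqueues
 * it describes.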
1369 */ 1370 if (dev->nr_vring > 0) 1371 numa_node = dev->virtqueue[0]->numa_node; 1372 1373 dev->nr_guest_pages = 0; 1374 if (dev->guest_pages == NULL) { 1375 dev->max_guest_pages = 8; 1376 dev->guest_pages = rte_zmalloc_socket(NULL, 1377 dev->max_guest_pages * 1378 sizeof(struct guest_page), 1379 RTE_CACHE_LINE_SIZE, 1380 numa_node); 1381 if (dev->guest_pages == NULL) { 1382 VHOST_LOG_CONFIG(ERR, 1383 "(%s) failed to allocate memory for dev->guest_pages\n", 1384 dev->ifname); 1385 goto close_msg_fds; 1386 } 1387 } 1388 1389 dev->mem = rte_zmalloc_socket("vhost-mem-table", sizeof(struct rte_vhost_memory) + 1390 sizeof(struct rte_vhost_mem_region) * memory->nregions, 0, numa_node); 1391 if (dev->mem == NULL) { 1392 VHOST_LOG_CONFIG(ERR, 1393 "(%s) failed to allocate memory for dev->mem\n", 1394 dev->ifname); 1395 goto free_guest_pages; 1396 } 1397 1398 for (i = 0; i < memory->nregions; i++) { 1399 reg = &dev->mem->regions[i]; 1400 1401 reg->guest_phys_addr = memory->regions[i].guest_phys_addr; 1402 reg->guest_user_addr = memory->regions[i].userspace_addr; 1403 reg->size = memory->regions[i].memory_size; 1404 reg->fd = ctx->fds[i]; 1405 1406 /* 1407 * Assign invalid file descriptor value to avoid double 1408 * closing on error path. 1409 */ 1410 ctx->fds[i] = -1; 1411 1412 mmap_offset = memory->regions[i].mmap_offset; 1413 1414 if (vhost_user_mmap_region(dev, reg, mmap_offset) < 0) { 1415 VHOST_LOG_CONFIG(ERR, "(%s) failed to mmap region %u\n", dev->ifname, i); 1416 goto free_mem_table; 1417 } 1418 1419 dev->mem->nregions++; 1420 } 1421 1422 if (dev->async_copy && rte_vfio_is_enabled("vfio")) 1423 async_dma_map(dev, true); 1424 1425 if (vhost_user_postcopy_register(dev, main_fd, ctx) < 0) 1426 goto free_mem_table; 1427 1428 for (i = 0; i < dev->nr_vring; i++) { 1429 struct vhost_virtqueue *vq = dev->virtqueue[i]; 1430 1431 if (!vq) 1432 continue; 1433 1434 if (vq->desc || vq->avail || vq->used) { 1435 /* 1436 * If the memory table got updated, the ring addresses 1437 * need to be translated again as virtual addresses have 1438 * changed. 
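 *
 * vring_invalidate() below clears the previously translated addresses
 * before translate_ring_addresses() recomputes them against the new
 * memory table.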
1439 */ 1440 vring_invalidate(dev, vq); 1441 1442 dev = translate_ring_addresses(dev, i); 1443 if (!dev) { 1444 dev = *pdev; 1445 goto free_mem_table; 1446 } 1447 1448 *pdev = dev; 1449 } 1450 } 1451 1452 dump_guest_pages(dev); 1453 1454 if (async_notify) { 1455 for (i = 0; i < dev->nr_vring; i++) 1456 dev->notify_ops->vring_state_changed(dev->vid, i, 1); 1457 } 1458 1459 return RTE_VHOST_MSG_RESULT_OK; 1460 1461 free_mem_table: 1462 free_mem_region(dev); 1463 rte_free(dev->mem); 1464 dev->mem = NULL; 1465 1466 free_guest_pages: 1467 rte_free(dev->guest_pages); 1468 dev->guest_pages = NULL; 1469 close_msg_fds: 1470 close_msg_fds(ctx); 1471 return RTE_VHOST_MSG_RESULT_ERR; 1472 } 1473 1474 static bool 1475 vq_is_ready(struct virtio_net *dev, struct vhost_virtqueue *vq) 1476 { 1477 bool rings_ok; 1478 1479 if (!vq) 1480 return false; 1481 1482 if (vq_is_packed(dev)) 1483 rings_ok = vq->desc_packed && vq->driver_event && 1484 vq->device_event; 1485 else 1486 rings_ok = vq->desc && vq->avail && vq->used; 1487 1488 return rings_ok && 1489 vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD && 1490 vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD && 1491 vq->enabled; 1492 } 1493 1494 #define VIRTIO_BUILTIN_NUM_VQS_TO_BE_READY 2u 1495 1496 static int 1497 virtio_is_ready(struct virtio_net *dev) 1498 { 1499 struct vhost_virtqueue *vq; 1500 uint32_t i, nr_vring = dev->nr_vring; 1501 1502 if (dev->flags & VIRTIO_DEV_READY) 1503 return 1; 1504 1505 if (!dev->nr_vring) 1506 return 0; 1507 1508 if (dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET) { 1509 nr_vring = VIRTIO_BUILTIN_NUM_VQS_TO_BE_READY; 1510 1511 if (dev->nr_vring < nr_vring) 1512 return 0; 1513 } 1514 1515 for (i = 0; i < nr_vring; i++) { 1516 vq = dev->virtqueue[i]; 1517 1518 if (!vq_is_ready(dev, vq)) 1519 return 0; 1520 } 1521 1522 /* If supported, ensure the frontend is really done with config */ 1523 if (dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_STATUS)) 1524 if (!(dev->status & VIRTIO_DEVICE_STATUS_DRIVER_OK)) 1525 return 0; 1526 1527 dev->flags |= VIRTIO_DEV_READY; 1528 1529 if (!(dev->flags & VIRTIO_DEV_RUNNING)) 1530 VHOST_LOG_CONFIG(INFO, "(%s) virtio is now ready for processing.\n", dev->ifname); 1531 return 1; 1532 } 1533 1534 static void * 1535 inflight_mem_alloc(struct virtio_net *dev, const char *name, size_t size, int *fd) 1536 { 1537 void *ptr; 1538 int mfd = -1; 1539 char fname[20] = "/tmp/memfd-XXXXXX"; 1540 1541 *fd = -1; 1542 #ifdef MEMFD_SUPPORTED 1543 mfd = memfd_create(name, MFD_CLOEXEC); 1544 #else 1545 RTE_SET_USED(name); 1546 #endif 1547 if (mfd == -1) { 1548 mfd = mkstemp(fname); 1549 if (mfd == -1) { 1550 VHOST_LOG_CONFIG(ERR, "(%s) failed to get inflight buffer fd\n", 1551 dev->ifname); 1552 return NULL; 1553 } 1554 1555 unlink(fname); 1556 } 1557 1558 if (ftruncate(mfd, size) == -1) { 1559 VHOST_LOG_CONFIG(ERR, "(%s) failed to alloc inflight buffer\n", dev->ifname); 1560 close(mfd); 1561 return NULL; 1562 } 1563 1564 ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, mfd, 0); 1565 if (ptr == MAP_FAILED) { 1566 VHOST_LOG_CONFIG(ERR, "(%s) failed to mmap inflight buffer\n", dev->ifname); 1567 close(mfd); 1568 return NULL; 1569 } 1570 1571 *fd = mfd; 1572 return ptr; 1573 } 1574 1575 static uint32_t 1576 get_pervq_shm_size_split(uint16_t queue_size) 1577 { 1578 return RTE_ALIGN_MUL_CEIL(sizeof(struct rte_vhost_inflight_desc_split) * 1579 queue_size + sizeof(uint64_t) + 1580 sizeof(uint16_t) * 4, INFLIGHT_ALIGNMENT); 1581 } 1582 1583 static uint32_t 1584 get_pervq_shm_size_packed(uint16_t queue_size) 1585 { 1586 
return RTE_ALIGN_MUL_CEIL(sizeof(struct rte_vhost_inflight_desc_packed) 1587 * queue_size + sizeof(uint64_t) + 1588 sizeof(uint16_t) * 6 + sizeof(uint8_t) * 9, 1589 INFLIGHT_ALIGNMENT); 1590 } 1591 1592 static int 1593 vhost_user_get_inflight_fd(struct virtio_net **pdev, 1594 struct vhu_msg_context *ctx, 1595 int main_fd __rte_unused) 1596 { 1597 struct rte_vhost_inflight_info_packed *inflight_packed; 1598 uint64_t pervq_inflight_size, mmap_size; 1599 uint16_t num_queues, queue_size; 1600 struct virtio_net *dev = *pdev; 1601 int fd, i, j; 1602 int numa_node = SOCKET_ID_ANY; 1603 void *addr; 1604 1605 if (ctx->msg.size != sizeof(ctx->msg.payload.inflight)) { 1606 VHOST_LOG_CONFIG(ERR, "(%s) invalid get_inflight_fd message size is %d\n", 1607 dev->ifname, ctx->msg.size); 1608 return RTE_VHOST_MSG_RESULT_ERR; 1609 } 1610 1611 /* 1612 * If VQ 0 has already been allocated, try to allocate on the same 1613 * NUMA node. It can be reallocated later in numa_realloc(). 1614 */ 1615 if (dev->nr_vring > 0) 1616 numa_node = dev->virtqueue[0]->numa_node; 1617 1618 if (dev->inflight_info == NULL) { 1619 dev->inflight_info = rte_zmalloc_socket("inflight_info", 1620 sizeof(struct inflight_mem_info), 0, numa_node); 1621 if (!dev->inflight_info) { 1622 VHOST_LOG_CONFIG(ERR, "(%s) failed to alloc dev inflight area\n", 1623 dev->ifname); 1624 return RTE_VHOST_MSG_RESULT_ERR; 1625 } 1626 dev->inflight_info->fd = -1; 1627 } 1628 1629 num_queues = ctx->msg.payload.inflight.num_queues; 1630 queue_size = ctx->msg.payload.inflight.queue_size; 1631 1632 VHOST_LOG_CONFIG(INFO, "(%s) get_inflight_fd num_queues: %u\n", 1633 dev->ifname, ctx->msg.payload.inflight.num_queues); 1634 VHOST_LOG_CONFIG(INFO, "(%s) get_inflight_fd queue_size: %u\n", 1635 dev->ifname, ctx->msg.payload.inflight.queue_size); 1636 1637 if (vq_is_packed(dev)) 1638 pervq_inflight_size = get_pervq_shm_size_packed(queue_size); 1639 else 1640 pervq_inflight_size = get_pervq_shm_size_split(queue_size); 1641 1642 mmap_size = num_queues * pervq_inflight_size; 1643 addr = inflight_mem_alloc(dev, "vhost-inflight", mmap_size, &fd); 1644 if (!addr) { 1645 VHOST_LOG_CONFIG(ERR, "(%s) failed to alloc vhost inflight area\n", dev->ifname); 1646 ctx->msg.payload.inflight.mmap_size = 0; 1647 return RTE_VHOST_MSG_RESULT_ERR; 1648 } 1649 memset(addr, 0, mmap_size); 1650 1651 if (dev->inflight_info->addr) { 1652 munmap(dev->inflight_info->addr, dev->inflight_info->size); 1653 dev->inflight_info->addr = NULL; 1654 } 1655 1656 if (dev->inflight_info->fd >= 0) { 1657 close(dev->inflight_info->fd); 1658 dev->inflight_info->fd = -1; 1659 } 1660 1661 dev->inflight_info->addr = addr; 1662 dev->inflight_info->size = ctx->msg.payload.inflight.mmap_size = mmap_size; 1663 dev->inflight_info->fd = ctx->fds[0] = fd; 1664 ctx->msg.payload.inflight.mmap_offset = 0; 1665 ctx->fd_num = 1; 1666 1667 if (vq_is_packed(dev)) { 1668 for (i = 0; i < num_queues; i++) { 1669 inflight_packed = 1670 (struct rte_vhost_inflight_info_packed *)addr; 1671 inflight_packed->used_wrap_counter = 1; 1672 inflight_packed->old_used_wrap_counter = 1; 1673 for (j = 0; j < queue_size; j++) 1674 inflight_packed->desc[j].next = j + 1; 1675 addr = (void *)((char *)addr + pervq_inflight_size); 1676 } 1677 } 1678 1679 VHOST_LOG_CONFIG(INFO, "(%s) send inflight mmap_size: %"PRIu64"\n", 1680 dev->ifname, ctx->msg.payload.inflight.mmap_size); 1681 VHOST_LOG_CONFIG(INFO, "(%s) send inflight mmap_offset: %"PRIu64"\n", 1682 dev->ifname, ctx->msg.payload.inflight.mmap_offset); 1683 VHOST_LOG_CONFIG(INFO, "(%s) send 
inflight fd: %d\n", dev->ifname, ctx->fds[0]); 1684 1685 return RTE_VHOST_MSG_RESULT_REPLY; 1686 } 1687 1688 static int 1689 vhost_user_set_inflight_fd(struct virtio_net **pdev, 1690 struct vhu_msg_context *ctx, 1691 int main_fd __rte_unused) 1692 { 1693 uint64_t mmap_size, mmap_offset; 1694 uint16_t num_queues, queue_size; 1695 struct virtio_net *dev = *pdev; 1696 uint32_t pervq_inflight_size; 1697 struct vhost_virtqueue *vq; 1698 void *addr; 1699 int fd, i; 1700 int numa_node = SOCKET_ID_ANY; 1701 1702 fd = ctx->fds[0]; 1703 if (ctx->msg.size != sizeof(ctx->msg.payload.inflight) || fd < 0) { 1704 VHOST_LOG_CONFIG(ERR, "(%s) invalid set_inflight_fd message size is %d,fd is %d\n", 1705 dev->ifname, ctx->msg.size, fd); 1706 return RTE_VHOST_MSG_RESULT_ERR; 1707 } 1708 1709 mmap_size = ctx->msg.payload.inflight.mmap_size; 1710 mmap_offset = ctx->msg.payload.inflight.mmap_offset; 1711 num_queues = ctx->msg.payload.inflight.num_queues; 1712 queue_size = ctx->msg.payload.inflight.queue_size; 1713 1714 if (vq_is_packed(dev)) 1715 pervq_inflight_size = get_pervq_shm_size_packed(queue_size); 1716 else 1717 pervq_inflight_size = get_pervq_shm_size_split(queue_size); 1718 1719 VHOST_LOG_CONFIG(INFO, "(%s) set_inflight_fd mmap_size: %"PRIu64"\n", 1720 dev->ifname, mmap_size); 1721 VHOST_LOG_CONFIG(INFO, "(%s) set_inflight_fd mmap_offset: %"PRIu64"\n", 1722 dev->ifname, mmap_offset); 1723 VHOST_LOG_CONFIG(INFO, "(%s) set_inflight_fd num_queues: %u\n", dev->ifname, num_queues); 1724 VHOST_LOG_CONFIG(INFO, "(%s) set_inflight_fd queue_size: %u\n", dev->ifname, queue_size); 1725 VHOST_LOG_CONFIG(INFO, "(%s) set_inflight_fd fd: %d\n", dev->ifname, fd); 1726 VHOST_LOG_CONFIG(INFO, "(%s) set_inflight_fd pervq_inflight_size: %d\n", 1727 dev->ifname, pervq_inflight_size); 1728 1729 /* 1730 * If VQ 0 has already been allocated, try to allocate on the same 1731 * NUMA node. It can be reallocated later in numa_realloc(). 
1732 */ 1733 if (dev->nr_vring > 0) 1734 numa_node = dev->virtqueue[0]->numa_node; 1735 1736 if (!dev->inflight_info) { 1737 dev->inflight_info = rte_zmalloc_socket("inflight_info", 1738 sizeof(struct inflight_mem_info), 0, numa_node); 1739 if (dev->inflight_info == NULL) { 1740 VHOST_LOG_CONFIG(ERR, "(%s) failed to alloc dev inflight area\n", 1741 dev->ifname); 1742 return RTE_VHOST_MSG_RESULT_ERR; 1743 } 1744 dev->inflight_info->fd = -1; 1745 } 1746 1747 if (dev->inflight_info->addr) { 1748 munmap(dev->inflight_info->addr, dev->inflight_info->size); 1749 dev->inflight_info->addr = NULL; 1750 } 1751 1752 addr = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, 1753 fd, mmap_offset); 1754 if (addr == MAP_FAILED) { 1755 VHOST_LOG_CONFIG(ERR, "(%s) failed to mmap share memory.\n", dev->ifname); 1756 return RTE_VHOST_MSG_RESULT_ERR; 1757 } 1758 1759 if (dev->inflight_info->fd >= 0) { 1760 close(dev->inflight_info->fd); 1761 dev->inflight_info->fd = -1; 1762 } 1763 1764 dev->inflight_info->fd = fd; 1765 dev->inflight_info->addr = addr; 1766 dev->inflight_info->size = mmap_size; 1767 1768 for (i = 0; i < num_queues; i++) { 1769 vq = dev->virtqueue[i]; 1770 if (!vq) 1771 continue; 1772 1773 if (vq_is_packed(dev)) { 1774 vq->inflight_packed = addr; 1775 vq->inflight_packed->desc_num = queue_size; 1776 } else { 1777 vq->inflight_split = addr; 1778 vq->inflight_split->desc_num = queue_size; 1779 } 1780 addr = (void *)((char *)addr + pervq_inflight_size); 1781 } 1782 1783 return RTE_VHOST_MSG_RESULT_OK; 1784 } 1785 1786 static int 1787 vhost_user_set_vring_call(struct virtio_net **pdev, 1788 struct vhu_msg_context *ctx, 1789 int main_fd __rte_unused) 1790 { 1791 struct virtio_net *dev = *pdev; 1792 struct vhost_vring_file file; 1793 struct vhost_virtqueue *vq; 1794 int expected_fds; 1795 1796 expected_fds = (ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK) ? 0 : 1; 1797 if (validate_msg_fds(dev, ctx, expected_fds) != 0) 1798 return RTE_VHOST_MSG_RESULT_ERR; 1799 1800 file.index = ctx->msg.payload.u64 & VHOST_USER_VRING_IDX_MASK; 1801 if (ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK) 1802 file.fd = VIRTIO_INVALID_EVENTFD; 1803 else 1804 file.fd = ctx->fds[0]; 1805 VHOST_LOG_CONFIG(INFO, "(%s) vring call idx:%d file:%d\n", 1806 dev->ifname, file.index, file.fd); 1807 1808 vq = dev->virtqueue[file.index]; 1809 1810 if (vq->ready) { 1811 vq->ready = false; 1812 vhost_user_notify_queue_state(dev, file.index, 0); 1813 } 1814 1815 if (vq->callfd >= 0) 1816 close(vq->callfd); 1817 1818 vq->callfd = file.fd; 1819 1820 return RTE_VHOST_MSG_RESULT_OK; 1821 } 1822 1823 static int vhost_user_set_vring_err(struct virtio_net **pdev, 1824 struct vhu_msg_context *ctx, 1825 int main_fd __rte_unused) 1826 { 1827 struct virtio_net *dev = *pdev; 1828 int expected_fds; 1829 1830 expected_fds = (ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK) ? 
0 : 1; 1831 if (validate_msg_fds(dev, ctx, expected_fds) != 0) 1832 return RTE_VHOST_MSG_RESULT_ERR; 1833 1834 if (!(ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK)) 1835 close(ctx->fds[0]); 1836 VHOST_LOG_CONFIG(INFO, "(%s) not implemented\n", dev->ifname); 1837 1838 return RTE_VHOST_MSG_RESULT_OK; 1839 } 1840 1841 static int 1842 resubmit_desc_compare(const void *a, const void *b) 1843 { 1844 const struct rte_vhost_resubmit_desc *desc0 = a; 1845 const struct rte_vhost_resubmit_desc *desc1 = b; 1846 1847 if (desc1->counter > desc0->counter) 1848 return 1; 1849 1850 return -1; 1851 } 1852 1853 static int 1854 vhost_check_queue_inflights_split(struct virtio_net *dev, 1855 struct vhost_virtqueue *vq) 1856 { 1857 uint16_t i; 1858 uint16_t resubmit_num = 0, last_io, num; 1859 struct vring_used *used = vq->used; 1860 struct rte_vhost_resubmit_info *resubmit; 1861 struct rte_vhost_inflight_info_split *inflight_split; 1862 1863 if (!(dev->protocol_features & 1864 (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))) 1865 return RTE_VHOST_MSG_RESULT_OK; 1866 1867 /* The frontend may still not support the inflight feature 1868 * although we negotiate the protocol feature. 1869 */ 1870 if ((!vq->inflight_split)) 1871 return RTE_VHOST_MSG_RESULT_OK; 1872 1873 if (!vq->inflight_split->version) { 1874 vq->inflight_split->version = INFLIGHT_VERSION; 1875 return RTE_VHOST_MSG_RESULT_OK; 1876 } 1877 1878 if (vq->resubmit_inflight) 1879 return RTE_VHOST_MSG_RESULT_OK; 1880 1881 inflight_split = vq->inflight_split; 1882 vq->global_counter = 0; 1883 last_io = inflight_split->last_inflight_io; 1884 1885 if (inflight_split->used_idx != used->idx) { 1886 inflight_split->desc[last_io].inflight = 0; 1887 rte_atomic_thread_fence(__ATOMIC_SEQ_CST); 1888 inflight_split->used_idx = used->idx; 1889 } 1890 1891 for (i = 0; i < inflight_split->desc_num; i++) { 1892 if (inflight_split->desc[i].inflight == 1) 1893 resubmit_num++; 1894 } 1895 1896 vq->last_avail_idx += resubmit_num; 1897 1898 if (resubmit_num) { 1899 resubmit = rte_zmalloc_socket("resubmit", sizeof(struct rte_vhost_resubmit_info), 1900 0, vq->numa_node); 1901 if (!resubmit) { 1902 VHOST_LOG_CONFIG(ERR, 1903 "(%s) failed to allocate memory for resubmit info.\n", 1904 dev->ifname); 1905 return RTE_VHOST_MSG_RESULT_ERR; 1906 } 1907 1908 resubmit->resubmit_list = rte_zmalloc_socket("resubmit_list", 1909 resubmit_num * sizeof(struct rte_vhost_resubmit_desc), 1910 0, vq->numa_node); 1911 if (!resubmit->resubmit_list) { 1912 VHOST_LOG_CONFIG(ERR, 1913 "(%s) failed to allocate memory for inflight desc.\n", 1914 dev->ifname); 1915 rte_free(resubmit); 1916 return RTE_VHOST_MSG_RESULT_ERR; 1917 } 1918 1919 num = 0; 1920 for (i = 0; i < vq->inflight_split->desc_num; i++) { 1921 if (vq->inflight_split->desc[i].inflight == 1) { 1922 resubmit->resubmit_list[num].index = i; 1923 resubmit->resubmit_list[num].counter = 1924 inflight_split->desc[i].counter; 1925 num++; 1926 } 1927 } 1928 resubmit->resubmit_num = num; 1929 1930 if (resubmit->resubmit_num > 1) 1931 qsort(resubmit->resubmit_list, resubmit->resubmit_num, 1932 sizeof(struct rte_vhost_resubmit_desc), 1933 resubmit_desc_compare); 1934 1935 vq->global_counter = resubmit->resubmit_list[0].counter + 1; 1936 vq->resubmit_inflight = resubmit; 1937 } 1938 1939 return RTE_VHOST_MSG_RESULT_OK; 1940 } 1941 1942 static int 1943 vhost_check_queue_inflights_packed(struct virtio_net *dev, 1944 struct vhost_virtqueue *vq) 1945 { 1946 uint16_t i; 1947 uint16_t resubmit_num = 0, old_used_idx, num; 1948 struct rte_vhost_resubmit_info 
*resubmit; 1949 struct rte_vhost_inflight_info_packed *inflight_packed; 1950 1951 if (!(dev->protocol_features & 1952 (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))) 1953 return RTE_VHOST_MSG_RESULT_OK; 1954 1955 /* The frontend may still not support the inflight feature 1956 * although we negotiate the protocol feature. 1957 */ 1958 if ((!vq->inflight_packed)) 1959 return RTE_VHOST_MSG_RESULT_OK; 1960 1961 if (!vq->inflight_packed->version) { 1962 vq->inflight_packed->version = INFLIGHT_VERSION; 1963 return RTE_VHOST_MSG_RESULT_OK; 1964 } 1965 1966 if (vq->resubmit_inflight) 1967 return RTE_VHOST_MSG_RESULT_OK; 1968 1969 inflight_packed = vq->inflight_packed; 1970 vq->global_counter = 0; 1971 old_used_idx = inflight_packed->old_used_idx; 1972 1973 if (inflight_packed->used_idx != old_used_idx) { 1974 if (inflight_packed->desc[old_used_idx].inflight == 0) { 1975 inflight_packed->old_used_idx = 1976 inflight_packed->used_idx; 1977 inflight_packed->old_used_wrap_counter = 1978 inflight_packed->used_wrap_counter; 1979 inflight_packed->old_free_head = 1980 inflight_packed->free_head; 1981 } else { 1982 inflight_packed->used_idx = 1983 inflight_packed->old_used_idx; 1984 inflight_packed->used_wrap_counter = 1985 inflight_packed->old_used_wrap_counter; 1986 inflight_packed->free_head = 1987 inflight_packed->old_free_head; 1988 } 1989 } 1990 1991 for (i = 0; i < inflight_packed->desc_num; i++) { 1992 if (inflight_packed->desc[i].inflight == 1) 1993 resubmit_num++; 1994 } 1995 1996 if (resubmit_num) { 1997 resubmit = rte_zmalloc_socket("resubmit", sizeof(struct rte_vhost_resubmit_info), 1998 0, vq->numa_node); 1999 if (resubmit == NULL) { 2000 VHOST_LOG_CONFIG(ERR, 2001 "(%s) failed to allocate memory for resubmit info.\n", 2002 dev->ifname); 2003 return RTE_VHOST_MSG_RESULT_ERR; 2004 } 2005 2006 resubmit->resubmit_list = rte_zmalloc_socket("resubmit_list", 2007 resubmit_num * sizeof(struct rte_vhost_resubmit_desc), 2008 0, vq->numa_node); 2009 if (resubmit->resubmit_list == NULL) { 2010 VHOST_LOG_CONFIG(ERR, 2011 "(%s) failed to allocate memory for resubmit desc.\n", 2012 dev->ifname); 2013 rte_free(resubmit); 2014 return RTE_VHOST_MSG_RESULT_ERR; 2015 } 2016 2017 num = 0; 2018 for (i = 0; i < inflight_packed->desc_num; i++) { 2019 if (vq->inflight_packed->desc[i].inflight == 1) { 2020 resubmit->resubmit_list[num].index = i; 2021 resubmit->resubmit_list[num].counter = 2022 inflight_packed->desc[i].counter; 2023 num++; 2024 } 2025 } 2026 resubmit->resubmit_num = num; 2027 2028 if (resubmit->resubmit_num > 1) 2029 qsort(resubmit->resubmit_list, resubmit->resubmit_num, 2030 sizeof(struct rte_vhost_resubmit_desc), 2031 resubmit_desc_compare); 2032 2033 vq->global_counter = resubmit->resubmit_list[0].counter + 1; 2034 vq->resubmit_inflight = resubmit; 2035 } 2036 2037 return RTE_VHOST_MSG_RESULT_OK; 2038 } 2039 2040 static int 2041 vhost_user_set_vring_kick(struct virtio_net **pdev, 2042 struct vhu_msg_context *ctx, 2043 int main_fd __rte_unused) 2044 { 2045 struct virtio_net *dev = *pdev; 2046 struct vhost_vring_file file; 2047 struct vhost_virtqueue *vq; 2048 int expected_fds; 2049 2050 expected_fds = (ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK) ? 
0 : 1; 2051 if (validate_msg_fds(dev, ctx, expected_fds) != 0) 2052 return RTE_VHOST_MSG_RESULT_ERR; 2053 2054 file.index = ctx->msg.payload.u64 & VHOST_USER_VRING_IDX_MASK; 2055 if (ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK) 2056 file.fd = VIRTIO_INVALID_EVENTFD; 2057 else 2058 file.fd = ctx->fds[0]; 2059 VHOST_LOG_CONFIG(INFO, "(%s) vring kick idx:%d file:%d\n", 2060 dev->ifname, file.index, file.fd); 2061 2062 /* Interpret ring addresses only when ring is started. */ 2063 dev = translate_ring_addresses(dev, file.index); 2064 if (!dev) { 2065 if (file.fd != VIRTIO_INVALID_EVENTFD) 2066 close(file.fd); 2067 2068 return RTE_VHOST_MSG_RESULT_ERR; 2069 } 2070 2071 *pdev = dev; 2072 2073 vq = dev->virtqueue[file.index]; 2074 2075 /* 2076 * When VHOST_USER_F_PROTOCOL_FEATURES is not negotiated, 2077 * the ring starts already enabled. Otherwise, it is enabled via 2078 * the SET_VRING_ENABLE message. 2079 */ 2080 if (!(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) { 2081 vq->enabled = true; 2082 } 2083 2084 if (vq->ready) { 2085 vq->ready = false; 2086 vhost_user_notify_queue_state(dev, file.index, 0); 2087 } 2088 2089 if (vq->kickfd >= 0) 2090 close(vq->kickfd); 2091 vq->kickfd = file.fd; 2092 2093 if (vq_is_packed(dev)) { 2094 if (vhost_check_queue_inflights_packed(dev, vq)) { 2095 VHOST_LOG_CONFIG(ERR, "(%s) failed to inflights for vq: %d\n", 2096 dev->ifname, file.index); 2097 return RTE_VHOST_MSG_RESULT_ERR; 2098 } 2099 } else { 2100 if (vhost_check_queue_inflights_split(dev, vq)) { 2101 VHOST_LOG_CONFIG(ERR, "(%s) failed to inflights for vq: %d\n", 2102 dev->ifname, file.index); 2103 return RTE_VHOST_MSG_RESULT_ERR; 2104 } 2105 } 2106 2107 return RTE_VHOST_MSG_RESULT_OK; 2108 } 2109 2110 /* 2111 * when virtio is stopped, qemu will send us the GET_VRING_BASE message. 2112 */ 2113 static int 2114 vhost_user_get_vring_base(struct virtio_net **pdev, 2115 struct vhu_msg_context *ctx, 2116 int main_fd __rte_unused) 2117 { 2118 struct virtio_net *dev = *pdev; 2119 struct vhost_virtqueue *vq = dev->virtqueue[ctx->msg.payload.state.index]; 2120 uint64_t val; 2121 2122 if (validate_msg_fds(dev, ctx, 0) != 0) 2123 return RTE_VHOST_MSG_RESULT_ERR; 2124 2125 /* We have to stop the queue (virtio) if it is running. */ 2126 vhost_destroy_device_notify(dev); 2127 2128 dev->flags &= ~VIRTIO_DEV_READY; 2129 dev->flags &= ~VIRTIO_DEV_VDPA_CONFIGURED; 2130 2131 /* Here we are safe to get the indexes */ 2132 if (vq_is_packed(dev)) { 2133 /* 2134 * Bit[0:14]: avail index 2135 * Bit[15]: avail wrap counter 2136 */ 2137 val = vq->last_avail_idx & 0x7fff; 2138 val |= vq->avail_wrap_counter << 15; 2139 ctx->msg.payload.state.num = val; 2140 } else { 2141 ctx->msg.payload.state.num = vq->last_avail_idx; 2142 } 2143 2144 VHOST_LOG_CONFIG(INFO, "(%s) vring base idx:%d file:%d\n", 2145 dev->ifname, ctx->msg.payload.state.index, 2146 ctx->msg.payload.state.num); 2147 /* 2148 * Based on current qemu vhost-user implementation, this message is 2149 * sent and only sent in vhost_vring_stop. 2150 * TODO: cleanup the vring, it isn't usable since here. 
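 *
 * Note: the reply built above carries the last available index back to the
 * frontend (for packed rings, with the avail wrap counter in bit 15), so the
 * ring position can be restored later.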
2151 */ 2152 if (vq->kickfd >= 0) 2153 close(vq->kickfd); 2154 2155 vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD; 2156 2157 if (vq->callfd >= 0) 2158 close(vq->callfd); 2159 2160 vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD; 2161 2162 vq->signalled_used_valid = false; 2163 2164 if (vq_is_packed(dev)) { 2165 rte_free(vq->shadow_used_packed); 2166 vq->shadow_used_packed = NULL; 2167 } else { 2168 rte_free(vq->shadow_used_split); 2169 vq->shadow_used_split = NULL; 2170 } 2171 2172 rte_free(vq->batch_copy_elems); 2173 vq->batch_copy_elems = NULL; 2174 2175 rte_free(vq->log_cache); 2176 vq->log_cache = NULL; 2177 2178 ctx->msg.size = sizeof(ctx->msg.payload.state); 2179 ctx->fd_num = 0; 2180 2181 vhost_user_iotlb_flush_all(vq); 2182 2183 vring_invalidate(dev, vq); 2184 2185 return RTE_VHOST_MSG_RESULT_REPLY; 2186 } 2187 2188 /* 2189 * when virtio queues are ready to work, qemu will send us to 2190 * enable the virtio queue pair. 2191 */ 2192 static int 2193 vhost_user_set_vring_enable(struct virtio_net **pdev, 2194 struct vhu_msg_context *ctx, 2195 int main_fd __rte_unused) 2196 { 2197 struct virtio_net *dev = *pdev; 2198 bool enable = !!ctx->msg.payload.state.num; 2199 int index = (int)ctx->msg.payload.state.index; 2200 2201 if (validate_msg_fds(dev, ctx, 0) != 0) 2202 return RTE_VHOST_MSG_RESULT_ERR; 2203 2204 VHOST_LOG_CONFIG(INFO, "(%s) set queue enable: %d to qp idx: %d\n", 2205 dev->ifname, enable, index); 2206 2207 if (enable && dev->virtqueue[index]->async) { 2208 if (dev->virtqueue[index]->async->pkts_inflight_n) { 2209 VHOST_LOG_CONFIG(ERR, 2210 "(%s) failed to enable vring. Inflight packets must be completed first\n", 2211 dev->ifname); 2212 return RTE_VHOST_MSG_RESULT_ERR; 2213 } 2214 } 2215 2216 dev->virtqueue[index]->enabled = enable; 2217 2218 return RTE_VHOST_MSG_RESULT_OK; 2219 } 2220 2221 static int 2222 vhost_user_get_protocol_features(struct virtio_net **pdev, 2223 struct vhu_msg_context *ctx, 2224 int main_fd __rte_unused) 2225 { 2226 struct virtio_net *dev = *pdev; 2227 uint64_t features, protocol_features; 2228 2229 if (validate_msg_fds(dev, ctx, 0) != 0) 2230 return RTE_VHOST_MSG_RESULT_ERR; 2231 2232 rte_vhost_driver_get_features(dev->ifname, &features); 2233 rte_vhost_driver_get_protocol_features(dev->ifname, &protocol_features); 2234 2235 ctx->msg.payload.u64 = protocol_features; 2236 ctx->msg.size = sizeof(ctx->msg.payload.u64); 2237 ctx->fd_num = 0; 2238 2239 return RTE_VHOST_MSG_RESULT_REPLY; 2240 } 2241 2242 static int 2243 vhost_user_set_protocol_features(struct virtio_net **pdev, 2244 struct vhu_msg_context *ctx, 2245 int main_fd __rte_unused) 2246 { 2247 struct virtio_net *dev = *pdev; 2248 uint64_t protocol_features = ctx->msg.payload.u64; 2249 uint64_t slave_protocol_features = 0; 2250 2251 if (validate_msg_fds(dev, ctx, 0) != 0) 2252 return RTE_VHOST_MSG_RESULT_ERR; 2253 2254 rte_vhost_driver_get_protocol_features(dev->ifname, 2255 &slave_protocol_features); 2256 if (protocol_features & ~slave_protocol_features) { 2257 VHOST_LOG_CONFIG(ERR, "(%s) received invalid protocol features.\n", dev->ifname); 2258 return RTE_VHOST_MSG_RESULT_ERR; 2259 } 2260 2261 dev->protocol_features = protocol_features; 2262 VHOST_LOG_CONFIG(INFO, "(%s) negotiated Vhost-user protocol features: 0x%" PRIx64 "\n", 2263 dev->ifname, dev->protocol_features); 2264 2265 return RTE_VHOST_MSG_RESULT_OK; 2266 } 2267 2268 static int 2269 vhost_user_set_log_base(struct virtio_net **pdev, 2270 struct vhu_msg_context *ctx, 2271 int main_fd __rte_unused) 2272 { 2273 struct virtio_net *dev = *pdev; 2274 
int fd = ctx->fds[0]; 2275 uint64_t size, off; 2276 void *addr; 2277 uint32_t i; 2278 2279 if (validate_msg_fds(dev, ctx, 1) != 0) 2280 return RTE_VHOST_MSG_RESULT_ERR; 2281 2282 if (fd < 0) { 2283 VHOST_LOG_CONFIG(ERR, "(%s) invalid log fd: %d\n", dev->ifname, fd); 2284 return RTE_VHOST_MSG_RESULT_ERR; 2285 } 2286 2287 if (ctx->msg.size != sizeof(VhostUserLog)) { 2288 VHOST_LOG_CONFIG(ERR, "(%s) invalid log base msg size: %"PRId32" != %d\n", 2289 dev->ifname, ctx->msg.size, (int)sizeof(VhostUserLog)); 2290 goto close_msg_fds; 2291 } 2292 2293 size = ctx->msg.payload.log.mmap_size; 2294 off = ctx->msg.payload.log.mmap_offset; 2295 2296 /* Check for mmap size and offset overflow. */ 2297 if (off >= -size) { 2298 VHOST_LOG_CONFIG(ERR, 2299 "(%s) log offset %#"PRIx64" and log size %#"PRIx64" overflow\n", 2300 dev->ifname, off, size); 2301 goto close_msg_fds; 2302 } 2303 2304 VHOST_LOG_CONFIG(INFO, "(%s) log mmap size: %"PRId64", offset: %"PRId64"\n", 2305 dev->ifname, size, off); 2306 2307 /* 2308 * mmap from 0 to workaround a hugepage mmap bug: mmap will 2309 * fail when offset is not page size aligned. 2310 */ 2311 addr = mmap(0, size + off, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); 2312 close(fd); 2313 if (addr == MAP_FAILED) { 2314 VHOST_LOG_CONFIG(ERR, "(%s) mmap log base failed!\n", dev->ifname); 2315 return RTE_VHOST_MSG_RESULT_ERR; 2316 } 2317 2318 /* 2319 * Free previously mapped log memory on occasionally 2320 * multiple VHOST_USER_SET_LOG_BASE. 2321 */ 2322 if (dev->log_addr) { 2323 munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); 2324 } 2325 dev->log_addr = (uint64_t)(uintptr_t)addr; 2326 dev->log_base = dev->log_addr + off; 2327 dev->log_size = size; 2328 2329 for (i = 0; i < dev->nr_vring; i++) { 2330 struct vhost_virtqueue *vq = dev->virtqueue[i]; 2331 2332 rte_free(vq->log_cache); 2333 vq->log_cache = NULL; 2334 vq->log_cache_nb_elem = 0; 2335 vq->log_cache = rte_malloc_socket("vq log cache", 2336 sizeof(struct log_cache_entry) * VHOST_LOG_CACHE_NR, 2337 0, vq->numa_node); 2338 /* 2339 * If log cache alloc fail, don't fail migration, but no 2340 * caching will be done, which will impact performance 2341 */ 2342 if (!vq->log_cache) 2343 VHOST_LOG_CONFIG(ERR, "(%s) failed to allocate VQ logging cache\n", 2344 dev->ifname); 2345 } 2346 2347 /* 2348 * The spec is not clear about it (yet), but QEMU doesn't expect 2349 * any payload in the reply. 2350 */ 2351 ctx->msg.size = 0; 2352 ctx->fd_num = 0; 2353 2354 return RTE_VHOST_MSG_RESULT_REPLY; 2355 2356 close_msg_fds: 2357 close_msg_fds(ctx); 2358 return RTE_VHOST_MSG_RESULT_ERR; 2359 } 2360 2361 static int vhost_user_set_log_fd(struct virtio_net **pdev, 2362 struct vhu_msg_context *ctx, 2363 int main_fd __rte_unused) 2364 { 2365 struct virtio_net *dev = *pdev; 2366 2367 if (validate_msg_fds(dev, ctx, 1) != 0) 2368 return RTE_VHOST_MSG_RESULT_ERR; 2369 2370 close(ctx->fds[0]); 2371 VHOST_LOG_CONFIG(INFO, "(%s) not implemented.\n", dev->ifname); 2372 2373 return RTE_VHOST_MSG_RESULT_OK; 2374 } 2375 2376 /* 2377 * An rarp packet is constructed and broadcasted to notify switches about 2378 * the new location of the migrated VM, so that packets from outside will 2379 * not be lost after migration. 2380 * 2381 * However, we don't actually "send" a rarp packet here, instead, we set 2382 * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it. 
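 *
 * A minimal consumer-side sketch (hypothetical application code, not part
 * of this file): the application keeps polling as usual, and the injected
 * RARP shows up as a regular mbuf in the dequeued burst:
 *
 *	struct rte_mbuf *pkts[32];
 *	uint16_t n = rte_vhost_dequeue_burst(vid, queue_id, mbuf_pool,
 *			pkts, RTE_DIM(pkts));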
2383 */ 2384 static int 2385 vhost_user_send_rarp(struct virtio_net **pdev, 2386 struct vhu_msg_context *ctx, 2387 int main_fd __rte_unused) 2388 { 2389 struct virtio_net *dev = *pdev; 2390 uint8_t *mac = (uint8_t *)&ctx->msg.payload.u64; 2391 struct rte_vdpa_device *vdpa_dev; 2392 2393 if (validate_msg_fds(dev, ctx, 0) != 0) 2394 return RTE_VHOST_MSG_RESULT_ERR; 2395 2396 VHOST_LOG_CONFIG(DEBUG, "(%s) MAC: " RTE_ETHER_ADDR_PRT_FMT "\n", 2397 dev->ifname, mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); 2398 memcpy(dev->mac.addr_bytes, mac, 6); 2399 2400 /* 2401 * Set the flag to inject a RARP broadcast packet at 2402 * rte_vhost_dequeue_burst(). 2403 * 2404 * __ATOMIC_RELEASE ordering is for making sure the mac is 2405 * copied before the flag is set. 2406 */ 2407 __atomic_store_n(&dev->broadcast_rarp, 1, __ATOMIC_RELEASE); 2408 vdpa_dev = dev->vdpa_dev; 2409 if (vdpa_dev && vdpa_dev->ops->migration_done) 2410 vdpa_dev->ops->migration_done(dev->vid); 2411 2412 return RTE_VHOST_MSG_RESULT_OK; 2413 } 2414 2415 static int 2416 vhost_user_net_set_mtu(struct virtio_net **pdev, 2417 struct vhu_msg_context *ctx, 2418 int main_fd __rte_unused) 2419 { 2420 struct virtio_net *dev = *pdev; 2421 2422 if (validate_msg_fds(dev, ctx, 0) != 0) 2423 return RTE_VHOST_MSG_RESULT_ERR; 2424 2425 if (ctx->msg.payload.u64 < VIRTIO_MIN_MTU || 2426 ctx->msg.payload.u64 > VIRTIO_MAX_MTU) { 2427 VHOST_LOG_CONFIG(ERR, "(%s) invalid MTU size (%"PRIu64")\n", 2428 dev->ifname, ctx->msg.payload.u64); 2429 2430 return RTE_VHOST_MSG_RESULT_ERR; 2431 } 2432 2433 dev->mtu = ctx->msg.payload.u64; 2434 2435 return RTE_VHOST_MSG_RESULT_OK; 2436 } 2437 2438 static int 2439 vhost_user_set_req_fd(struct virtio_net **pdev, 2440 struct vhu_msg_context *ctx, 2441 int main_fd __rte_unused) 2442 { 2443 struct virtio_net *dev = *pdev; 2444 int fd = ctx->fds[0]; 2445 2446 if (validate_msg_fds(dev, ctx, 1) != 0) 2447 return RTE_VHOST_MSG_RESULT_ERR; 2448 2449 if (fd < 0) { 2450 VHOST_LOG_CONFIG(ERR, "(%s) invalid file descriptor for slave channel (%d)\n", 2451 dev->ifname, fd); 2452 return RTE_VHOST_MSG_RESULT_ERR; 2453 } 2454 2455 if (dev->slave_req_fd >= 0) 2456 close(dev->slave_req_fd); 2457 2458 dev->slave_req_fd = fd; 2459 2460 return RTE_VHOST_MSG_RESULT_OK; 2461 } 2462 2463 static int 2464 is_vring_iotlb_split(struct vhost_virtqueue *vq, struct vhost_iotlb_msg *imsg) 2465 { 2466 struct vhost_vring_addr *ra; 2467 uint64_t start, end, len; 2468 2469 start = imsg->iova; 2470 end = start + imsg->size; 2471 2472 ra = &vq->ring_addrs; 2473 len = sizeof(struct vring_desc) * vq->size; 2474 if (ra->desc_user_addr < end && (ra->desc_user_addr + len) > start) 2475 return 1; 2476 2477 len = sizeof(struct vring_avail) + sizeof(uint16_t) * vq->size; 2478 if (ra->avail_user_addr < end && (ra->avail_user_addr + len) > start) 2479 return 1; 2480 2481 len = sizeof(struct vring_used) + 2482 sizeof(struct vring_used_elem) * vq->size; 2483 if (ra->used_user_addr < end && (ra->used_user_addr + len) > start) 2484 return 1; 2485 2486 if (ra->flags & (1 << VHOST_VRING_F_LOG)) { 2487 len = sizeof(uint64_t); 2488 if (ra->log_guest_addr < end && 2489 (ra->log_guest_addr + len) > start) 2490 return 1; 2491 } 2492 2493 return 0; 2494 } 2495 2496 static int 2497 is_vring_iotlb_packed(struct vhost_virtqueue *vq, struct vhost_iotlb_msg *imsg) 2498 { 2499 struct vhost_vring_addr *ra; 2500 uint64_t start, end, len; 2501 2502 start = imsg->iova; 2503 end = start + imsg->size; 2504 2505 ra = &vq->ring_addrs; 2506 len = sizeof(struct vring_packed_desc) * vq->size; 
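	/*
	 * Each check below is an overlap test between the IOTLB message range
	 * [start, end) and one ring area [addr, addr + len).
	 */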
2507 if (ra->desc_user_addr < end && (ra->desc_user_addr + len) > start) 2508 return 1; 2509 2510 len = sizeof(struct vring_packed_desc_event); 2511 if (ra->avail_user_addr < end && (ra->avail_user_addr + len) > start) 2512 return 1; 2513 2514 len = sizeof(struct vring_packed_desc_event); 2515 if (ra->used_user_addr < end && (ra->used_user_addr + len) > start) 2516 return 1; 2517 2518 if (ra->flags & (1 << VHOST_VRING_F_LOG)) { 2519 len = sizeof(uint64_t); 2520 if (ra->log_guest_addr < end && 2521 (ra->log_guest_addr + len) > start) 2522 return 1; 2523 } 2524 2525 return 0; 2526 } 2527 2528 static int is_vring_iotlb(struct virtio_net *dev, 2529 struct vhost_virtqueue *vq, 2530 struct vhost_iotlb_msg *imsg) 2531 { 2532 if (vq_is_packed(dev)) 2533 return is_vring_iotlb_packed(vq, imsg); 2534 else 2535 return is_vring_iotlb_split(vq, imsg); 2536 } 2537 2538 static int 2539 vhost_user_iotlb_msg(struct virtio_net **pdev, 2540 struct vhu_msg_context *ctx, 2541 int main_fd __rte_unused) 2542 { 2543 struct virtio_net *dev = *pdev; 2544 struct vhost_iotlb_msg *imsg = &ctx->msg.payload.iotlb; 2545 uint16_t i; 2546 uint64_t vva, len; 2547 2548 if (validate_msg_fds(dev, ctx, 0) != 0) 2549 return RTE_VHOST_MSG_RESULT_ERR; 2550 2551 switch (imsg->type) { 2552 case VHOST_IOTLB_UPDATE: 2553 len = imsg->size; 2554 vva = qva_to_vva(dev, imsg->uaddr, &len); 2555 if (!vva) 2556 return RTE_VHOST_MSG_RESULT_ERR; 2557 2558 for (i = 0; i < dev->nr_vring; i++) { 2559 struct vhost_virtqueue *vq = dev->virtqueue[i]; 2560 2561 if (!vq) 2562 continue; 2563 2564 vhost_user_iotlb_cache_insert(dev, vq, imsg->iova, vva, 2565 len, imsg->perm); 2566 2567 if (is_vring_iotlb(dev, vq, imsg)) { 2568 rte_spinlock_lock(&vq->access_lock); 2569 *pdev = dev = translate_ring_addresses(dev, i); 2570 rte_spinlock_unlock(&vq->access_lock); 2571 } 2572 } 2573 break; 2574 case VHOST_IOTLB_INVALIDATE: 2575 for (i = 0; i < dev->nr_vring; i++) { 2576 struct vhost_virtqueue *vq = dev->virtqueue[i]; 2577 2578 if (!vq) 2579 continue; 2580 2581 vhost_user_iotlb_cache_remove(vq, imsg->iova, 2582 imsg->size); 2583 2584 if (is_vring_iotlb(dev, vq, imsg)) { 2585 rte_spinlock_lock(&vq->access_lock); 2586 vring_invalidate(dev, vq); 2587 rte_spinlock_unlock(&vq->access_lock); 2588 } 2589 } 2590 break; 2591 default: 2592 VHOST_LOG_CONFIG(ERR, "(%s) invalid IOTLB message type (%d)\n", 2593 dev->ifname, imsg->type); 2594 return RTE_VHOST_MSG_RESULT_ERR; 2595 } 2596 2597 return RTE_VHOST_MSG_RESULT_OK; 2598 } 2599 2600 static int 2601 vhost_user_set_postcopy_advise(struct virtio_net **pdev, 2602 struct vhu_msg_context *ctx, 2603 int main_fd __rte_unused) 2604 { 2605 struct virtio_net *dev = *pdev; 2606 #ifdef RTE_LIBRTE_VHOST_POSTCOPY 2607 struct uffdio_api api_struct; 2608 2609 if (validate_msg_fds(dev, ctx, 0) != 0) 2610 return RTE_VHOST_MSG_RESULT_ERR; 2611 2612 dev->postcopy_ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); 2613 2614 if (dev->postcopy_ufd == -1) { 2615 VHOST_LOG_CONFIG(ERR, "(%s) userfaultfd not available: %s\n", 2616 dev->ifname, strerror(errno)); 2617 return RTE_VHOST_MSG_RESULT_ERR; 2618 } 2619 api_struct.api = UFFD_API; 2620 api_struct.features = 0; 2621 if (ioctl(dev->postcopy_ufd, UFFDIO_API, &api_struct)) { 2622 VHOST_LOG_CONFIG(ERR, "(%s) UFFDIO_API ioctl failure: %s\n", 2623 dev->ifname, strerror(errno)); 2624 close(dev->postcopy_ufd); 2625 dev->postcopy_ufd = -1; 2626 return RTE_VHOST_MSG_RESULT_ERR; 2627 } 2628 ctx->fds[0] = dev->postcopy_ufd; 2629 ctx->fd_num = 1; 2630 2631 return RTE_VHOST_MSG_RESULT_REPLY; 2632 
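	/*
	 * The userfaultfd created above is returned to the frontend in the
	 * reply, so it can resolve the backend's page faults during postcopy
	 * live migration.
	 */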
#else 2633 dev->postcopy_ufd = -1; 2634 ctx->fd_num = 0; 2635 2636 return RTE_VHOST_MSG_RESULT_ERR; 2637 #endif 2638 } 2639 2640 static int 2641 vhost_user_set_postcopy_listen(struct virtio_net **pdev, 2642 struct vhu_msg_context *ctx __rte_unused, 2643 int main_fd __rte_unused) 2644 { 2645 struct virtio_net *dev = *pdev; 2646 2647 if (validate_msg_fds(dev, ctx, 0) != 0) 2648 return RTE_VHOST_MSG_RESULT_ERR; 2649 2650 if (dev->mem && dev->mem->nregions) { 2651 VHOST_LOG_CONFIG(ERR, "(%s) regions already registered at postcopy-listen\n", 2652 dev->ifname); 2653 return RTE_VHOST_MSG_RESULT_ERR; 2654 } 2655 dev->postcopy_listening = 1; 2656 2657 return RTE_VHOST_MSG_RESULT_OK; 2658 } 2659 2660 static int 2661 vhost_user_postcopy_end(struct virtio_net **pdev, 2662 struct vhu_msg_context *ctx, 2663 int main_fd __rte_unused) 2664 { 2665 struct virtio_net *dev = *pdev; 2666 2667 if (validate_msg_fds(dev, ctx, 0) != 0) 2668 return RTE_VHOST_MSG_RESULT_ERR; 2669 2670 dev->postcopy_listening = 0; 2671 if (dev->postcopy_ufd >= 0) { 2672 close(dev->postcopy_ufd); 2673 dev->postcopy_ufd = -1; 2674 } 2675 2676 ctx->msg.payload.u64 = 0; 2677 ctx->msg.size = sizeof(ctx->msg.payload.u64); 2678 ctx->fd_num = 0; 2679 2680 return RTE_VHOST_MSG_RESULT_REPLY; 2681 } 2682 2683 static int 2684 vhost_user_get_status(struct virtio_net **pdev, 2685 struct vhu_msg_context *ctx, 2686 int main_fd __rte_unused) 2687 { 2688 struct virtio_net *dev = *pdev; 2689 2690 if (validate_msg_fds(dev, ctx, 0) != 0) 2691 return RTE_VHOST_MSG_RESULT_ERR; 2692 2693 ctx->msg.payload.u64 = dev->status; 2694 ctx->msg.size = sizeof(ctx->msg.payload.u64); 2695 ctx->fd_num = 0; 2696 2697 return RTE_VHOST_MSG_RESULT_REPLY; 2698 } 2699 2700 static int 2701 vhost_user_set_status(struct virtio_net **pdev, 2702 struct vhu_msg_context *ctx, 2703 int main_fd __rte_unused) 2704 { 2705 struct virtio_net *dev = *pdev; 2706 2707 if (validate_msg_fds(dev, ctx, 0) != 0) 2708 return RTE_VHOST_MSG_RESULT_ERR; 2709 2710 /* As per Virtio specification, the device status is 8bits long */ 2711 if (ctx->msg.payload.u64 > UINT8_MAX) { 2712 VHOST_LOG_CONFIG(ERR, "(%s) invalid VHOST_USER_SET_STATUS payload 0x%" PRIx64 "\n", 2713 dev->ifname, ctx->msg.payload.u64); 2714 return RTE_VHOST_MSG_RESULT_ERR; 2715 } 2716 2717 dev->status = ctx->msg.payload.u64; 2718 2719 if ((dev->status & VIRTIO_DEVICE_STATUS_FEATURES_OK) && 2720 (dev->flags & VIRTIO_DEV_FEATURES_FAILED)) { 2721 VHOST_LOG_CONFIG(ERR, 2722 "(%s) FEATURES_OK bit is set but feature negotiation failed\n", 2723 dev->ifname); 2724 /* 2725 * Clear the bit to let the driver know about the feature 2726 * negotiation failure 2727 */ 2728 dev->status &= ~VIRTIO_DEVICE_STATUS_FEATURES_OK; 2729 } 2730 2731 VHOST_LOG_CONFIG(INFO, "(%s) new device status(0x%08x):\n", dev->ifname, 2732 dev->status); 2733 VHOST_LOG_CONFIG(INFO, "(%s)\t-RESET: %u\n", dev->ifname, 2734 (dev->status == VIRTIO_DEVICE_STATUS_RESET)); 2735 VHOST_LOG_CONFIG(INFO, "(%s)\t-ACKNOWLEDGE: %u\n", dev->ifname, 2736 !!(dev->status & VIRTIO_DEVICE_STATUS_ACK)); 2737 VHOST_LOG_CONFIG(INFO, "(%s)\t-DRIVER: %u\n", dev->ifname, 2738 !!(dev->status & VIRTIO_DEVICE_STATUS_DRIVER)); 2739 VHOST_LOG_CONFIG(INFO, "(%s)\t-FEATURES_OK: %u\n", dev->ifname, 2740 !!(dev->status & VIRTIO_DEVICE_STATUS_FEATURES_OK)); 2741 VHOST_LOG_CONFIG(INFO, "(%s)\t-DRIVER_OK: %u\n", dev->ifname, 2742 !!(dev->status & VIRTIO_DEVICE_STATUS_DRIVER_OK)); 2743 VHOST_LOG_CONFIG(INFO, "(%s)\t-DEVICE_NEED_RESET: %u\n", dev->ifname, 2744 !!(dev->status & 
VIRTIO_DEVICE_STATUS_DEV_NEED_RESET)); 2745 VHOST_LOG_CONFIG(INFO, "(%s)\t-FAILED: %u\n", dev->ifname, 2746 !!(dev->status & VIRTIO_DEVICE_STATUS_FAILED)); 2747 2748 return RTE_VHOST_MSG_RESULT_OK; 2749 } 2750 2751 typedef int (*vhost_message_handler_t)(struct virtio_net **pdev, 2752 struct vhu_msg_context *ctx, 2753 int main_fd); 2754 2755 static vhost_message_handler_t vhost_message_handlers[VHOST_USER_MAX] = { 2756 [VHOST_USER_NONE] = NULL, 2757 [VHOST_USER_GET_FEATURES] = vhost_user_get_features, 2758 [VHOST_USER_SET_FEATURES] = vhost_user_set_features, 2759 [VHOST_USER_SET_OWNER] = vhost_user_set_owner, 2760 [VHOST_USER_RESET_OWNER] = vhost_user_reset_owner, 2761 [VHOST_USER_SET_MEM_TABLE] = vhost_user_set_mem_table, 2762 [VHOST_USER_SET_LOG_BASE] = vhost_user_set_log_base, 2763 [VHOST_USER_SET_LOG_FD] = vhost_user_set_log_fd, 2764 [VHOST_USER_SET_VRING_NUM] = vhost_user_set_vring_num, 2765 [VHOST_USER_SET_VRING_ADDR] = vhost_user_set_vring_addr, 2766 [VHOST_USER_SET_VRING_BASE] = vhost_user_set_vring_base, 2767 [VHOST_USER_GET_VRING_BASE] = vhost_user_get_vring_base, 2768 [VHOST_USER_SET_VRING_KICK] = vhost_user_set_vring_kick, 2769 [VHOST_USER_SET_VRING_CALL] = vhost_user_set_vring_call, 2770 [VHOST_USER_SET_VRING_ERR] = vhost_user_set_vring_err, 2771 [VHOST_USER_GET_PROTOCOL_FEATURES] = vhost_user_get_protocol_features, 2772 [VHOST_USER_SET_PROTOCOL_FEATURES] = vhost_user_set_protocol_features, 2773 [VHOST_USER_GET_QUEUE_NUM] = vhost_user_get_queue_num, 2774 [VHOST_USER_SET_VRING_ENABLE] = vhost_user_set_vring_enable, 2775 [VHOST_USER_SEND_RARP] = vhost_user_send_rarp, 2776 [VHOST_USER_NET_SET_MTU] = vhost_user_net_set_mtu, 2777 [VHOST_USER_SET_SLAVE_REQ_FD] = vhost_user_set_req_fd, 2778 [VHOST_USER_IOTLB_MSG] = vhost_user_iotlb_msg, 2779 [VHOST_USER_POSTCOPY_ADVISE] = vhost_user_set_postcopy_advise, 2780 [VHOST_USER_POSTCOPY_LISTEN] = vhost_user_set_postcopy_listen, 2781 [VHOST_USER_POSTCOPY_END] = vhost_user_postcopy_end, 2782 [VHOST_USER_GET_INFLIGHT_FD] = vhost_user_get_inflight_fd, 2783 [VHOST_USER_SET_INFLIGHT_FD] = vhost_user_set_inflight_fd, 2784 [VHOST_USER_SET_STATUS] = vhost_user_set_status, 2785 [VHOST_USER_GET_STATUS] = vhost_user_get_status, 2786 }; 2787 2788 /* return bytes# of read on success or negative val on failure. 
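 * A return value of 0 means the peer closed the connection. The payload,
 * if any, is only read after the fixed-size header has been validated.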
*/ 2789 static int 2790 read_vhost_message(struct virtio_net *dev, int sockfd, struct vhu_msg_context *ctx) 2791 { 2792 int ret; 2793 2794 ret = read_fd_message(dev->ifname, sockfd, (char *)&ctx->msg, VHOST_USER_HDR_SIZE, 2795 ctx->fds, VHOST_MEMORY_MAX_NREGIONS, &ctx->fd_num); 2796 if (ret <= 0) { 2797 return ret; 2798 } else if (ret != VHOST_USER_HDR_SIZE) { 2799 VHOST_LOG_CONFIG(ERR, "(%s) Unexpected header size read\n", dev->ifname); 2800 close_msg_fds(ctx); 2801 return -1; 2802 } 2803 2804 if (ctx->msg.size) { 2805 if (ctx->msg.size > sizeof(ctx->msg.payload)) { 2806 VHOST_LOG_CONFIG(ERR, "(%s) invalid msg size: %d\n", 2807 dev->ifname, ctx->msg.size); 2808 return -1; 2809 } 2810 ret = read(sockfd, &ctx->msg.payload, ctx->msg.size); 2811 if (ret <= 0) 2812 return ret; 2813 if (ret != (int)ctx->msg.size) { 2814 VHOST_LOG_CONFIG(ERR, "(%s) read control message failed\n", dev->ifname); 2815 return -1; 2816 } 2817 } 2818 2819 return ret; 2820 } 2821 2822 static int 2823 send_vhost_message(struct virtio_net *dev, int sockfd, struct vhu_msg_context *ctx) 2824 { 2825 if (!ctx) 2826 return 0; 2827 2828 return send_fd_message(dev->ifname, sockfd, (char *)&ctx->msg, 2829 VHOST_USER_HDR_SIZE + ctx->msg.size, ctx->fds, ctx->fd_num); 2830 } 2831 2832 static int 2833 send_vhost_reply(struct virtio_net *dev, int sockfd, struct vhu_msg_context *ctx) 2834 { 2835 if (!ctx) 2836 return 0; 2837 2838 ctx->msg.flags &= ~VHOST_USER_VERSION_MASK; 2839 ctx->msg.flags &= ~VHOST_USER_NEED_REPLY; 2840 ctx->msg.flags |= VHOST_USER_VERSION; 2841 ctx->msg.flags |= VHOST_USER_REPLY_MASK; 2842 2843 return send_vhost_message(dev, sockfd, ctx); 2844 } 2845 2846 static int 2847 send_vhost_slave_message(struct virtio_net *dev, 2848 struct vhu_msg_context *ctx) 2849 { 2850 int ret; 2851 2852 if (ctx->msg.flags & VHOST_USER_NEED_REPLY) 2853 rte_spinlock_lock(&dev->slave_req_lock); 2854 2855 ret = send_vhost_message(dev, dev->slave_req_fd, ctx); 2856 if (ret < 0 && (ctx->msg.flags & VHOST_USER_NEED_REPLY)) 2857 rte_spinlock_unlock(&dev->slave_req_lock); 2858 2859 return ret; 2860 } 2861 2862 /* 2863 * Allocate a queue pair if it hasn't been allocated yet 2864 */ 2865 static int 2866 vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, 2867 struct vhu_msg_context *ctx) 2868 { 2869 uint32_t vring_idx; 2870 2871 switch (ctx->msg.request.master) { 2872 case VHOST_USER_SET_VRING_KICK: 2873 case VHOST_USER_SET_VRING_CALL: 2874 case VHOST_USER_SET_VRING_ERR: 2875 vring_idx = ctx->msg.payload.u64 & VHOST_USER_VRING_IDX_MASK; 2876 break; 2877 case VHOST_USER_SET_VRING_NUM: 2878 case VHOST_USER_SET_VRING_BASE: 2879 case VHOST_USER_GET_VRING_BASE: 2880 case VHOST_USER_SET_VRING_ENABLE: 2881 vring_idx = ctx->msg.payload.state.index; 2882 break; 2883 case VHOST_USER_SET_VRING_ADDR: 2884 vring_idx = ctx->msg.payload.addr.index; 2885 break; 2886 default: 2887 return 0; 2888 } 2889 2890 if (vring_idx >= VHOST_MAX_VRING) { 2891 VHOST_LOG_CONFIG(ERR, "(%s) invalid vring index: %u\n", dev->ifname, vring_idx); 2892 return -1; 2893 } 2894 2895 if (dev->virtqueue[vring_idx]) 2896 return 0; 2897 2898 return alloc_vring_queue(dev, vring_idx); 2899 } 2900 2901 static void 2902 vhost_user_lock_all_queue_pairs(struct virtio_net *dev) 2903 { 2904 unsigned int i = 0; 2905 unsigned int vq_num = 0; 2906 2907 while (vq_num < dev->nr_vring) { 2908 struct vhost_virtqueue *vq = dev->virtqueue[i]; 2909 2910 if (vq) { 2911 rte_spinlock_lock(&vq->access_lock); 2912 vq_num++; 2913 } 2914 i++; 2915 } 2916 } 2917 2918 static void 2919 
vhost_user_unlock_all_queue_pairs(struct virtio_net *dev) 2920 { 2921 unsigned int i = 0; 2922 unsigned int vq_num = 0; 2923 2924 while (vq_num < dev->nr_vring) { 2925 struct vhost_virtqueue *vq = dev->virtqueue[i]; 2926 2927 if (vq) { 2928 rte_spinlock_unlock(&vq->access_lock); 2929 vq_num++; 2930 } 2931 i++; 2932 } 2933 } 2934 2935 int 2936 vhost_user_msg_handler(int vid, int fd) 2937 { 2938 struct virtio_net *dev; 2939 struct vhu_msg_context ctx; 2940 struct rte_vdpa_device *vdpa_dev; 2941 int ret; 2942 int unlock_required = 0; 2943 bool handled; 2944 int request; 2945 uint32_t i; 2946 2947 dev = get_device(vid); 2948 if (dev == NULL) 2949 return -1; 2950 2951 if (!dev->notify_ops) { 2952 dev->notify_ops = vhost_driver_callback_get(dev->ifname); 2953 if (!dev->notify_ops) { 2954 VHOST_LOG_CONFIG(ERR, "(%s) failed to get callback ops for driver\n", 2955 dev->ifname); 2956 return -1; 2957 } 2958 } 2959 2960 ret = read_vhost_message(dev, fd, &ctx); 2961 if (ret <= 0) { 2962 if (ret < 0) 2963 VHOST_LOG_CONFIG(ERR, "(%s) vhost read message failed\n", dev->ifname); 2964 else 2965 VHOST_LOG_CONFIG(INFO, "(%s) vhost peer closed\n", dev->ifname); 2966 2967 return -1; 2968 } 2969 2970 ret = 0; 2971 request = ctx.msg.request.master; 2972 if (request > VHOST_USER_NONE && request < VHOST_USER_MAX && 2973 vhost_message_str[request]) { 2974 if (request != VHOST_USER_IOTLB_MSG) 2975 VHOST_LOG_CONFIG(INFO, "(%s) read message %s\n", 2976 dev->ifname, vhost_message_str[request]); 2977 else 2978 VHOST_LOG_CONFIG(DEBUG, "(%s) read message %s\n", 2979 dev->ifname, vhost_message_str[request]); 2980 } else { 2981 VHOST_LOG_CONFIG(DEBUG, "(%s) external request %d\n", dev->ifname, request); 2982 } 2983 2984 ret = vhost_user_check_and_alloc_queue_pair(dev, &ctx); 2985 if (ret < 0) { 2986 VHOST_LOG_CONFIG(ERR, "(%s) failed to alloc queue\n", dev->ifname); 2987 return -1; 2988 } 2989 2990 /* 2991 * Note: we don't lock all queues on VHOST_USER_GET_VRING_BASE 2992 * and VHOST_USER_RESET_OWNER, since it is sent when virtio stops 2993 * and device is destroyed. destroy_device waits for queues to be 2994 * inactive, so it is safe. Otherwise taking the access_lock 2995 * would cause a dead lock. 
2996 */ 2997 switch (request) { 2998 case VHOST_USER_SET_FEATURES: 2999 case VHOST_USER_SET_PROTOCOL_FEATURES: 3000 case VHOST_USER_SET_OWNER: 3001 case VHOST_USER_SET_MEM_TABLE: 3002 case VHOST_USER_SET_LOG_BASE: 3003 case VHOST_USER_SET_LOG_FD: 3004 case VHOST_USER_SET_VRING_NUM: 3005 case VHOST_USER_SET_VRING_ADDR: 3006 case VHOST_USER_SET_VRING_BASE: 3007 case VHOST_USER_SET_VRING_KICK: 3008 case VHOST_USER_SET_VRING_CALL: 3009 case VHOST_USER_SET_VRING_ERR: 3010 case VHOST_USER_SET_VRING_ENABLE: 3011 case VHOST_USER_SEND_RARP: 3012 case VHOST_USER_NET_SET_MTU: 3013 case VHOST_USER_SET_SLAVE_REQ_FD: 3014 if (!(dev->flags & VIRTIO_DEV_VDPA_CONFIGURED)) { 3015 vhost_user_lock_all_queue_pairs(dev); 3016 unlock_required = 1; 3017 } 3018 break; 3019 default: 3020 break; 3021 3022 } 3023 3024 handled = false; 3025 if (dev->extern_ops.pre_msg_handle) { 3026 ret = (*dev->extern_ops.pre_msg_handle)(dev->vid, 3027 (void *)&ctx.msg); 3028 switch (ret) { 3029 case RTE_VHOST_MSG_RESULT_REPLY: 3030 send_vhost_reply(dev, fd, &ctx); 3031 /* Fall-through */ 3032 case RTE_VHOST_MSG_RESULT_ERR: 3033 case RTE_VHOST_MSG_RESULT_OK: 3034 handled = true; 3035 goto skip_to_post_handle; 3036 case RTE_VHOST_MSG_RESULT_NOT_HANDLED: 3037 default: 3038 break; 3039 } 3040 } 3041 3042 if (request > VHOST_USER_NONE && request < VHOST_USER_MAX) { 3043 if (!vhost_message_handlers[request]) 3044 goto skip_to_post_handle; 3045 ret = vhost_message_handlers[request](&dev, &ctx, fd); 3046 3047 switch (ret) { 3048 case RTE_VHOST_MSG_RESULT_ERR: 3049 VHOST_LOG_CONFIG(ERR, "(%s) processing %s failed.\n", 3050 dev->ifname, vhost_message_str[request]); 3051 handled = true; 3052 break; 3053 case RTE_VHOST_MSG_RESULT_OK: 3054 VHOST_LOG_CONFIG(DEBUG, "(%s) processing %s succeeded.\n", 3055 dev->ifname, vhost_message_str[request]); 3056 handled = true; 3057 break; 3058 case RTE_VHOST_MSG_RESULT_REPLY: 3059 VHOST_LOG_CONFIG(DEBUG, "(%s) processing %s succeeded and needs reply.\n", 3060 dev->ifname, vhost_message_str[request]); 3061 send_vhost_reply(dev, fd, &ctx); 3062 handled = true; 3063 break; 3064 default: 3065 break; 3066 } 3067 } 3068 3069 skip_to_post_handle: 3070 if (ret != RTE_VHOST_MSG_RESULT_ERR && 3071 dev->extern_ops.post_msg_handle) { 3072 ret = (*dev->extern_ops.post_msg_handle)(dev->vid, 3073 (void *)&ctx.msg); 3074 switch (ret) { 3075 case RTE_VHOST_MSG_RESULT_REPLY: 3076 send_vhost_reply(dev, fd, &ctx); 3077 /* Fall-through */ 3078 case RTE_VHOST_MSG_RESULT_ERR: 3079 case RTE_VHOST_MSG_RESULT_OK: 3080 handled = true; 3081 case RTE_VHOST_MSG_RESULT_NOT_HANDLED: 3082 default: 3083 break; 3084 } 3085 } 3086 3087 /* If message was not handled at this stage, treat it as an error */ 3088 if (!handled) { 3089 VHOST_LOG_CONFIG(ERR, "(%s) vhost message (req: %d) was not handled.\n", 3090 dev->ifname, request); 3091 close_msg_fds(&ctx); 3092 ret = RTE_VHOST_MSG_RESULT_ERR; 3093 } 3094 3095 /* 3096 * If the request required a reply that was already sent, 3097 * this optional reply-ack won't be sent as the 3098 * VHOST_USER_NEED_REPLY was cleared in send_vhost_reply(). 
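 * Otherwise, if the frontend asked for a reply-ack (it may only set
 * VHOST_USER_NEED_REPLY when VHOST_USER_PROTOCOL_F_REPLY_ACK has been
 * negotiated), a u64 payload of 0 (success) or 1 (error) is returned below.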
3099 */ 3100 if (ctx.msg.flags & VHOST_USER_NEED_REPLY) { 3101 ctx.msg.payload.u64 = ret == RTE_VHOST_MSG_RESULT_ERR; 3102 ctx.msg.size = sizeof(ctx.msg.payload.u64); 3103 ctx.fd_num = 0; 3104 send_vhost_reply(dev, fd, &ctx); 3105 } else if (ret == RTE_VHOST_MSG_RESULT_ERR) { 3106 VHOST_LOG_CONFIG(ERR, "(%s) vhost message handling failed.\n", dev->ifname); 3107 return -1; 3108 } 3109 3110 for (i = 0; i < dev->nr_vring; i++) { 3111 struct vhost_virtqueue *vq = dev->virtqueue[i]; 3112 bool cur_ready = vq_is_ready(dev, vq); 3113 3114 if (cur_ready != (vq && vq->ready)) { 3115 vq->ready = cur_ready; 3116 vhost_user_notify_queue_state(dev, i, cur_ready); 3117 } 3118 } 3119 3120 if (unlock_required) 3121 vhost_user_unlock_all_queue_pairs(dev); 3122 3123 if (!virtio_is_ready(dev)) 3124 goto out; 3125 3126 /* 3127 * Virtio is now ready. If not done already, it is time 3128 * to notify the application it can process the rings and 3129 * configure the vDPA device if present. 3130 */ 3131 3132 if (!(dev->flags & VIRTIO_DEV_RUNNING)) { 3133 if (dev->notify_ops->new_device(dev->vid) == 0) 3134 dev->flags |= VIRTIO_DEV_RUNNING; 3135 } 3136 3137 vdpa_dev = dev->vdpa_dev; 3138 if (!vdpa_dev) 3139 goto out; 3140 3141 if (!(dev->flags & VIRTIO_DEV_VDPA_CONFIGURED)) { 3142 if (vdpa_dev->ops->dev_conf(dev->vid)) 3143 VHOST_LOG_CONFIG(ERR, "(%s) failed to configure vDPA device\n", 3144 dev->ifname); 3145 else 3146 dev->flags |= VIRTIO_DEV_VDPA_CONFIGURED; 3147 } 3148 3149 out: 3150 return 0; 3151 } 3152 3153 static int process_slave_message_reply(struct virtio_net *dev, 3154 const struct vhu_msg_context *ctx) 3155 { 3156 struct vhu_msg_context msg_reply; 3157 int ret; 3158 3159 if ((ctx->msg.flags & VHOST_USER_NEED_REPLY) == 0) 3160 return 0; 3161 3162 ret = read_vhost_message(dev, dev->slave_req_fd, &msg_reply); 3163 if (ret <= 0) { 3164 if (ret < 0) 3165 VHOST_LOG_CONFIG(ERR, "(%s) vhost read slave message reply failed\n", 3166 dev->ifname); 3167 else 3168 VHOST_LOG_CONFIG(INFO, "(%s) vhost peer closed\n", dev->ifname); 3169 ret = -1; 3170 goto out; 3171 } 3172 3173 ret = 0; 3174 if (msg_reply.msg.request.slave != ctx->msg.request.slave) { 3175 VHOST_LOG_CONFIG(ERR, "(%s) received unexpected msg type (%u), expected %u\n", 3176 dev->ifname, msg_reply.msg.request.slave, ctx->msg.request.slave); 3177 ret = -1; 3178 goto out; 3179 } 3180 3181 ret = msg_reply.msg.payload.u64 ? 
-1 : 0; 3182 3183 out: 3184 rte_spinlock_unlock(&dev->slave_req_lock); 3185 return ret; 3186 } 3187 3188 int 3189 vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm) 3190 { 3191 int ret; 3192 struct vhu_msg_context ctx = { 3193 .msg = { 3194 .request.slave = VHOST_USER_SLAVE_IOTLB_MSG, 3195 .flags = VHOST_USER_VERSION, 3196 .size = sizeof(ctx.msg.payload.iotlb), 3197 .payload.iotlb = { 3198 .iova = iova, 3199 .perm = perm, 3200 .type = VHOST_IOTLB_MISS, 3201 }, 3202 }, 3203 }; 3204 3205 ret = send_vhost_message(dev, dev->slave_req_fd, &ctx); 3206 if (ret < 0) { 3207 VHOST_LOG_CONFIG(ERR, "(%s) failed to send IOTLB miss message (%d)\n", 3208 dev->ifname, ret); 3209 return ret; 3210 } 3211 3212 return 0; 3213 } 3214 3215 static int 3216 vhost_user_slave_config_change(struct virtio_net *dev, bool need_reply) 3217 { 3218 int ret; 3219 struct vhu_msg_context ctx = { 3220 .msg = { 3221 .request.slave = VHOST_USER_SLAVE_CONFIG_CHANGE_MSG, 3222 .flags = VHOST_USER_VERSION, 3223 .size = 0, 3224 } 3225 }; 3226 3227 if (need_reply) 3228 ctx.msg.flags |= VHOST_USER_NEED_REPLY; 3229 3230 ret = send_vhost_slave_message(dev, &ctx); 3231 if (ret < 0) { 3232 VHOST_LOG_CONFIG(ERR, "(%s) failed to send config change (%d)\n", 3233 dev->ifname, ret); 3234 return ret; 3235 } 3236 3237 return process_slave_message_reply(dev, &ctx); 3238 } 3239 3240 int 3241 rte_vhost_slave_config_change(int vid, bool need_reply) 3242 { 3243 struct virtio_net *dev; 3244 3245 dev = get_device(vid); 3246 if (!dev) 3247 return -ENODEV; 3248 3249 return vhost_user_slave_config_change(dev, need_reply); 3250 } 3251 3252 static int vhost_user_slave_set_vring_host_notifier(struct virtio_net *dev, 3253 int index, int fd, 3254 uint64_t offset, 3255 uint64_t size) 3256 { 3257 int ret; 3258 struct vhu_msg_context ctx = { 3259 .msg = { 3260 .request.slave = VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG, 3261 .flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY, 3262 .size = sizeof(ctx.msg.payload.area), 3263 .payload.area = { 3264 .u64 = index & VHOST_USER_VRING_IDX_MASK, 3265 .size = size, 3266 .offset = offset, 3267 }, 3268 }, 3269 }; 3270 3271 if (fd < 0) 3272 ctx.msg.payload.area.u64 |= VHOST_USER_VRING_NOFD_MASK; 3273 else { 3274 ctx.fds[0] = fd; 3275 ctx.fd_num = 1; 3276 } 3277 3278 ret = send_vhost_slave_message(dev, &ctx); 3279 if (ret < 0) { 3280 VHOST_LOG_CONFIG(ERR, "(%s) failed to set host notifier (%d)\n", 3281 dev->ifname, ret); 3282 return ret; 3283 } 3284 3285 return process_slave_message_reply(dev, &ctx); 3286 } 3287 3288 int rte_vhost_host_notifier_ctrl(int vid, uint16_t qid, bool enable) 3289 { 3290 struct virtio_net *dev; 3291 struct rte_vdpa_device *vdpa_dev; 3292 int vfio_device_fd, ret = 0; 3293 uint64_t offset, size; 3294 unsigned int i, q_start, q_last; 3295 3296 dev = get_device(vid); 3297 if (!dev) 3298 return -ENODEV; 3299 3300 vdpa_dev = dev->vdpa_dev; 3301 if (vdpa_dev == NULL) 3302 return -ENODEV; 3303 3304 if (!(dev->features & (1ULL << VIRTIO_F_VERSION_1)) || 3305 !(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES)) || 3306 !(dev->protocol_features & 3307 (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ)) || 3308 !(dev->protocol_features & 3309 (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD)) || 3310 !(dev->protocol_features & 3311 (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER))) 3312 return -ENOTSUP; 3313 3314 if (qid == RTE_VHOST_QUEUE_ALL) { 3315 q_start = 0; 3316 q_last = dev->nr_vring - 1; 3317 } else { 3318 if (qid >= dev->nr_vring) 3319 return -EINVAL; 3320 q_start = qid; 3321 q_last = qid; 
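		/* A single queue was requested: the loops below cover only [qid, qid]. */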
3322 } 3323 3324 RTE_FUNC_PTR_OR_ERR_RET(vdpa_dev->ops->get_vfio_device_fd, -ENOTSUP); 3325 RTE_FUNC_PTR_OR_ERR_RET(vdpa_dev->ops->get_notify_area, -ENOTSUP); 3326 3327 vfio_device_fd = vdpa_dev->ops->get_vfio_device_fd(vid); 3328 if (vfio_device_fd < 0) 3329 return -ENOTSUP; 3330 3331 if (enable) { 3332 for (i = q_start; i <= q_last; i++) { 3333 if (vdpa_dev->ops->get_notify_area(vid, i, &offset, 3334 &size) < 0) { 3335 ret = -ENOTSUP; 3336 goto disable; 3337 } 3338 3339 if (vhost_user_slave_set_vring_host_notifier(dev, i, 3340 vfio_device_fd, offset, size) < 0) { 3341 ret = -EFAULT; 3342 goto disable; 3343 } 3344 } 3345 } else { 3346 disable: 3347 for (i = q_start; i <= q_last; i++) { 3348 vhost_user_slave_set_vring_host_notifier(dev, i, -1, 3349 0, 0); 3350 } 3351 } 3352 3353 return ret; 3354 } 3355
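
/*
 * Usage sketch (hypothetical caller, not part of this file): a vDPA driver
 * would typically enable the host notifiers for every queue once the device
 * has been configured, and disable them again before closing the device:
 *
 *	if (rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, true) != 0)
 *		... fall back to relaying kicks and notifications in software ...
 */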