/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2018 Intel Corporation
 */

/* Security model
 * --------------
 * The vhost-user protocol connection is an external interface, so it must be
 * robust against invalid inputs.
 *
 * This is important because the vhost-user master is only one step removed
 * from the guest. Malicious guests that have escaped will then launch further
 * attacks from the vhost-user master.
 *
 * Even in deployments where guests are trusted, a bug in the vhost-user master
 * can still cause invalid messages to be sent. Such messages must not
 * compromise the stability of the DPDK application by causing crashes, memory
 * corruption, or other problematic behavior.
 *
 * Do not assume received VhostUserMsg fields contain sensible values!
 */

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#ifdef RTE_LIBRTE_VHOST_NUMA
#include <numaif.h>
#endif
#ifdef RTE_LIBRTE_VHOST_POSTCOPY
#include <linux/userfaultfd.h>
#endif
#ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */
#include <linux/memfd.h>
#define MEMFD_SUPPORTED
#endif

#include <rte_common.h>
#include <rte_malloc.h>
#include <rte_log.h>
#include <rte_vfio.h>
#include <rte_errno.h>

#include "iotlb.h"
#include "vhost.h"
#include "vhost_user.h"

#define VIRTIO_MIN_MTU 68
#define VIRTIO_MAX_MTU 65535

#define INFLIGHT_ALIGNMENT	64
#define INFLIGHT_VERSION	0x1

static const char *vhost_message_str[VHOST_USER_MAX] = {
	[VHOST_USER_NONE] = "VHOST_USER_NONE",
	[VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
	[VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
	[VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
	[VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
	[VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
	[VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
	[VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
	[VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
	[VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
	[VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
	[VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
	[VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
	[VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
	[VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR",
	[VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES",
	[VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES",
	[VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM",
	[VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE",
	[VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP",
	[VHOST_USER_NET_SET_MTU] = "VHOST_USER_NET_SET_MTU",
	[VHOST_USER_SET_SLAVE_REQ_FD] = "VHOST_USER_SET_SLAVE_REQ_FD",
	[VHOST_USER_IOTLB_MSG] = "VHOST_USER_IOTLB_MSG",
	[VHOST_USER_CRYPTO_CREATE_SESS] = "VHOST_USER_CRYPTO_CREATE_SESS",
	[VHOST_USER_CRYPTO_CLOSE_SESS] = "VHOST_USER_CRYPTO_CLOSE_SESS",
	[VHOST_USER_POSTCOPY_ADVISE] = "VHOST_USER_POSTCOPY_ADVISE",
	[VHOST_USER_POSTCOPY_LISTEN] = "VHOST_USER_POSTCOPY_LISTEN",
	[VHOST_USER_POSTCOPY_END] = "VHOST_USER_POSTCOPY_END",
	[VHOST_USER_GET_INFLIGHT_FD] = "VHOST_USER_GET_INFLIGHT_FD",
	[VHOST_USER_SET_INFLIGHT_FD] = "VHOST_USER_SET_INFLIGHT_FD",
	[VHOST_USER_SET_STATUS] = "VHOST_USER_SET_STATUS",
	[VHOST_USER_GET_STATUS] = "VHOST_USER_GET_STATUS",
};

static int send_vhost_reply(struct virtio_net *dev, int sockfd, struct vhu_msg_context *ctx);
static int read_vhost_message(struct virtio_net *dev, int sockfd, struct vhu_msg_context *ctx);

static void
close_msg_fds(struct vhu_msg_context *ctx)
{
	int i;

	for (i = 0; i < ctx->fd_num; i++) {
		int fd = ctx->fds[i];

		if (fd == -1)
			continue;

		ctx->fds[i] = -1;
		close(fd);
	}
}

/*
 * Ensure the expected number of FDs is received,
 * close all FDs and return an error if this is not the case.
 */
static int
validate_msg_fds(struct virtio_net *dev, struct vhu_msg_context *ctx, int expected_fds)
{
	if (ctx->fd_num == expected_fds)
		return 0;

	VHOST_LOG_CONFIG(ERR, "(%s) expect %d FDs for request %s, received %d\n",
		dev->ifname, expected_fds,
		vhost_message_str[ctx->msg.request.master],
		ctx->fd_num);

	close_msg_fds(ctx);

	return -1;
}

static uint64_t
get_blk_size(int fd)
{
	struct stat stat;
	int ret;

	ret = fstat(fd, &stat);
	return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
}

static int
async_dma_map(struct virtio_net *dev, struct rte_vhost_mem_region *region, bool do_map)
{
	uint64_t host_iova;
	int ret = 0;

	host_iova = rte_mem_virt2iova((void *)(uintptr_t)region->host_user_addr);
	if (do_map) {
		/* Add mapped region into the default container of DPDK. */
		ret = rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
						 region->host_user_addr,
						 host_iova,
						 region->size);
		if (ret) {
			/*
			 * The DMA device may be bound to a kernel driver, in which
			 * case we don't need to program the IOMMU manually. However,
			 * if no device is bound with vfio/uio in DPDK, and the vfio
			 * kernel module is loaded, the API will still be called and
			 * return with ENODEV/ENOTSUP.
			 *
			 * DPDK vfio only returns ENODEV/ENOTSUP in very similar
			 * situations (vfio either unsupported, or supported
			 * but no devices found). Either way, no mappings could be
			 * performed. We treat it as a normal case in the async path.
			 */
			if (rte_errno == ENODEV || rte_errno == ENOTSUP)
				return 0;

			VHOST_LOG_CONFIG(ERR, "(%s) DMA engine map failed\n", dev->ifname);
			/* DMA mapping errors won't stop VHOST_USER_SET_MEM_TABLE. */
			return 0;
		}

	} else {
		/* Remove mapped region from the default container of DPDK. */
		ret = rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
						   region->host_user_addr,
						   host_iova,
						   region->size);
		if (ret) {
			/* Like DMA map, ignore the kernel driver case when unmapping.
*/ 185 if (rte_errno == EINVAL) 186 return 0; 187 188 VHOST_LOG_CONFIG(ERR, "(%s) DMA engine unmap failed\n", dev->ifname); 189 return ret; 190 } 191 } 192 193 return ret; 194 } 195 196 static void 197 free_mem_region(struct virtio_net *dev) 198 { 199 uint32_t i; 200 struct rte_vhost_mem_region *reg; 201 202 if (!dev || !dev->mem) 203 return; 204 205 for (i = 0; i < dev->mem->nregions; i++) { 206 reg = &dev->mem->regions[i]; 207 if (reg->host_user_addr) { 208 if (dev->async_copy && rte_vfio_is_enabled("vfio")) 209 async_dma_map(dev, reg, false); 210 211 munmap(reg->mmap_addr, reg->mmap_size); 212 close(reg->fd); 213 } 214 } 215 } 216 217 void 218 vhost_backend_cleanup(struct virtio_net *dev) 219 { 220 struct rte_vdpa_device *vdpa_dev; 221 222 vdpa_dev = dev->vdpa_dev; 223 if (vdpa_dev && vdpa_dev->ops->dev_cleanup != NULL) 224 vdpa_dev->ops->dev_cleanup(dev->vid); 225 226 if (dev->mem) { 227 free_mem_region(dev); 228 rte_free(dev->mem); 229 dev->mem = NULL; 230 } 231 232 rte_free(dev->guest_pages); 233 dev->guest_pages = NULL; 234 235 if (dev->log_addr) { 236 munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); 237 dev->log_addr = 0; 238 } 239 240 if (dev->inflight_info) { 241 if (dev->inflight_info->addr) { 242 munmap(dev->inflight_info->addr, 243 dev->inflight_info->size); 244 dev->inflight_info->addr = NULL; 245 } 246 247 if (dev->inflight_info->fd >= 0) { 248 close(dev->inflight_info->fd); 249 dev->inflight_info->fd = -1; 250 } 251 252 rte_free(dev->inflight_info); 253 dev->inflight_info = NULL; 254 } 255 256 if (dev->slave_req_fd >= 0) { 257 close(dev->slave_req_fd); 258 dev->slave_req_fd = -1; 259 } 260 261 if (dev->postcopy_ufd >= 0) { 262 close(dev->postcopy_ufd); 263 dev->postcopy_ufd = -1; 264 } 265 266 dev->postcopy_listening = 0; 267 } 268 269 static void 270 vhost_user_notify_queue_state(struct virtio_net *dev, uint16_t index, 271 int enable) 272 { 273 struct rte_vdpa_device *vdpa_dev = dev->vdpa_dev; 274 struct vhost_virtqueue *vq = dev->virtqueue[index]; 275 276 /* Configure guest notifications on enable */ 277 if (enable && vq->notif_enable != VIRTIO_UNINITIALIZED_NOTIF) 278 vhost_enable_guest_notification(dev, vq, vq->notif_enable); 279 280 if (vdpa_dev && vdpa_dev->ops->set_vring_state) 281 vdpa_dev->ops->set_vring_state(dev->vid, index, enable); 282 283 if (dev->notify_ops->vring_state_changed) 284 dev->notify_ops->vring_state_changed(dev->vid, 285 index, enable); 286 } 287 288 /* 289 * This function just returns success at the moment unless 290 * the device hasn't been initialised. 291 */ 292 static int 293 vhost_user_set_owner(struct virtio_net **pdev, 294 struct vhu_msg_context *ctx, 295 int main_fd __rte_unused) 296 { 297 struct virtio_net *dev = *pdev; 298 299 if (validate_msg_fds(dev, ctx, 0) != 0) 300 return RTE_VHOST_MSG_RESULT_ERR; 301 302 return RTE_VHOST_MSG_RESULT_OK; 303 } 304 305 static int 306 vhost_user_reset_owner(struct virtio_net **pdev, 307 struct vhu_msg_context *ctx, 308 int main_fd __rte_unused) 309 { 310 struct virtio_net *dev = *pdev; 311 312 if (validate_msg_fds(dev, ctx, 0) != 0) 313 return RTE_VHOST_MSG_RESULT_ERR; 314 315 vhost_destroy_device_notify(dev); 316 317 cleanup_device(dev, 0); 318 reset_device(dev); 319 return RTE_VHOST_MSG_RESULT_OK; 320 } 321 322 /* 323 * The features that we support are requested. 
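 * (The supported feature bits are fetched via rte_vhost_driver_get_features()
 * and returned to the master in the u64 payload below.)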
 */
static int
vhost_user_get_features(struct virtio_net **pdev,
			struct vhu_msg_context *ctx,
			int main_fd __rte_unused)
{
	struct virtio_net *dev = *pdev;
	uint64_t features = 0;

	if (validate_msg_fds(dev, ctx, 0) != 0)
		return RTE_VHOST_MSG_RESULT_ERR;

	rte_vhost_driver_get_features(dev->ifname, &features);

	ctx->msg.payload.u64 = features;
	ctx->msg.size = sizeof(ctx->msg.payload.u64);
	ctx->fd_num = 0;

	return RTE_VHOST_MSG_RESULT_REPLY;
}

/*
 * The number of queues we support is requested.
 */
static int
vhost_user_get_queue_num(struct virtio_net **pdev,
			struct vhu_msg_context *ctx,
			int main_fd __rte_unused)
{
	struct virtio_net *dev = *pdev;
	uint32_t queue_num = 0;

	if (validate_msg_fds(dev, ctx, 0) != 0)
		return RTE_VHOST_MSG_RESULT_ERR;

	rte_vhost_driver_get_queue_num(dev->ifname, &queue_num);

	ctx->msg.payload.u64 = (uint64_t)queue_num;
	ctx->msg.size = sizeof(ctx->msg.payload.u64);
	ctx->fd_num = 0;

	return RTE_VHOST_MSG_RESULT_REPLY;
}

/*
 * We receive the negotiated features supported by us and the virtio device.
 */
static int
vhost_user_set_features(struct virtio_net **pdev,
			struct vhu_msg_context *ctx,
			int main_fd __rte_unused)
{
	struct virtio_net *dev = *pdev;
	uint64_t features = ctx->msg.payload.u64;
	uint64_t vhost_features = 0;
	struct rte_vdpa_device *vdpa_dev;

	if (validate_msg_fds(dev, ctx, 0) != 0)
		return RTE_VHOST_MSG_RESULT_ERR;

	rte_vhost_driver_get_features(dev->ifname, &vhost_features);
	if (features & ~vhost_features) {
		VHOST_LOG_CONFIG(ERR, "(%s) received invalid negotiated features.\n",
			dev->ifname);
		dev->flags |= VIRTIO_DEV_FEATURES_FAILED;
		dev->status &= ~VIRTIO_DEVICE_STATUS_FEATURES_OK;

		return RTE_VHOST_MSG_RESULT_ERR;
	}

	if (dev->flags & VIRTIO_DEV_RUNNING) {
		if (dev->features == features)
			return RTE_VHOST_MSG_RESULT_OK;

		/*
		 * Error out if the master tries to change features while the
		 * device is running. The exception is VHOST_F_LOG_ALL, which
		 * is enabled when live migration starts.
		 */
		if ((dev->features ^ features) & ~(1ULL << VHOST_F_LOG_ALL)) {
			VHOST_LOG_CONFIG(ERR, "(%s) features changed while device is running.\n",
				dev->ifname);
			return RTE_VHOST_MSG_RESULT_ERR;
		}

		if (dev->notify_ops->features_changed)
			dev->notify_ops->features_changed(dev->vid, features);
	}

	dev->features = features;
	if (dev->features &
		((1ULL << VIRTIO_NET_F_MRG_RXBUF) |
		 (1ULL << VIRTIO_F_VERSION_1) |
		 (1ULL << VIRTIO_F_RING_PACKED))) {
		dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	} else {
		dev->vhost_hlen = sizeof(struct virtio_net_hdr);
	}
	VHOST_LOG_CONFIG(INFO, "(%s) negotiated Virtio features: 0x%" PRIx64 "\n",
		dev->ifname, dev->features);
	VHOST_LOG_CONFIG(DEBUG, "(%s) mergeable RX buffers %s, virtio 1 %s\n",
		dev->ifname,
		(dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off",
		(dev->features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off");

	if ((dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET) &&
	    !(dev->features & (1ULL << VIRTIO_NET_F_MQ))) {
		/*
		 * Remove all but the first queue pair if MQ hasn't been
		 * negotiated. This is safe because the device is not
		 * running at this stage.
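		 * (Each virtio-net queue pair consists of two vrings, one Rx
		 * and one Tx, which is why everything above the first two
		 * vrings is freed below.)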
		 */
		while (dev->nr_vring > 2) {
			struct vhost_virtqueue *vq;

			vq = dev->virtqueue[--dev->nr_vring];
			if (!vq)
				continue;

			dev->virtqueue[dev->nr_vring] = NULL;
			cleanup_vq(vq, 1);
			cleanup_vq_inflight(dev, vq);
			free_vq(dev, vq);
		}
	}

	vdpa_dev = dev->vdpa_dev;
	if (vdpa_dev)
		vdpa_dev->ops->set_features(dev->vid);

	dev->flags &= ~VIRTIO_DEV_FEATURES_FAILED;
	return RTE_VHOST_MSG_RESULT_OK;
}

/*
 * The virtio device sends us the size of the descriptor ring.
 */
static int
vhost_user_set_vring_num(struct virtio_net **pdev,
			struct vhu_msg_context *ctx,
			int main_fd __rte_unused)
{
	struct virtio_net *dev = *pdev;
	struct vhost_virtqueue *vq = dev->virtqueue[ctx->msg.payload.state.index];

	if (validate_msg_fds(dev, ctx, 0) != 0)
		return RTE_VHOST_MSG_RESULT_ERR;

	if (ctx->msg.payload.state.num > 32768) {
		VHOST_LOG_CONFIG(ERR, "(%s) invalid virtqueue size %u\n",
			dev->ifname, ctx->msg.payload.state.num);
		return RTE_VHOST_MSG_RESULT_ERR;
	}

	vq->size = ctx->msg.payload.state.num;

	/* VIRTIO 1.0, 2.4 Virtqueues says:
	 *
	 *   Queue Size value is always a power of 2. The maximum Queue Size
	 *   value is 32768.
	 *
	 * VIRTIO 1.1, 2.7 Virtqueues says:
	 *
	 *   Packed virtqueues support up to 2^15 entries each.
	 */
	if (!vq_is_packed(dev)) {
		if (vq->size & (vq->size - 1)) {
			VHOST_LOG_CONFIG(ERR, "(%s) invalid virtqueue size %u\n",
				dev->ifname, vq->size);
			return RTE_VHOST_MSG_RESULT_ERR;
		}
	}

	if (vq_is_packed(dev)) {
		rte_free(vq->shadow_used_packed);
		vq->shadow_used_packed = rte_malloc_socket(NULL,
			vq->size * sizeof(struct vring_used_elem_packed),
			RTE_CACHE_LINE_SIZE, vq->numa_node);
		if (!vq->shadow_used_packed) {
			VHOST_LOG_CONFIG(ERR,
				"(%s) failed to allocate memory for shadow used ring.\n",
				dev->ifname);
			return RTE_VHOST_MSG_RESULT_ERR;
		}

	} else {
		rte_free(vq->shadow_used_split);

		vq->shadow_used_split = rte_malloc_socket(NULL,
			vq->size * sizeof(struct vring_used_elem),
			RTE_CACHE_LINE_SIZE, vq->numa_node);

		if (!vq->shadow_used_split) {
			VHOST_LOG_CONFIG(ERR,
				"(%s) failed to allocate memory for vq internal data.\n",
				dev->ifname);
			return RTE_VHOST_MSG_RESULT_ERR;
		}
	}

	rte_free(vq->batch_copy_elems);
	vq->batch_copy_elems = rte_malloc_socket(NULL,
		vq->size * sizeof(struct batch_copy_elem),
		RTE_CACHE_LINE_SIZE, vq->numa_node);
	if (!vq->batch_copy_elems) {
		VHOST_LOG_CONFIG(ERR, "(%s) failed to allocate memory for batching copy.\n",
			dev->ifname);
		return RTE_VHOST_MSG_RESULT_ERR;
	}

	return RTE_VHOST_MSG_RESULT_OK;
}

/*
 * Reallocate virtio_dev, vhost_virtqueue and related data structures so that
 * they reside on the same NUMA node as the vring descriptor memory.
 */
#ifdef RTE_LIBRTE_VHOST_NUMA
static struct virtio_net*
numa_realloc(struct virtio_net *dev, int index)
{
	int node, dev_node;
	struct virtio_net *old_dev;
	struct vhost_virtqueue *vq;
	struct batch_copy_elem *bce;
	struct guest_page *gp;
	struct rte_vhost_memory *mem;
	size_t mem_size;
	int ret;

	old_dev = dev;
	vq = dev->virtqueue[index];

	/*
	 * If the VQ is ready, it is too late to reallocate: it certainly
	 * already happened on VHOST_USER_SET_VRING_ADDR anyway.
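	 * (A ready VQ already has its rings translated, so any NUMA move
	 * would have happened at that point.)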
561 */ 562 if (vq->ready) 563 return dev; 564 565 ret = get_mempolicy(&node, NULL, 0, vq->desc, MPOL_F_NODE | MPOL_F_ADDR); 566 if (ret) { 567 VHOST_LOG_CONFIG(ERR, "(%s) unable to get virtqueue %d numa information.\n", 568 dev->ifname, index); 569 return dev; 570 } 571 572 if (node == vq->numa_node) 573 goto out_dev_realloc; 574 575 vq = rte_realloc_socket(vq, sizeof(*vq), 0, node); 576 if (!vq) { 577 VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc virtqueue %d on node %d\n", 578 dev->ifname, index, node); 579 return dev; 580 } 581 582 if (vq != dev->virtqueue[index]) { 583 VHOST_LOG_CONFIG(INFO, "(%s) reallocated virtqueue on node %d\n", 584 dev->ifname, node); 585 dev->virtqueue[index] = vq; 586 vhost_user_iotlb_init(dev, index); 587 } 588 589 if (vq_is_packed(dev)) { 590 struct vring_used_elem_packed *sup; 591 592 sup = rte_realloc_socket(vq->shadow_used_packed, vq->size * sizeof(*sup), 593 RTE_CACHE_LINE_SIZE, node); 594 if (!sup) { 595 VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc shadow packed on node %d\n", 596 dev->ifname, node); 597 return dev; 598 } 599 vq->shadow_used_packed = sup; 600 } else { 601 struct vring_used_elem *sus; 602 603 sus = rte_realloc_socket(vq->shadow_used_split, vq->size * sizeof(*sus), 604 RTE_CACHE_LINE_SIZE, node); 605 if (!sus) { 606 VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc shadow split on node %d\n", 607 dev->ifname, node); 608 return dev; 609 } 610 vq->shadow_used_split = sus; 611 } 612 613 bce = rte_realloc_socket(vq->batch_copy_elems, vq->size * sizeof(*bce), 614 RTE_CACHE_LINE_SIZE, node); 615 if (!bce) { 616 VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc batch copy elem on node %d\n", 617 dev->ifname, node); 618 return dev; 619 } 620 vq->batch_copy_elems = bce; 621 622 if (vq->log_cache) { 623 struct log_cache_entry *lc; 624 625 lc = rte_realloc_socket(vq->log_cache, sizeof(*lc) * VHOST_LOG_CACHE_NR, 0, node); 626 if (!lc) { 627 VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc log cache on node %d\n", 628 dev->ifname, node); 629 return dev; 630 } 631 vq->log_cache = lc; 632 } 633 634 if (vq->resubmit_inflight) { 635 struct rte_vhost_resubmit_info *ri; 636 637 ri = rte_realloc_socket(vq->resubmit_inflight, sizeof(*ri), 0, node); 638 if (!ri) { 639 VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc resubmit inflight on node %d\n", 640 dev->ifname, node); 641 return dev; 642 } 643 vq->resubmit_inflight = ri; 644 645 if (ri->resubmit_list) { 646 struct rte_vhost_resubmit_desc *rd; 647 648 rd = rte_realloc_socket(ri->resubmit_list, sizeof(*rd) * ri->resubmit_num, 649 0, node); 650 if (!rd) { 651 VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc resubmit list on node %d\n", 652 dev->ifname, node); 653 return dev; 654 } 655 ri->resubmit_list = rd; 656 } 657 } 658 659 vq->numa_node = node; 660 661 out_dev_realloc: 662 663 if (dev->flags & VIRTIO_DEV_RUNNING) 664 return dev; 665 666 ret = get_mempolicy(&dev_node, NULL, 0, dev, MPOL_F_NODE | MPOL_F_ADDR); 667 if (ret) { 668 VHOST_LOG_CONFIG(ERR, "(%s) unable to get numa information.\n", dev->ifname); 669 return dev; 670 } 671 672 if (dev_node == node) 673 return dev; 674 675 dev = rte_realloc_socket(old_dev, sizeof(*dev), 0, node); 676 if (!dev) { 677 VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc dev on node %d\n", 678 old_dev->ifname, node); 679 return old_dev; 680 } 681 682 VHOST_LOG_CONFIG(INFO, "(%s) reallocated device on node %d\n", dev->ifname, node); 683 vhost_devices[dev->vid] = dev; 684 685 mem_size = sizeof(struct rte_vhost_memory) + 686 sizeof(struct rte_vhost_mem_region) * dev->mem->nregions; 687 mem = 
rte_realloc_socket(dev->mem, mem_size, 0, node); 688 if (!mem) { 689 VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc mem table on node %d\n", 690 dev->ifname, node); 691 return dev; 692 } 693 dev->mem = mem; 694 695 gp = rte_realloc_socket(dev->guest_pages, dev->max_guest_pages * sizeof(*gp), 696 RTE_CACHE_LINE_SIZE, node); 697 if (!gp) { 698 VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc guest pages on node %d\n", 699 dev->ifname, node); 700 return dev; 701 } 702 dev->guest_pages = gp; 703 704 return dev; 705 } 706 #else 707 static struct virtio_net* 708 numa_realloc(struct virtio_net *dev, int index __rte_unused) 709 { 710 return dev; 711 } 712 #endif 713 714 /* Converts QEMU virtual address to Vhost virtual address. */ 715 static uint64_t 716 qva_to_vva(struct virtio_net *dev, uint64_t qva, uint64_t *len) 717 { 718 struct rte_vhost_mem_region *r; 719 uint32_t i; 720 721 if (unlikely(!dev || !dev->mem)) 722 goto out_error; 723 724 /* Find the region where the address lives. */ 725 for (i = 0; i < dev->mem->nregions; i++) { 726 r = &dev->mem->regions[i]; 727 728 if (qva >= r->guest_user_addr && 729 qva < r->guest_user_addr + r->size) { 730 731 if (unlikely(*len > r->guest_user_addr + r->size - qva)) 732 *len = r->guest_user_addr + r->size - qva; 733 734 return qva - r->guest_user_addr + 735 r->host_user_addr; 736 } 737 } 738 out_error: 739 *len = 0; 740 741 return 0; 742 } 743 744 745 /* 746 * Converts ring address to Vhost virtual address. 747 * If IOMMU is enabled, the ring address is a guest IO virtual address, 748 * else it is a QEMU virtual address. 749 */ 750 static uint64_t 751 ring_addr_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq, 752 uint64_t ra, uint64_t *size) 753 { 754 if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) { 755 uint64_t vva; 756 757 vhost_user_iotlb_rd_lock(vq); 758 vva = vhost_iova_to_vva(dev, vq, ra, 759 size, VHOST_ACCESS_RW); 760 vhost_user_iotlb_rd_unlock(vq); 761 762 return vva; 763 } 764 765 return qva_to_vva(dev, ra, size); 766 } 767 768 static uint64_t 769 log_addr_to_gpa(struct virtio_net *dev, struct vhost_virtqueue *vq) 770 { 771 uint64_t log_gpa; 772 773 vhost_user_iotlb_rd_lock(vq); 774 log_gpa = translate_log_addr(dev, vq, vq->ring_addrs.log_guest_addr); 775 vhost_user_iotlb_rd_unlock(vq); 776 777 return log_gpa; 778 } 779 780 static struct virtio_net * 781 translate_ring_addresses(struct virtio_net *dev, int vq_index) 782 { 783 struct vhost_virtqueue *vq = dev->virtqueue[vq_index]; 784 struct vhost_vring_addr *addr = &vq->ring_addrs; 785 uint64_t len, expected_len; 786 787 if (addr->flags & (1 << VHOST_VRING_F_LOG)) { 788 vq->log_guest_addr = 789 log_addr_to_gpa(dev, vq); 790 if (vq->log_guest_addr == 0) { 791 VHOST_LOG_CONFIG(DEBUG, "(%s) failed to map log_guest_addr.\n", 792 dev->ifname); 793 return dev; 794 } 795 } 796 797 if (vq_is_packed(dev)) { 798 len = sizeof(struct vring_packed_desc) * vq->size; 799 vq->desc_packed = (struct vring_packed_desc *)(uintptr_t) 800 ring_addr_to_vva(dev, vq, addr->desc_user_addr, &len); 801 if (vq->desc_packed == NULL || 802 len != sizeof(struct vring_packed_desc) * 803 vq->size) { 804 VHOST_LOG_CONFIG(DEBUG, "(%s) failed to map desc_packed ring.\n", 805 dev->ifname); 806 return dev; 807 } 808 809 dev = numa_realloc(dev, vq_index); 810 vq = dev->virtqueue[vq_index]; 811 addr = &vq->ring_addrs; 812 813 len = sizeof(struct vring_packed_desc_event); 814 vq->driver_event = (struct vring_packed_desc_event *) 815 (uintptr_t)ring_addr_to_vva(dev, 816 vq, addr->avail_user_addr, &len); 817 if 
(vq->driver_event == NULL ||
		    len != sizeof(struct vring_packed_desc_event)) {
			VHOST_LOG_CONFIG(DEBUG, "(%s) failed to find driver area address.\n",
				dev->ifname);
			return dev;
		}

		len = sizeof(struct vring_packed_desc_event);
		vq->device_event = (struct vring_packed_desc_event *)
					(uintptr_t)ring_addr_to_vva(dev,
					vq, addr->used_user_addr, &len);
		if (vq->device_event == NULL ||
		    len != sizeof(struct vring_packed_desc_event)) {
			VHOST_LOG_CONFIG(DEBUG, "(%s) failed to find device area address.\n",
				dev->ifname);
			return dev;
		}

		vq->access_ok = true;
		return dev;
	}

	/* The addresses are converted from QEMU virtual to Vhost virtual. */
	if (vq->desc && vq->avail && vq->used)
		return dev;

	len = sizeof(struct vring_desc) * vq->size;
	vq->desc = (struct vring_desc *)(uintptr_t)ring_addr_to_vva(dev,
			vq, addr->desc_user_addr, &len);
	if (vq->desc == 0 || len != sizeof(struct vring_desc) * vq->size) {
		VHOST_LOG_CONFIG(DEBUG, "(%s) failed to map desc ring.\n", dev->ifname);
		return dev;
	}

	dev = numa_realloc(dev, vq_index);
	vq = dev->virtqueue[vq_index];
	addr = &vq->ring_addrs;

	len = sizeof(struct vring_avail) + sizeof(uint16_t) * vq->size;
	if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX))
		len += sizeof(uint16_t);
	expected_len = len;
	vq->avail = (struct vring_avail *)(uintptr_t)ring_addr_to_vva(dev,
			vq, addr->avail_user_addr, &len);
	if (vq->avail == 0 || len != expected_len) {
		VHOST_LOG_CONFIG(DEBUG, "(%s) failed to map avail ring.\n", dev->ifname);
		return dev;
	}

	len = sizeof(struct vring_used) +
		sizeof(struct vring_used_elem) * vq->size;
	if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX))
		len += sizeof(uint16_t);
	expected_len = len;
	vq->used = (struct vring_used *)(uintptr_t)ring_addr_to_vva(dev,
			vq, addr->used_user_addr, &len);
	if (vq->used == 0 || len != expected_len) {
		VHOST_LOG_CONFIG(DEBUG, "(%s) failed to map used ring.\n", dev->ifname);
		return dev;
	}

	if (vq->last_used_idx != vq->used->idx) {
		VHOST_LOG_CONFIG(WARNING, "(%s) last_used_idx (%u) and vq->used->idx (%u) mismatch;\n",
			dev->ifname,
			vq->last_used_idx, vq->used->idx);
		vq->last_used_idx = vq->used->idx;
		vq->last_avail_idx = vq->used->idx;
		VHOST_LOG_CONFIG(WARNING, "(%s) some packets may be resent for Tx and dropped for Rx\n",
			dev->ifname);
	}

	vq->access_ok = true;

	VHOST_LOG_CONFIG(DEBUG, "(%s) mapped address desc: %p\n", dev->ifname, vq->desc);
	VHOST_LOG_CONFIG(DEBUG, "(%s) mapped address avail: %p\n", dev->ifname, vq->avail);
	VHOST_LOG_CONFIG(DEBUG, "(%s) mapped address used: %p\n", dev->ifname, vq->used);
	VHOST_LOG_CONFIG(DEBUG, "(%s) log_guest_addr: %" PRIx64 "\n",
		dev->ifname, vq->log_guest_addr);

	return dev;
}

/*
 * The virtio device sends us the desc, used and avail ring addresses.
 * This function then converts these to our address space.
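 * Nothing is translated until the ring is actually started: the translation
 * below only happens when the ring is enabled or its addresses were already
 * valid (access_ok).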
 */
static int
vhost_user_set_vring_addr(struct virtio_net **pdev,
			struct vhu_msg_context *ctx,
			int main_fd __rte_unused)
{
	struct virtio_net *dev = *pdev;
	struct vhost_virtqueue *vq;
	struct vhost_vring_addr *addr = &ctx->msg.payload.addr;
	bool access_ok;

	if (validate_msg_fds(dev, ctx, 0) != 0)
		return RTE_VHOST_MSG_RESULT_ERR;

	if (dev->mem == NULL)
		return RTE_VHOST_MSG_RESULT_ERR;

	/* addr->index refers to the queue index. The txq is 1, the rxq is 0. */
	vq = dev->virtqueue[ctx->msg.payload.addr.index];

	access_ok = vq->access_ok;

	/*
	 * Ring addresses should not be interpreted as long as the ring is not
	 * started and enabled.
	 */
	memcpy(&vq->ring_addrs, addr, sizeof(*addr));

	vring_invalidate(dev, vq);

	if ((vq->enabled && (dev->features &
			(1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) ||
			access_ok) {
		dev = translate_ring_addresses(dev, ctx->msg.payload.addr.index);
		if (!dev)
			return RTE_VHOST_MSG_RESULT_ERR;

		*pdev = dev;
	}

	return RTE_VHOST_MSG_RESULT_OK;
}

/*
 * The virtio device sends us the available ring last used index.
 */
static int
vhost_user_set_vring_base(struct virtio_net **pdev,
			struct vhu_msg_context *ctx,
			int main_fd __rte_unused)
{
	struct virtio_net *dev = *pdev;
	struct vhost_virtqueue *vq = dev->virtqueue[ctx->msg.payload.state.index];
	uint64_t val = ctx->msg.payload.state.num;

	if (validate_msg_fds(dev, ctx, 0) != 0)
		return RTE_VHOST_MSG_RESULT_ERR;

	if (vq_is_packed(dev)) {
		/*
		 * Bit[0:14]: avail index
		 * Bit[15]: avail wrap counter
		 */
		vq->last_avail_idx = val & 0x7fff;
		vq->avail_wrap_counter = !!(val & (0x1 << 15));
		/*
		 * Set the used index to the same value as the available one,
		 * as their values should be equal since ring processing
		 * was stopped at get time.
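		 * ("Get time" being when GET_VRING_BASE was last handled and
		 * the ring stopped.)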
971 */ 972 vq->last_used_idx = vq->last_avail_idx; 973 vq->used_wrap_counter = vq->avail_wrap_counter; 974 } else { 975 vq->last_used_idx = ctx->msg.payload.state.num; 976 vq->last_avail_idx = ctx->msg.payload.state.num; 977 } 978 979 VHOST_LOG_CONFIG(INFO, 980 "(%s) vring base idx:%u last_used_idx:%u last_avail_idx:%u.\n", 981 dev->ifname, ctx->msg.payload.state.index, vq->last_used_idx, 982 vq->last_avail_idx); 983 984 return RTE_VHOST_MSG_RESULT_OK; 985 } 986 987 static int 988 add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr, 989 uint64_t host_phys_addr, uint64_t size) 990 { 991 struct guest_page *page, *last_page; 992 struct guest_page *old_pages; 993 994 if (dev->nr_guest_pages == dev->max_guest_pages) { 995 dev->max_guest_pages *= 2; 996 old_pages = dev->guest_pages; 997 dev->guest_pages = rte_realloc(dev->guest_pages, 998 dev->max_guest_pages * sizeof(*page), 999 RTE_CACHE_LINE_SIZE); 1000 if (dev->guest_pages == NULL) { 1001 VHOST_LOG_CONFIG(ERR, "(%s) cannot realloc guest_pages\n", dev->ifname); 1002 rte_free(old_pages); 1003 return -1; 1004 } 1005 } 1006 1007 if (dev->nr_guest_pages > 0) { 1008 last_page = &dev->guest_pages[dev->nr_guest_pages - 1]; 1009 /* merge if the two pages are continuous */ 1010 if (host_phys_addr == last_page->host_phys_addr + 1011 last_page->size) { 1012 last_page->size += size; 1013 return 0; 1014 } 1015 } 1016 1017 page = &dev->guest_pages[dev->nr_guest_pages++]; 1018 page->guest_phys_addr = guest_phys_addr; 1019 page->host_phys_addr = host_phys_addr; 1020 page->size = size; 1021 1022 return 0; 1023 } 1024 1025 static int 1026 add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg, 1027 uint64_t page_size) 1028 { 1029 uint64_t reg_size = reg->size; 1030 uint64_t host_user_addr = reg->host_user_addr; 1031 uint64_t guest_phys_addr = reg->guest_phys_addr; 1032 uint64_t host_phys_addr; 1033 uint64_t size; 1034 1035 host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)host_user_addr); 1036 size = page_size - (guest_phys_addr & (page_size - 1)); 1037 size = RTE_MIN(size, reg_size); 1038 1039 if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size) < 0) 1040 return -1; 1041 1042 host_user_addr += size; 1043 guest_phys_addr += size; 1044 reg_size -= size; 1045 1046 while (reg_size > 0) { 1047 size = RTE_MIN(reg_size, page_size); 1048 host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t) 1049 host_user_addr); 1050 if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr, 1051 size) < 0) 1052 return -1; 1053 1054 host_user_addr += size; 1055 guest_phys_addr += size; 1056 reg_size -= size; 1057 } 1058 1059 /* sort guest page array if over binary search threshold */ 1060 if (dev->nr_guest_pages >= VHOST_BINARY_SEARCH_THRESH) { 1061 qsort((void *)dev->guest_pages, dev->nr_guest_pages, 1062 sizeof(struct guest_page), guest_page_addrcmp); 1063 } 1064 1065 return 0; 1066 } 1067 1068 #ifdef RTE_LIBRTE_VHOST_DEBUG 1069 /* TODO: enable it only in debug mode? 
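 * (dump_guest_pages() below is already compiled out when
 * RTE_LIBRTE_VHOST_DEBUG is not defined; see the #else stub.)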
 */
static void
dump_guest_pages(struct virtio_net *dev)
{
	uint32_t i;
	struct guest_page *page;

	for (i = 0; i < dev->nr_guest_pages; i++) {
		page = &dev->guest_pages[i];

		VHOST_LOG_CONFIG(INFO, "(%s) guest physical page region %u\n",
			dev->ifname, i);
		VHOST_LOG_CONFIG(INFO, "(%s)\tguest_phys_addr: %" PRIx64 "\n",
			dev->ifname, page->guest_phys_addr);
		VHOST_LOG_CONFIG(INFO, "(%s)\thost_phys_addr : %" PRIx64 "\n",
			dev->ifname, page->host_phys_addr);
		VHOST_LOG_CONFIG(INFO, "(%s)\tsize : %" PRIx64 "\n",
			dev->ifname, page->size);
	}
}
#else
#define dump_guest_pages(dev)
#endif

static bool
vhost_memory_changed(struct VhostUserMemory *new,
		     struct rte_vhost_memory *old)
{
	uint32_t i;

	if (new->nregions != old->nregions)
		return true;

	for (i = 0; i < new->nregions; ++i) {
		VhostUserMemoryRegion *new_r = &new->regions[i];
		struct rte_vhost_mem_region *old_r = &old->regions[i];

		if (new_r->guest_phys_addr != old_r->guest_phys_addr)
			return true;
		if (new_r->memory_size != old_r->size)
			return true;
		if (new_r->userspace_addr != old_r->guest_user_addr)
			return true;
	}

	return false;
}

#ifdef RTE_LIBRTE_VHOST_POSTCOPY
static int
vhost_user_postcopy_region_register(struct virtio_net *dev,
		struct rte_vhost_mem_region *reg)
{
	struct uffdio_register reg_struct;

	/*
	 * Let's register all the mmapped area to ensure
	 * alignment on page boundary.
	 */
	reg_struct.range.start = (uint64_t)(uintptr_t)reg->mmap_addr;
	reg_struct.range.len = reg->mmap_size;
	reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

	if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER,
				&reg_struct)) {
		VHOST_LOG_CONFIG(ERR, "(%s) failed to register ufd for region "
				"%" PRIx64 " - %" PRIx64 " (ufd = %d) %s\n",
				dev->ifname,
				(uint64_t)reg_struct.range.start,
				(uint64_t)reg_struct.range.start +
				(uint64_t)reg_struct.range.len - 1,
				dev->postcopy_ufd,
				strerror(errno));
		return -1;
	}

	VHOST_LOG_CONFIG(INFO,
		"(%s)\t userfaultfd registered for range : %" PRIx64 " - %" PRIx64 "\n",
		dev->ifname,
		(uint64_t)reg_struct.range.start,
		(uint64_t)reg_struct.range.start +
		(uint64_t)reg_struct.range.len - 1);

	return 0;
}
#else
static int
vhost_user_postcopy_region_register(struct virtio_net *dev __rte_unused,
		struct rte_vhost_mem_region *reg __rte_unused)
{
	return -1;
}
#endif

static int
vhost_user_postcopy_register(struct virtio_net *dev, int main_fd,
		struct vhu_msg_context *ctx)
{
	struct VhostUserMemory *memory;
	struct rte_vhost_mem_region *reg;
	struct vhu_msg_context ack_ctx;
	uint32_t i;

	if (!dev->postcopy_listening)
		return 0;

	/*
	 * We don't have a better way right now than sharing
	 * DPDK's virtual address with Qemu, so that Qemu can
	 * retrieve the region offset when handling userfaults.
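	 * (The updated userspace_addr values are sent back to the front-end
	 * in the reply built just below.)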
	 */
	memory = &ctx->msg.payload.memory;
	for (i = 0; i < memory->nregions; i++) {
		reg = &dev->mem->regions[i];
		memory->regions[i].userspace_addr = reg->host_user_addr;
	}

	/* Send the addresses back to qemu */
	ctx->fd_num = 0;
	send_vhost_reply(dev, main_fd, ctx);

	/*
	 * Wait for qemu to acknowledge it got the addresses;
	 * we have to wait before we're allowed to generate faults.
	 */
	if (read_vhost_message(dev, main_fd, &ack_ctx) <= 0) {
		VHOST_LOG_CONFIG(ERR, "(%s) failed to read qemu ack on postcopy set-mem-table\n",
			dev->ifname);
		return -1;
	}

	if (validate_msg_fds(dev, &ack_ctx, 0) != 0)
		return -1;

	if (ack_ctx.msg.request.master != VHOST_USER_SET_MEM_TABLE) {
		VHOST_LOG_CONFIG(ERR, "(%s) bad qemu ack on postcopy set-mem-table (%d)\n",
			dev->ifname, ack_ctx.msg.request.master);
		return -1;
	}

	/* Now userfault register and we can use the memory */
	for (i = 0; i < memory->nregions; i++) {
		reg = &dev->mem->regions[i];
		if (vhost_user_postcopy_region_register(dev, reg) < 0)
			return -1;
	}

	return 0;
}

static int
vhost_user_mmap_region(struct virtio_net *dev,
		struct rte_vhost_mem_region *region,
		uint64_t mmap_offset)
{
	void *mmap_addr;
	uint64_t mmap_size;
	uint64_t alignment;
	int populate;
	int ret;

	/* Check for memory_size + mmap_offset overflow */
	if (mmap_offset >= -region->size) {
		VHOST_LOG_CONFIG(ERR, "(%s) mmap_offset (%#"PRIx64") and memory_size (%#"PRIx64") overflow\n",
			dev->ifname, mmap_offset, region->size);
		return -1;
	}

	mmap_size = region->size + mmap_offset;

	/*
	 * mmap() without MAP_ANONYMOUS must be called with a length argument
	 * aligned to the hugepage size on older long-term Linux kernels
	 * (such as 2.6.32 and 3.2.72), or it will fail with EINVAL.
	 *
	 * To avoid failure, the caller must keep the length aligned.
	 */
	alignment = get_blk_size(region->fd);
	if (alignment == (uint64_t)-1) {
		VHOST_LOG_CONFIG(ERR, "(%s) couldn't get hugepage size through fstat\n",
			dev->ifname);
		return -1;
	}
	mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment);
	if (mmap_size == 0) {
		/*
		 * This can happen if the initial mmap_size + alignment
		 * overflows uint64_t, which can happen if either the
		 * mmap_size or the alignment value is wrong.
		 *
		 * The mmap() kernel implementation would return an error, but
		 * better catch it before and provide useful info in the logs.
		 */
		VHOST_LOG_CONFIG(ERR, "(%s) mmap size (0x%" PRIx64 ") or alignment (0x%" PRIx64 ") is invalid\n",
			dev->ifname, region->size + mmap_offset, alignment);
		return -1;
	}

	populate = dev->async_copy ?
MAP_POPULATE : 0; 1266 mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, 1267 MAP_SHARED | populate, region->fd, 0); 1268 1269 if (mmap_addr == MAP_FAILED) { 1270 VHOST_LOG_CONFIG(ERR, "(%s) mmap failed (%s).\n", dev->ifname, strerror(errno)); 1271 return -1; 1272 } 1273 1274 region->mmap_addr = mmap_addr; 1275 region->mmap_size = mmap_size; 1276 region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset; 1277 1278 if (dev->async_copy) { 1279 if (add_guest_pages(dev, region, alignment) < 0) { 1280 VHOST_LOG_CONFIG(ERR, "(%s) adding guest pages to region failed.\n", 1281 dev->ifname); 1282 return -1; 1283 } 1284 1285 if (rte_vfio_is_enabled("vfio")) { 1286 ret = async_dma_map(dev, region, true); 1287 if (ret) { 1288 VHOST_LOG_CONFIG(ERR, 1289 "(%s) configure IOMMU for DMA engine failed\n", 1290 dev->ifname); 1291 return -1; 1292 } 1293 } 1294 } 1295 1296 VHOST_LOG_CONFIG(INFO, "(%s) guest memory region size: 0x%" PRIx64 "\n", 1297 dev->ifname, region->size); 1298 VHOST_LOG_CONFIG(INFO, "(%s)\t guest physical addr: 0x%" PRIx64 "\n", 1299 dev->ifname, region->guest_phys_addr); 1300 VHOST_LOG_CONFIG(INFO, "(%s)\t guest virtual addr: 0x%" PRIx64 "\n", 1301 dev->ifname, region->guest_user_addr); 1302 VHOST_LOG_CONFIG(INFO, "(%s)\t host virtual addr: 0x%" PRIx64 "\n", 1303 dev->ifname, region->host_user_addr); 1304 VHOST_LOG_CONFIG(INFO, "(%s)\t mmap addr : 0x%" PRIx64 "\n", 1305 dev->ifname, (uint64_t)(uintptr_t)mmap_addr); 1306 VHOST_LOG_CONFIG(INFO, "(%s)\t mmap size : 0x%" PRIx64 "\n", 1307 dev->ifname, mmap_size); 1308 VHOST_LOG_CONFIG(INFO, "(%s)\t mmap align: 0x%" PRIx64 "\n", 1309 dev->ifname, alignment); 1310 VHOST_LOG_CONFIG(INFO, "(%s)\t mmap off : 0x%" PRIx64 "\n", 1311 dev->ifname, mmap_offset); 1312 1313 return 0; 1314 } 1315 1316 static int 1317 vhost_user_set_mem_table(struct virtio_net **pdev, 1318 struct vhu_msg_context *ctx, 1319 int main_fd) 1320 { 1321 struct virtio_net *dev = *pdev; 1322 struct VhostUserMemory *memory = &ctx->msg.payload.memory; 1323 struct rte_vhost_mem_region *reg; 1324 int numa_node = SOCKET_ID_ANY; 1325 uint64_t mmap_offset; 1326 uint32_t i; 1327 bool async_notify = false; 1328 1329 if (validate_msg_fds(dev, ctx, memory->nregions) != 0) 1330 return RTE_VHOST_MSG_RESULT_ERR; 1331 1332 if (memory->nregions > VHOST_MEMORY_MAX_NREGIONS) { 1333 VHOST_LOG_CONFIG(ERR, "(%s) too many memory regions (%u)\n", 1334 dev->ifname, memory->nregions); 1335 goto close_msg_fds; 1336 } 1337 1338 if (dev->mem && !vhost_memory_changed(memory, dev->mem)) { 1339 VHOST_LOG_CONFIG(INFO, "(%s) memory regions not changed\n", dev->ifname); 1340 1341 close_msg_fds(ctx); 1342 1343 return RTE_VHOST_MSG_RESULT_OK; 1344 } 1345 1346 if (dev->mem) { 1347 if (dev->flags & VIRTIO_DEV_VDPA_CONFIGURED) { 1348 struct rte_vdpa_device *vdpa_dev = dev->vdpa_dev; 1349 1350 if (vdpa_dev && vdpa_dev->ops->dev_close) 1351 vdpa_dev->ops->dev_close(dev->vid); 1352 dev->flags &= ~VIRTIO_DEV_VDPA_CONFIGURED; 1353 } 1354 1355 /* notify the vhost application to stop DMA transfers */ 1356 if (dev->async_copy && dev->notify_ops->vring_state_changed) { 1357 for (i = 0; i < dev->nr_vring; i++) { 1358 dev->notify_ops->vring_state_changed(dev->vid, 1359 i, 0); 1360 } 1361 async_notify = true; 1362 } 1363 1364 free_mem_region(dev); 1365 rte_free(dev->mem); 1366 dev->mem = NULL; 1367 } 1368 1369 /* Flush IOTLB cache as previous HVAs are now invalid */ 1370 if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) 1371 for (i = 0; i < dev->nr_vring; i++) 1372 
vhost_user_iotlb_flush_all(dev->virtqueue[i]); 1373 1374 /* 1375 * If VQ 0 has already been allocated, try to allocate on the same 1376 * NUMA node. It can be reallocated later in numa_realloc(). 1377 */ 1378 if (dev->nr_vring > 0) 1379 numa_node = dev->virtqueue[0]->numa_node; 1380 1381 dev->nr_guest_pages = 0; 1382 if (dev->guest_pages == NULL) { 1383 dev->max_guest_pages = 8; 1384 dev->guest_pages = rte_zmalloc_socket(NULL, 1385 dev->max_guest_pages * 1386 sizeof(struct guest_page), 1387 RTE_CACHE_LINE_SIZE, 1388 numa_node); 1389 if (dev->guest_pages == NULL) { 1390 VHOST_LOG_CONFIG(ERR, 1391 "(%s) failed to allocate memory for dev->guest_pages\n", 1392 dev->ifname); 1393 goto close_msg_fds; 1394 } 1395 } 1396 1397 dev->mem = rte_zmalloc_socket("vhost-mem-table", sizeof(struct rte_vhost_memory) + 1398 sizeof(struct rte_vhost_mem_region) * memory->nregions, 0, numa_node); 1399 if (dev->mem == NULL) { 1400 VHOST_LOG_CONFIG(ERR, 1401 "(%s) failed to allocate memory for dev->mem\n", 1402 dev->ifname); 1403 goto free_guest_pages; 1404 } 1405 1406 for (i = 0; i < memory->nregions; i++) { 1407 reg = &dev->mem->regions[i]; 1408 1409 reg->guest_phys_addr = memory->regions[i].guest_phys_addr; 1410 reg->guest_user_addr = memory->regions[i].userspace_addr; 1411 reg->size = memory->regions[i].memory_size; 1412 reg->fd = ctx->fds[i]; 1413 1414 /* 1415 * Assign invalid file descriptor value to avoid double 1416 * closing on error path. 1417 */ 1418 ctx->fds[i] = -1; 1419 1420 mmap_offset = memory->regions[i].mmap_offset; 1421 1422 if (vhost_user_mmap_region(dev, reg, mmap_offset) < 0) { 1423 VHOST_LOG_CONFIG(ERR, "(%s) failed to mmap region %u\n", dev->ifname, i); 1424 goto free_mem_table; 1425 } 1426 1427 dev->mem->nregions++; 1428 } 1429 1430 if (vhost_user_postcopy_register(dev, main_fd, ctx) < 0) 1431 goto free_mem_table; 1432 1433 for (i = 0; i < dev->nr_vring; i++) { 1434 struct vhost_virtqueue *vq = dev->virtqueue[i]; 1435 1436 if (!vq) 1437 continue; 1438 1439 if (vq->desc || vq->avail || vq->used) { 1440 /* 1441 * If the memory table got updated, the ring addresses 1442 * need to be translated again as virtual addresses have 1443 * changed. 
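			 * (vring_invalidate() drops the stale pointers before
			 * translate_ring_addresses() re-maps them below.)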
1444 */ 1445 vring_invalidate(dev, vq); 1446 1447 dev = translate_ring_addresses(dev, i); 1448 if (!dev) { 1449 dev = *pdev; 1450 goto free_mem_table; 1451 } 1452 1453 *pdev = dev; 1454 } 1455 } 1456 1457 dump_guest_pages(dev); 1458 1459 if (async_notify) { 1460 for (i = 0; i < dev->nr_vring; i++) 1461 dev->notify_ops->vring_state_changed(dev->vid, i, 1); 1462 } 1463 1464 return RTE_VHOST_MSG_RESULT_OK; 1465 1466 free_mem_table: 1467 free_mem_region(dev); 1468 rte_free(dev->mem); 1469 dev->mem = NULL; 1470 1471 free_guest_pages: 1472 rte_free(dev->guest_pages); 1473 dev->guest_pages = NULL; 1474 close_msg_fds: 1475 close_msg_fds(ctx); 1476 return RTE_VHOST_MSG_RESULT_ERR; 1477 } 1478 1479 static bool 1480 vq_is_ready(struct virtio_net *dev, struct vhost_virtqueue *vq) 1481 { 1482 bool rings_ok; 1483 1484 if (!vq) 1485 return false; 1486 1487 if (vq_is_packed(dev)) 1488 rings_ok = vq->desc_packed && vq->driver_event && 1489 vq->device_event; 1490 else 1491 rings_ok = vq->desc && vq->avail && vq->used; 1492 1493 return rings_ok && 1494 vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD && 1495 vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD && 1496 vq->enabled; 1497 } 1498 1499 #define VIRTIO_BUILTIN_NUM_VQS_TO_BE_READY 2u 1500 1501 static int 1502 virtio_is_ready(struct virtio_net *dev) 1503 { 1504 struct vhost_virtqueue *vq; 1505 uint32_t i, nr_vring = dev->nr_vring; 1506 1507 if (dev->flags & VIRTIO_DEV_READY) 1508 return 1; 1509 1510 if (!dev->nr_vring) 1511 return 0; 1512 1513 if (dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET) { 1514 nr_vring = VIRTIO_BUILTIN_NUM_VQS_TO_BE_READY; 1515 1516 if (dev->nr_vring < nr_vring) 1517 return 0; 1518 } 1519 1520 for (i = 0; i < nr_vring; i++) { 1521 vq = dev->virtqueue[i]; 1522 1523 if (!vq_is_ready(dev, vq)) 1524 return 0; 1525 } 1526 1527 /* If supported, ensure the frontend is really done with config */ 1528 if (dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_STATUS)) 1529 if (!(dev->status & VIRTIO_DEVICE_STATUS_DRIVER_OK)) 1530 return 0; 1531 1532 dev->flags |= VIRTIO_DEV_READY; 1533 1534 if (!(dev->flags & VIRTIO_DEV_RUNNING)) 1535 VHOST_LOG_CONFIG(INFO, "(%s) virtio is now ready for processing.\n", dev->ifname); 1536 return 1; 1537 } 1538 1539 static void * 1540 inflight_mem_alloc(struct virtio_net *dev, const char *name, size_t size, int *fd) 1541 { 1542 void *ptr; 1543 int mfd = -1; 1544 char fname[20] = "/tmp/memfd-XXXXXX"; 1545 1546 *fd = -1; 1547 #ifdef MEMFD_SUPPORTED 1548 mfd = memfd_create(name, MFD_CLOEXEC); 1549 #else 1550 RTE_SET_USED(name); 1551 #endif 1552 if (mfd == -1) { 1553 mfd = mkstemp(fname); 1554 if (mfd == -1) { 1555 VHOST_LOG_CONFIG(ERR, "(%s) failed to get inflight buffer fd\n", 1556 dev->ifname); 1557 return NULL; 1558 } 1559 1560 unlink(fname); 1561 } 1562 1563 if (ftruncate(mfd, size) == -1) { 1564 VHOST_LOG_CONFIG(ERR, "(%s) failed to alloc inflight buffer\n", dev->ifname); 1565 close(mfd); 1566 return NULL; 1567 } 1568 1569 ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, mfd, 0); 1570 if (ptr == MAP_FAILED) { 1571 VHOST_LOG_CONFIG(ERR, "(%s) failed to mmap inflight buffer\n", dev->ifname); 1572 close(mfd); 1573 return NULL; 1574 } 1575 1576 *fd = mfd; 1577 return ptr; 1578 } 1579 1580 static uint32_t 1581 get_pervq_shm_size_split(uint16_t queue_size) 1582 { 1583 return RTE_ALIGN_MUL_CEIL(sizeof(struct rte_vhost_inflight_desc_split) * 1584 queue_size + sizeof(uint64_t) + 1585 sizeof(uint16_t) * 4, INFLIGHT_ALIGNMENT); 1586 } 1587 1588 static uint32_t 1589 get_pervq_shm_size_packed(uint16_t queue_size) 1590 { 1591 
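	/*
	 * Per-queue inflight area for a packed ring: the descriptor array
	 * (queue_size entries) plus the fixed header fields of
	 * struct rte_vhost_inflight_info_packed, rounded up to
	 * INFLIGHT_ALIGNMENT, mirroring the split variant above.
	 */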
return RTE_ALIGN_MUL_CEIL(sizeof(struct rte_vhost_inflight_desc_packed) 1592 * queue_size + sizeof(uint64_t) + 1593 sizeof(uint16_t) * 6 + sizeof(uint8_t) * 9, 1594 INFLIGHT_ALIGNMENT); 1595 } 1596 1597 static int 1598 vhost_user_get_inflight_fd(struct virtio_net **pdev, 1599 struct vhu_msg_context *ctx, 1600 int main_fd __rte_unused) 1601 { 1602 struct rte_vhost_inflight_info_packed *inflight_packed; 1603 uint64_t pervq_inflight_size, mmap_size; 1604 uint16_t num_queues, queue_size; 1605 struct virtio_net *dev = *pdev; 1606 int fd, i, j; 1607 int numa_node = SOCKET_ID_ANY; 1608 void *addr; 1609 1610 if (ctx->msg.size != sizeof(ctx->msg.payload.inflight)) { 1611 VHOST_LOG_CONFIG(ERR, "(%s) invalid get_inflight_fd message size is %d\n", 1612 dev->ifname, ctx->msg.size); 1613 return RTE_VHOST_MSG_RESULT_ERR; 1614 } 1615 1616 /* 1617 * If VQ 0 has already been allocated, try to allocate on the same 1618 * NUMA node. It can be reallocated later in numa_realloc(). 1619 */ 1620 if (dev->nr_vring > 0) 1621 numa_node = dev->virtqueue[0]->numa_node; 1622 1623 if (dev->inflight_info == NULL) { 1624 dev->inflight_info = rte_zmalloc_socket("inflight_info", 1625 sizeof(struct inflight_mem_info), 0, numa_node); 1626 if (!dev->inflight_info) { 1627 VHOST_LOG_CONFIG(ERR, "(%s) failed to alloc dev inflight area\n", 1628 dev->ifname); 1629 return RTE_VHOST_MSG_RESULT_ERR; 1630 } 1631 dev->inflight_info->fd = -1; 1632 } 1633 1634 num_queues = ctx->msg.payload.inflight.num_queues; 1635 queue_size = ctx->msg.payload.inflight.queue_size; 1636 1637 VHOST_LOG_CONFIG(INFO, "(%s) get_inflight_fd num_queues: %u\n", 1638 dev->ifname, ctx->msg.payload.inflight.num_queues); 1639 VHOST_LOG_CONFIG(INFO, "(%s) get_inflight_fd queue_size: %u\n", 1640 dev->ifname, ctx->msg.payload.inflight.queue_size); 1641 1642 if (vq_is_packed(dev)) 1643 pervq_inflight_size = get_pervq_shm_size_packed(queue_size); 1644 else 1645 pervq_inflight_size = get_pervq_shm_size_split(queue_size); 1646 1647 mmap_size = num_queues * pervq_inflight_size; 1648 addr = inflight_mem_alloc(dev, "vhost-inflight", mmap_size, &fd); 1649 if (!addr) { 1650 VHOST_LOG_CONFIG(ERR, "(%s) failed to alloc vhost inflight area\n", dev->ifname); 1651 ctx->msg.payload.inflight.mmap_size = 0; 1652 return RTE_VHOST_MSG_RESULT_ERR; 1653 } 1654 memset(addr, 0, mmap_size); 1655 1656 if (dev->inflight_info->addr) { 1657 munmap(dev->inflight_info->addr, dev->inflight_info->size); 1658 dev->inflight_info->addr = NULL; 1659 } 1660 1661 if (dev->inflight_info->fd >= 0) { 1662 close(dev->inflight_info->fd); 1663 dev->inflight_info->fd = -1; 1664 } 1665 1666 dev->inflight_info->addr = addr; 1667 dev->inflight_info->size = ctx->msg.payload.inflight.mmap_size = mmap_size; 1668 dev->inflight_info->fd = ctx->fds[0] = fd; 1669 ctx->msg.payload.inflight.mmap_offset = 0; 1670 ctx->fd_num = 1; 1671 1672 if (vq_is_packed(dev)) { 1673 for (i = 0; i < num_queues; i++) { 1674 inflight_packed = 1675 (struct rte_vhost_inflight_info_packed *)addr; 1676 inflight_packed->used_wrap_counter = 1; 1677 inflight_packed->old_used_wrap_counter = 1; 1678 for (j = 0; j < queue_size; j++) 1679 inflight_packed->desc[j].next = j + 1; 1680 addr = (void *)((char *)addr + pervq_inflight_size); 1681 } 1682 } 1683 1684 VHOST_LOG_CONFIG(INFO, "(%s) send inflight mmap_size: %"PRIu64"\n", 1685 dev->ifname, ctx->msg.payload.inflight.mmap_size); 1686 VHOST_LOG_CONFIG(INFO, "(%s) send inflight mmap_offset: %"PRIu64"\n", 1687 dev->ifname, ctx->msg.payload.inflight.mmap_offset); 1688 VHOST_LOG_CONFIG(INFO, "(%s) send 
inflight fd: %d\n", dev->ifname, ctx->fds[0]); 1689 1690 return RTE_VHOST_MSG_RESULT_REPLY; 1691 } 1692 1693 static int 1694 vhost_user_set_inflight_fd(struct virtio_net **pdev, 1695 struct vhu_msg_context *ctx, 1696 int main_fd __rte_unused) 1697 { 1698 uint64_t mmap_size, mmap_offset; 1699 uint16_t num_queues, queue_size; 1700 struct virtio_net *dev = *pdev; 1701 uint32_t pervq_inflight_size; 1702 struct vhost_virtqueue *vq; 1703 void *addr; 1704 int fd, i; 1705 int numa_node = SOCKET_ID_ANY; 1706 1707 fd = ctx->fds[0]; 1708 if (ctx->msg.size != sizeof(ctx->msg.payload.inflight) || fd < 0) { 1709 VHOST_LOG_CONFIG(ERR, "(%s) invalid set_inflight_fd message size is %d,fd is %d\n", 1710 dev->ifname, ctx->msg.size, fd); 1711 return RTE_VHOST_MSG_RESULT_ERR; 1712 } 1713 1714 mmap_size = ctx->msg.payload.inflight.mmap_size; 1715 mmap_offset = ctx->msg.payload.inflight.mmap_offset; 1716 num_queues = ctx->msg.payload.inflight.num_queues; 1717 queue_size = ctx->msg.payload.inflight.queue_size; 1718 1719 if (vq_is_packed(dev)) 1720 pervq_inflight_size = get_pervq_shm_size_packed(queue_size); 1721 else 1722 pervq_inflight_size = get_pervq_shm_size_split(queue_size); 1723 1724 VHOST_LOG_CONFIG(INFO, "(%s) set_inflight_fd mmap_size: %"PRIu64"\n", 1725 dev->ifname, mmap_size); 1726 VHOST_LOG_CONFIG(INFO, "(%s) set_inflight_fd mmap_offset: %"PRIu64"\n", 1727 dev->ifname, mmap_offset); 1728 VHOST_LOG_CONFIG(INFO, "(%s) set_inflight_fd num_queues: %u\n", dev->ifname, num_queues); 1729 VHOST_LOG_CONFIG(INFO, "(%s) set_inflight_fd queue_size: %u\n", dev->ifname, queue_size); 1730 VHOST_LOG_CONFIG(INFO, "(%s) set_inflight_fd fd: %d\n", dev->ifname, fd); 1731 VHOST_LOG_CONFIG(INFO, "(%s) set_inflight_fd pervq_inflight_size: %d\n", 1732 dev->ifname, pervq_inflight_size); 1733 1734 /* 1735 * If VQ 0 has already been allocated, try to allocate on the same 1736 * NUMA node. It can be reallocated later in numa_realloc(). 
1737 */ 1738 if (dev->nr_vring > 0) 1739 numa_node = dev->virtqueue[0]->numa_node; 1740 1741 if (!dev->inflight_info) { 1742 dev->inflight_info = rte_zmalloc_socket("inflight_info", 1743 sizeof(struct inflight_mem_info), 0, numa_node); 1744 if (dev->inflight_info == NULL) { 1745 VHOST_LOG_CONFIG(ERR, "(%s) failed to alloc dev inflight area\n", 1746 dev->ifname); 1747 return RTE_VHOST_MSG_RESULT_ERR; 1748 } 1749 dev->inflight_info->fd = -1; 1750 } 1751 1752 if (dev->inflight_info->addr) { 1753 munmap(dev->inflight_info->addr, dev->inflight_info->size); 1754 dev->inflight_info->addr = NULL; 1755 } 1756 1757 addr = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, 1758 fd, mmap_offset); 1759 if (addr == MAP_FAILED) { 1760 VHOST_LOG_CONFIG(ERR, "(%s) failed to mmap share memory.\n", dev->ifname); 1761 return RTE_VHOST_MSG_RESULT_ERR; 1762 } 1763 1764 if (dev->inflight_info->fd >= 0) { 1765 close(dev->inflight_info->fd); 1766 dev->inflight_info->fd = -1; 1767 } 1768 1769 dev->inflight_info->fd = fd; 1770 dev->inflight_info->addr = addr; 1771 dev->inflight_info->size = mmap_size; 1772 1773 for (i = 0; i < num_queues; i++) { 1774 vq = dev->virtqueue[i]; 1775 if (!vq) 1776 continue; 1777 1778 if (vq_is_packed(dev)) { 1779 vq->inflight_packed = addr; 1780 vq->inflight_packed->desc_num = queue_size; 1781 } else { 1782 vq->inflight_split = addr; 1783 vq->inflight_split->desc_num = queue_size; 1784 } 1785 addr = (void *)((char *)addr + pervq_inflight_size); 1786 } 1787 1788 return RTE_VHOST_MSG_RESULT_OK; 1789 } 1790 1791 static int 1792 vhost_user_set_vring_call(struct virtio_net **pdev, 1793 struct vhu_msg_context *ctx, 1794 int main_fd __rte_unused) 1795 { 1796 struct virtio_net *dev = *pdev; 1797 struct vhost_vring_file file; 1798 struct vhost_virtqueue *vq; 1799 int expected_fds; 1800 1801 expected_fds = (ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK) ? 0 : 1; 1802 if (validate_msg_fds(dev, ctx, expected_fds) != 0) 1803 return RTE_VHOST_MSG_RESULT_ERR; 1804 1805 file.index = ctx->msg.payload.u64 & VHOST_USER_VRING_IDX_MASK; 1806 if (ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK) 1807 file.fd = VIRTIO_INVALID_EVENTFD; 1808 else 1809 file.fd = ctx->fds[0]; 1810 VHOST_LOG_CONFIG(INFO, "(%s) vring call idx:%d file:%d\n", 1811 dev->ifname, file.index, file.fd); 1812 1813 vq = dev->virtqueue[file.index]; 1814 1815 if (vq->ready) { 1816 vq->ready = false; 1817 vhost_user_notify_queue_state(dev, file.index, 0); 1818 } 1819 1820 if (vq->callfd >= 0) 1821 close(vq->callfd); 1822 1823 vq->callfd = file.fd; 1824 1825 return RTE_VHOST_MSG_RESULT_OK; 1826 } 1827 1828 static int vhost_user_set_vring_err(struct virtio_net **pdev, 1829 struct vhu_msg_context *ctx, 1830 int main_fd __rte_unused) 1831 { 1832 struct virtio_net *dev = *pdev; 1833 int expected_fds; 1834 1835 expected_fds = (ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK) ? 
0 : 1; 1836 if (validate_msg_fds(dev, ctx, expected_fds) != 0) 1837 return RTE_VHOST_MSG_RESULT_ERR; 1838 1839 if (!(ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK)) 1840 close(ctx->fds[0]); 1841 VHOST_LOG_CONFIG(INFO, "(%s) not implemented\n", dev->ifname); 1842 1843 return RTE_VHOST_MSG_RESULT_OK; 1844 } 1845 1846 static int 1847 resubmit_desc_compare(const void *a, const void *b) 1848 { 1849 const struct rte_vhost_resubmit_desc *desc0 = a; 1850 const struct rte_vhost_resubmit_desc *desc1 = b; 1851 1852 if (desc1->counter > desc0->counter) 1853 return 1; 1854 1855 return -1; 1856 } 1857 1858 static int 1859 vhost_check_queue_inflights_split(struct virtio_net *dev, 1860 struct vhost_virtqueue *vq) 1861 { 1862 uint16_t i; 1863 uint16_t resubmit_num = 0, last_io, num; 1864 struct vring_used *used = vq->used; 1865 struct rte_vhost_resubmit_info *resubmit; 1866 struct rte_vhost_inflight_info_split *inflight_split; 1867 1868 if (!(dev->protocol_features & 1869 (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))) 1870 return RTE_VHOST_MSG_RESULT_OK; 1871 1872 /* The frontend may still not support the inflight feature 1873 * although we negotiate the protocol feature. 1874 */ 1875 if ((!vq->inflight_split)) 1876 return RTE_VHOST_MSG_RESULT_OK; 1877 1878 if (!vq->inflight_split->version) { 1879 vq->inflight_split->version = INFLIGHT_VERSION; 1880 return RTE_VHOST_MSG_RESULT_OK; 1881 } 1882 1883 if (vq->resubmit_inflight) 1884 return RTE_VHOST_MSG_RESULT_OK; 1885 1886 inflight_split = vq->inflight_split; 1887 vq->global_counter = 0; 1888 last_io = inflight_split->last_inflight_io; 1889 1890 if (inflight_split->used_idx != used->idx) { 1891 inflight_split->desc[last_io].inflight = 0; 1892 rte_atomic_thread_fence(__ATOMIC_SEQ_CST); 1893 inflight_split->used_idx = used->idx; 1894 } 1895 1896 for (i = 0; i < inflight_split->desc_num; i++) { 1897 if (inflight_split->desc[i].inflight == 1) 1898 resubmit_num++; 1899 } 1900 1901 vq->last_avail_idx += resubmit_num; 1902 1903 if (resubmit_num) { 1904 resubmit = rte_zmalloc_socket("resubmit", sizeof(struct rte_vhost_resubmit_info), 1905 0, vq->numa_node); 1906 if (!resubmit) { 1907 VHOST_LOG_CONFIG(ERR, 1908 "(%s) failed to allocate memory for resubmit info.\n", 1909 dev->ifname); 1910 return RTE_VHOST_MSG_RESULT_ERR; 1911 } 1912 1913 resubmit->resubmit_list = rte_zmalloc_socket("resubmit_list", 1914 resubmit_num * sizeof(struct rte_vhost_resubmit_desc), 1915 0, vq->numa_node); 1916 if (!resubmit->resubmit_list) { 1917 VHOST_LOG_CONFIG(ERR, 1918 "(%s) failed to allocate memory for inflight desc.\n", 1919 dev->ifname); 1920 rte_free(resubmit); 1921 return RTE_VHOST_MSG_RESULT_ERR; 1922 } 1923 1924 num = 0; 1925 for (i = 0; i < vq->inflight_split->desc_num; i++) { 1926 if (vq->inflight_split->desc[i].inflight == 1) { 1927 resubmit->resubmit_list[num].index = i; 1928 resubmit->resubmit_list[num].counter = 1929 inflight_split->desc[i].counter; 1930 num++; 1931 } 1932 } 1933 resubmit->resubmit_num = num; 1934 1935 if (resubmit->resubmit_num > 1) 1936 qsort(resubmit->resubmit_list, resubmit->resubmit_num, 1937 sizeof(struct rte_vhost_resubmit_desc), 1938 resubmit_desc_compare); 1939 1940 vq->global_counter = resubmit->resubmit_list[0].counter + 1; 1941 vq->resubmit_inflight = resubmit; 1942 } 1943 1944 return RTE_VHOST_MSG_RESULT_OK; 1945 } 1946 1947 static int 1948 vhost_check_queue_inflights_packed(struct virtio_net *dev, 1949 struct vhost_virtqueue *vq) 1950 { 1951 uint16_t i; 1952 uint16_t resubmit_num = 0, old_used_idx, num; 1953 struct rte_vhost_resubmit_info 
*resubmit; 1954 struct rte_vhost_inflight_info_packed *inflight_packed; 1955 1956 if (!(dev->protocol_features & 1957 (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))) 1958 return RTE_VHOST_MSG_RESULT_OK; 1959 1960 /* The frontend may still not support the inflight feature 1961 * although we negotiate the protocol feature. 1962 */ 1963 if ((!vq->inflight_packed)) 1964 return RTE_VHOST_MSG_RESULT_OK; 1965 1966 if (!vq->inflight_packed->version) { 1967 vq->inflight_packed->version = INFLIGHT_VERSION; 1968 return RTE_VHOST_MSG_RESULT_OK; 1969 } 1970 1971 if (vq->resubmit_inflight) 1972 return RTE_VHOST_MSG_RESULT_OK; 1973 1974 inflight_packed = vq->inflight_packed; 1975 vq->global_counter = 0; 1976 old_used_idx = inflight_packed->old_used_idx; 1977 1978 if (inflight_packed->used_idx != old_used_idx) { 1979 if (inflight_packed->desc[old_used_idx].inflight == 0) { 1980 inflight_packed->old_used_idx = 1981 inflight_packed->used_idx; 1982 inflight_packed->old_used_wrap_counter = 1983 inflight_packed->used_wrap_counter; 1984 inflight_packed->old_free_head = 1985 inflight_packed->free_head; 1986 } else { 1987 inflight_packed->used_idx = 1988 inflight_packed->old_used_idx; 1989 inflight_packed->used_wrap_counter = 1990 inflight_packed->old_used_wrap_counter; 1991 inflight_packed->free_head = 1992 inflight_packed->old_free_head; 1993 } 1994 } 1995 1996 for (i = 0; i < inflight_packed->desc_num; i++) { 1997 if (inflight_packed->desc[i].inflight == 1) 1998 resubmit_num++; 1999 } 2000 2001 if (resubmit_num) { 2002 resubmit = rte_zmalloc_socket("resubmit", sizeof(struct rte_vhost_resubmit_info), 2003 0, vq->numa_node); 2004 if (resubmit == NULL) { 2005 VHOST_LOG_CONFIG(ERR, 2006 "(%s) failed to allocate memory for resubmit info.\n", 2007 dev->ifname); 2008 return RTE_VHOST_MSG_RESULT_ERR; 2009 } 2010 2011 resubmit->resubmit_list = rte_zmalloc_socket("resubmit_list", 2012 resubmit_num * sizeof(struct rte_vhost_resubmit_desc), 2013 0, vq->numa_node); 2014 if (resubmit->resubmit_list == NULL) { 2015 VHOST_LOG_CONFIG(ERR, 2016 "(%s) failed to allocate memory for resubmit desc.\n", 2017 dev->ifname); 2018 rte_free(resubmit); 2019 return RTE_VHOST_MSG_RESULT_ERR; 2020 } 2021 2022 num = 0; 2023 for (i = 0; i < inflight_packed->desc_num; i++) { 2024 if (vq->inflight_packed->desc[i].inflight == 1) { 2025 resubmit->resubmit_list[num].index = i; 2026 resubmit->resubmit_list[num].counter = 2027 inflight_packed->desc[i].counter; 2028 num++; 2029 } 2030 } 2031 resubmit->resubmit_num = num; 2032 2033 if (resubmit->resubmit_num > 1) 2034 qsort(resubmit->resubmit_list, resubmit->resubmit_num, 2035 sizeof(struct rte_vhost_resubmit_desc), 2036 resubmit_desc_compare); 2037 2038 vq->global_counter = resubmit->resubmit_list[0].counter + 1; 2039 vq->resubmit_inflight = resubmit; 2040 } 2041 2042 return RTE_VHOST_MSG_RESULT_OK; 2043 } 2044 2045 static int 2046 vhost_user_set_vring_kick(struct virtio_net **pdev, 2047 struct vhu_msg_context *ctx, 2048 int main_fd __rte_unused) 2049 { 2050 struct virtio_net *dev = *pdev; 2051 struct vhost_vring_file file; 2052 struct vhost_virtqueue *vq; 2053 int expected_fds; 2054 2055 expected_fds = (ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK) ? 
0 : 1;
2056 	if (validate_msg_fds(dev, ctx, expected_fds) != 0)
2057 		return RTE_VHOST_MSG_RESULT_ERR;
2058 
2059 	file.index = ctx->msg.payload.u64 & VHOST_USER_VRING_IDX_MASK;
2060 	if (ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK)
2061 		file.fd = VIRTIO_INVALID_EVENTFD;
2062 	else
2063 		file.fd = ctx->fds[0];
2064 	VHOST_LOG_CONFIG(INFO, "(%s) vring kick idx:%d file:%d\n",
2065 			dev->ifname, file.index, file.fd);
2066 
2067 	/* Interpret ring addresses only when ring is started. */
2068 	dev = translate_ring_addresses(dev, file.index);
2069 	if (!dev) {
2070 		if (file.fd != VIRTIO_INVALID_EVENTFD)
2071 			close(file.fd);
2072 
2073 		return RTE_VHOST_MSG_RESULT_ERR;
2074 	}
2075 
2076 	*pdev = dev;
2077 
2078 	vq = dev->virtqueue[file.index];
2079 
2080 	/*
2081 	 * When VHOST_USER_F_PROTOCOL_FEATURES is not negotiated,
2082 	 * the ring starts already enabled. Otherwise, it is enabled via
2083 	 * the SET_VRING_ENABLE message.
2084 	 */
2085 	if (!(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) {
2086 		vq->enabled = true;
2087 	}
2088 
2089 	if (vq->ready) {
2090 		vq->ready = false;
2091 		vhost_user_notify_queue_state(dev, file.index, 0);
2092 	}
2093 
2094 	if (vq->kickfd >= 0)
2095 		close(vq->kickfd);
2096 	vq->kickfd = file.fd;
2097 
2098 	if (vq_is_packed(dev)) {
2099 		if (vhost_check_queue_inflights_packed(dev, vq)) {
2100 			VHOST_LOG_CONFIG(ERR, "(%s) failed to get inflights for vq: %d\n",
2101 					dev->ifname, file.index);
2102 			return RTE_VHOST_MSG_RESULT_ERR;
2103 		}
2104 	} else {
2105 		if (vhost_check_queue_inflights_split(dev, vq)) {
2106 			VHOST_LOG_CONFIG(ERR, "(%s) failed to get inflights for vq: %d\n",
2107 					dev->ifname, file.index);
2108 			return RTE_VHOST_MSG_RESULT_ERR;
2109 		}
2110 	}
2111 
2112 	return RTE_VHOST_MSG_RESULT_OK;
2113 }
2114 
2115 /*
2116  * When virtio is stopped, QEMU will send us the GET_VRING_BASE message.
2117  */
2118 static int
2119 vhost_user_get_vring_base(struct virtio_net **pdev,
2120 			struct vhu_msg_context *ctx,
2121 			int main_fd __rte_unused)
2122 {
2123 	struct virtio_net *dev = *pdev;
2124 	struct vhost_virtqueue *vq = dev->virtqueue[ctx->msg.payload.state.index];
2125 	uint64_t val;
2126 
2127 	if (validate_msg_fds(dev, ctx, 0) != 0)
2128 		return RTE_VHOST_MSG_RESULT_ERR;
2129 
2130 	/* We have to stop the queue (virtio) if it is running. */
2131 	vhost_destroy_device_notify(dev);
2132 
2133 	dev->flags &= ~VIRTIO_DEV_READY;
2134 	dev->flags &= ~VIRTIO_DEV_VDPA_CONFIGURED;
2135 
2136 	/* Here we are safe to get the indexes */
2137 	if (vq_is_packed(dev)) {
2138 		/*
2139 		 * Bit[0:14]: avail index
2140 		 * Bit[15]: avail wrap counter
2141 		 */
2142 		val = vq->last_avail_idx & 0x7fff;
2143 		val |= vq->avail_wrap_counter << 15;
2144 		ctx->msg.payload.state.num = val;
2145 	} else {
2146 		ctx->msg.payload.state.num = vq->last_avail_idx;
2147 	}
2148 
2149 	VHOST_LOG_CONFIG(INFO, "(%s) vring base idx:%d num:%d\n",
2150 			dev->ifname, ctx->msg.payload.state.index,
2151 			ctx->msg.payload.state.num);
2152 	/*
2153 	 * Based on the current QEMU vhost-user implementation, this message is
2154 	 * sent and only sent in vhost_vring_stop.
2155 	 * TODO: clean up the vring; it isn't usable from this point on.
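	 * The teardown below reflects that: the kick and call eventfds are
	 * closed, the shadow rings, batch-copy buffers and log cache are
	 * freed, and vring_invalidate() drops the translated ring addresses,
	 * so the vring cannot be used again until it is fully re-initialized.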
2156 	 */
2157 	if (vq->kickfd >= 0)
2158 		close(vq->kickfd);
2159 
2160 	vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
2161 
2162 	if (vq->callfd >= 0)
2163 		close(vq->callfd);
2164 
2165 	vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD;
2166 
2167 	vq->signalled_used_valid = false;
2168 
2169 	if (vq_is_packed(dev)) {
2170 		rte_free(vq->shadow_used_packed);
2171 		vq->shadow_used_packed = NULL;
2172 	} else {
2173 		rte_free(vq->shadow_used_split);
2174 		vq->shadow_used_split = NULL;
2175 	}
2176 
2177 	rte_free(vq->batch_copy_elems);
2178 	vq->batch_copy_elems = NULL;
2179 
2180 	rte_free(vq->log_cache);
2181 	vq->log_cache = NULL;
2182 
2183 	ctx->msg.size = sizeof(ctx->msg.payload.state);
2184 	ctx->fd_num = 0;
2185 
2186 	vhost_user_iotlb_flush_all(vq);
2187 
2188 	vring_invalidate(dev, vq);
2189 
2190 	return RTE_VHOST_MSG_RESULT_REPLY;
2191 }
2192 
2193 /*
2194  * When the virtio queues are ready, QEMU sends this message to enable or
2195  * disable a virtio queue pair.
2196  */
2197 static int
2198 vhost_user_set_vring_enable(struct virtio_net **pdev,
2199 			struct vhu_msg_context *ctx,
2200 			int main_fd __rte_unused)
2201 {
2202 	struct virtio_net *dev = *pdev;
2203 	bool enable = !!ctx->msg.payload.state.num;
2204 	int index = (int)ctx->msg.payload.state.index;
2205 
2206 	if (validate_msg_fds(dev, ctx, 0) != 0)
2207 		return RTE_VHOST_MSG_RESULT_ERR;
2208 
2209 	VHOST_LOG_CONFIG(INFO, "(%s) set queue enable: %d to qp idx: %d\n",
2210 			dev->ifname, enable, index);
2211 
2212 	if (enable && dev->virtqueue[index]->async) {
2213 		if (dev->virtqueue[index]->async->pkts_inflight_n) {
2214 			VHOST_LOG_CONFIG(ERR,
2215 				"(%s) failed to enable vring. Inflight packets must be completed first\n",
2216 				dev->ifname);
2217 			return RTE_VHOST_MSG_RESULT_ERR;
2218 		}
2219 	}
2220 
2221 	dev->virtqueue[index]->enabled = enable;
2222 
2223 	return RTE_VHOST_MSG_RESULT_OK;
2224 }
2225 
2226 static int
2227 vhost_user_get_protocol_features(struct virtio_net **pdev,
2228 			struct vhu_msg_context *ctx,
2229 			int main_fd __rte_unused)
2230 {
2231 	struct virtio_net *dev = *pdev;
2232 	uint64_t features, protocol_features;
2233 
2234 	if (validate_msg_fds(dev, ctx, 0) != 0)
2235 		return RTE_VHOST_MSG_RESULT_ERR;
2236 
2237 	rte_vhost_driver_get_features(dev->ifname, &features);
2238 	rte_vhost_driver_get_protocol_features(dev->ifname, &protocol_features);
2239 
2240 	ctx->msg.payload.u64 = protocol_features;
2241 	ctx->msg.size = sizeof(ctx->msg.payload.u64);
2242 	ctx->fd_num = 0;
2243 
2244 	return RTE_VHOST_MSG_RESULT_REPLY;
2245 }
2246 
2247 static int
2248 vhost_user_set_protocol_features(struct virtio_net **pdev,
2249 			struct vhu_msg_context *ctx,
2250 			int main_fd __rte_unused)
2251 {
2252 	struct virtio_net *dev = *pdev;
2253 	uint64_t protocol_features = ctx->msg.payload.u64;
2254 	uint64_t slave_protocol_features = 0;
2255 
2256 	if (validate_msg_fds(dev, ctx, 0) != 0)
2257 		return RTE_VHOST_MSG_RESULT_ERR;
2258 
2259 	rte_vhost_driver_get_protocol_features(dev->ifname,
2260 			&slave_protocol_features);
2261 	if (protocol_features & ~slave_protocol_features) {
2262 		VHOST_LOG_CONFIG(ERR, "(%s) received invalid protocol features.\n", dev->ifname);
2263 		return RTE_VHOST_MSG_RESULT_ERR;
2264 	}
2265 
2266 	dev->protocol_features = protocol_features;
2267 	VHOST_LOG_CONFIG(INFO, "(%s) negotiated Vhost-user protocol features: 0x%" PRIx64 "\n",
2268 			dev->ifname, dev->protocol_features);
2269 
2270 	return RTE_VHOST_MSG_RESULT_OK;
2271 }
2272 
2273 static int
2274 vhost_user_set_log_base(struct virtio_net **pdev,
2275 			struct vhu_msg_context *ctx,
2276 			int main_fd __rte_unused)
2277 {
2278 	struct virtio_net *dev = *pdev;
2279 	int fd = ctx->fds[0];
2280 	uint64_t size, off;
2281 	void *addr;
2282 	uint32_t i;
2283 
2284 	if (validate_msg_fds(dev, ctx, 1) != 0)
2285 		return RTE_VHOST_MSG_RESULT_ERR;
2286 
2287 	if (fd < 0) {
2288 		VHOST_LOG_CONFIG(ERR, "(%s) invalid log fd: %d\n", dev->ifname, fd);
2289 		return RTE_VHOST_MSG_RESULT_ERR;
2290 	}
2291 
2292 	if (ctx->msg.size != sizeof(VhostUserLog)) {
2293 		VHOST_LOG_CONFIG(ERR, "(%s) invalid log base msg size: %"PRId32" != %d\n",
2294 			dev->ifname, ctx->msg.size, (int)sizeof(VhostUserLog));
2295 		goto close_msg_fds;
2296 	}
2297 
2298 	size = ctx->msg.payload.log.mmap_size;
2299 	off = ctx->msg.payload.log.mmap_offset;
2300 
2301 	/* Check for mmap size and offset overflow: with unsigned 64-bit arithmetic, off >= -size means size + off would wrap around. */
2302 	if (off >= -size) {
2303 		VHOST_LOG_CONFIG(ERR,
2304 			"(%s) log offset %#"PRIx64" and log size %#"PRIx64" overflow\n",
2305 			dev->ifname, off, size);
2306 		goto close_msg_fds;
2307 	}
2308 
2309 	VHOST_LOG_CONFIG(INFO, "(%s) log mmap size: %"PRId64", offset: %"PRId64"\n",
2310 			dev->ifname, size, off);
2311 
2312 	/*
2313 	 * mmap from offset 0 to work around a hugepage mmap bug: mmap will
2314 	 * fail when the offset is not page-size aligned.
2315 	 */
2316 	addr = mmap(0, size + off, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
2317 	close(fd);
2318 	if (addr == MAP_FAILED) {
2319 		VHOST_LOG_CONFIG(ERR, "(%s) mmap log base failed!\n", dev->ifname);
2320 		return RTE_VHOST_MSG_RESULT_ERR;
2321 	}
2322 
2323 	/*
2324 	 * Free any previously mapped log memory, as VHOST_USER_SET_LOG_BASE
2325 	 * may be received multiple times.
2326 	 */
2327 	if (dev->log_addr) {
2328 		munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
2329 	}
2330 	dev->log_addr = (uint64_t)(uintptr_t)addr;
2331 	dev->log_base = dev->log_addr + off;
2332 	dev->log_size = size;
2333 
2334 	for (i = 0; i < dev->nr_vring; i++) {
2335 		struct vhost_virtqueue *vq = dev->virtqueue[i];
2336 
2337 		rte_free(vq->log_cache);
2338 		vq->log_cache = NULL;
2339 		vq->log_cache_nb_elem = 0;
2340 		vq->log_cache = rte_malloc_socket("vq log cache",
2341 				sizeof(struct log_cache_entry) * VHOST_LOG_CACHE_NR,
2342 				0, vq->numa_node);
2343 		/*
2344 		 * If the log cache allocation fails, don't fail migration;
2345 		 * caching is simply disabled, which will impact performance.
2346 		 */
2347 		if (!vq->log_cache)
2348 			VHOST_LOG_CONFIG(ERR, "(%s) failed to allocate VQ logging cache\n",
2349 					dev->ifname);
2350 	}
2351 
2352 	/*
2353 	 * The spec is not clear about it (yet), but QEMU doesn't expect
2354 	 * any payload in the reply.
2355 	 */
2356 	ctx->msg.size = 0;
2357 	ctx->fd_num = 0;
2358 
2359 	return RTE_VHOST_MSG_RESULT_REPLY;
2360 
2361 close_msg_fds:
2362 	close_msg_fds(ctx);
2363 	return RTE_VHOST_MSG_RESULT_ERR;
2364 }
2365 
2366 static int vhost_user_set_log_fd(struct virtio_net **pdev,
2367 			struct vhu_msg_context *ctx,
2368 			int main_fd __rte_unused)
2369 {
2370 	struct virtio_net *dev = *pdev;
2371 
2372 	if (validate_msg_fds(dev, ctx, 1) != 0)
2373 		return RTE_VHOST_MSG_RESULT_ERR;
2374 
2375 	close(ctx->fds[0]);
2376 	VHOST_LOG_CONFIG(INFO, "(%s) not implemented.\n", dev->ifname);
2377 
2378 	return RTE_VHOST_MSG_RESULT_OK;
2379 }
2380 
2381 /*
2382  * A RARP packet is constructed and broadcast to notify switches about the
2383  * new location of the migrated VM, so that packets from outside are not
2384  * lost after migration.
2385  *
2386  * However, we don't actually "send" a RARP packet here; instead, we set
2387  * the 'broadcast_rarp' flag and let rte_vhost_dequeue_burst() inject it.
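 * The guest MAC address is carried in the first six bytes of the message's
 * u64 payload; it is copied into dev->mac before the flag is set so that
 * the injected frame advertises the migrated guest's address.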
2388 */ 2389 static int 2390 vhost_user_send_rarp(struct virtio_net **pdev, 2391 struct vhu_msg_context *ctx, 2392 int main_fd __rte_unused) 2393 { 2394 struct virtio_net *dev = *pdev; 2395 uint8_t *mac = (uint8_t *)&ctx->msg.payload.u64; 2396 struct rte_vdpa_device *vdpa_dev; 2397 2398 if (validate_msg_fds(dev, ctx, 0) != 0) 2399 return RTE_VHOST_MSG_RESULT_ERR; 2400 2401 VHOST_LOG_CONFIG(DEBUG, "(%s) MAC: " RTE_ETHER_ADDR_PRT_FMT "\n", 2402 dev->ifname, mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); 2403 memcpy(dev->mac.addr_bytes, mac, 6); 2404 2405 /* 2406 * Set the flag to inject a RARP broadcast packet at 2407 * rte_vhost_dequeue_burst(). 2408 * 2409 * __ATOMIC_RELEASE ordering is for making sure the mac is 2410 * copied before the flag is set. 2411 */ 2412 __atomic_store_n(&dev->broadcast_rarp, 1, __ATOMIC_RELEASE); 2413 vdpa_dev = dev->vdpa_dev; 2414 if (vdpa_dev && vdpa_dev->ops->migration_done) 2415 vdpa_dev->ops->migration_done(dev->vid); 2416 2417 return RTE_VHOST_MSG_RESULT_OK; 2418 } 2419 2420 static int 2421 vhost_user_net_set_mtu(struct virtio_net **pdev, 2422 struct vhu_msg_context *ctx, 2423 int main_fd __rte_unused) 2424 { 2425 struct virtio_net *dev = *pdev; 2426 2427 if (validate_msg_fds(dev, ctx, 0) != 0) 2428 return RTE_VHOST_MSG_RESULT_ERR; 2429 2430 if (ctx->msg.payload.u64 < VIRTIO_MIN_MTU || 2431 ctx->msg.payload.u64 > VIRTIO_MAX_MTU) { 2432 VHOST_LOG_CONFIG(ERR, "(%s) invalid MTU size (%"PRIu64")\n", 2433 dev->ifname, ctx->msg.payload.u64); 2434 2435 return RTE_VHOST_MSG_RESULT_ERR; 2436 } 2437 2438 dev->mtu = ctx->msg.payload.u64; 2439 2440 return RTE_VHOST_MSG_RESULT_OK; 2441 } 2442 2443 static int 2444 vhost_user_set_req_fd(struct virtio_net **pdev, 2445 struct vhu_msg_context *ctx, 2446 int main_fd __rte_unused) 2447 { 2448 struct virtio_net *dev = *pdev; 2449 int fd = ctx->fds[0]; 2450 2451 if (validate_msg_fds(dev, ctx, 1) != 0) 2452 return RTE_VHOST_MSG_RESULT_ERR; 2453 2454 if (fd < 0) { 2455 VHOST_LOG_CONFIG(ERR, "(%s) invalid file descriptor for slave channel (%d)\n", 2456 dev->ifname, fd); 2457 return RTE_VHOST_MSG_RESULT_ERR; 2458 } 2459 2460 if (dev->slave_req_fd >= 0) 2461 close(dev->slave_req_fd); 2462 2463 dev->slave_req_fd = fd; 2464 2465 return RTE_VHOST_MSG_RESULT_OK; 2466 } 2467 2468 static int 2469 is_vring_iotlb_split(struct vhost_virtqueue *vq, struct vhost_iotlb_msg *imsg) 2470 { 2471 struct vhost_vring_addr *ra; 2472 uint64_t start, end, len; 2473 2474 start = imsg->iova; 2475 end = start + imsg->size; 2476 2477 ra = &vq->ring_addrs; 2478 len = sizeof(struct vring_desc) * vq->size; 2479 if (ra->desc_user_addr < end && (ra->desc_user_addr + len) > start) 2480 return 1; 2481 2482 len = sizeof(struct vring_avail) + sizeof(uint16_t) * vq->size; 2483 if (ra->avail_user_addr < end && (ra->avail_user_addr + len) > start) 2484 return 1; 2485 2486 len = sizeof(struct vring_used) + 2487 sizeof(struct vring_used_elem) * vq->size; 2488 if (ra->used_user_addr < end && (ra->used_user_addr + len) > start) 2489 return 1; 2490 2491 if (ra->flags & (1 << VHOST_VRING_F_LOG)) { 2492 len = sizeof(uint64_t); 2493 if (ra->log_guest_addr < end && 2494 (ra->log_guest_addr + len) > start) 2495 return 1; 2496 } 2497 2498 return 0; 2499 } 2500 2501 static int 2502 is_vring_iotlb_packed(struct vhost_virtqueue *vq, struct vhost_iotlb_msg *imsg) 2503 { 2504 struct vhost_vring_addr *ra; 2505 uint64_t start, end, len; 2506 2507 start = imsg->iova; 2508 end = start + imsg->size; 2509 2510 ra = &vq->ring_addrs; 2511 len = sizeof(struct vring_packed_desc) * vq->size; 
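	/*
	 * Each check below is a plain interval-overlap test: the IOTLB
	 * message's range [start, end) intersects a ring area
	 * [addr, addr + len) iff addr < end && addr + len > start.
	 */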
2512 if (ra->desc_user_addr < end && (ra->desc_user_addr + len) > start) 2513 return 1; 2514 2515 len = sizeof(struct vring_packed_desc_event); 2516 if (ra->avail_user_addr < end && (ra->avail_user_addr + len) > start) 2517 return 1; 2518 2519 len = sizeof(struct vring_packed_desc_event); 2520 if (ra->used_user_addr < end && (ra->used_user_addr + len) > start) 2521 return 1; 2522 2523 if (ra->flags & (1 << VHOST_VRING_F_LOG)) { 2524 len = sizeof(uint64_t); 2525 if (ra->log_guest_addr < end && 2526 (ra->log_guest_addr + len) > start) 2527 return 1; 2528 } 2529 2530 return 0; 2531 } 2532 2533 static int is_vring_iotlb(struct virtio_net *dev, 2534 struct vhost_virtqueue *vq, 2535 struct vhost_iotlb_msg *imsg) 2536 { 2537 if (vq_is_packed(dev)) 2538 return is_vring_iotlb_packed(vq, imsg); 2539 else 2540 return is_vring_iotlb_split(vq, imsg); 2541 } 2542 2543 static int 2544 vhost_user_iotlb_msg(struct virtio_net **pdev, 2545 struct vhu_msg_context *ctx, 2546 int main_fd __rte_unused) 2547 { 2548 struct virtio_net *dev = *pdev; 2549 struct vhost_iotlb_msg *imsg = &ctx->msg.payload.iotlb; 2550 uint16_t i; 2551 uint64_t vva, len; 2552 2553 if (validate_msg_fds(dev, ctx, 0) != 0) 2554 return RTE_VHOST_MSG_RESULT_ERR; 2555 2556 switch (imsg->type) { 2557 case VHOST_IOTLB_UPDATE: 2558 len = imsg->size; 2559 vva = qva_to_vva(dev, imsg->uaddr, &len); 2560 if (!vva) 2561 return RTE_VHOST_MSG_RESULT_ERR; 2562 2563 for (i = 0; i < dev->nr_vring; i++) { 2564 struct vhost_virtqueue *vq = dev->virtqueue[i]; 2565 2566 if (!vq) 2567 continue; 2568 2569 vhost_user_iotlb_cache_insert(dev, vq, imsg->iova, vva, 2570 len, imsg->perm); 2571 2572 if (is_vring_iotlb(dev, vq, imsg)) 2573 *pdev = dev = translate_ring_addresses(dev, i); 2574 } 2575 break; 2576 case VHOST_IOTLB_INVALIDATE: 2577 for (i = 0; i < dev->nr_vring; i++) { 2578 struct vhost_virtqueue *vq = dev->virtqueue[i]; 2579 2580 if (!vq) 2581 continue; 2582 2583 vhost_user_iotlb_cache_remove(vq, imsg->iova, 2584 imsg->size); 2585 2586 if (is_vring_iotlb(dev, vq, imsg)) 2587 vring_invalidate(dev, vq); 2588 } 2589 break; 2590 default: 2591 VHOST_LOG_CONFIG(ERR, "(%s) invalid IOTLB message type (%d)\n", 2592 dev->ifname, imsg->type); 2593 return RTE_VHOST_MSG_RESULT_ERR; 2594 } 2595 2596 return RTE_VHOST_MSG_RESULT_OK; 2597 } 2598 2599 static int 2600 vhost_user_set_postcopy_advise(struct virtio_net **pdev, 2601 struct vhu_msg_context *ctx, 2602 int main_fd __rte_unused) 2603 { 2604 struct virtio_net *dev = *pdev; 2605 #ifdef RTE_LIBRTE_VHOST_POSTCOPY 2606 struct uffdio_api api_struct; 2607 2608 if (validate_msg_fds(dev, ctx, 0) != 0) 2609 return RTE_VHOST_MSG_RESULT_ERR; 2610 2611 dev->postcopy_ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); 2612 2613 if (dev->postcopy_ufd == -1) { 2614 VHOST_LOG_CONFIG(ERR, "(%s) userfaultfd not available: %s\n", 2615 dev->ifname, strerror(errno)); 2616 return RTE_VHOST_MSG_RESULT_ERR; 2617 } 2618 api_struct.api = UFFD_API; 2619 api_struct.features = 0; 2620 if (ioctl(dev->postcopy_ufd, UFFDIO_API, &api_struct)) { 2621 VHOST_LOG_CONFIG(ERR, "(%s) UFFDIO_API ioctl failure: %s\n", 2622 dev->ifname, strerror(errno)); 2623 close(dev->postcopy_ufd); 2624 dev->postcopy_ufd = -1; 2625 return RTE_VHOST_MSG_RESULT_ERR; 2626 } 2627 ctx->fds[0] = dev->postcopy_ufd; 2628 ctx->fd_num = 1; 2629 2630 return RTE_VHOST_MSG_RESULT_REPLY; 2631 #else 2632 dev->postcopy_ufd = -1; 2633 ctx->fd_num = 0; 2634 2635 return RTE_VHOST_MSG_RESULT_ERR; 2636 #endif 2637 } 2638 2639 static int 2640 vhost_user_set_postcopy_listen(struct virtio_net 
**pdev, 2641 struct vhu_msg_context *ctx __rte_unused, 2642 int main_fd __rte_unused) 2643 { 2644 struct virtio_net *dev = *pdev; 2645 2646 if (validate_msg_fds(dev, ctx, 0) != 0) 2647 return RTE_VHOST_MSG_RESULT_ERR; 2648 2649 if (dev->mem && dev->mem->nregions) { 2650 VHOST_LOG_CONFIG(ERR, "(%s) regions already registered at postcopy-listen\n", 2651 dev->ifname); 2652 return RTE_VHOST_MSG_RESULT_ERR; 2653 } 2654 dev->postcopy_listening = 1; 2655 2656 return RTE_VHOST_MSG_RESULT_OK; 2657 } 2658 2659 static int 2660 vhost_user_postcopy_end(struct virtio_net **pdev, 2661 struct vhu_msg_context *ctx, 2662 int main_fd __rte_unused) 2663 { 2664 struct virtio_net *dev = *pdev; 2665 2666 if (validate_msg_fds(dev, ctx, 0) != 0) 2667 return RTE_VHOST_MSG_RESULT_ERR; 2668 2669 dev->postcopy_listening = 0; 2670 if (dev->postcopy_ufd >= 0) { 2671 close(dev->postcopy_ufd); 2672 dev->postcopy_ufd = -1; 2673 } 2674 2675 ctx->msg.payload.u64 = 0; 2676 ctx->msg.size = sizeof(ctx->msg.payload.u64); 2677 ctx->fd_num = 0; 2678 2679 return RTE_VHOST_MSG_RESULT_REPLY; 2680 } 2681 2682 static int 2683 vhost_user_get_status(struct virtio_net **pdev, 2684 struct vhu_msg_context *ctx, 2685 int main_fd __rte_unused) 2686 { 2687 struct virtio_net *dev = *pdev; 2688 2689 if (validate_msg_fds(dev, ctx, 0) != 0) 2690 return RTE_VHOST_MSG_RESULT_ERR; 2691 2692 ctx->msg.payload.u64 = dev->status; 2693 ctx->msg.size = sizeof(ctx->msg.payload.u64); 2694 ctx->fd_num = 0; 2695 2696 return RTE_VHOST_MSG_RESULT_REPLY; 2697 } 2698 2699 static int 2700 vhost_user_set_status(struct virtio_net **pdev, 2701 struct vhu_msg_context *ctx, 2702 int main_fd __rte_unused) 2703 { 2704 struct virtio_net *dev = *pdev; 2705 2706 if (validate_msg_fds(dev, ctx, 0) != 0) 2707 return RTE_VHOST_MSG_RESULT_ERR; 2708 2709 /* As per Virtio specification, the device status is 8bits long */ 2710 if (ctx->msg.payload.u64 > UINT8_MAX) { 2711 VHOST_LOG_CONFIG(ERR, "(%s) invalid VHOST_USER_SET_STATUS payload 0x%" PRIx64 "\n", 2712 dev->ifname, ctx->msg.payload.u64); 2713 return RTE_VHOST_MSG_RESULT_ERR; 2714 } 2715 2716 dev->status = ctx->msg.payload.u64; 2717 2718 if ((dev->status & VIRTIO_DEVICE_STATUS_FEATURES_OK) && 2719 (dev->flags & VIRTIO_DEV_FEATURES_FAILED)) { 2720 VHOST_LOG_CONFIG(ERR, 2721 "(%s) FEATURES_OK bit is set but feature negotiation failed\n", 2722 dev->ifname); 2723 /* 2724 * Clear the bit to let the driver know about the feature 2725 * negotiation failure 2726 */ 2727 dev->status &= ~VIRTIO_DEVICE_STATUS_FEATURES_OK; 2728 } 2729 2730 VHOST_LOG_CONFIG(INFO, "(%s) new device status(0x%08x):\n", dev->ifname, 2731 dev->status); 2732 VHOST_LOG_CONFIG(INFO, "(%s)\t-RESET: %u\n", dev->ifname, 2733 (dev->status == VIRTIO_DEVICE_STATUS_RESET)); 2734 VHOST_LOG_CONFIG(INFO, "(%s)\t-ACKNOWLEDGE: %u\n", dev->ifname, 2735 !!(dev->status & VIRTIO_DEVICE_STATUS_ACK)); 2736 VHOST_LOG_CONFIG(INFO, "(%s)\t-DRIVER: %u\n", dev->ifname, 2737 !!(dev->status & VIRTIO_DEVICE_STATUS_DRIVER)); 2738 VHOST_LOG_CONFIG(INFO, "(%s)\t-FEATURES_OK: %u\n", dev->ifname, 2739 !!(dev->status & VIRTIO_DEVICE_STATUS_FEATURES_OK)); 2740 VHOST_LOG_CONFIG(INFO, "(%s)\t-DRIVER_OK: %u\n", dev->ifname, 2741 !!(dev->status & VIRTIO_DEVICE_STATUS_DRIVER_OK)); 2742 VHOST_LOG_CONFIG(INFO, "(%s)\t-DEVICE_NEED_RESET: %u\n", dev->ifname, 2743 !!(dev->status & VIRTIO_DEVICE_STATUS_DEV_NEED_RESET)); 2744 VHOST_LOG_CONFIG(INFO, "(%s)\t-FAILED: %u\n", dev->ifname, 2745 !!(dev->status & VIRTIO_DEVICE_STATUS_FAILED)); 2746 2747 return RTE_VHOST_MSG_RESULT_OK; 2748 } 2749 2750 typedef int 
(*vhost_message_handler_t)(struct virtio_net **pdev,
2751 			struct vhu_msg_context *ctx,
2752 			int main_fd);
2753 
2754 static vhost_message_handler_t vhost_message_handlers[VHOST_USER_MAX] = {
2755 	[VHOST_USER_NONE] = NULL,
2756 	[VHOST_USER_GET_FEATURES] = vhost_user_get_features,
2757 	[VHOST_USER_SET_FEATURES] = vhost_user_set_features,
2758 	[VHOST_USER_SET_OWNER] = vhost_user_set_owner,
2759 	[VHOST_USER_RESET_OWNER] = vhost_user_reset_owner,
2760 	[VHOST_USER_SET_MEM_TABLE] = vhost_user_set_mem_table,
2761 	[VHOST_USER_SET_LOG_BASE] = vhost_user_set_log_base,
2762 	[VHOST_USER_SET_LOG_FD] = vhost_user_set_log_fd,
2763 	[VHOST_USER_SET_VRING_NUM] = vhost_user_set_vring_num,
2764 	[VHOST_USER_SET_VRING_ADDR] = vhost_user_set_vring_addr,
2765 	[VHOST_USER_SET_VRING_BASE] = vhost_user_set_vring_base,
2766 	[VHOST_USER_GET_VRING_BASE] = vhost_user_get_vring_base,
2767 	[VHOST_USER_SET_VRING_KICK] = vhost_user_set_vring_kick,
2768 	[VHOST_USER_SET_VRING_CALL] = vhost_user_set_vring_call,
2769 	[VHOST_USER_SET_VRING_ERR] = vhost_user_set_vring_err,
2770 	[VHOST_USER_GET_PROTOCOL_FEATURES] = vhost_user_get_protocol_features,
2771 	[VHOST_USER_SET_PROTOCOL_FEATURES] = vhost_user_set_protocol_features,
2772 	[VHOST_USER_GET_QUEUE_NUM] = vhost_user_get_queue_num,
2773 	[VHOST_USER_SET_VRING_ENABLE] = vhost_user_set_vring_enable,
2774 	[VHOST_USER_SEND_RARP] = vhost_user_send_rarp,
2775 	[VHOST_USER_NET_SET_MTU] = vhost_user_net_set_mtu,
2776 	[VHOST_USER_SET_SLAVE_REQ_FD] = vhost_user_set_req_fd,
2777 	[VHOST_USER_IOTLB_MSG] = vhost_user_iotlb_msg,
2778 	[VHOST_USER_POSTCOPY_ADVISE] = vhost_user_set_postcopy_advise,
2779 	[VHOST_USER_POSTCOPY_LISTEN] = vhost_user_set_postcopy_listen,
2780 	[VHOST_USER_POSTCOPY_END] = vhost_user_postcopy_end,
2781 	[VHOST_USER_GET_INFLIGHT_FD] = vhost_user_get_inflight_fd,
2782 	[VHOST_USER_SET_INFLIGHT_FD] = vhost_user_set_inflight_fd,
2783 	[VHOST_USER_SET_STATUS] = vhost_user_set_status,
2784 	[VHOST_USER_GET_STATUS] = vhost_user_get_status,
2785 };
2786 
2787 /* Return the number of bytes read on success, or a negative value on failure.
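 * The fixed-size header (VHOST_USER_HDR_SIZE bytes) is read first, together
 * with any ancillary file descriptors; if the header announces a payload
 * (msg.size != 0), that many bytes are then read from the same socket.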
*/ 2788 static int 2789 read_vhost_message(struct virtio_net *dev, int sockfd, struct vhu_msg_context *ctx) 2790 { 2791 int ret; 2792 2793 ret = read_fd_message(dev->ifname, sockfd, (char *)&ctx->msg, VHOST_USER_HDR_SIZE, 2794 ctx->fds, VHOST_MEMORY_MAX_NREGIONS, &ctx->fd_num); 2795 if (ret <= 0) { 2796 return ret; 2797 } else if (ret != VHOST_USER_HDR_SIZE) { 2798 VHOST_LOG_CONFIG(ERR, "(%s) Unexpected header size read\n", dev->ifname); 2799 close_msg_fds(ctx); 2800 return -1; 2801 } 2802 2803 if (ctx->msg.size) { 2804 if (ctx->msg.size > sizeof(ctx->msg.payload)) { 2805 VHOST_LOG_CONFIG(ERR, "(%s) invalid msg size: %d\n", 2806 dev->ifname, ctx->msg.size); 2807 return -1; 2808 } 2809 ret = read(sockfd, &ctx->msg.payload, ctx->msg.size); 2810 if (ret <= 0) 2811 return ret; 2812 if (ret != (int)ctx->msg.size) { 2813 VHOST_LOG_CONFIG(ERR, "(%s) read control message failed\n", dev->ifname); 2814 return -1; 2815 } 2816 } 2817 2818 return ret; 2819 } 2820 2821 static int 2822 send_vhost_message(struct virtio_net *dev, int sockfd, struct vhu_msg_context *ctx) 2823 { 2824 if (!ctx) 2825 return 0; 2826 2827 return send_fd_message(dev->ifname, sockfd, (char *)&ctx->msg, 2828 VHOST_USER_HDR_SIZE + ctx->msg.size, ctx->fds, ctx->fd_num); 2829 } 2830 2831 static int 2832 send_vhost_reply(struct virtio_net *dev, int sockfd, struct vhu_msg_context *ctx) 2833 { 2834 if (!ctx) 2835 return 0; 2836 2837 ctx->msg.flags &= ~VHOST_USER_VERSION_MASK; 2838 ctx->msg.flags &= ~VHOST_USER_NEED_REPLY; 2839 ctx->msg.flags |= VHOST_USER_VERSION; 2840 ctx->msg.flags |= VHOST_USER_REPLY_MASK; 2841 2842 return send_vhost_message(dev, sockfd, ctx); 2843 } 2844 2845 static int 2846 send_vhost_slave_message(struct virtio_net *dev, 2847 struct vhu_msg_context *ctx) 2848 { 2849 int ret; 2850 2851 if (ctx->msg.flags & VHOST_USER_NEED_REPLY) 2852 rte_spinlock_lock(&dev->slave_req_lock); 2853 2854 ret = send_vhost_message(dev, dev->slave_req_fd, ctx); 2855 if (ret < 0 && (ctx->msg.flags & VHOST_USER_NEED_REPLY)) 2856 rte_spinlock_unlock(&dev->slave_req_lock); 2857 2858 return ret; 2859 } 2860 2861 /* 2862 * Allocate a queue pair if it hasn't been allocated yet 2863 */ 2864 static int 2865 vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, 2866 struct vhu_msg_context *ctx) 2867 { 2868 uint32_t vring_idx; 2869 2870 switch (ctx->msg.request.master) { 2871 case VHOST_USER_SET_VRING_KICK: 2872 case VHOST_USER_SET_VRING_CALL: 2873 case VHOST_USER_SET_VRING_ERR: 2874 vring_idx = ctx->msg.payload.u64 & VHOST_USER_VRING_IDX_MASK; 2875 break; 2876 case VHOST_USER_SET_VRING_NUM: 2877 case VHOST_USER_SET_VRING_BASE: 2878 case VHOST_USER_GET_VRING_BASE: 2879 case VHOST_USER_SET_VRING_ENABLE: 2880 vring_idx = ctx->msg.payload.state.index; 2881 break; 2882 case VHOST_USER_SET_VRING_ADDR: 2883 vring_idx = ctx->msg.payload.addr.index; 2884 break; 2885 default: 2886 return 0; 2887 } 2888 2889 if (vring_idx >= VHOST_MAX_VRING) { 2890 VHOST_LOG_CONFIG(ERR, "(%s) invalid vring index: %u\n", dev->ifname, vring_idx); 2891 return -1; 2892 } 2893 2894 if (dev->virtqueue[vring_idx]) 2895 return 0; 2896 2897 return alloc_vring_queue(dev, vring_idx); 2898 } 2899 2900 static void 2901 vhost_user_lock_all_queue_pairs(struct virtio_net *dev) 2902 { 2903 unsigned int i = 0; 2904 unsigned int vq_num = 0; 2905 2906 while (vq_num < dev->nr_vring) { 2907 struct vhost_virtqueue *vq = dev->virtqueue[i]; 2908 2909 if (vq) { 2910 rte_spinlock_lock(&vq->access_lock); 2911 vq_num++; 2912 } 2913 i++; 2914 } 2915 } 2916 2917 static void 2918 
vhost_user_unlock_all_queue_pairs(struct virtio_net *dev) 2919 { 2920 unsigned int i = 0; 2921 unsigned int vq_num = 0; 2922 2923 while (vq_num < dev->nr_vring) { 2924 struct vhost_virtqueue *vq = dev->virtqueue[i]; 2925 2926 if (vq) { 2927 rte_spinlock_unlock(&vq->access_lock); 2928 vq_num++; 2929 } 2930 i++; 2931 } 2932 } 2933 2934 int 2935 vhost_user_msg_handler(int vid, int fd) 2936 { 2937 struct virtio_net *dev; 2938 struct vhu_msg_context ctx; 2939 struct rte_vdpa_device *vdpa_dev; 2940 int ret; 2941 int unlock_required = 0; 2942 bool handled; 2943 int request; 2944 uint32_t i; 2945 2946 dev = get_device(vid); 2947 if (dev == NULL) 2948 return -1; 2949 2950 if (!dev->notify_ops) { 2951 dev->notify_ops = vhost_driver_callback_get(dev->ifname); 2952 if (!dev->notify_ops) { 2953 VHOST_LOG_CONFIG(ERR, "(%s) failed to get callback ops for driver\n", 2954 dev->ifname); 2955 return -1; 2956 } 2957 } 2958 2959 ret = read_vhost_message(dev, fd, &ctx); 2960 if (ret <= 0) { 2961 if (ret < 0) 2962 VHOST_LOG_CONFIG(ERR, "(%s) vhost read message failed\n", dev->ifname); 2963 else 2964 VHOST_LOG_CONFIG(INFO, "(%s) vhost peer closed\n", dev->ifname); 2965 2966 return -1; 2967 } 2968 2969 ret = 0; 2970 request = ctx.msg.request.master; 2971 if (request > VHOST_USER_NONE && request < VHOST_USER_MAX && 2972 vhost_message_str[request]) { 2973 if (request != VHOST_USER_IOTLB_MSG) 2974 VHOST_LOG_CONFIG(INFO, "(%s) read message %s\n", 2975 dev->ifname, vhost_message_str[request]); 2976 else 2977 VHOST_LOG_CONFIG(DEBUG, "(%s) read message %s\n", 2978 dev->ifname, vhost_message_str[request]); 2979 } else { 2980 VHOST_LOG_CONFIG(DEBUG, "(%s) external request %d\n", dev->ifname, request); 2981 } 2982 2983 ret = vhost_user_check_and_alloc_queue_pair(dev, &ctx); 2984 if (ret < 0) { 2985 VHOST_LOG_CONFIG(ERR, "(%s) failed to alloc queue\n", dev->ifname); 2986 return -1; 2987 } 2988 2989 /* 2990 * Note: we don't lock all queues on VHOST_USER_GET_VRING_BASE 2991 * and VHOST_USER_RESET_OWNER, since it is sent when virtio stops 2992 * and device is destroyed. destroy_device waits for queues to be 2993 * inactive, so it is safe. Otherwise taking the access_lock 2994 * would cause a dead lock. 
2995 */ 2996 switch (request) { 2997 case VHOST_USER_SET_FEATURES: 2998 case VHOST_USER_SET_PROTOCOL_FEATURES: 2999 case VHOST_USER_SET_OWNER: 3000 case VHOST_USER_SET_MEM_TABLE: 3001 case VHOST_USER_SET_LOG_BASE: 3002 case VHOST_USER_SET_LOG_FD: 3003 case VHOST_USER_SET_VRING_NUM: 3004 case VHOST_USER_SET_VRING_ADDR: 3005 case VHOST_USER_SET_VRING_BASE: 3006 case VHOST_USER_SET_VRING_KICK: 3007 case VHOST_USER_SET_VRING_CALL: 3008 case VHOST_USER_SET_VRING_ERR: 3009 case VHOST_USER_SET_VRING_ENABLE: 3010 case VHOST_USER_SEND_RARP: 3011 case VHOST_USER_NET_SET_MTU: 3012 case VHOST_USER_SET_SLAVE_REQ_FD: 3013 if (!(dev->flags & VIRTIO_DEV_VDPA_CONFIGURED)) { 3014 vhost_user_lock_all_queue_pairs(dev); 3015 unlock_required = 1; 3016 } 3017 break; 3018 default: 3019 break; 3020 3021 } 3022 3023 handled = false; 3024 if (dev->extern_ops.pre_msg_handle) { 3025 ret = (*dev->extern_ops.pre_msg_handle)(dev->vid, 3026 (void *)&ctx.msg); 3027 switch (ret) { 3028 case RTE_VHOST_MSG_RESULT_REPLY: 3029 send_vhost_reply(dev, fd, &ctx); 3030 /* Fall-through */ 3031 case RTE_VHOST_MSG_RESULT_ERR: 3032 case RTE_VHOST_MSG_RESULT_OK: 3033 handled = true; 3034 goto skip_to_post_handle; 3035 case RTE_VHOST_MSG_RESULT_NOT_HANDLED: 3036 default: 3037 break; 3038 } 3039 } 3040 3041 if (request > VHOST_USER_NONE && request < VHOST_USER_MAX) { 3042 if (!vhost_message_handlers[request]) 3043 goto skip_to_post_handle; 3044 ret = vhost_message_handlers[request](&dev, &ctx, fd); 3045 3046 switch (ret) { 3047 case RTE_VHOST_MSG_RESULT_ERR: 3048 VHOST_LOG_CONFIG(ERR, "(%s) processing %s failed.\n", 3049 dev->ifname, vhost_message_str[request]); 3050 handled = true; 3051 break; 3052 case RTE_VHOST_MSG_RESULT_OK: 3053 VHOST_LOG_CONFIG(DEBUG, "(%s) processing %s succeeded.\n", 3054 dev->ifname, vhost_message_str[request]); 3055 handled = true; 3056 break; 3057 case RTE_VHOST_MSG_RESULT_REPLY: 3058 VHOST_LOG_CONFIG(DEBUG, "(%s) processing %s succeeded and needs reply.\n", 3059 dev->ifname, vhost_message_str[request]); 3060 send_vhost_reply(dev, fd, &ctx); 3061 handled = true; 3062 break; 3063 default: 3064 break; 3065 } 3066 } 3067 3068 skip_to_post_handle: 3069 if (ret != RTE_VHOST_MSG_RESULT_ERR && 3070 dev->extern_ops.post_msg_handle) { 3071 ret = (*dev->extern_ops.post_msg_handle)(dev->vid, 3072 (void *)&ctx.msg); 3073 switch (ret) { 3074 case RTE_VHOST_MSG_RESULT_REPLY: 3075 send_vhost_reply(dev, fd, &ctx); 3076 /* Fall-through */ 3077 case RTE_VHOST_MSG_RESULT_ERR: 3078 case RTE_VHOST_MSG_RESULT_OK: 3079 handled = true; 3080 case RTE_VHOST_MSG_RESULT_NOT_HANDLED: 3081 default: 3082 break; 3083 } 3084 } 3085 3086 /* If message was not handled at this stage, treat it as an error */ 3087 if (!handled) { 3088 VHOST_LOG_CONFIG(ERR, "(%s) vhost message (req: %d) was not handled.\n", 3089 dev->ifname, request); 3090 close_msg_fds(&ctx); 3091 ret = RTE_VHOST_MSG_RESULT_ERR; 3092 } 3093 3094 /* 3095 * If the request required a reply that was already sent, 3096 * this optional reply-ack won't be sent as the 3097 * VHOST_USER_NEED_REPLY was cleared in send_vhost_reply(). 
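	 * When the flag is still set, the reply-ack payload below follows the
	 * vhost-user convention: u64 == 0 reports success and a non-zero
	 * value reports failure.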
3098 */ 3099 if (ctx.msg.flags & VHOST_USER_NEED_REPLY) { 3100 ctx.msg.payload.u64 = ret == RTE_VHOST_MSG_RESULT_ERR; 3101 ctx.msg.size = sizeof(ctx.msg.payload.u64); 3102 ctx.fd_num = 0; 3103 send_vhost_reply(dev, fd, &ctx); 3104 } else if (ret == RTE_VHOST_MSG_RESULT_ERR) { 3105 VHOST_LOG_CONFIG(ERR, "(%s) vhost message handling failed.\n", dev->ifname); 3106 return -1; 3107 } 3108 3109 for (i = 0; i < dev->nr_vring; i++) { 3110 struct vhost_virtqueue *vq = dev->virtqueue[i]; 3111 bool cur_ready = vq_is_ready(dev, vq); 3112 3113 if (cur_ready != (vq && vq->ready)) { 3114 vq->ready = cur_ready; 3115 vhost_user_notify_queue_state(dev, i, cur_ready); 3116 } 3117 } 3118 3119 if (unlock_required) 3120 vhost_user_unlock_all_queue_pairs(dev); 3121 3122 if (!virtio_is_ready(dev)) 3123 goto out; 3124 3125 /* 3126 * Virtio is now ready. If not done already, it is time 3127 * to notify the application it can process the rings and 3128 * configure the vDPA device if present. 3129 */ 3130 3131 if (!(dev->flags & VIRTIO_DEV_RUNNING)) { 3132 if (dev->notify_ops->new_device(dev->vid) == 0) 3133 dev->flags |= VIRTIO_DEV_RUNNING; 3134 } 3135 3136 vdpa_dev = dev->vdpa_dev; 3137 if (!vdpa_dev) 3138 goto out; 3139 3140 if (!(dev->flags & VIRTIO_DEV_VDPA_CONFIGURED)) { 3141 if (vdpa_dev->ops->dev_conf(dev->vid)) 3142 VHOST_LOG_CONFIG(ERR, "(%s) failed to configure vDPA device\n", 3143 dev->ifname); 3144 else 3145 dev->flags |= VIRTIO_DEV_VDPA_CONFIGURED; 3146 } 3147 3148 out: 3149 return 0; 3150 } 3151 3152 static int process_slave_message_reply(struct virtio_net *dev, 3153 const struct vhu_msg_context *ctx) 3154 { 3155 struct vhu_msg_context msg_reply; 3156 int ret; 3157 3158 if ((ctx->msg.flags & VHOST_USER_NEED_REPLY) == 0) 3159 return 0; 3160 3161 ret = read_vhost_message(dev, dev->slave_req_fd, &msg_reply); 3162 if (ret <= 0) { 3163 if (ret < 0) 3164 VHOST_LOG_CONFIG(ERR, "(%s) vhost read slave message reply failed\n", 3165 dev->ifname); 3166 else 3167 VHOST_LOG_CONFIG(INFO, "(%s) vhost peer closed\n", dev->ifname); 3168 ret = -1; 3169 goto out; 3170 } 3171 3172 ret = 0; 3173 if (msg_reply.msg.request.slave != ctx->msg.request.slave) { 3174 VHOST_LOG_CONFIG(ERR, "(%s) received unexpected msg type (%u), expected %u\n", 3175 dev->ifname, msg_reply.msg.request.slave, ctx->msg.request.slave); 3176 ret = -1; 3177 goto out; 3178 } 3179 3180 ret = msg_reply.msg.payload.u64 ? 
-1 : 0; 3181 3182 out: 3183 rte_spinlock_unlock(&dev->slave_req_lock); 3184 return ret; 3185 } 3186 3187 int 3188 vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm) 3189 { 3190 int ret; 3191 struct vhu_msg_context ctx = { 3192 .msg = { 3193 .request.slave = VHOST_USER_SLAVE_IOTLB_MSG, 3194 .flags = VHOST_USER_VERSION, 3195 .size = sizeof(ctx.msg.payload.iotlb), 3196 .payload.iotlb = { 3197 .iova = iova, 3198 .perm = perm, 3199 .type = VHOST_IOTLB_MISS, 3200 }, 3201 }, 3202 }; 3203 3204 ret = send_vhost_message(dev, dev->slave_req_fd, &ctx); 3205 if (ret < 0) { 3206 VHOST_LOG_CONFIG(ERR, "(%s) failed to send IOTLB miss message (%d)\n", 3207 dev->ifname, ret); 3208 return ret; 3209 } 3210 3211 return 0; 3212 } 3213 3214 static int 3215 vhost_user_slave_config_change(struct virtio_net *dev, bool need_reply) 3216 { 3217 int ret; 3218 struct vhu_msg_context ctx = { 3219 .msg = { 3220 .request.slave = VHOST_USER_SLAVE_CONFIG_CHANGE_MSG, 3221 .flags = VHOST_USER_VERSION, 3222 .size = 0, 3223 } 3224 }; 3225 3226 if (need_reply) 3227 ctx.msg.flags |= VHOST_USER_NEED_REPLY; 3228 3229 ret = send_vhost_slave_message(dev, &ctx); 3230 if (ret < 0) { 3231 VHOST_LOG_CONFIG(ERR, "(%s) failed to send config change (%d)\n", 3232 dev->ifname, ret); 3233 return ret; 3234 } 3235 3236 return process_slave_message_reply(dev, &ctx); 3237 } 3238 3239 int 3240 rte_vhost_slave_config_change(int vid, bool need_reply) 3241 { 3242 struct virtio_net *dev; 3243 3244 dev = get_device(vid); 3245 if (!dev) 3246 return -ENODEV; 3247 3248 return vhost_user_slave_config_change(dev, need_reply); 3249 } 3250 3251 static int vhost_user_slave_set_vring_host_notifier(struct virtio_net *dev, 3252 int index, int fd, 3253 uint64_t offset, 3254 uint64_t size) 3255 { 3256 int ret; 3257 struct vhu_msg_context ctx = { 3258 .msg = { 3259 .request.slave = VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG, 3260 .flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY, 3261 .size = sizeof(ctx.msg.payload.area), 3262 .payload.area = { 3263 .u64 = index & VHOST_USER_VRING_IDX_MASK, 3264 .size = size, 3265 .offset = offset, 3266 }, 3267 }, 3268 }; 3269 3270 if (fd < 0) 3271 ctx.msg.payload.area.u64 |= VHOST_USER_VRING_NOFD_MASK; 3272 else { 3273 ctx.fds[0] = fd; 3274 ctx.fd_num = 1; 3275 } 3276 3277 ret = send_vhost_slave_message(dev, &ctx); 3278 if (ret < 0) { 3279 VHOST_LOG_CONFIG(ERR, "(%s) failed to set host notifier (%d)\n", 3280 dev->ifname, ret); 3281 return ret; 3282 } 3283 3284 return process_slave_message_reply(dev, &ctx); 3285 } 3286 3287 int rte_vhost_host_notifier_ctrl(int vid, uint16_t qid, bool enable) 3288 { 3289 struct virtio_net *dev; 3290 struct rte_vdpa_device *vdpa_dev; 3291 int vfio_device_fd, ret = 0; 3292 uint64_t offset, size; 3293 unsigned int i, q_start, q_last; 3294 3295 dev = get_device(vid); 3296 if (!dev) 3297 return -ENODEV; 3298 3299 vdpa_dev = dev->vdpa_dev; 3300 if (vdpa_dev == NULL) 3301 return -ENODEV; 3302 3303 if (!(dev->features & (1ULL << VIRTIO_F_VERSION_1)) || 3304 !(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES)) || 3305 !(dev->protocol_features & 3306 (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ)) || 3307 !(dev->protocol_features & 3308 (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD)) || 3309 !(dev->protocol_features & 3310 (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER))) 3311 return -ENOTSUP; 3312 3313 if (qid == RTE_VHOST_QUEUE_ALL) { 3314 q_start = 0; 3315 q_last = dev->nr_vring - 1; 3316 } else { 3317 if (qid >= dev->nr_vring) 3318 return -EINVAL; 3319 q_start = qid; 3320 q_last = qid; 
3321 } 3322 3323 RTE_FUNC_PTR_OR_ERR_RET(vdpa_dev->ops->get_vfio_device_fd, -ENOTSUP); 3324 RTE_FUNC_PTR_OR_ERR_RET(vdpa_dev->ops->get_notify_area, -ENOTSUP); 3325 3326 vfio_device_fd = vdpa_dev->ops->get_vfio_device_fd(vid); 3327 if (vfio_device_fd < 0) 3328 return -ENOTSUP; 3329 3330 if (enable) { 3331 for (i = q_start; i <= q_last; i++) { 3332 if (vdpa_dev->ops->get_notify_area(vid, i, &offset, 3333 &size) < 0) { 3334 ret = -ENOTSUP; 3335 goto disable; 3336 } 3337 3338 if (vhost_user_slave_set_vring_host_notifier(dev, i, 3339 vfio_device_fd, offset, size) < 0) { 3340 ret = -EFAULT; 3341 goto disable; 3342 } 3343 } 3344 } else { 3345 disable: 3346 for (i = q_start; i <= q_last; i++) { 3347 vhost_user_slave_set_vring_host_notifier(dev, i, -1, 3348 0, 0); 3349 } 3350 } 3351 3352 return ret; 3353 } 3354
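
/*
 * Illustrative sketch (not compiled): one possible way for a vDPA driver to
 * use rte_vhost_host_notifier_ctrl() from its device-configuration callback
 * to map the hardware notify areas once the datapath is set up. The
 * example_vdpa_dev_conf() name and the surrounding driver structure are
 * hypothetical; only the rte_vhost_host_notifier_ctrl() call itself is part
 * of this library's API (defined above).
 */
#if 0
static int
example_vdpa_dev_conf(int vid)
{
	int ret;

	/* ... hypothetical device-specific datapath setup would go here ... */

	/*
	 * Try to expose the hardware notify areas to the guest; fall back
	 * silently to the software notification relay when the frontend did
	 * not negotiate the required protocol features (-ENOTSUP).
	 */
	ret = rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, true);
	if (ret != 0 && ret != -ENOTSUP)
		return ret;

	return 0;
}
#endif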