/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2018 Intel Corporation
 */

#ifndef _VHOST_NET_CDEV_H_
#define _VHOST_NET_CDEV_H_
#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>
#include <sys/queue.h>
#include <unistd.h>
#include <linux/virtio_net.h>
#include <sys/socket.h>
#include <linux/if.h>
#include <sys/mman.h>

#include <rte_log.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_dmadev.h>

#include "rte_vhost.h"
#include "vdpa_driver.h"

#include "rte_vhost_async.h"

/* Used to indicate that the device is running on a data core */
#define VIRTIO_DEV_RUNNING ((uint32_t)1 << 0)
/* Used to indicate that the device is ready to operate */
#define VIRTIO_DEV_READY ((uint32_t)1 << 1)
/* Used to indicate that the built-in vhost net device backend is enabled */
#define VIRTIO_DEV_BUILTIN_VIRTIO_NET ((uint32_t)1 << 2)
/* Used to indicate that the device has its own data path and is configured */
#define VIRTIO_DEV_VDPA_CONFIGURED ((uint32_t)1 << 3)
/* Used to indicate that the feature negotiation failed */
#define VIRTIO_DEV_FEATURES_FAILED ((uint32_t)1 << 4)
/* Used to indicate that the virtio_net tx code should fill TX ol_flags */
#define VIRTIO_DEV_LEGACY_OL_FLAGS ((uint32_t)1 << 5)
/* Used to indicate the application has requested statistics collection */
#define VIRTIO_DEV_STATS_ENABLED ((uint32_t)1 << 6)
/* Used to indicate the application has requested iommu support */
#define VIRTIO_DEV_SUPPORT_IOMMU ((uint32_t)1 << 7)

/* Backend value set by guest. */
#define VIRTIO_DEV_STOPPED -1

#define BUF_VECTOR_MAX 256

#define VHOST_LOG_CACHE_NR 32

#define MAX_PKT_BURST 32

#define VHOST_MAX_ASYNC_IT (MAX_PKT_BURST)
#define VHOST_MAX_ASYNC_VEC 2048
#define VIRTIO_MAX_RX_PKTLEN 9728U
#define VHOST_DMA_MAX_COPY_COMPLETE ((VIRTIO_MAX_RX_PKTLEN / RTE_MBUF_DEFAULT_DATAROOM) \
		* MAX_PKT_BURST)
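
/*
 * Illustrative note: with the default RTE_MBUF_DEFAULT_DATAROOM of 2048 bytes,
 * VHOST_DMA_MAX_COPY_COMPLETE evaluates to (9728 / 2048) * 32 = 4 * 32 = 128,
 * which bounds how many DMA copy completions the async data path polls for
 * at once.
 */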

#define PACKED_DESC_ENQUEUE_USED_FLAG(w) \
	((w) ? (VRING_DESC_F_AVAIL | VRING_DESC_F_USED | VRING_DESC_F_WRITE) : \
		VRING_DESC_F_WRITE)
#define PACKED_DESC_DEQUEUE_USED_FLAG(w) \
	((w) ? (VRING_DESC_F_AVAIL | VRING_DESC_F_USED) : 0x0)
#define PACKED_DESC_SINGLE_DEQUEUE_FLAG (VRING_DESC_F_NEXT | \
					 VRING_DESC_F_INDIRECT)

#define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \
			    sizeof(struct vring_packed_desc))
#define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)

#ifdef VHOST_GCC_UNROLL_PRAGMA
#define vhost_for_each_try_unroll(iter, val, size) _Pragma("GCC unroll 4") \
	for (iter = val; iter < size; iter++)
#endif

#ifdef VHOST_CLANG_UNROLL_PRAGMA
#define vhost_for_each_try_unroll(iter, val, size) _Pragma("unroll 4") \
	for (iter = val; iter < size; iter++)
#endif

#ifdef VHOST_ICC_UNROLL_PRAGMA
#define vhost_for_each_try_unroll(iter, val, size) _Pragma("unroll (4)") \
	for (iter = val; iter < size; iter++)
#endif

#ifndef vhost_for_each_try_unroll
#define vhost_for_each_try_unroll(iter, val, num) \
	for (iter = val; iter < num; iter++)
#endif

struct virtio_net;
struct vhost_virtqueue;

typedef void (*vhost_iotlb_remove_notify)(uint64_t addr, uint64_t off, uint64_t size);

typedef int (*vhost_iotlb_miss_cb)(struct virtio_net *dev, uint64_t iova, uint8_t perm);

typedef int (*vhost_vring_inject_irq_cb)(struct virtio_net *dev, struct vhost_virtqueue *vq);
/**
 * Structure that contains backend-specific ops.
 */
struct vhost_backend_ops {
	vhost_iotlb_remove_notify iotlb_remove_notify;
	vhost_iotlb_miss_cb iotlb_miss;
	vhost_vring_inject_irq_cb inject_irq;
};

/**
 * Structure that contains the buffer address, length and descriptor index
 * from the vring to do scatter RX.
 */
struct buf_vector {
	uint64_t buf_iova;
	uint64_t buf_addr;
	uint32_t buf_len;
	uint32_t desc_idx;
};

/*
 * Structure that contains the info for each batched memory copy.
 */
struct batch_copy_elem {
	void *dst;
	void *src;
	uint32_t len;
	uint64_t log_addr;
};

/*
 * Structure that contains the info for batched dirty logging.
 */
struct log_cache_entry {
	uint32_t offset;
	unsigned long val;
};

struct vring_used_elem_packed {
	uint16_t id;
	uint16_t flags;
	uint32_t len;
	uint32_t count;
};

/**
 * Virtqueue statistics
 */
struct virtqueue_stats {
	uint64_t packets;
	uint64_t bytes;
	uint64_t multicast;
	uint64_t broadcast;
	/* Size bins in array as RFC 2819, undersized [0], 64 [1], etc */
	uint64_t size_bins[8];
	uint64_t iotlb_hits;
	uint64_t iotlb_misses;
	uint64_t inflight_submitted;
	uint64_t inflight_completed;
	uint64_t mbuf_alloc_failed;
	uint64_t guest_notifications_suppressed;
	/* Counters below are atomic, and should be incremented as such. */
	RTE_ATOMIC(uint64_t) guest_notifications;
	RTE_ATOMIC(uint64_t) guest_notifications_offloaded;
	RTE_ATOMIC(uint64_t) guest_notifications_error;
};
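
/*
 * Illustrative note: the RTE_ATOMIC counters above are expected to be updated
 * with the atomic helpers used elsewhere in this file, e.g.
 *
 *	rte_atomic_fetch_add_explicit(&vq->stats.guest_notifications, 1,
 *		rte_memory_order_relaxed);
 */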

/**
 * iovec
 */
struct vhost_iovec {
	void *src_addr;
	void *dst_addr;
	size_t len;
};

/**
 * iovec iterator
 */
struct vhost_iov_iter {
	/** pointer to the iovec array */
	struct vhost_iovec *iov;
	/** number of iovec in this iterator */
	unsigned long nr_segs;
};

struct async_dma_vchan_info {
	/* circular array to track if packet copy completes */
	bool **pkts_cmpl_flag_addr;

	/* max elements in 'pkts_cmpl_flag_addr' */
	uint16_t ring_size;
	/* ring index mask for 'pkts_cmpl_flag_addr' */
	uint16_t ring_mask;

	/**
	 * DMA virtual channel lock. Although DMA virtual channels can be
	 * bound to data plane threads, the vhost control plane thread may
	 * call data plane functions too, thus causing DMA device contention.
	 *
	 * For example, on VM exit the vhost control plane thread needs to
	 * clear in-flight packets before disabling the vring, while another
	 * data plane thread could be enqueuing packets to the same vring
	 * with the same DMA virtual channel. As dmadev PMD functions are
	 * lock-free, the control plane and data plane threads could operate
	 * on the same DMA virtual channel at the same time.
	 */
	rte_spinlock_t dma_lock;
};

struct async_dma_info {
	struct async_dma_vchan_info *vchans;
	/* number of registered virtual channels */
	uint16_t nr_vchans;
};

extern struct async_dma_info dma_copy_track[RTE_DMADEV_DEFAULT_MAX];

/**
 * inflight async packet information
 */
struct async_inflight_info {
	struct rte_mbuf *mbuf;
	uint16_t descs; /* num of descs inflight */
	uint16_t nr_buffers; /* num of buffers inflight for packed ring */
	struct virtio_net_hdr nethdr;
};

struct vhost_async {
	struct vhost_iov_iter iov_iter[VHOST_MAX_ASYNC_IT];
	struct vhost_iovec iovec[VHOST_MAX_ASYNC_VEC];
	uint16_t iter_idx;
	uint16_t iovec_idx;

	/* data transfer status */
	struct async_inflight_info *pkts_info;
	/**
	 * Packet reorder array. "true" indicates that the DMA device has
	 * completed all copies for the packet.
	 *
	 * Note that this array could be written by multiple threads
	 * simultaneously. For example, when thread0 and thread1 receive
	 * packets from the NIC and enqueue them to vring0 and vring1 with
	 * their own DMA devices DMA0 and DMA1, it is possible for thread0
	 * to get completed copies belonging to vring1 from DMA0 while
	 * thread0 is calling rte_vhost_poll_enqueue_completed() for vring0
	 * and thread1 is calling rte_vhost_submit_enqueue_burst() for
	 * vring1. In this case, vq->access_lock cannot protect
	 * pkts_cmpl_flag of vring1.
	 *
	 * However, since offloading is done on a per-packet basis, each
	 * packet flag will only be written by one thread. And a single
	 * byte write is atomic, so no lock for pkts_cmpl_flag is needed.
	 */
	bool *pkts_cmpl_flag;
	uint16_t pkts_idx;
	uint16_t pkts_inflight_n;
	union {
		struct vring_used_elem *descs_split;
		struct vring_used_elem_packed *buffers_packed;
	};
	union {
		uint16_t desc_idx_split;
		uint16_t buffer_idx_packed;
	};
	union {
		uint16_t last_desc_idx_split;
		uint16_t last_buffer_idx_packed;
	};
};
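
/*
 * Illustrative usage sketch (not part of this header): the async fields above
 * back the enqueue path referenced in the comment, which an application
 * typically drives as a submit/poll pair on the same DMA vChannel, e.g.
 *
 *	n = rte_vhost_submit_enqueue_burst(vid, queue_id, pkts, count,
 *		dma_id, vchan_id);
 *	...
 *	n_done = rte_vhost_poll_enqueue_completed(vid, queue_id, comp_pkts,
 *		count, dma_id, vchan_id);
 *
 * pkts_cmpl_flag[] is then set per packet by whichever thread observes the
 * DMA completion, as described above.
 */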

#define VHOST_RECONNECT_VERSION 0x0
#define VHOST_MAX_VRING 0x100
#define VHOST_MAX_QUEUE_PAIRS 0x80

struct __rte_cache_aligned vhost_reconnect_vring {
	uint16_t last_avail_idx;
	bool avail_wrap_counter;
};

struct vhost_reconnect_data {
	uint32_t version;
	uint64_t features;
	uint8_t status;
	struct virtio_net_config config;
	uint32_t nr_vrings;
	struct vhost_reconnect_vring vring[VHOST_MAX_VRING];
};

/**
 * Structure contains variables relevant to RX/TX virtqueues.
 */
struct __rte_cache_aligned vhost_virtqueue {
	union {
		struct vring_desc *desc;
		struct vring_packed_desc *desc_packed;
	};
	union {
		struct vring_avail *avail;
		struct vring_packed_desc_event *driver_event;
	};
	union {
		struct vring_used *used;
		struct vring_packed_desc_event *device_event;
	};
	uint16_t size;

	uint16_t last_avail_idx;
	uint16_t last_used_idx;
	/* Last used index we notify to front end. */
	uint16_t signalled_used;
	bool signalled_used_valid;
#define VIRTIO_INVALID_EVENTFD		(-1)
#define VIRTIO_UNINITIALIZED_EVENTFD	(-2)

	bool enabled;
	/* Protected by vq->access_lock */
	bool access_ok __rte_guarded_var;
	bool ready;

	rte_rwlock_t access_lock;

	union {
		struct vring_used_elem *shadow_used_split;
		struct vring_used_elem_packed *shadow_used_packed;
	};
	uint16_t shadow_used_idx;
	/* Record packed ring enqueue latest desc cache aligned index */
	uint16_t shadow_aligned_idx;
	/* Record packed ring first dequeue desc index */
	uint16_t shadow_last_used_idx;

	uint16_t batch_copy_nb_elems;
	struct batch_copy_elem *batch_copy_elems;
	int numa_node;
	bool used_wrap_counter;
	bool avail_wrap_counter;

	/* Physical address of used ring, for logging */
	uint16_t log_cache_nb_elem;
	uint64_t log_guest_addr;
	struct log_cache_entry *log_cache;

	rte_rwlock_t iotlb_lock;

	/* Used to notify the guest (trigger interrupt) */
	int callfd;
	/* Currently unused as polling mode is enabled */
	int kickfd;

	/* Index of this vq in dev->virtqueue[] */
	uint32_t index;

	/* inflight shared memory info */
	union {
		struct rte_vhost_inflight_info_split *inflight_split;
		struct rte_vhost_inflight_info_packed *inflight_packed;
	};
	struct rte_vhost_resubmit_info *resubmit_inflight;
	uint64_t global_counter;

	struct vhost_async *async __rte_guarded_var;

	int notif_enable;
#define VIRTIO_UNINITIALIZED_NOTIF	(-1)

	struct vhost_vring_addr ring_addrs;
	struct virtqueue_stats stats;

	RTE_ATOMIC(bool) irq_pending;
	struct vhost_reconnect_vring *reconnect_log;
};

/* Virtio device status as per Virtio specification */
#define VIRTIO_DEVICE_STATUS_RESET		0x00
#define VIRTIO_DEVICE_STATUS_ACK		0x01
#define VIRTIO_DEVICE_STATUS_DRIVER		0x02
#define VIRTIO_DEVICE_STATUS_DRIVER_OK		0x04
#define VIRTIO_DEVICE_STATUS_FEATURES_OK	0x08
#define VIRTIO_DEVICE_STATUS_DEV_NEED_RESET	0x40
#define VIRTIO_DEVICE_STATUS_FAILED		0x80
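
/*
 * Note: per the Virtio specification, a well-behaved driver moves the status
 * through ACK -> DRIVER -> FEATURES_OK -> DRIVER_OK, while DEV_NEED_RESET and
 * FAILED flag error conditions and writing RESET (0) clears the device state.
 */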

/* Declare IOMMU related bits for older kernels */
#ifndef VIRTIO_F_IOMMU_PLATFORM

#define VIRTIO_F_IOMMU_PLATFORM 33

struct vhost_iotlb_msg {
	__u64 iova;
	__u64 size;
	__u64 uaddr;
#define VHOST_ACCESS_RO      0x1
#define VHOST_ACCESS_WO      0x2
#define VHOST_ACCESS_RW      0x3
	__u8 perm;
#define VHOST_IOTLB_MISS           1
#define VHOST_IOTLB_UPDATE         2
#define VHOST_IOTLB_INVALIDATE     3
#define VHOST_IOTLB_ACCESS_FAIL    4
	__u8 type;
};

#define VHOST_IOTLB_MSG 0x1

struct vhost_msg {
	int type;
	union {
		struct vhost_iotlb_msg iotlb;
		__u8 padding[64];
	};
};
#endif

/*
 * Define virtio 1.0 for older kernels
 */
#ifndef VIRTIO_F_VERSION_1
#define VIRTIO_F_VERSION_1 32
#endif

/* Declare packed ring related bits for older kernels */
#ifndef VIRTIO_F_RING_PACKED

#define VIRTIO_F_RING_PACKED 34

struct vring_packed_desc {
	uint64_t addr;
	uint32_t len;
	uint16_t id;
	uint16_t flags;
};

struct vring_packed_desc_event {
	uint16_t off_wrap;
	uint16_t flags;
};
#endif

/*
 * Declare below packed ring defines unconditionally
 * as Kernel header might use different names.
 */
#define VRING_DESC_F_AVAIL	(1ULL << 7)
#define VRING_DESC_F_USED	(1ULL << 15)

#define VRING_EVENT_F_ENABLE 0x0
#define VRING_EVENT_F_DISABLE 0x1
#define VRING_EVENT_F_DESC 0x2

/*
 * Available and used descs are in same order
 */
#ifndef VIRTIO_F_IN_ORDER
#define VIRTIO_F_IN_ORDER 35
#endif

/* Features supported by this builtin vhost-user net driver. */
#define VIRTIO_NET_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \
				(1ULL << VIRTIO_F_ANY_LAYOUT) | \
				(1ULL << VIRTIO_NET_F_CTRL_VQ) | \
				(1ULL << VIRTIO_NET_F_MQ) | \
				(1ULL << VIRTIO_F_VERSION_1) | \
				(1ULL << VIRTIO_NET_F_GSO) | \
				(1ULL << VIRTIO_NET_F_HOST_TSO4) | \
				(1ULL << VIRTIO_NET_F_HOST_TSO6) | \
				(1ULL << VIRTIO_NET_F_HOST_UFO) | \
				(1ULL << VIRTIO_NET_F_HOST_ECN) | \
				(1ULL << VIRTIO_NET_F_CSUM) | \
				(1ULL << VIRTIO_NET_F_GUEST_CSUM) | \
				(1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
				(1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
				(1ULL << VIRTIO_NET_F_GUEST_UFO) | \
				(1ULL << VIRTIO_NET_F_GUEST_ECN) | \
				(1ULL << VIRTIO_RING_F_INDIRECT_DESC) | \
				(1ULL << VIRTIO_RING_F_EVENT_IDX) | \
				(1ULL << VIRTIO_F_IN_ORDER) | \
				(1ULL << VIRTIO_F_IOMMU_PLATFORM))
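
/*
 * Illustrative example: the negotiated feature set is checked against these
 * bits as plain bit masks, e.g.
 *
 *	if (dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF))
 *		... mergeable RX buffers were negotiated ...
 */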

struct guest_page {
	uint64_t guest_phys_addr;
	uint64_t host_iova;
	uint64_t host_user_addr;
	uint64_t size;
};

struct inflight_mem_info {
	int fd;
	void *addr;
	uint64_t size;
};

/**
 * Device structure contains all configuration information relating
 * to the device.
 */
struct __rte_cache_aligned virtio_net {
	/* Frontend (QEMU) memory and memory region information */
	struct rte_vhost_memory *mem;
	uint64_t features;
	uint64_t protocol_features;
	int vid;
	uint32_t flags;
	uint16_t vhost_hlen;
	/* to tell if we need broadcast rarp packet */
	RTE_ATOMIC(int16_t) broadcast_rarp;
	uint32_t nr_vring;
	int async_copy;

	int extbuf;
	int linearbuf;
	struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];

	rte_rwlock_t iotlb_pending_lock;
	struct vhost_iotlb_entry *iotlb_pool;
	TAILQ_HEAD(, vhost_iotlb_entry) iotlb_list;
	TAILQ_HEAD(, vhost_iotlb_entry) iotlb_pending_list;
	int iotlb_cache_nr;
	rte_spinlock_t iotlb_free_lock;
	SLIST_HEAD(, vhost_iotlb_entry) iotlb_free_list;

	struct inflight_mem_info *inflight_info;
#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
	char ifname[IF_NAME_SZ];
	uint64_t log_size;
	uint64_t log_base;
	uint64_t log_addr;
	struct rte_ether_addr mac;
	uint16_t mtu;
	uint8_t status;

	struct rte_vhost_device_ops const *notify_ops;

	uint32_t nr_guest_pages;
	uint32_t max_guest_pages;
	struct guest_page *guest_pages;

	int backend_req_fd;
	rte_spinlock_t backend_req_lock;

	int postcopy_ufd;
	int postcopy_listening;
	int vduse_ctrl_fd;
	int vduse_dev_fd;

	struct vhost_virtqueue *cvq;

	struct rte_vdpa_device *vdpa_dev;

	/* context data for the external message handlers */
	void *extern_data;
	/* pre and post vhost user message handlers for the device */
	struct rte_vhost_user_extern_ops extern_ops;

	struct vhost_backend_ops *backend_ops;

	struct vhost_reconnect_data *reconnect_log;
};

static __rte_always_inline void
vhost_virtqueue_reconnect_log_split(struct vhost_virtqueue *vq)
{
	if (vq->reconnect_log != NULL)
		vq->reconnect_log->last_avail_idx = vq->last_avail_idx;
}

static __rte_always_inline void
vhost_virtqueue_reconnect_log_packed(struct vhost_virtqueue *vq)
{
	if (vq->reconnect_log != NULL) {
		vq->reconnect_log->last_avail_idx = vq->last_avail_idx;
		vq->reconnect_log->avail_wrap_counter = vq->avail_wrap_counter;
	}
}

static inline void
vq_assert_lock__(struct virtio_net *dev, struct vhost_virtqueue *vq, const char *func)
	__rte_assert_exclusive_lock(&vq->access_lock)
{
	if (unlikely(!rte_rwlock_write_is_locked(&vq->access_lock)))
		rte_panic("VHOST_CONFIG: (%s) %s() called without access lock taken.\n",
			dev->ifname, func);
}
#define vq_assert_lock(dev, vq) vq_assert_lock__(dev, vq, __func__)

static __rte_always_inline bool
vq_is_packed(struct virtio_net *dev)
{
	return dev->features & (1ull << VIRTIO_F_RING_PACKED);
}

static inline bool
desc_is_avail(struct vring_packed_desc *desc, bool wrap_counter)
{
	uint16_t flags = rte_atomic_load_explicit((unsigned short __rte_atomic *)&desc->flags,
		rte_memory_order_acquire);

	return wrap_counter == !!(flags & VRING_DESC_F_AVAIL) &&
		wrap_counter != !!(flags & VRING_DESC_F_USED);
}

static inline void
vq_inc_last_used_packed(struct vhost_virtqueue *vq, uint16_t num)
{
	vq->last_used_idx += num;
	if (vq->last_used_idx >= vq->size) {
		vq->used_wrap_counter ^= 1;
		vq->last_used_idx -= vq->size;
	}
}
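
/*
 * Illustrative example of the wrap handling above: with vq->size of 256 and
 * last_used_idx at 254, consuming 4 descriptors wraps the index to 2 and
 * toggles used_wrap_counter, which selects the flag encoding written via
 * PACKED_DESC_ENQUEUE_USED_FLAG()/PACKED_DESC_DEQUEUE_USED_FLAG(). The avail
 * side (vq_inc_last_avail_packed() below) wraps the same way with
 * avail_wrap_counter, which desc_is_avail() checks against.
 */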

static inline void
vq_inc_last_avail_packed(struct vhost_virtqueue *vq, uint16_t num)
{
	vq->last_avail_idx += num;
	if (vq->last_avail_idx >= vq->size) {
		vq->avail_wrap_counter ^= 1;
		vq->last_avail_idx -= vq->size;
	}
	vhost_virtqueue_reconnect_log_packed(vq);
}

void __vhost_log_cache_write(struct virtio_net *dev,
		struct vhost_virtqueue *vq,
		uint64_t addr, uint64_t len);
void __vhost_log_cache_write_iova(struct virtio_net *dev,
		struct vhost_virtqueue *vq,
		uint64_t iova, uint64_t len)
	__rte_shared_locks_required(&vq->iotlb_lock);
void __vhost_log_cache_sync(struct virtio_net *dev,
		struct vhost_virtqueue *vq);

void __vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len);
void __vhost_log_write_iova(struct virtio_net *dev, struct vhost_virtqueue *vq,
		uint64_t iova, uint64_t len)
	__rte_shared_locks_required(&vq->iotlb_lock);

static __rte_always_inline void
vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
{
	if (unlikely(dev->features & (1ULL << VHOST_F_LOG_ALL)))
		__vhost_log_write(dev, addr, len);
}

static __rte_always_inline void
vhost_log_cache_sync(struct virtio_net *dev, struct vhost_virtqueue *vq)
{
	if (unlikely(dev->features & (1ULL << VHOST_F_LOG_ALL)))
		__vhost_log_cache_sync(dev, vq);
}

static __rte_always_inline void
vhost_log_cache_write(struct virtio_net *dev, struct vhost_virtqueue *vq,
		uint64_t addr, uint64_t len)
{
	if (unlikely(dev->features & (1ULL << VHOST_F_LOG_ALL)))
		__vhost_log_cache_write(dev, vq, addr, len);
}

static __rte_always_inline void
vhost_log_cache_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
		uint64_t offset, uint64_t len)
{
	if (unlikely(dev->features & (1ULL << VHOST_F_LOG_ALL))) {
		if (unlikely(vq->log_guest_addr == 0))
			return;
		__vhost_log_cache_write(dev, vq, vq->log_guest_addr + offset,
			len);
	}
}

static __rte_always_inline void
vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
		uint64_t offset, uint64_t len)
{
	if (unlikely(dev->features & (1ULL << VHOST_F_LOG_ALL))) {
		if (unlikely(vq->log_guest_addr == 0))
			return;
		__vhost_log_write(dev, vq->log_guest_addr + offset, len);
	}
}

static __rte_always_inline void
vhost_log_cache_write_iova(struct virtio_net *dev, struct vhost_virtqueue *vq,
		uint64_t iova, uint64_t len)
	__rte_shared_locks_required(&vq->iotlb_lock)
{
	if (likely(!(dev->features & (1ULL << VHOST_F_LOG_ALL))))
		return;

	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
		__vhost_log_cache_write_iova(dev, vq, iova, len);
	else
		__vhost_log_cache_write(dev, vq, iova, len);
}

static __rte_always_inline void
vhost_log_write_iova(struct virtio_net *dev, struct vhost_virtqueue *vq,
		uint64_t iova, uint64_t len)
	__rte_shared_locks_required(&vq->iotlb_lock)
{
	if (likely(!(dev->features & (1ULL << VHOST_F_LOG_ALL))))
		return;

	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
		__vhost_log_write_iova(dev, vq, iova, len);
	else
		__vhost_log_write(dev, iova, len);
}
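
/*
 * Illustrative example: the dirty-page logging helpers above are meant to be
 * called right after the datapath touches guest memory, e.g. after writing a
 * used ring entry:
 *
 *	vhost_log_cache_used_vring(dev, vq,
 *		offsetof(struct vring_used, ring[idx]),
 *		sizeof(struct vring_used_elem));
 *
 * The *_cache variants batch updates in vq->log_cache and are typically
 * flushed with vhost_log_cache_sync() before the used index is made visible
 * to the guest.
 */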

extern int vhost_config_log_level;
#define RTE_LOGTYPE_VHOST_CONFIG vhost_config_log_level
extern int vhost_data_log_level;
#define RTE_LOGTYPE_VHOST_DATA vhost_data_log_level

#define VHOST_CONFIG_LOG(prefix, level, ...) \
	RTE_LOG_LINE_PREFIX(level, VHOST_CONFIG, "(%s) ", prefix, __VA_ARGS__)

#define VHOST_DATA_LOG(prefix, level, ...) \
	RTE_LOG_DP_LINE_PREFIX(level, VHOST_DATA, "(%s) ", prefix, __VA_ARGS__)

#ifdef RTE_LIBRTE_VHOST_DEBUG
#define VHOST_MAX_PRINT_BUFF 6072
#define PRINT_PACKET(device, addr, size, header) do { \
	char *pkt_addr = (char *)(addr); \
	unsigned int index; \
	char packet[VHOST_MAX_PRINT_BUFF]; \
	\
	if ((header)) \
		snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Header size %d: ", (device->vid), (size)); \
	else \
		snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Packet size %d: ", (device->vid), (size)); \
	for (index = 0; index < (size); index++) { \
		snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), \
			"%02hhx ", pkt_addr[index]); \
	} \
	snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), "\n"); \
	\
	RTE_LOG_DP(DEBUG, VHOST_DATA, "(%s) %s", (device)->ifname, packet); \
} while (0)
#else
#define PRINT_PACKET(device, addr, size, header) do {} while (0)
#endif

extern struct virtio_net *vhost_devices[RTE_MAX_VHOST_DEVICE];

#define VHOST_BINARY_SEARCH_THRESH 256

static __rte_always_inline int guest_page_addrcmp(const void *p1,
		const void *p2)
{
	const struct guest_page *page1 = (const struct guest_page *)p1;
	const struct guest_page *page2 = (const struct guest_page *)p2;

	if (page1->guest_phys_addr > page2->guest_phys_addr)
		return 1;
	if (page1->guest_phys_addr < page2->guest_phys_addr)
		return -1;

	return 0;
}

static __rte_always_inline int guest_page_rangecmp(const void *p1, const void *p2)
{
	const struct guest_page *page1 = (const struct guest_page *)p1;
	const struct guest_page *page2 = (const struct guest_page *)p2;

	if (page1->guest_phys_addr >= page2->guest_phys_addr) {
		if (page1->guest_phys_addr < page2->guest_phys_addr + page2->size)
			return 0;
		else
			return 1;
	} else
		return -1;
}

static __rte_always_inline rte_iova_t
gpa_to_first_hpa(struct virtio_net *dev, uint64_t gpa,
	uint64_t gpa_size, uint64_t *hpa_size)
{
	uint32_t i;
	struct guest_page *page;
	struct guest_page key;

	*hpa_size = gpa_size;
	if (dev->nr_guest_pages >= VHOST_BINARY_SEARCH_THRESH) {
		key.guest_phys_addr = gpa;
		page = bsearch(&key, dev->guest_pages, dev->nr_guest_pages,
			       sizeof(struct guest_page), guest_page_rangecmp);
		if (page) {
			if (gpa + gpa_size <=
					page->guest_phys_addr + page->size) {
				return gpa - page->guest_phys_addr +
					page->host_iova;
			} else if (gpa < page->guest_phys_addr +
					page->size) {
				*hpa_size = page->guest_phys_addr +
					page->size - gpa;
				return gpa - page->guest_phys_addr +
					page->host_iova;
			}
		}
	} else {
		for (i = 0; i < dev->nr_guest_pages; i++) {
			page = &dev->guest_pages[i];

			if (gpa >= page->guest_phys_addr) {
				if (gpa + gpa_size <=
					page->guest_phys_addr + page->size) {
					return gpa - page->guest_phys_addr +
						page->host_iova;
				} else if (gpa < page->guest_phys_addr +
						page->size) {
					*hpa_size = page->guest_phys_addr +
						page->size - gpa;
					return gpa - page->guest_phys_addr +
						page->host_iova;
				}
			}
		}
	}

	*hpa_size = 0;
	return 0;
}
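
/*
 * Worked example for gpa_to_first_hpa(): if a guest page maps GPA
 * [0x1000, 0x2000) and the caller asks for gpa = 0x1f00 with gpa_size = 0x200,
 * the function returns the host IOVA of 0x1f00 but reports *hpa_size = 0x100,
 * since only the first 0x100 bytes are physically contiguous. gpa_to_hpa()
 * below treats such partial coverage as a failure and returns 0.
 */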

/* Convert guest physical address to host physical address */
static __rte_always_inline rte_iova_t
gpa_to_hpa(struct virtio_net *dev, uint64_t gpa, uint64_t size)
{
	rte_iova_t hpa;
	uint64_t hpa_size;

	hpa = gpa_to_first_hpa(dev, gpa, size, &hpa_size);
	return hpa_size == size ? hpa : 0;
}

static __rte_always_inline uint64_t
hva_to_gpa(struct virtio_net *dev, uint64_t vva, uint64_t len)
{
	struct rte_vhost_mem_region *r;
	uint32_t i;

	if (unlikely(!dev || !dev->mem))
		return 0;

	for (i = 0; i < dev->mem->nregions; i++) {
		r = &dev->mem->regions[i];

		if (vva >= r->host_user_addr &&
		    vva + len < r->host_user_addr + r->size) {
			return r->guest_phys_addr + vva - r->host_user_addr;
		}
	}
	return 0;
}

static __rte_always_inline struct virtio_net *
get_device(int vid)
{
	struct virtio_net *dev = NULL;

	if (likely(vid >= 0 && vid < RTE_MAX_VHOST_DEVICE))
		dev = vhost_devices[vid];

	if (unlikely(!dev)) {
		VHOST_CONFIG_LOG("device", ERR, "(%d) device not found.", vid);
	}

	return dev;
}
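
/*
 * Illustrative note: API entry points taking a "vid" are expected to resolve
 * it first and bail out on failure, e.g.
 *
 *	struct virtio_net *dev = get_device(vid);
 *
 *	if (dev == NULL)
 *		return -1;
 */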

int vhost_new_device(struct vhost_backend_ops *ops);
void cleanup_device(struct virtio_net *dev, int destroy);
void reset_device(struct virtio_net *dev);
void vhost_destroy_device(int);
void vhost_destroy_device_notify(struct virtio_net *dev);

void cleanup_vq(struct vhost_virtqueue *vq, int destroy);
void cleanup_vq_inflight(struct virtio_net *dev, struct vhost_virtqueue *vq);
void free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq);

int alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx);

void vhost_attach_vdpa_device(int vid, struct rte_vdpa_device *dev);

void vhost_set_ifname(int, const char *if_name, unsigned int if_len);
void vhost_setup_virtio_net(int vid, bool enable, bool legacy_ol_flags, bool stats_enabled,
	bool support_iommu);
void vhost_enable_extbuf(int vid);
void vhost_enable_linearbuf(int vid);
int vhost_enable_guest_notification(struct virtio_net *dev,
		struct vhost_virtqueue *vq, int enable);

struct rte_vhost_device_ops const *vhost_driver_callback_get(const char *path);

/*
 * Backend-specific cleanup.
 *
 * TODO: fix it; we have one backend now
 */
void vhost_backend_cleanup(struct virtio_net *dev);

uint64_t __vhost_iova_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq,
		uint64_t iova, uint64_t *len, uint8_t perm)
	__rte_shared_locks_required(&vq->iotlb_lock);
void *vhost_alloc_copy_ind_table(struct virtio_net *dev,
		struct vhost_virtqueue *vq,
		uint64_t desc_addr, uint64_t desc_len)
	__rte_shared_locks_required(&vq->iotlb_lock);
int vring_translate(struct virtio_net *dev, struct vhost_virtqueue *vq)
	__rte_exclusive_locks_required(&vq->access_lock)
	__rte_shared_locks_required(&vq->iotlb_lock);
uint64_t translate_log_addr(struct virtio_net *dev, struct vhost_virtqueue *vq,
		uint64_t log_addr)
	__rte_shared_locks_required(&vq->iotlb_lock);
void vring_invalidate(struct virtio_net *dev, struct vhost_virtqueue *vq)
	__rte_exclusive_locks_required(&vq->access_lock);

static __rte_always_inline uint64_t
vhost_iova_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq,
		uint64_t iova, uint64_t *len, uint8_t perm)
	__rte_shared_locks_required(&vq->iotlb_lock)
{
	if (!(dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)))
		return rte_vhost_va_from_guest_pa(dev->mem, iova, len);

	return __vhost_iova_to_vva(dev, vq, iova, len, perm);
}

#define vhost_avail_event(vr) \
	(*(volatile uint16_t*)&(vr)->used->ring[(vr)->size])
#define vhost_used_event(vr) \
	(*(volatile uint16_t*)&(vr)->avail->ring[(vr)->size])

/*
 * The following is used with VIRTIO_RING_F_EVENT_IDX.
 * Assuming a given event_idx value from the other side, if we have
 * just incremented index from old to new_idx, should we trigger an
 * event?
 */
static __rte_always_inline int
vhost_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
{
	return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
}
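
/*
 * Worked example: with old = 10, new_idx = 15 and event_idx = 12,
 * (uint16_t)(new_idx - event_idx - 1) = 2 and (uint16_t)(new_idx - old) = 5,
 * so 2 < 5 and an event is needed (the update crossed event_idx). With
 * event_idx = 20 the left-hand side wraps to 65530 and no event is needed.
 * The unsigned 16-bit arithmetic keeps the comparison correct across index
 * wrap-around.
 */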

static __rte_always_inline void
vhost_vring_inject_irq(struct virtio_net *dev, struct vhost_virtqueue *vq)
{
	bool expected = false;

	if (dev->notify_ops->guest_notify) {
		if (rte_atomic_compare_exchange_strong_explicit(&vq->irq_pending, &expected, true,
				rte_memory_order_release, rte_memory_order_relaxed)) {
			if (dev->notify_ops->guest_notify(dev->vid, vq->index)) {
				if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
					rte_atomic_fetch_add_explicit(
						&vq->stats.guest_notifications_offloaded,
						1, rte_memory_order_relaxed);
				return;
			}

			/* Offloading failed, fallback to direct IRQ injection */
			rte_atomic_store_explicit(&vq->irq_pending, false,
				rte_memory_order_release);
		} else {
			vq->stats.guest_notifications_suppressed++;
			return;
		}
	}

	if (dev->backend_ops->inject_irq(dev, vq)) {
		if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
			rte_atomic_fetch_add_explicit(&vq->stats.guest_notifications_error,
				1, rte_memory_order_relaxed);
		return;
	}

	if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
		rte_atomic_fetch_add_explicit(&vq->stats.guest_notifications,
			1, rte_memory_order_relaxed);
	if (dev->notify_ops->guest_notified)
		dev->notify_ops->guest_notified(dev->vid);
}

static __rte_always_inline void
vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
{
	/* Flush used->idx update before we read avail->flags. */
	rte_atomic_thread_fence(rte_memory_order_seq_cst);

	/* Don't kick guest if we don't reach index specified by guest. */
	if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) {
		uint16_t old = vq->signalled_used;
		uint16_t new = vq->last_used_idx;
		bool signalled_used_valid = vq->signalled_used_valid;

		vq->signalled_used = new;
		vq->signalled_used_valid = true;

		VHOST_DATA_LOG(dev->ifname, DEBUG,
			"%s: used_event_idx=%d, old=%d, new=%d",
			__func__, vhost_used_event(vq), old, new);

		if (vhost_need_event(vhost_used_event(vq), new, old) ||
				unlikely(!signalled_used_valid))
			vhost_vring_inject_irq(dev, vq);
	} else {
		/* Kick the guest if necessary. */
		if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
			vhost_vring_inject_irq(dev, vq);
	}
}

static __rte_always_inline void
vhost_vring_call_packed(struct virtio_net *dev, struct vhost_virtqueue *vq)
{
	uint16_t old, new, off, off_wrap;
	bool signalled_used_valid, kick = false;

	/* Flush used desc update. */
	rte_atomic_thread_fence(rte_memory_order_seq_cst);

	if (!(dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX))) {
		if (vq->driver_event->flags !=
				VRING_EVENT_F_DISABLE)
			kick = true;
		goto kick;
	}

	old = vq->signalled_used;
	new = vq->last_used_idx;
	vq->signalled_used = new;
	signalled_used_valid = vq->signalled_used_valid;
	vq->signalled_used_valid = true;

	if (vq->driver_event->flags != VRING_EVENT_F_DESC) {
		if (vq->driver_event->flags != VRING_EVENT_F_DISABLE)
			kick = true;
		goto kick;
	}

	if (unlikely(!signalled_used_valid)) {
		kick = true;
		goto kick;
	}

	rte_atomic_thread_fence(rte_memory_order_acquire);

	off_wrap = vq->driver_event->off_wrap;
	off = off_wrap & ~(1 << 15);

	if (new <= old)
		old -= vq->size;

	if (vq->used_wrap_counter != off_wrap >> 15)
		off -= vq->size;

	if (vhost_need_event(off, new, old))
		kick = true;
kick:
	if (kick)
		vhost_vring_inject_irq(dev, vq);
}

static __rte_always_inline void
free_ind_table(void *idesc)
{
	rte_free(idesc);
}

static __rte_always_inline void
restore_mbuf(struct rte_mbuf *m)
{
	uint32_t mbuf_size, priv_size;

	while (m) {
		priv_size = rte_pktmbuf_priv_size(m->pool);
		mbuf_size = sizeof(struct rte_mbuf) + priv_size;
		/* start of buffer is after mbuf structure and priv data */

		m->buf_addr = (char *)m + mbuf_size;
		rte_mbuf_iova_set(m, rte_mempool_virt2iova(m) + mbuf_size);
		m = m->next;
	}
}

static __rte_always_inline bool
mbuf_is_consumed(struct rte_mbuf *m)
{
	while (m) {
		if (rte_mbuf_refcnt_read(m) > 1)
			return false;
		m = m->next;
	}

	return true;
}

void mem_set_dump(struct virtio_net *dev, void *ptr, size_t size, bool enable, uint64_t alignment);

#endif /* _VHOST_NET_CDEV_H_ */