1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. 5 * All rights reserved. 6 * Copyright (c) 2021 Mellanox Technologies LTD. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * * Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * * Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * * Neither the name of Intel Corporation nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 25 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 26 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 */ 34 35 #include "spdk/stdinc.h" 36 37 #include "spdk/env.h" 38 #include "spdk/likely.h" 39 #include "spdk/string.h" 40 #include "spdk/util.h" 41 #include "spdk/memory.h" 42 #include "spdk/barrier.h" 43 #include "spdk/vhost.h" 44 #include "vhost_internal.h" 45 #include <rte_version.h> 46 47 #include "spdk_internal/vhost_user.h" 48 49 /* Path to folder where character device will be created. Can be set by user. */ 50 static char g_vhost_user_dev_dirname[PATH_MAX] = ""; 51 52 static struct spdk_thread *g_vhost_user_init_thread; 53 54 /** 55 * DPDK calls our callbacks synchronously but the work those callbacks 56 * perform needs to be async. Luckily, all DPDK callbacks are called on 57 * a DPDK-internal pthread, so we'll just wait on a semaphore in there. 58 */ 59 static sem_t g_dpdk_sem; 60 61 /** Return code for the current DPDK callback */ 62 static int g_dpdk_response; 63 64 struct vhost_session_fn_ctx { 65 /** Device pointer obtained before enqueueing the event */ 66 struct spdk_vhost_dev *vdev; 67 68 /** ID of the session to send event to. */ 69 uint32_t vsession_id; 70 71 /** User provided function to be executed on session's thread. */ 72 spdk_vhost_session_fn cb_fn; 73 74 /** 75 * User provided function to be called on the init thread 76 * after iterating through all sessions. 77 */ 78 spdk_vhost_dev_fn cpl_fn; 79 80 /** Custom user context */ 81 void *user_ctx; 82 }; 83 84 static void __attribute__((constructor)) 85 _vhost_user_sem_init(void) 86 { 87 if (sem_init(&g_dpdk_sem, 0, 0) != 0) { 88 SPDK_ERRLOG("Failed to initialize semaphore for rte_vhost pthread.\n"); 89 abort(); 90 } 91 } 92 93 static void __attribute__((destructor)) 94 _vhost_user_sem_destroy(void) 95 { 96 sem_destroy(&g_dpdk_sem); 97 } 98 99 static inline void 100 vhost_session_mem_region_calc(uint64_t *previous_start, uint64_t *start, uint64_t *end, 101 uint64_t *len, struct rte_vhost_mem_region *region) 102 { 103 *start = FLOOR_2MB(region->mmap_addr); 104 *end = CEIL_2MB(region->mmap_addr + region->mmap_size); 105 if (*start == *previous_start) { 106 *start += (size_t) VALUE_2MB; 107 } 108 *previous_start = *start; 109 *len = *end - *start; 110 } 111 112 void 113 vhost_session_mem_register(struct rte_vhost_memory *mem) 114 { 115 uint64_t start, end, len; 116 uint32_t i; 117 uint64_t previous_start = UINT64_MAX; 118 119 120 for (i = 0; i < mem->nregions; i++) { 121 vhost_session_mem_region_calc(&previous_start, &start, &end, &len, &mem->regions[i]); 122 SPDK_INFOLOG(vhost, "Registering VM memory for vtophys translation - 0x%jx len:0x%jx\n", 123 start, len); 124 125 if (spdk_mem_register((void *)start, len) != 0) { 126 SPDK_WARNLOG("Failed to register memory region %"PRIu32". Future vtophys translation might fail.\n", 127 i); 128 continue; 129 } 130 } 131 } 132 133 void 134 vhost_session_mem_unregister(struct rte_vhost_memory *mem) 135 { 136 uint64_t start, end, len; 137 uint32_t i; 138 uint64_t previous_start = UINT64_MAX; 139 140 for (i = 0; i < mem->nregions; i++) { 141 vhost_session_mem_region_calc(&previous_start, &start, &end, &len, &mem->regions[i]); 142 if (spdk_vtophys((void *) start, NULL) == SPDK_VTOPHYS_ERROR) { 143 continue; /* region has not been registered */ 144 } 145 146 if (spdk_mem_unregister((void *)start, len) != 0) { 147 assert(false); 148 } 149 } 150 } 151 152 static int 153 _stop_session(struct spdk_vhost_session *vsession) 154 { 155 struct spdk_vhost_dev *vdev = vsession->vdev; 156 struct spdk_vhost_virtqueue *q; 157 int rc; 158 uint16_t i; 159 160 rc = vdev->backend->stop_session(vsession); 161 if (rc != 0) { 162 SPDK_ERRLOG("Couldn't stop device with vid %d.\n", vsession->vid); 163 return rc; 164 } 165 166 for (i = 0; i < vsession->max_queues; i++) { 167 q = &vsession->virtqueue[i]; 168 169 /* vring.desc and vring.desc_packed are in a union struct 170 * so q->vring.desc can replace q->vring.desc_packed. 171 */ 172 if (q->vring.desc == NULL) { 173 continue; 174 } 175 176 /* Packed virtqueues support up to 2^15 entries each 177 * so left one bit can be used as wrap counter. 178 */ 179 if (q->packed.packed_ring) { 180 q->last_avail_idx = q->last_avail_idx | 181 ((uint16_t)q->packed.avail_phase << 15); 182 q->last_used_idx = q->last_used_idx | 183 ((uint16_t)q->packed.used_phase << 15); 184 } 185 186 rte_vhost_set_vring_base(vsession->vid, i, q->last_avail_idx, q->last_used_idx); 187 } 188 189 vhost_session_mem_unregister(vsession->mem); 190 free(vsession->mem); 191 192 return 0; 193 } 194 195 static int 196 new_connection(int vid) 197 { 198 struct spdk_vhost_dev *vdev; 199 struct spdk_vhost_session *vsession; 200 size_t dev_dirname_len; 201 char ifname[PATH_MAX]; 202 char *ctrlr_name; 203 204 if (rte_vhost_get_ifname(vid, ifname, PATH_MAX) < 0) { 205 SPDK_ERRLOG("Couldn't get a valid ifname for device with vid %d\n", vid); 206 return -1; 207 } 208 209 spdk_vhost_lock(); 210 211 ctrlr_name = &ifname[0]; 212 dev_dirname_len = strlen(g_vhost_user_dev_dirname); 213 if (strncmp(ctrlr_name, g_vhost_user_dev_dirname, dev_dirname_len) == 0) { 214 ctrlr_name += dev_dirname_len; 215 } 216 217 vdev = spdk_vhost_dev_find(ctrlr_name); 218 if (vdev == NULL) { 219 SPDK_ERRLOG("Couldn't find device with vid %d to create connection for.\n", vid); 220 spdk_vhost_unlock(); 221 return -1; 222 } 223 224 /* We expect sessions inside vdev->vsessions to be sorted in ascending 225 * order in regard of vsession->id. For now we always set id = vsessions_cnt++ 226 * and append each session to the very end of the vsessions list. 227 * This is required for vhost_user_dev_foreach_session() to work. 228 */ 229 if (vdev->vsessions_num == UINT_MAX) { 230 assert(false); 231 return -EINVAL; 232 } 233 234 if (posix_memalign((void **)&vsession, SPDK_CACHE_LINE_SIZE, sizeof(*vsession) + 235 vdev->backend->session_ctx_size)) { 236 SPDK_ERRLOG("vsession alloc failed\n"); 237 spdk_vhost_unlock(); 238 return -1; 239 } 240 memset(vsession, 0, sizeof(*vsession) + vdev->backend->session_ctx_size); 241 242 vsession->vdev = vdev; 243 vsession->vid = vid; 244 vsession->id = vdev->vsessions_num++; 245 vsession->name = spdk_sprintf_alloc("%ss%u", vdev->name, vsession->vid); 246 if (vsession->name == NULL) { 247 SPDK_ERRLOG("vsession alloc failed\n"); 248 spdk_vhost_unlock(); 249 free(vsession); 250 return -1; 251 } 252 vsession->started = false; 253 vsession->initialized = false; 254 vsession->next_stats_check_time = 0; 255 vsession->stats_check_interval = SPDK_VHOST_STATS_CHECK_INTERVAL_MS * 256 spdk_get_ticks_hz() / 1000UL; 257 TAILQ_INSERT_TAIL(&vdev->vsessions, vsession, tailq); 258 259 vhost_session_install_rte_compat_hooks(vsession); 260 spdk_vhost_unlock(); 261 return 0; 262 } 263 264 static int 265 start_device(int vid) 266 { 267 struct spdk_vhost_dev *vdev; 268 struct spdk_vhost_session *vsession; 269 int rc = -1; 270 uint16_t i; 271 bool packed_ring; 272 273 spdk_vhost_lock(); 274 275 vsession = vhost_session_find_by_vid(vid); 276 if (vsession == NULL) { 277 SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid); 278 goto out; 279 } 280 281 vdev = vsession->vdev; 282 if (vsession->started) { 283 /* already started, nothing to do */ 284 rc = 0; 285 goto out; 286 } 287 288 if (vhost_get_negotiated_features(vid, &vsession->negotiated_features) != 0) { 289 SPDK_ERRLOG("vhost device %d: Failed to get negotiated driver features\n", vid); 290 goto out; 291 } 292 293 packed_ring = ((vsession->negotiated_features & (1ULL << VIRTIO_F_RING_PACKED)) != 0); 294 295 vsession->max_queues = 0; 296 memset(vsession->virtqueue, 0, sizeof(vsession->virtqueue)); 297 for (i = 0; i < SPDK_VHOST_MAX_VQUEUES; i++) { 298 struct spdk_vhost_virtqueue *q = &vsession->virtqueue[i]; 299 300 q->vsession = vsession; 301 q->vring_idx = -1; 302 if (rte_vhost_get_vhost_vring(vid, i, &q->vring)) { 303 continue; 304 } 305 q->vring_idx = i; 306 rte_vhost_get_vhost_ring_inflight(vid, i, &q->vring_inflight); 307 308 /* vring.desc and vring.desc_packed are in a union struct 309 * so q->vring.desc can replace q->vring.desc_packed. 310 */ 311 if (q->vring.desc == NULL || q->vring.size == 0) { 312 continue; 313 } 314 315 if (rte_vhost_get_vring_base(vsession->vid, i, &q->last_avail_idx, &q->last_used_idx)) { 316 q->vring.desc = NULL; 317 continue; 318 } 319 320 if (packed_ring) { 321 /* Use the inflight mem to restore the last_avail_idx and last_used_idx. 322 * When the vring format is packed, there is no used_idx in the 323 * used ring, so VM can't resend the used_idx to VHOST when reconnect. 324 * QEMU version 5.2.0 supports the packed inflight before that it only 325 * supports split ring inflight because it doesn't send negotiated features 326 * before get inflight fd. Users can use RPC to enable this function. 327 */ 328 if (spdk_unlikely(g_packed_ring_recovery)) { 329 rte_vhost_get_vring_base_from_inflight(vsession->vid, i, 330 &q->last_avail_idx, 331 &q->last_used_idx); 332 } 333 334 /* Packed virtqueues support up to 2^15 entries each 335 * so left one bit can be used as wrap counter. 336 */ 337 q->packed.avail_phase = q->last_avail_idx >> 15; 338 q->last_avail_idx = q->last_avail_idx & 0x7FFF; 339 q->packed.used_phase = q->last_used_idx >> 15; 340 q->last_used_idx = q->last_used_idx & 0x7FFF; 341 342 if (!vsession->interrupt_mode) { 343 /* Disable I/O submission notifications, we'll be polling. */ 344 q->vring.device_event->flags = VRING_PACKED_EVENT_FLAG_DISABLE; 345 } 346 } else { 347 if (!vsession->interrupt_mode) { 348 /* Disable I/O submission notifications, we'll be polling. */ 349 q->vring.used->flags = VRING_USED_F_NO_NOTIFY; 350 } 351 } 352 353 q->packed.packed_ring = packed_ring; 354 vsession->max_queues = i + 1; 355 } 356 357 if (vhost_get_mem_table(vid, &vsession->mem) != 0) { 358 SPDK_ERRLOG("vhost device %d: Failed to get guest memory table\n", vid); 359 goto out; 360 } 361 362 /* 363 * Not sure right now but this look like some kind of QEMU bug and guest IO 364 * might be frozed without kicking all queues after live-migration. This look like 365 * the previous vhost instance failed to effectively deliver all interrupts before 366 * the GET_VRING_BASE message. This shouldn't harm guest since spurious interrupts 367 * should be ignored by guest virtio driver. 368 * 369 * Tested on QEMU 2.10.91 and 2.11.50. 370 */ 371 for (i = 0; i < vsession->max_queues; i++) { 372 struct spdk_vhost_virtqueue *q = &vsession->virtqueue[i]; 373 374 /* vring.desc and vring.desc_packed are in a union struct 375 * so q->vring.desc can replace q->vring.desc_packed. 376 */ 377 if (q->vring.desc != NULL && q->vring.size > 0) { 378 rte_vhost_vring_call(vsession->vid, q->vring_idx); 379 } 380 } 381 382 vhost_user_session_set_coalescing(vdev, vsession, NULL); 383 vhost_session_mem_register(vsession->mem); 384 vsession->initialized = true; 385 rc = vdev->backend->start_session(vsession); 386 if (rc != 0) { 387 vhost_session_mem_unregister(vsession->mem); 388 free(vsession->mem); 389 goto out; 390 } 391 392 out: 393 spdk_vhost_unlock(); 394 return rc; 395 } 396 397 static void 398 stop_device(int vid) 399 { 400 struct spdk_vhost_session *vsession; 401 402 spdk_vhost_lock(); 403 vsession = vhost_session_find_by_vid(vid); 404 if (vsession == NULL) { 405 SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid); 406 spdk_vhost_unlock(); 407 return; 408 } 409 410 if (!vsession->started) { 411 /* already stopped, nothing to do */ 412 spdk_vhost_unlock(); 413 return; 414 } 415 416 _stop_session(vsession); 417 spdk_vhost_unlock(); 418 419 return; 420 } 421 422 static void 423 destroy_connection(int vid) 424 { 425 struct spdk_vhost_session *vsession; 426 427 spdk_vhost_lock(); 428 vsession = vhost_session_find_by_vid(vid); 429 if (vsession == NULL) { 430 SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid); 431 spdk_vhost_unlock(); 432 return; 433 } 434 435 if (vsession->started) { 436 if (_stop_session(vsession) != 0) { 437 spdk_vhost_unlock(); 438 return; 439 } 440 } 441 442 TAILQ_REMOVE(&vsession->vdev->vsessions, vsession, tailq); 443 free(vsession->name); 444 free(vsession); 445 spdk_vhost_unlock(); 446 } 447 448 #if RTE_VERSION >= RTE_VERSION_NUM(21, 11, 0, 0) 449 static const struct rte_vhost_device_ops g_spdk_vhost_ops = { 450 #else 451 static const struct vhost_device_ops g_spdk_vhost_ops = { 452 #endif 453 .new_device = start_device, 454 .destroy_device = stop_device, 455 .new_connection = new_connection, 456 .destroy_connection = destroy_connection, 457 }; 458 459 static struct spdk_vhost_session * 460 vhost_session_find_by_id(struct spdk_vhost_dev *vdev, unsigned id) 461 { 462 struct spdk_vhost_session *vsession; 463 464 TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) { 465 if (vsession->id == id) { 466 return vsession; 467 } 468 } 469 470 return NULL; 471 } 472 473 struct spdk_vhost_session * 474 vhost_session_find_by_vid(int vid) 475 { 476 struct spdk_vhost_dev *vdev; 477 struct spdk_vhost_session *vsession; 478 479 for (vdev = spdk_vhost_dev_next(NULL); vdev != NULL; 480 vdev = spdk_vhost_dev_next(vdev)) { 481 TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) { 482 if (vsession->vid == vid) { 483 return vsession; 484 } 485 } 486 } 487 488 return NULL; 489 } 490 491 static void 492 wait_for_semaphore(int timeout_sec, const char *errmsg) 493 { 494 struct timespec timeout; 495 int rc; 496 497 clock_gettime(CLOCK_REALTIME, &timeout); 498 timeout.tv_sec += timeout_sec; 499 rc = sem_timedwait(&g_dpdk_sem, &timeout); 500 if (rc != 0) { 501 SPDK_ERRLOG("Timeout waiting for event: %s.\n", errmsg); 502 sem_wait(&g_dpdk_sem); 503 } 504 } 505 506 static void 507 vhost_session_cb_done(int rc) 508 { 509 g_dpdk_response = rc; 510 sem_post(&g_dpdk_sem); 511 } 512 513 void 514 vhost_user_session_start_done(struct spdk_vhost_session *vsession, int response) 515 { 516 if (response == 0) { 517 vsession->started = true; 518 519 assert(vsession->vdev->active_session_num < UINT32_MAX); 520 vsession->vdev->active_session_num++; 521 } 522 523 vhost_session_cb_done(response); 524 } 525 526 void 527 vhost_user_session_stop_done(struct spdk_vhost_session *vsession, int response) 528 { 529 if (response == 0) { 530 vsession->started = false; 531 532 assert(vsession->vdev->active_session_num > 0); 533 vsession->vdev->active_session_num--; 534 } 535 536 vhost_session_cb_done(response); 537 } 538 539 static void 540 vhost_event_cb(void *arg1) 541 { 542 struct vhost_session_fn_ctx *ctx = arg1; 543 struct spdk_vhost_session *vsession; 544 545 if (spdk_vhost_trylock() != 0) { 546 spdk_thread_send_msg(spdk_get_thread(), vhost_event_cb, arg1); 547 return; 548 } 549 550 vsession = vhost_session_find_by_id(ctx->vdev, ctx->vsession_id); 551 ctx->cb_fn(ctx->vdev, vsession, NULL); 552 spdk_vhost_unlock(); 553 } 554 555 int 556 vhost_user_session_send_event(struct spdk_vhost_session *vsession, 557 spdk_vhost_session_fn cb_fn, unsigned timeout_sec, 558 const char *errmsg) 559 { 560 struct vhost_session_fn_ctx ev_ctx = {0}; 561 struct spdk_vhost_dev *vdev = vsession->vdev; 562 563 ev_ctx.vdev = vdev; 564 ev_ctx.vsession_id = vsession->id; 565 ev_ctx.cb_fn = cb_fn; 566 567 spdk_thread_send_msg(vdev->thread, vhost_event_cb, &ev_ctx); 568 569 spdk_vhost_unlock(); 570 wait_for_semaphore(timeout_sec, errmsg); 571 spdk_vhost_lock(); 572 573 return g_dpdk_response; 574 } 575 576 static void 577 foreach_session_finish_cb(void *arg1) 578 { 579 struct vhost_session_fn_ctx *ev_ctx = arg1; 580 struct spdk_vhost_dev *vdev = ev_ctx->vdev; 581 582 if (spdk_vhost_trylock() != 0) { 583 spdk_thread_send_msg(spdk_get_thread(), 584 foreach_session_finish_cb, arg1); 585 return; 586 } 587 588 assert(vdev->pending_async_op_num > 0); 589 vdev->pending_async_op_num--; 590 if (ev_ctx->cpl_fn != NULL) { 591 ev_ctx->cpl_fn(vdev, ev_ctx->user_ctx); 592 } 593 594 spdk_vhost_unlock(); 595 free(ev_ctx); 596 } 597 598 static void 599 foreach_session(void *arg1) 600 { 601 struct vhost_session_fn_ctx *ev_ctx = arg1; 602 struct spdk_vhost_session *vsession; 603 struct spdk_vhost_dev *vdev = ev_ctx->vdev; 604 int rc; 605 606 if (spdk_vhost_trylock() != 0) { 607 spdk_thread_send_msg(spdk_get_thread(), foreach_session, arg1); 608 return; 609 } 610 611 TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) { 612 if (vsession->initialized) { 613 rc = ev_ctx->cb_fn(vdev, vsession, ev_ctx->user_ctx); 614 if (rc < 0) { 615 goto out; 616 } 617 } 618 } 619 620 out: 621 spdk_vhost_unlock(); 622 623 spdk_thread_send_msg(g_vhost_user_init_thread, foreach_session_finish_cb, arg1); 624 } 625 626 void 627 vhost_user_dev_foreach_session(struct spdk_vhost_dev *vdev, 628 spdk_vhost_session_fn fn, 629 spdk_vhost_dev_fn cpl_fn, 630 void *arg) 631 { 632 struct vhost_session_fn_ctx *ev_ctx; 633 634 ev_ctx = calloc(1, sizeof(*ev_ctx)); 635 if (ev_ctx == NULL) { 636 SPDK_ERRLOG("Failed to alloc vhost event.\n"); 637 assert(false); 638 return; 639 } 640 641 ev_ctx->vdev = vdev; 642 ev_ctx->cb_fn = fn; 643 ev_ctx->cpl_fn = cpl_fn; 644 ev_ctx->user_ctx = arg; 645 646 assert(vdev->pending_async_op_num < UINT32_MAX); 647 vdev->pending_async_op_num++; 648 649 spdk_thread_send_msg(vdev->thread, foreach_session, ev_ctx); 650 } 651 652 void 653 vhost_user_session_set_interrupt_mode(struct spdk_vhost_session *vsession, bool interrupt_mode) 654 { 655 uint16_t i; 656 bool packed_ring; 657 int rc = 0; 658 659 packed_ring = ((vsession->negotiated_features & (1ULL << VIRTIO_F_RING_PACKED)) != 0); 660 661 for (i = 0; i < vsession->max_queues; i++) { 662 struct spdk_vhost_virtqueue *q = &vsession->virtqueue[i]; 663 uint64_t num_events = 1; 664 665 /* vring.desc and vring.desc_packed are in a union struct 666 * so q->vring.desc can replace q->vring.desc_packed. 667 */ 668 if (q->vring.desc == NULL || q->vring.size == 0) { 669 continue; 670 } 671 672 if (interrupt_mode) { 673 /* Enable I/O submission notifications, we'll be interrupting. */ 674 if (packed_ring) { 675 * (volatile uint16_t *) &q->vring.device_event->flags = VRING_PACKED_EVENT_FLAG_ENABLE; 676 } else { 677 * (volatile uint16_t *) &q->vring.used->flags = 0; 678 } 679 680 /* In case of race condition, always kick vring when switch to intr */ 681 rc = write(q->vring.kickfd, &num_events, sizeof(num_events)); 682 if (rc < 0) { 683 SPDK_ERRLOG("failed to kick vring: %s.\n", spdk_strerror(errno)); 684 } 685 686 vsession->interrupt_mode = true; 687 } else { 688 /* Disable I/O submission notifications, we'll be polling. */ 689 if (packed_ring) { 690 * (volatile uint16_t *) &q->vring.device_event->flags = VRING_PACKED_EVENT_FLAG_DISABLE; 691 } else { 692 * (volatile uint16_t *) &q->vring.used->flags = VRING_USED_F_NO_NOTIFY; 693 } 694 695 vsession->interrupt_mode = false; 696 } 697 } 698 } 699 700 701 static enum rte_vhost_msg_result 702 extern_vhost_pre_msg_handler(int vid, void *_msg) 703 { 704 struct vhost_user_msg *msg = _msg; 705 struct spdk_vhost_session *vsession; 706 707 vsession = vhost_session_find_by_vid(vid); 708 if (vsession == NULL) { 709 SPDK_ERRLOG("Received a message to unitialized session (vid %d).\n", vid); 710 assert(false); 711 return RTE_VHOST_MSG_RESULT_ERR; 712 } 713 714 switch (msg->request) { 715 case VHOST_USER_GET_VRING_BASE: 716 if (vsession->forced_polling && vsession->started) { 717 /* Our queue is stopped for whatever reason, but we may still 718 * need to poll it after it's initialized again. 719 */ 720 g_spdk_vhost_ops.destroy_device(vid); 721 } 722 break; 723 case VHOST_USER_SET_VRING_BASE: 724 case VHOST_USER_SET_VRING_ADDR: 725 case VHOST_USER_SET_VRING_NUM: 726 if (vsession->forced_polling && vsession->started) { 727 /* Additional queues are being initialized, so we either processed 728 * enough I/Os and are switching from SeaBIOS to the OS now, or 729 * we were never in SeaBIOS in the first place. Either way, we 730 * don't need our workaround anymore. 731 */ 732 g_spdk_vhost_ops.destroy_device(vid); 733 vsession->forced_polling = false; 734 } 735 break; 736 case VHOST_USER_SET_VRING_KICK: 737 /* rte_vhost(after 20.08) will call new_device after one active vring is 738 * configured, we will start the session before all vrings are available, 739 * so for each new vring, if the session is started, we need to restart it 740 * again. 741 */ 742 case VHOST_USER_SET_VRING_CALL: 743 /* rte_vhost will close the previous callfd and won't notify 744 * us about any change. This will effectively make SPDK fail 745 * to deliver any subsequent interrupts until a session is 746 * restarted. We stop the session here before closing the previous 747 * fd (so that all interrupts must have been delivered by the 748 * time the descriptor is closed) and start right after (which 749 * will make SPDK retrieve the latest, up-to-date callfd from 750 * rte_vhost. 751 */ 752 case VHOST_USER_SET_MEM_TABLE: 753 /* rte_vhost will unmap previous memory that SPDK may still 754 * have pending DMA operations on. We can't let that happen, 755 * so stop the device before letting rte_vhost unmap anything. 756 * This will block until all pending I/Os are finished. 757 * We will start the device again from the post-processing 758 * message handler. 759 */ 760 if (vsession->started) { 761 g_spdk_vhost_ops.destroy_device(vid); 762 vsession->needs_restart = true; 763 } 764 break; 765 case VHOST_USER_GET_CONFIG: { 766 int rc = 0; 767 768 spdk_vhost_lock(); 769 if (vsession->vdev->backend->vhost_get_config) { 770 rc = vsession->vdev->backend->vhost_get_config(vsession->vdev, 771 msg->payload.cfg.region, msg->payload.cfg.size); 772 if (rc != 0) { 773 msg->size = 0; 774 } 775 } 776 spdk_vhost_unlock(); 777 778 return RTE_VHOST_MSG_RESULT_REPLY; 779 } 780 case VHOST_USER_SET_CONFIG: { 781 int rc = 0; 782 783 spdk_vhost_lock(); 784 if (vsession->vdev->backend->vhost_set_config) { 785 rc = vsession->vdev->backend->vhost_set_config(vsession->vdev, 786 msg->payload.cfg.region, msg->payload.cfg.offset, 787 msg->payload.cfg.size, msg->payload.cfg.flags); 788 } 789 spdk_vhost_unlock(); 790 791 return rc == 0 ? RTE_VHOST_MSG_RESULT_OK : RTE_VHOST_MSG_RESULT_ERR; 792 } 793 default: 794 break; 795 } 796 797 return RTE_VHOST_MSG_RESULT_NOT_HANDLED; 798 } 799 800 static enum rte_vhost_msg_result 801 extern_vhost_post_msg_handler(int vid, void *_msg) 802 { 803 struct vhost_user_msg *msg = _msg; 804 struct spdk_vhost_session *vsession; 805 806 vsession = vhost_session_find_by_vid(vid); 807 if (vsession == NULL) { 808 SPDK_ERRLOG("Received a message to unitialized session (vid %d).\n", vid); 809 assert(false); 810 return RTE_VHOST_MSG_RESULT_ERR; 811 } 812 813 if (vsession->needs_restart) { 814 g_spdk_vhost_ops.new_device(vid); 815 vsession->needs_restart = false; 816 return RTE_VHOST_MSG_RESULT_NOT_HANDLED; 817 } 818 819 switch (msg->request) { 820 case VHOST_USER_SET_FEATURES: 821 /* rte_vhost requires all queues to be fully initialized in order 822 * to start I/O processing. This behavior is not compliant with the 823 * vhost-user specification and doesn't work with QEMU 2.12+, which 824 * will only initialize 1 I/O queue for the SeaBIOS boot. 825 * Theoretically, we should start polling each virtqueue individually 826 * after receiving its SET_VRING_KICK message, but rte_vhost is not 827 * designed to poll individual queues. So here we use a workaround 828 * to detect when the vhost session could be potentially at that SeaBIOS 829 * stage and we mark it to start polling as soon as its first virtqueue 830 * gets initialized. This doesn't hurt any non-QEMU vhost slaves 831 * and allows QEMU 2.12+ to boot correctly. SET_FEATURES could be sent 832 * at any time, but QEMU will send it at least once on SeaBIOS 833 * initialization - whenever powered-up or rebooted. 834 */ 835 vsession->forced_polling = true; 836 break; 837 case VHOST_USER_SET_VRING_KICK: 838 /* vhost-user spec tells us to start polling a queue after receiving 839 * its SET_VRING_KICK message. Let's do it! 840 */ 841 if (vsession->forced_polling && !vsession->started) { 842 g_spdk_vhost_ops.new_device(vid); 843 } 844 break; 845 default: 846 break; 847 } 848 849 return RTE_VHOST_MSG_RESULT_NOT_HANDLED; 850 } 851 852 struct rte_vhost_user_extern_ops g_spdk_extern_vhost_ops = { 853 .pre_msg_handle = extern_vhost_pre_msg_handler, 854 .post_msg_handle = extern_vhost_post_msg_handler, 855 }; 856 857 void 858 vhost_session_install_rte_compat_hooks(struct spdk_vhost_session *vsession) 859 { 860 int rc; 861 862 rc = rte_vhost_extern_callback_register(vsession->vid, &g_spdk_extern_vhost_ops, NULL); 863 if (rc != 0) { 864 SPDK_ERRLOG("rte_vhost_extern_callback_register() failed for vid = %d\n", 865 vsession->vid); 866 return; 867 } 868 } 869 870 int 871 vhost_register_unix_socket(const char *path, const char *ctrl_name, 872 uint64_t virtio_features, uint64_t disabled_features, uint64_t protocol_features) 873 { 874 struct stat file_stat; 875 uint64_t features = 0; 876 877 /* Register vhost driver to handle vhost messages. */ 878 if (stat(path, &file_stat) != -1) { 879 if (!S_ISSOCK(file_stat.st_mode)) { 880 SPDK_ERRLOG("Cannot create a domain socket at path \"%s\": " 881 "The file already exists and is not a socket.\n", 882 path); 883 return -EIO; 884 } else if (unlink(path) != 0) { 885 SPDK_ERRLOG("Cannot create a domain socket at path \"%s\": " 886 "The socket already exists and failed to unlink.\n", 887 path); 888 return -EIO; 889 } 890 } 891 892 #if RTE_VERSION < RTE_VERSION_NUM(20, 8, 0, 0) 893 if (rte_vhost_driver_register(path, 0) != 0) { 894 #else 895 if (rte_vhost_driver_register(path, RTE_VHOST_USER_ASYNC_COPY) != 0) { 896 #endif 897 SPDK_ERRLOG("Could not register controller %s with vhost library\n", ctrl_name); 898 SPDK_ERRLOG("Check if domain socket %s already exists\n", path); 899 return -EIO; 900 } 901 if (rte_vhost_driver_set_features(path, virtio_features) || 902 rte_vhost_driver_disable_features(path, disabled_features)) { 903 SPDK_ERRLOG("Couldn't set vhost features for controller %s\n", ctrl_name); 904 905 rte_vhost_driver_unregister(path); 906 return -EIO; 907 } 908 909 if (rte_vhost_driver_callback_register(path, &g_spdk_vhost_ops) != 0) { 910 rte_vhost_driver_unregister(path); 911 SPDK_ERRLOG("Couldn't register callbacks for controller %s\n", ctrl_name); 912 return -EIO; 913 } 914 915 rte_vhost_driver_get_protocol_features(path, &features); 916 features |= protocol_features; 917 rte_vhost_driver_set_protocol_features(path, features); 918 919 if (rte_vhost_driver_start(path) != 0) { 920 SPDK_ERRLOG("Failed to start vhost driver for controller %s (%d): %s\n", 921 ctrl_name, errno, spdk_strerror(errno)); 922 rte_vhost_driver_unregister(path); 923 return -EIO; 924 } 925 926 return 0; 927 } 928 929 int 930 vhost_get_mem_table(int vid, struct rte_vhost_memory **mem) 931 { 932 return rte_vhost_get_mem_table(vid, mem); 933 } 934 935 int 936 vhost_driver_unregister(const char *path) 937 { 938 return rte_vhost_driver_unregister(path); 939 } 940 941 int 942 vhost_get_negotiated_features(int vid, uint64_t *negotiated_features) 943 { 944 return rte_vhost_get_negotiated_features(vid, negotiated_features); 945 } 946 947 int 948 vhost_user_dev_set_coalescing(struct spdk_vhost_dev *vdev, uint32_t delay_base_us, 949 uint32_t iops_threshold) 950 { 951 uint64_t delay_time_base = delay_base_us * spdk_get_ticks_hz() / 1000000ULL; 952 uint32_t io_rate = iops_threshold * SPDK_VHOST_STATS_CHECK_INTERVAL_MS / 1000U; 953 954 if (delay_time_base >= UINT32_MAX) { 955 SPDK_ERRLOG("Delay time of %"PRIu32" is to big\n", delay_base_us); 956 return -EINVAL; 957 } else if (io_rate == 0) { 958 SPDK_ERRLOG("IOPS rate of %"PRIu32" is too low. Min is %u\n", io_rate, 959 1000U / SPDK_VHOST_STATS_CHECK_INTERVAL_MS); 960 return -EINVAL; 961 } 962 963 vdev->coalescing_delay_us = delay_base_us; 964 vdev->coalescing_iops_threshold = iops_threshold; 965 return 0; 966 } 967 968 int 969 vhost_user_session_set_coalescing(struct spdk_vhost_dev *vdev, 970 struct spdk_vhost_session *vsession, void *ctx) 971 { 972 vsession->coalescing_delay_time_base = 973 vdev->coalescing_delay_us * spdk_get_ticks_hz() / 1000000ULL; 974 vsession->coalescing_io_rate_threshold = 975 vdev->coalescing_iops_threshold * SPDK_VHOST_STATS_CHECK_INTERVAL_MS / 1000U; 976 return 0; 977 } 978 979 int 980 spdk_vhost_set_coalescing(struct spdk_vhost_dev *vdev, uint32_t delay_base_us, 981 uint32_t iops_threshold) 982 { 983 int rc; 984 985 rc = vhost_user_dev_set_coalescing(vdev, delay_base_us, iops_threshold); 986 if (rc != 0) { 987 return rc; 988 } 989 990 vhost_user_dev_foreach_session(vdev, vhost_user_session_set_coalescing, NULL, NULL); 991 return 0; 992 } 993 994 void 995 spdk_vhost_get_coalescing(struct spdk_vhost_dev *vdev, uint32_t *delay_base_us, 996 uint32_t *iops_threshold) 997 { 998 if (delay_base_us) { 999 *delay_base_us = vdev->coalescing_delay_us; 1000 } 1001 1002 if (iops_threshold) { 1003 *iops_threshold = vdev->coalescing_iops_threshold; 1004 } 1005 } 1006 1007 int 1008 spdk_vhost_set_socket_path(const char *basename) 1009 { 1010 int ret; 1011 1012 if (basename && strlen(basename) > 0) { 1013 ret = snprintf(g_vhost_user_dev_dirname, sizeof(g_vhost_user_dev_dirname) - 2, "%s", basename); 1014 if (ret <= 0) { 1015 return -EINVAL; 1016 } 1017 if ((size_t)ret >= sizeof(g_vhost_user_dev_dirname) - 2) { 1018 SPDK_ERRLOG("Char dev dir path length %d is too long\n", ret); 1019 return -EINVAL; 1020 } 1021 1022 if (g_vhost_user_dev_dirname[ret - 1] != '/') { 1023 g_vhost_user_dev_dirname[ret] = '/'; 1024 g_vhost_user_dev_dirname[ret + 1] = '\0'; 1025 } 1026 } 1027 1028 return 0; 1029 } 1030 1031 static void 1032 vhost_dev_thread_exit(void *arg1) 1033 { 1034 spdk_thread_exit(spdk_get_thread()); 1035 } 1036 1037 int 1038 vhost_user_dev_register(struct spdk_vhost_dev *vdev, const char *name, struct spdk_cpuset *cpumask, 1039 const struct spdk_vhost_dev_backend *backend) 1040 { 1041 char path[PATH_MAX]; 1042 1043 if (snprintf(path, sizeof(path), "%s%s", g_vhost_user_dev_dirname, name) >= (int)sizeof(path)) { 1044 SPDK_ERRLOG("Resulting socket path for controller %s is too long: %s%s\n", 1045 name,g_vhost_user_dev_dirname, name); 1046 return -EINVAL; 1047 } 1048 1049 vdev->path = strdup(path); 1050 if (vdev->path == NULL) { 1051 return -EIO; 1052 } 1053 1054 vdev->thread = spdk_thread_create(vdev->name, cpumask); 1055 if (vdev->thread == NULL) { 1056 free(vdev->path); 1057 SPDK_ERRLOG("Failed to create thread for vhost controller %s.\n", name); 1058 return -EIO; 1059 } 1060 1061 vdev->registered = true; 1062 vdev->backend = backend; 1063 TAILQ_INIT(&vdev->vsessions); 1064 1065 vhost_user_dev_set_coalescing(vdev, SPDK_VHOST_COALESCING_DELAY_BASE_US, 1066 SPDK_VHOST_VQ_IOPS_COALESCING_THRESHOLD); 1067 1068 if (vhost_register_unix_socket(path, name, vdev->virtio_features, vdev->disabled_features, 1069 vdev->protocol_features)) { 1070 spdk_thread_send_msg(vdev->thread, vhost_dev_thread_exit, NULL); 1071 free(vdev->path); 1072 return -EIO; 1073 } 1074 1075 return 0; 1076 } 1077 1078 int 1079 vhost_user_dev_unregister(struct spdk_vhost_dev *vdev) 1080 { 1081 if (!TAILQ_EMPTY(&vdev->vsessions)) { 1082 SPDK_ERRLOG("Controller %s has still valid connection.\n", vdev->name); 1083 return -EBUSY; 1084 } 1085 1086 if (vdev->registered && vhost_driver_unregister(vdev->path) != 0) { 1087 SPDK_ERRLOG("Could not unregister controller %s with vhost library\n" 1088 "Check if domain socket %s still exists\n", 1089 vdev->name, vdev->path); 1090 return -EIO; 1091 } 1092 1093 spdk_thread_send_msg(vdev->thread, vhost_dev_thread_exit, NULL); 1094 free(vdev->path); 1095 1096 return 0; 1097 } 1098 1099 static bool g_vhost_user_started = false; 1100 1101 int 1102 vhost_user_init(void) 1103 { 1104 size_t len; 1105 1106 if (g_vhost_user_started) { 1107 return 0; 1108 } 1109 1110 if (g_vhost_user_dev_dirname[0] == '\0') { 1111 if (getcwd(g_vhost_user_dev_dirname, sizeof(g_vhost_user_dev_dirname) - 1) == NULL) { 1112 SPDK_ERRLOG("getcwd failed (%d): %s\n", errno, spdk_strerror(errno)); 1113 return -1; 1114 } 1115 1116 len = strlen(g_vhost_user_dev_dirname); 1117 if (g_vhost_user_dev_dirname[len - 1] != '/') { 1118 g_vhost_user_dev_dirname[len] = '/'; 1119 g_vhost_user_dev_dirname[len + 1] = '\0'; 1120 } 1121 } 1122 1123 g_vhost_user_started = true; 1124 1125 g_vhost_user_init_thread = spdk_get_thread(); 1126 assert(g_vhost_user_init_thread != NULL); 1127 1128 return 0; 1129 } 1130 1131 static void * 1132 vhost_user_session_shutdown(void *arg) 1133 { 1134 struct spdk_vhost_dev *vdev = NULL; 1135 struct spdk_vhost_session *vsession; 1136 vhost_fini_cb vhost_cb = arg; 1137 1138 for (vdev = spdk_vhost_dev_next(NULL); vdev != NULL; 1139 vdev = spdk_vhost_dev_next(vdev)) { 1140 spdk_vhost_lock(); 1141 TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) { 1142 if (vsession->started) { 1143 _stop_session(vsession); 1144 } 1145 } 1146 spdk_vhost_unlock(); 1147 vhost_driver_unregister(vdev->path); 1148 vdev->registered = false; 1149 } 1150 1151 SPDK_INFOLOG(vhost, "Exiting\n"); 1152 spdk_thread_send_msg(g_vhost_user_init_thread, vhost_cb, NULL); 1153 return NULL; 1154 } 1155 1156 void 1157 vhost_user_fini(vhost_fini_cb vhost_cb) 1158 { 1159 pthread_t tid; 1160 int rc; 1161 1162 if (!g_vhost_user_started) { 1163 vhost_cb(NULL); 1164 return; 1165 } 1166 1167 g_vhost_user_started = false; 1168 1169 /* rte_vhost API for removing sockets is not asynchronous. Since it may call SPDK 1170 * ops for stopping a device or removing a connection, we need to call it from 1171 * a separate thread to avoid deadlock. 1172 */ 1173 rc = pthread_create(&tid, NULL, &vhost_user_session_shutdown, vhost_cb); 1174 if (rc < 0) { 1175 SPDK_ERRLOG("Failed to start session shutdown thread (%d): %s\n", rc, spdk_strerror(rc)); 1176 abort(); 1177 } 1178 pthread_detach(tid); 1179 } 1180